In [1]:
from wikidata_linker_utils.type_collection import TypeCollection
import json
import numpy as np
from os import makedirs
from os.path import join, realpath, dirname, exists

### Type System To Neural Type System

Convert the report produced by `evolve_type_system` into the configs needed to train a type classifier: one model config, plus one disambiguator config per language.

In [4]:
def convert_types_to_disambiguator_config(type_path,
                                          output_name,
                                          output_dirname,
                                          config_path,
                                          wiki=None,
                                          lang=None,
                                          min_count=2,
                                          sample_size=9999999,
                                          min_percent=0.001):
    """Write a disambiguator config (JSON) for a single language.

    Args:
        type_path: path to a JSON file holding a list of objects, each with
            a "qid" and a "relation" key.
        output_name: base path; joined with `output_dirname` to form the
            directory under which the per-type classification directories
            are referenced.
        output_dirname: sub-directory name appended to `output_name`.
        config_path: destination file for the generated config.
        wiki: value stored verbatim under the "wiki" key (may be None).
        lang: language code; required. It is used to build the "prefix"
            (`lang + "wiki"`), "language_path" and "redirections" entries.
        min_count: stored verbatim under "min_count".
        sample_size: stored verbatim under "sample_size".
        min_percent: stored verbatim under "min_percent".

    Raises:
        TypeError: if `lang` is not supplied.
    """
    # `lang` sits after the optional `wiki` in the signature, so it needs a
    # default to be valid Python; it is nevertheless mandatory.
    if lang is None:
        raise TypeError(
            "convert_types_to_disambiguator_config() missing required argument: 'lang'"
        )

    with open(type_path, "rt") as fin:
        all_types = json.load(fin)

    output_dir = join(output_name, output_dirname)
    config = {
        "wiki": wiki,
        "min_count": min_count,
        "min_percent": min_percent,
        "wikidata": "wikidata",
        "prefix": lang + "wiki",
        "sample_size": sample_size,
        "num_names_to_load": 4000,
        "language_path": lang + "_trie_fixed",
        "redirections": lang + "_redirections.tsv",
        "classification": []
    }
    for types in all_types:
        # Directory name is "<qid>_<relation with underscores>_classification".
        surname = types["qid"] + "_" + types["relation"].replace(" ", "_")
        config["classification"].append(join(output_dir, surname + "_classification"))

    with open(config_path, "wt") as fout:
        json.dump(config, fout)
    
def convert_types_to_model_config(collection,
                                  wikidata_path,
                                  type_path,
                                  output_name,
                                  output_dirname,
                                  languages):
    """Write the model training config and the per-type classification files.

    For every type listed in `type_path` this creates a directory
    `<dirname(output_name)>/<output_dirname>/<qid>_<relation>_classification`
    containing `classification.npy` and `classes.txt`, then writes a JSON
    config describing features, objectives and datasets to `output_name`.

    Args:
        collection: object exposing `name2index` (mapping from qid to index)
            and `satisfy(relations, indices)`; the result of `satisfy` is
            saved with `np.save`.
            NOTE(review): presumably a TypeCollection whose `satisfy` returns
            a membership array -- confirm against wikidata_linker_utils.
        wikidata_path: stored verbatim under "wikidata_path".
        type_path: path to a JSON file holding a list of objects, each with
            a "qid" and a "relation" key.
        output_name: path of the JSON config to write; its parent directory
            is the root for `output_dirname`.
        output_dirname: name of the classification directory; stored
            verbatim under "classification_path".
        languages: iterable of language codes; each yields a
            "<lang>_train.h5" and a "<lang>_dev.h5" dataset entry.
    """
    with open(type_path, "rt") as fin:
        all_types = json.load(fin)

    output_dir = join(dirname(output_name), output_dirname)
    makedirs(output_dir, exist_ok=True)

    # Build the feature list programmatically so that every sized feature
    # carries the same vocabulary cap (the length-3 prefix entry previously
    # lacked "max_vocab", unlike its three sibling affix features).
    max_vocab = 1000000
    features = [{"type": "word", "dimension": 100, "max_vocab": max_vocab}]
    for affix in ("suffix", "prefix"):
        for length in (2, 3):
            features.append({
                "type": affix,
                "length": length,
                "dimension": 6,
                "max_vocab": max_vocab
            })
    features.extend({"type": kind} for kind in ("digit", "uppercase", "punctuation_count"))

    config = {
        "wikidata_path": wikidata_path,
        "classification_path": output_dirname,
        "features": features,
        "datasets": [],
        "objectives": []
    }

    for types in all_types:
        typename = types["qid"]
        relation = types["relation"]
        surname = typename + "_" + relation.replace(" ", "_")
        type_dir = join(output_dir, surname + "_classification")
        makedirs(type_dir, exist_ok=True)

        # The array is only computed when missing, so an existing file is
        # reused as-is on re-runs.
        classification_file = join(type_dir, "classification.npy")
        if not exists(classification_file):
            np.save(classification_file,
                    collection.satisfy([relation], [collection.name2index[typename]]))

        # Two classes per objective: "N" then "Y".
        with open(join(type_dir, "classes.txt"), "wt") as fout:
            fout.write("N\nY\n")

        config["objectives"].append({
            "name": surname,
            "type": "softmax",
            # Note: built from `output_dirname`, not from `output_dir`.
            "vocab": join(output_dirname, surname + "_classification", "classes.txt")
        })

    # All train sets first, then all dev sets, each in `languages` order.
    datasets = ([("{}_train.h5".format(lang), "train") for lang in languages] +
                [("{}_dev.h5".format(lang), "dev") for lang in languages])
    for dataset, dtype in datasets:
        config["datasets"].append(
            {
                "type": dtype,
                "path": dataset,
                "x": 0,
                "ignore": "other",
                "y": [{"column": 1,
                       "objective": obj["name"],
                       "classification": obj["name"] + "_classification"}
                      for obj in config["objectives"]]
            }
        )

    with open(output_name, "wt") as fout:
        json.dump(config, fout, indent=4)

def search2model(*,
                 collection,
                 dataset_path,
                 report_path,
                 wikidata_path,
                 name,
                 languages=None,
                 directory=None,
                 model_config_dir="/Volumes/Samsung_T3/tahiti/en_fr_wiki_w10/"):
    """Export a type-system report as a model config plus disambiguator configs.

    Writes `<model_config_dir>/<name>_type_config.json` through
    `convert_types_to_model_config`, then one
    `extraction/<name>_disambiguator_config_fixed_<lang>.json` per language
    through `convert_types_to_disambiguator_config`.

    Args:
        collection: forwarded to `convert_types_to_model_config`.
        dataset_path: base path under which the disambiguator configs
            reference the classification directories.
        report_path: path to the JSON type report (list of objects with
            "qid" and "relation").
        wikidata_path: stored in the model config.
        name: model name; used to build the output file names.
        languages: language codes to export; defaults to
            ["en", "fr", "es", "pt", "de"].
        directory: name of the classification directory shared by both
            kinds of config. This used to be read from an undefined name,
            so the function could only work if a global `directory`
            happened to exist in the kernel. Defaults to
            `name + "_classification"`.
            NOTE(review): the default is a choice made here, not recovered
            from the original code -- pass it explicitly if an existing
            layout must be matched.
        model_config_dir: directory receiving the model config. The default
            keeps the previously hard-coded, machine-specific location;
            override it on any other machine.
    """
    if languages is None:
        languages = ["en", "fr", "es", "pt", "de"]
    if directory is None:
        directory = name + "_classification"

    convert_types_to_model_config(collection,
                                  wikidata_path,
                                  report_path,
                                  join(model_config_dir, name + "_type_config.json"),
                                  directory,
                                  languages)

    for lang in languages:
        convert_types_to_disambiguator_config(report_path,
                                              dataset_path,
                                              directory,
                                              "extraction/{name}_disambiguator_config_fixed_{lang}.json".format(
                                                  name=name, lang=lang),
                                              min_percent=0,
                                              lang=lang,
                                              min_count=0,
                                              sample_size=1000,
                                              wiki="{}wiki-latest-pages-articles.xml".format(lang))

In [None]:
# Load the Wikidata type collection from data/wikidata with
# num_names_to_load=0, then apply the blacklist file.
c = TypeCollection("data/wikidata", num_names_to_load=0)
c.load_blacklist("extraction/blacklist.json")

In [14]:
# Export the GA model. Arguments:
#   report_path   -- report file written by evolve_type_system
#   wikidata_path -- path to the wikidata directory
#   dataset_path  -- training data path (exported wikipedia text in h5 format)
#   name          -- desired name for the model
search2model(
    collection=c,
    dataset_path="training_data",
    report_path="ga-0.00007.json",
    wikidata_path="data/wikidata",
    name="ga_model",
)

loading wikidata id -> index
done
