extraction/TypeSystemToNeuralTypeSystem.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from wikidata_linker_utils.type_collection import TypeCollection\n",
"import json\n",
"import numpy as np\n",
"from os import makedirs\n",
"from os.path import join, realpath, dirname, exists"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Type System To Neural Type System\n",
"\n",
"Convert the result of evolve_type_system into a config for training a type classifier."
]
},
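{
"cell_type": "markdown",
"metadata": {},
"source": [
"The report consumed below is expected to be a JSON list of `{\"qid\": ..., \"relation\": ...}` entries,\n",
"one per type axis selected by `evolve_type_system`. A minimal sketch of its shape\n",
"(the qids and relations here are placeholders, not values from a real report):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the shape of an evolve_type_system report. Each entry\n",
"# pairs a wikidata root (qid) with the relation used to collect its members;\n",
"# the values below are placeholders.\n",
"example_report = [\n",
"    {\"qid\": \"Q5\", \"relation\": \"instance of\"},\n",
"    {\"qid\": \"Q515\", \"relation\": \"instance of\"}\n",
"]"
]
},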
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def convert_types_to_disambiguator_config(type_path,\n",
" output_name,\n",
" output_dirname,\n",
" config_path,\n",
" wiki=None,\n",
" lang,\n",
" min_count=2,\n",
" sample_size=9999999,\n",
" min_percent=0.001):\n",
" with open(type_path, \"rt\") as fin:\n",
" all_types = json.load(fin)\n",
" output_dir = join(output_name, output_dirname)\n",
" config = {\n",
" \"wiki\": wiki,\n",
" \"min_count\": min_count,\n",
" \"min_percent\": min_percent,\n",
" \"wikidata\": \"wikidata\",\n",
" \"prefix\": lang + \"wiki\",\n",
" \"sample_size\": sample_size,\n",
" \"num_names_to_load\": 4000,\n",
" \"language_path\": lang + \"_trie_fixed\",\n",
" \"redirections\": lang + \"_redirections.tsv\",\n",
" \"classification\": []\n",
" }\n",
" for types in all_types:\n",
" typename = types[\"qid\"]\n",
" relation = types[\"relation\"]\n",
" surname = typename + \"_\" + relation.replace(\" \", \"_\")\n",
" type_dir = join(output_dir, surname + \"_classification\")\n",
" config[\"classification\"].append(type_dir)\n",
" \n",
" with open(config_path, \"wt\") as fout:\n",
" json.dump(config, fout)\n",
" \n",
"def convert_types_to_model_config(collection,\n",
" wikidata_path,\n",
" type_path,\n",
" output_name,\n",
" output_dirname,\n",
" languages):\n",
" with open(type_path, \"rt\") as fin:\n",
" all_types = json.load(fin)\n",
" \n",
" output_dir = join(dirname(output_name), output_dirname)\n",
" \n",
" makedirs(output_dir, exist_ok=True)\n",
" config = {\n",
" \"wikidata_path\": wikidata_path,\n",
" \"classification_path\": output_dirname,\n",
" \"features\": [\n",
" {\n",
" \"type\": \"word\",\n",
" \"dimension\": 100,\n",
" \"max_vocab\": 1000000\n",
" },\n",
" {\n",
" \"type\": \"suffix\",\n",
" \"length\": 2,\n",
" \"dimension\": 6,\n",
" \"max_vocab\": 1000000\n",
" },\n",
" {\n",
" \"type\": \"suffix\",\n",
" \"length\": 3,\n",
" \"dimension\": 6,\n",
" \"max_vocab\": 1000000\n",
" },\n",
" {\n",
" \"type\": \"prefix\",\n",
" \"length\": 2,\n",
" \"dimension\": 6,\n",
" \"max_vocab\": 1000000\n",
" },\n",
" {\n",
" \"type\": \"prefix\",\n",
" \"length\": 3,\n",
" \"dimension\": 6\n",
" },\n",
" {\n",
" \"type\": \"digit\"\n",
" },\n",
" {\n",
" \"type\": \"uppercase\"\n",
" },\n",
" {\n",
" \"type\": \"punctuation_count\"\n",
" }\n",
" ],\n",
" \"datasets\": [],\n",
" \"objectives\": []\n",
" }\n",
" for types in all_types:\n",
" typename = types[\"qid\"]\n",
" relation = types[\"relation\"]\n",
" surname = typename + \"_\" + relation.replace(\" \", \"_\")\n",
" type_dir = join(output_dir, surname + \"_classification\")\n",
" makedirs(type_dir, exist_ok=True)\n",
" if not exists(join(type_dir, \"classification.npy\")):\n",
" np.save(join(type_dir, \"classification.npy\"),\n",
" collection.satisfy([relation], [collection.name2index[typename]]))\n",
" with open(join(type_dir, \"classes.txt\"), \"wt\") as fout:\n",
" fout.write(\"N\\nY\\n\")\n",
" config[\"objectives\"].append({\n",
" \"name\": surname,\n",
" \"type\": \"softmax\",\n",
" \"vocab\": join(output_dirname, surname + \"_classification\", \"classes.txt\")\n",
" })\n",
" \n",
" datasets = ([(\"{}_train.h5\".format(lang), \"train\") for lang in languages] +\n",
" [(\"{}_dev.h5\".format(lang), \"dev\") for lang in languages])\n",
" for dataset, dtype in datasets:\n",
" config[\"datasets\"].append(\n",
" {\n",
" \"type\": dtype,\n",
" \"path\": dataset,\n",
" \"x\": 0,\n",
" \"ignore\": \"other\",\n",
" \"y\": [{\"column\": 1,\n",
" \"objective\": obj[\"name\"],\n",
" \"classification\": obj[\"name\"] + \"_classification\"}\n",
" for obj in config[\"objectives\"]]\n",
" }\n",
" )\n",
" with open(output_name, \"wt\") as fout:\n",
" json.dump(config, fout, indent=4)\n",
"\n",
"def search2model(*,\n",
" collection,\n",
" dataset_path,\n",
" report_path,\n",
" wikidata_path,\n",
" name,\n",
" languages=None):\n",
" if languages is None:\n",
" languages = [\"en\", \"fr\", \"es\", \"pt\", \"de\"]\n",
" convert_types_to_model_config(collection,\n",
" wikidata_path,\n",
" report_path,\n",
" \"/Volumes/Samsung_T3/tahiti/en_fr_wiki_w10/\" + name + \"_type_config.json\",\n",
" directory,\n",
" languages)\n",
" \n",
" for lang in languages:\n",
" convert_types_to_disambiguator_config(report_path,\n",
" dataset_path,\n",
" directory,\n",
" \"extraction/{name}_disambiguator_config_fixed_{lang}.json\".format(\n",
" name=name, lang=lang),\n",
" min_percent=0,\n",
" lang=lang,\n",
" min_count=0,\n",
" sample_size=1000,\n",
" wiki=\"{}wiki-latest-pages-articles.xml\".format(lang))"
]
},
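{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional smoke test of the conversion above: write a one-entry synthetic report,\n",
"generate a disambiguator config from it, and print the result. The paths and the\n",
"qid/relation pair below are placeholders."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch, assuming /tmp is writable; all filenames are placeholders.\n",
"with open(\"/tmp/sample_report.json\", \"wt\") as fout:\n",
"    json.dump([{\"qid\": \"Q5\", \"relation\": \"instance of\"}], fout)\n",
"convert_types_to_disambiguator_config(\"/tmp/sample_report.json\",\n",
"                                      \"training_data\",\n",
"                                      \"sample_types\",\n",
"                                      \"/tmp/sample_disambiguator_config.json\",\n",
"                                      lang=\"en\",\n",
"                                      wiki=\"enwiki-latest-pages-articles.xml\")\n",
"# The config lists one \"<qid>_<relation>_classification\" directory per report entry.\n",
"with open(\"/tmp/sample_disambiguator_config.json\", \"rt\") as fin:\n",
"    print(json.dumps(json.load(fin), indent=2))"
]
},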
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# load wikidata types\n",
"c = TypeCollection(\"data/wikidata\",\n",
" num_names_to_load=0)\n",
"c.load_blacklist(\"extraction/blacklist.json\")"
]
},
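{
"cell_type": "markdown",
"metadata": {},
"source": [
"The conversion relies on `collection.name2index` to map a qid to an integer index\n",
"and on `collection.satisfy` to produce the membership array saved as\n",
"`classification.npy`. A quick sanity check (\"Q5\" and \"instance of\" are placeholder\n",
"values; this assumes the id -> index map is loaded on first access):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Placeholder qid/relation; mirrors the lookup done in\n",
"# convert_types_to_model_config before np.save.\n",
"idx = c.name2index[\"Q5\"]\n",
"membership = c.satisfy([\"instance of\"], [idx])\n",
"print(idx, np.asarray(membership).shape)"
]
},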
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading wikidata id -> index\n",
"done\n"
]
}
],
"source": [
"# export your GA model by passing the path to the report file from evolve_type_system,\n",
"# and path to wikidata & training data path (exported wikipedia text in h5 format) along\n",
"# with the desired name for the model\n",
"search2model(collection=c,\n",
" dataset_path=\"training_data\",\n",
" report_path=\"ga-0.00007.json\",\n",
" wikidata_path=\"data/wikidata\",\n",
" name=\"ga_model\")"
]
},
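{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional follow-up: load the model config written by the call above (the path\n",
"mirrors the one hard-coded in `search2model`) and summarize its objectives and\n",
"datasets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(\"/Volumes/Samsung_T3/tahiti/en_fr_wiki_w10/ga_model_type_config.json\", \"rt\") as fin:\n",
"    model_config = json.load(fin)\n",
"print(\"objectives:\", [obj[\"name\"] for obj in model_config[\"objectives\"]])\n",
"print(\"datasets:\", [(d[\"path\"], d[\"type\"]) for d in model_config[\"datasets\"]])"
]
}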
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}