extraction/TypeSystemToNeuralTypeSystem.ipynb

{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from wikidata_linker_utils.type_collection import TypeCollection\n", "import json\n", "import numpy as np\n", "from os import makedirs\n", "from os.path import join, realpath, dirname, exists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Type System To Neural Type System\n", "\n", "Convert the result of evolve_type_system into a config for training a type classifier." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def convert_types_to_disambiguator_config(type_path,\n", " output_name,\n", " output_dirname,\n", " config_path,\n", " wiki=None,\n", " lang,\n", " min_count=2,\n", " sample_size=9999999,\n", " min_percent=0.001):\n", " with open(type_path, \"rt\") as fin:\n", " all_types = json.load(fin)\n", " output_dir = join(output_name, output_dirname)\n", " config = {\n", " \"wiki\": wiki,\n", " \"min_count\": min_count,\n", " \"min_percent\": min_percent,\n", " \"wikidata\": \"wikidata\",\n", " \"prefix\": lang + \"wiki\",\n", " \"sample_size\": sample_size,\n", " \"num_names_to_load\": 4000,\n", " \"language_path\": lang + \"_trie_fixed\",\n", " \"redirections\": lang + \"_redirections.tsv\",\n", " \"classification\": []\n", " }\n", " for types in all_types:\n", " typename = types[\"qid\"]\n", " relation = types[\"relation\"]\n", " surname = typename + \"_\" + relation.replace(\" \", \"_\")\n", " type_dir = join(output_dir, surname + \"_classification\")\n", " config[\"classification\"].append(type_dir)\n", " \n", " with open(config_path, \"wt\") as fout:\n", " json.dump(config, fout)\n", " \n", "def convert_types_to_model_config(collection,\n", " wikidata_path,\n", " type_path,\n", " output_name,\n", " output_dirname,\n", " languages):\n", " with open(type_path, \"rt\") as fin:\n", " all_types = json.load(fin)\n", " \n", " output_dir = join(dirname(output_name), output_dirname)\n", " \n", " makedirs(output_dir, exist_ok=True)\n", " config = {\n", " \"wikidata_path\": wikidata_path,\n", " \"classification_path\": output_dirname,\n", " \"features\": [\n", " {\n", " \"type\": \"word\",\n", " \"dimension\": 100,\n", " \"max_vocab\": 1000000\n", " },\n", " {\n", " \"type\": \"suffix\",\n", " \"length\": 2,\n", " \"dimension\": 6,\n", " \"max_vocab\": 1000000\n", " },\n", " {\n", " \"type\": \"suffix\",\n", " \"length\": 3,\n", " \"dimension\": 6,\n", " \"max_vocab\": 1000000\n", " },\n", " {\n", " \"type\": \"prefix\",\n", " \"length\": 2,\n", " \"dimension\": 6,\n", " \"max_vocab\": 1000000\n", " },\n", " {\n", " \"type\": \"prefix\",\n", " \"length\": 3,\n", " \"dimension\": 6\n", " },\n", " {\n", " \"type\": \"digit\"\n", " },\n", " {\n", " \"type\": \"uppercase\"\n", " },\n", " {\n", " \"type\": \"punctuation_count\"\n", " }\n", " ],\n", " \"datasets\": [],\n", " \"objectives\": []\n", " }\n", " for types in all_types:\n", " typename = types[\"qid\"]\n", " relation = types[\"relation\"]\n", " surname = typename + \"_\" + relation.replace(\" \", \"_\")\n", " type_dir = join(output_dir, surname + \"_classification\")\n", " makedirs(type_dir, exist_ok=True)\n", " if not exists(join(type_dir, \"classification.npy\")):\n", " np.save(join(type_dir, \"classification.npy\"),\n", " collection.satisfy([relation], [collection.name2index[typename]]))\n", " with open(join(type_dir, \"classes.txt\"), \"wt\") as fout:\n", " fout.write(\"N\\nY\\n\")\n", " config[\"objectives\"].append({\n", " \"name\": surname,\n", " 
\"type\": \"softmax\",\n", " \"vocab\": join(output_dirname, surname + \"_classification\", \"classes.txt\")\n", " })\n", " \n", " datasets = ([(\"{}_train.h5\".format(lang), \"train\") for lang in languages] +\n", " [(\"{}_dev.h5\".format(lang), \"dev\") for lang in languages])\n", " for dataset, dtype in datasets:\n", " config[\"datasets\"].append(\n", " {\n", " \"type\": dtype,\n", " \"path\": dataset,\n", " \"x\": 0,\n", " \"ignore\": \"other\",\n", " \"y\": [{\"column\": 1,\n", " \"objective\": obj[\"name\"],\n", " \"classification\": obj[\"name\"] + \"_classification\"}\n", " for obj in config[\"objectives\"]]\n", " }\n", " )\n", " with open(output_name, \"wt\") as fout:\n", " json.dump(config, fout, indent=4)\n", "\n", "def search2model(*,\n", " collection,\n", " dataset_path,\n", " report_path,\n", " wikidata_path,\n", " name,\n", " languages=None):\n", " if languages is None:\n", " languages = [\"en\", \"fr\", \"es\", \"pt\", \"de\"]\n", " convert_types_to_model_config(collection,\n", " wikidata_path,\n", " report_path,\n", " \"/Volumes/Samsung_T3/tahiti/en_fr_wiki_w10/\" + name + \"_type_config.json\",\n", " directory,\n", " languages)\n", " \n", " for lang in languages:\n", " convert_types_to_disambiguator_config(report_path,\n", " dataset_path,\n", " directory,\n", " \"extraction/{name}_disambiguator_config_fixed_{lang}.json\".format(\n", " name=name, lang=lang),\n", " min_percent=0,\n", " lang=lang,\n", " min_count=0,\n", " sample_size=1000,\n", " wiki=\"{}wiki-latest-pages-articles.xml\".format(lang))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# load wikidata types\n", "c = TypeCollection(\"data/wikidata\",\n", " num_names_to_load=0)\n", "c.load_blacklist(\"extraction/blacklist.json\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading wikidata id -> index\n", "done\n" ] } ], "source": [ "# export your GA model by passing the path to the report file from evolve_type_system,\n", "# and path to wikidata & training data path (exported wikipedia text in h5 format) along\n", "# with the desired name for the model\n", "search2model(collection=c,\n", " dataset_path=\"training_data\",\n", " report_path=\"ga-0.00007.json\",\n", " wikidata_path=\"data/wikidata\",\n", " name=\"ga_model\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }