extraction/fast_link_fixer.py

""" Perform a reduction on the anchors to articles relation by finding different articles refering to the same item and making the anchor point to the most common version, or by using the wikidata graph to find instance of, and other parent-child relations that allow one article to encompass or be more generic than its co-triggerable articles. Usage: ------ ``` DATA_DIR=data/wikidata LANG_DIR=data/en_trie FIXED_LANG_DIR=data/en_trie_fixed python3 fast_link_fixer.py ${WIKIDATA_PATH} ${LANG_DIR} ${FIXED_LANG_DIR} ``` """ import argparse import time import shutil from os.path import join, realpath, dirname from os import makedirs import numpy as np import marisa_trie from wikidata_linker_utils.type_collection import get_name, TypeCollection from wikidata_linker_utils.logic import logical_and, logical_ands, logical_not, logical_or, logical_ors from wikidata_linker_utils.progressbar import get_progress_bar from wikidata_linker_utils.offset_array import OffsetArray from wikidata_linker_utils.file import true_exists import wikidata_linker_utils.wikidata_properties as wprop from wikidata_linker_utils.successor_mask import ( related_promote_highest, extend_relations, reduce_values, remap_offset_array ) SCRIPT_DIR = dirname(realpath(__file__)) from numpy import logical_not, logical_or, logical_and from wikidata_linker_utils.logic import logical_ors IS_HISTORY = None IS_PEOPLE = None IS_BREED = None IS_PEOPLE_GROUP = None IS_LIST_ARTICLE = None IS_LANGUAGE_ALPHABET = None IS_SPORTS_TEAM = None IS_CARDINAL_DIRECTION = None IS_POLITICAL_PARTY = None IS_SOCIETY = None IS_POSITION = None IS_CHARACTER_HUMAN = None IS_POLITICAL_ORGANIZATION = None IS_LANDFORM = None IS_THING = None IS_BATTLE = None IS_EVENT = None IS_ACTIVITY = None IS_THOROUGHFARE = None IS_KINSHIP = None IS_EPISODE_LIST = None def wkp(c, name): return c.article2id['enwiki/' + name][0][0] def wkd(c, name): return c.name2index[name] def initialize_globals(c): """global variables that guide the metonymy/anaphora removal process.""" global IS_HISTORY global IS_PEOPLE global IS_PEOPLE_GROUP global IS_LIST_ARTICLE global IS_COUNTRY global IS_BREED global IS_EVENT_SPORT global IS_LANGUAGE_ALPHABET global IS_SPORTS_TEAM global IS_CARDINAL_DIRECTION global IS_ACTIVITY global IS_POLITICAL_PARTY global IS_SOCIETY global IS_BATTLE global IS_POSITION global IS_LANDFORM global IS_CHARACTER_HUMAN global IS_POLITICAL_ORGANIZATION global IS_THING global IS_THOROUGHFARE global IS_EVENT global IS_KINSHIP global IS_EPISODE_LIST PEOPLE = wkd(c, "Q2472587") NATIONALITY = wkd(c, "Q231002") ASPECT_OF_HIST = wkd(c, "Q17524420") HISTORY = wkd(c, "Q309") LIST_ARTICLE = wkd(c, "Q13406463") WAR = wkd(c, "Q198") COUNTRY = wkd(c, "Q6256") FORMER_COUNTRY = wkd(c, "Q3024240") DOMINION = wkd(c, "Q223832") LANGUAGE = wkd(c, "Q34770") ALPHABET = wkd(c, "Q9779") COLONY = wkd(c, "Q133156") GOVERNORATE = wkd(c, "Q1798622") SPORTS_TEAM = wkd(c, "Q12973014") ATHLETIC_CONFERENCE = wkd(c, "Q2992826") CARDINAL_DIRECTION = wkd(c, "Q23718") POLITICAL_PARTY = wkd(c, "Q7278") STATE = wkd(c, "Q7275") DYNASTY = wkd(c, "Q164950") SOCIETY = wkd(c, "Q8425") MENS_SINGLES = wkd(c, "Q16893072") SPORT = wkd(c, "Q349") POSITION = wkd(c, "Q4164871") HUMAN = wkd(c, "Q5") FICTIONAL_CHARACTER = wkd(c, "Q95074") BREED = wkd(c, "Q38829") ORTHOGRAPHY = wkd(c, "Q43091") POLITICAL_ORGANIZATION = wkd(c, "Q7210356") GROUP_OF_HUMANS = wkd(c, "Q16334295") LANDFORM = wkd(c, "Q271669") BATTLE = wkd(c, "Q178561") FOOD = wkd(c, "Q2095") DRINK = wkd(c, "Q40050") ANIMAL = wkd(c, "Q16521") WORK = wkd(c, "Q386724") 
    AUTOMOBILE_MODEL = wkd(c, "Q3231690")
    GOOD = wkd(c, "Q28877")
    VEHICLE = wkd(c, "Q42889")
    PUBLICATION = wkd(c, "Q732577")
    AUDIOVISUAL = wkd(c, "Q2431196")
    TERRITORIAL_ENTITY = wkd(c, "Q15642541")
    GEOGRAPHIC_OBJECT = wkd(c, "Q618123")
    ASTRO_OBJECT = wkd(c, "Q17444909")
    EVENT_SPORTING = wkd(c, "Q1656682")
    EVENT_OCCURRENCE = wkd(c, "Q1190554")
    ELECTROMAGNETIC_SPECTRUM = wkd(c, "Q133139")
    MAGICAL_ORG = wkd(c, "Q14946195")
    AUTONOM_CHURCH = wkd(c, "Q20871948")
    SIGN = wkd(c, "Q3695082")
    FORM_OF_GOVERNMENT = wkd(c, "Q1307214")
    SPORTS_ORG = wkd(c, "Q4438121")
    RECURRING_SPORTING_EVENT = wkd(c, "Q18608583")
    CLASS_SCHEME = wkd(c, "Q5962346")
    STYLE = wkd(c, "Q1292119")
    SIGN_SYSTEM = wkd(c, "Q7512598")
    PHYSICAL_PHENOMENON = wkd(c, "Q1293220")
    LAW = wkd(c, "Q7748")
    WATERCOURSE = wkd(c, "Q355304")
    BODY_OF_WATER = wkd(c, "Q15324")
    CHEMICAL_SUBSTANCE = wkd(c, "Q79529")
    HISTORICAL_PERIOD = wkd(c, "Q11514315")
    ACTIVITY = wkd(c, "Q815962")
    THOROUGHFARE = wkd(c, "Q83620")
    KINSHIP = wkd(c, "Q171318")
    FICTIONAL_HUMAN = wkd(c, "Q15632617")
    EPISODE = wkd(c, "Q1983062")

    IS_CHARACTER_HUMAN = c.satisfy(
        [wprop.INSTANCE_OF, wprop.SUBCLASS_OF, wprop.IS_A_LIST_OF],
        [HUMAN, FICTIONAL_HUMAN, FICTIONAL_CHARACTER]
    )
    # to be a history you must be an aspect of history
    # but not a history itself:
    IS_HISTORY = logical_and(
        c.satisfy([wprop.INSTANCE_OF], [ASPECT_OF_HIST]),
        logical_not(c.satisfy([wprop.INSTANCE_OF], [HISTORY]))
    )
    IS_PEOPLE = c.satisfy([wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
                          [PEOPLE, NATIONALITY])
    IS_PEOPLE_GROUP = np.logical_or(
        IS_PEOPLE,
        c.satisfy([wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
                  [GROUP_OF_HUMANS, MAGICAL_ORG, AUTONOM_CHURCH])
    )
    IS_LIST_ARTICLE = c.satisfy([wprop.INSTANCE_OF], [LIST_ARTICLE])
    IS_LANGUAGE_ALPHABET = c.satisfy(
        [wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
        [LANGUAGE, ALPHABET, ORTHOGRAPHY, SIGN_SYSTEM]
    )
    IS_COUNTRY = c.satisfy(
        [wprop.INSTANCE_OF],
        [COUNTRY, FORMER_COUNTRY, DOMINION, COLONY, STATE, DYNASTY,
         GOVERNORATE]
    )
    IS_SPORTS_TEAM = c.satisfy(
        [wprop.INSTANCE_OF, wprop.SUBCLASS_OF, wprop.PART_OF],
        [SPORTS_TEAM, ATHLETIC_CONFERENCE, SPORTS_ORG,
         RECURRING_SPORTING_EVENT]
    )
    IS_CARDINAL_DIRECTION = c.satisfy([wprop.INSTANCE_OF],
                                      [CARDINAL_DIRECTION])
    IS_POLITICAL_PARTY = c.satisfy([wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
                                   [POLITICAL_PARTY])
    IS_SOCIETY = c.satisfy([wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
                           [SOCIETY, HISTORICAL_PERIOD])
    IS_POSITION = c.satisfy([wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
                            [POSITION])
    IS_BREED = c.satisfy([wprop.INSTANCE_OF, wprop.SUBCLASS_OF], [BREED])
    IS_POLITICAL_ORGANIZATION = c.satisfy(
        [wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
        [POLITICAL_ORGANIZATION, FORM_OF_GOVERNMENT]
    )
    IS_LANDFORM = c.satisfy(
        [wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
        [LANDFORM, TERRITORIAL_ENTITY, GEOGRAPHIC_OBJECT, ASTRO_OBJECT,
         WATERCOURSE, BODY_OF_WATER]
    )
    IS_EVENT_SPORT = c.satisfy(
        [wprop.SUBCLASS_OF, wprop.PART_OF, wprop.INSTANCE_OF],
        [EVENT_SPORTING, SPORT]
    )
    IS_THING = c.satisfy(
        [wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
        [AUTOMOBILE_MODEL, FOOD, DRINK, STYLE, ANIMAL, GOOD, LAW,
         CHEMICAL_SUBSTANCE, SIGN, VEHICLE, PHYSICAL_PHENOMENON,
         PUBLICATION, AUDIOVISUAL, CLASS_SCHEME, WORK,
         ELECTROMAGNETIC_SPECTRUM]
    )
    IS_THOROUGHFARE = c.satisfy([wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
                                [THOROUGHFARE])
    IS_ACTIVITY = c.satisfy([wprop.INSTANCE_OF], [ACTIVITY])
    IS_EVENT = c.satisfy([wprop.INSTANCE_OF, wprop.SUBCLASS_OF],
                         [EVENT_OCCURRENCE])
    IS_BATTLE = c.satisfy([wprop.SUBCLASS_OF, wprop.INSTANCE_OF], [BATTLE])
    IS_KINSHIP = c.satisfy([wprop.INSTANCE_OF], [KINSHIP])
    IS_EPISODE_LIST = c.satisfy([wprop.IS_A_LIST_OF], [EPISODE])
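

# Indexing one of the masks above with a trie values array turns a per-item
# property into a per-link property, which is the shape consumed by
# related_promote_highest and extend_relations below. Minimal sketch,
# assuming the DATA_DIR/LANG_DIR layout from the module docstring:
#
#     c = TypeCollection("data/wikidata", num_names_to_load=0)
#     initialize_globals(c)
#     values = np.load("data/en_trie/trie_index2indices_values.npy")
#     per_link_is_history = IS_HISTORY[values]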


def get_relation_data(collection, relation_paths):
    """Prepare relations for usage inside extend_relations."""
    out = []
    for path in relation_paths:
        promote = path.get("promote", False)
        numpy_path = []
        for step in path["steps"]:
            # each step is either a property name or a
            # (property name, max_usage) pair; max_usage defaults to 1:
            if isinstance(step, str):
                step_name, max_usage = step, 1
            else:
                step_name, max_usage = step
            relation = collection.relation(step_name)
            numpy_path.append((relation.offsets, relation.values, max_usage))
        # inverted relation of the final step in the path:
        inv_relation = collection.get_inverted_relation(step_name).edges() > 0
        out.append((numpy_path, inv_relation, promote))
    return out


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("wikidata")
    parser.add_argument("language_path")
    parser.add_argument("new_language_path")
    parser.add_argument("--steps", type=int, default=3,
                        help="how many times the fixing should be repeated "
                             "(takes about 2 minutes per step; has "
                             "diminishing returns).")
    return parser.parse_args()


def get_trie_properties(trie, offsets, values):
    """Obtain the length of every trigger in the trie."""
    anchor_length = np.zeros(len(values), dtype=np.int32)
    start, end = 0, 0
    for idx, key in enumerate(trie.iterkeys()):
        end = offsets[idx]
        anchor_length[start:end] = len(key)
        start = end
    return anchor_length


def fix(collection, offsets, values, counts, anchor_length,
        num_category_link=8, keep_min=5):
    relations_that_can_extend = [
        {"steps": [wprop.INSTANCE_OF]},
        {"steps": [wprop.INSTANCE_OF, (wprop.SUBCLASS_OF, 2)]},
        {"steps": [wprop.INSTANCE_OF, wprop.FACET_OF]},
        {"steps": [(wprop.SUBCLASS_OF, 3)]},
        {"steps": [wprop.OCCUPATION], "promote": True},
        {"steps": [wprop.POSITION_HELD], "promote": True},
        {"steps": [wprop.PART_OF, wprop.INSTANCE_OF]},
        {"steps": [wprop.SERIES, wprop.INSTANCE_OF]},
        {"steps": [wprop.SERIES, wprop.LOCATION]},
        {"steps": [wprop.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY]},
        {"steps": [wprop.COUNTRY]},
        {"steps": [wprop.CATEGORY_LINK, wprop.CATEGORYS_MAIN_TOPIC]},
        {"steps": [(wprop.CATEGORY_LINK, num_category_link),
                   wprop.FIXED_POINTS]},
        {"steps": [wprop.CATEGORY_LINK, wprop.FIXED_POINTS,
                   wprop.IS_A_LIST_OF]},
        {"steps": [wprop.IS_A_LIST_OF, (wprop.SUBCLASS_OF, 2)]}
    ]
    relation_data = get_relation_data(collection, relations_that_can_extend)
    new_values = values
    # get rid of History of BLAH where link also points to BLAH:
    is_history = IS_HISTORY[new_values]
    is_people_mask = IS_PEOPLE[new_values]
    is_list = IS_LIST_ARTICLE[new_values]
    new_values = related_promote_highest(
        new_values,
        offsets,
        counts,
        condition=is_history,
        alternative=is_people_mask,
        keep_min=keep_min
    )
    unchanged = values == new_values
    is_not_history_or_list = logical_and(
        logical_not(is_history), logical_not(is_list)
    )
    new_values = related_promote_highest(
        new_values,
        offsets,
        counts,
        condition=logical_and(is_history, unchanged),
        alternative=is_not_history_or_list,
        keep_min=keep_min
    )
    is_sport_or_thoroughfare = logical_or(
        IS_EVENT_SPORT, IS_THOROUGHFARE
    )[new_values]
    # delete links triggered by single-character anchors:
    new_values[anchor_length < 2] = -1
    # get rid of shorthand for sports:
    new_values[logical_and(is_sport_or_thoroughfare, anchor_length <= 2)] = -1
    # remove lists of episodes:
    is_episode_list = IS_EPISODE_LIST[new_values]
    new_values[is_episode_list] = -1
    # get rid of "car" -> "Renault Megane", when "car" -> "Car",
    # and "Renault Megane" is instance of "Car":
    is_not_people = logical_not(IS_PEOPLE)[new_values]
    new_values = extend_relations(
        relation_data,
        new_values,
        offsets,
        counts,
        alternative=is_not_people,
        pbar=get_progress_bar("extend_relations", max_value=len(offsets),
                              item="links"),
        keep_min=keep_min
    )
    unchanged = values == new_values
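    # `unchanged` now flags link slots that none of the passes above
    # modified; the pruning step below only deletes a link when no pass
    # managed to improve it.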
    # remove all non-modified values that are
    # not instances of anything, nor subclasses of anything:
    new_values[logical_ands([
        logical_ands([
            collection.relation(wprop.INSTANCE_OF).edges() == 0,
            collection.relation(wprop.SUBCLASS_OF).edges() == 0,
            collection.relation(wprop.PART_OF).edges() == 0,
            collection.relation(wprop.CATEGORY_LINK).edges() == 0
        ])[new_values],
        unchanged
    ])] = -1

    # promote kinship concepts over specific people sharing an anchor
    # (e.g. the relation "father" rather than one particular person):
    is_kinship = IS_KINSHIP[new_values]
    is_human = IS_CHARACTER_HUMAN[new_values]
    new_values = related_promote_highest(
        new_values,
        offsets,
        counts,
        condition=is_human,
        alternative=is_kinship,
        keep_min=keep_min
    )

    # replace elements by a country if a better alternative is
    # present and the link is rare (count < 100):
    should_replace_by_country = logical_ands([
        logical_not(logical_ors([
            IS_POLITICAL_ORGANIZATION,
            IS_CARDINAL_DIRECTION,
            IS_LANGUAGE_ALPHABET,
            IS_COUNTRY,
            IS_PEOPLE_GROUP,
            IS_BREED,
            IS_BATTLE,
            IS_SOCIETY,
            IS_POSITION,
            IS_POLITICAL_PARTY,
            IS_SPORTS_TEAM,
            IS_CHARACTER_HUMAN,
            IS_LANDFORM,
            IS_ACTIVITY
        ]))[new_values],
        counts < 100
    ])
    # turn this into a promote-highest in this order:
    is_country_or_cardinal = [
        IS_CARDINAL_DIRECTION,
        IS_COUNTRY,
        IS_POLITICAL_ORGANIZATION
    ]
    for alternative in is_country_or_cardinal:
        unchanged = values == new_values
        should_replace_by_country = logical_and(
            should_replace_by_country, unchanged
        )
        new_values = related_promote_highest(
            new_values,
            offsets,
            counts,
            condition=should_replace_by_country,
            alternative=alternative[new_values],
            keep_min=keep_min
        )

    new_offsets, new_values, new_counts, location_shift = reduce_values(
        offsets, new_values, counts)
    return (new_offsets, new_values, new_counts), location_shift


def filter_trie(trie, values):
    """Build a new trie containing only the given keys of `trie`."""
    return marisa_trie.Trie((trie.restore_key(value) for value in values))


def remap_trie_offset_array(old_trie, new_trie, offsets_values_counts):
    mapping = np.zeros(len(new_trie), dtype=np.int32)
    t0 = time.time()
    for new_index in range(len(new_trie)):
        mapping[new_index] = old_trie[new_trie.restore_key(new_index)]
    t1 = time.time()
    print("Got mapping from old trie to new trie in %.3fs" % (t1 - t0,))
    ported = []
    for offsets, values, counts in offsets_values_counts:
        new_offsets, new_values, new_counts = remap_offset_array(
            mapping, offsets, values, counts
        )
        ported.append((new_offsets, new_values, new_counts))
    t2 = time.time()
    print("Ported counts and values across tries in %.3fs" % (t2 - t1,))
    return ported


def main():
    args = parse_args()
    if args.new_language_path == args.language_path:
        raise ValueError("new_language_path and language_path must be "
                         "different: cannot generate a fixed trie in "
                         "the same directory as the original trie.")
    c = TypeCollection(args.wikidata, num_names_to_load=0)
    c.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
    original_values = np.load(
        join(args.language_path, "trie_index2indices_values.npy"))
    original_offsets = np.load(
        join(args.language_path, "trie_index2indices_offsets.npy"))
    original_counts = np.load(
        join(args.language_path, "trie_index2indices_counts.npy"))
    original_trie_path = join(args.language_path, 'trie.marisa')
    trie = marisa_trie.Trie().load(original_trie_path)
    initialize_globals(c)

    t0 = time.time()
    old_location_shift = None
    values, offsets, counts = original_values, original_offsets, original_counts
    for step in range(args.steps):
        anchor_length = get_trie_properties(trie, offsets, values)
        (offsets, values, counts), location_shift = fix(
            collection=c,
            offsets=offsets,
            values=values,
            counts=counts,
            anchor_length=anchor_length,
            num_category_link=8
        )
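        # reduce_values (called at the end of fix) merges duplicate articles
        # under the same anchor; location_shift maps each pre-merge link slot
        # to its post-merge slot, or -1 when the link was deleted. E.g. a
        # 4-slot anchor whose first two links collapsed might map to
        # [0, 0, 1, -1] (illustrative values only).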
        if old_location_shift is not None:
            # see where newly shifted values are now pointing
            # to (extra indirection level):
            location_shift = location_shift[old_location_shift]
            location_shift[old_location_shift == -1] = -1
        old_location_shift = location_shift
        pre_reduced_values = values[location_shift]
        pre_reduced_values[location_shift == -1] = -1
        num_changes = int((pre_reduced_values != original_values).sum())
        change_volume = int(
            (original_counts[pre_reduced_values != original_values].sum()))
        print("step %d with %d changes, %d total links" % (
            step, num_changes, change_volume))
    pre_reduced_values = values[location_shift]
    pre_reduced_values[location_shift == -1] = -1
    t1 = time.time()
    num_changes = int((pre_reduced_values != original_values).sum())
    print("Done with link fixing in %.3fs, with %d changes." % (
        t1 - t0, num_changes))

    # show some remappings:
    np.random.seed(1234)
    num_samples = 10
    samples = np.random.choice(
        np.where(
            np.logical_and(
                np.logical_and(
                    pre_reduced_values != original_values,
                    pre_reduced_values != -1
                ),
                original_values != -1
            )
        )[0],
        size=num_samples,
        replace=False
    )
    print("Sample fixes:")
    for index in samples:
        print("    %r (%d) -> %r (%d)" % (
            c.get_name(int(original_values[index])),
            int(original_values[index]),
            c.get_name(int(pre_reduced_values[index])),
            int(pre_reduced_values[index])
        ))
    print("")
    samples = np.random.choice(
        np.where(OffsetArray(values, offsets).edges() == 0)[0],
        size=num_samples,
        replace=False
    )
    print("Sample deletions:")
    for index in samples:
        print("    %r" % (trie.restore_key(int(index)),))

    # prune out anchors where there are no more linked items:
    print("Removing empty anchors from trie...")
    t0 = time.time()
    non_empty_offsets = np.where(
        OffsetArray(values, offsets).edges() != 0)[0]
    fixed_trie = filter_trie(trie, non_empty_offsets)

    contexts_found = true_exists(
        join(args.language_path, "trie_index2contexts_values.npy"))
    if contexts_found:
        contexts_values = np.load(
            join(args.language_path, "trie_index2contexts_values.npy"))
        contexts_offsets = np.load(
            join(args.language_path, "trie_index2contexts_offsets.npy"))
        contexts_counts = np.load(
            join(args.language_path, "trie_index2contexts_counts.npy"))

    # original_values rides along in the counts slot of the second tuple so
    # that it gets reordered identically to pre_reduced_values:
    to_port = [
        (offsets, values, counts),
        (original_offsets, pre_reduced_values, original_values)
    ]
    if contexts_found:
        to_port.append((contexts_offsets, contexts_values, contexts_counts))
    ported = remap_trie_offset_array(trie, fixed_trie, to_port)
    offsets, values, counts = ported[0]
    original_offsets, pre_reduced_values, original_values = ported[1]
    t1 = time.time()
    print("Removed %d empty anchors from trie in %.3fs" % (
        len(trie) - len(fixed_trie), t1 - t0))

    print("Saving...")
    makedirs(args.new_language_path, exist_ok=True)
    np.save(join(args.new_language_path, "trie_index2indices_values.npy"),
            values)
    np.save(join(args.new_language_path, "trie_index2indices_offsets.npy"),
            offsets)
    np.save(join(args.new_language_path, "trie_index2indices_counts.npy"),
            counts)
    if contexts_found:
        contexts_offsets, contexts_values, contexts_counts = ported[2]
        np.save(join(args.new_language_path, "trie_index2contexts_values.npy"),
                contexts_values)
        np.save(join(args.new_language_path, "trie_index2contexts_offsets.npy"),
                contexts_offsets)
        np.save(join(args.new_language_path, "trie_index2contexts_counts.npy"),
                contexts_counts)
    new_trie_path = join(args.new_language_path, 'trie.marisa')
    fixed_trie.save(new_trie_path)
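    # `transition` pairs, for every original link slot, the article id it
    # started from with the id it ended up at after fixing, so downstream
    # consumers can trace each remapping.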
"trie_index2indices_transition_offsets.npy"), original_offsets) print("Done.") if __name__ == "__main__": main()