# extraction/classifiers/type_classifier.py (1,210 lines of code) (raw):
"""
Associate to each entity a type (exclusive membership). Association is imperfect
(e.g. some false positives, false negatives), however the majority of entities
are covered under this umbrella and thus a model can learn to predict several
of the attributes listed below.
"""
from numpy import (
logical_and, logical_or, logical_not, logical_xor, where
)
from wikidata_linker_utils.logic import logical_negate, logical_ors, logical_ands
import wikidata_linker_utils.wikidata_properties as wprop
def wkp(c, name):
    """Look up an English Wikipedia article title in ``c.article2id``.

    The key queried is ``'enwiki/' + name``; the value returned is element
    ``[0][0]`` of whatever is stored under that key (presumably the internal
    index of the matching Wikidata entity — confirm against the collection
    class).

    Args:
        c: object exposing an ``article2id`` mapping.
        name: article title without the ``enwiki/`` prefix, e.g. ``"Human"``.

    Raises:
        Whatever ``c.article2id[...]`` raises for an unknown title
        (``KeyError`` for a plain dict).
    """
    return c.article2id['enwiki/' + name][0][0]
def wkd(c, name):
    """Return ``c.name2index[name]`` for a Wikidata identifier.

    Args:
        c: object exposing a ``name2index`` mapping.
        name: Wikidata id string such as ``"Q16521"``.

    Raises:
        Whatever ``c.name2index[...]`` raises for an unknown id
        (``KeyError`` for a plain dict).
    """
    return c.name2index[name]
def classify(c):
    """Build the type-name -> membership table for the collection ``c``.

    The function works in three stages:

    1. resolve every Wikidata / Wikipedia anchor it needs to an internal
       index through ``wkd`` / ``wkp``;
    2. compute one "raw" membership value per type with ``c.satisfy``;
    3. carve overlapping raw types into narrower ``*_only`` variants with
       ``logical_negate`` and return everything in a dict.

    Args:
        c: the type collection. It must expose ``name2index`` and
            ``article2id`` (used by ``wkd`` / ``wkp``), ``satisfy`` and
            ``relation``. NOTE(review): the values returned by ``satisfy``
            are presumably per-entity boolean arrays — confirm against the
            collection class.

    Returns:
        dict mapping a type name (str) to its membership value. Keys
        prefixed with ``aaa_`` cover Wikimedia-internal pages and
        disconnected items.

    Raises:
        Whatever ``wkd`` / ``wkp`` raise when an anchor id or article title
        is missing from the collection.
    """
    # Default relation set used for most membership tests.
    TRAVERSIBLE_LO = [wprop.INSTANCE_OF, wprop.SUBCLASS_OF, wprop.IS_A_LIST_OF]

    def _is_a(*targets):
        # Membership test over the default relation set.
        return c.satisfy(TRAVERSIBLE_LO, list(targets))

    def _instance_of(*targets):
        # Membership test restricted to INSTANCE_OF.
        return c.satisfy([wprop.INSTANCE_OF], list(targets))

    def _in_category(*categories):
        # Category-link test, limited to a single step.
        return c.satisfy([wprop.CATEGORY_LINK], list(categories), max_steps=1)

    # ------------------------------------------------------------------
    # 1. Anchors.
    # Every lookup is kept (even for anchors not referenced below) so that
    # a missing id still surfaces here exactly as before.
    # ------------------------------------------------------------------
    MALE = wkd(c, "Q6581097")
    FEMALE = wkd(c, "Q6581072")
    HUMAN = wkp(c, "Human")
    TAXON = wkd(c, "Q16521")
    HORSE = wkd(c, "Q726")
    RACE_HORSE = wkd(c, "Q10855242")
    FOSSIL_TAXON = wkd(c, "Q23038290")
    MONOTYPIC_TAXON = wkd(c, "Q310890")
    FOOD = wkp(c, "Food")
    DRINK = wkp(c, "Drink")
    BIOLOGY = wkp(c, "Biology")
    GEOGRAPHICAL_OBJECT = wkd(c, "Q618123")
    LOCATION_GEOGRAPHY = wkd(c, "Q2221906")
    ORGANISATION = wkp(c, "Organization")
    MUSICAL_WORK = wkd(c, "Q2188189")
    AUDIO_VISUAL_WORK = wkd(c, "Q2431196")
    ART_WORK = wkd(c, "Q838948")
    PHYSICAL_OBJECT = wkp(c, "Physical body")
    VALUE = wkd(c, "Q614112")
    TIME_INTERVAL = wkd(c, "Q186081")
    EVENT = wkd(c, "Q1656682")
    POPULATED_PLACE = wkd(c, "Q486972")
    ACTIVITY = wkd(c, "Q1914636")
    PROCESS = wkd(c, "Q3249551")
    BODY_OF_WATER = wkd(c, "Q15324")
    PEOPLE = wkd(c, "Q2472587")
    LANGUAGE = wkd(c, "Q34770")
    ALPHABET = wkd(c, "Q9779")
    SPEECH = wkd(c, "Q861911")
    GAS = wkd(c, "Q11432")
    CHEMICAL_COMPOUND = wkd(c, "Q11173")
    DRUG = wkd(c, "Q8386")
    GEOMETRIC_SHAPE = wkd(c, "Q815741")
    MIND = wkd(c, "Q450")
    TV_STATION = wkd(c, "Q1616075")
    AWARD_CEREMONY = wkd(c, "Q4504495")
    SONG = wkd(c, "Q7366")
    SINGLE = wkd(c, "Q134556")
    CHESS_OPENING = wkd(c, "Q103632")
    BATTLE = wkd(c, "Q178561")
    BLOCKADE = wkd(c, "Q273976")
    MILITARY_OFFENSIVE = wkd(c, "Q2001676")
    DEVELOPMENT_BIOLOGY = wkd(c, "Q213713")
    UNIT_OF_MASS = wkd(c, "Q3647172")
    WATERCOURSE = wkd(c, "Q355304")
    VOLCANO = wkd(c, "Q8072")
    LAKE = wkd(c, "Q23397")
    SEA = wkd(c, "Q165")
    BRAND = wkd(c, "Q431289")
    AUTOMOBILE_MANUFACTURER = wkd(c, "Q786820")
    MOUNTAIN = wkd(c, "Q8502")
    MASSIF = wkd(c, "Q1061151")
    WAR = wkd(c, "Q198")
    CRIME = wkd(c, "Q83267")
    GENE = wkd(c, "Q7187")
    CHROMOSOME = wkd(c, "Q37748")
    DISEASE = wkd(c, "Q12136")
    ASTEROID = wkd(c, "Q3863")
    COMET = wkd(c, "Q3559")
    PLANET = wkd(c, "Q634")
    GALAXY = wkd(c, "Q318")
    ASTRONOMICAL_OBJECT = wkd(c, "Q6999")
    FICTIONAL_ASTRONOMICAL_OBJECT = wkd(c, "Q15831598")
    MATHEMATICAL_OBJECT = wkd(c, "Q246672")
    REGION = wkd(c, "Q82794")
    PHYSICAL_QUANTITY = wkd(c, "Q107715")
    NUMBER = wkd(c, "Q11563")
    NATURAL_PHENOMENON = wkd(c, "Q1322005")
    GEOLOGICAL_FORMATION = wkd(c, "Q736917")
    CURRENCY = wkd(c, "Q8142")
    MONEY = wkd(c, "Q1368")
    LANDFORM = wkd(c, "Q271669")
    COUNTRY = wkd(c, "Q6256")
    FICTIONAL_HUMAN = wkd(c, "Q15632617")
    AWARD = wkd(c, "Q618779")
    RELIGIOUS_TEXT = wkd(c, "Q179461")
    OCCUPATION = wkd(c, "Q12737077")
    PROFESSION = wkd(c, "Q28640")
    POSITION = wkd(c, "Q4164871")
    RELIGION = wkd(c, "Q9174")
    SOFTWARE = wkd(c, "Q7397")
    ELECTRONIC_GAME = wkd(c, "Q2249149")
    GAME = wkd(c, "Q11410")
    VIDEO_GAME_FRANCHISES = wkd(c, "Q7213857")
    TRAIN_STATION = wkd(c, "Q55488")
    BRIDGE = wkd(c, "Q12280")
    AIRPORT = wkd(c, "Q62447")
    SURNAME = wkd(c, "Q101352")
    GIVEN_NAME = wkd(c, "Q202444")
    FEMALE_GIVEN_NAME = wkd(c, "Q11879590")
    MALE_GIVEN_NAME = wkd(c, "Q12308941")
    MOLECULE = wkd(c, "Q11369")
    PROTEIN_FAMILY = wkd(c, "Q417841")
    PROTEIN_DOMAIN = wkd(c, "Q898273")
    MULTIPROTEIN_COMPLEX = wkd(c, "Q420927")
    LAW = wkd(c, "Q7748")
    VEHICLE = wkd(c, "Q42889")
    MODE_OF_TRANSPORT = wkd(c, "Q334166")
    WATERCRAFT = wkd(c, "Q1229765")
    AIRCRAFT = wkd(c, "Q11436")
    ROAD_VEHICLE = wkd(c, "Q1515493")
    AUTOMOBILE_MODEL = wkd(c, "Q3231690")
    AUTOMOBILE = wkd(c, "Q1420")
    TRUCK = wkd(c, "Q43193")
    MOTORCYCLE_MODEL = wkd(c, "Q23866334")
    TANK = wkd(c, "Q12876")
    FIRE_ENGINE = wkd(c, "Q208281")
    AMBULANCE = wkd(c, "Q180481")
    RAILROAD = wkd(c, "Q22667")
    RADIO_PROGRAM = wkd(c, "Q1555508")
    DISCOGRAPHY = wkd(c, "Q273057")
    WEBSITE = wkd(c, "Q35127")
    WEAPON = wkd(c, "Q728")
    PUBLICATION = wkd(c, "Q732577")
    ARTICLE = wkd(c, "Q191067")
    FAMILY = wkd(c, "Q8436")
    FICTIONAL_CHARACTER = wkd(c, "Q95074")
    FACILITY = wkd(c, "Q13226383")
    CONCEPT = wkd(c, "Q151885")
    PROVERB = wkd(c, "Q35102")
    ANATOMICAL_STRUCTURE = wkd(c, "Q4936952")
    BREED = wkd(c, "Q38829")
    PLANT_STRUCTURE = wkd(c, "Q25571752")
    PLANT = wkd(c, "Q756")
    SPECIAL_FIELD = wkd(c, "Q1047113")
    ACADEMIC_DISCIPLINE = wkd(c, "Q11862829")
    TERM = wkd(c, "Q1969448")
    SEXUAL_ORIENTATION = wkd(c, "Q17888")
    PARADIGM = wkd(c, "Q28643")
    LEGAL_CASE = wkd(c, "Q2334719")
    SPORT = wkd(c, "Q349")
    RECURRING_SPORTING_EVENT = wkd(c, "Q18608583")
    ART_GENRE = wkd(c, "Q1792379")
    SPORTING_EVENT = wkd(c, "Q16510064")
    COMIC = wkd(c, "Q1004")
    CHARACTER = wkd(c, "Q3241972")
    PERSON = wkd(c, "Q215627")
    NATIONAL_HERITAGE_SITE = wkd(c, "Q358")
    ESTATE = wkd(c, "Q2186896")
    ELECTION = wkd(c, "Q40231")
    LEGISLATIVE_TERM = wkd(c, "Q15238777")
    COMPETITION = wkd(c, "Q476300")
    LEGAL_ACTION = wkd(c, "Q27095657")
    SEX_TOY = wkd(c, "Q10816")
    MONUMENT = wkd(c, "Q4989906")
    ASSOCIATION_FOOTBALL_POSITION = wkd(c, "Q4611891")
    # Disabled anchors, kept for reference:
    # ICE_HOCKEY_POSITION = wkd(c, "Q18533987")
    # PART_OF_LAND = wkd(c, "Q23001306")
    MUSIC_DOWNLOAD = wkd(c, "Q6473564")
    KINSHIP = wkd(c, "Q171318")
    KIN = wkd(c, "Q21073947")
    PSEUDONYM = wkd(c, "Q61002")
    STOCK_CHARACTER = wkd(c, "Q162244")
    TITLE = wkd(c, "Q4189293")
    DATA_FORMAT = wkd(c, "Q494823")
    ELECTROMAGNETIC_WAVE = wkd(c, "Q11386")
    POSTAL_CODE = wkd(c, "Q37447")
    CLOTHING = wkd(c, "Q11460")
    NATIONALITY = wkd(c, "Q231002")
    BASEBALL_POSITION = wkd(c, "Q1151733")
    AMERICAN_FOOTBALL_POSITIONS = wkd(c, "Q694589")
    POSITION_TEAM_SPORTS = wkd(c, "Q1781513")
    FILE_FORMAT_FAMILY = wkd(c, "Q26085352")
    FILE_FORMAT = wkd(c, "Q235557")
    TAXONOMIC_RANK = wkd(c, "Q427626")
    ORDER_HONOUR = wkd(c, "Q193622")
    BRANCH_OF_SCIENCE = wkd(c, "Q2465832")
    RESEARCH = wkd(c, "Q42240")
    METHOD = wkd(c, "Q1799072")
    ALGORITHM = wkd(c, "Q8366")
    PROPOSITION = wkd(c, "Q108163")
    SPORTSPERSON = wkd(c, "Q2066131")
    LAKES_MINESOTTA = wkd(c, "Q8580663")
    NAMED_PASSENGER_TRAIN_INDIA = wkd(c, "Q9260591")
    TOWNSHIPS_MISOURI = wkd(c, "Q8861637")
    RACE_ETHNICITY_USA = wkd(c, "Q2035701")
    RECORD_CHART = wkd(c, "Q373899")
    SINGLE_ENGINE_AIRCRAFT = wkd(c, "Q7405339")
    SIGNIFICANT_OTHER = wkd(c, "Q841509")
    BILLBOARDS = wkd(c, "Q19754079")
    # NOTE(review): same id as BILLBOARDS just above — looks like a
    # copy/paste slip; confirm the intended Wikidata id for radio stations.
    RADIO_STATION = wkd(c, "Q19754079")
    RADIO_STATION2 = wkd(c, "Q1474493")
    NOBLE_TITLE = wkd(c, "Q216353")
    HOUSES_NATIONAL_REGISTER_ARKANSAS = wkd(c, "Q8526394")
    CLADE = wkd(c, "Q713623")
    BOARD_GAMES = wkd(c, "Q131436")
    CLAN = wkd(c, "Q211503")
    ACCIDENT = wkd(c, "Q171558")
    MASSACRE = wkd(c, "Q3199915")
    TORNADO = wkd(c, "Q8081")
    NATURAL_DISASTER = wkd(c, "Q8065")
    SPORTS_TEAM = wkd(c, "Q12973014")
    BAND_ROCK_AND_POP = wkd(c, "Q215380")
    ORGANIZATION_OTHER = wkd(c, "Q43229")
    POLITICAL_PARTY = wkd(c, "Q7278")
    SPECIES = wkd(c, "Q7432")
    CHEMICAL_SUBSTANCE = wkd(c, "Q79529")
    THREATENED_SPECIES = wkd(c, "Q515487")
    HYPOTHETICAL_SPECIES = wkd(c, "Q5961273")
    CONFLICT = wkd(c, "Q180684")
    PRIVATE_USE_AREAS = wkd(c, "Q11152836")
    BARONETCIES_IN_UK = wkd(c, "Q8290061")
    EXTINCT_BARONETCIES_ENGLAND = wkd(c, "Q8432223")
    EXTINCT_BARONETCIES_UK = wkd(c, "Q8432226")
    WIKIPEDIA_DISAMBIGUATION = wkd(c, "Q4167410")
    WIKIPEDIA_TEMPLATE_NAMESPACE = wkd(c, "Q11266439")
    WIKIPEDIA_LIST = wkd(c, "Q13406463")
    WIKIPEDIA_PROJECT_PAGE = wkd(c, "Q14204246")
    WIKIMEDIA_CATEGORY_PAGE = wkd(c, "Q4167836")
    WIKIPEDIA_USER_LANGUAGE_TEMPLATE = wkd(c, "Q19842659")
    WIKIDATA_PROPERTY = wkd(c, "Q18616576")
    COLLEGIATE_ATHLETICS_PROGRAM = wkd(c, "Q5146583")
    SPORTS_TRANSFER_AF = wkd(c, "Q1811518")
    DEMOGRAPHICS_OF_NORWAY = wkd(c, "Q7664203")
    DOCUMENT = wkd(c, "Q49848")
    BASIC_STAT_UNIT_NORWAY = wkd(c, "Q4580177")
    PUBLIC_TRANSPORT = wkd(c, "Q178512")
    HAZARD = wkd(c, "Q1132455")
    BASEBALL_RULES = wkd(c, "Q1153773")
    HIT_BASEBALL = wkd(c, "Q713493")
    # NOTE(review): same id as BASEBALL_RULES — confirm this is intended.
    OUT_BASEBALL = wkd(c, "Q1153773")
    LAWS_OF_ASSOCIATION_FOOTBALL = wkd(c, "Q7215850")
    CRICKET_LAWS_AND_REGULATION = wkd(c, "Q8427034")
    MEASUREMENTS_OF_POVERTY = wkd(c, "Q8614855")
    PROFESSIONAL_WRESTLING_MATCH_TYPES = wkd(c, "Q679633")
    CITATION = wkd(c, "Q1713")
    INTERNATIONAL_RELATIONS = wkd(c, "Q166542")
    WORLD_VIEW = wkd(c, "Q49447")
    ROCK_GEOLOGY = wkd(c, "Q8063")
    BASEBALL_STATISTIC = wkd(c, "Q8291081")
    BASEBALL_STATISTICS = wkd(c, "Q809898")
    TRAIN_ACCIDENT = wkd(c, "Q1078765")
    CIRCUS_SKILLS = wkd(c, "Q4990963")
    FOLKLORE = wkd(c, "Q36192")
    NEWS_BUREAU = wkd(c, "Q19824398")
    RECESSION = wkd(c, "Q176494")
    NYC_BALLET = wkd(c, "Q1336942")
    SPORTS_RECORD = wkd(c, "Q1241356")
    WINGSPAN = wkd(c, "Q245097")
    WIN_LOSS_RECORD_PITCHING = wkd(c, "Q1202506")
    CRICKET_TERMINOLOGY = wkd(c, "Q8427141")
    UNION_ARMY = wkd(c, "Q1752901")
    POPULATION = wkd(c, "Q33829")
    WIND = wkd(c, "Q8094")
    TORPEDO_TUBE = wkd(c, "Q1330003")
    WEAPONS_PLATFORM = wkd(c, "Q7978115")
    COLOR = wkd(c, "Q1075")
    SOCIAL_SCIENCE = wkd(c, "Q34749")
    # Same id as ACADEMIC_DISCIPLINE; used in the SUBCLASS_OF half of
    # is_concept below.
    DISCIPLINE_ACADEMIA = wkd(c, "Q11862829")
    FORMAL_SCIENCE = wkd(c, "Q816264")
    ASPHALT = wkd(c, "Q167510")
    TALK_RADIO = wkd(c, "Q502319")
    ART_MOVEMENT = wkd(c, "Q968159")
    IDEOLOGY = wkd(c, "Q7257")

    # ------------------------------------------------------------------
    # 2. Raw membership tests.
    # ------------------------------------------------------------------
    is_sports_terminology = logical_or(
        _is_a(OUT_BASEBALL, HIT_BASEBALL),
        _in_category(
            BASEBALL_RULES,
            LAWS_OF_ASSOCIATION_FOOTBALL,
            CRICKET_LAWS_AND_REGULATION,
            PROFESSIONAL_WRESTLING_MATCH_TYPES,
            CRICKET_TERMINOLOGY,
        ),
    )
    is_accident = _is_a(ACCIDENT)
    # Taxa deliberately skip SUBCLASS_OF: only INSTANCE_OF / IS_A_LIST_OF.
    is_taxon = c.satisfy(
        [wprop.INSTANCE_OF, wprop.IS_A_LIST_OF],
        [
            TAXON, FOSSIL_TAXON, MONOTYPIC_TAXON, HORSE, RACE_HORSE, CLADE,
            SPECIES, THREATENED_SPECIES, HYPOTHETICAL_SPECIES,
        ],
    )
    is_breed = _is_a(BREED)
    is_taxon_or_breed = logical_or(is_taxon, is_breed)
    is_human = _is_a(HUMAN, FICTIONAL_HUMAN)
    is_country = _is_a(COUNTRY)
    is_people = _is_a(PEOPLE, NATIONALITY, SPORTS_TRANSFER_AF, POPULATION)
    is_populated_place = logical_or(
        _is_a(POPULATED_PLACE),
        _in_category(TOWNSHIPS_MISOURI),
    )
    is_organization = _is_a(
        POLITICAL_PARTY,
        COLLEGIATE_ATHLETICS_PROGRAM,
        ORGANIZATION_OTHER,
        ORGANISATION,
        SPORTS_TEAM,
        BAND_ROCK_AND_POP,
        NEWS_BUREAU,
        NYC_BALLET,
        UNION_ARMY,
    )
    is_position = _is_a(
        POSITION,
        OCCUPATION,
        POSITION_TEAM_SPORTS,
        AMERICAN_FOOTBALL_POSITIONS,
        ASSOCIATION_FOOTBALL_POSITION,
        BASEBALL_POSITION,
        # ICE_HOCKEY_POSITION,
        SPORTSPERSON,
    )
    is_kinship = _is_a(KINSHIP)
    is_kin = c.satisfy([wprop.SUBCLASS_OF, wprop.IS_A_LIST_OF], [KIN])
    is_title = logical_or(
        _is_a(TITLE, NOBLE_TITLE),
        _in_category(
            BARONETCIES_IN_UK, EXTINCT_BARONETCIES_UK, EXTINCT_BARONETCIES_ENGLAND
        ),
    )
    is_art_work = _is_a(ART_WORK, COMIC)
    is_audio_visual_work = _is_a(AUDIO_VISUAL_WORK, TV_STATION)
    is_fictional_character = _is_a(FICTIONAL_CHARACTER)
    is_name = _is_a(
        GIVEN_NAME, SURNAME, FEMALE_GIVEN_NAME, MALE_GIVEN_NAME, PSEUDONYM
    )
    is_stock_character = c.satisfy(
        [wprop.INSTANCE_OF, wprop.IS_A_LIST_OF], [STOCK_CHARACTER]
    )
    is_family = _is_a(FAMILY, CLAN)
    is_award = _is_a(AWARD)
    is_electromagnetic_wave = _is_a(ELECTROMAGNETIC_WAVE)
    is_geographical_object = _is_a(
        GEOGRAPHICAL_OBJECT,
        BODY_OF_WATER,
        LOCATION_GEOGRAPHY,
        GEOLOGICAL_FORMATION,
        NATIONAL_HERITAGE_SITE,
        ESTATE,
        # PART_OF_LAND,
        PRIVATE_USE_AREAS,
    )
    is_postal_code = _is_a(POSTAL_CODE)
    is_person = _is_a(PERSON)
    # "person" minus every more specific person-like type, plus stock
    # characters.
    is_person_only = logical_or(
        logical_negate(
            is_person,
            [
                is_human,
                is_people,
                is_populated_place,
                is_organization,
                is_position,
                is_title,
                is_kinship,
                is_kin,
                is_country,
                is_geographical_object,
                is_art_work,
                is_audio_visual_work,
                is_fictional_character,
                is_name,
                is_family,
                is_award,
            ],
        ),
        is_stock_character,
    )
    is_male = c.satisfy([wprop.SEX_OR_GENDER], [MALE])
    is_female = c.satisfy([wprop.SEX_OR_GENDER], [FEMALE])
    is_human_male = logical_and(is_human, is_male)
    is_human_female = logical_and(is_human, is_female)
    is_musical_work = _is_a(MUSICAL_WORK, DISCOGRAPHY)
    is_song = _is_a(SONG, SINGLE)
    is_radio_program = _is_a(
        RADIO_PROGRAM, RADIO_STATION, RADIO_STATION2, TALK_RADIO
    )
    is_sexual_orientation = _is_a(SEXUAL_ORIENTATION)
    is_taxonomic_rank = _instance_of(TAXONOMIC_RANK)
    is_order = _is_a(ORDER_HONOUR)
    is_train_station = _is_a(TRAIN_STATION)
    is_bridge = _is_a(BRIDGE)
    is_airport = _is_a(AIRPORT)
    is_sex_toy = _is_a(SEX_TOY)
    is_monument = _is_a(MONUMENT)
    is_physical_object = _is_a(
        PHYSICAL_OBJECT, BOARD_GAMES, ELECTRONIC_GAME, GAME, ROCK_GEOLOGY, ASPHALT
    )
    is_clothing = _is_a(CLOTHING)
    is_mathematical_object = _is_a(MATHEMATICAL_OBJECT)
    is_physical_quantity = logical_or(
        _is_a(
            PHYSICAL_QUANTITY,
            BASIC_STAT_UNIT_NORWAY,
            SPORTS_RECORD,
            WINGSPAN,
            WIN_LOSS_RECORD_PITCHING,
            BASEBALL_STATISTICS,
        ),
        _in_category(
            DEMOGRAPHICS_OF_NORWAY, MEASUREMENTS_OF_POVERTY, BASEBALL_STATISTIC
        ),
    )
    is_number = _is_a(NUMBER)
    is_astronomical_object = _is_a(
        ASTEROID,
        COMET,
        PLANET,
        GALAXY,
        ASTRONOMICAL_OBJECT,
        FICTIONAL_ASTRONOMICAL_OBJECT,
    )
    is_hazard = _is_a(HAZARD, TRAIN_ACCIDENT)
    is_date = _is_a(TIME_INTERVAL)
    is_algorithm = _is_a(ALGORITHM)
    is_value = _is_a(VALUE)
    is_currency = _is_a(CURRENCY, MONEY)
    is_event = _is_a(EVENT, RECESSION)
    is_election = _is_a(ELECTION)
    is_legislative_term = _is_a(LEGISLATIVE_TERM)
    is_activity = c.satisfy(
        [wprop.INSTANCE_OF, wprop.IS_A_LIST_OF],
        [ACTIVITY, MUSIC_DOWNLOAD, CIRCUS_SKILLS],
    )
    is_activity_subclass = c.satisfy(
        [wprop.SUBCLASS_OF], [ACTIVITY, MUSIC_DOWNLOAD, CIRCUS_SKILLS]
    )
    is_food = c.satisfy(
        [wprop.INSTANCE_OF, wprop.PART_OF, wprop.SUBCLASS_OF], [FOOD, DRINK]
    )
    is_wikidata_prop = _is_a(WIKIDATA_PROPERTY)
    is_wikipedia_disambiguation = _instance_of(WIKIPEDIA_DISAMBIGUATION)
    is_wikipedia_template_namespace = _instance_of(WIKIPEDIA_TEMPLATE_NAMESPACE)
    is_wikipedia_list = _instance_of(WIKIPEDIA_LIST)
    is_wikipedia_project_page = _instance_of(WIKIPEDIA_PROJECT_PAGE)
    is_wikipedia_user_language_template = _instance_of(
        WIKIPEDIA_USER_LANGUAGE_TEMPLATE
    )
    is_wikimedia_category_page = _instance_of(WIKIMEDIA_CATEGORY_PAGE)
    is_legal_case = _is_a(LEGAL_CASE)
    is_sport = _is_a(SPORT)
    is_data_format = _is_a(DATA_FORMAT, FILE_FORMAT_FAMILY, FILE_FORMAT)
    is_research_method = _is_a(RESEARCH, METHOD, RACE_ETHNICITY_USA)
    is_proposition = _is_a(PROPOSITION)
    is_record_chart = _is_a(RECORD_CHART, BILLBOARDS)
    is_international_relations = _is_a(INTERNATIONAL_RELATIONS)
    is_union = _is_a(SIGNIFICANT_OTHER)
    is_recurring_sporting_event = _is_a(RECURRING_SPORTING_EVENT)
    # Sport events: either a sport item attached (PART_OF / IS_A_LIST_OF)
    # to one of the recurring sporting events found above, or a direct
    # sporting event / competition.
    is_sport_event = logical_or(
        logical_and(
            is_sport,
            c.satisfy(
                [wprop.PART_OF, wprop.IS_A_LIST_OF],
                where(is_recurring_sporting_event)[0],
            ),
        ),
        _is_a(SPORTING_EVENT, COMPETITION),
    )
    is_genre = _is_a(ART_GENRE, ART_MOVEMENT)
    is_landform = _is_a(LANDFORM)
    is_language = _is_a(LANGUAGE)
    is_alphabet = _is_a(ALPHABET)
    is_railroad = logical_or(
        _is_a(RAILROAD),
        _in_category(NAMED_PASSENGER_TRAIN_INDIA),
    )
    is_speech = _is_a(SPEECH)
    is_language_only = logical_negate(is_language, [is_speech])
    is_alphabet_only = logical_negate(is_alphabet, [is_speech, is_language])
    is_war = _is_a(WAR)
    is_battle = _is_a(BATTLE, BLOCKADE, MILITARY_OFFENSIVE, CONFLICT, MASSACRE)
    is_crime = _is_a(CRIME)
    is_gas = _is_a(GAS)
    is_chemical_compound = _is_a(CHEMICAL_COMPOUND, DRUG, CHEMICAL_SUBSTANCE)
    is_chemical_compound_only = logical_negate(is_chemical_compound, [is_food])
    is_gas_only = logical_negate(is_gas, [is_chemical_compound])
    is_geometric_shape = _is_a(GEOMETRIC_SHAPE)
    is_award_ceremony = _is_a(AWARD_CEREMONY)
    is_strategy = _is_a(CHESS_OPENING)
    is_gene = _is_a(GENE, CHROMOSOME)
    is_character = _is_a(CHARACTER)
    is_law = _is_a(LAW)
    is_legal_action = _is_a(LEGAL_ACTION)
    is_facility = logical_or(
        _is_a(FACILITY),
        _in_category(HOUSES_NATIONAL_REGISTER_ARKANSAS),
    )
    is_molecule = _is_a(
        MOLECULE, PROTEIN_FAMILY, PROTEIN_DOMAIN, MULTIPROTEIN_COMPLEX
    )
    is_disease = _is_a(DISEASE)
    is_mind = _is_a(MIND)
    is_religion = _is_a(RELIGION)
    is_natural_phenomenon = _is_a(NATURAL_PHENOMENON, NATURAL_DISASTER, WIND)
    is_anatomical_structure = _is_a(ANATOMICAL_STRUCTURE)
    # Plants additionally follow PARENT_TAXON.
    is_plant = c.satisfy(
        TRAVERSIBLE_LO + [wprop.PARENT_TAXON], [PLANT_STRUCTURE, PLANT]
    )
    is_region = _is_a(REGION)
    is_software = logical_or(
        _is_a(SOFTWARE),
        _in_category(VIDEO_GAME_FRANCHISES),
    )
    is_website = _is_a(WEBSITE)
    is_river = logical_and(_is_a(WATERCOURSE), is_geographical_object)
    is_lake = logical_or(
        logical_and(_is_a(LAKE), is_geographical_object),
        _in_category(LAKES_MINESOTTA),
    )
    is_sea = logical_and(_is_a(SEA), is_geographical_object)
    is_volcano = logical_and(_is_a(VOLCANO), is_geographical_object)
    is_development_biology = c.satisfy(
        [wprop.PART_OF, wprop.SUBCLASS_OF, wprop.INSTANCE_OF],
        [DEVELOPMENT_BIOLOGY, BIOLOGY],
    )
    is_unit_of_mass = _is_a(UNIT_OF_MASS)
    is_vehicle = _is_a(VEHICLE, MODE_OF_TRANSPORT, PUBLIC_TRANSPORT)
    is_watercraft = _is_a(WATERCRAFT)
    is_aircraft = logical_or(
        _is_a(AIRCRAFT),
        _in_category(SINGLE_ENGINE_AIRCRAFT),
    )
    is_road_vehicle = _is_a(
        ROAD_VEHICLE,
        TANK,
        FIRE_ENGINE,
        AMBULANCE,
        AUTOMOBILE_MODEL,
        MOTORCYCLE_MODEL,
    )
    is_weapon = _is_a(WEAPON, TORPEDO_TUBE, WEAPONS_PLATFORM)
    is_book_magazine_article_proverb = _is_a(
        PUBLICATION,
        ARTICLE,
        RELIGIOUS_TEXT,
        PROVERB,
        DOCUMENT,
        CITATION,
        FOLKLORE,
    )
    is_brand = _is_a(BRAND)
    is_concept = logical_or(
        _instance_of(
            TERM, ACADEMIC_DISCIPLINE, SPECIAL_FIELD, BRANCH_OF_SCIENCE, WORLD_VIEW
        ),
        c.satisfy(
            [wprop.SUBCLASS_OF],
            [SOCIAL_SCIENCE, DISCIPLINE_ACADEMIA, FORMAL_SCIENCE, IDEOLOGY],
        ),
    )
    is_color = _is_a(COLOR)
    is_paradigm = _is_a(PARADIGM)
    is_vehicle_brand = logical_or(
        logical_and(
            c.satisfy([wprop.PRODUCT_OR_MATERIAL_PRODUCED], [AUTOMOBILE, TRUCK]),
            is_brand,
        ),
        _is_a(AUTOMOBILE_MANUFACTURER),
    )
    is_mountain_massif = logical_and(
        _is_a(MOUNTAIN, MASSIF), is_geographical_object
    )

    # ------------------------------------------------------------------
    # 3. "*_only" variants: a raw type with overlapping types removed.
    # Repeated entries in the original exclusion lists were dropped;
    # NOTE(review): this assumes logical_negate treats a repeated
    # exclusion as a no-op (AND-NOT fold) — confirm in
    # wikidata_linker_utils.logic.
    # ------------------------------------------------------------------
    is_mountain_only = logical_negate(is_mountain_massif, [is_volcano])
    is_physical_object_only = logical_negate(
        is_physical_object,
        [
            is_audio_visual_work,
            is_art_work,
            is_musical_work,
            is_geographical_object,
            is_currency,
            is_gas,
            is_clothing,
            is_chemical_compound,
            is_electromagnetic_wave,
            is_song,
            is_food,
            is_character,
            is_law,
            is_software,
            is_website,
            is_vehicle,
            is_lake,
            is_landform,
            is_railroad,
            is_airport,
            is_aircraft,
            is_watercraft,
            is_sex_toy,
            is_data_format,
            is_date,
            is_research_method,
            is_sport,
            is_brand,
            is_vehicle_brand,
            is_road_vehicle,
            is_radio_program,
            is_weapon,
            is_book_magazine_article_proverb,
            is_organization,
            is_facility,
            is_anatomical_structure,
            is_gene,
            is_monument,
        ],
    )
    is_musical_work_only = logical_negate(is_musical_work, [is_song])
    is_geographical_object_only = logical_negate(
        is_geographical_object,
        [
            is_river,
            is_lake,
            is_sea,
            is_volcano,
            is_mountain_only,
            is_region,
            is_monument,
            is_country,
            is_facility,
            is_food,
            is_airport,
            is_bridge,
            is_train_station,
        ],
    )
    is_event_election_only = logical_negate(
        logical_ors([is_event, is_election, is_accident]),
        [is_award_ceremony, is_war, is_natural_phenomenon],
    )
    is_region_only = logical_negate(
        is_region,
        [
            is_populated_place,
            is_country,
            is_lake,
            is_river,
            is_sea,
            is_volcano,
            is_mountain_only,
        ],
    )
    is_astronomical_object_only = logical_negate(
        is_astronomical_object, [is_geographical_object]
    )
    is_date_only = logical_negate(is_date, [is_strategy, is_development_biology])
    is_development_biology_date = logical_and(is_development_biology, is_date)
    is_value_only = logical_negate(
        is_value,
        [
            is_unit_of_mass,
            is_event,
            is_election,
            is_currency,
            is_number,
            is_physical_quantity,
            is_award,
            is_date,
            is_postal_code,
        ],
    )
    is_activity_subclass_only = logical_negate(
        logical_or(is_activity_subclass, is_activity),
        [
            is_crime,
            is_war,
            is_chemical_compound,
            is_gene,
            is_molecule,
            is_mathematical_object,
            is_sport,
            is_sport_event,
            is_event,
            is_paradigm,
            is_position,
            is_title,
            is_algorithm,
            is_order,
            is_organization,
            is_research_method,
            is_proposition,
            is_taxonomic_rank,
            is_election,
            is_genre,
            is_concept,
        ],
    )
    is_crime_only = logical_negate(is_crime, [is_war])
    is_number_only = logical_negate(is_number, [is_physical_quantity])
    is_molecule_only = logical_negate(is_molecule, [is_gene, is_chemical_compound])

    # Vehicles.
    is_vehicle_only = logical_negate(
        is_vehicle, [is_watercraft, is_aircraft, is_road_vehicle]
    )
    is_watercraft_only = logical_negate(is_watercraft, [is_aircraft])
    is_road_vehicle_only = logical_negate(
        is_road_vehicle, [is_aircraft, is_watercraft]
    )

    # Remove groups that have occupations from mathematical objects.
    is_object_with_occupation = c.satisfy(
        [wprop.INSTANCE_OF, wprop.OCCUPATION], [OCCUPATION, PROFESSION, POSITION]
    )
    is_mathematical_object_only = logical_negate(
        is_mathematical_object,
        [
            is_geometric_shape,
            is_physical_quantity,
            is_number,
            is_object_with_occupation,
            is_landform,
        ],
    )
    is_organization_only = logical_negate(
        is_organization,
        [is_country, is_geographical_object, is_family, is_people],
    )
    is_art_work_only = logical_negate(
        is_art_work,
        [is_musical_work, is_audio_visual_work, is_sex_toy, is_monument],
    )
    is_software_only = logical_negate(
        is_software, [is_language, is_organization, is_website]
    )
    is_website_only = logical_negate(is_website, [is_organization, is_language])
    is_taxon_or_breed_only = logical_negate(is_taxon_or_breed, [is_human, is_plant])
    is_human_only = logical_negate(
        is_human, [is_male, is_female, is_kin, is_kinship, is_title]
    )
    is_weapon_only = logical_negate(is_weapon, [is_software, is_website, is_vehicle])
    is_book_magazine_article_proverb_only = logical_negate(
        is_book_magazine_article_proverb,
        [
            is_software,
            is_website,
            is_musical_work,
            is_song,
            is_law,
            is_legal_action,
        ],
    )
    is_fictional_character_only = logical_negate(
        is_fictional_character, [is_human, is_stock_character]
    )
    is_battle_only = logical_negate(is_battle, [is_war, is_crime])
    is_brand_only = logical_negate(
        is_brand,
        [
            is_vehicle,
            is_aircraft,
            is_watercraft,
            is_website,
            is_software,
            is_vehicle_brand,
        ],
    )
    is_vehicle_brand_only = logical_negate(
        is_vehicle_brand,
        [is_vehicle, is_aircraft, is_watercraft, is_website, is_software],
    )
    is_concept_paradigm_proposition_only = logical_negate(
        logical_ors([is_concept, is_paradigm, is_proposition]),
        [
            is_physical_object,
            is_physical_quantity,
            is_software,
            is_website,
            is_color,
            is_vehicle,
            is_electromagnetic_wave,
            is_brand,
            is_vehicle_brand,
            is_currency,
            is_fictional_character,
            is_human,
            is_aircraft,
            is_geographical_object,
            is_geometric_shape,
            is_mathematical_object,
            is_musical_work,
            is_mountain_massif,
            is_lake,
            is_landform,
            is_language,
            is_anatomical_structure,
            is_book_magazine_article_proverb,
            is_development_biology,
            is_plant,
            is_sexual_orientation,
            is_genre,
            is_legislative_term,
        ],
    )
    is_anatomical_structure_only = logical_negate(
        is_anatomical_structure, [is_plant]
    )
    is_facility_only = logical_negate(
        is_facility,
        [
            is_train_station,
            is_aircraft,
            is_airport,
            is_bridge,
            is_vehicle,
            is_astronomical_object,
            is_railroad,
            is_monument,
        ],
    )
    is_wikipedia_list_only = logical_negate(
        is_wikipedia_list,
        [
            is_activity_subclass,
            is_alphabet,
            is_art_work,
            is_astronomical_object,
            is_audio_visual_work,
            is_award,
            is_character,
            is_chemical_compound,
            is_color,
            is_currency,
            is_disease,
            is_election,
            is_electromagnetic_wave,
            is_facility,
            is_fictional_character,
            is_gene,
            is_genre,
            is_geographical_object,
            is_human,
            is_language,
            is_law,
            is_legal_action,
            is_legal_case,
            is_legislative_term,
            is_mathematical_object,
            is_mind,
            is_people,
            is_person,
            is_physical_object,
            is_populated_place,
            is_position,
            is_region,
            is_religion,
            is_research_method,
            is_sexual_orientation,
            is_software,
            is_speech,
            is_sport,
            is_sport_event,
            is_stock_character,
            is_strategy,
            is_taxon_or_breed,
            is_value,
            is_vehicle,
            is_wikidata_prop,
            is_weapon,
        ],
    )
    is_sport_only = logical_negate(is_sport, [is_sport_event])
    is_legal_action_only = logical_negate(is_legal_action, [is_law, is_election])
    is_genre_only = logical_negate(
        is_genre,
        [
            is_physical_object,
            is_audio_visual_work,
            is_art_work,
            is_book_magazine_article_proverb,
            is_concept,
        ],
    )
    is_plant_only = logical_negate(is_plant, [is_food, is_human, is_organization])
    is_kinship_kin_only = logical_negate(
        logical_or(is_kinship, is_kin), [is_family]
    )
    is_position_only = logical_negate(is_position, [is_organization, is_human])
    is_radio_program_only = logical_negate(is_radio_program, [is_audio_visual_work])
    # NOTE(review): computed but never exported — the "taxonomic_rank" key
    # below uses the raw is_taxonomic_rank. Confirm which one is intended.
    is_taxonomic_rank_only = logical_negate(is_taxonomic_rank, [is_order])
    is_research_method_only = logical_negate(
        is_research_method,
        [
            is_audio_visual_work,
            is_book_magazine_article_proverb,
            is_art_work,
            is_concept,
            is_crime,
            is_war,
            is_algorithm,
            is_law,
            is_legal_action,
            is_legal_case,
        ],
    )
    # NOTE(review): computed but never exported — the "algorithm" key below
    # uses the raw is_algorithm. Confirm which one is intended.
    is_algorithm_only = logical_negate(is_algorithm, [is_concept, is_paradigm])
    is_union_only = logical_negate(is_union, [is_kinship, is_human, is_person])

    # Wikidata items that are disconnected: no edges in any of the four
    # relations below.
    no_instance_subclass_or_cat_link = logical_ands(
        [
            c.relation(relation_name).edges() == 0
            for relation_name in [
                wprop.PART_OF,
                wprop.INSTANCE_OF,
                wprop.SUBCLASS_OF,
                wprop.CATEGORY_LINK,
            ]
        ]
    )
    is_sports_terminology_only = logical_negate(
        is_sports_terminology,
        [
            is_organization,
            is_human,
            is_person,
            is_activity,
            is_title,
            is_physical_quantity,
        ],
    )

    # ------------------------------------------------------------------
    # Result table. Key spelling and insertion order match the original.
    # ------------------------------------------------------------------
    out = {
        "aaa_wikidata_prop": is_wikidata_prop,
        "aaa_wikipedia_disambiguation": is_wikipedia_disambiguation,
        "aaa_wikipedia_template_namespace": is_wikipedia_template_namespace,
        "aaa_wikipedia_user_language_template": is_wikipedia_user_language_template,
        "aaa_wikipedia_list": is_wikipedia_list_only,
        "aaa_wikipedia_project_page": is_wikipedia_project_page,
        "aaa_wikimedia_category_page": is_wikimedia_category_page,
        "aaa_no_instance_subclass_or_link": no_instance_subclass_or_cat_link,
        "taxon": is_taxon_or_breed_only,
        "human_male": is_human_male,
        "human_female": is_human_female,
        "human": is_human_only,
        "fictional_character": is_fictional_character_only,
        "people": is_people,
        "language": is_language_only,
        "alphabet": is_alphabet_only,
        "speech": is_speech,
        "gas": is_gas_only,
        "gene": is_gene,
        "molecule": is_molecule_only,
        "astronomical_object": is_astronomical_object_only,
        "disease": is_disease,
        "mind": is_mind,
        "song": is_song,
        "radio_program": is_radio_program_only,
        "law": is_law,
        "legal_action": is_legal_action_only,
        "book_magazine_article": is_book_magazine_article_proverb_only,
        "chemical_compound": is_chemical_compound_only,
        "geometric_shape": is_geometric_shape,
        "mathematical_object": is_mathematical_object_only,
        "physical_quantity": is_physical_quantity,
        "number": is_number_only,
        "geographical_object": is_geographical_object_only,
        "train_station": is_train_station,
        "railroad": is_railroad,
        "concept": is_concept_paradigm_proposition_only,
        "genre": is_genre_only,
        "sexual_orientation": is_sexual_orientation,
        "bridge": is_bridge,
        "airport": is_airport,
        "river": is_river,
        "lake": is_lake,
        "sea": is_sea,
        "weapon": is_weapon_only,
        "region": is_region_only,
        "country": is_country,
        "software": is_software_only,
        "website": is_website_only,
        "volcano": is_volcano,
        "mountain": is_mountain_only,
        "religion": is_religion,
        "organization": is_organization_only,
        "musical_work": is_musical_work_only,
        "other_art_work": is_art_work_only,
        "audio_visual_work": is_audio_visual_work,
        "physical_object": is_physical_object_only,
        "record_chart": is_record_chart,
        "clothing": is_clothing,
        "plant": is_plant_only,
        "anatomical_structure": is_anatomical_structure_only,
        "facility": is_facility_only,
        "monument": is_monument,
        "vehicle": is_vehicle_only,
        "watercraft": is_watercraft_only,
        "road_vehicle": is_road_vehicle_only,
        "vehicle_brand": is_vehicle_brand_only,
        "brand": is_brand_only,
        "aircraft": is_aircraft,
        "legal_case": is_legal_case,
        "position": is_position_only,
        "person_role": is_person_only,
        "populated_place": is_populated_place,
        "value": is_value_only,
        "unit_of_mass": is_unit_of_mass,
        "currency": is_currency,
        "postal_code": is_postal_code,
        "name": is_name,
        "data_format": is_data_format,
        "character": is_character,
        "family": is_family,
        "sport": is_sport_only,
        "taxonomic_rank": is_taxonomic_rank,
        "sex_toy": is_sex_toy,
        "legislative_term": is_legislative_term,
        "sport_event": is_sport_event,
        "date": is_date_only,
        "kinship": is_kinship_kin_only,
        "union": is_union_only,
        "research": is_research_method_only,
        "title": is_title,
        "hazard": is_hazard,
        "color": is_color,
        "sports_terminology": is_sports_terminology_only,
        "developmental_biology_period": is_development_biology_date,
        "strategy": is_strategy,
        "event": is_event_election_only,
        "natural_phenomenon": is_natural_phenomenon,
        "electromagnetic_wave": is_electromagnetic_wave,
        "war": is_war,
        "award": is_award,
        "crime": is_crime_only,
        "battle": is_battle_only,
        "international_relations": is_international_relations,
        "food": is_food,
        "algorithm": is_algorithm,
        "activity": is_activity_subclass_only,
        "award_ceremony": is_award_ceremony,
    }
    # Diagnostic kept from the original (disabled): report the list pages
    # not claimed by any other type.
    # is_other = logical_not(logical_ors([val for key, val in out.items() if key != "aaa_wikipedia_list"]))
    # c.class_report([wprop.IS_A_LIST_OF, wprop.CATEGORY_LINK], logical_and(
    #     is_other,
    #     is_wikipedia_list_only
    # ), name="remaining lists")
    return out