extraction/classifiers/location_classifier.py (102 lines of code) (raw):
"""
Obtain a coarse-grained classification of places and entities according to their associated
continent/country.
"""
from numpy import (
logical_and, logical_or, logical_not, logical_xor, where
)
from wikidata_linker_utils.logic import logical_negate
import wikidata_linker_utils.wikidata_properties as wprop
def wkp(c, name):
"""Convert a string wikipedia article name to its Wikidata index."""
return c.article2id["enwiki/" + name][0][0]
def wkd(c, name):
"""Convert a wikidata QID to its wikidata index."""
return c.name2index[name]
def classify(c):
EUROPE = wkp(c, 'Europe')
AFRICA = wkp(c, 'Africa')
ASIA = wkp(c, 'Asia')
NORTH_AMERICA = wkp(c, 'North America')
SOUTH_AMERICA = wkp(c, 'South America')
OCEANIA = wkp(c, 'Oceania')
ANTARCTICA = wkp(c, 'Antarctica')
CONTINENT = wkp(c, wprop.CONTINENT)
OUTERSPACE = wkp(c, 'Astronomical object')
EARTH = wkp(c, "Earth")
GEOGRAPHIC_LOCATION = wkd(c, "Q2221906")
POPULATED_PLACE = wkd(c, 'Q486972')
MIDDLE_EAST = [
wkp(c, "Bahrain"),
wkp(c, "Cyprus"),
wkp(c, "Turkish"),
wkp(c, "Egypt"),
wkp(c, "Iran"),
wkp(c, "Iraq"),
wkp(c, "Kurdish"),
wkp(c, "Israel"),
wkp(c, "Arabic"),
wkp(c, "Jordan"),
wkp(c, "Kuwait"),
wkp(c, "Lebanon"),
wkp(c, "Oman"),
wkp(c, "Palestine"),
wkp(c, "Jordanian"),
wkp(c, "Qatar"),
wkp(c, "Saudi Arabia"),
wkp(c, "Syria"),
wkp(c, "Turkey"),
wkp(c, "United Arab Emirates"),
wkp(c, "Yemen")
]
TRAVERSIBLE = [
wprop.INSTANCE_OF,
wprop.SUBCLASS_OF,
wprop.CONTINENT,
wprop.PART_OF,
wprop.COUNTRY_OF_CITIZENSHIP,
wprop.COUNTRY,
wprop.LOCATED_IN_THE_ADMINISTRATIVE_TERRITORIAL_ENTITY
]
# c.describe_connection("Q55", "North America", TRAVERSIBLE)
# return {}
print("is_in_middle_east")
is_in_middle_east = c.satisfy(TRAVERSIBLE, MIDDLE_EAST)
print("is_in_europe")
is_in_europe = c.satisfy(TRAVERSIBLE, [EUROPE])
is_in_europe_only = logical_negate(is_in_europe, [is_in_middle_east])
print("is_in_asia")
is_in_asia = c.satisfy(TRAVERSIBLE, [ASIA])
is_in_asia_only = logical_negate(is_in_asia, [is_in_europe, is_in_middle_east])
print("is_in_africa")
is_in_africa = c.satisfy(TRAVERSIBLE, [AFRICA])
is_in_africa_only = logical_negate(is_in_africa, [is_in_europe, is_in_asia, is_in_middle_east])
print("is_in_north_america")
is_in_north_america = c.satisfy(TRAVERSIBLE, [NORTH_AMERICA])
is_in_north_america_only = logical_negate(is_in_north_america, [is_in_europe, is_in_asia, is_in_middle_east])
print("is_in_south_america")
is_in_south_america = c.satisfy(TRAVERSIBLE, [SOUTH_AMERICA])
print("is_in_antarctica")
is_in_antarctica = c.satisfy(TRAVERSIBLE, [ANTARCTICA])
is_in_antarctica_only = logical_negate(is_in_antarctica, [is_in_europe, is_in_north_america, is_in_asia, is_in_middle_east])
print("is_in_oceania")
is_in_oceania = c.satisfy(TRAVERSIBLE, [OCEANIA])
is_in_oceania_only = logical_negate(is_in_oceania, [is_in_europe, is_in_north_america, is_in_asia, is_in_middle_east])
print("is_in_outer_space")
is_in_outer_space = c.satisfy(TRAVERSIBLE, [OUTERSPACE])
print("part_of_earth")
part_of_earth = c.satisfy(
[wprop.INSTANCE_OF, wprop.PART_OF, wprop.CONTINENT, wprop.COUNTRY_OF_CITIZENSHIP, wprop.COUNTRY, wprop.SUBCLASS_OF],
[GEOGRAPHIC_LOCATION, EARTH]
)
print("is_in_outer_space_not_earth")
is_in_outer_space_not_earth = logical_negate(
is_in_outer_space, [part_of_earth]
)
print("is_a_populated_place")
is_populated_place = c.satisfy([wprop.INSTANCE_OF, wprop.SUBCLASS_OF], [POPULATED_PLACE])
is_unlocalized_populated_place = logical_negate( is_populated_place, [is_in_europe, is_in_asia, is_in_antarctica, is_in_oceania, is_in_outer_space, is_in_south_america, is_in_north_america])
return {
"europe": is_in_europe_only,
"asia": is_in_asia_only,
"africa": is_in_africa_only,
"middle_east": is_in_middle_east,
"north_america": is_in_north_america_only,
"south_america": is_in_south_america,
"antarctica": is_in_antarctica_only,
"oceania": is_in_oceania_only,
"outer_space": is_in_outer_space_not_earth,
# "populated_space": is_populated_place,
"populated_place_unlocalized": is_unlocalized_populated_place
}