# extraction/classifiers/time_classifier.py (33 lines of code) (raw):
"""
Create membership rules for entities based on their date of existence/birth/etc.
More classes can be created by selecting other key dates as hyperplanes.
"""
from numpy import (
logical_and, logical_or, logical_not, logical_xor, where
)
from wikidata_linker_utils.logic import logical_negate, logical_ors, logical_ands
import wikidata_linker_utils.wikidata_properties as wprop
def wkp(c, name: str):
    """Convert a string wikipedia article name to its Wikidata index.

    Args:
        c: object exposing an ``article2id`` mapping keyed by
            ``"<wiki>/<article name>"`` strings.
        name: English Wikipedia article name, without the ``enwiki/`` prefix.

    Returns:
        Element ``[0][0]`` of the entry stored under ``"enwiki/" + name``.

    Raises:
        KeyError: if the article is not present in ``c.article2id`` (when
            that container is a plain mapping).
    """
    # Only English Wikipedia titles are supported, hence the fixed prefix.
    matches = c.article2id["enwiki/" + name]
    # NOTE(review): the first field of the first match is taken as the
    # index -- confirm the entry layout against the article2id container.
    return matches[0][0]
def wkd(c, name: str):
    """Convert a wikidata QID to its wikidata index.

    Args:
        c: object exposing a ``name2index`` mapping.
        name: Wikidata identifier used as the lookup key (e.g. ``"Q5"``).

    Returns:
        The value stored under ``name`` in ``c.name2index``.

    Raises:
        KeyError: if ``name`` is unknown (when ``name2index`` is a plain
            mapping).
    """
    return c.name2index[name]
def classify(c):
    """Split entities into "pre-1950" and "post-1950" membership masks.

    Six date-like properties are inspected (publication date, date of birth,
    inception, dissolution, point in time, start time):

    * an entity is "pre-1950" as soon as ANY of these values is below 1950;
    * an entity is "post-1950" when at least one value is 1950 or later and
      it was not already marked "pre-1950" (the earlier date wins);
    * an entity matching neither rule is "undated"; it belongs to no class
      and only contributes to the count printed on stdout.

    Args:
        c: collection exposing ``attribute(property)``.
            NOTE(review): each call is assumed to return a numpy array of
            years with one entry per entity, and to be side-effect free --
            confirm against the collection class.

    Returns:
        dict mapping ``"pre-1950"`` and ``"post-1950"`` to their membership
        masks.
    """
    D1950 = 1950
    # Declare the inspected properties once so the "before" and "after"
    # rules can never drift apart, and fetch each attribute a single time.
    date_columns = [
        c.attribute(prop)
        for prop in (
            wprop.PUBLICATION_DATE,
            wprop.DATE_OF_BIRTH,
            wprop.INCEPTION,
            wprop.DISSOLVED_OR_ABOLISHED,
            wprop.POINT_IN_TIME,
            wprop.START_TIME,
        )
    ]
    pre_1950 = logical_ors([dates < D1950 for dates in date_columns])
    # "pre" takes precedence: an entity with dates on both sides of the
    # cutoff is reported as pre-1950 only.
    post_1950 = logical_and(
        logical_ors([dates >= D1950 for dates in date_columns]),
        logical_not(pre_1950),
    )
    # some elements are neither pre 1950 nor post 1950, they are "undated"
    # (e.g. no value was provided for any of the time attributes used
    # above)
    undated = logical_and(logical_not(pre_1950), logical_not(post_1950))
    print("%d items have no date information" % (undated.sum(),))
    return {
        "pre-1950": pre_1950,
        "post-1950": post_1950
    }