in src/extract_wikidata_info.py [0:0]
def extract_entity_information(popularity_dump, wikidata_dump, output_file):
    """Extract per-entity information from a Wikidata dump into a gzipped JSON file.

    For each Wikidata entity in the Wikidata dump, we extract its entity
    types, associated English Wikipedia page (used for popularity), label,
    all aliases for the entity, and the popularity of the entity's Wikipedia
    page, then write this information as one JSON object keyed by entity id.
    Each entity's dictionary is written on its own line for easy readability.

    Entities are skipped when they have no label, no entity types, or no
    popularity value for their Wikipedia page.

    Args:
        popularity_dump: ``str``: Path to the Wikipedia popularity dump
        wikidata_dump: ``str`` Path to the Wikidata dump
        output_file: ``str`` Output (gzip-compressed) JSON file
    """
    timer = BasicTimer("Extracting Wikidata entities information")
    # Iterate through the Wikipedia popularity dump without decompressing it,
    # storing each English Wikipedia page's number of page views.
    wiki_popularity = extract_popularities(popularity_dump)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    # Context managers guarantee both files are closed even if an exception
    # is raised mid-iteration (the original leaked `writer` on error).
    with gzip.open(output_file, "wb") as writer, \
            bz2.open(wikidata_dump, "rt") as bz_file:
        # Open the top-level JSON object; the original omitted this brace,
        # producing an unbalanced (invalid) JSON document. The matching
        # closing brace is written after the loop below.
        writer.write(b"{\n")
        lines_written = 0
        # Iterate through the Wikidata dump without decompressing it.
        # Each line corresponds to a dictionary about a Wikidata entity.
        for line in tqdm.tqdm(bz_file, desc="Processing Wikidata", smoothing=0):
            # The first and last lines of this file are list delimiters, skip these.
            # We also add a hack that checks if the entity has an English Wikipedia
            # page. If not, we skip the line (and thus the JSON loading which is slow)
            # Removing this hack does not change the resulting file.
            line = line.strip()
            if line == "[" or line == "]" or '"enwiki"' not in line:
                continue
            # Remove last character (comma), then decode
            line = json.loads(line[:-1])
            # For each line, extract out relevant Wikidata information
            label = extract_label(line)
            aliases = extract_aliases(line)
            entity_types = extract_entity_types(line)
            wikipedia_page = extract_wikipedia_page(line)
            popularity = wiki_popularity.get(wikipedia_page)
            # Skip if no entity type, label, or popularity value
            if label is None or popularity is None or not entity_types:
                continue
            entity_dict = {
                "label": label,
                "aliases": aliases,
                "entity_types": entity_types,
                "wikipedia_page": wikipedia_page,
                "popularity": popularity,
            }
            # Write extracted dictionary into a JSON format, one line at a time.
            # A comma separator precedes every entry except the first.
            if lines_written > 0:
                writer.write(b",\n")
            writer.write(
                f"{json.dumps(line['id'])}: "
                f"{json.dumps(entity_dict, ensure_ascii=False)}".encode()
            )
            lines_written += 1
        # Close the top-level JSON object opened before the loop.
        writer.write(b"\n}")
    timer.finish()