def extract_entity_information()

in src/extract_wikidata_info.py [0:0]


import bz2
import gzip
import json
import os

import tqdm

# BasicTimer and the extract_* helpers are defined elsewhere in this module.


def extract_entity_information(popularity_dump, wikidata_dump, output_file):
    """For each Wikidata entity in the Wikidata dump, we extract out it's entity
    type, associated Wikipedia page (used for popularity), all aliases
    for the entity, and popularity of the entity's Wikipedia page, then write
    this information into a JSON file. We write each dictionary of entity
    information in it's own line for easy readability.

    Args:
        popularity_dump: ``str``: Path to the Wikipedia popularity dump
        wikidata_dump: ``str`` Path to the Wikidata dump
        output_file: ``str`` Output JSON file
    """
    timer = BasicTimer("Extracting Wikidata entity information")
    # Iterate through the Wikipedia popularity dump without decompressing it,
    # storing each English Wikipedia page's number of page views.
    wiki_popularity = extract_popularities(popularity_dump)

    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    writer = gzip.open(output_file, "wb")
    # The output is a single JSON object mapping entity IDs to entity dicts;
    # write the opening brace now, the matching closing brace after the loop.
    writer.write(b"{\n")

    # Iterate through the Wikidata dump without decompressing it
    with bz2.open(wikidata_dump, "rt") as bz_file:
        lines_written = 0
        # Each line corresponds to a dictionary about a Wikidata entity
        for line in tqdm.tqdm(bz_file, desc="Processing Wikidata", smoothing=0):
            # The first and last lines of this file are list delimiters, skip these.
            # We also use a cheap substring check to see whether the entity
            # has an English Wikipedia page. If not, we skip the line (and
            # thus the slow JSON parsing). Removing this check does not
            # change the resulting file.
            line = line.strip()
            if line == "[" or line == "]" or '"enwiki"' not in line:
                continue

            # Strip the trailing comma (the last entity line has none), then decode
            line = json.loads(line.rstrip(","))

            # For each line, extract out relevant Wikidata information
            label = extract_label(line)
            aliases = extract_aliases(line)
            entity_types = extract_entity_types(line)
            wikipedia_page = extract_wikipedia_page(line)
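            # Page-view count for the entity's Wikipedia page (None if the
            # page never appeared in the popularity dump)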
            popularity = wiki_popularity.get(wikipedia_page)

            # Skip if no entity type, label, or popularity value
            if label is None or popularity is None or not entity_types:
                continue

            entity_dict = {
                "label": label,
                "aliases": aliases,
                "entity_types": entity_types,
                "wikipedia_page": wikipedia_page,
                "popularity": popularity,
            }

            # Write the entity's entry on its own line, comma-separating entries
            if lines_written > 0:
                writer.write(b",\n")
            writer.write(
                f"{json.dumps(line['id'])}: "
                f"{json.dumps(entity_dict, ensure_ascii=False)}".encode()
            )
            lines_written += 1

    writer.write(b"\n}")
    writer.close()
    timer.finish()
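

Usage sketch: a minimal, hedged example of calling the function and reading
the result back. The file paths below are hypothetical placeholders; the
output is a single gzipped JSON object mapping entity IDs (e.g. "Q42") to
the entity dicts written above, so it can be loaded in one call.

import gzip
import json

# Hypothetical paths; substitute your local dump locations.
extract_entity_information(
    popularity_dump="data/pageviews.bz2",
    wikidata_dump="data/wikidata-all.json.bz2",
    output_file="data/entity_info.json.gz",
)

# Load the whole ID -> entity-dict mapping back into memory.
with gzip.open("data/entity_info.json.gz", "rt") as f:
    entity_info = json.load(f)

# Each value carries the fields written above, e.g.:
# {"label": "Douglas Adams", "aliases": [...], "entity_types": [...],
#  "wikipedia_page": "Douglas Adams", "popularity": 12345}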