in blink/candidate_retrieval/data_ingestion.py [0:0]
def get_data_for_key(data, title, *, add_sentence_data=None):
    """Build the flat record for the entry stored under ``title`` in ``data``.

    Args:
        data: Mapping from title to that title's raw entry (a dict). The entry
            must contain ``wikipedia_id``, ``intro_concatenated`` and
            ``num_tokens``. It may contain ``wikidata_info`` (a dict with
            optional ``wikidata_id``, ``description`` and ``aliases``),
            ``num_incoming_links`` and ``sent_desc_1`` .. ``sent_desc_10``.
        title: Key of the entry to convert; also stored as ``obj["title"]``.
        add_sentence_data: Whether to copy ``sent_desc_1`` .. ``sent_desc_10``
            into the record (missing ones become ``""``). When ``None`` (the
            default) the module-level ``args.add_sentence_data`` is used, so
            existing two-argument calls behave as before.

    Returns:
        A dict with the keys ``id``, ``title``, ``wikidata_id``, ``desc``,
        ``aliases``, ``wikidata_desc``, ``num_tokens``, ``num_incoming_links``
        and, if sentence data is requested, ``sent_desc_1`` .. ``sent_desc_10``.

    Raises:
        KeyError: If ``title`` or one of the required entry fields is missing,
            or if no Wikidata id is available and ``wikidata_id_from_index``
            is absent.
    """
    entry = data[title]
    # A missing or None "wikidata_info" is treated as "no Wikidata data".
    wikidata_info = entry.get("wikidata_info") or {}

    # Prefer the id carried by wikidata_info; fall back to the index-derived
    # id when it is absent or None.
    wikidata_id = wikidata_info.get("wikidata_id")
    if wikidata_id is None:
        wikidata_id = entry["wikidata_id_from_index"]

    raw_aliases = wikidata_info.get("aliases")
    if raw_aliases is not None:
        # Each alias is wrapped in double quotes; aliases that are themselves
        # keys of emoji.UNICODE_EMOJI are dropped.
        # NOTE(review): emoji.UNICODE_EMOJI depends on the installed version of
        # the emoji package -- confirm it is available in this environment.
        aliases = " ".join(
            '"{}"'.format(alias)
            for alias in raw_aliases
            if alias not in emoji.UNICODE_EMOJI
        )
    else:
        aliases = ""

    obj = {
        "id": entry["wikipedia_id"],
        "title": title,
        "wikidata_id": wikidata_id,
        "desc": entry["intro_concatenated"],
        "aliases": aliases,
        "wikidata_desc": wikidata_info.get("description", ""),
        "num_tokens": entry["num_tokens"],
        "num_incoming_links": entry.get("num_incoming_links", 0),
    }

    if add_sentence_data is None:
        # Backward-compatible default: defer to the module-level options.
        add_sentence_data = args.add_sentence_data
    if add_sentence_data:
        for index in range(1, 11):
            key = "sent_desc_{}".format(index)
            obj[key] = entry.get(key, "")
    return obj