in python/json_duplicate_filter.py [0:0]
def generate_new_datapoint(line, dataset):
    """Insert (or replace) the entry for ``line`` in ``dataset``, keyed by URI.

    The stored entry copies the ``Language``, ``Fasttext_language``, ``URI``,
    ``UUID`` and ``WARC_ID`` fields from ``line`` and adds a ``Questions``
    mapping. Each question is stored as a shallow copy under its normalized
    full-question text, with its ``Answers`` replaced by a mapping from
    normalized full-answer text to the original answer object. Questions
    (or answers) that normalize to the same key overwrite earlier ones.

    Args:
        line: Mapping with the keys listed above plus ``Questions``, an
            iterable of mappings that each contain an ``Answers`` iterable.
        dataset: Mapping from URI to entry; mutated in place. Any existing
            entry for the same URI is replaced.

    Returns:
        The same ``dataset`` object that was passed in.

    Raises:
        KeyError: If ``line`` or one of its questions lacks a required key.
    """
    uri = line["URI"]
    questions_by_key = {}
    dataset[uri] = {
        "Language": line["Language"],
        "Fasttext_language": line["Fasttext_language"],
        "URI": uri,
        "UUID": line["UUID"],
        "WARC_ID": line["WARC_ID"],
        "Questions": questions_by_key,
    }

    for question in line["Questions"]:
        # Shallow copy so the caller's question keeps its original "Answers".
        condensed_question = copy.copy(question)
        # Remove answers to only look at questions
        raw_answers = condensed_question.pop("Answers")

        # Compute the key once, before "Answers" is re-attached, instead of
        # re-normalizing the question text for every answer.
        question_key = normalize_answer(get_full_question(condensed_question))
        questions_by_key[question_key] = condensed_question

        answers_by_key = {}
        condensed_question["Answers"] = answers_by_key
        for answer in raw_answers:
            answers_by_key[normalize_answer(get_full_answer(answer))] = answer

    return dataset