def generate_new_datapoint()

in python/json_duplicate_filter.py [0:0]


def generate_new_datapoint(line, dataset):
    """Create (or overwrite) the entry for ``line`` in ``dataset``.

    The entry is stored under ``line["URI"]`` and carries the metadata
    fields ``Language``, ``Fasttext_language``, ``URI``, ``UUID`` and
    ``WARC_ID`` copied from ``line``, plus a ``"Questions"`` mapping.

    Each question in ``line["Questions"]`` is shallow-copied and stored
    under the key ``normalize_answer(get_full_question(question))``, with
    its ``"Answers"`` list replaced by a mapping from
    ``normalize_answer(get_full_answer(answer))`` to the original answer
    object. Questions or answers that produce the same key overwrite the
    earlier one.

    Args:
        line: Mapping for one input record; must provide the metadata
            fields listed above and a ``"Questions"`` iterable whose items
            each contain an ``"Answers"`` iterable.
        dataset: Mapping of URI to entry. It is mutated in place.

    Returns:
        The same ``dataset`` object that was passed in.

    Raises:
        KeyError: If ``line`` or one of its questions lacks a required
            field.
    """
    questions = {}
    # Register the entry before filling in the questions so that `dataset`
    # holds the new entry even while it is still being populated.
    dataset[line["URI"]] = {
        "Language": line["Language"],
        "Fasttext_language": line["Fasttext_language"],
        "URI": line["URI"],
        "UUID": line["UUID"],
        "WARC_ID": line["WARC_ID"],
        "Questions": questions,
    }

    for question in line["Questions"]:
        # Shallow copy: the caller's question dict keeps its "Answers".
        condensed_question = copy.copy(question)
        # Remove answers to only look at questions
        answers = condensed_question.pop("Answers")

        # Compute the key exactly once, while "Answers" is absent, so it
        # stays the same for every lookup below.
        question_key = normalize_answer(get_full_question(condensed_question))

        answers_by_key = {}
        questions[question_key] = condensed_question
        condensed_question["Answers"] = answers_by_key

        for answer in answers:
            answers_by_key[normalize_answer(get_full_answer(answer))] = answer

    return dataset