# def generate_passage_retrieval_files()
#
# in python/passage_retrieval_processing.py [0:0]


def _question_text(question, keep_markup):
    """Concatenate the text extracted from a question's markup fields.

    Reads the optional ``name_markup`` and ``text_markup`` entries of
    *question* (in that order) and passes each through ``extract_text``.
    Fields that are missing, or for which ``extract_text`` returns ``None``,
    contribute nothing.

    NOTE: text coming from ``name_markup`` is always followed by a single
    space, even when ``text_markup`` contributes nothing. That trailing space
    is kept on purpose so previously generated files stay reproducible.
    """
    text = ""
    for key, separator in (("name_markup", " "), ("text_markup", "")):
        if key not in question:
            continue
        extracted = extract_text(question[key], keep_markup)
        if extracted is not None:
            text += extracted + separator
    return text


def generate_passage_retrieval_files(data_path, only_english, keep_markup, output_path):
    """Convert a JSON-lines dump of question pages into passage-retrieval samples.

    Each non-blank line of *data_path* is parsed as one JSON document. For
    every entry of its ``"Questions"`` list that yields non-empty question
    text, one sample is built from the question's ``"Answers"`` and collected.
    All samples are then written, one JSON document per line, to
    ``output_path + ".jsonl"``.

    Args:
        data_path: Path of the input JSON-lines file (read as UTF-8).
        only_english: When true, documents whose ``"Fasttext_language"`` is
            not ``"en"`` are skipped.
        keep_markup: Forwarded unchanged to ``extract_text`` and to the
            sample-building helpers.
        output_path: Output path *without* extension; ``".jsonl"`` is appended.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a document lacks ``"Questions"`` (or ``"Fasttext_language"``
            while *only_english* is set), or a question with text lacks
            ``"Answers"``.
    """
    instances = []
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(data_path, "r", encoding="utf-8") as f:
        for website in f:
            # Tolerate blank separator lines instead of failing in json.loads.
            if not website.strip():
                continue
            content = json.loads(website)
            if only_english and content["Fasttext_language"] != "en":
                continue
            for question in content["Questions"]:
                question_text = _question_text(question, keep_markup)
                # Questions without any extractable text produce no sample.
                if not question_text:
                    continue
                answers = question["Answers"]
                # Check which kinds of answer information are available.
                accepted, suggested, vote = find_markup_options(answers)
                if accepted and suggested and vote:
                    # All information available.
                    build_instance = full_info
                elif accepted and suggested:
                    # No votes: pick at random from accepted and suggested.
                    build_instance = acc_sugg_info
                elif vote:
                    # Only votes available: use above/below 2.
                    build_instance = vote_info
                else:
                    # Otherwise select one at random as the positive ctx,
                    # with no hard negatives.
                    build_instance = no_info
                instances.append(build_instance(answers, question_text, keep_markup))

    # Written only after the whole input was processed, so a failure while
    # reading never leaves a partial output file behind. json.dumps escapes
    # non-ASCII by default, so the bytes are identical to ASCII output.
    with open(output_path + ".jsonl", "w", encoding="utf-8") as f:
        f.writelines(json.dumps(sample) + "\n" for sample in instances)