# preprocess_mrqa() — extracted from TransferQA/create_qa_data.py

def preprocess_mrqa():
    """Convert gzipped MRQA shards into lower-cased JSON QA datasets.

    Reads every file under ``DATA_DIR/mrqa_train`` and ``DATA_DIR/mrqa_valid``
    (MRQA format: one JSON object per line, the first line may be a header
    record), lower-cases contexts/questions/answers, attaches up to 3 randomly
    sampled negative questions per QA pair (drawn from the same shard), writes
    ``DATA_DIR/<split>.json``, and returns the (train, dev) example lists.

    Each example has the shape:
        {"context": str,
         "qas": [{"question": str, "negative_questions": [str], "answer": str,
                  "choice": [], "char_spans": [start, end]}]}
    """

    def _read_data(split="mrqa_train"):
        datasets = []
        # Read each gzipped shard of the original MRQA data.
        for filename in os.listdir(os.path.join(DATA_DIR, split)):
            question_collection = []  # all questions in this shard; negative pool
            dataset = []
            with gzip.open(os.path.join(DATA_DIR, split, filename)) as f:
                for i, line in enumerate(f):
                    obj = json.loads(line)
                    # Skip the per-file header record.
                    if i == 0 and 'header' in obj:
                        continue
                    example = {"context": obj["context"].lower(), "qas": []}
                    for qa in obj["qas"]:
                        qa_example = {"question": "", "negative_questions": [],
                                      "answer": "", "choice": [], "char_spans": []}
                        # Keep only the earliest detected answer span.
                        # NOTE(review): assumes every qa has at least one
                        # detected answer, as the original did.
                        answer_spans = []
                        for d_a in qa["detected_answers"]:
                            answer_spans += d_a["char_spans"]
                        answer_spans.sort(key=lambda x: x[0])
                        qa_example["char_spans"] = answer_spans[0]

                        question = qa["question"].lower()
                        question_collection.append(question)
                        qa_example["question"] = question
                        qa_example["answer"] = qa["detected_answers"][0]["text"].lower()
                        example["qas"].append(qa_example)
                    dataset.append(example)
            print("done")
            print(len(dataset))
            # BUGFIX: dataset[5] raised IndexError on shards with <= 5 examples.
            if dataset:
                print(dataset[0])
            # Attach up to 3 negative questions per QA pair.
            # BUGFIX: the original random.sample(question_collection, 3) raised
            # ValueError when a shard held fewer than 3 questions and could
            # return the positive question as its own "negative". Sample one
            # extra candidate, drop the positive, and keep at most 3.
            for example in dataset:
                for qa in example["qas"]:
                    candidates = random.sample(
                        question_collection,
                        min(4, len(question_collection)))
                    qa["negative_questions"] = [
                        q for q in candidates if q != qa["question"]][:3]
                datasets.append(example)

        with open(os.path.join(DATA_DIR, f"{split}.json"), 'w') as fout:
            json.dump(datasets, fout, indent=4)
        return datasets

    data_train = _read_data("mrqa_train")
    data_dev = _read_data("mrqa_valid")
    return data_train, data_dev