def read_QA_data()

in TransferQA/data_loader.py

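Reads a QA dataset from a JSON file and converts every (context, question, answer) triple into lower-cased input/output text pairs for a T5-style seq2seq model: a multi-choice format when answer choices are given, an extractive format otherwise, and randomly sampled negative examples whose target output is "none".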

import json
import random

from tqdm import tqdm


def read_QA_data(args, path_name, tokenizer):
    # T5 sentinel token used to separate answer choices in multi-choice inputs
    choice_token = " <extra_id_0> "
    print("Reading all files from {}".format(path_name))
    data = []
    # read files
    with open(path_name) as f:
        examples = json.load(f)
        # examples = [{"context": "text",
        #              "qas": [{"question": "..", "answer": "..", "choice": [..],
        #                       "char_spans": [..], "negative_questions": [..]}, ..]}, ..]

        for example in tqdm(examples):
            context = example["context"].strip()

            # save memory: skip examples whose context is longer than 999 tokens
            inputlen = len(tokenizer.encode(context))
            if inputlen > 999:
                continue

            for qa in example["qas"]:
                question = qa["question"].strip()

                # multi-choice question: answer choices are joined by the sentinel token
                if len(qa["choice"]) > 0:
                    choices = (choice_token + choice_token.join(qa["choice"])).lower()
                    input_text = f"multi-choice question: {question} choices: {choices} context: {context}".lower()
                    output_text = (qa["answer"] + f" {tokenizer.eos_token}").lower()
                    data_detail = {
                        "input_text": input_text,
                        "output_text": output_text,
                    }
                    data.append(data_detail)

                else:
                    # extractive question: the answer is a span copied from the context
                    input_text = f"extractive question: {question} context: {context}".lower()
                    output_text = (qa["answer"] + f" {tokenizer.eos_token}").lower()
                    data_detail = {
                        "input_text": input_text,
                        "output_text": output_text,
                    }
                    data.append(data_detail)

                # with probability args["neg_num"], add an unanswerable (negative) example
                if random.random() < args["neg_num"]:
                    negative_context = ""
                    # char_spans[0] marks the answer's start offset in the context;
                    # cut the context at the last sentence boundary before the answer,
                    # so the truncated context no longer contains it
                    if len(qa["char_spans"]) > 0:
                        for i in range(qa["char_spans"][0], 0, -1):
                            if example["context"][i] == ".":
                                negative_context = example["context"][:i+1]
                                break

                    if (negative_context != "") and (random.random() < args["neg_context_ratio"]):
                        # negative context: original question, truncated context
                        question = qa["question"].strip()
                        input_text = f"extractive question: {question} context: {negative_context}".lower()
                    else:
                        # negative question: a question the full context cannot answer
                        question = qa["negative_questions"][0].strip()
                        input_text = f"extractive question: {question} context: {context}".lower()

                    # negative examples are trained to predict "none"
                    output_text = "none" + f" {tokenizer.eos_token}"
                    data_detail = {
                        "input_text": input_text,
                        "output_text": output_text,
                    }
                    data.append(data_detail)

    # print a few converted examples as a sanity check
    for idx in range(min(3, len(data))):
        print(data[idx])
    return data
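
A minimal usage sketch, assuming read_QA_data is imported from TransferQA/data_loader, that args is a plain dict carrying the "neg_num" and "neg_context_ratio" sampling probabilities, and that the tokenizer is a Hugging Face T5 tokenizer (the function relies on T5's <extra_id_0> sentinel and its eos_token). The file name, example record, and probability values below are hypothetical:

import json

from transformers import T5Tokenizer

from TransferQA.data_loader import read_QA_data  # assumed import path

# hypothetical sample file matching the schema read_QA_data expects;
# char_spans[0] is the start offset of the answer "Paris" in the context
sample = [{
    "context": "The tower was completed in 1889. It is located in Paris.",
    "qas": [{
        "question": "Where is the tower located?",
        "answer": "Paris",
        "choice": [],
        "char_spans": [50],
        "negative_questions": ["Who designed the palace?"],
    }],
}]
with open("sample_qa.json", "w") as f:
    json.dump(sample, f)

args = {"neg_num": 0.3, "neg_context_ratio": 0.5}  # sampling probabilities
tokenizer = T5Tokenizer.from_pretrained("t5-small")

data = read_QA_data(args, "sample_qa.json", tokenizer)
print(len(data), data[0]["input_text"])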