in TransferQA/data_loader.py [0:0]
def _qa_pair(input_text, output_text):
    """Build one text-to-text training example.

    NOTE: the key is intentionally spelled "intput_text" — downstream consumers
    of this loader read that exact key, so it must not be renamed here alone.
    """
    return {
        "intput_text": input_text,
        "output_text": output_text,
    }


def read_QA_data(args, path_name, tokenizer):
    """Read a QA JSON file and format it as lowercase text-to-text pairs.

    Args:
        args: config dict; uses "neg_num" (probability of emitting a negative,
            unanswerable example per QA pair — despite the name it is compared
            against random.random()) and "neg_context_ratio" (probability that
            a negative example uses a truncated context instead of a negative
            question).
        path_name: path to a JSON file shaped like
            [{"context": "...", "qas": [{"question", "answer", "choice",
              "char_spans", "negative_questions"}, ...]}, ...]
        tokenizer: tokenizer providing .encode() (for length filtering) and
            .eos_token (appended to every target).

    Returns:
        list of dicts with keys "intput_text" (sic) and "output_text".
    """
    choice_token = " <extra_id_0> "
    print("Reading all files from {}".format(path_name))
    data = []
    with open(path_name) as f:
        examples = json.load(f)

    for example in tqdm(examples):
        context = example["context"].strip()
        # Skip overly long contexts to save memory.
        if len(tokenizer.encode(context)) > 999:
            continue
        for qa in example["qas"]:
            question = qa["question"].strip()
            output_text = (qa["answer"] + f" {tokenizer.eos_token}").lower()
            if len(qa["choice"]) > 0:
                # Multi-choice question: choices joined by a sentinel token.
                choices = (choice_token + choice_token.join(qa["choice"])).lower()
                input_text = f"multi-choice question: {question} choices: {choices} context: {context}".lower()
            else:
                # Extractive (span) question.
                input_text = f"extractive question: {question} context: {context}".lower()
            data.append(_qa_pair(input_text, output_text))

            # With probability args["neg_num"], also emit a negative
            # (unanswerable) example whose target is "none".
            if random.random() < args["neg_num"]:
                negative_context = ""
                if len(qa["char_spans"]) > 0:
                    # Truncate the context at the last "." strictly before the
                    # answer span, so the answer is no longer present.
                    # (Search range [1, span] matches the original backward
                    # scan, which never inspected index 0.)
                    span_start = qa["char_spans"][0]
                    dot = example["context"].rfind(".", 1, span_start + 1)
                    if dot != -1:
                        negative_context = example["context"][:dot + 1]
                if negative_context != "" and random.random() < args["neg_context_ratio"]:
                    # Negative context: original question, truncated context.
                    neg_question = qa["question"].strip()
                    neg_input = f"extractive question: {neg_question} context: {negative_context}".lower()
                elif qa["negative_questions"]:
                    # Negative question: unanswerable paraphrase, full context.
                    neg_question = qa["negative_questions"][0].strip()
                    neg_input = f"extractive question: {neg_question} context: {context}".lower()
                else:
                    # No negative question available and no usable truncated
                    # context — skip instead of crashing on an empty list
                    # (the original raised IndexError here).
                    continue
                data.append(_qa_pair(neg_input, "none" + f" {tokenizer.eos_token}"))

    # Show a few samples for sanity checking; guard against < 3 examples
    # (the original indexed data[0:3] unconditionally and could IndexError).
    for sample in data[:3]:
        print(sample)
    return data