in TransferQA/create_qa_data.py [0:0]
def preprocess_mrqa():
# dataset = [{"context":"text", "qas":{"question":"..", "answer":"..", "negative_questions":[]}, }]
def _read_data(split = "mrqa_train"):
datasets = []
# read from original data
for filename in os.listdir(os.path.join(DATA_DIR,split)):
question_collection = []
dataset = []
with gzip.open(os.path.join(DATA_DIR,split,filename)) as f:
for i, line in enumerate(f):
example = {"context":"", "qas":[]}
obj = json.loads(line)
# Skip headers.
if i == 0 and 'header' in obj:
continue
example["context"] = obj["context"].lower()
for qa in obj["qas"]:
qa_example = {"question":"", "negative_questions":[], "answer":"", "choice":[], "char_spans":[]}
answer_spans = []
for d_a in qa["detected_answers"]:
answer_spans+=d_a["char_spans"]
answer_spans.sort(key=lambda x:x[0])
qa_example["char_spans"] = answer_spans[0]
question = qa["question"].lower()
question_collection.append(question)
qa_example["question"] = question
qa_example["answer"] = qa["detected_answers"][0]["text"].lower()
example["qas"].append(qa_example)
dataset.append(example)
print("done")
print(len(dataset))
print(dataset[5])
            # randomly sample 3 negative questions per QA from the same file's question pool
            for i, example in enumerate(dataset):
                for qa in example["qas"]:
                    qa["negative_questions"] = random.sample(question_collection, 3)
                datasets.append(example)
        with open(os.path.join(DATA_DIR, f"{split}.json"), 'w') as fout:
            json.dump(datasets, fout, indent=4)
        return datasets
data_train = _read_data("mrqa_train")
data_dev = _read_data("mrqa_valid")
return data_train, data_dev
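
# Usage sketch, assuming DATA_DIR points at a directory containing "mrqa_train/"
# and "mrqa_valid/" sub-folders of gzipped MRQA files, and that os, gzip, json,
# random, and DATA_DIR are defined at the top of this module.
if __name__ == "__main__":
    train_data, dev_data = preprocess_mrqa()
    print(f"train examples: {len(train_data)}, dev examples: {len(dev_data)}")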