in python/closed_book_processing.py [0:0]
def _remove_line_breaks(text):
    """Return *text* with all line-feed and carriage-return characters removed."""
    return text.replace("\n", "").replace("\r", "")


def generate_closed_book_format(data_path, only_english, keep_markup, output_path):
    """Convert a JSON-lines file into parallel question/answer text files.

    Each non-blank line of ``data_path`` is parsed as one JSON object. For
    every entry in its ``"Questions"`` list, the question text is built from
    the ``"name_markup"`` and ``"text_markup"`` fields (each passed through
    ``extract_text``), and one (question, answer) pair is emitted per entry in
    ``"Answers"`` that has a usable ``"text_markup"``.

    Line ``i`` of ``output_path + ".source"`` holds the question belonging to
    the answer on line ``i`` of ``output_path + ".target"``. Line breaks inside
    a text are removed so every pair occupies exactly one line per file.

    Args:
        data_path: Path of the JSON-lines input file (read as UTF-8).
        only_english: If true, objects whose ``"Fasttext_language"`` value is
            not ``"en"`` are skipped.
        keep_markup: Forwarded unchanged to ``extract_text``.
            NOTE(review): presumably controls whether markup is kept in the
            extracted text -- confirm against ``extract_text``.
        output_path: Prefix of the two output files (written as UTF-8).

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If an object lacks ``"Questions"`` (or ``"Fasttext_language"``
            when ``only_english`` is set), or a usable question lacks
            ``"Answers"``.
    """
    question_list = []
    answer_list = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(data_path, "r", encoding="utf-8") as f:
        for website in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not website.strip():
                continue
            content = json.loads(website)
            if only_english and content["Fasttext_language"] != "en":
                continue

            for question in content["Questions"]:
                question_text = ""
                if "name_markup" in question:
                    extracted_text = extract_text(question["name_markup"], keep_markup)
                    if extracted_text is not None:
                        question_text += extracted_text + " "
                if "text_markup" in question:
                    extracted_text = extract_text(question["text_markup"], keep_markup)
                    if extracted_text is not None:
                        question_text += extracted_text

                # Judge emptiness on the text as it will be written. The
                # separator space added after the name part, or a text made
                # only of line breaks, would otherwise yield a blank question
                # line paired with a real answer.
                question_text = _remove_line_breaks(question_text)
                if not question_text.strip():
                    continue

                for answer in question["Answers"]:
                    if "text_markup" not in answer:
                        continue
                    answer_text = extract_text(answer["text_markup"], keep_markup)
                    if answer_text is None:
                        continue
                    answer_text = _remove_line_breaks(answer_text)
                    if answer_text:
                        # Appending to both lists together keeps the two
                        # output files aligned line by line.
                        question_list.append(question_text)
                        answer_list.append(answer_text)

    # Outputs are written only after the whole input has been processed, so a
    # malformed input does not leave partially written files behind.
    with open(output_path + ".source", "w", encoding="utf-8") as f:
        f.writelines(element + "\n" for element in question_list)
    with open(output_path + ".target", "w", encoding="utf-8") as f:
        f.writelines(element + "\n" for element in answer_list)