in python/passage_retrieval_processing.py [0:0]
def _build_question_text(question, keep_markup):
    """Return the question's extracted name and body text as one string.

    The name (``name_markup``) comes first, followed by a single space, then
    the body (``text_markup``). Either part is skipped when its key is absent
    or when ``extract_text`` returns ``None`` for it. The result is ``""``
    when neither part contributes anything.
    """
    question_text = ""
    if "name_markup" in question:
        extracted_text = extract_text(question["name_markup"], keep_markup)
        if extracted_text is not None:
            # The trailing space separates the name from the body; it is kept
            # even when no body follows, matching the established output.
            question_text += extracted_text + " "
    if "text_markup" in question:
        extracted_text = extract_text(question["text_markup"], keep_markup)
        if extracted_text is not None:
            question_text += extracted_text
    return question_text


def _build_instance(answers, question_text, keep_markup):
    """Build one retrieval instance using the richest answer signals available.

    ``find_markup_options`` reports which signals (accepted, suggested, vote)
    the answers carry; the first matching case below picks the builder.
    """
    accepted, suggested, vote = find_markup_options(answers)
    # All information available
    if accepted and suggested and vote:
        return full_info(answers, question_text, keep_markup)
    # If no votes are available, pick at random from accepted and suggested
    if accepted and suggested:
        return acc_sugg_info(answers, question_text, keep_markup)
    # If only votes are available use above/below 2
    if vote:
        return vote_info(answers, question_text, keep_markup)
    # Otherwise just select one at random to be a positive ctx and no hard negatives
    return no_info(answers, question_text, keep_markup)


def generate_passage_retrieval_files(data_path, only_english, keep_markup, output_path):
    """Convert a JSON Lines dump of websites into passage-retrieval samples.

    Each non-blank line of ``data_path`` is parsed as a JSON object with a
    ``"Questions"`` list. For every question that yields non-empty text, one
    instance is built from its ``"Answers"`` and collected. All instances are
    then written, one JSON object per line, to ``output_path + ".jsonl"``.

    Args:
        data_path: Path of the input JSON Lines file.
        only_english: When truthy, skip records whose ``"Fasttext_language"``
            is not ``"en"``.
        keep_markup: Forwarded unchanged to ``extract_text`` and to the
            instance builders.
        output_path: Output path without extension; ``".jsonl"`` is appended.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record lacks ``"Questions"`` (or ``"Fasttext_language"``
            while ``only_english`` is set), or a question with text lacks
            ``"Answers"``.
    """
    instances = []
    # JSON text is UTF-8; an explicit encoding avoids depending on the
    # platform's locale default when decoding the input.
    with open(data_path, "r", encoding="utf-8") as f:
        for website in f:
            # Blank lines (e.g. a trailing empty line) are not records and
            # would otherwise make json.loads raise.
            if not website.strip():
                continue
            content = json.loads(website)
            if only_english and content["Fasttext_language"] != "en":
                continue
            for question in content["Questions"]:
                question_text = _build_question_text(question, keep_markup)
                # Questions without any extracted text produce no instance.
                if not question_text:
                    continue
                instances.append(
                    _build_instance(question["Answers"], question_text, keep_markup)
                )
    with open(output_path + ".jsonl", "w", encoding="utf-8") as f:
        for sample in instances:
            f.write(json.dumps(sample) + "\n")