in python/mhtml_to_json.py [0:0]
def generate_structured_json(files, output_folder, output_file, fasttext_bin):
    """Extract Q&A structures from MHTML-derived JSON files and write JSONL records.

    For each input file (a JSON list of webpage entries with "mhtml", "language"
    and "uri" keys), parse the HTML, collect question subtrees, keep only those
    with at least a question name/text or an answer text, predict their language
    with a fastText model, and append one JSON record per webpage to a per-file
    output (the "PLACEHOLDER" token in `output_file` is replaced by the input
    file's basename without its ".mhtml" suffix).

    Args:
        files: iterable of paths to input JSON files (".mhtml"-suffixed names).
        output_folder: directory where output files are written.
        output_file: output filename template containing "PLACEHOLDER".
        fasttext_bin: path to the fastText language-identification model.

    Returns:
        None. Side effect: appends newline-delimited JSON records to disk.
    """
    ft_model = fasttext.load_model(fasttext_bin)
    for warc_file in files:
        # Basename without the ".mhtml" suffix, used both in the output
        # filename and as the record's WARC_ID — compute it once.
        warc_id = os.path.basename(warc_file).replace(".mhtml", "")
        out_path = os.path.join(
            output_folder, output_file.replace("PLACEHOLDER", warc_id)
        )
        # "a" (not "a+"): we only ever append, never read the output file.
        # Explicit UTF-8 avoids locale-dependent default encodings.
        with open(warc_file, encoding="utf-8") as f, open(
            out_path, "a", encoding="utf-8"
        ) as g:
            webpages = json.load(f)  # json.load reads the stream directly
            for element in webpages:
                html_content = element["mhtml"]
                language = element["language"]
                uri = element["uri"]
                html_root = etree.HTML(html_content)
                html_questions, json_questions, questions_language = [], [], []
                get_all_questions(html_root, html_questions)
                for html_question in html_questions:
                    json_question = {"Answers": []}
                    search_tree(html_question, json_question)
                    # Drop instances lacking a question name, question text,
                    # and answer text entirely.
                    if has_at_least_Q_or_A(json_question):
                        questions_language.append(
                            predict_question_language(json_question, ft_model)
                        )
                        json_questions.append(json_question)
                if json_questions:
                    json_record = json.dumps(
                        {
                            "Language": language,
                            "Fasttext_language": predict_majority_language(
                                questions_language
                            ),
                            "URI": uri,
                            "UUID": str(uuid.uuid4()),
                            "WARC_ID": warc_id,
                            "Questions": json_questions,
                        }
                    )
                    g.write(json_record + "\n")