def generate_structured_json()

in python/mhtml_to_json.py [0:0]


def generate_structured_json(files, output_folder, output_file, fasttext_bin):
    """Extract question records from page dumps and append them as JSON lines.

    Every input file is parsed as a JSON array of page entries. For each
    entry the HTML is parsed, candidate question nodes are collected, and
    each one is converted to a dict. Pages that yield at least one usable
    question are serialized as a single JSON object on its own line.

    Args:
        files: Iterable of paths to the input dumps. Each file must contain
            a JSON array whose items provide the keys ``"mhtml"``,
            ``"language"`` and ``"uri"``.
        output_folder: Directory in which the output files are written.
        output_file: Output file name template. The substring
            ``"PLACEHOLDER"`` is replaced with the input file's base name
            stripped of ``".mhtml"``.
        fasttext_bin: Path handed to ``fasttext.load_model``.

    Returns:
        None. Results are appended to the output files.

    Raises:
        KeyError: If a page entry lacks one of the required keys.
        OSError: If an input or output file cannot be opened.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    ft_model = fasttext.load_model(fasttext_bin)
    for warc_file in files:
        # One identifier both names the output file and tags every record.
        warc_id = os.path.basename(warc_file).replace(".mhtml", "")
        output_path = os.path.join(
            output_folder, output_file.replace("PLACEHOLDER", warc_id)
        )
        # JSON text is UTF-8; decode it explicitly instead of relying on the
        # platform locale. The output is opened in append mode, so repeated
        # runs add to an existing file rather than replacing it.
        with open(warc_file, encoding="utf-8") as f, open(
            output_path, "a+", encoding="utf-8"
        ) as g:
            webpages = json.load(f)
            for element in webpages:
                html_content = element["mhtml"]
                language = element["language"]
                uri = element["uri"]

                html_root = etree.HTML(html_content)
                html_questions, json_questions, questions_language = [], [], []
                # NOTE(review): get_all_questions and search_tree appear to
                # fill the containers passed to them in place - confirm
                # against their definitions.
                get_all_questions(html_root, html_questions)
                for html_question in html_questions:
                    json_question = {"Answers": []}
                    search_tree(html_question, json_question)
                    # Remove everything that does not have a question name,
                    # question text, or answer text for the same instance.
                    if has_at_least_Q_or_A(json_question):
                        questions_language.append(
                            predict_question_language(json_question, ft_model)
                        )
                        json_questions.append(json_question)

                if not json_questions:
                    # Nothing usable on this page: emit no record.
                    continue

                question_uuid = str(uuid.uuid4())
                predicted_language = predict_majority_language(questions_language)
                json_record = json.dumps(
                    {
                        "Language": language,
                        "Fasttext_language": predicted_language,
                        "URI": uri,
                        "UUID": question_uuid,
                        "WARC_ID": warc_id,
                        "Questions": json_questions,
                    }
                )
                g.write(json_record + "\n")