def process_miskito_page()

in evals/elsuite/skill_acquisition/scraping/scrape_miskito.py [0:0]


def process_miskito_page():
    """Scrape lesson pages 1-10 and separate practice Q&A pairs from the article text.

    For each lesson index, the page at ``miskito_base_url.format(idx=idx)`` is
    downloaded and parsed. Every ``div`` whose inline style contains
    ``"width:300px; float:right;"`` is treated as a practice section: its
    question/answer pairs are collected, and the div is then removed from the
    parsed page so that only the remaining article content is kept.

    Returns:
        A ``(qa_pairs_by_lesson, articles_without_qa_pairs)`` tuple where
        ``qa_pairs_by_lesson`` maps ``"lesson_<idx>"`` to a list of dicts with
        the keys ``"question"``, ``"answer"`` and ``"instructions"``, and
        ``articles_without_qa_pairs`` is a list holding, in lesson order, the
        parsed content element of each page with the practice divs removed.

    Raises:
        requests.HTTPError: If a lesson page responds with an error status.
        ValueError: If a lesson page has no main content container.
    """
    # Without a timeout, requests waits forever on an unresponsive server.
    request_timeout_seconds = 30

    qa_pairs_by_lesson = {}
    articles_without_qa_pairs = []
    for idx in range(1, 11):
        url = miskito_base_url.format(idx=idx)
        response = requests.get(url, timeout=request_timeout_seconds)
        # Fail loudly rather than parsing an error page as if it were a lesson.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        content = soup.find("div", class_="mw-content-ltr mw-parser-output")
        if content is None:
            raise ValueError(f"No lesson content container found at {url}")

        # Extract the question-answer pairs.
        divs_with_specific_style = content.find_all(
            "div", style=lambda value: value and "width:300px; float:right;" in value
        )
        lesson_qa_pairs = []
        for i, div in enumerate(divs_with_specific_style):
            if i == 0 and idx == 1:
                # First section of first lesson is not in the same format.
                lesson_qa_pairs.extend(_first_section_qa_pairs(div))
            else:
                instructions, questions, answers = process_practice_section_div(div)
                lesson_qa_pairs.extend(
                    {"question": q, "answer": a, "instructions": instructions}
                    for q, a in zip(questions, answers)
                )
        qa_pairs_by_lesson[f"lesson_{idx}"] = lesson_qa_pairs

        # Remove them from the page and store the page contents.
        for div in divs_with_specific_style:
            div.decompose()

        articles_without_qa_pairs.append(content)

    return qa_pairs_by_lesson, articles_without_qa_pairs


def _first_section_qa_pairs(div):
    """Build Q&A dicts from a div whose first two ``<ul>`` lists hold questions and answers."""
    instructions = "Translate to English:"
    lists = div.find_all("ul")
    # Only the first child node of each list item is used as its text.
    questions = [str(item.contents[0]) for item in lists[0].find_all("li")]
    answers = [str(item.contents[0]) for item in lists[1].find_all("li")]
    return [
        {"question": q, "answer": a, "instructions": instructions}
        for q, a in zip(questions, answers)
    ]