in evals/elsuite/skill_acquisition/scraping/scrape_miskito.py [0:0]
def _parse_first_section_div(div):
    """Parse the first practice section of lesson 1, which has its own layout.

    The section holds two ``<ul>`` lists: the first contains the questions and
    the second the answers, paired by position.

    Returns:
        A tuple ``(instructions, questions, answers)`` where ``questions`` and
        ``answers`` are lists of strings.
    """
    lists = div.find_all("ul")
    questions = [str(item.contents[0]) for item in lists[0].find_all("li")]
    answers = [str(item.contents[0]) for item in lists[1].find_all("li")]
    return "Translate to English:", questions, answers


def process_miskito_page():
    """Scrape the Miskito lesson pages and separate exercises from lesson text.

    For each lesson index from 1 to 10, the page at
    ``miskito_base_url.format(idx=idx)`` is downloaded and parsed. The practice
    sections (right-floated, 300px-wide ``<div>`` elements) are converted into
    question/answer records and then removed from the parsed page, so the
    remaining content no longer contains the exercises.

    Returns:
        A tuple ``(qa_pairs_by_lesson, articles_without_qa_pairs)``:

        * ``qa_pairs_by_lesson`` maps ``"lesson_<idx>"`` to a list of dicts
          with the keys ``"question"``, ``"answer"`` and ``"instructions"``.
        * ``articles_without_qa_pairs`` is a list with one parsed content
          element per lesson, in lesson order, with the practice sections
          stripped out.

    Raises:
        requests.HTTPError: If a lesson page responds with an error status.
        requests.Timeout: If a lesson page does not respond in time.
        ValueError: If a lesson page has no main content container.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    request_timeout_seconds = 30

    qa_pairs_by_lesson = {}
    articles_without_qa_pairs = []

    for idx in range(1, 11):
        url = miskito_base_url.format(idx=idx)
        response = requests.get(url, timeout=request_timeout_seconds)
        # Fail loudly instead of parsing an error page as if it were a lesson.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        content = soup.find("div", class_="mw-content-ltr mw-parser-output")
        if content is None:
            raise ValueError(f"Could not find the main content container in {url}")

        # Extract the question-answer pairs.
        divs_with_specific_style = content.find_all(
            "div", style=lambda value: value and "width:300px; float:right;" in value
        )

        lesson_qa_pairs = []
        for i, div in enumerate(divs_with_specific_style):
            if i == 0 and idx == 1:
                # First section of first lesson is not in the same format.
                instructions, questions, answers = _parse_first_section_div(div)
            else:
                instructions, questions, answers = process_practice_section_div(div)
            lesson_qa_pairs.extend(
                {"question": q, "answer": a, "instructions": instructions}
                for q, a in zip(questions, answers)
            )
        qa_pairs_by_lesson[f"lesson_{idx}"] = lesson_qa_pairs

        # Remove them from the page and store the page contents.
        for div in divs_with_specific_style:
            div.decompose()
        articles_without_qa_pairs.append(content)

    return qa_pairs_by_lesson, articles_without_qa_pairs