def extract_text()

in collection/download_wayback_passages.py [0:0]


def extract_text(html_text: str) -> str:
    """Extract the text content of an HTML document.

    The markup is parsed with Beautiful Soup's ``html.parser`` backend and
    every text node in the tree is visited. Nodes whose direct parent tag
    name appears in the module-level ``blacklist`` are skipped.

    Args:
        html_text: The HTML markup to parse.

    Returns:
        The kept text nodes concatenated in document order, each followed
        by a single space (so a non-empty result ends with a space). No
        whitespace stripping or collapsing is performed. Returns an empty
        string when no text node is kept.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    # NOTE(review): newer Beautiful Soup releases name this argument
    # ``string``; ``text`` is kept here so behaviour does not depend on the
    # installed version -- confirm before switching.
    text_nodes = soup.find_all(text=True)
    # Build the result in one join instead of repeated ``+=`` so the cost
    # stays linear in the amount of text.
    return ''.join(
        f'{node} '
        for node in text_nodes
        if node.parent.name not in blacklist
    )