def get_cirrus_urls()

in cc_net/get_wiki_cirrus.py


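The snippet relies on a few module-level names that are not shown in this excerpt. A minimal sketch of what they could look like follows; the exact base URL and regex are assumptions based on Wikimedia's CirrusSearch dump layout, not copied from the file.

import re
import urllib.request
from typing import Dict, Optional

from bs4 import BeautifulSoup

# Assumed index page listing dated CirrusSearch dump directories.
CIRRUS_URL = "https://dumps.wikimedia.org/other/cirrussearch"
# Assumed pattern: captures the prefix of files such as
# "enwiki-20230102-cirrussearch-content.json.gz".
CIRRUS_DUMP_RE = re.compile(r"^(.*)wiki-\d+-cirrussearch-content\.json\.gz")
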
def get_cirrus_urls(date: Optional[str] = None) -> Dict[str, str]:
    if date is None:
        cirrus_page = BeautifulSoup(
            urllib.request.urlopen(CIRRUS_URL), features="html.parser"
        )
        dumps = [a.get("href").strip("/") for a in cirrus_page.findAll("a")]
        dumps.remove("..")
        dumps.remove("current")
        # We take the oldest dump since the most recent might be incomplete.
        # The page only links to the N latest dumps, so the dump won't be too old.
        date = min(dumps)

    cirrus_url = "/".join((CIRRUS_URL, date))
    print("Will use the Wikipedia dump from:", date, cirrus_url)
    cirrus_page = BeautifulSoup(
        urllib.request.urlopen(cirrus_url), features="html.parser"
    )
    urls = {}
    for link in cirrus_page.findAll("a"):
        match = CIRRUS_DUMP_RE.match(link.get("href"))
        if not match:
            continue

        urls[match.group(1)] = "/".join([cirrus_url, link.get("href")])
    assert urls, f"No valid download urls found at {cirrus_url}"
    return urls
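
A quick usage sketch, assuming the regex above captures the language prefix so that keys look like "en" or "fr":

urls = get_cirrus_urls()  # no date given: picks the oldest listed dump
en_url = urls.get("en")
if en_url is not None:
    # Download the English CirrusSearch content dump to the current directory.
    urllib.request.urlretrieve(en_url, "enwiki-cirrussearch-content.json.gz")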