in cc_net/get_wiki_cirrus.py [0:0]
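# Module-level context assumed for this excerpt: the function relies on the
# imports below plus two constants defined elsewhere in the file, CIRRUS_URL
# (the Cirrus dump index URL) and CIRRUS_DUMP_RE (a regex matching dump file
# names and capturing the dump name in group 1).
import urllib.request
from typing import Dict

from bs4 import BeautifulSoup
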
def get_cirrus_urls(date: str = None) -> Dict[str, str]:
    if date is None:
        cirrus_page = BeautifulSoup(
            urllib.request.urlopen(CIRRUS_URL), features="html.parser"
        )
        dumps = [a.get("href").strip("/") for a in cirrus_page.findAll("a")]
        dumps.remove("..")
        dumps.remove("current")
        # We take the oldest dump since the most recent might be incomplete.
        # The page only links to the N latest dumps, so the dump won't be too old.
        date = min(dumps)

    cirrus_url = "/".join((CIRRUS_URL, date))
    print("Will use the Wikipedia dump from:", date, cirrus_url)
    cirrus_page = BeautifulSoup(
        urllib.request.urlopen(cirrus_url), features="html.parser"
    )
    # Keep only the links matching CIRRUS_DUMP_RE, keyed by the dump name
    # captured in the first regex group.
    urls = {}
    for link in cirrus_page.findAll("a"):
        match = CIRRUS_DUMP_RE.match(link.get("href"))
        if not match:
            continue
        urls[match.group(1)] = "/".join([cirrus_url, link.get("href")])
    assert urls, f"No valid download urls found at {cirrus_url}"
    return urls
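

# A minimal usage sketch, added for illustration rather than taken from the
# original file: with no date argument the function picks the oldest dump
# listed on the index page and returns a mapping of dump name -> download URL.
# The "enwiki" key is hypothetical; the actual keys depend on what
# CIRRUS_DUMP_RE captures from the linked file names.
if __name__ == "__main__":
    urls = get_cirrus_urls()
    print(f"Found {len(urls)} Cirrus dumps")
    if "enwiki" in urls:  # hypothetical key, inspect the dict for real names
        print("English Wikipedia dump:", urls["enwiki"])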