in collection/download_commoncrawl_passages.py [0:0]
def get_cc_wet_paths(
    *,
    paths_url: str = (
        'https://commoncrawl.s3.amazonaws.com/'
        'crawl-data/CC-MAIN-2019-47/wet.paths.gz'
    ),
    base_url: str = 'https://commoncrawl.s3.amazonaws.com/',
) -> Dict[str, str]:
    """Get a dict of WET file name to WET URL.

    Fetches the gzip-compressed WET path listing at ``paths_url`` and maps the
    last path component of every listed entry to its full download URL.

    Args:
        paths_url: URL of the gzip-compressed listing, one relative path per
            line. Defaults to the CC-MAIN-2019-47 listing.
        base_url: Prefix joined with each listed path to form the download
            URL. A trailing slash is optional.

    Returns:
        A dict mapping each WET file name to its URL. If two listed paths end
        in the same file name, the later one wins.

    Raises:
        urllib.error.URLError: If the listing cannot be fetched.
        OSError: If the fetched payload is not valid gzip data.
    """
    # Normalise so callers may pass the prefix with or without a final '/'.
    prefix = base_url.rstrip('/') + '/'
    wet_urls: Dict[str, str] = {}
    # Decompress straight from the response stream. Going through a
    # NamedTemporaryFile that is reopened by name while still held open is
    # not portable (it fails on Windows) and costs an extra disk round-trip.
    with urllib.request.urlopen(paths_url) as response:
        with gzip.GzipFile(fileobj=response, mode='rb') as listing:
            for raw_line in listing:
                path = raw_line.decode('utf-8').strip()
                if not path:
                    # A blank line would otherwise create a '' -> prefix entry.
                    continue
                wet_urls[path.rsplit('/', 1)[-1]] = prefix + path
    return wet_urls