def get_cc_wet_paths()

in collection/download_commoncrawl_passages.py [0:0]


def get_cc_wet_paths(
    paths_url: str = 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-47/wet.paths.gz',
    base_url: str = 'https://commoncrawl.s3.amazonaws.com/',
) -> Dict[str, str]:
    """Get a dict of WET file name to WET URL.

    Fetches the gzip-compressed listing at ``paths_url``, which holds one
    relative WET path per line, and maps the last path component of each
    line (the file name) to the absolute URL formed by joining ``base_url``
    and the relative path.

    Args:
        paths_url: URL of the gzip-compressed ``wet.paths`` listing. Any
            scheme ``urllib.request.urlopen`` understands is accepted
            (including ``file://``). Defaults to the CC-MAIN-2019-47 crawl.
        base_url: Prefix prepended to every relative path from the listing.

    Returns:
        Dict mapping WET file name to its full download URL. If two lines
        share a file name, the later line wins.

    Raises:
        urllib.error.URLError: If the listing cannot be fetched.
        OSError: If the payload is not valid gzip data.
        UnicodeDecodeError: If the listing is not valid UTF-8.
    """
    # NOTE(review): Common Crawl now documents https://data.commoncrawl.org/
    # as the public HTTP endpoint; confirm whether the defaults above should
    # be migrated. They are kept unchanged here so existing callers see the
    # same URLs as before.
    prefix = base_url.rstrip('/') + '/'
    wet_urls = {}
    # Stream the response straight through gzip instead of downloading to a
    # NamedTemporaryFile and reopening it by name: reopening an open
    # NamedTemporaryFile fails on Windows, and the temp file is not needed.
    with urllib.request.urlopen(paths_url) as response:
        with gzip.open(response, 'rt', encoding='utf-8') as listing:
            for raw_line in listing:
                line = raw_line.rstrip()
                if not line:
                    # A blank line would otherwise create a bogus '' entry
                    # pointing at the bare base URL.
                    continue
                filename = line.split('/')[-1]
                wet_urls[filename] = f'{prefix}{line}'

    return wet_urls