def get_cc_index_paths()

in collection/download_commoncrawl_passages.py [0:0]


def get_cc_index_paths(
    crawl_id: str = 'CC-MAIN-2019-47',
    base_url: str = 'https://commoncrawl.s3.amazonaws.com',
) -> List[str]:
    """Get a list of paths for Common Crawl URL index files.

    Fetches the gzip-compressed ``cc-index.paths.gz`` listing of a crawl and
    returns an absolute URL for every entry in it that ends with ``.gz``.
    Other entries in the listing are skipped.

    The listing is decompressed directly from the response stream rather than
    being written to a named temporary file first: re-opening a
    ``NamedTemporaryFile`` by name while it is still open does not work on
    Windows, and the intermediate file is unnecessary.

    NOTE(review): Common Crawl appears to serve data from
    ``https://data.commoncrawl.org`` nowadays; the default ``base_url`` is kept
    as-is for backward compatibility -- confirm whether it should be updated.

    Args:
        crawl_id: Identifier of the crawl whose index listing is fetched.
        base_url: Root URL the listing is fetched from; it is also prepended
            to every returned path. A trailing slash is ignored.

    Returns:
        Absolute URLs of the index files, in the order they appear in the
        listing.

    Raises:
        urllib.error.URLError: If the listing cannot be fetched.
        OSError: If the fetched data is not valid gzip.
        UnicodeDecodeError: If a listing line is not valid UTF-8.
    """
    # Normalise so that joining with '/' never produces a double slash.
    root = base_url.rstrip('/')
    listing_url = f'{root}/crawl-data/{crawl_id}/cc-index.paths.gz'

    with urllib.request.urlopen(listing_url) as response:
        # gzip only needs a readable file object, so the response can be
        # decompressed on the fly.
        with gzip.open(response, 'rb') as listing:
            entries = (raw.decode('utf-8').rstrip() for raw in listing)
            return [f'{root}/{entry}' for entry in entries if entry.endswith('.gz')]