in collection/download_commoncrawl_passages.py [0:0]
def get_cc_index_paths(
    crawl_id: str = 'CC-MAIN-2019-47',
    base_url: str = 'https://commoncrawl.s3.amazonaws.com',
) -> List[str]:
    """Get a list of URLs for Common Crawl URL index files.

    Fetches ``<base_url>/crawl-data/<crawl_id>/cc-index.paths.gz``, a
    gzip-compressed text file with one relative path per line, and returns
    an absolute URL for every listed path that ends in ``.gz``. Other
    entries in the listing are skipped.

    Args:
        crawl_id: Identifier of the crawl whose index listing is fetched.
        base_url: Root URL that both the listing and the returned paths are
            resolved against. A trailing slash is ignored. Any scheme
            understood by ``urllib.request.urlopen`` works (including
            ``file://``).
            NOTE(review): Common Crawl also serves data from
            https://data.commoncrawl.org - confirm the default host is
            still publicly reachable.

    Returns:
        Absolute URLs of the ``.gz`` index files, in listing order.

    Raises:
        urllib.error.URLError: If the listing cannot be fetched.
        OSError: If the fetched payload is not valid gzip data.
        UnicodeDecodeError: If a listing line is not valid UTF-8.
    """
    base_url = base_url.rstrip('/')
    listing_url = f'{base_url}/crawl-data/{crawl_id}/cc-index.paths.gz'

    # Decompress straight from the response stream. Going through a
    # NamedTemporaryFile and re-opening it by name while it is still open
    # does not work on Windows, and the disk round-trip is not needed.
    with urllib.request.urlopen(listing_url) as response:
        with gzip.open(response, 'rb') as listing:
            paths = (raw_line.decode('utf-8').rstrip() for raw_line in listing)
            return [
                f'{base_url}/{path}' for path in paths if path.endswith('.gz')
            ]