in collection/download_commoncrawl_passages.py [0:0]
def main(parallelism: int, commoncrawl_docs_root: Path):
    cc_index_paths = get_cc_index_paths()
    # Construct filter lists
    if filter_lists_root.exists():
        shutil.rmtree(filter_lists_root)
    filter_lists_root.mkdir(exist_ok=True)
    for i in range(0, len(cc_index_paths), parallelism):
        with Pool(parallelism) as p:
            logging.info(
                f'Processing Common Crawl index {i+1}-{min(i + parallelism, len(cc_index_paths))} / {len(cc_index_paths)}...'
            )
            partial_filter_lists = p.map(
                process_cc_index, cc_index_paths[i : i + parallelism]
            )
        # Append each index batch's URLs to per-WET-file filter lists
        for partial_filter_list in partial_filter_lists:
            for wet_filename, urls in partial_filter_list.items():
                with open(filter_lists_root / f'{wet_filename}.txt', 'a') as f:
                    for url in urls:
                        f.write(url + '\n')
    # Create sampled filter lists
    logging.info('Sorting and sampling filter lists...')
    sample_filter_lists()
    # Download WET files and filter records
    logging.info('Processing WET files...')
    get_docs_from_wet_files(parallelism, commoncrawl_docs_root)
    # Remove temporary files
    logging.info('Done processing WET files, removing temporary directories...')
    shutil.rmtree(index_files_root)
    shutil.rmtree(filter_lists_root)
    shutil.rmtree(sampled_filter_lists_root)
    shutil.rmtree(wet_files_cache)
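
A minimal sketch of how this entry point might be invoked from the command line, assuming an argparse-based wrapper; the flag names and defaults below are illustrative and not taken from the repository:

import argparse
import logging
from pathlib import Path

if __name__ == '__main__':
    # Hypothetical CLI wiring; flag names and defaults are assumptions.
    parser = argparse.ArgumentParser(
        description='Download and filter Common Crawl passages.'
    )
    parser.add_argument('--parallelism', type=int, default=4,
                        help='Number of worker processes for index and WET processing.')
    parser.add_argument('--commoncrawl-docs-root', type=Path,
                        default=Path('commoncrawl_docs'),
                        help='Output directory for the filtered documents.')
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    main(args.parallelism, args.commoncrawl_docs_root)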