def main()

in collection/download_commoncrawl_passages.py [0:0]


def main(parallelism: int, commoncrawl_docs_root: Path):
    cc_index_paths = get_cc_index_paths()

    # Construct filter lists: one file per WET archive, listing the URLs to keep
    if filter_lists_root.exists():
        shutil.rmtree(filter_lists_root)
    filter_lists_root.mkdir(exist_ok=True)

    # Process the Common Crawl index files in batches of `parallelism`, one worker per index file
    for i in range(0, len(cc_index_paths), parallelism):
        with Pool(parallelism) as p:
            logging.info(
                f'Processing Common Crawl index {i+1}-{min(i + parallelism, len(cc_index_paths))} / {len(cc_index_paths)}...'
            )
            partial_filter_lists = p.map(
                process_cc_index, cc_index_paths[i : i + parallelism]
            )
            # Append the matched URLs to a per-WET-file filter list
            for partial_filter_list in partial_filter_lists:
                for wet_filename, urls in partial_filter_list.items():
                    with open(filter_lists_root / f'{wet_filename}.txt', 'a') as f:
                        for url in urls:
                            f.write(url + '\n')

    # Create sampled filter lists
    logging.info('Sorting and sampling filter lists...')
    sample_filter_lists()

    # Download WET files and filter records
    logging.info('Processing WET files...')
    get_docs_from_wet_files(parallelism, commoncrawl_docs_root)

    # Remove temporary files
    logging.info('Done processing WET files, removing temporary directories...')
    shutil.rmtree(index_files_root)
    shutil.rmtree(filter_lists_root)
    shutil.rmtree(sampled_filter_lists_root)
    shutil.rmtree(wet_files_cache)
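
For reference, a minimal sketch of how this function might be wired to a command-line entry point, assuming an argparse-based interface; the flag names `--parallelism` and `--commoncrawl-docs-root` and the default value are illustrative assumptions, not taken from the original script:

if __name__ == '__main__':
    import argparse
    import logging
    from pathlib import Path

    logging.basicConfig(level=logging.INFO)

    # Hypothetical CLI wrapper around main(); flag names are assumptions
    parser = argparse.ArgumentParser(
        description='Download Common Crawl WET files and extract filtered passages.'
    )
    parser.add_argument('--parallelism', type=int, default=8,
                        help='Number of worker processes per batch (illustrative default)')
    parser.add_argument('--commoncrawl-docs-root', type=Path, required=True,
                        help='Directory where the filtered Common Crawl documents are written')
    args = parser.parse_args()

    main(args.parallelism, args.commoncrawl_docs_root)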