def get_docs_from_wet_files()

in collection/download_commoncrawl_passages.py [0:0]


def get_docs_from_wet_files(parallelism: int, commoncrawl_docs_root: Path) -> None:
    """Download WET files and extract webpages whose URLs is in the filter list."""
    wet_files_cache.mkdir(exist_ok=True)
    commoncrawl_docs_root.mkdir(exist_ok=True, parents=True)

    filter_lists = list(sampled_filter_lists_root.iterdir())

    # Fetch the mapping from WET filename to its download path
    wet_paths = get_cc_wet_paths()
    wet_names = []
    resolved_wet_paths = []
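    # Each filter list is named after its WARC file; derive the matching WET
    # filename and look up its download path in the index.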
    for filter_list in filter_lists:
        wet_filename = str(filter_list.name).replace('.warc.gz.txt', '.warc.wet.gz')
        wet_names.append(wet_filename)
        resolved_wet_paths.append(wet_paths[wet_filename])

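    # Process the WET files in parallel; imap_unordered yields results as each
    # worker finishes, which lets us log progress incrementally.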
    with Pool(parallelism) as p:
        for i, _ in enumerate(
            p.imap_unordered(
                process_wet_file,
                zip(
                    filter_lists,
                    wet_names,
                    resolved_wet_paths,
                    [commoncrawl_docs_root for _ in range(len(filter_lists))],
                ),
            )
        ):
            if (i + 1) % 50 == 0:
                logging.info(f'Processed {i + 1} / {len(filter_lists)} WET files...')
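
The tuples produced by zip above are consumed by process_wet_file, which is defined elsewhere in this module (as are wet_files_cache, sampled_filter_lists_root, and get_cc_wet_paths). A minimal sketch of such a worker is given below, assuming the filter list holds one URL per line, that WET records are parsed with warcio, that the resolved path is relative to the public Common Crawl download host, and that output is written as one JSONL file per WET file; none of these details are confirmed by this excerpt.

# Hypothetical worker sketch -- the real process_wet_file in
# collection/download_commoncrawl_passages.py may differ.
import json
import urllib.request
from pathlib import Path

from warcio.archiveiterator import ArchiveIterator  # assumed WET parser

CC_HOST = 'https://data.commoncrawl.org/'  # assumed download prefix
wet_files_cache = Path('wet_files_cache')  # stand-in for the module-level cache path


def process_wet_file(args):
    filter_list, wet_name, wet_path, commoncrawl_docs_root = args

    # URLs to keep: one per line in the filter list (assumed format).
    wanted_urls = set(filter_list.read_text().splitlines())

    # Download the WET file into the local cache unless it is already present.
    local_wet = wet_files_cache / wet_name
    if not local_wet.exists():
        urllib.request.urlretrieve(CC_HOST + wet_path, str(local_wet))

    # Stream the WET 'conversion' records and keep those whose target URI
    # appears in the filter list, writing one JSON document per line
    # (assumed output layout).
    out_path = commoncrawl_docs_root / wet_name.replace('.warc.wet.gz', '.jsonl')
    with open(local_wet, 'rb') as stream, out_path.open('w') as out:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'conversion':
                continue
            url = record.rec_headers.get_header('WARC-Target-URI')
            if url in wanted_urls:
                text = record.content_stream().read().decode('utf-8', errors='ignore')
                out.write(json.dumps({'url': url, 'text': text}) + '\n')

A driver call would then look like get_docs_from_wet_files(parallelism=8, commoncrawl_docs_root=Path('commoncrawl_docs')), with the worker count tuned to available bandwidth and CPU.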