in collection/download_commoncrawl_passages.py [0:0]
def get_docs_from_wet_files(parallelism: int, commoncrawl_docs_root: Path) -> None:
    """Download WET files and extract webpages whose URLs are in the filter lists."""
    wet_files_cache.mkdir(exist_ok=True)
    commoncrawl_docs_root.mkdir(exist_ok=True, parents=True)
    filter_lists = list(sampled_filter_lists_root.iterdir())
    # Resolve each sampled filter list to the WET file it was derived from,
    # then look up that WET file's download path.
    wet_paths = get_cc_wet_paths()
    wet_names = []
    resolved_wet_paths = []
    for filter_list in filter_lists:
        wet_filename = str(filter_list.name).replace('.warc.gz.txt', '.warc.wet.gz')
        wet_names.append(wet_filename)
        resolved_wet_paths.append(wet_paths[wet_filename])
    # Each worker receives one (filter_list, wet_name, wet_path, output_root)
    # tuple; imap_unordered lets progress be logged as soon as any file finishes.
    with Pool(parallelism) as p:
        for i, _ in enumerate(
            p.imap_unordered(
                process_wet_file,
                zip(
                    filter_lists,
                    wet_names,
                    resolved_wet_paths,
                    [commoncrawl_docs_root] * len(filter_lists),
                ),
            )
        ):
            if (i + 1) % 50 == 0:
                logging.info(f'Processed {i + 1} / {len(filter_lists)} WET files...')
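

# A minimal sketch of the worker mapped by imap_unordered above, included for
# illustration only; the module's real process_wet_file may differ. It assumes
# that each filter list is a plain-text file with one target URL per line,
# that get_cc_wet_paths() resolves WET names to full download URLs, that
# wet_files_cache is the module-level cache directory used above, and that the
# third-party `requests` and `warcio` packages are available.
import json
import shutil

import requests
from warcio.archiveiterator import ArchiveIterator


def process_wet_file(args):
    filter_list, wet_name, wet_url, docs_root = args
    wanted_urls = set(filter_list.read_text().splitlines())

    # Download the gzipped WET file into the local cache unless already there.
    local_wet = wet_files_cache / wet_name
    if not local_wet.exists():
        response = requests.get(wet_url, stream=True, timeout=120)
        response.raise_for_status()
        with open(local_wet, 'wb') as f:
            shutil.copyfileobj(response.raw, f)

    # Keep only the plain-text ('conversion') records whose target URI appears
    # in the filter list, writing them out as JSON lines, one per webpage.
    out_path = docs_root / wet_name.replace('.warc.wet.gz', '.jsonl')
    with open(local_wet, 'rb') as stream, open(out_path, 'w') as out:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'conversion':
                continue
            url = record.rec_headers.get_header('WARC-Target-URI')
            if url in wanted_urls:
                text = record.content_stream().read().decode('utf-8', errors='replace')
                out.write(json.dumps({'url': url, 'text': text}) + '\n')


# Example invocation (values are illustrative, not taken from the script):
#   get_docs_from_wet_files(parallelism=8, commoncrawl_docs_root=Path('data/commoncrawl_docs'))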