def process_wet_file()

in collection/download_commoncrawl_passages.py [0:0]


import gzip
import json
import logging
import os
import urllib.request
from pathlib import Path
from typing import Tuple

# Assumed to be warcio's ArchiveIterator; the record API used below matches warcio.
from warcio.archiveiterator import ArchiveIterator

# wet_files_cache (the local WET download cache) is a module-level Path defined
# elsewhere in download_commoncrawl_passages.py.


def process_wet_file(tup: Tuple[Path, str, str, Path]) -> None:
    """Download a WET file and extract the pages whose URL appears in the filter list."""
    filter_list, wet_name, wet_url, commoncrawl_docs_root = tup
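    # Load the filter list: one accepted URL per line.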
    accepted_urls = set()
    with open(filter_list) as f:
        for line in f:
            accepted_urls.add(line.rstrip())

    # Download the WET file, retrying up to three times before giving up.
    attempt = 0
    while attempt < 3:
        try:
            urllib.request.urlretrieve(wet_url, wet_files_cache / wet_name)
            break
        except Exception:
            logging.exception(f'Error while downloading {wet_url}')
            attempt += 1

    if not (wet_files_cache / wet_name).exists():
        logging.error(
            f'Failed to download {wet_url} after 3 attempts. Ignoring file...'
        )
        return

    # WET 'conversion' records hold the plain-text extraction of each crawled page;
    # keep only the accepted URLs and write one JSON line per page.
    with gzip.open(wet_files_cache / wet_name, 'rb') as stream, open(
        commoncrawl_docs_root / f'{wet_name}.jsonl', 'w'
    ) as f:
        for record in ArchiveIterator(stream):
            if record.rec_type == 'conversion':
                url = record.rec_headers.get_header('WARC-Target-URI')
                if url not in accepted_urls:
                    continue

                contents = record.content_stream().read().decode('utf-8')
                if contents.startswith('404 Not Found'):
                    continue

                output_dict = {'id': url, 'contents': contents}

                f.write(json.dumps(output_dict) + '\n')

    # Delete the cached WET file once its accepted pages have been written out.
    os.remove(wet_files_cache / wet_name)
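
The single-tuple signature suggests the function is meant to be mapped over worker processes. Below is a minimal driver sketch under that assumption; the pool size, directory names, WET file names, and URL are illustrative placeholders, not values taken from the repository. Each line of the resulting {wet_name}.jsonl has the form {"id": "<page URL>", "contents": "<extracted text>"}.

from multiprocessing import Pool
from pathlib import Path

# Illustrative paths only; the real driver and WET file list live elsewhere in
# download_commoncrawl_passages.py.
commoncrawl_docs_root = Path('commoncrawl_docs')
filter_lists_root = Path('filter_lists')

wet_names = ['CC-MAIN-...-00000.warc.wet.gz']  # placeholder WET file names

# One task per WET file: (filter list path, WET name, WET URL, output root).
wet_tasks = [
    (
        filter_lists_root / f'{name}.txt',
        name,
        f'https://data.commoncrawl.org/crawl-data/.../{name}',  # placeholder URL
        commoncrawl_docs_root,
    )
    for name in wet_names
]

if __name__ == '__main__':
    with Pool(processes=8) as pool:
        # imap_unordered yields as each worker finishes, so one slow download
        # does not hold up the rest.
        for _ in pool.imap_unordered(process_wet_file, wet_tasks):
            pass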