in collection/download_commoncrawl_passages.py [0:0]
import gzip
import json
import logging
import os
import urllib.request
from pathlib import Path
from typing import Tuple

from warcio.archiveiterator import ArchiveIterator

# `wet_files_cache` is expected to be a module-level Path pointing at the local
# download cache for WET files (defined elsewhere in this module).


def process_wet_file(tup: Tuple[Path, str, str, Path]) -> None:
    """Download a WET file and extract the webpages whose URL is in the filter list."""
    filter_list, wet_name, wet_url, commoncrawl_docs_root = tup

    # Load the set of URLs to keep from this WET file.
    accepted_urls = set()
    with open(filter_list) as f:
        for line in f:
            accepted_urls.add(line.rstrip())

    # Download the WET file, retrying up to 3 times on failure.
    attempt = 0
    while attempt < 3:
        try:
            urllib.request.urlretrieve(wet_url, wet_files_cache / wet_name)
            break
        except Exception:
            logging.exception(f'Error while downloading {wet_url}')
            attempt += 1
    if not (wet_files_cache / wet_name).exists():
        logging.error(
            f'Failed to download {wet_url} after 3 attempts. Ignoring file...'
        )
        return

    # Stream the gzipped WET archive and write each accepted record as a JSON line.
    with gzip.open(wet_files_cache / wet_name, 'rb') as stream, open(
        commoncrawl_docs_root / f'{wet_name}.jsonl', 'w'
    ) as f:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'conversion':
                continue
            url = record.rec_headers.get_header('WARC-Target-URI')
            if url not in accepted_urls:
                continue
            contents = record.content_stream().read().decode('utf-8')
            if contents.startswith('404 Not Found'):
                continue
            output_dict = {'id': url, 'contents': contents}
            f.write(json.dumps(output_dict) + '\n')

    # Delete the cached WET file once it has been processed.
    os.remove(wet_files_cache / wet_name)
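

# Usage sketch (not part of the original file): the single-tuple argument suggests the
# function is meant to be mapped over many WET files in parallel, e.g. with
# multiprocessing. All paths, file names, and the URL below are illustrative
# assumptions, not values taken from the source.
if __name__ == '__main__':
    from multiprocessing import Pool

    tasks = [
        (
            Path('filters/example_wet.txt'),        # hypothetical per-WET URL filter list
            'example.warc.wet.gz',                  # hypothetical WET file name
            'https://data.commoncrawl.org/example.warc.wet.gz',  # hypothetical download URL
            Path('commoncrawl_docs'),               # hypothetical output root for .jsonl files
        ),
    ]
    with Pool(processes=4) as pool:
        pool.map(process_wet_file, tasks)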