collection/download_commoncrawl_passages.py
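The function below relies on a few standard-library imports and a module-level `index_files_root` path that this excerpt does not show. The preamble below is a minimal sketch under that assumption; the directory name is illustrative, not taken from the original file.

import gzip
import json
import logging
import re
import urllib.request
from collections import defaultdict
from pathlib import Path
from typing import Dict, List

# Assumption: defined at module level elsewhere in the script; any local cache
# directory for downloaded index files works here.
index_files_root = Path('cc_index_files')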
def process_cc_index(index_url: str) -> Dict[str, List[str]]:
    """Return a map from WET filename to the list of page URLs it contains."""
    # Download the gzipped index file unless a local copy already exists.
    filename = index_url.split('/')[-1]
    index_files_root.mkdir(exist_ok=True)
    if not (index_files_root / filename).exists():
        urllib.request.urlretrieve(index_url, index_files_root / filename)

    # Parse the index file. Each line has the form
    # "<urlkey> <timestamp> <json metadata>"; the regex captures the JSON part.
    wet_to_urls = defaultdict(list)
    cc_index_line_pattern = re.compile(r'^\S+ \d+ (.*)$')
    with gzip.open(index_files_root / filename, 'rb') as f:
        for line in f:
            line = line.decode('utf-8').rstrip()
            match = cc_index_line_pattern.match(line)
            if match:
                url_metadata = json.loads(match.group(1))
                # Keep only successfully fetched, English-language HTML pages.
                if (
                    url_metadata['status'] == '200'
                    and url_metadata.get('languages') == 'eng'
                    and url_metadata['mime'] == 'text/html'
                ):
                    wet_filename = url_metadata['filename'].split('/')[-1]
                    wet_to_urls[wet_filename].append(url_metadata['url'])
            else:
                logging.error(f'Line in index file cannot be matched by regex: {line}')
    return wet_to_urls
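A typical call site would pass one cdx index shard URL for a crawl snapshot. The snippet below is only an illustrative sketch; the crawl ID and shard name are assumptions, and the real script presumably builds these URLs from the crawl's index listing.

if __name__ == '__main__':
    # Illustrative only: a single cdx shard from one Common Crawl snapshot.
    example_index_url = (
        'https://data.commoncrawl.org/cc-index/collections/'
        'CC-MAIN-2023-06/indexes/cdx-00000.gz'
    )
    wet_to_urls = process_cc_index(example_index_url)
    logging.info(f'Collected URLs from {len(wet_to_urls)} WET files')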