def process_cc_index()

in collection/download_commoncrawl_passages.py [0:0]


def process_cc_index(index_url: str) -> Dict[str, List[str]]:
    """Return a mapping from WET filename to the list of URLs it contains."""
    # Download the index file into the local cache directory
    # (index_files_root, defined at module level); skip it if already cached.
    filename = index_url.split('/')[-1]
    index_files_root.mkdir(exist_ok=True)
    if not (index_files_root / filename).exists():
        urllib.request.urlretrieve(index_url, index_files_root / filename)

    # Parse the index file. Each line has the form
    # '<SURT key> <timestamp> <JSON metadata>'; the JSON part is captured below.
    wet_to_urls = defaultdict(list)
    cc_index_line_pattern = re.compile(r'^\S+ \d+ (.*)$')
    with gzip.open(index_files_root / filename, 'rb') as f:
        for line in f:
            line = line.decode('utf-8').rstrip()
            match = cc_index_line_pattern.match(line)
            if match:
                url_metadata = json.loads(match.group(1))
                # Keep only successfully fetched (HTTP 200) English HTML pages.
                if (
                    url_metadata['status'] == '200'
                    and url_metadata.get('languages') == 'eng'
                    and url_metadata['mime'] == 'text/html'
                ):
                    wet_filename = url_metadata['filename'].split('/')[-1]
                    wet_to_urls[wet_filename].append(url_metadata['url'])
            else:
                logging.error(f'Line in index file cannot be matched by regex: {line}')

    return wet_to_urls
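
A minimal usage sketch for reference; the Common Crawl collection id and shard
name in the URL are illustrative placeholders, and the import path simply
mirrors the file location shown above:

from collection.download_commoncrawl_passages import process_cc_index

# Hypothetical CDX index shard URL; substitute a real collection id and shard.
index_url = (
    'https://data.commoncrawl.org/cc-index/collections/'
    'CC-MAIN-2023-50/indexes/cdx-00000.gz'
)

wet_to_urls = process_cc_index(index_url)
print(f'{len(wet_to_urls)} WET files referenced')
for wet_filename, urls in list(wet_to_urls.items())[:3]:
    print(f'{wet_filename}: {len(urls)} URLs')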