def _dl_shard()

in cc_net/tools/dl_cc_100.py


def _dl_shard(snapshot: str, shard: int) -> Iterator[Paragraph]:
    """
    Download the metadata for one shard and yield the corresponding paragraphs.

    Sample metadata:

    {
        "cc_segment": "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz",
        "digest": "sha1:222LWNHN5FM26XGS7WJSMI6IISTVWBKJ",
        "url": "http://personals.gearplay.com/ads/DRJONES.htm",
        "line_ids": [10],
        "languages": ["en_XX"],
        "lm_scores": [-2.658],
    }
    """
    snapshot = snapshot.replace("-", "_")
    name = f"snap_{snapshot}_batch_{shard}.json.gz"
    url = "/".join([S3_BUCKET, VERSION, name])
    shard_metadata: Dict[str, Dict[str, dict]] = defaultdict(dict)
    try:
        cache_file: Optional[Path] = None
        if WET_CACHE is not None:
            cache_file = WET_CACHE / name
        metadata_file = jsonql.open_remote_file(url, cache_file)
    except Exception:
        logging.warning(f"Couldn't open {url}")
        return

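    # Index metadata by CC segment, then by document digest, so each WET
    # document can be matched in constant time while streaming its segment.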
    for meta in jsonql.read_jsons(metadata_file):
        shard_metadata[meta["cc_segment"]][meta["digest"]] = meta

    found_pars, missed_pars = 0, 0
    for seg, segment_metadata in shard_metadata.items():
        for doc in CCSegmentsReader([seg], cache_dir=WET_CACHE):
            if doc["digest"] not in segment_metadata:
                continue

            meta = segment_metadata[doc["digest"]]
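            # line_ids index into the title (position 0) followed by the
            # document's raw content lines.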
            full_pars = [doc["title"]] + doc["raw_content"].split("\n")

            assert len(meta["line_ids"]) == len(meta["languages"])
            assert len(meta["line_ids"]) == len(meta["lm_scores"])
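            # The three lists are parallel: one (line id, language, lm score)
            # triple per paragraph kept from this document.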
            for i, lang, score in zip(
                meta["line_ids"], meta["languages"], meta["lm_scores"]
            ):
                if snapshot != "2018_51" and lang in BIG_LANGUAGES:
                    # Big languages only come from the "2018-51" snapshot
                    # (note: snapshot was normalized to underscores above).
                    continue
                if i >= len(full_pars):
                    # CC100 was created by saving only urls. Some urls appear
                    # in several snapshots with slightly different content,
                    # and we don't know which version is correct, so we read
                    # all of them; some line ids may then be out of range.
                    # This impacts ~3% of documents.
                    missed_pars += 1
                    continue

                yield Paragraph(lang, full_pars[i], score)
                found_pars += 1
    if missed_pars > 0:
        logging.warning(
            f"Missed {missed_pars} ({missed_pars / max(found_pars, 1):.1%}) paragraphs."
        )
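
A minimal consumption sketch (not part of the module): it assumes only what
the code above shows, namely that each yielded Paragraph is a tuple-like
record of (lang, text, score). The helper name and the snapshot/shard values
are illustrative.

from collections import Counter

def shard_language_stats(snapshot: str, shard: int) -> Counter:
    """Count how many kept paragraphs each language contributes."""
    counts: Counter = Counter()
    for lang, _text, _score in _dl_shard(snapshot, shard):
        counts[lang] += 1
    return counts

# e.g. shard_language_stats("2018-51", 0)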