in cc_net/tools/dl_cc_100.py [0:0]
def _dl_shard(snapshot: str, shard: int) -> Iterator[Paragraph]:
"""
Download metadata from a shards.
Sample metadata:
{
"cc_segment": "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz",
"digest": "sha1:222LWNHN5FM26XGS7WJSMI6IISTVWBKJ",
"url": "http://personals.gearplay.com/ads/DRJONES.htm",
"line_ids": [10],
"languages": ["en_XX"],
"lm_scores": [-2.658],
}
"""
snapshot = snapshot.replace("-", "_")
name = f"snap_{snapshot}_batch_{shard}.json.gz"
url = "/".join([S3_BUCKET, VERSION, name])
shard_metadata: Dict[str, Dict[str, dict]] = defaultdict(dict)
try:
cache_file: Optional[Path] = None
if WET_CACHE is not None:
cache_file = WET_CACHE / name
metadata_file = jsonql.open_remote_file(url, cache_file)
except:
logging.warning(f"Couldn't open {url}")
return
for meta in jsonql.read_jsons(metadata_file):
shard_metadata[meta["cc_segment"]][meta["digest"]] = meta
found_pars, missed_pars = 0, 0
for seg, segment_metadata in shard_metadata.items():
for doc in CCSegmentsReader([seg], cache_dir=WET_CACHE):
if doc["digest"] not in segment_metadata:
continue
meta = segment_metadata[doc["digest"]]
full_pars = [doc["title"]] + doc["raw_content"].split("\n")
assert len(meta["line_ids"]) == len(meta["languages"])
assert len(meta["line_ids"]) == len(meta["lm_scores"])
for i, lang, score in zip(
meta["line_ids"], meta["languages"], meta["lm_scores"]
):
if snapshot != "2018-51" and lang in BIG_LANGUAGES:
# Big languages only come from "2018-51" snapshot
continue
if i >= len(full_pars):
# This is because CC100 was created by saving only urls.
# Some urls appears in different snapshot with slightly different
# versions, but we don't know which one is correct.
# Here we read both versions, but some index may end up
# being incorrect.
# This impact ~3% documents.
missed_pars += 1
continue
yield Paragraph(lang, full_pars[i], score)
found_pars += 1
if missed_pars > 0:
logging.warning(
f"Missed {missed_pars} ({missed_pars / found_pars:%}) paragraphes."
)