in tasks/CCMatrix/dl_cc_matrix.py [0:0]
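# Standard-library imports that the function below relies on, plus a minimal sketch
# of the two record types it reads and writes. The sketch is inferred purely from the
# fields that dl_file accesses (segment, digest, ptr_start, ptr_end, lang_pair,
# line_no, score); the real definitions of these types, and of the helpers
# (get_typed_parser, open_remote_file, get_documents, clean_content,
# FileWriterWithTmp), live elsewhere in this module and are not shown here.
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, NamedTuple


class NormalizedBitextPtr(NamedTuple):
    # Sketch: a pointer into a Common Crawl segment, naming the source document
    # (by digest) and the character span of one aligned sentence.
    lang_pair: str  # e.g. "de-en"
    line_no: int
    segment: str    # "crawl-data/..." segment path
    digest: str     # "sha1:..." document digest
    ptr_start: int  # character offsets into the cleaned document
    ptr_end: int
    score: float = 0.0  # alignment score; dl_file falls back to 0.0 if absent


class Bitext(NamedTuple):
    # Sketch: one output row, printed tab-separated by dl_file.
    lang_pair: str
    line_no: int
    score: float
    text: str
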
def dl_file(metadata_dir: str, outdir: Path, file: str):
    metadata = "/".join((metadata_dir, file))
    parser = get_typed_parser(NormalizedBitextPtr)
    found_bitext, missed_bitext, skipped_line = 0, 0, 0
    segment = ""
    segment_downloads: Dict[str, int] = defaultdict(int)
    raw_documents: Dict[str, str] = {}
    cleaned_documents: Dict[str, str] = {}

    outfile = outdir / file
    if outfile.exists():
        # This shard has already been downloaded, nothing to do.
        return
    o = FileWriterWithTmp(outfile)
    for i, line in enumerate(open_remote_file(metadata)):
        try:
            bitext: NormalizedBitextPtr = parser(line)
            # Add a few extra asserts in case the line is invalid but still parses
            assert bitext.segment.startswith("crawl-data/")
            assert bitext.digest.startswith("sha1:")
        except AssertionError:
            logging.error(f"Skipping line {i}: {line}")
            skipped_line += 1
            continue
        if not segment or bitext.segment != segment:
            segment = bitext.segment
            segment_downloads[segment] += 1
            # Load segment in RAM, purge document cache
            raw_documents = get_documents(segment)
            cleaned_documents = {}

        raw_doc = raw_documents.get(bitext.digest)
        if raw_doc is None:
            logging.error(f"Document not found: {bitext.digest} in {segment}")
            missed_bitext += 1
            continue
        clean_doc = cleaned_documents.get(bitext.digest)
        if clean_doc is None:
            clean_doc = clean_content(raw_doc)
            cleaned_documents[bitext.digest] = clean_doc

        found_bitext += 1  # the pointer resolved to a document; emit one bitext row
        text = clean_doc[bitext.ptr_start : bitext.ptr_end]
        score = getattr(bitext, "score", 0.0)
        bt = Bitext(bitext.lang_pair, bitext.line_no, score, text)
        print(*bt, sep="\t", file=o)

    # Commit the output (FileWriterWithTmp presumably writes to a tmp file first).
    o.close(True)
    logging.info(f"Found {found_bitext} sentences, missed {missed_bitext} sentences.")
    if skipped_line > 0:
        logging.error(f"Skipped {skipped_line} unparsable lines")

    expected_dl = len(segment_downloads)
    actual_dl = sum(segment_downloads.values())
    if actual_dl != expected_dl:
        logging.error(
            f"Some segments were downloaded twice. Total dl: {actual_dl}, distinct dl: {expected_dl}"
        )
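

# Minimal driver sketch, not part of the original file: one way dl_file could be
# invoked over a list of metadata shards. The metadata URL, the shard names and the
# thread-pool fan-out are illustrative assumptions, not the repo's actual entry point.
if __name__ == "__main__":
    from multiprocessing.dummy import Pool  # thread pool; dl_file is I/O bound

    logging.basicConfig(level=logging.INFO)
    metadata_dir = "https://example.com/ccmatrix-metadata"  # placeholder URL
    outdir = Path("output/ccmatrix")
    outdir.mkdir(parents=True, exist_ok=True)
    files = ["bitext_ptr.de-en.tsv.gz", "bitext_ptr.fr-en.tsv.gz"]  # hypothetical names

    with Pool(4) as pool:
        # Each worker downloads one metadata shard and writes outdir / shard_name.
        pool.starmap(dl_file, [(metadata_dir, outdir, f) for f in files])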