def dl_file()

in tasks/CCMatrix/dl_cc_matrix.py [0:0]


def dl_file(metadata_dir: str, outdir: Path, file: str):
    """Rebuild one bitext file from its pointer metadata and write it to ``outdir``.

    The metadata found at ``metadata_dir/file`` is read line by line with
    ``open_remote_file``. Each line is parsed into a ``NormalizedBitextPtr``
    which names a segment, a document digest inside that segment and a
    ``[ptr_start, ptr_end)`` slice of the cleaned document. The sliced text is
    written as one tab-separated ``Bitext`` row to ``outdir / file``.

    Nothing is done when ``outdir / file`` already exists.

    Lines that cannot be parsed or fail validation are logged and skipped, as
    are lines whose document is absent from the loaded segment. Counts for
    written, missed and skipped lines are logged once the file is complete,
    together with an error if the same segment had to be loaded more than
    once.

    Args:
        metadata_dir: Location of the metadata files; joined to ``file`` with "/".
        outdir: Directory receiving the reconstructed file.
        file: Name of the metadata file, reused as the output file name.

    Returns:
        None.
    """
    outfile = outdir / file
    # Check before any setup so that an already-downloaded file costs nothing.
    if outfile.exists():
        return

    metadata = "/".join((metadata_dir, file))
    parser = get_typed_parser(NormalizedBitextPtr)
    found_bitext, missed_bitext, skipped_line = 0, 0, 0
    segment = ""
    # Number of times each segment was loaded; more than 1 means a re-download.
    segment_downloads: Dict[str, int] = defaultdict(int)
    raw_documents: Dict[str, str] = {}
    cleaned_documents: Dict[str, str] = {}

    # NOTE(review): `o` is not closed if an exception escapes the loop below;
    # confirm what FileWriterWithTmp does with its temporary file in that case.
    o = FileWriterWithTmp(outfile)
    for i, line in enumerate(open_remote_file(metadata)):
        try:
            bitext: NormalizedBitextPtr = parser(line)
        except AssertionError:
            # Presumably how the parser reports a malformed line; verify against
            # get_typed_parser.
            is_valid = False
        else:
            # A line can be invalid and still parse. Use explicit checks rather
            # than `assert` so the validation also runs under `python -O`.
            is_valid = bitext.segment.startswith(
                "crawl-data/"
            ) and bitext.digest.startswith("sha1:")

        if not is_valid:
            logging.error(f"Skipping line {i}: {line}")
            skipped_line += 1
            continue

        if not segment or bitext.segment != segment:
            segment = bitext.segment
            segment_downloads[segment] += 1
            # Load segment in RAM, purge document cache
            raw_documents = get_documents(segment)
            cleaned_documents = {}

        raw_doc = raw_documents.get(bitext.digest)
        if raw_doc is None:
            logging.error(f"Document not found: {bitext.digest} in {segment}")
            missed_bitext += 1
            continue

        # Clean each document at most once per loaded segment.
        clean_doc = cleaned_documents.get(bitext.digest)
        if clean_doc is None:
            clean_doc = clean_content(raw_doc)
            cleaned_documents[bitext.digest] = clean_doc

        text = clean_doc[bitext.ptr_start : bitext.ptr_end]
        # Fall back to 0.0 when the parsed pointer carries no `score` attribute.
        score = getattr(bitext, "score", 0.0)
        bt = Bitext(bitext.lang_pair, bitext.line_no, score, text)
        print(*bt, sep="\t", file=o)
        # Count every written row so the summary below reports a real total.
        found_bitext += 1

    # Presumably `True` marks the write as successful — confirm against
    # FileWriterWithTmp.close.
    o.close(True)
    logging.info(f"Found {found_bitext} sentences, missed {missed_bitext} sentences.")
    if skipped_line > 0:
        logging.error(f"Skipped {skipped_line} unparsable lines")
    expected_dl = len(segment_downloads)
    actual_dl = sum(segment_downloads.values())

    if actual_dl != expected_dl:
        logging.error(
            f"Some segments where downloaded twice. Total dl: {actual_dl}, distinct dl: {expected_dl}"
        )