def sort_files()

in tasks/CCMatrix/dl_cc_matrix.py [0:0]


def sort_files(outdir: Path, lang_pair_dir: Path, lang: str) -> Path:
    """Decompress and merge-sort all ``.gz`` shards of ``lang`` into one file.

    Shards are the entries of ``lang_pair_dir`` whose suffix is ``.gz`` and
    whose file name, up to the first underscore, equals ``lang``. They are
    decompressed next to the archives, sorted together with the external
    ``sort`` tool (numeric sort keyed from the first field), and the result is
    written to ``outdir / lang_pair_dir.name / f"{lang}.txt"``.

    Args:
        outdir: Root directory for sorted outputs.
        lang_pair_dir: Directory holding the compressed shards; its name is
            reused as the output sub-directory name.
        lang: Language prefix selecting which shards to process.

    Returns:
        Path of the sorted output file. If it already exists it is returned
        immediately and no work is done.

    Raises:
        AssertionError: If no matching ``.gz`` shard is found.
        subprocess.CalledProcessError: If ``gunzip`` or ``sort`` fails.
    """
    out = outdir / lang_pair_dir.name / f"{lang}.txt"
    if out.exists():
        return out

    # Sorted so that the log line and the sort invocation are deterministic
    # (iterdir() yields entries in arbitrary order).
    files: List[Path] = sorted(
        f
        for f in lang_pair_dir.iterdir()
        if f.suffix == ".gz" and f.name.split("_")[0] == lang
    )

    print(f"Found {len(files)} files for lang '{lang}' in {lang_pair_dir}: {files}")
    if not files:
        # Raised explicitly (rather than via `assert`) so the check survives
        # `python -O`; without it `sort` would be run with no inputs and
        # block reading stdin.
        raise AssertionError(
            f"No .gz files found for lang '{lang}' in {lang_pair_dir}"
        )

    out.parent.mkdir(exist_ok=True, parents=True)
    # NOTE(review): `_tmp` is defined elsewhere; presumably it derives a
    # temporary sibling path so `out` only appears once sorting has finished.
    tmp_out = _tmp(out)

    unzipped_files: List[Path] = []
    try:
        for f in files:
            # Recorded before decompressing so a partially written file is
            # still cleaned up if gunzip fails or is interrupted.
            unzipped_files.append(f.with_suffix(""))
            # -k keeps the archive; -f overwrites a stale decompressed file
            # left by an earlier interrupted run instead of failing on it.
            subprocess.check_call(["gunzip", "-k", "-f", str(f)])

        sort_cmd = [
            "sort",
            "-nk1",
            f"--parallel={SORT_PARALLEL}",
            f"--buffer-size={BUFFER_SIZE}",
            "--output",
            str(tmp_out),
        ] + [str(p) for p in unzipped_files]
        subprocess.check_call(sort_cmd)
    finally:
        # The decompressed copies are only intermediates (the .gz archives
        # are kept), so remove them whether or not sorting succeeded.
        for unzipped in unzipped_files:
            try:
                unzipped.unlink()
            except FileNotFoundError:
                pass

    tmp_out.rename(out)
    return out