in tasks/CCMatrix/dl_cc_matrix.py [0:0]
def sort_files(outdir: Path, lang_pair_dir: Path, lang: str) -> Path:
    """Decompress the shards of one language and merge them into a sorted file.

    Every ``*.gz`` file in ``lang_pair_dir`` whose name, split on ``"_"``,
    starts with ``lang`` is decompressed with ``gunzip`` and the results are
    fed to a single ``sort -nk1`` invocation (numeric sort on the key that
    starts at the first field).

    Args:
        outdir: Root output directory; the result is written to
            ``outdir / lang_pair_dir.name / f"{lang}.txt"``.
        lang_pair_dir: Directory holding the compressed shards.
        lang: Language prefix used to select the shards.

    Returns:
        Path of the sorted file. If it already exists, it is returned
        immediately and nothing is recomputed.

    Raises:
        AssertionError: If no matching ``.gz`` file is found.
        subprocess.CalledProcessError: If ``gunzip`` or ``sort`` exits with a
            non-zero status.

    Note:
        The decompressed copies created next to the ``.gz`` inputs are
        intermediate files: they are removed before returning, whether the
        sort succeeded or not. The ``.gz`` inputs themselves are kept.
    """
    out = outdir / lang_pair_dir.name / f"{lang}.txt"
    if out.exists():
        return out

    # Sorted so that the log line and the command line are reproducible;
    # Path.iterdir() yields entries in arbitrary order.
    files: List[Path] = sorted(
        f
        for f in lang_pair_dir.iterdir()
        if f.suffix == ".gz" and f.name.split("_")[0] == lang
    )
    print(f"Found {len(files)} files for lang '{lang}' in {lang_pair_dir}: {files}")
    if not files:
        # Raised explicitly (instead of `assert`) so the check survives
        # `python -O`; without it `sort` would get no file operands and would
        # read from stdin. AssertionError is kept for backward compatibility.
        raise AssertionError(
            f"No .gz files found for lang '{lang}' in {lang_pair_dir}"
        )

    out.parent.mkdir(exist_ok=True, parents=True)
    # `_tmp` is defined elsewhere in this file; presumably it returns a
    # scratch path next to `out` -- verify against its definition.
    tmp_out = _tmp(out)

    unzipped_files: List[str] = []
    try:
        for f in files:
            # `f` ends in ".gz" (checked above), so dropping the suffix gives
            # the path gunzip writes to. Registered before running gunzip so
            # that a partially written file is cleaned up as well.
            unzipped_files.append(str(f.with_suffix("")))
            # -k keeps the compressed input; -f overwrites a stale
            # decompressed file left behind by an interrupted earlier run,
            # which would otherwise make gunzip refuse to proceed.
            subprocess.check_call(["gunzip", "-k", "-f", str(f)])

        sort_cmd = [
            "sort",
            "-nk1",
            f"--parallel={SORT_PARALLEL}",
            f"--buffer-size={BUFFER_SIZE}",
            "--output",
            str(tmp_out),
        ] + unzipped_files
        subprocess.check_call(sort_cmd)
    finally:
        # The decompressed copies are only inputs to `sort`; leaving them
        # behind would leak disk space on every call.
        for unzipped in unzipped_files:
            try:
                Path(unzipped).unlink()
            except FileNotFoundError:
                pass

    # Publish the result only once it is complete, so that the `out.exists()`
    # shortcut above never sees a partially written file.
    tmp_out.rename(out)
    return out