in cc_net/mine.py [0:0]
def hashes(conf: Config) -> List[Path]:
    """Compute the hash file of every shard of ``conf.dump``.

    One ``NNNN.bin`` file per shard is expected under
    ``conf.output_dir / "hashes" / conf.dump``. When all of them already
    exist nothing is run. Otherwise the executor returned by
    ``conf.get_executor`` is invoked with ``_hashes_shard`` on the missing
    (shard, output) pairs only.

    Returns:
        The expected output paths for all ``conf.num_shards`` shards, in
        shard order.

    Raises:
        AssertionError: if some output file is still not visible on disk
            after the executor call and the grace period that follows it.
    """
    hashes_dir = conf.output_dir / "hashes" / conf.dump
    outputs = [hashes_dir / f"{shard:04d}.bin" for shard in range(conf.num_shards)]
    missing_outputs = [(shard, o) for shard, o in enumerate(outputs) if not o.exists()]

    if not missing_outputs:
        # Everything was computed by a previous run: no directory creation,
        # no executor, no waiting.
        return outputs

    hashes_dir.mkdir(parents=True, exist_ok=True)
    # With FlatHashSet we need ~2Gb of RAM / shard, but we need to account for
    # overhead due to how the dynamic allocation works.
    ex = conf.get_executor(f"hashes_{conf.dump}", mem_gb=4, timeout_hour=6, cpus=2)
    ex(_hashes_shard, repeat(conf), *_transpose(missing_outputs))

    # Give the files some time to appear on disk. Instead of always sleeping
    # for the whole grace period, poll and stop as soon as every file is
    # visible; the maximum wait stays at 20 seconds.
    grace_period_s = 20
    poll_interval_s = 1
    deadline = time.monotonic() + grace_period_s
    pending = list(outputs)
    while True:
        pending = [o for o in pending if not o.exists()]
        if not pending or time.monotonic() >= deadline:
            break
        time.sleep(poll_interval_s)

    if pending:
        # Raised explicitly (rather than with an `assert` statement) so the
        # check also runs under `python -O`, while keeping the exception type
        # callers could already observe.
        listed = ", ".join(str(o) for o in pending)
        raise AssertionError(
            f"{len(pending)} hash file(s) missing after computing hashes "
            f"for {conf.dump}: {listed}"
        )
    return outputs