in cc_net/mine.py [0:0]
def _validate_test(conf: Config, output_dir: Path, generate: bool = False):
    """Check the shards found in `output_dir` against the reference stats file.

    For every ``*.json.gz`` file directly inside `output_dir`, the lines
    returned by ``jsonql.open_read`` are sorted and joined with newlines. The
    length of that text (in characters) and the SHA-1 of its UTF-8 encoding
    are recorded under the key ``"<parent dir name>/<file name>"``.

    The result is compared with ``FILE_DIR / "data" / "test_stats.json"``.
    On a mismatch the computed stats are written next to the reference file
    with the ``.actual.json`` suffix and a per-file diff is printed. Size or
    checksum differences are only printed, they do not raise.

    Args:
        conf: Not read by this function.
        output_dir: Directory containing the ``*.json.gz`` shards to check.
        generate: When True, overwrite the reference stats file with the
            computed stats and return without comparing anything.

    Raises:
        AssertionError: If a file listed in the reference stats has no
            counterpart in `output_dir`.
    """
    stats: Dict[str, dict] = {}
    for file in sorted(output_dir.glob("*.json.gz")):
        fname = "/".join((file.parent.name, file.name))
        # The order of documents is not guaranteed inside a shard, so the
        # lines are sorted to make the size and checksum reproducible.
        lines = sorted(jsonql.open_read(file))
        content = "\n".join(lines)
        size = len(content)
        checksum = hashlib.sha1(bytes(content, encoding="utf-8")).hexdigest()
        stats[fname] = {"size": size, "checksum": checksum}

    def dump(x):
        return json.dumps(x, indent=2, ensure_ascii=False)

    print("*** Stats ***")
    stats_raw = dump(stats)
    stats_file = FILE_DIR / "data" / "test_stats.json"
    if generate:
        print("Saving stats to", stats_file)
        # The JSON is dumped with ensure_ascii=False, so pin the encoding
        # instead of relying on the platform default.
        stats_file.write_text(stats_raw, encoding="utf-8")
        return

    # A missing reference file is treated as "nothing expected".
    expected_stats: Dict[str, dict] = {}
    if stats_file.exists():
        expected_stats = json.loads(stats_file.read_text(encoding="utf-8"))

    if expected_stats == stats:
        print("Everything looks good !")
        return

    # Keep the computed stats around so they can be inspected or promoted
    # to become the new reference.
    stats_file.with_suffix(".actual.json").write_text(stats_raw, encoding="utf-8")
    print("*** Expected Stats ***")
    print(dump(expected_stats))

    print("*** Diff ***")
    for fname in sorted(expected_stats):
        print(fname)
        # Must be checked against the computed stats: checking against
        # `expected_stats` is always true and lets a missing shard surface
        # as a bare KeyError below. Raised explicitly so that running with
        # `python -O` does not remove the check.
        if fname not in stats:
            raise AssertionError("missing file " + fname)
        expected, actual = expected_stats[fname], stats[fname]
        for field in ("size", "checksum"):
            if expected[field] != actual[field]:
                print(
                    f" - Expected {field}",
                    expected[field],
                    f", {field}",
                    actual[field],
                )

    # Files that were produced but are absent from the reference also make
    # the comparison fail, so list them too.
    for fname in sorted(stats.keys() - expected_stats.keys()):
        print(fname)
        print(" - Unexpected file, not in expected stats")