def _validate_test()

in cc_net/mine.py [0:0]


def _validate_test(conf: Config, output_dir: Path, generate: bool = False):
    """Check the shards in `output_dir` against the reference stats file.

    For every `*.json.gz` file matched in `output_dir`, the lines read through
    `jsonql.open_read` are sorted and joined with newlines. Each file is then
    summarised by the length of that text (in characters) and the SHA-1 of its
    UTF-8 encoding. The summary is compared with
    `FILE_DIR / "data" / "test_stats.json"`; on mismatch the computed stats
    are written next to it as `test_stats.actual.json` and a per-file diff is
    printed.

    Args:
        conf: not used by this function.
        output_dir: directory holding the `*.json.gz` files to summarise.
        generate: when True, overwrite the reference stats file with the
            computed stats and return without comparing.

    Raises:
        AssertionError: if a file listed in the reference stats has no
            computed stats, i.e. it was not found in `output_dir`.
    """
    stats: Dict[str, dict] = {}
    for file in sorted(output_dir.glob("*.json.gz")):
        fname = "/".join((file.parent.name, file.name))
        # The order of documents is not guaranteed inside a shard, so sort
        # the lines to make the size and checksum reproducible.
        lines = sorted(jsonql.open_read(file))
        content = "\n".join(lines)
        size = len(content)
        checksum = hashlib.sha1(bytes(content, encoding="utf-8")).hexdigest()
        stats[fname] = {"size": size, "checksum": checksum}

    def dump(x):
        return json.dumps(x, indent=2, ensure_ascii=False)

    print("*** Stats ***")
    stats_raw = dump(stats)
    stats_file = FILE_DIR / "data" / "test_stats.json"
    if generate:
        print("Saving stats to", stats_file)
        stats_file.write_text(stats_raw)
        return

    # A missing reference file is treated as "no file expected".
    expected_stats: Dict[str, dict] = {}
    if stats_file.exists():
        expected_stats = json.loads(stats_file.read_text())

    if expected_stats == stats:
        print("Everything looks good !")
        return

    # Keep the computed stats around so they can be inspected or promoted.
    stats_file.with_suffix(".actual.json").write_text(stats_raw)
    print("*** Expected Stats ***")
    print(dump(expected_stats))

    print("*** Diff ***")
    for fname in sorted(expected_stats.keys()):
        print(fname)
        # Look the file up in the *computed* stats: checking the dict being
        # iterated can never fail and hides missing shards behind a KeyError.
        assert fname in stats, "missing file " + fname
        expected = expected_stats[fname]
        actual = stats[fname]
        if expected["size"] != actual["size"]:
            print(
                "  - Expected size",
                expected["size"],
                ", size",
                actual["size"],
            )
        if expected["checksum"] != actual["checksum"]:
            print(
                "  - Expected checksum",
                expected["checksum"],
                ", checksum",
                actual["checksum"],
            )

    # Files that were produced but are absent from the reference also make
    # the comparison fail, so report them explicitly.
    for fname in sorted(stats.keys() - expected_stats.keys()):
        print(fname)
        print("  - Unexpected file, not in expected stats")