def seek_to_start()

in src/datatrove/utils/binaryio.py [0:0]


def seek_to_start(f: AbstractBufferedFile, start_hash: int, line_format: str, hash_format: str):
    """Move ``f`` to the first fixed-size record whose leading hash is >= ``start_hash``.

    The file is treated as a sequence of records of ``struct.calcsize(line_format)``
    bytes each, where every record begins with a value unpackable with
    ``hash_format``. The position is found with a binary search, so the records
    are assumed to be sorted by that leading hash in ascending order.

    Args:
        f: seekable binary file exposing its total byte length as ``f.size``.
        start_hash: smallest hash value the caller wants to read. ``0`` means
            "everything": the function returns immediately without touching ``f``.
        line_format: ``struct`` format string of one full record.
        hash_format: ``struct`` format string of the hash at the start of a record.

    Returns:
        None. On return ``f`` is positioned at the start of the first record
        with hash >= ``start_hash``, or at the end of the file if there is no
        such record (including when the file holds no complete record).

    Raises:
        ValueError: if the record right before the located one does not have a
            hash smaller than ``start_hash`` (e.g. the file is not sorted).
    """
    if start_hash == 0:
        return
    line_size = struct.calcsize(line_format)
    hash_size = struct.calcsize(hash_format)
    nr_lines = f.size // line_size

    # no complete record to probe: there is nothing to read from this file
    if nr_lines == 0:
        f.seek(0, os.SEEK_END)
        return

    @cache
    def read_line_start(line):
        # reads only the leading hash of record `line`; moves the file position
        assert 0 <= line < nr_lines
        f.seek(line * line_size, os.SEEK_SET)
        return struct.unpack(hash_format, f.read(hash_size))[0]

    # save some time before the binary search by probing both ends
    # every hash in this file is >= start_hash: read it from the beginning
    if read_line_start(0) >= start_hash:
        f.seek(0, os.SEEK_SET)
        return

    # every hash in this file is < start_hash: ignore it completely
    if read_line_start(nr_lines - 1) < start_hash:
        f.seek(0, os.SEEK_END)
        return

    # binary search (bisect_left) to find the start line
    start_line, hi = 0, nr_lines
    # Note, the comparison uses "<" to match the
    # __lt__() logic in list.sort() and in heapq.
    while start_line < hi:
        mid = (start_line + hi) // 2
        if read_line_start(mid) < start_hash:
            start_line = mid + 1
        else:
            hi = mid

    # The two (cached) boundary probes above guarantee that line 0 sends the
    # search right and the last line sends it left, so here
    # 1 <= start_line <= nr_lines - 1 always holds.

    # verification check: the record just before the start line must be smaller
    if (prev_hash := read_line_start(start_line - 1)) >= start_hash:
        raise ValueError(f"Wrong bsearch start line: {prev_hash=} >= {start_hash=}")
    f.seek(start_line * line_size, os.SEEK_SET)