in src/datatrove/utils/binaryio.py [0:0]
def seek_to_start(f: AbstractBufferedFile, start_hash: int, line_format: str, hash_format: str):
    """Position ``f`` at the first record whose leading hash is >= ``start_hash``.

    The file is treated as a sequence of fixed-size records of
    ``struct.calcsize(line_format)`` bytes. The first
    ``struct.calcsize(hash_format)`` bytes of a record are unpacked with
    ``hash_format`` and the first unpacked value is compared to ``start_hash``.
    The binary search is only meaningful when records are sorted by that value
    in ascending order.

    Args:
        f: Seekable binary file that exposes its total byte length as ``f.size``.
        start_hash: Smallest hash the caller wants to read. ``0`` means there is
            nothing to skip, and the file position is left untouched.
        line_format: ``struct`` format string describing one full record.
        hash_format: ``struct`` format string describing the hash stored at the
            start of every record.

    Raises:
        ValueError: If the record just before the located one does not compare
            below ``start_hash``.
    """
    if start_hash == 0:
        return
    line_size = struct.calcsize(line_format)
    # Loop-invariant: computed once instead of on every probe.
    hash_size = struct.calcsize(hash_format)
    nr_lines = f.size // line_size

    # A file without a single complete record holds nothing at or above
    # start_hash. Treat it like the "strictly smaller" case below instead of
    # probing record 0, which does not exist.
    if nr_lines == 0:
        f.seek(0, os.SEEK_END)
        return

    @cache
    def read_line_start(line):
        # Internal invariant: callers below only pass in-range record indices.
        assert 0 <= line < nr_lines
        f.seek(line * line_size, os.SEEK_SET)
        return struct.unpack(hash_format, f.read(hash_size))[0]

    # save some time before the binary search by probing both ends
    # this file is strictly bigger: every record is wanted
    if read_line_start(0) >= start_hash:
        f.seek(0, os.SEEK_SET)
        return

    # this file is strictly smaller, ignore it completely
    if read_line_start(nr_lines - 1) < start_hash:
        f.seek(0, os.SEEK_END)
        return

    # Binary search for the first record with hash >= start_hash.
    # The two probes above established record 0 < start_hash and
    # record nr_lines - 1 >= start_hash, so the answer lies in
    # [1, nr_lines - 1]; searching only that range guarantees start_line >= 1.
    start_line, hi = 1, nr_lines - 1
    # Note, the comparison uses "<" to match the
    # __lt__() logic in list.sort() and in heapq.
    while start_line < hi:
        mid = (start_line + hi) // 2
        if read_line_start(mid) < start_hash:
            start_line = mid + 1
        else:
            hi = mid

    # verification check: the record right before the start must be smaller
    if (prev_hash := read_line_start(start_line - 1)) >= start_hash:
        raise ValueError(f"Wrong bsearch start line: {prev_hash=} >= {start_hash=}")
    f.seek(start_line * line_size, os.SEEK_SET)