in tasks/CCMatrix/dl_cc_matrix.py [0:0]
def validate(src_file: Path, trg_file: Path) -> None:
    """Check that two bitext files contain exactly the same line numbers.

    Each line of both files is parsed as a ``SimpleBitext`` and the two
    files are walked in lockstep, merge-style, comparing ``line_no``: the
    side holding the smaller ``line_no`` is advanced, and when both sides
    carry the same ``line_no`` they are advanced together and counted as a
    pair.

    NOTE(review): the walk presumably relies on both files being sorted by
    increasing ``line_no`` -- confirm against the code that writes them.

    The outcome is only reported through ``logging``: an info record when
    every line of both files belongs to a pair, an error record with the
    three counters otherwise.

    Args:
        src_file: path of the source-side file.
        trg_file: path of the target-side file.
    """
    lines_src, lines_trg, found_pairs = 0, 0, 0
    parser = get_typed_parser(SimpleBitext)
    with open(src_file) as src_f, open(trg_file) as trg_f:
        src_l = src_f.readline()
        trg_l = trg_f.readline()
        while src_l and trg_l:
            src: SimpleBitext = parser(src_l)
            trg: SimpleBitext = parser(trg_l)
            # Advance whichever side is behind; on a tie both conditions
            # hold, so both sides move forward together.
            if src.line_no <= trg.line_no:
                lines_src += 1
                src_l = src_f.readline()
            if trg.line_no <= src.line_no:
                lines_trg += 1
                trg_l = trg_f.readline()
            if trg.line_no == src.line_no:
                found_pairs += 1

        # The loop above stops as soon as either file is exhausted. Lines
        # still left in the other file have no possible partner, but they
        # must be counted: otherwise a file with extra trailing lines would
        # be reported as valid and the totals in the error message would be
        # too small.
        while src_l:
            lines_src += 1
            src_l = src_f.readline()
        while trg_l:
            lines_trg += 1
            trg_l = trg_f.readline()

    if found_pairs == lines_src and found_pairs == lines_trg:
        logging.info(
            f"Validated {src_file} and {trg_file}. Found {found_pairs} bitexts."
        )
    else:
        logging.error(
            f"Validated {src_file} and {trg_file}. "
            f"Found {found_pairs} bitexts, from {lines_src} in {src_file} and {lines_trg} in {trg_file}"
        )