in 5-4o_fine_tuning/data_validator.py [0:0]
def check_data_overlap(self):
    """Count records that appear in both the training and validation files.

    Each non-blank line of every file is parsed as JSON and re-serialised
    with sorted keys, so two records that differ only in key order compare
    equal. Duplicate records inside a single file are collapsed, which means
    the reported count is the number of *distinct* shared records.

    Returns:
        ``None`` when ``self.validation_file`` is falsy (the check is
        skipped); otherwise a dict mapping ``(file1, file2)`` path tuples to
        the number of distinct records the two files have in common.

    Raises:
        ValueError: if a path does not end in ``.jsonl``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        OSError: if a file cannot be opened or read.
    """
    if not self.validation_file:
        logger.info(
            "No validation file provided. Skipping data overlap check.")
        return None

    files_to_check = [self.train_file, self.validation_file]
    # Validate every path up front so nothing is read if any path is invalid.
    for file in files_to_check:
        if not file.endswith('.jsonl'):
            raise ValueError(f"Invalid JSONL file: {file}")

    data_sets = {}
    for file in files_to_check:
        records = set()
        # The context manager guarantees the handle is closed even when a
        # line fails to parse; JSONL is UTF-8, so do not rely on the
        # platform's default encoding.
        with open(file, encoding="utf-8") as handle:
            for raw_line in handle:
                line = raw_line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline at end of file)
                    # are not records and must not be fed to json.loads.
                    continue
                # Canonical form: sorted keys make key order irrelevant.
                records.add(json.dumps(json.loads(line), sort_keys=True))
        data_sets[file] = records

    # Compare every unordered pair of files exactly once.
    overlaps = {}
    for index, file1 in enumerate(files_to_check):
        for file2 in files_to_check[index + 1:]:
            overlaps[(file1, file2)] = len(
                data_sets[file1] & data_sets[file2])

    for (file1, file2), count in overlaps.items():
        logger.info(
            f"Overlap between {file1} and {file2}: {count} records")
    return overlaps