def check_data_overlap()

in 5-4o_fine_tuning/data_validator.py [0:0]


    def check_data_overlap(self):
        """Count records shared between the training and validation files.

        Each non-blank line of every file is parsed as JSON and re-serialised
        with sorted keys, so two records that differ only in key order are
        treated as the same record. Records are collected into a set per
        file, so duplicates inside one file are counted once.

        Returns:
            A dict mapping ``(file1, file2)`` path tuples to the number of
            distinct records present in both files, or ``None`` when
            ``self.validation_file`` is falsy and the check is skipped.

        Raises:
            ValueError: If a file path does not end with ``.jsonl``.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        if not self.validation_file:
            logger.info(
                "No validation file provided. Skipping data overlap check.")
            return None

        files_to_check = [self.train_file, self.validation_file]
        for file in files_to_check:
            if not file.endswith('.jsonl'):
                raise ValueError(f"Invalid JSONL file: {file}")

        data_sets = {}
        for file in files_to_check:
            # Use a context manager so the handle is closed deterministically,
            # and read as UTF-8 rather than the platform default encoding.
            with open(file, encoding="utf-8") as handle:
                data_sets[file] = {
                    json.dumps(json.loads(line), sort_keys=True)
                    for line in handle
                    # Blank lines (e.g. a trailing newline) carry no record.
                    if line.strip()
                }

        # Compare every unordered pair of files exactly once.
        overlaps = {
            (file1, file2): len(data_sets[file1] & data_sets[file2])
            for i, file1 in enumerate(files_to_check)
            for file2 in files_to_check[i + 1:]
        }

        for (file1, file2), count in overlaps.items():
            logger.info(
                f"Overlap between {file1} and {file2}: {count} records")

        return overlaps