def validate_jsonl()

in aoai/token_count_utils.py [0:0]


def validate_jsonl(jsonl_files):
    """Validate every record of each JSONL file and log a per-file summary.

    Each file is read line by line. Lines that cannot be parsed as JSON are
    logged and counted; successfully parsed records are passed to
    ``validate_json``, whose ``(is_valid, error_key)`` result is tallied per
    error key. Nothing is returned: all findings are reported via ``logger``.

    Args:
        jsonl_files: Iterable of paths to UTF-8 encoded JSONL files.
    """
    for jsonl_path in jsonl_files:
        # Number of failures per error key for the current file.
        format_errors = defaultdict(int)
        logger.info('*' * 50)
        logger.info(f"### [JSONL_VALIDATION] Processing file: {jsonl_path}")

        with open(jsonl_path, 'r', encoding='utf-8') as f:
            # Validate while reading so that every warning carries the real
            # line number of the file, even after unparsable lines were
            # skipped earlier in the same file.
            for line_no, line in enumerate(f, start=1):
                try:
                    parsed_data = json.loads(line)
                except json.JSONDecodeError as e:
                    logger.warning(f"Line {line_no}: Invalid JSON format - {e}")
                    # Count parse failures too; otherwise a file made only of
                    # malformed lines would be reported as fully valid.
                    format_errors["invalid_json"] += 1
                    continue
                except Exception as e:
                    # Best-effort: keep going on anything unexpected, but make
                    # sure it shows up in the summary.
                    logger.warning(f"Line {line_no}: Unexpected error - {e}")
                    format_errors["unexpected_error"] += 1
                    continue

                is_valid, error_key = validate_json(parsed_data)
                if not is_valid:
                    logger.warning(f"Validation failed for line {line_no}")
                    format_errors[error_key] += 1

        if format_errors:
            for k, v in format_errors.items():
                logger.info(f"{k}: {v}")
        else:
            logger.info(f"{jsonl_path}: All examples are valid")
        logger.info('*' * 50)