in aoai/token_count_utils.py [0:0]
def validate_jsonl(jsonl_files):
    """Validate every record of each JSONL file and log a per-file summary.

    Each file is read line by line and every line is parsed as JSON.  Lines
    that parse are handed to ``validate_json``, which is expected to return
    an ``(is_valid, error_key)`` pair.  Problems are logged as warnings as
    they are found, and a count per error key is logged at the end of each
    file.  Nothing is returned; the result is only observable via the log.

    Args:
        jsonl_files: Iterable of paths to UTF-8 encoded JSONL files.

    Raises:
        OSError: If a file cannot be opened (not caught here).
    """
    for jsonl_path in jsonl_files:
        # Format error checks
        format_errors = defaultdict(int)
        # (file line number, parsed record) pairs.  The real line number is
        # kept so later warnings still point at the right line even when
        # earlier lines failed to parse and were left out of this list.
        dataset = []
        logger.info('*' * 50)
        logger.info(f"### [JSONL_VALIDATION] Processing file: {jsonl_path}")

        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                try:
                    parsed_data = json.loads(line)
                except json.JSONDecodeError as e:
                    logger.warning(f"Line {line_no}: Invalid JSON format - {e}")
                    # Count parse failures so the summary below cannot claim
                    # that a file containing broken lines is fully valid.
                    format_errors["invalid_json"] += 1
                except Exception as e:
                    # Best effort: keep going with the remaining lines.
                    logger.warning(f"Line {line_no}: Unexpected error - {e}")
                    format_errors["unexpected_error"] += 1
                else:
                    dataset.append((line_no, parsed_data))

        for line_no, data in dataset:
            is_valid, error_key = validate_json(data)
            if not is_valid:
                logger.warning(f"Validation failed for line {line_no}")
                format_errors[error_key] += 1

        if format_errors:
            for k, v in format_errors.items():
                logger.info(f"{k}: {v}")
        else:
            logger.info(f"{jsonl_path}: All examples are valid")
        logger.info('*' * 50)