train_rick/generate_rick_grpo/extract_and_push.py (20 lines of code) (raw):
import re
import json
from datasets import load_dataset
# Regex for one line holding a JSON object of the exact form
# {"question": "...", "solution": "..."} (surrounding whitespace allowed).
# NOTE(review): nothing in the visible script references this pattern; lines
# are validated with json.loads instead. Kept in case other code relies on it.
pattern = re.compile(r'^\s*{\s*"question"\s*:\s*".+?",\s*"solution"\s*:\s*".+?"\s*}\s*$')
# Raw text file that is scanned for JSON lines, and the JSONL file produced
# from it.
input_file = "data.txt"
output_file = "dataset.jsonl"
# Copy every parseable JSON-object line from input_file to output_file,
# keeping only the "question" and "solution" fields of each record.
with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line_no, line in enumerate(infile, start=1):
        # Only lines beginning with '{' are candidate records; every other
        # line of the input is skipped without a message.
        if not line.startswith("{"):
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            # Report the 1-based line number and move on to the next line.
            print(f"Invalid JSON at line {line_no}: {line.strip()}")
            continue
        # Keep only the two dataset columns. The field is "solution"
        # (singular), the same key the module-level `pattern` regex expects.
        row = {key: value for key, value in row.items() if key in ("question", "solution")}
        # ensure_ascii=False keeps non-ASCII characters readable in the output.
        outfile.write(json.dumps(row, ensure_ascii=False) + "\n")
# Load the filtered JSONL file; the "json" builder exposes it as a "train" split.
raw_splits = load_dataset("json", data_files=output_file)
# Hold out 204 examples as the test split.
# NOTE(review): no seed is passed, so split membership presumably differs
# between runs -- confirm whether reproducibility matters here.
dataset = raw_splits["train"].train_test_split(test_size=204)
# Upload the train/test splits to the Hugging Face Hub repository.
dataset.push_to_hub("qgallouedec/rick-physics-grpo")