in 5-4o_fine_tuning/reasoning.py [0:0]
def _prepend_reasoning(line: str, response: str) -> str:
    """Return *line* re-serialized with *response* prepended to its first assistant message.

    Args:
        line: One raw JSONL record; must decode to an object with a
            ``"messages"`` list of ``{"role": ..., "content": ...}`` dicts.
        response: Text to prepend. Newlines are replaced by spaces so the
            text stays on a single ``# ...`` line above the original content.

    Returns:
        The updated record as a JSON string terminated by a newline.

    Raises:
        KeyError: If the record has no ``"messages"`` key.
        ValueError: If no message has the ``"assistant"`` role.
    """
    data = json.loads(line)
    if "messages" not in data:
        raise KeyError(f"Key 'messages' not found on line: {line}")
    for message in data["messages"]:
        if message["role"] == "assistant":
            # chr(10) is "\n"; it is spelled this way because a backslash is
            # not allowed inside an f-string expression before Python 3.12.
            message["content"] = (
                f'# {response.replace(chr(10), " ")}\n' + message["content"]
            )
            # Only the first assistant message is augmented.
            return json.dumps(data) + "\n"
    raise ValueError(f"No assistant message found in data: {data}")


def process_file(func: callable, file_path: str, samples: int = None, output_file_suffix: str = None, workers: int = 20) -> None:
    """
    Processes JSONL line by line using the provided function to augment the training data with CoT

    Each non-blank line is handed to ``process_training_data_sample(line, func)``
    on a thread pool. The returned text is prepended (as a single ``# ...``
    line) to the first assistant message of that record, and the updated
    records are written, in input order, to
    ``<file_path without extension>-<output_file_suffix>.jsonl``.

    Args:
        func: Passed through unchanged to ``process_training_data_sample``.
        file_path: Path of the input JSONL file.
        samples: If given, only the first ``samples`` non-blank lines are
            processed; otherwise the whole file is.
        output_file_suffix: Suffix inserted before ``.jsonl`` in the output
            name. NOTE: when left as ``None`` the literal text ``None`` ends
            up in the file name.
        workers: Maximum number of worker threads.

    Raises:
        KeyError: If a record has no ``"messages"`` key.
        ValueError: If a record has no assistant message.
    """
    import os

    lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            # Stop at the requested sample count, and naturally at EOF, so the
            # list is never padded with empty strings when the file is short.
            if samples is not None and len(lines) >= samples:
                break
            # Blank lines are not JSON records; skip them instead of crashing.
            if raw_line.strip():
                lines.append(raw_line)

    # splitext only strips an extension from the final path component, so dots
    # in directory names (e.g. "../data/train") do not truncate the path.
    new_file_path = f"{os.path.splitext(file_path)[0]}-{output_file_suffix}.jsonl"

    # Pre-sized so each result lands at its input position even though the
    # futures complete in arbitrary order.
    updated_lines = [None] * len(lines)
    with ThreadPoolExecutor(max_workers=workers) as executor:
        future_to_index = {
            executor.submit(process_training_data_sample, line, func): index
            for index, line in enumerate(lines)
        }
        for future in tqdm(as_completed(future_to_index), total=len(lines), desc="Processing lines"):
            index = future_to_index[future]
            updated_lines[index] = _prepend_reasoning(lines[index], future.result())

    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.writelines(updated_lines)
        print(f"File written: {new_file.name}")