def process_file()

in 5-4o_fine_tuning/reasoning.py [0:0]


def process_file(func: callable, file_path: str, samples: int = None, output_file_suffix: str = None, workers: int = 20):
    """
    Augment the assistant replies of a JSONL training file with CoT text.

    Every non-blank line of ``file_path`` is handed, together with ``func``,
    to ``process_training_data_sample`` on a thread pool. The returned text is
    flattened to a single line and prepended, as a ``# ...`` line, to the
    content of the first message whose role is ``"assistant"``. The updated
    records are written to ``<file_path without extension>-<suffix>.jsonl``
    in the same order as the input lines.

    Args:
        func: Callable forwarded unchanged to ``process_training_data_sample``.
        file_path: Path of the JSONL file to read.
        samples: If given, only the first ``samples`` non-blank lines are
            processed; ``None`` processes the whole file.
        output_file_suffix: Text inserted after the dash in the output file
            name. It is interpolated verbatim, so ``None`` yields ``-None``.
        workers: Maximum number of worker threads.

    Raises:
        KeyError: A record has no ``"messages"`` key.
        ValueError: A record has no message with role ``"assistant"``.
        json.JSONDecodeError: A line is not valid JSON.
    """
    # Imported locally so this function does not rely on the module's import
    # block already providing these names.
    import os
    from itertools import islice

    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines are not records; skipping them avoids submitting work
        # that can only end in a JSON decode error.
        records = (raw for raw in file if raw.strip())
        if samples is not None:
            # islice stops at EOF, so asking for more samples than the file
            # holds no longer produces empty-string "lines".
            records = islice(records, max(samples, 0))
        lines = list(records)

    # splitext only strips an extension from the final path component, so a
    # dot inside a directory name is left untouched.
    new_file_path = f"{os.path.splitext(file_path)[0]}-{output_file_suffix}.jsonl"

    # Pre-sized so each result can be stored at its input position; futures
    # complete in arbitrary order but the output must follow the input order.
    updated_lines = [None] * len(lines)
    with ThreadPoolExecutor(max_workers=workers) as executor:
        future_to_index = {
            executor.submit(process_training_data_sample, line, func): index
            for index, line in enumerate(lines)
        }
        for future in tqdm(as_completed(future_to_index), total=len(lines), desc="Processing lines"):
            response = future.result()
            index = future_to_index[future]
            line = lines[index]
            data = json.loads(line)
            if "messages" not in data:
                raise KeyError(f"Key 'messages' not found on line: {line}")

            # Collapse newlines so the reasoning stays on one "# ..." line.
            reasoning = response.replace("\n", " ")
            for message in data["messages"]:
                if message["role"] == "assistant":
                    # Only the first assistant message is annotated.
                    message["content"] = f'# {reasoning}\n' + message["content"]
                    break
            else:
                raise ValueError(
                    f"No assistant message found in data: {data}")
            updated_lines[index] = json.dumps(data) + "\n"

    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.writelines(updated_lines)
        print(f"File written: {new_file.name}")