3-4o_mini_fine_tuning/1_eval_baseline.py (32 lines of code) (raw):
# %%
import pandas as pd
import os
import yaml
# Base name shared by the input CSV (./data/<run_name>.csv), the generated
# promptfoo config (./evals/<run_name>.yaml) and the eval description.
run_name = "customer_service_chat_triage_n=100"
df = pd.read_csv("./data/{}.csv".format(run_name))

# Generate promptfoo eval file: one test case per CSV row. The CSV must
# provide the columns "prompt" and "correct_output"; a missing column raises
# KeyError here, before anything is written to disk.
n_rows = len(df)
output_dict = {
    "description": run_name,
    # NOTE(review): these paths are emitted verbatim; they look like they are
    # meant to be resolved relative to the ./evals directory - confirm.
    "prompts": ["../prompts/verbatim_input.txt", "../prompts/cot3.json"],
    "providers": [
        "openai:chat:gpt-3.5-turbo",
        "openai:chat:gpt-4o-mini",
        "openai:chat:gpt-4o",
    ],
    "tests": [
        {
            "vars": {"input": prompt, "target": target},
            "assert": [
                {"type": "python", "value": "file://../py/assert_last_line_answer.py"}
            ],
        }
        # Walk the two needed columns in lockstep instead of materialising a
        # Series per row with iterrows(); every row is included.
        for prompt, target in zip(df["prompt"], df["correct_output"])
    ],
}

# Build the destination once so the file that is written and the path that is
# reported can never drift apart.
output_dir = "./evals"
output_path = "{}/{}.yaml".format(output_dir, run_name)
os.makedirs(output_dir, exist_ok=True)
# Explicit encoding keeps the output independent of the platform locale.
with open(output_path, "w", encoding="utf-8") as file:
    yaml.dump(output_dict, file, default_flow_style=False)

# Report the real location of the file (including the ./evals/ directory).
print(
    "Generated {} promptfoo tests and saved to {}".format(n_rows, output_path)
)