8-evals/04_summary_grader_eval.py:

# %%
from openai import OpenAI

import pandas as pd
import yaml

client = OpenAI()

# Load the reviewed grader outputs (n=12) that serve as the eval dataset.
df = pd.read_csv("./data/reviewed_summary_grader_outputs_n=12.csv")
df.head()

# %%
# Build one promptfoo test case per reviewed row. Each test asserts that the
# reviewed fraction of key points appears on the final line of the grader's output.
tests = []
for idx, row in df.iterrows():
    tests.append(
        {
            "description": "Test if summary is correct",
            "vars": {
                "input": row["input"],
            },
            "assert": [
                {
                    "type": "python",
                    "metric": "frac_key_points_in_summary",
                    "value": f"'{row['ratio_points_in_summary_vs_key_points_total']}' in output.split('\\n')[-1]",
                },
            ],
        }
    )

# Add prompts
prompts = [
    """Evaluate whether the items in summary_key_points are covered in the summary, both provided in <context> tags. Use a full point (1) for a strong match and a half point (0.5) for a partial match. If there are no points in summary_key_points, then assign a score of 0.0.

<context>
{{input}}
</context>

Think step by step and output the final ratio on a new line at the end of the output, formatted like "Fraction: 0.25", with no additional markdown or ``` characters.
""",
]

# Grader models to compare.
providers = [
    {
        "id": "openai:gpt-3.5-turbo",
    },
    {
        "id": "openai:gpt-4o-mini",
    },
    {
        "id": "openai:gpt-4o-2024-08-06",
    },
    {
        "id": "openai:o1-mini",
    },
    {
        "id": "openai:o1-preview",
    },
]

# %%
# Assemble the promptfoo config and write it to disk as YAML.
promptfoo_config = {
    "tests": tests,
    "prompts": prompts,
    "providers": providers,
}

output_yaml_path = "./data/pfoo_eval_summary_grader.yaml"
with open(output_yaml_path, "w") as file:
    yaml.dump(promptfoo_config, file)

print(f"promptfoo_config saved to {output_yaml_path}")
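
# %%
# A minimal sketch of running the generated config from this script, assuming
# the promptfoo CLI is available via npx and OPENAI_API_KEY is set in the
# environment. The equivalent shell command is
# `npx promptfoo@latest eval -c ./data/pfoo_eval_summary_grader.yaml`,
# and `npx promptfoo@latest view` opens the results afterwards.
import subprocess

subprocess.run(
    ["npx", "promptfoo@latest", "eval", "-c", output_yaml_path],
    check=True,  # raise if the eval run exits with a non-zero status
)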