in evaluation_pipeline/evaluation.py
import wandb  # assumed available at module scope; the helper functions below are defined elsewhere in the module


def eval_pipeline(run_llm_judge: bool, file_path, log_to_wandb, specific_k=None):
    # Load the retrieval results along with the k and model name recorded at retrieval time.
    retrieval_df, k, model_name = load_retrieved(file_path=file_path)
    # Give the caller the option to override the k loaded from the retrieval file.
    if specific_k is not None:
        k = specific_k
    if log_to_wandb:
        wandb_logging(retrieval_df, k)
    # Traditional (non-LLM) retrieval metrics are always computed.
    results_df = run_vectorized_traditional_eval(retrieval_df, k)
    dfs_to_return = [results_df]
    df_labels = ["traditional_eval"]
    # Optionally score the same retrievals with an LLM judge.
    if run_llm_judge:
        judge = llm_as_judge()
        llm_results_df = run_vectorized_llm_eval(retrieval_df, k, judge)
        dfs_to_return.append(llm_results_df)
        df_labels.append("llm_eval")
    # Summarise each result set, optionally log it to wandb, and persist it to CSV.
    for df, label in zip(dfs_to_return, df_labels):
        df['model_name'] = model_name
        summary_df = df.describe().reset_index()
        # query_id is an identifier, not a metric, so drop it from the summary if present.
        summary_df = summary_df.drop(columns=['query_id'], errors='ignore')
        averages = summary_df[summary_df['index'] == 'mean']
        if log_to_wandb:
            summary_table = wandb.Table(dataframe=summary_df)
            wandb.log({label: summary_table})
            wandb.log({"Averages": averages})
        summary_df.to_csv(f"evaluation_results/{model_name}_{label}_aggregate_metrics.csv")
        df.to_csv(f"evaluation_results/{model_name}_{label}_detailed_metrics.csv")
    # Finish the run only after every label has been logged; calling wandb.finish()
    # inside the loop would close the run before the LLM-judge results are sent.
    if log_to_wandb:
        wandb.finish()
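

# A minimal usage sketch, not part of the pipeline itself: the file path below is hypothetical,
# and when logging is enabled the wandb run is assumed to be handled by wandb_logging.
if __name__ == "__main__":
    eval_pipeline(
        run_llm_judge=False,                # traditional metrics only; set True to add the LLM judge
        file_path="retrieved_results.csv",  # hypothetical retrieval dump produced by the retrieval step
        log_to_wandb=False,                 # keep results local; CSVs land in evaluation_results/
        specific_k=10,                      # example override of the k stored in the file
    )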