def eval_pipeline()

in evaluation_pipeline/evaluation.py [0:0]


import wandb  # used for wandb.Table / wandb.log / wandb.finish below; assumed available at module level


def eval_pipeline(run_llm_judge: bool, file_path: str, log_to_wandb: bool, specific_k: int | None = None):
    """Load retrieval results, compute traditional (and optionally LLM-judge) metrics,
    and write aggregate/detailed CSVs, optionally logging summaries to wandb."""
    retrieval_df, k, model_name = load_retrieved(file_path=file_path)
    # Allow the caller to override the k loaded from the retrieval file
    if specific_k is not None:
        k = specific_k

    if log_to_wandb:
        wandb_logging(retrieval_df, k)
    results_df = run_vectorized_traditional_eval(retrieval_df, k)
    dfs_to_return = [results_df]
    df_labels = ["traditional_eval"]
    if run_llm_judge:
        judge = llm_as_judge()
        llm_results_df = run_vectorized_llm_eval(retrieval_df, k, judge)
        dfs_to_return.append(llm_results_df)
        df_labels.append("llm_eval")

    for df, label in zip(dfs_to_return, df_labels):
        df['model_name'] = model_name
        summary_df = df.describe().reset_index()
        # Drop query_id from the summary; aggregate statistics over IDs are meaningless
        if 'query_id' in summary_df.columns:
            summary_df = summary_df.drop(columns=['query_id'])
        summary_table = wandb.Table(dataframe=summary_df)
        averages = summary_df[summary_df['index'] == 'mean']
        if log_to_wandb:
            wandb.log({label: summary_table})
            wandb.log({"Averages": averages})
        # Assumes the evaluation_results/ directory already exists
        summary_df.to_csv(f"evaluation_results/{model_name}_{label}_aggregate_metrics.csv")
        df.to_csv(f"evaluation_results/{model_name}_{label}_detailed_metrics.csv")

    if log_to_wandb:
        # Finish the wandb run once, after every results table has been logged
        wandb.finish()