scripts/precompute.py

from typing import Optional

import fire
import pandas as pd
from scipy.stats import pearsonr, spearmanr

from alpaca_eval import analyze, annotators, constants
from alpaca_eval import main as alpaca_main
from alpaca_eval import metrics, utils
from alpaca_eval.types import AnyPath


def precompute_on_all_human_leaderboard(
    annotators_config="gpt4",
    Annotator=annotators.PairwiseAnnotator,
    all_data=constants.ALPACAFARM_GOLD_ANNOTATIONS,
    analyzer_kwargs=None,
    **annotator_kwargs,
):
    """Precompute all instructions on the eval leaderboard that have been annotated by humans."""
    analyzer_kwargs = analyzer_kwargs or {}
    analyzer = analyze.Analyzer(gold_annotations=all_data, **analyzer_kwargs)
    # running the annotator populates its cache; the returned dataframe is not used further here
    df_annotations = analyze.get_annotations(
        analyzer, Annotator=Annotator, annotators_config=annotators_config, **annotator_kwargs
    )


def precompute_evaluator_leaderboard(
    annotators_configs_to_analyze="MINIMAL_EVALUATORS",
    annotators_configs_to_benchmark="VERIFIED_EVALUATORS",
    max_instances=None,
    **kwargs,
):
    """Precompute the evaluators' leaderboard for important API models."""
    # string arguments name lists of evaluator configs defined in `constants`
    if isinstance(annotators_configs_to_analyze, str):
        annotators_configs_to_analyze = getattr(constants, annotators_configs_to_analyze)
    if isinstance(annotators_configs_to_benchmark, str):
        annotators_configs_to_benchmark = getattr(constants, annotators_configs_to_benchmark)

    for annotators_config in annotators_configs_to_analyze:
        # saving is done automatically
        _ = alpaca_main.analyze_evaluators(
            annotators_config=annotators_config,
            max_instances=max_instances,
            is_save_leaderboard=max_instances is None,
            is_return_instead_of_print=True,  # don't print
            current_leaderboard_mode="minimal",
            **kwargs,
        )

    for annotators_config in annotators_configs_to_benchmark:
        # saving is done automatically
        _ = alpaca_main.analyze_evaluators(
            annotators_config=annotators_config,
            max_instances=max_instances,
            is_save_leaderboard=max_instances is None,
            is_return_instead_of_print=True,  # don't print
            is_single_annotator=True,
            current_leaderboard_mode="verified",
            **kwargs,
        )


def update_leaderboard(leaderboard_path, model_outputs="results/{model_name}/model_outputs.json", **kwargs):
    """Rerun evaluate on each model in the leaderboard.

    Useful to add a column such as avg_length.
    """
    df_leaderboard = utils.load_or_convert_to_dataframe(leaderboard_path)
    for model_name in df_leaderboard.index:
        alpaca_main.evaluate(model_outputs=model_outputs.format(model_name=model_name), **kwargs)


def compare_leaderboards(leaderboard_path_1, leaderboard_path_2):
    """Print the Spearman correlation of win rates and the Pearson correlation of average lengths."""
    df_lb_1 = utils.load_or_convert_to_dataframe(leaderboard_path_1)
    df_lb_2 = utils.load_or_convert_to_dataframe(leaderboard_path_2)

    # keep only the intersection of models, in the same order
    intersected_models = df_lb_1.index.intersection(df_lb_2.index)
    df_lb_1 = df_lb_1.loc[intersected_models]
    df_lb_2 = df_lb_2.loc[intersected_models]

    metrics = {}
    metrics["Spearman corr."] = spearmanr(df_lb_1["win_rate"], df_lb_2["win_rate"]).statistic
    metrics["Pearson corr."] = pearsonr(df_lb_1["avg_length"], df_lb_2["avg_length"]).statistic
    print(pd.Series(metrics).to_string(float_format="%.2f"))


def make_leaderboard_like(leaderboard_to_copy: Optional[AnyPath], **kwargs):
    """Make a leaderboard on all the models that have been evaluated in another leaderboard."""
    df_lb_old = pd.read_csv(leaderboard_to_copy, index_col=0)
    kwargs["is_cache_leaderboard"] = True
    kwargs["is_return_instead_of_print"] = True
    for m, r in df_lb_old.iterrows():
        # evaluate each model in the same leaderboard mode as in the leaderboard being copied
        kwargs["current_leaderboard_mode"] = r["mode"]
        leaderboard_new, _ = alpaca_main.evaluate(model_outputs=f"results/{m}/model_outputs.json", **kwargs)

    print("Comparison between the leaderboards:")
    compare_leaderboards(leaderboard_to_copy, leaderboard_new)


def main(task, **kwargs):
    # dispatch to one of the functions defined in this module by name
    globals()[task](**kwargs)


if __name__ == "__main__":
    fire.Fire(main)
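# Example invocations through the `fire` CLI entry point above (the leaderboard
# paths below are illustrative, not files shipped with the repository):
#
#   python scripts/precompute.py precompute_evaluator_leaderboard --max_instances 10
#   python scripts/precompute.py compare_leaderboards \
#       --leaderboard_path_1 results/leaderboard_a.csv \
#       --leaderboard_path_2 results/leaderboard_b.csv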