in fastchat/serve/monitor/elo_analysis.py [0:0]
def report_elo_analysis_results(battles_json, limit_show_number=25):
    """Compute Elo ratings from battle records and build leaderboard plots.

    Args:
        battles_json: A list of battle records (dicts) convertible to a
            pandas DataFrame. Each record is expected to have at least
            "tstamp", "anony", and "winner" fields, plus whatever the
            Elo/visualization helpers need (e.g. model names).
        limit_show_number: Max number of models to include in the plots,
            to keep them readable. Defaults to 25 (previous hard-coded value).

    Returns:
        A dict with online and median (bootstrap) Elo ratings, the rendered
        plot objects, and the timestamp/datetime of the newest battle.
    """
    battles = pd.DataFrame(battles_json)
    # Process battles in chronological order so the online Elo update is
    # deterministic and reflects the actual sequence of games.
    battles = battles.sort_values(ascending=True, by=["tstamp"])

    # Only use anonymous votes: named battles may be biased by users
    # recognizing the models.
    battles = battles[battles["anony"]].reset_index(drop=True)

    # Ties carry no win/loss signal, so drop them for the win-rate plots.
    battles_no_ties = battles[~battles["winner"].str.contains("tie")]

    # Online update: sequential Elo over the full (tie-inclusive) history.
    elo_rating_online = compute_elo(battles)

    # Bootstrap: resample battles to get a rating distribution; use the
    # median as a more robust point estimate than the online rating.
    bootstrap_df = get_bootstrap_result(battles, compute_elo)
    elo_rating_median = get_median_elo_from_bootstrap(bootstrap_df)

    # Rank models by descending median Elo and keep only the top entries.
    model_order = list(elo_rating_median.keys())
    model_order.sort(key=lambda k: -elo_rating_median[k])
    model_order = model_order[:limit_show_number]

    # Plots
    leaderboard_table = visualize_leaderboard_table(elo_rating_median)
    win_fraction_heatmap = visualize_pairwise_win_fraction(battles_no_ties, model_order)
    battle_count_heatmap = visualize_battle_count(battles_no_ties, model_order)
    average_win_rate_bar = visualize_average_win_rate(
        battles_no_ties, limit_show_number
    )
    bootstrap_elo_rating = visualize_bootstrap_elo_rating(
        bootstrap_df, limit_show_number
    )

    # Report freshness in US/Pacific time (the project's display timezone).
    last_updated_tstamp = battles["tstamp"].max()
    last_updated_datetime = datetime.datetime.fromtimestamp(
        last_updated_tstamp, tz=timezone("US/Pacific")
    ).strftime("%Y-%m-%d %H:%M:%S %Z")

    return {
        "elo_rating_online": elo_rating_online,
        "elo_rating_median": elo_rating_median,
        "leaderboard_table": leaderboard_table,
        "win_fraction_heatmap": win_fraction_heatmap,
        "battle_count_heatmap": battle_count_heatmap,
        "average_win_rate_bar": average_win_rate_bar,
        "bootstrap_elo_rating": bootstrap_elo_rating,
        "last_updated_datetime": last_updated_datetime,
        "last_updated_tstamp": last_updated_tstamp,
    }