in experiments/arena/common/metadata.py [0:0]
def _expected_score(rating: float, opponent_rating: float) -> float:
    """Standard ELO expected score for a player at *rating* vs *opponent_rating*."""
    return 1 / (1 + 10 ** ((opponent_rating - rating) / 400))


def _fetch_elo_ratings(study):
    """Fetch the study's current ELO ratings from Firestore.

    Returns a ``(ratings, doc_id)`` tuple where ``ratings`` is the merged
    ratings dict and ``doc_id`` is the Firestore document id holding them,
    or ``None`` when no ratings document exists yet for this study.
    """
    docs = (
        db.collection(config.IMAGE_RATINGS_COLLECTION_NAME)
        .where(filter=firestore.FieldFilter("study", "==", study))
        .where(filter=firestore.FieldFilter("type", "==", "elo_rating"))
        .get()
    )
    ratings = {}
    doc_id = None
    if docs:
        for doc in docs:
            # If several matching docs exist, the last one's id wins —
            # same behavior as the original loop.
            doc_id = doc.id
            ratings.update(doc.to_dict().get("ratings", {}))
    return ratings, doc_id


def _store_elo_ratings(ratings, doc_id, timestamp, study):
    """Create or update the study's ELO ratings document in Firestore."""
    if doc_id:  # Document already exists — update it in place.
        doc_ref = db.collection(config.IMAGE_RATINGS_COLLECTION_NAME).document(doc_id)
        doc_ref.update(
            {
                "ratings": ratings,
                "timestamp": timestamp,
            }
        )
        print(f"ELO ratings updated in Firestore with document ID: {doc_ref.id}")
    else:  # First vote for this study — create the ratings document.
        doc_ref = db.collection(config.IMAGE_RATINGS_COLLECTION_NAME).document()
        doc_ref.set(
            {
                "study": study,
                "type": "elo_rating",
                "ratings": ratings,
                "timestamp": timestamp,
            }
        )
        print(f"ELO ratings created in Firestore with document ID: {doc_ref.id}")


def _record_vote(model1, model2, winner, images, prompt, study, timestamp):
    """Persist one head-to-head vote document in Firestore."""
    doc_ref = db.collection(config.IMAGE_RATINGS_COLLECTION_NAME).document()
    doc_ref.set(
        {
            "timestamp": timestamp,
            "type": "vote",
            "model1": model1,
            "image1": images[0],
            "model2": model2,
            "image2": images[1],
            "winner": winner,
            "prompt": prompt,
            "study": study
        }
    )
    print(f"Vote updated in Firestore with document ID: {doc_ref.id}")


def _sync_ratings_to_spanner(ratings, study):
    """Mirror the latest ELO ratings into Spanner via ArenaStudyTracker.

    Raises:
        RuntimeError: If the Spanner upsert fails.
    """
    # NOTE: the previous `if not study_tracker:` guard was dead code — a
    # successful constructor call never returns a falsy value. A failing
    # constructor raises on its own and propagates, as before.
    study_tracker = ArenaStudyTracker(
        project_id=config.PROJECT_ID,
        spanner_instance_id=config.SPANNER_INSTANCE_ID,
        spanner_database_id=config.SPANNER_DATABASE_ID,
    )
    elo_ratings_by_model = [
        ArenaModelEvaluation(model_name=model, rating=elo, study=study)
        for model, elo in ratings.items()
    ]
    try:
        study_tracker.upsert_study_runs(study_runs=elo_ratings_by_model)
        log(f"ELO ratings updated in Spanner for study '{study}'.", LogLevel.ON)
    except Exception as e:
        log(f"Failed to update ELO ratings in Spanner: {e}", LogLevel.ERROR)
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Failed to update ELO ratings in Spanner: {e}") from e


def update_elo_ratings(model1: str, model2: str, winner: str, images: list[str], prompt: str, study: str):
    """Update ELO ratings for a pair of models after a head-to-head vote.

    Fetches the study's current ratings from Firestore, applies the standard
    ELO update for the (model1, model2) match-up, persists the new ratings and
    the individual vote back to Firestore, and mirrors the latest ratings into
    Spanner.

    Args:
        model1: Name of the first model in the match-up.
        model2: Name of the second model in the match-up.
        winner: The winning model's name; any other value (e.g. a tie marker)
            leaves both ratings unchanged, though they are still re-persisted.
        images: Two image identifiers; ``images[0]`` belongs to *model1* and
            ``images[1]`` to *model2*.
        prompt: The prompt the compared images were generated from.
        study: Study identifier used to scope the ratings document.

    Raises:
        RuntimeError: If the Spanner upsert of the updated ratings fails.
    """
    # NOTE(review): naive local time, matching the original — confirm whether
    # timezone-aware UTC timestamps are expected downstream.
    current_datetime = datetime.datetime.now()

    updated_ratings, elo_rating_doc_id = _fetch_elo_ratings(study)
    elo_model1 = updated_ratings.get(model1, 1000)  # 1000 = conventional starting rating
    elo_model2 = updated_ratings.get(model2, 1000)

    # Compute BOTH expected scores before mutating either rating — the
    # second update must use the pre-match rating of the opponent.
    expected_model1 = _expected_score(elo_model1, elo_model2)
    expected_model2 = _expected_score(elo_model2, elo_model1)
    k_factor = config.ELO_K_FACTOR
    if winner == model1:
        elo_model1 = elo_model1 + k_factor * (1 - expected_model1)
        elo_model2 = elo_model2 + k_factor * (0 - expected_model2)
    elif winner == model2:
        elo_model1 = elo_model1 + k_factor * (0 - expected_model1)
        elo_model2 = elo_model2 + k_factor * (1 - expected_model2)
    # Any other winner value leaves both ratings as-is.
    updated_ratings[model1] = round(elo_model1, 2)
    updated_ratings[model2] = round(elo_model2, 2)
    print(f"Ratings: {updated_ratings}")

    _store_elo_ratings(updated_ratings, elo_rating_doc_id, current_datetime, study)
    _record_vote(model1, model2, winner, images, prompt, study, current_datetime)
    _sync_ratings_to_spanner(updated_ratings, study)