in experiments/arena/common/metadata.py [0:0]
def _expected_score(rating: float, opponent_rating: float) -> float:
    """Standard ELO expected score for a player at *rating* vs *opponent_rating*."""
    return 1 / (1 + 10 ** ((opponent_rating - rating) / 400))


def _fetch_elo_ratings(study):
    """Fetch the study's current ELO ratings from Firestore.

    Returns a ``(ratings, doc_id)`` tuple where ``ratings`` is the merged
    ratings dict and ``doc_id`` is the Firestore document id holding them,
    or ``None`` when no ratings document exists yet for this study.
    """
    docs = (
        db.collection(config.IMAGE_RATINGS_COLLECTION_NAME)
        .where(filter=firestore.FieldFilter("study", "==", study))
        .where(filter=firestore.FieldFilter("type", "==", "elo_rating"))
        .get()
    )
    ratings = {}
    doc_id = None
    if docs:
        for doc in docs:
            # If several matching docs exist, the last one's id wins —
            # same behavior as the original loop.
            doc_id = doc.id
            ratings.update(doc.to_dict().get("ratings", {}))
    return ratings, doc_id


def _store_elo_ratings(ratings, doc_id, timestamp, study):
    """Create or update the study's ELO ratings document in Firestore."""
    if doc_id:  # Document already exists — update it in place.
        doc_ref = db.collection(config.IMAGE_RATINGS_COLLECTION_NAME).document(doc_id)
        doc_ref.update(
            {
                "ratings": ratings,
                "timestamp": timestamp,
            }
        )
        print(f"ELO ratings updated in Firestore with document ID: {doc_ref.id}")
    else:  # First vote for this study — create the ratings document.
        doc_ref = db.collection(config.IMAGE_RATINGS_COLLECTION_NAME).document()
        doc_ref.set(
            {
                "study": study,
                "type": "elo_rating",
                "ratings": ratings,
                "timestamp": timestamp,
            }
        )
        print(f"ELO ratings created in Firestore with document ID: {doc_ref.id}")


def _record_vote(model1, model2, winner, images, prompt, study, timestamp):
    """Persist one head-to-head vote document in Firestore."""
    doc_ref = db.collection(config.IMAGE_RATINGS_COLLECTION_NAME).document()
    doc_ref.set(
        {
            "timestamp": timestamp,
            "type": "vote",
            "model1": model1,
            "image1": images[0],
            "model2": model2,
            "image2": images[1],
            "winner": winner,
            "prompt": prompt,
            "study": study
        }
    )
    print(f"Vote updated in Firestore with document ID: {doc_ref.id}")


def _sync_ratings_to_spanner(ratings, study):
    """Mirror the latest ELO ratings into Spanner via ArenaStudyTracker.

    Raises:
        RuntimeError: If the Spanner upsert fails.
    """
    # NOTE: the previous `if not study_tracker:` guard was dead code — a
    # successful constructor call never returns a falsy value. A failing
    # constructor raises on its own and propagates, as before.
    study_tracker = ArenaStudyTracker(
        project_id=config.PROJECT_ID,
        spanner_instance_id=config.SPANNER_INSTANCE_ID,
        spanner_database_id=config.SPANNER_DATABASE_ID,
    )
    elo_ratings_by_model = [
        ArenaModelEvaluation(model_name=model, rating=elo, study=study)
        for model, elo in ratings.items()
    ]
    try:
        study_tracker.upsert_study_runs(study_runs=elo_ratings_by_model)
        log(f"ELO ratings updated in Spanner for study '{study}'.", LogLevel.ON)
    except Exception as e:
        log(f"Failed to update ELO ratings in Spanner: {e}", LogLevel.ERROR)
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Failed to update ELO ratings in Spanner: {e}") from e


def update_elo_ratings(model1: str, model2: str, winner: str, images: list[str], prompt: str, study: str):
    """Update ELO ratings for a pair of models after a head-to-head vote.

    Fetches the study's current ratings from Firestore, applies the standard
    ELO update for the (model1, model2) match-up, persists the new ratings and
    the individual vote back to Firestore, and mirrors the latest ratings into
    Spanner.

    Args:
        model1: Name of the first model in the match-up.
        model2: Name of the second model in the match-up.
        winner: The winning model's name; any other value (e.g. a tie marker)
            leaves both ratings unchanged, though they are still re-persisted.
        images: Two image identifiers; ``images[0]`` belongs to *model1* and
            ``images[1]`` to *model2*.
        prompt: The prompt the compared images were generated from.
        study: Study identifier used to scope the ratings document.

    Raises:
        RuntimeError: If the Spanner upsert of the updated ratings fails.
    """
    # NOTE(review): naive local time, matching the original — confirm whether
    # timezone-aware UTC timestamps are expected downstream.
    current_datetime = datetime.datetime.now()

    updated_ratings, elo_rating_doc_id = _fetch_elo_ratings(study)
    elo_model1 = updated_ratings.get(model1, 1000)  # 1000 = conventional starting rating
    elo_model2 = updated_ratings.get(model2, 1000)

    # Compute BOTH expected scores before mutating either rating — the
    # second update must use the pre-match rating of the opponent.
    expected_model1 = _expected_score(elo_model1, elo_model2)
    expected_model2 = _expected_score(elo_model2, elo_model1)
    k_factor = config.ELO_K_FACTOR
    if winner == model1:
        elo_model1 = elo_model1 + k_factor * (1 - expected_model1)
        elo_model2 = elo_model2 + k_factor * (0 - expected_model2)
    elif winner == model2:
        elo_model1 = elo_model1 + k_factor * (0 - expected_model1)
        elo_model2 = elo_model2 + k_factor * (1 - expected_model2)
    # Any other winner value leaves both ratings as-is.
    updated_ratings[model1] = round(elo_model1, 2)
    updated_ratings[model2] = round(elo_model2, 2)
    print(f"Ratings: {updated_ratings}")

    _store_elo_ratings(updated_ratings, elo_rating_doc_id, current_datetime, study)
    _record_vote(model1, model2, winner, images, prompt, study, current_datetime)
    _sync_ratings_to_spanner(updated_ratings, study)