# extract_results()
#
# From evals/elsuite/incontext_rl/scripts/plot_experiments.py [0:0]


def extract_results(datadir: Path) -> pd.DataFrame:
    """
    Extracts results from the specified directory and returns a DataFrame.

    Each row corresponds to one (run, environment) pair. Per-environment
    metrics are flattened into their own columns, and derived reward
    columns (cumulative, average, rolling averages, trailing means) are
    added for plotting.

    Args:
    datadir (Path): Path to the directory containing the experiment results.

    Returns:
    pd.DataFrame: DataFrame containing the experiment results.

    Raises:
    ValueError: If no results are found in the directory, if a result file
        has no spec, or if a spec is missing its model or base_eval.
    """
    print(f"Extracting results from directory: {datadir}")
    df_rows = []
    final_results = get_final_results_from_dir(datadir)
    if not final_results:
        print("No results found in directory.")
        raise ValueError("No results found in directory.")

    for path, results in final_results.items():
        print(f"Processing file: {path}")
        spec = extract_spec(path)
        if not spec:
            raise ValueError(f"No spec found for {path}")
        model = spec.get("completion_fns", [None])[0]
        base_eval = spec.get("base_eval")
        if not model or base_eval is None:
            raise ValueError(f"Missing model or base_eval in spec for {path}")

        for env in results.get('environments', []):
            # Shallow-copy the metrics into per-row columns so the
            # CliffWalking adjustment below does not mutate the source dict.
            flattened_metrics = dict(env.get('metrics', {}))
            print(f"Extracted {env['env']} metrics for model: {model}")

            # Calculate custom episode end steps for CliffWalking environment
            if env['env'] == "CliffWalking-v0 {}":
                rewards = flattened_metrics.get('rewards', [])
                existing_end_steps = flattened_metrics.get('episode_end_steps', [])
                flattened_metrics['episode_end_steps'] = (
                    calculate_custom_episode_end_steps_for_cliffwalking(rewards, existing_end_steps)
                )

            df_rows.append({"model": model, "base_eval": base_eval, "environment": env['env'], **flattened_metrics})

    df = pd.DataFrame(df_rows)

    # Older runs may lack 'episode_rewards' entirely; with mixed old/new runs
    # the column exists but only for some rows, so fill whichever rows are
    # missing it (NaN) rather than skipping the fill altogether.
    if 'episode_rewards' not in df.columns:
        df['episode_rewards'] = df.apply(calculate_episode_rewards, axis=1)
    else:
        missing = df['episode_rewards'].isna()
        if missing.any():
            df.loc[missing, 'episode_rewards'] = df.loc[missing].apply(calculate_episode_rewards, axis=1)

    # For plots
    df['cumulative_episode_rewards'] = df['episode_rewards'].apply(np.cumsum)
    df['average_episode_reward'] = df['episode_rewards'].apply(np.mean)
    df['window_size'] = df['environment'].map(WINDOW_SIZES).fillna(WINDOW_SIZES.get('default', 20))
    df['rolling_average_rewards'] = df.apply(
        lambda row: calculate_rolling_average(row['episode_rewards'], row['window_size']), axis=1
    )

    # We also calculate the rolling average across different window sizes.
    # (Bind the window via a default arg to avoid late-binding closures.)
    for window in (5, 10, 20, 50):
        df[f'rolling_average_rewards_{window}_episodes'] = df['episode_rewards'].apply(
            lambda rewards, w=window: calculate_rolling_average(rewards, w)
        )

    # We also calculate the average reward for the last 5, 10, 20, and 50
    # episodes. Newer runs may already carry these columns in their metrics,
    # so only compute them when absent.
    for window in (5, 10, 20, 50):
        col = f'average_reward_last_{window}_episodes'
        if col not in df.columns:
            df[col] = df['episode_rewards'].apply(lambda rewards, w=window: np.mean(rewards[-w:]))

    print(f"Extraction complete. {len(df_rows)} rows in DataFrame.")
    return df