in evals/elsuite/incontext_rl/scripts/plot_experiments.py [0:0]
def extract_results(datadir: Path) -> pd.DataFrame:
    """
    Extract experiment results from ``datadir`` into a DataFrame.

    Each row corresponds to one (model, environment) pair from a run, with
    the raw per-episode metrics flattened into columns plus derived columns
    (cumulative, rolling-average, and trailing-window rewards) used by the
    plotting code.

    Args:
        datadir (Path): Path to the directory containing the experiment results.

    Returns:
        pd.DataFrame: DataFrame containing the experiment results, one row
            per environment per result file.

    Raises:
        ValueError: If no results are found in the directory, a result file
            has no spec, or a spec is missing the model or base_eval fields.
    """
    print(f"Extracting results from directory: {datadir}")
    final_results = get_final_results_from_dir(datadir)
    if not final_results:
        print("No results found in directory.")
        raise ValueError("No results found in directory.")

    df_rows = []
    for path, results in final_results.items():
        print(f"Processing file: {path}")
        spec = extract_spec(path)
        if not spec:
            raise ValueError(f"No spec found for {path}")
        model = spec.get("completion_fns", [None])[0]
        base_eval = spec.get("base_eval")
        if not model or base_eval is None:
            raise ValueError(f"Missing model or base_eval in spec for {path}")
        for env in results.get('environments', []):
            metrics = env.get('metrics', {})
            # Flatten metrics into separate columns (keys coerced to str).
            flattened_metrics = {str(k): v for k, v in metrics.items()}
            print(f"Extracted {env['env']} metrics for model: {model}")
            # CliffWalking logs need episode boundaries reconstructed from
            # the reward stream rather than trusting the recorded end steps.
            if env['env'] == "CliffWalking-v0 {}":
                rewards = metrics.get('rewards', [])
                existing_end_steps = metrics.get('episode_end_steps', [])
                flattened_metrics['episode_end_steps'] = (
                    calculate_custom_episode_end_steps_for_cliffwalking(rewards, existing_end_steps)
                )
            df_rows.append({"model": model, "base_eval": base_eval, "environment": env['env'], **flattened_metrics})

    df = pd.DataFrame(df_rows)
    # Older runs may not log per-episode rewards directly; derive them.
    if 'episode_rewards' not in df.columns:
        df['episode_rewards'] = df.apply(calculate_episode_rewards, axis=1)

    # Derived columns for plots.
    df['cumulative_episode_rewards'] = df['episode_rewards'].apply(np.cumsum)
    df['average_episode_reward'] = df['episode_rewards'].apply(np.mean)
    # Per-environment rolling window, falling back to the default size.
    df['window_size'] = df['environment'].map(WINDOW_SIZES).fillna(WINDOW_SIZES.get('default', 20))
    df['rolling_average_rewards'] = df.apply(
        lambda row: calculate_rolling_average(row['episode_rewards'], row['window_size']), axis=1
    )
    # Fixed-size rolling averages and trailing-window means for comparison
    # across environments. Older runs may already contain the
    # average_reward_last_* columns, so only fill those in when absent.
    # (`w=window` binds the loop variable eagerly to avoid late binding.)
    for window in (5, 10, 20, 50):
        df[f'rolling_average_rewards_{window}_episodes'] = df['episode_rewards'].apply(
            lambda rewards, w=window: calculate_rolling_average(rewards, w)
        )
        last_n_col = f'average_reward_last_{window}_episodes'
        if last_n_col not in df.columns:
            df[last_n_col] = df['episode_rewards'].apply(
                lambda rewards, w=window: np.mean(rewards[-w:])
            )

    print(f"Extraction complete. {len(df_rows)} rows in DataFrame.")
    return df