in scripts/figs.py [0:0]
def overall_stats(df: pd.DataFrame, tag: str) -> pd.DataFrame:
    """Compute per-target cost and traffic statistics for ``size-predict`` rows.

    Only rows whose ``mode`` column equals ``'size-predict'`` are used, and the
    caller's frame is never modified. One output row is produced for each
    distinct value found in ``cloud_computation_target``.

    Args:
        df: Input frame. Must provide the columns ``mode``,
            ``cloud_computation_target``, ``ingress_byte_Presto``,
            ``ingress_byte_Spark``, ``egress_byte_Presto``,
            ``egress_byte_Spark``, ``movement_ingress_bytes``,
            ``movement_egress_bytes``, ``rep_bytes`` and ``P95_traffic_bps``.
        tag: Name identifier (e.g., "Moirai") copied into every result row.

    Returns:
        A frame with the columns ``tag``, ``c``, ``network_cost``,
        ``egress_cost_Spark``, ``egress_cost_Presto``, ``egress_cost``,
        ``rep_cost``, ``total_cost``, ``total_cost_std``, ``ingress_volume``
        and ``egress_volume``. These columns are present even when no row
        matches, so callers can rely on a stable schema.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    result_columns = [
        "tag",
        "c",
        "network_cost",
        "egress_cost_Spark",
        "egress_cost_Presto",
        "egress_cost",
        "rep_cost",
        "total_cost",
        "total_cost_std",
        "ingress_volume",
        "egress_volume",
    ]

    gib = 1024 ** 3
    tib = 1024 ** 4

    # Pricing constants, carried over unchanged from the previous inline
    # literals. NOTE(review): currency and billing units are not visible in
    # this function -- confirm against the pricing source.
    egress_price_per_gib = 0.02
    rep_price_per_gib = 0.023
    rep_price_divisor = 4  # presumably prorates a monthly price to one week -- confirm
    link_unit = 100 * gib  # peak P95 traffic is expressed in multiples of this
    link_price = 23.3
    hours_per_day = 24  # presumably: the link price is hourly -- confirm
    days_per_week = 7

    # Work on a copy so the derived columns never leak into the caller's frame.
    rows = df[df['mode'] == 'size-predict'].copy()

    # Job traffic summed over both engines.
    rows['job_ingress_bytes'] = rows['ingress_byte_Presto'] + rows['ingress_byte_Spark']
    rows['job_egress_bytes'] = rows['egress_byte_Presto'] + rows['egress_byte_Spark']

    # Total traffic = job traffic + data-movement traffic.
    rows['ingress_volume'] = rows['job_ingress_bytes'] + rows['movement_ingress_bytes']
    rows['egress_volume'] = rows['job_egress_bytes'] + rows['movement_egress_bytes']

    # Per-engine egress used for billing.
    # NOTE(review): movement_egress_bytes is added to BOTH engines, so it is
    # counted twice in 'cost' and in 'egress_cost', while 'egress_volume'
    # counts it once. Behaviour is preserved here -- confirm it is intended.
    rows['egress_volume_Spark'] = rows['egress_byte_Spark'] + rows['movement_egress_bytes']
    rows['egress_volume_Presto'] = rows['egress_byte_Presto'] + rows['movement_egress_bytes']

    # Per-row cost: egress for both engines plus the rep_bytes charge.
    rows['cost'] = (
        rows['egress_volume_Presto'] / gib * egress_price_per_gib
        + rows['egress_volume_Spark'] / gib * egress_price_per_gib
        + rows['rep_bytes'] / gib * rep_price_per_gib / rep_price_divisor
    )

    results = []
    for target in rows['cloud_computation_target'].unique():
        group = rows[rows['cloud_computation_target'] == target]

        # The network charge is driven by the single highest P95 value seen
        # for this target, not by the mean.
        network_cost = (
            group['P95_traffic_bps'].max() / link_unit
            * link_price * hours_per_day * days_per_week
        )
        egress_cost_presto = group['egress_volume_Presto'].mean() / gib * egress_price_per_gib
        egress_cost_spark = group['egress_volume_Spark'].mean() / gib * egress_price_per_gib
        rep_cost = group['rep_bytes'].mean() / gib * rep_price_per_gib / rep_price_divisor

        results.append({
            "tag": tag,
            "c": target,
            "network_cost": network_cost,
            "egress_cost_Spark": egress_cost_spark,
            "egress_cost_Presto": egress_cost_presto,
            "egress_cost": egress_cost_spark + egress_cost_presto,
            "rep_cost": rep_cost,
            "total_cost": group['cost'].mean() + network_cost,
            # Spread of the per-row cost only; network_cost is one constant
            # per target. Sample std, so NaN when a target has a single row.
            "total_cost_std": group['cost'].std(),
            # Mean volumes scaled by 1024 ** 4.
            "ingress_volume": group['ingress_volume'].mean() / tib,
            "egress_volume": group['egress_volume'].mean() / tib,
        })

    # Passing the column list keeps the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=result_columns)