def overall_stats()

in scripts/figs.py [0:0]


def overall_stats(df: pd.DataFrame, tag: str) -> pd.DataFrame:
    """Compute per-target cost and traffic statistics for ``size-predict`` rows.

    Only rows whose ``mode`` column equals ``'size-predict'`` are used. The
    input frame is not modified; derived columns are added to a filtered copy.

    Input columns read: ``mode``, ``cloud_computation_target``,
    ``ingress_byte_Presto``, ``ingress_byte_Spark``, ``egress_byte_Presto``,
    ``egress_byte_Spark``, ``movement_ingress_bytes``,
    ``movement_egress_bytes``, ``rep_bytes`` and ``P95_traffic_bps``.

    Args:
        df (pd.DataFrame): Input dataframe with the columns listed above.
        tag (str): Name identifier (e.g., "Moirai") copied into every row of
            the result.

    Returns:
        pd.DataFrame: One row per distinct ``cloud_computation_target`` value
        (in order of first appearance) with the columns ``tag``, ``c``,
        ``network_cost``, ``egress_cost_Spark``, ``egress_cost_Presto``,
        ``egress_cost``, ``rep_cost``, ``total_cost``, ``total_cost_std``,
        ``ingress_volume`` and ``egress_volume``. When no row matches, the
        frame is empty but still carries these columns.
    """
    gib = 1024 ** 3
    tib = 1024 ** 4
    # Cost multipliers, kept exactly as in the original expressions.
    # NOTE(review): presumably cloud price rates per GiB -- confirm the source
    # and units of these figures.
    egress_rate = 0.02
    rep_rate = 0.023
    rep_divisor = 4

    output_columns = [
        "tag",
        "c",
        "network_cost",
        "egress_cost_Spark",
        "egress_cost_Presto",
        "egress_cost",
        "rep_cost",
        "total_cost",
        "total_cost_std",
        "ingress_volume",
        "egress_volume",
    ]

    df = df[df['mode'] == 'size-predict'].copy()

    # Total ingress/egress volume: both engines' job traffic plus movement.
    df['ingress_volume'] = (
        df['ingress_byte_Presto'] + df['ingress_byte_Spark']
        + df['movement_ingress_bytes']
    )
    df['egress_volume'] = (
        df['egress_byte_Presto'] + df['egress_byte_Spark']
        + df['movement_egress_bytes']
    )

    # Per-engine egress volume used for the cost components.
    # NOTE(review): movement_egress_bytes is added to BOTH engines, so it
    # contributes twice to `cost` and to the summed egress cost -- confirm
    # that this is intended.
    df['egress_volume_Spark'] = df['egress_byte_Spark'] + df['movement_egress_bytes']
    df['egress_volume_Presto'] = df['egress_byte_Presto'] + df['movement_egress_bytes']

    # Per-row cost: Presto egress + Spark egress + replication component.
    df['cost'] = (
        df['egress_volume_Presto'] / gib * egress_rate
        + df['egress_volume_Spark'] / gib * egress_rate
        + df['rep_bytes'] / gib * rep_rate / rep_divisor
    )

    results = []

    for c in df['cloud_computation_target'].unique():
        df_c = df[df['cloud_computation_target'] == c]

        # Network cost is driven by the largest P95 traffic value of the
        # group, scaled by 100 * 1024**3 and the factors 23.3 * 24 * 7.
        # NOTE(review): looks like an hourly rate per 100 Gi of bandwidth
        # extended to one week -- confirm.
        network_cost = df_c['P95_traffic_bps'].max() / (100 * gib) * 23.3 * 24 * 7
        egress_cost_presto = df_c['egress_volume_Presto'].mean() / gib * egress_rate
        egress_cost_spark = df_c['egress_volume_Spark'].mean() / gib * egress_rate
        rep_cost = df_c['rep_bytes'].mean() / gib * rep_rate / rep_divisor
        total_cost = df_c['cost'].mean() + network_cost

        # Spread of the per-row cost only; network_cost is a single value per
        # group and therefore does not enter the standard deviation. pandas
        # uses the sample standard deviation, so one-row groups yield NaN.
        total_cost_std = df_c['cost'].std()

        results.append({
            "tag": tag,
            "c": c,
            "network_cost": network_cost,
            "egress_cost_Spark": egress_cost_spark,
            "egress_cost_Presto": egress_cost_presto,
            "egress_cost": egress_cost_spark + egress_cost_presto,
            "rep_cost": rep_cost,
            "total_cost": total_cost,
            "total_cost_std": total_cost_std,
            # Mean volumes divided by 1024**4 (TiB when the inputs are bytes).
            "ingress_volume": df_c['ingress_volume'].mean() / tib,
            "egress_volume": df_c['egress_volume'].mean() / tib,
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=output_columns)