# Load Presto and Spark data
import pandas as pd

from utility import human_readable_size

print("======= # of jobs and traffic volume ========")
# Headline statistics: total job counts and total traffic for Presto and Spark.
PB = 1024 ** 5  # bytes per pebibyte
EB = 1024 ** 6  # bytes per exbibyte

presto = pd.read_csv("../metrics_per_day_presto.csv", parse_dates=['date'])
spark = pd.read_csv("../metrics_per_day_spark.csv", parse_dates=['date'])

presto_jobs = presto['daily_jobs'].sum()
spark_jobs = spark['daily_jobs'].sum()

# Presto traffic counts reads only; Spark counts both reads and writes.
presto_bytes = presto['daily_read_volume'].sum()
spark_bytes = spark['daily_read_volume'].sum() + spark['daily_write_volume'].sum()

print(f"Presto: {presto_jobs:,} jobs, {presto_bytes / PB:.2f} PB read")
print(f"Spark: {spark_jobs:,} jobs, {spark_bytes / PB:.2f} PB read+write")
print(f"Total jobs: {presto_jobs + spark_jobs:,} jobs")
print(f"Total traffic volume: {(presto_bytes + spark_bytes) / EB:.2f} EB read+write")

print("======= Sec1 Overall Statistics ========")
# overall_stats.csv columns: tag,c,network_cost,egress_cost,rep_cost,
# total_cost,total_cost_std,ingress_volume,egress_volume
df = pd.read_csv("../overall_stats.csv")

# Restrict to the c = 50 configuration.
df_c50 = df[df['c'] == 50]

# Moirai's row(s) at c = 50 serve as the comparison target.
moirai_data = df_c50[df_c50['tag'] == 'Moirai']
print("Moirai data", moirai_data[['c', 'network_cost', 'egress_cost', 'rep_cost']])


# Output column name -> source cost column in overall_stats.csv.
_SEC1_METRICS = {'total': 'total_cost', 'network': 'network_cost',
                 'egress': 'egress_cost', 'rep': 'rep_cost'}

# Percentage improvement of Moirai over each baseline approach:
# (baseline - moirai) / baseline * 100, per cost metric.
for tag in ['Yugong', 'No\nRep', 'Rep\n3Mon.']:
    tag_rows = df_c50[df_c50['tag'] == tag]

    # Align baseline and Moirai rows on the shared 'c' column.
    merged_data = pd.merge(tag_rows, moirai_data,
                           on='c', suffixes=('_tag', '_moirai'))

    for out_col, cost in _SEC1_METRICS.items():
        baseline = merged_data[f'{cost}_tag']
        merged_data[out_col] = (baseline - merged_data[f'{cost}_moirai']) / baseline * 100

    print(tag)
    print(merged_data[['total', 'network', 'egress', 'rep']])

print("======= Sec5 Overall Statistics ========")
# overall_stats.csv columns: tag,c,network_cost,egress_cost,rep_cost,
# total_cost,total_cost_std,ingress_volume,egress_volume
df = pd.read_csv("../overall_stats.csv")
moirai_data = df[df['tag'] == 'Moirai']

print("** how Moirai improves over other approaches in Sec5 **")
# Improvement of Moirai over each approach, expressed as a percentage of
# the approach's own value: (approach - moirai) / approach * 100.
for tag in ['Volley', 'No\nRep', 'Rep\n3Mon.', 'Rep\nTop2.5%', 'Yugong']:
    approach_rows = df[df['tag'] == tag]

    # Align approach and Moirai rows on the shared 'c' column.
    merged_data = pd.merge(approach_rows, moirai_data,
                           on='c', suffixes=('_tag', '_moirai'))

    # Cost metrics share one formula; loop instead of repeating it.
    for name in ('total', 'network', 'egress', 'rep'):
        approach_cost = merged_data[f'{name}_cost_tag']
        merged_data[name] = (approach_cost - merged_data[f'{name}_cost_moirai']) / \
            approach_cost * 100

    # Traffic combines ingress and egress volume, same relative formula.
    approach_vol = merged_data['ingress_volume_tag'] + merged_data['egress_volume_tag']
    moirai_vol = merged_data['ingress_volume_moirai'] + merged_data['egress_volume_moirai']
    merged_data['traffic'] = (approach_vol - moirai_vol) / approach_vol * 100

    print(tag)
    print(merged_data[['c', 'total', 'traffic', 'network', 'egress', 'rep']].round(2))

print("Reduction of Volley vs No Rep:")
# Compare the No Rep and Volley total costs per capacity setting c.
merged_data = pd.merge(df[df['tag'] == 'No\nRep'], df[df['tag'] == 'Volley'],
                           on='c', suffixes=('_norep', '_volley'))
# NOTE(review): the denominator here is the *Volley* cost, so this value is
# the relative gap of No Rep above Volley, not a classic "reduction" (which
# would divide by the No Rep baseline, as the Sec5 improvement loop does).
# Confirm the denominator choice is intentional before relying on the label.
merged_data['total'] = (merged_data['total_cost_norep'] - merged_data['total_cost_volley']) / \
    merged_data['total_cost_volley'] * 100

print(merged_data[['c', 'total']].round(1))



print("** gap between other approaches and Moirai **")
# Gap of each approach above Moirai, expressed as a percentage of
# *Moirai's* value: (approach - moirai) / moirai * 100. Note the
# denominator differs from the improvement loop above, which divides by
# the approach's own value.
for tag in ['Volley', 'No\nRep', 'Rep\n3Mon.', 'Rep\nTop2.5%', 'Yugong', 'Moi\nJobDist', 'Moi\n1%Job']:
    # Align approach and Moirai rows on the shared 'c' column.
    merged_data = pd.merge(df[df['tag'] == tag], moirai_data,
                           on='c', suffixes=('_tag', '_moirai'))

    # Cost metrics share one formula; loop instead of repeating it.
    for name in ('total', 'network', 'egress', 'rep'):
        moirai_cost = merged_data[f'{name}_cost_moirai']
        merged_data[name] = (merged_data[f'{name}_cost_tag'] - moirai_cost) / \
            moirai_cost * 100

    # Traffic combines ingress and egress volume, same relative formula.
    approach_vol = merged_data['ingress_volume_tag'] + merged_data['egress_volume_tag']
    moirai_vol = merged_data['ingress_volume_moirai'] + merged_data['egress_volume_moirai']
    merged_data['traffic'] = (approach_vol - moirai_vol) / moirai_vol * 100

    print(tag)
    print(merged_data[['c', 'total', 'traffic', 'network', 'egress', 'rep']].round(0).astype(int))

print("========= Moirai Data Movement weekly =========")
# log.csv columns: period,mode,cloud_computation_target,
# movement_ingress_bytes,movement_egress_bytes
df = pd.read_csv("../sample_1.000_rep0.002/log.csv")
df = df[df['mode'] == 'size-predict']
df['movement_bytes'] = df['movement_ingress_bytes'] + df['movement_egress_bytes']
# Egress is billed at $0.02 per GB.
df['movement_egress_cost'] = df['movement_egress_bytes'] * 0.02 / 1024 ** 3

# Per-target movement volumes, sorted ascending, in human-readable units.
volumes_by_target = df.groupby('cloud_computation_target')['movement_bytes'].apply(list)
for target, values in volumes_by_target.items():
    print(f"{target}: {[human_readable_size(v) for v in sorted(values)]}")

# Per-target egress cost, sorted ascending, rounded to whole dollars.
costs_by_target = df.groupby('cloud_computation_target')['movement_egress_cost'].apply(list)
for target, values in costs_by_target.items():
    print(f"{target}: {[round(v, 0) for v in sorted(values)]}")

print("========= Moirai Job Routing =========")
# log.csv columns: period,mode,cloud_computation_target,
# ingress_byte_Presto,egress_byte_Presto,
# ingress_byte_Spark,egress_byte_Spark
df = pd.read_csv("../sample_1.000_rep0.002/log.csv")
df['egress_byte'] = df['egress_byte_Presto'] + df['egress_byte_Spark']

# Mean combined egress bytes per target cloud, for each routing mode.
mean_egress = {
    mode: df[df['mode'] == mode].groupby('cloud_computation_target')['egress_byte'].mean()
    for mode in ('size-predict', 'independent', 'size-unaware', 'size-aware')
}

# How much size-predict reduces egress relative to each weaker mode.
reduction = (mean_egress['independent'] - mean_egress['size-predict']) / \
    mean_egress['independent'] * 100
print("** size-predict reduction on independent", reduction.round(1))
reduction = (mean_egress['size-unaware'] - mean_egress['size-predict']) / \
    mean_egress['size-unaware'] * 100
print("** size-predict reduction on size-unaware", reduction.round(1))
# How close size-predict gets to the size-aware oracle.
gap = (mean_egress['size-predict'] - mean_egress['size-aware']) / \
    mean_egress['size-aware'] * 100
print("** size-predict is within ?% of size-aware", gap.round(1))
