# scripts/evaluation.py

# Load Presto and Spark data import pandas as pd from utility import human_readable_size print("======= # of jobs and traffic volume ========") presto_df = pd.read_csv("../metrics_per_day_presto.csv", parse_dates=['date']) spark_df = pd.read_csv("../metrics_per_day_spark.csv", parse_dates=['date']) # Compute totals presto_total_jobs = presto_df['daily_jobs'].sum() spark_total_jobs = spark_df['daily_jobs'].sum() presto_total_rw_bytes = presto_df['daily_read_volume'].sum() spark_total_rw_bytes = spark_df['daily_read_volume'].sum() + spark_df['daily_write_volume'].sum() print(f"Presto: {presto_total_jobs:,} jobs, {presto_total_rw_bytes / 1024 ** 5:.2f} PB read") print(f"Spark: {spark_total_jobs:,} jobs, {spark_total_rw_bytes / 1024 ** 5:.2f} PB read+write") print(f"Total jobs: {presto_total_jobs + spark_total_jobs:,} jobs") print(f"Total traffic volume: {(presto_total_rw_bytes + spark_total_rw_bytes) / 1024 ** 6:.2f} EB read+write") print("======= Sec1 Overall Statistics ========") # header: tag,c,network_cost, egress_cost,rep_cost, # total_cost,total_cost_std,ingress_volume,egress_volume df = pd.read_csv("../overall_stats.csv") # Filter data for c = 50 df_c50 = df[df['c'] == 50] # Get Moirai's data and all unique tags moirai_data = df_c50[df_c50['tag'] == 'Moirai'] print("Moirai data", moirai_data[['c', 'network_cost', 'egress_cost', 'rep_cost']]) # Loop through each tag and calculate the improvement for tag in ['Yugong', 'No\nRep', 'Rep\n3Mon.']: # Get the data for the current tag tag_data = df_c50[df_c50['tag'] == tag] # Merge tag data with Moirai data based on index to align rows merged_data = pd.merge(tag_data, moirai_data, on='c', suffixes=('_tag', '_moirai')) # Calculate the percentage improvement for each cost metric merged_data['total'] = (merged_data['total_cost_tag'] - merged_data[ 'total_cost_moirai']) / merged_data['total_cost_tag'] * 100 merged_data['network'] = (merged_data['network_cost_tag'] - merged_data[ 'network_cost_moirai']) / 
merged_data['network_cost_tag'] * 100 merged_data['egress'] = (merged_data['egress_cost_tag'] - merged_data[ 'egress_cost_moirai']) / merged_data['egress_cost_tag'] * 100 merged_data['rep'] = (merged_data['rep_cost_tag'] - merged_data[ 'rep_cost_moirai']) / merged_data['rep_cost_tag'] * 100 print(tag) print(merged_data[['total', 'network', 'egress', 'rep']]) print("======= Sec5 Overall Statistics ========") # header: tag,c,network_cost, egress_cost,rep_cost, # total_cost,total_cost_std,ingress_volume,egress_volume df = pd.read_csv("../overall_stats.csv") moirai_data = df[df['tag'] == 'Moirai'] print("** how Moirai improves over other approaches in Sec5 **") # Loop through each tag and calculate the improvement for tag in ['Volley', 'No\nRep', 'Rep\n3Mon.', 'Rep\nTop2.5%', 'Yugong']: # Get the data for the current tag tag_data = df[df['tag'] == tag] # Merge tag data with Moirai data based on index to align rows merged_data = pd.merge(tag_data, moirai_data, on='c', suffixes=('_tag', '_moirai')) # Calculate the percentage improvement for each cost metric merged_data['total'] = (merged_data['total_cost_tag'] - merged_data[ 'total_cost_moirai']) / merged_data['total_cost_tag'] * 100 merged_data['traffic'] = (merged_data['ingress_volume_tag'] + merged_data['egress_volume_tag'] - \ (merged_data['ingress_volume_moirai'] + merged_data['egress_volume_moirai'])) / \ (merged_data['ingress_volume_tag'] + merged_data['egress_volume_tag']) * 100 merged_data['network'] = (merged_data['network_cost_tag'] - merged_data[ 'network_cost_moirai']) / merged_data['network_cost_tag'] * 100 merged_data['egress'] = (merged_data['egress_cost_tag'] - merged_data[ 'egress_cost_moirai']) / merged_data['egress_cost_tag'] * 100 merged_data['rep'] = (merged_data['rep_cost_tag'] - merged_data[ 'rep_cost_moirai']) / merged_data['rep_cost_tag'] * 100 print(tag) print(merged_data[['c', 'total', 'traffic', 'network', 'egress', 'rep']].round(2)) print("Reduction of Volley vs No Rep:") merged_data = 
pd.merge(df[df['tag'] == 'No\nRep'], df[df['tag'] == 'Volley'], on='c', suffixes=('_norep', '_volley')) merged_data['total'] = (merged_data['total_cost_norep'] - merged_data['total_cost_volley']) / \ merged_data['total_cost_volley'] * 100 print(merged_data[['c', 'total']].round(1)) print("** gap between other approaches and Moirai **") # Loop through each tag and calculate the improvement for tag in ['Volley', 'No\nRep', 'Rep\n3Mon.', 'Rep\nTop2.5%', 'Yugong', 'Moi\nJobDist', 'Moi\n1%Job']: # Get the data for the current tag tag_data = df[df['tag'] == tag] # Merge tag data with Moirai data based on index to align rows merged_data = pd.merge(tag_data, moirai_data, on='c', suffixes=('_tag', '_moirai')) # Calculate the percentage improvement for each cost metric merged_data['total'] = (merged_data['total_cost_tag'] - merged_data[ 'total_cost_moirai']) / merged_data['total_cost_moirai'] * 100 merged_data['traffic'] = (merged_data['ingress_volume_tag'] + merged_data['egress_volume_tag'] - \ (merged_data['ingress_volume_moirai'] + merged_data['egress_volume_moirai'])) / \ (merged_data['ingress_volume_moirai'] + merged_data['egress_volume_moirai']) * 100 merged_data['network'] = (merged_data['network_cost_tag'] - merged_data[ 'network_cost_moirai']) / merged_data['network_cost_moirai'] * 100 merged_data['egress'] = (merged_data['egress_cost_tag'] - merged_data[ 'egress_cost_moirai']) / merged_data['egress_cost_moirai'] * 100 merged_data['rep'] = (merged_data['rep_cost_tag'] - merged_data[ 'rep_cost_moirai']) / merged_data['rep_cost_moirai'] * 100 print(tag) print(merged_data[['c', 'total', 'traffic', 'network', 'egress', 'rep']].round(0).astype(int)) print("========= Moirai Data Movement weekly =========") # period,mode,cloud_computation_target,movement_ingress_bytes,movement_egress_bytes df = pd.read_csv("../sample_1.000_rep0.002/log.csv") df = df[df['mode'] == 'size-predict'] df['movement_bytes'] = df['movement_ingress_bytes'] + df['movement_egress_bytes'] 
df['movement_egress_cost'] = df['movement_egress_bytes'] * 0.02 / 1024 ** 3 # egress cost $0.02 per GB # Group and process grouped = df.groupby('cloud_computation_target')['movement_bytes'].apply(list) # Print sorted values per target in human-readable form for target, values in grouped.items(): sorted_human_readable = [human_readable_size(v) for v in sorted(values)] print(f"{target}: {sorted_human_readable}") # Group and process grouped = df.groupby('cloud_computation_target')['movement_egress_cost'].apply(list) # Print sorted values per target in human-readable form for target, values in grouped.items(): sorted_human_readable = [round(v, 0) for v in sorted(values)] print(f"{target}: {sorted_human_readable}") print("========= Moirai Job Routing =========") # header: period,mode,cloud_computation_target, # ingress_byte_Presto,egress_byte_Presto, # ingress_byte_Spark,egress_byte_Spark df = pd.read_csv("../sample_1.000_rep0.002/log.csv") df['egress_byte'] = df['egress_byte_Presto'] + df['egress_byte_Spark'] mean_egress_size_predict = df[df['mode'] == 'size-predict'].groupby('cloud_computation_target')['egress_byte'].mean() mean_egress_independent = df[df['mode'] == 'independent'].groupby('cloud_computation_target')['egress_byte'].mean() mean_egress_size_unaware = df[df['mode'] == 'size-unaware'].groupby('cloud_computation_target')['egress_byte'].mean() mean_egress_size_aware = df[df['mode'] == 'size-aware'].groupby('cloud_computation_target')['egress_byte'].mean() # reduction on independent mode reduction = (mean_egress_independent - mean_egress_size_predict) / mean_egress_independent * 100 print("** size-predict reduction on independent", reduction.round(1)) # reduction on size-unaware mode reduction = (mean_egress_size_unaware - mean_egress_size_predict) / mean_egress_size_unaware * 100 print("** size-predict reduction on size-unaware", reduction.round(1)) gap = (mean_egress_size_predict - mean_egress_size_aware) / mean_egress_size_aware * 100 print("** 
size-predict is within ?% of size-aware", gap.round(1))