in scripts/figs.py [0:0]
def _load_trace(csv_path):
    """Read a trace CSV and add a ``db_table`` column.

    ``db_table`` is ``db_name`` and ``table_name`` joined with a dot, which
    is the form the replicated-table lists are matched against.
    """
    trace_df = pd.read_csv(csv_path)
    trace_df['db_table'] = trace_df['db_name'] + '.' + trace_df['table_name']
    return trace_df


def _load_replicated_tables(csv_path):
    """Return the ``replicated_tables`` column of ``csv_path`` as a list."""
    return pd.read_csv(csv_path)['replicated_tables'].to_list()


def replication_effects():
    """Print how table replication changes the trace row ("edge") counts.

    Three reports are printed, in this order:

    1. Per replication rate and strategy: the rows, distinct
       ``abstractFingerPrint`` values and distinct ``db_table`` values that
       remain once rows touching a replicated table are dropped. A
       rate/strategy pair whose result file does not exist is skipped.
    2. Per replication rate: how many Presto and Spark rows touch a
       replicated table.
    3. The same "affected" count for the Yugong trace.

    All paths are relative to the current working directory.

    Raises:
        FileNotFoundError: if a trace file, or a result file of report 2
            or 3, is missing (only report 1 skips missing result files).
    """
    # header: abstractFingerPrint,db_name,table_name,inputDataSize,cputime,outputDataSize
    # Earlier runs used report-abFP-volume-table-20241022-20241028-{Presto,Spark}.csv.
    presto_df = _load_trace(
        '../newTraces/report-abFP-volume-table-20250114-20250120-Presto.csv')
    spark_df = _load_trace(
        '../newTraces/report-abFP-volume-table-20250114-20250120-Spark.csv')

    strategies = [
        # Other strategies that can be enabled here:
        # 'read_traffic_volume', 'inverse_dataset_size',
        # 'job_access_frequency',
        # 'read_traffic_density',
        'job_access_density',
    ]
    for rep_rate in [0.02, 0.002]:
        print(f'Rep: {rep_rate}')
        for strategy in strategies:
            path = (
                f"../sample_1.000_rep{rep_rate:.3f}_strategies/"
                f"replicated_tables_{rep_rate}_{strategy}.csv"
            )
            if not os.path.exists(path):
                # No result for this rate/strategy pair; nothing to report.
                continue
            rep_list = _load_replicated_tables(path)
            # "Effective" rows are those NOT touching a replicated table.
            effective_presto_df = presto_df[~presto_df['db_table'].isin(rep_list)]
            effective_spark_df = spark_df[~spark_df['db_table'].isin(rep_list)]
            print(f"Strategy: {strategy}")
            print("# effective edges",
                  len(effective_spark_df) + len(effective_presto_df))
            # NOTE(review): distinct fingerprints are counted per engine and
            # then summed; confirm Presto and Spark never share a fingerprint.
            print("# of effective jobs",
                  effective_spark_df['abstractFingerPrint'].nunique()
                  + effective_presto_df['abstractFingerPrint'].nunique())
            print("# of unique db_tables",
                  pd.concat([effective_spark_df, effective_presto_df])['db_table'].nunique())

    for rep_rate in [0.001, 0.002, 0.004]:
        rep_list = _load_replicated_tables(
            f"../sample_1.000_rep{rep_rate:.3f}/replicated_tables.csv")
        # "Affected" rows are those that DO touch a replicated table.
        reduced_presto_df = presto_df[presto_df['db_table'].isin(rep_list)]
        reduced_spark_df = spark_df[spark_df['db_table'].isin(rep_list)]
        print(f"Replication rate: {rep_rate:.3f}")
        print("Presto # of edges, all:", len(presto_df), "affected:", len(reduced_presto_df))
        print("Spark # of edges, all:", len(spark_df), "affected:", len(reduced_spark_df))

    # header: abstractFingerPrint,db_name,table_name,inputDataSize,outputDataSize,cputime
    yugong_df = _load_trace(
        '../yugongTraces/report-uown-volume-table-20241022-20241028.csv')
    for rep_rate in [0.004]:
        # The file name follows the loop variable (it used to be hard-coded
        # to 0.004), so adding a rate to the list reads the matching file.
        rep_list = _load_replicated_tables(
            f"../yugong_results_rep{rep_rate:.3f}/replicated_tables_{rep_rate}.csv")
        reduced_yugong_df = yugong_df[yugong_df['db_table'].isin(rep_list)]
        print(f"Replication rate: {rep_rate:.3f}")
        print("Yugong # of edges, all:", len(yugong_df), "affected:", len(reduced_yugong_df))