in scheduler.py [0:0]
def calculate_traffic_percentiles(
    traffic_dir: str,
    start_date: datetime,
    end_date: datetime,
    debug: bool = False,
):
    """Compute the P90/P95/P99 of the combined traffic rate over a date range.

    For every timestamp produced by ``pd.date_range(start_date, end_date)``
    the file ``<traffic_dir>/traffic_YYYYMMDD.csv`` is read. Each row's
    traffic rate is the sum of its ``egress_rate_presto_bps``,
    ``egress_rate_spark_bps``, ``ingress_rate_presto_bps`` and
    ``ingress_rate_spark_bps`` columns. Samples from all files are pooled
    and the percentiles are taken over the pooled set.

    Missing files are reported on stdout and skipped. Rows whose summed rate
    is NaN (e.g. a blank cell in one of the rate columns) are ignored so a
    single bad sample does not abort the whole computation.

    Args:
        traffic_dir: Directory holding the daily ``traffic_YYYYMMDD.csv`` files.
        start_date: Start of the date range.
        end_date: End of the date range.
        debug: When true, print a note for every file whose row count is not
            1440 (presumably one sample per minute of the day -- confirm
            with whatever produces these files).

    Returns:
        A tuple ``(p90, p95, p99)`` with each percentile truncated to ``int``,
        or ``(None, None, None)`` when no usable samples were found.

    Raises:
        KeyError: If a CSV file lacks one of the four rate columns.
    """
    per_file_rates = []
    for single_date in pd.date_range(start_date, end_date):
        traffic_file = os.path.join(
            traffic_dir, f"traffic_{single_date.strftime('%Y%m%d')}.csv"
        )
        # EAFP: attempt the read instead of exists()-then-read, which can race
        # with a file being removed in between.
        try:
            df = pd.read_csv(traffic_file)
        except FileNotFoundError:
            print(f"Traffic file not found: {traffic_file}")
            continue

        # Keep the (egress) + (ingress) grouping so floating-point results
        # match the historical computation exactly.
        egress_rate = df['egress_rate_presto_bps'] + df['egress_rate_spark_bps']
        ingress_rate = df['ingress_rate_presto_bps'] + df['ingress_rate_spark_bps']
        traffic_rate = egress_rate + ingress_rate

        if debug and len(df) != 1440:
            print(f"Check {traffic_file}: {len(df)}")

        # NaN samples would turn every percentile into NaN, and int(nan)
        # raises ValueError; drop them up front.
        samples = traffic_rate.dropna()
        # Skip empty results (header-only files come back as object dtype and
        # would otherwise poison the dtype of the concatenated array).
        if not samples.empty:
            per_file_rates.append(samples.to_numpy())

    if not per_file_rates:
        return None, None, None  # No data found

    # One call computes all three percentiles over a single pooled array.
    p90, p95, p99 = np.percentile(np.concatenate(per_file_rates), [90, 95, 99])
    return int(p90), int(p95), int(p99)