in scripts/figs.py [0:0]
def draw_growth(format="png"):
    """Plot daily Presto/Spark workload metrics with linear trend lines.

    Reads ``../metrics_per_day_presto.csv`` and
    ``../metrics_per_day_spark.csv`` (both relative to the current working
    directory) and saves three figures into the current working directory:

    * ``daily_jobs.<format>`` -- the ``daily_jobs`` column.
    * ``daily_templates.<format>`` -- the ``daily_templates`` column.
    * ``daily_traffic_volume.<format>`` -- ``daily_read_volume`` for Presto
      and ``daily_read_volume + daily_write_volume`` for Spark, each divided
      by ``1024 ** 5`` (the axis is labelled "PB"; presumably the CSV columns
      are in bytes -- confirm against the data source).

    The x axis is the number of days since the earliest ``date`` found in
    either CSV. Each raw series is drawn dashed, followed by a solid
    least-squares trend line in the same colour. The slope of every trend
    line is printed to stdout.

    NOTE: the red "N% annual increase" labels and their arrow coordinates
    are hard-coded; they are not derived from the fitted slopes.

    NOTE(review): the three figures are left open after saving. If no caller
    needs them afterwards (e.g. for ``plt.show()``), consider closing them to
    avoid accumulating open figures.

    Relies on names that are not defined in this function and must be
    available at module level: ``np``, ``pd``, ``plt``, ``FuncFormatter`` and
    ``font_size``.

    Args:
        format: File extension used for the saved figures. The name shadows
            the ``format`` builtin inside this function; it is kept so that
            existing keyword callers continue to work.
    """

    def compute_manual_linear_slope(x, y):
        """Return the least-squares slope of ``y`` against ``x``.

        Returns 0 when the sum of squared deviations of ``x`` is zero
        (for example when all ``x`` values are equal).
        """
        mean_x, mean_y = np.mean(x), np.mean(y)
        numerator = np.sum((x - mean_x) * (y - mean_y))
        denominator = np.sum((x - mean_x) ** 2)
        return numerator / denominator if denominator != 0 else 0

    def plot_manual_linear_fit(x, y, label, ax, color):
        """Draw the least-squares line for ``(x, y)`` on ``ax`` and print its slope."""
        slope = compute_manual_linear_slope(x, y)
        intercept = np.mean(y) - slope * np.mean(x)
        linear_fit = slope * x + intercept
        ax.plot(x, linear_fit, linestyle="-", label=label, color=color)
        print(label, "slope:", slope)

    def format_ticks(value, _):
        """Format a tick value with a T/B/M/K suffix; values below 1e3 are truncated to int."""
        if value >= 1e12:
            return f"{value / 1e12:.1f}T"
        elif value >= 1e9:
            return f"{value / 1e9:.1f}B"
        elif value >= 1e6:
            return f"{value / 1e6:.1f}M"
        elif value >= 1e3:
            return f"{value / 1e3:.0f}K"
        else:
            return str(int(value))

    def start_figure(figsize):
        """Open a new current figure and set the tick label size."""
        plt.figure(figsize=figsize)
        plt.tick_params(axis='both', labelsize=font_size - 2)

    def plot_raw(df, series, label, color):
        """Draw one raw daily series as a dashed line against days elapsed."""
        plt.plot(df['days_elapsed'], series, label=label, color=color, linestyle='--')

    def plot_trend(df, series, color):
        """Draw the trend line for one series on the current axes."""
        plot_manual_linear_fit(df['days_elapsed'].values, series.values, "Trend line", plt.gca(), color)

    def annotate_growth(text, text_xy, arrows):
        """Add the red growth label plus one arrow per ``(xy, xytext)`` pair."""
        plt.text(text_xy[0], text_xy[1], text, fontsize=font_size - 2, color='red')
        for xy, xytext in arrows:
            plt.annotate('', xy=xy, xytext=xytext,
                         arrowprops=dict(arrowstyle="->", color='red', lw=2))

    def finish_figure(ylabel, y_top, stem, legend=True, top_margin=None):
        """Label, scale and save the current figure as ``<stem>.<format>``.

        ``legend`` toggles the legend drawn above the axes; ``top_margin``,
        when given, is passed to ``plt.subplots_adjust(top=...)`` after
        ``tight_layout``.
        """
        plt.xlabel("Day", fontsize=font_size)
        plt.ylabel(ylabel, fontsize=font_size)
        if legend:
            plt.legend(fontsize=font_size - 3, ncol=4, bbox_to_anchor=(0.5, 1.2), loc='upper center')
        plt.gca().yaxis.set_major_formatter(FuncFormatter(format_ticks))
        plt.ylim(bottom=0, top=y_top)
        plt.xlim(0, 110)
        plt.grid()
        plt.tight_layout()
        if top_margin is not None:
            plt.subplots_adjust(top=top_margin)  # Adjust to leave space for the legend above
        plt.savefig(f"{stem}.{format}", bbox_inches='tight')

    # Load Presto and Spark data
    presto_df = pd.read_csv("../metrics_per_day_presto.csv", parse_dates=['date'])
    spark_df = pd.read_csv("../metrics_per_day_spark.csv", parse_dates=['date'])

    # # Compute totals
    # presto_total_jobs = presto_df['daily_jobs'].sum()
    # spark_total_jobs = spark_df['daily_jobs'].sum()
    #
    # presto_total_rw_bytes = presto_df['daily_read_volume'].sum()
    # spark_total_rw_bytes = spark_df['daily_read_volume'].sum() + spark_df['daily_write_volume'].sum()
    #
    # print("=== Totals Over the Period ===")
    # print(f"Presto: {presto_total_jobs:,} jobs, {presto_total_rw_bytes / 1024 ** 5:.2f} PB read")
    # print(f"Spark: {spark_total_jobs:,} jobs, {spark_total_rw_bytes / 1024 ** 5:.2f} PB read+write")

    # Sort by date
    presto_df.sort_values('date', inplace=True)
    spark_df.sort_values('date', inplace=True)

    # Compute days elapsed since the first date seen in either data set, so
    # both engines share the same x axis origin.
    min_date = min(presto_df['date'].min(), spark_df['date'].min())
    presto_df['days_elapsed'] = (presto_df['date'] - min_date).dt.days
    spark_df['days_elapsed'] = (spark_df['date'] - min_date).dt.days

    # Plot 1: Number of daily jobs
    # Each trend line is plotted right after its raw series; this order also
    # determines the legend order.
    start_figure((10, 4))
    plot_raw(presto_df, presto_df['daily_jobs'], "Presto", 'blue')
    plot_trend(presto_df, presto_df['daily_jobs'], 'blue')
    plot_raw(spark_df, spark_df['daily_jobs'], "Spark", 'orange')
    plot_trend(spark_df, spark_df['daily_jobs'], 'orange')
    annotate_growth(
        "30% annual increase",
        (50, 280 * 1000),
        [
            ((58, 395 * 1000), (55, 310 * 1000)),
            ((60, 240 * 1000), (55, 275 * 1000)),
        ],
    )
    # Only this figure applies the extra top margin after tight_layout.
    finish_figure("# of Daily Jobs", 600 * 1000, "daily_jobs", top_margin=0.8)

    # Plot 2: Number of daily templates
    # Here both raw series are plotted before both trend lines, so the legend
    # lists Presto, Spark, then the two trend lines.
    start_figure((10, 4))
    plot_raw(presto_df, presto_df['daily_templates'], "Presto", 'blue')
    plot_raw(spark_df, spark_df['daily_templates'], "Spark", 'orange')
    plot_trend(presto_df, presto_df['daily_templates'], 'blue')
    plot_trend(spark_df, spark_df['daily_templates'], 'orange')
    annotate_growth(
        "20% annual increase",
        (50, 50 * 1000),
        [
            ((58, 105 * 1000), (55, 60 * 1000)),
            ((60, 35 * 1000), (55, 46 * 1000)),
        ],
    )
    finish_figure("# of Daily Templates", 150 * 1000, "daily_templates")

    # Plot 3: Daily traffic volume (read/write)
    # Presto contributes read volume only; Spark contributes read + write.
    presto_traffic = presto_df['daily_read_volume'] / 1024 ** 5
    spark_traffic = (spark_df['daily_read_volume'] + spark_df['daily_write_volume']) / 1024 ** 5
    start_figure((10, 3.5))
    plot_raw(presto_df, presto_traffic, "Presto", 'blue')
    plot_trend(presto_df, presto_traffic, 'blue')
    plot_raw(spark_df, spark_traffic, "Spark", 'orange')
    plot_trend(spark_df, spark_traffic, 'orange')
    # plt.plot(spark_df['days_elapsed'], spark_df['daily_write_volume'] / 1024 ** 5, label="Spark Write", color='purple', linestyle='--')
    annotate_growth(
        "30% annual increase",
        (50, 55),
        [
            ((58, 105), (55, 65)),
            ((60, 25), (55, 50)),
        ],
    )
    # No legend is drawn on this figure.
    finish_figure("Daily Traffic (PB)", 150, "daily_traffic_volume", legend=False)