def draw_growth()

in scripts/figs.py [0:0]
94 lines of code
10 McCabe index (conditional complexity)

def draw_growth(format="png"):
    """Plot daily Presto/Spark workload metrics with linear trend lines.

    Reads ``../metrics_per_day_presto.csv`` and ``../metrics_per_day_spark.csv``
    (relative to the current working directory) and saves three figures into
    the current working directory:

    * ``daily_jobs.<format>``: the ``daily_jobs`` column of each file;
    * ``daily_templates.<format>``: the ``daily_templates`` column;
    * ``daily_traffic_volume.<format>``: ``daily_read_volume`` for Presto and
      ``daily_read_volume + daily_write_volume`` for Spark, both divided by
      ``1024 ** 5`` (the axis is labelled "PB").

    The x axis of every figure is the number of days elapsed since the
    earliest ``date`` found in either file.  Each raw series is drawn dashed
    and overlaid with a solid least-squares line; the slope of every fitted
    line is printed to stdout.

    Uses the module-level names ``np``, ``pd``, ``plt``, ``FuncFormatter`` and
    ``font_size``.

    Args:
        format: Extension used for the three output file names.  The name
            shadows the builtin ``format``; it is kept unchanged so existing
            keyword callers keep working.
    """

    def fit_line(x, y):
        """Return ``(slope, intercept)`` of the least-squares line through (x, y)."""
        x_mean, y_mean = np.mean(x), np.mean(y)
        x_dev = x - x_mean
        spread = np.sum(x_dev ** 2)
        # When x has no spread the slope is undefined; fall back to a flat line.
        slope = np.sum(x_dev * (y - y_mean)) / spread if spread != 0 else 0
        return slope, y_mean - slope * x_mean

    def plot_trend(x, y, label, ax, color):
        """Draw the least-squares line for (x, y) on ``ax`` and print its slope."""
        slope, intercept = fit_line(x, y)
        ax.plot(x, slope * x + intercept, linestyle="-", label=label, color=color)
        print(label, "slope:", slope)

    def format_ticks(value, _):
        """Abbreviate a tick value with a K/M/B/T suffix (FuncFormatter callback)."""
        for threshold, suffix, digits in (
            (1e12, "T", 1),
            (1e9, "B", 1),
            (1e6, "M", 1),
            (1e3, "K", 0),
        ):
            if value >= threshold:
                return f"{value / threshold:.{digits}f}{suffix}"
        return str(int(value))

    # Load both engines' per-day metrics and order them chronologically.
    presto_df = pd.read_csv("../metrics_per_day_presto.csv", parse_dates=['date'])
    spark_df = pd.read_csv("../metrics_per_day_spark.csv", parse_dates=['date'])
    presto_df.sort_values('date', inplace=True)
    spark_df.sort_values('date', inplace=True)

    # Both engines share one time origin: the earliest date seen in either file.
    min_date = min(presto_df['date'].min(), spark_df['date'].min())
    presto_df['days_elapsed'] = (presto_df['date'] - min_date).dt.days
    spark_df['days_elapsed'] = (spark_df['date'] - min_date).dt.days

    def render_figure(stem, presto_y, spark_y, *, ylabel, y_top, note, note_xy,
                      arrows, figsize=(10, 4), trends_last=False, legend=True,
                      legend_headroom=False):
        """Draw one Presto-vs-Spark figure and save it as ``<stem>.<format>``.

        Args:
            stem: Output file name without the extension.
            presto_y, spark_y: Series plotted against each frame's
                ``days_elapsed`` column.
            ylabel: Y-axis label.
            y_top: Upper y-axis limit (the lower limit is always 0).
            note: Red annotation text; a hand-written literal, not derived
                from the fitted slopes.
            note_xy: Data coordinates of the annotation text.
            arrows: Iterable of ``(tip, tail)`` data-coordinate pairs, one red
                arrow each.
            figsize: Figure size passed to ``plt.figure``.
            trends_last: If true, draw both raw series before both trend
                lines; otherwise each raw series is followed by its own trend
                line.  This fixes the artist (and therefore legend) order.
            legend: Whether to draw the legend above the axes.
            legend_headroom: Whether to call ``subplots_adjust(top=0.8)``
                after ``tight_layout``.
        """
        plt.figure(figsize=figsize)
        plt.tick_params(axis='both', labelsize=font_size - 2)
        ax = plt.gca()

        series = (
            ("Presto", presto_df['days_elapsed'], presto_y, 'blue'),
            ("Spark", spark_df['days_elapsed'], spark_y, 'orange'),
        )

        def draw_raw(name, days, values, color):
            plt.plot(days, values, label=name, color=color, linestyle='--')

        def draw_trend(_name, days, values, color):
            # The fit works on plain arrays, as the arithmetic above expects.
            plot_trend(days.values, values.values, "Trend line", ax, color)

        if trends_last:
            passes = [(draw_raw, entry) for entry in series]
            passes += [(draw_trend, entry) for entry in series]
        else:
            passes = [(draw, entry) for entry in series for draw in (draw_raw, draw_trend)]
        for draw, entry in passes:
            draw(*entry)

        plt.text(*note_xy, note, fontsize=font_size - 2, color='red')
        for tip, tail in arrows:
            plt.annotate('', xy=tip, xytext=tail,
                         arrowprops=dict(arrowstyle="->", color='red', lw=2))

        plt.xlabel("Day", fontsize=font_size)
        plt.ylabel(ylabel, fontsize=font_size)
        if legend:
            plt.legend(fontsize=font_size - 3, ncol=4, bbox_to_anchor=(0.5, 1.2), loc='upper center')

        plt.gca().yaxis.set_major_formatter(FuncFormatter(format_ticks))
        plt.ylim(bottom=0, top=y_top)
        plt.xlim(0, 110)
        plt.grid()
        plt.tight_layout()
        if legend_headroom:
            plt.subplots_adjust(top=0.8)  # Adjust to leave space for the legend above
        # NOTE(review): the figure is left open after saving, as before; callers
        # that invoke this repeatedly may want to close figures themselves.
        plt.savefig(f"{stem}.{format}", bbox_inches='tight')

    # Plot 1: Number of daily jobs
    render_figure(
        "daily_jobs",
        presto_df['daily_jobs'],
        spark_df['daily_jobs'],
        ylabel="# of Daily Jobs",
        y_top=600 * 1000,
        note="30% annual increase",
        note_xy=(50, 280 * 1000),
        arrows=[
            ((58, 395 * 1000), (55, 310 * 1000)),
            ((60, 240 * 1000), (55, 275 * 1000)),
        ],
        legend_headroom=True,
    )

    # Plot 2: Number of daily templates
    render_figure(
        "daily_templates",
        presto_df['daily_templates'],
        spark_df['daily_templates'],
        ylabel="# of Daily Templates",
        y_top=150 * 1000,
        note="20% annual increase",
        note_xy=(50, 50 * 1000),
        arrows=[
            ((58, 105 * 1000), (55, 60 * 1000)),
            ((60, 35 * 1000), (55, 46 * 1000)),
        ],
        trends_last=True,
    )

    # Plot 3: Daily traffic volume (Presto: read; Spark: read + write)
    presto_traffic = presto_df['daily_read_volume'] / 1024 ** 5
    spark_traffic = (spark_df['daily_read_volume'] + spark_df['daily_write_volume']) / 1024 ** 5
    render_figure(
        "daily_traffic_volume",
        presto_traffic,
        spark_traffic,
        ylabel="Daily Traffic (PB)",
        y_top=150,
        note="30% annual increase",
        note_xy=(50, 55),
        arrows=[
            ((58, 105), (55, 65)),
            ((60, 25), (55, 50)),
        ],
        figsize=(10, 3.5),
        legend=False,
    )