scripts/generate-comparison.py (108 lines of code) (raw):

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""Generate comparison charts from JSON benchmark result files.

Each result file is loaded with ``json.load`` and indexed by the query number
as a string (``"1"``, ``"2"``, ...); the value stored under each key is passed
to ``np.median``, so it is expected to be a sequence of numeric timings.
Three PNG files are written to the current working directory:

* ``<benchmark>_allqueries.png``      - total time per result file
* ``<benchmark>_queries_compare.png`` - per-query time for every result file
* ``<benchmark>_queries_speedup.png`` - per-query speedup (exactly two files)
"""

import argparse
import json

import matplotlib.pyplot as plt
import numpy as np

# Number of queries in each supported benchmark suite.
_QUERY_COUNTS = {"tpch": 22, "tpcds": 99}


def geomean(data):
    """Return the geometric mean of ``data``.

    The mean is computed in log space so that long or large inputs do not
    overflow the intermediate product.

    Args:
        data: Non-empty sequence of non-negative numbers.  A zero in the input
            yields ``0.0``; negative values yield ``nan``.

    Returns:
        The geometric mean as a NumPy float.

    Raises:
        ValueError: If ``data`` is empty.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        raise ValueError("geomean requires at least one value")
    # log(0) is -inf, which correctly collapses the result to 0.0; silence
    # only the divide warning that NumPy emits for it.
    with np.errstate(divide="ignore"):
        return np.exp(np.mean(np.log(values)))


def _median_time(result, query):
    """Return the median of the timings recorded for ``query`` in ``result``."""
    return np.median(np.array(result[str(query)]))


def _speedup_percent(baseline_time, comparison_time):
    """Return the signed speedup of the comparison run, in whole percent.

    A positive value means the comparison run was faster (100 == 2x faster);
    a negative value means it was slower by the same symmetric measure.
    """
    if baseline_time > comparison_time:
        speedup = baseline_time / comparison_time - 1
    else:
        speedup = -(1 / (baseline_time / comparison_time) - 1)
    return round(speedup * 100, 0)


def _save_and_close(fig, filename):
    """Write ``fig`` to ``filename`` as PNG and release the figure."""
    try:
        fig.savefig(filename, format='png')
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)


def generate_query_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str):
    """Plot the per-query speedup of ``comparison`` over ``baseline``.

    Queries are sorted from largest speedup to largest slowdown and the chart
    is written to ``<benchmark>_queries_speedup.png``.

    Args:
        baseline: Loaded result mapping for the baseline run.
        comparison: Loaded result mapping for the run being compared.
        label1: Display name of the baseline run.
        label2: Display name of the comparison run.
        benchmark: Benchmark name accepted by ``query_count``.
        title: Text appended to the chart title in parentheses; omitted
            when ``None``.

    Raises:
        ValueError: If ``benchmark`` is not a known benchmark name.
        KeyError: If a query number is missing from either result mapping.
    """
    results = []
    for query in range(1, query_count(benchmark) + 1):
        baseline_time = _median_time(baseline, query)
        comparison_time = _median_time(comparison, query)
        results.append((f"q{query}", _speedup_percent(baseline_time, comparison_time)))

    # Largest speedup first.
    results.sort(key=lambda item: -item[1])
    queries, speedups = zip(*results)

    # Create figure and axis
    if benchmark == "tpch":
        fig, ax = plt.subplots(figsize=(10, 6))
    else:
        fig, ax = plt.subplots(figsize=(35, 10))

    # Create bar chart
    bars = ax.bar(queries, speedups, color='skyblue')

    # Add text annotations
    for bar in bars:
        yval = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2.0
        if yval >= 0:
            # The label position is capped at 800 so that very tall bars do
            # not push their annotation arbitrarily high.
            ax.text(x_center, min(800, yval + 5), f'{yval:.0f}%', va='bottom', ha='center',
                    fontsize=8, color='blue', rotation=90)
        else:
            ax.text(x_center, yval, f'{yval:.0f}%', va='top', ha='center',
                    fontsize=8, color='blue', rotation=90)

    # Add title and labels
    title_suffix = "" if title is None else f" ({title})"
    ax.set_title(f"{label2} speedup over {label1}{title_suffix}")
    ax.set_ylabel('Speedup (100% speedup = 2x faster)')
    ax.set_xlabel('Query')

    # Customize the y-axis to handle both positive and negative values better
    ax.axhline(0, color='black', linewidth=0.8)
    if benchmark == "tpch":
        min_value = (min(speedups) // 100) * 100
        max_value = ((max(speedups) // 100) + 1) * 100 + 50
        ax.set_ylim(min_value, max_value)
    else:
        # TODO improve this
        ax.set_ylim(-250, 300)

    # Show grid for better readability
    ax.yaxis.grid(True)

    # Save the plot as an image file
    _save_and_close(fig, f'{benchmark}_queries_speedup.png')


def generate_query_comparison_chart(results, labels, benchmark: str, title: str):
    """Plot per-query median times for every result set as grouped bars.

    The chart is written to ``<benchmark>_queries_compare.png``.

    Args:
        results: List of loaded result mappings, one per run.
        labels: Legend labels, one per entry in ``results``.
        benchmark: Benchmark name accepted by ``query_count``.
        title: Chart title.

    Raises:
        ValueError: If ``benchmark`` is not a known benchmark name.
        KeyError: If a query number is missing from a result mapping.
        IndexError: If ``labels`` is shorter than ``results``.
    """
    num_queries = query_count(benchmark)
    queries = [f"q{query}" for query in range(1, num_queries + 1)]
    benches = [
        [_median_time(result, query) for query in range(1, num_queries + 1)]
        for result in results
    ]

    # Query groups are spaced 1.5 apart.  Bars are 0.3 wide for up to four
    # series and shrink beyond that so a group never runs into the next one.
    num_series = len(benches)
    bar_width = min(0.3, 1.2 / max(num_series, 1))

    # Define the positions of the bars on the x-axis
    index = np.arange(len(queries)) * 1.5

    # Create a bar chart
    if benchmark == "tpch":
        fig, ax = plt.subplots(figsize=(15, 6))
    else:
        fig, ax = plt.subplots(figsize=(35, 6))

    for i, series in enumerate(benches):
        ax.bar(index + i * bar_width, series, bar_width, label=labels[i])

    # Add labels, title, and legend
    ax.set_title(title)
    ax.set_xlabel('Queries')
    ax.set_ylabel('Query Time (seconds)')
    # Centre each tick under its group regardless of how many series there are.
    ax.set_xticks(index + bar_width * max(num_series - 1, 0) / 2)
    ax.set_xticklabels(queries)
    ax.legend()

    # Save the plot as an image file
    _save_and_close(fig, f'{benchmark}_queries_compare.png')


def generate_summary(results, labels, benchmark: str, title: str):
    """Plot the total of per-query median times for every result set.

    The chart is written to ``<benchmark>_allqueries.png``.

    Args:
        results: List of loaded result mappings, one per run.
        labels: Bar labels, one per entry in ``results``.
        benchmark: Benchmark name accepted by ``query_count``.
        title: Chart title.

    Raises:
        ValueError: If ``benchmark`` is not a known benchmark name.
        KeyError: If a query number is missing from a result mapping.
    """
    num_queries = query_count(benchmark)
    timings = [
        sum(_median_time(result, query) for query in range(1, num_queries + 1))
        for result in results
    ]

    # Create figure and axis
    fig, ax = plt.subplots()

    # Add title and labels
    ax.set_title(title)
    ax.set_ylabel(f'Time in seconds to run all {num_queries} {benchmark} queries (lower is better)')

    times = [round(x, 0) for x in timings]

    # Create bar chart
    bars = ax.bar(labels, times, color='skyblue')

    # Add text annotations
    for bar in bars:
        yval = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom')  # va: vertical alignment

    _save_and_close(fig, f'{benchmark}_allqueries.png')


def query_count(benchmark: str):
    """Return the number of queries in ``benchmark``.

    Args:
        benchmark: ``"tpch"`` or ``"tpcds"``.

    Returns:
        22 for ``"tpch"``, 99 for ``"tpcds"``.

    Raises:
        ValueError: If ``benchmark`` is any other value.
    """
    try:
        return _QUERY_COUNTS[benchmark]
    except (KeyError, TypeError):
        # TypeError covers unhashable input; both mean "not a known name".
        raise ValueError(f"invalid benchmark name: {benchmark!r}") from None


def main(files, labels, benchmark: str, title: str):
    """Load the result files and write all applicable charts.

    The speedup chart is only produced when exactly two files are given.

    Args:
        files: Paths of the JSON result files.
        labels: One display label per file, or ``None`` to use the file
            paths themselves as labels.
        benchmark: Benchmark name accepted by ``query_count``.
        title: Chart title.

    Raises:
        ValueError: If ``benchmark`` is unknown or the number of labels does
            not match the number of files.
    """
    # Validate the cheap inputs before touching the filesystem.
    query_count(benchmark)
    if labels is None:
        labels = list(files)
    elif len(labels) != len(files):
        raise ValueError(
            f"expected {len(files)} label(s) to match the result files, got {len(labels)}"
        )

    results = []
    for filename in files:
        with open(filename, encoding="utf-8") as f:
            results.append(json.load(f))

    generate_summary(results, labels, benchmark, title)
    generate_query_comparison_chart(results, labels, benchmark, title)
    if len(files) == 2:
        generate_query_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)


if __name__ == '__main__':
    # Named ``parser`` so the ``argparse`` module itself is not rebound.
    parser = argparse.ArgumentParser(description='Generate comparison')
    parser.add_argument('filenames', nargs='+', type=str, help='JSON result files')
    parser.add_argument('--labels', nargs='+', type=str, help='Labels')
    parser.add_argument('--benchmark', type=str, required=True, choices=sorted(_QUERY_COUNTS),
                        help='Benchmark name (tpch or tpcds)')
    parser.add_argument('--title', type=str, help='Chart title')
    args = parser.parse_args()
    main(args.filenames, args.labels, args.benchmark, args.title)