scripts/generate-comparison.py (108 lines of code) (raw):
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import argparse
import json
import matplotlib.pyplot as plt
import numpy as np
def geomean(data):
    """Return the geometric mean of ``data``.

    The mean is evaluated in log space, ``exp(mean(log(x)))``, rather than
    by multiplying every value together first, so long inputs or values of
    extreme magnitude do not overflow or underflow the intermediate product.

    Args:
        data: Non-empty sequence (or array) of non-negative numbers.

    Returns:
        The geometric mean as a NumPy float. A zero anywhere in ``data``
        yields ``0.0``; a negative value yields ``nan``.

    Raises:
        ValueError: If ``data`` is empty.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        raise ValueError("geomean requires at least one value")
    # log(0) is -inf, which correctly collapses the result to 0.0; silence
    # the divide-by-zero warning NumPy would otherwise emit for that case.
    with np.errstate(divide="ignore"):
        return np.exp(np.mean(np.log(values)))
def generate_query_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str):
    """Plot the per-query speedup of ``comparison`` over ``baseline`` as a PNG.

    For each query the medians of the two runs are compared. When the
    baseline median ``a`` is larger than the comparison median ``b`` the
    speedup is ``a / b - 1``; otherwise it is the negated slowdown
    ``-(b / a - 1)``. Values are expressed as whole percentages (100% means
    twice as fast) and the bars are ordered from largest to smallest.

    The chart is written to ``{benchmark}_queries_speedup.png`` in the
    current working directory.

    Args:
        baseline: Mapping from query number (as a string, ``"1"``..``"N"``)
            to a sequence of measurements, presumably run times.
        comparison: Same structure as ``baseline`` for the run being compared.
        label1: Display name of the baseline, used in the chart title.
        label2: Display name of the comparison, used in the chart title.
        benchmark: Benchmark name understood by ``query_count``; also selects
            the figure size and y-axis limits.
        title: Text appended in parentheses to the chart title.
    """
    results = []
    for query in range(1, query_count(benchmark) + 1):
        a = np.median(np.array(baseline[str(query)]))
        b = np.median(np.array(comparison[str(query)]))
        if a > b:
            speedup = a / b - 1
        else:
            speedup = -(1 / (a / b) - 1)
        # Adding 0.0 turns a rounded negative zero into positive zero, so a
        # tie (or a negligible slowdown) is labelled "0%" rather than "-0%".
        results.append(("q" + str(query), round(speedup * 100, 0) + 0.0))

    results = sorted(results, key=lambda x: -x[1])
    queries, speedups = zip(*results)

    # TPC-DS has far more queries than TPC-H, so it gets a much wider canvas.
    figsize = (10, 6) if benchmark == "tpch" else (35, 10)
    fig, ax = plt.subplots(figsize=figsize)
    try:
        bars = ax.bar(queries, speedups, color='skyblue')

        # Label each bar with its value: above positive bars, below negative ones.
        for bar in bars:
            yval = bar.get_height()
            x_center = bar.get_x() + bar.get_width() / 2.0
            if yval >= 0:
                # Cap the label position so extreme speedups do not push the
                # text far away from the rest of the chart.
                ax.text(x_center, min(800, yval + 5), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8,
                        color='blue', rotation=90)
            else:
                ax.text(x_center, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8,
                        color='blue', rotation=90)

        ax.set_title(label2 + " speedup over " + label1 + " (" + title + ")")
        ax.set_ylabel('Speedup (100% speedup = 2x faster)')
        ax.set_xlabel('Query')

        # Draw the zero line so positive and negative bars are easy to tell apart.
        ax.axhline(0, color='black', linewidth=0.8)
        if benchmark == "tpch":
            # Round the limits outwards to multiples of 100, with extra
            # headroom at the top for the rotated value labels.
            min_value = (min(speedups) // 100) * 100
            max_value = ((max(speedups) // 100) + 1) * 100 + 50
            ax.set_ylim(min_value, max_value)
        else:
            # TODO improve this
            ax.set_ylim(-250, 300)

        ax.yaxis.grid(True)

        fig.savefig(f'{benchmark}_queries_speedup.png', format='png')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def generate_query_comparison_chart(results, labels, benchmark: str, title: str):
    """Plot a grouped bar chart of the median value per query for every run.

    Each query gets one group of bars, one bar per entry in ``results``.
    The chart is written to ``{benchmark}_queries_compare.png`` in the
    current working directory.

    Args:
        results: Sequence of mappings from query number (as a string,
            ``"1"``..``"N"``) to a sequence of measurements; the y-axis
            labels them as query times in seconds.
        labels: Legend labels, one per entry in ``results``.
        benchmark: Benchmark name understood by ``query_count``; also selects
            the figure size.
        title: Chart title.
    """
    num_series = len(results)
    queries = []
    benches = [[] for _ in results]
    for query in range(1, query_count(benchmark) + 1):
        queries.append("q" + str(query))
        for series, result in zip(benches, results):
            series.append(np.median(np.array(result[str(query)])))

    bar_width = 0.3
    # Groups are spaced 1.5 units apart on the x-axis.
    index = np.arange(len(queries)) * 1.5

    figsize = (15, 6) if benchmark == "tpch" else (35, 6)
    fig, ax = plt.subplots(figsize=figsize)
    try:
        for i, series in enumerate(benches):
            ax.bar(index + i * bar_width, series, bar_width, label=labels[i])

        ax.set_title(title)
        ax.set_xlabel('Queries')
        ax.set_ylabel('Query Time (seconds)')
        # Bars are centred on index + i * bar_width, so the middle of a group
        # of n bars sits (n - 1) / 2 bar widths to the right of the first bar.
        ax.set_xticks(index + bar_width * (num_series - 1) / 2)
        ax.set_xticklabels(queries)
        ax.legend()

        fig.savefig(f'{benchmark}_queries_compare.png', format='png')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def generate_summary(results, labels, benchmark: str, title: str):
    """Plot the total of the per-query medians for every run as a bar chart.

    For each run the median of every query is summed, rounded to a whole
    number and drawn as one bar annotated with its value. The chart is
    written to ``{benchmark}_allqueries.png`` in the current working directory.

    Args:
        results: Sequence of mappings from query number (as a string,
            ``"1"``..``"N"``) to a sequence of measurements; the y-axis
            labels the totals as seconds.
        labels: Bar labels, one per entry in ``results``.
        benchmark: Benchmark name understood by ``query_count``.
        title: Chart title.
    """
    num_queries = query_count(benchmark)
    timings = [0] * len(results)
    for query in range(1, num_queries + 1):
        for i, result in enumerate(results):
            timings[i] += np.median(np.array(result[str(query)]))
    times = [round(x, 0) for x in timings]

    fig, ax = plt.subplots()
    try:
        ax.set_title(title)
        ax.set_ylabel(f'Time in seconds to run all {num_queries} {benchmark} queries (lower is better)')

        bars = ax.bar(labels, times, color='skyblue')

        # Write each total just above its bar, centred horizontally on the bar.
        for bar in bars:
            yval = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom', ha='center')

        fig.savefig(f'{benchmark}_allqueries.png', format='png')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def query_count(benchmark: str):
    """Return the number of queries in the named benchmark.

    Args:
        benchmark: Either ``"tpch"`` or ``"tpcds"``.

    Returns:
        ``22`` for ``"tpch"`` and ``99`` for ``"tpcds"``.

    Raises:
        ValueError: If ``benchmark`` is any other value.
    """
    if benchmark == "tpch":
        return 22
    if benchmark == "tpcds":
        return 99
    # Raising a bare string is itself a TypeError in Python 3, which hides
    # the real problem; raise a proper exception that names the bad value.
    raise ValueError(f"invalid benchmark name: {benchmark!r}")
def main(files, labels, benchmark: str, title: str):
    """Load the JSON result files and generate every comparison chart.

    A summary chart and a per-query comparison chart are always produced.
    The per-query speedup chart is produced only when exactly two result
    files are given, with the first file treated as the baseline.

    Args:
        files: Paths of the JSON result files to load.
        labels: Display labels, one per file, or ``None`` to use the file
            names themselves as labels.
        benchmark: Benchmark name passed through to the chart generators.
        title: Chart title passed through to the chart generators.

    Raises:
        ValueError: If the number of labels differs from the number of files.
    """
    # Validate before doing any I/O so a bad invocation fails immediately
    # with a clear message instead of deep inside the plotting code.
    if labels is None:
        labels = list(files)
    elif len(labels) != len(files):
        raise ValueError(
            f"expected {len(files)} labels (one per result file), got {len(labels)}"
        )

    results = []
    for filename in files:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(filename, encoding="utf-8") as f:
            results.append(json.load(f))

    generate_summary(results, labels, benchmark, title)
    generate_query_comparison_chart(results, labels, benchmark, title)
    if len(files) == 2:
        generate_query_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to main().
    # The parser is named `parser` so the imported `argparse` module is not
    # shadowed by the parser instance.
    parser = argparse.ArgumentParser(description='Generate comparison')
    parser.add_argument('filenames', nargs='+', type=str, help='JSON result files')
    parser.add_argument('--labels', nargs='+', type=str, help='Labels')
    # query_count() only knows these two benchmarks, so reject anything else
    # (including a missing value) here with a normal usage error.
    parser.add_argument('--benchmark', type=str, required=True, choices=['tpch', 'tpcds'],
                        help='Benchmark name (tpch or tpcds)')
    parser.add_argument('--title', type=str, help='Chart title')
    args = parser.parse_args()
    main(args.filenames, args.labels, args.benchmark, args.title)