java/benchmark/analyze.py (216 lines of code) (raw):
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
process fury/kryo/fst/hession performance data
"""
import datetime
import matplotlib.pyplot as plt
import os
import pandas as pd
from pathlib import Path
import re
import sys
dir_path = os.path.dirname(os.path.realpath(__file__))
def to_markdown(df: pd.DataFrame, filepath: str):
columns = df.columns.tolist()
for col in list(columns):
if len(df[col].value_counts()) == 1:
columns.remove(col)
if "Lib" in columns:
columns.remove("Lib")
columns.insert(0, "Lib")
if "Tps" in columns:
columns.remove("Tps")
columns.append("Tps")
df = df[columns]
with open(filepath, "w") as f:
f.write(_to_markdown(df))
def _to_markdown(df: pd.DataFrame):
lines = list(df.values.tolist())
width = len(df.columns)
lines.insert(0, df.columns.values.tolist())
lines.insert(1, ["-------"] * width)
md_table = "\n".join(
["| " + " | ".join([str(item) for item in line]) + " |" for line in lines]
)
return md_table
def process_data(filepath: str):
df = pd.read_csv(filepath)
columns = list(df.columns.values)
for column in columns:
if "Score Error" in column:
df.drop([column], axis=1, inplace=True)
if column == "Score":
df.rename({"Score": "Tps"}, axis=1, inplace=True)
if "Param: " in column:
df.rename({column: column.replace("Param: ", "")}, axis=1, inplace=True)
def process_df(bench_df):
if bench_df.shape[0] > 0:
benchmark_name = bench_df["Benchmark"].str.rsplit(
pat=".", n=1, expand=True
)[1]
bench_df[["Lib", "Benchmark"]] = benchmark_name.str.split(
pat="_", n=1, expand=True
)
bench_df["Lib"] = bench_df["Lib"].str.capitalize()
bench_df.drop(["Threads"], axis=1, inplace=True)
return bench_df
zero_copy_bench = df[df["Benchmark"].str.contains("ZeroCopy")]
zero_copy_bench = process_df(zero_copy_bench)
bench = df[~df["Benchmark"].str.contains("ZeroCopy")]
bench = process_df(bench)
return zero_copy_bench, bench
color_map = {
"Fury": "#7845FD",
"Furymetashared": "#B237ED", # (1, 0.65, 0.55)
# "Kryo": (1, 0.5, 1),
# "Kryo": (1, 0.84, 0.25),
"Kryo": "#55BCC2",
"Kryo_deserialize": "#55BCC2",
"Fst": (0.90, 0.43, 0.5),
"Hession": (0.80, 0.5, 0.6),
"Hession_deserialize": (0.80, 0.5, 0.6),
"Protostuff": (1, 0.84, 0.66),
"Jdk": (0.55, 0.40, 0.45),
"Jsonb": (0.45, 0.40, 0.55),
}
scaler = 10000
def format_scaler(x):
if x > 100:
return round(x)
else:
return round(x, 1)
def plot(df: pd.DataFrame, file_dir, filename, column="Tps"):
df["ns"] = (1 / df["Tps"] * 10**9).astype(int)
data = df.fillna("")
data.to_csv(f"{file_dir}/pd_{filename}")
if "objectType" in data.columns:
group_cols = ["Benchmark", "objectType", "bufferType"]
else:
group_cols = ["Benchmark", "bufferType"]
compatible = data[data["Benchmark"].str.contains("compatible")]
plot_color_map = dict(color_map)
if len(compatible) > 0:
jdk = data[data["Lib"].str.contains("Jdk")].copy()
jdk["Benchmark"] = jdk["Benchmark"] + "_compatible"
data = pd.concat([data, jdk])
fury_metashared_color = plot_color_map["Furymetashared"]
fury_color = plot_color_map["Fury"]
plot_color_map["Fury"] = fury_metashared_color
plot_color_map["Furymetashared"] = fury_color
ylable = column
if column == "Tps":
ylable = f"Tps/{scaler}"
data[column] = (data[column] / scaler).apply(format_scaler)
grouped = data.groupby(group_cols)
files_dict = {}
count = 0
for keys, sub_df in grouped:
count = count + 1
sub_df = sub_df[["Lib", "references", column]]
if keys[0].startswith("serialize"):
title = " ".join(keys[:-1]) + " to " + keys[-1]
else:
title = " ".join(keys[:-1]) + " from " + keys[-1]
kind = "Time" if column == "ns" else "Tps"
save_filename = f"""{filename}_{title.replace(" ", "_")}_{kind.lower()}"""
cnt = files_dict.get(save_filename, 0)
if cnt > 0:
files_dict[save_filename] = cnt = cnt + 1
save_filename += "_" + cnt
title = f"{title} ({kind})"
fig, ax = plt.subplots()
final_df = (
sub_df.reset_index(drop=True)
.set_index(["Lib", "references"])
.unstack("Lib")
)
print(final_df)
libs = final_df.columns.to_frame()["Lib"]
color = [plot_color_map[lib] for lib in libs]
sub_plot = final_df.plot.bar(
title=title, color=color, ax=ax, figsize=(7, 7), width=0.7
)
for container in ax.containers:
ax.bar_label(container)
ax.set_xlabel("enable_references")
ax.set_ylabel(ylable)
libs = libs.str.replace("metashared", "meta\nshared")
ax.legend(libs, loc="upper right", prop={"size": 13})
save_dir = get_plot_dir(file_dir)
sub_plot.get_figure().savefig(save_dir + "/" + save_filename)
def plot_zero_copy(df: pd.DataFrame, file_dir, filename, column="Tps"):
df["ns"] = (1 / df["Tps"] * 10**9).astype(int)
data = df.fillna("")
data.to_csv(f"{file_dir}/pd_{filename}")
if "dataType" in data.columns:
group_cols = ["Benchmark", "dataType", "bufferType"]
else:
group_cols = ["Benchmark", "bufferType"]
ylable = column
if column == "Tps":
ylable = f"Tps/{scaler}"
data[column] = (data[column] / scaler).apply(format_scaler)
grouped = data.groupby(group_cols)
files_dict = {}
count = 0
for keys, sub_df in grouped:
count = count + 1
sub_df = sub_df[["Lib", "array_size", column]]
if keys[0].startswith("serialize"):
title = " ".join(keys[:-1]) + " to " + keys[-1]
else:
title = " ".join(keys[:-1]) + " from " + keys[-1]
kind = "Time" if column == "ns" else "Tps"
save_filename = f"""{filename}_{title.replace(" ", "_")}_{kind.lower()}"""
cnt = files_dict.get(save_filename, 0)
if cnt > 0:
files_dict[save_filename] = cnt = cnt + 1
save_filename += "_" + cnt
title = f"{title} ({kind})"
fig, ax = plt.subplots()
final_df = (
sub_df.reset_index(drop=True)
.set_index(["Lib", "array_size"])
.unstack("Lib")
)
print(final_df)
libs = final_df.columns.to_frame()["Lib"]
color = [color_map[lib] for lib in libs]
sub_plot = final_df.plot.bar(title=title, color=color, ax=ax, figsize=(7, 7))
for container in ax.containers:
ax.bar_label(container)
ax.set_xlabel("array_size")
ax.set_ylabel(ylable)
ax.legend(libs, bbox_to_anchor=(0.23, 0.99), prop={"size": 13})
save_dir = get_plot_dir(file_dir)
sub_plot.get_figure().savefig(save_dir + "/" + save_filename)
time_str = datetime.datetime.now().strftime("%m%d_%H%M_%S")
def get_plot_dir(_file_dir):
plot_dir = _file_dir + "/" + time_str
if not os.path.exists(plot_dir):
os.makedirs(plot_dir)
return plot_dir
def camel_to_snake(name):
name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z\\d])([A-Z])", r"\1_\2", name).lower()
def get_datasize_markdown(size_log):
lines = [line.rsplit("===>", 1)[-1] for line in size_log.split("\n")]
lines = [
[item.strip() for item in line.split("|")][:-1] for line in lines if "|" in line
]
columns = "Lib,objectType,references,bufferType,size".split(",")
df = pd.DataFrame(lines, columns=columns)
df["size"] = df["size"].astype(int)
df = df["objectType,references,bufferType,size".split(",") + ["Lib"]]
grouped_df = df.sort_values("objectType,references,bufferType,size".split(","))
grouped_df = grouped_df[~grouped_df["bufferType"].str.contains("directBuffer")]
grouped_df = grouped_df["objectType,references,Lib,size".split(",")]
return _to_markdown(grouped_df)
if __name__ == "__main__":
# size_markdown = get_datasize_markdown("""
# """)
# print(size_markdown)
args = sys.argv[1:]
if args:
file_name = args[0]
else:
file_name = "jmh-jdk-11-deserialization.csv"
file_dir = f"{dir_path}/../../docs/benchmarks/data"
zero_copy_bench, bench = process_data(os.path.join(file_dir, file_name))
if zero_copy_bench.shape[0] > 0:
to_markdown(zero_copy_bench, str(Path(file_name).with_suffix(".zero_copy.md")))
plot_zero_copy(zero_copy_bench, file_dir, "zero_copy_bench", column="Tps")
if bench.shape[0] > 0:
to_markdown(bench, str(Path(file_name).with_suffix(".bench.md")))
plot(bench, file_dir, "bench", column="Tps")