java/benchmark/analyze.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""Process fury/kryo/fst/hession benchmark performance data.

Reads a benchmark result CSV, splits it into zero-copy and regular
benchmarks, writes a markdown table for each and renders one bar chart per
benchmark group.
"""

import datetime
import os
import re
import sys
from pathlib import Path

import pandas as pd

dir_path = os.path.dirname(os.path.realpath(__file__))


def to_markdown(df: pd.DataFrame, filepath: str):
    """Write ``df`` to ``filepath`` as a markdown table.

    Columns holding exactly one distinct (non-NaN) value are left out. When
    present, ``Lib`` is moved to the first column and ``Tps`` to the last.
    """
    columns = [col for col in df.columns.tolist() if df[col].nunique() != 1]
    if "Lib" in columns:
        columns.remove("Lib")
        columns.insert(0, "Lib")
    if "Tps" in columns:
        columns.remove("Tps")
        columns.append("Tps")
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(_to_markdown(df[columns]))


def _to_markdown(df: pd.DataFrame):
    """Return ``df`` rendered as markdown table text (header, rule, rows)."""
    rows = [df.columns.values.tolist(), ["-------"] * len(df.columns)]
    rows.extend(df.values.tolist())
    return "\n".join(
        "| " + " | ".join(str(item) for item in row) + " |" for row in rows
    )


def _split_lib_and_benchmark(bench_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``bench_df`` with ``Lib``/``Benchmark`` derived columns.

    The text after the last ``.`` of ``Benchmark`` is split at its first
    ``_``: the left part, capitalized, becomes ``Lib`` and the right part
    replaces ``Benchmark``. The ``Threads`` column is dropped when present.
    An empty frame is returned unchanged (as a copy).
    """
    # Work on a copy: the caller passes a boolean-mask selection, and
    # assigning into such a selection triggers SettingWithCopyWarning.
    bench_df = bench_df.copy()
    if bench_df.shape[0] == 0:
        return bench_df
    method_name = bench_df["Benchmark"].str.rsplit(pat=".", n=1, expand=True)[1]
    bench_df[["Lib", "Benchmark"]] = method_name.str.split(pat="_", n=1, expand=True)
    bench_df["Lib"] = bench_df["Lib"].str.capitalize()
    return bench_df.drop(columns=["Threads"], errors="ignore")


def process_data(filepath: str):
    """Load a benchmark CSV and split it into zero-copy and regular results.

    Columns containing ``Score Error`` are dropped, ``Score`` is renamed to
    ``Tps`` and the ``Param: `` prefix is stripped from column names.

    Returns:
        A ``(zero_copy_bench, bench)`` tuple of data frames; rows whose
        ``Benchmark`` contains ``ZeroCopy`` go to the first one.
    """
    df = pd.read_csv(filepath)
    df = df.drop(columns=[col for col in df.columns if "Score Error" in col])
    renames = {}
    for col in df.columns:
        if col == "Score":
            renames[col] = "Tps"
        elif "Param: " in col:
            renames[col] = col.replace("Param: ", "")
    df = df.rename(columns=renames)

    is_zero_copy = df["Benchmark"].str.contains("ZeroCopy")
    zero_copy_bench = _split_lib_and_benchmark(df[is_zero_copy])
    bench = _split_lib_and_benchmark(df[~is_zero_copy])
    return zero_copy_bench, bench


# Bar color per library label (the capitalized ``Lib`` column value).
color_map = {
    "Fury": "#7845FD",
    "Furymetashared": "#B237ED",
    "Kryo": "#55BCC2",
    "Kryo_deserialize": "#55BCC2",
    "Fst": (0.90, 0.43, 0.5),
    "Hession": (0.80, 0.5, 0.6),
    "Hession_deserialize": (0.80, 0.5, 0.6),
    "Protostuff": (1, 0.84, 0.66),
    "Jdk": (0.55, 0.40, 0.45),
    "Jsonb": (0.45, 0.40, 0.55),
}

# Tps values are divided by this factor before plotting.
scaler = 10000


def format_scaler(x):
    """Round ``x`` to an integer above 100, otherwise to one decimal place."""
    if x > 100:
        return round(x)
    return round(x, 1)


def _prepare_plot_data(df: pd.DataFrame, file_dir, filename) -> pd.DataFrame:
    """Return a plot-ready copy of ``df`` and dump it to ``pd_<filename>``.

    Adds an integer ``ns`` column computed as ``1 / Tps * 10**9`` and
    replaces NaN with empty strings. The caller's frame is not modified.
    """
    data = df.copy()
    data["ns"] = (1 / data["Tps"] * 10**9).astype(int)
    data = data.fillna("")
    data.to_csv(f"{file_dir}/pd_{filename}")
    return data


def _scale_column(data: pd.DataFrame, column) -> str:
    """Scale ``Tps`` in place by ``scaler`` and return the y-axis label."""
    if column != "Tps":
        return column
    data[column] = (data[column] / scaler).apply(format_scaler)
    return f"Tps/{scaler}"


def _unique_filename(base: str, used: dict) -> str:
    """Return ``base``, suffixed with ``_<n>`` when it was handed out before.

    ``used`` maps a base name to the number of times it was requested and is
    updated on every call.
    """
    seen = used.get(base, 0)
    used[base] = seen + 1
    return f"{base}_{seen}" if seen > 0 else base


def _render_bar_charts(
    data: pd.DataFrame,
    group_cols,
    *,
    x_col,
    column,
    ylabel,
    colors,
    file_dir,
    filename,
    xlabel,
    bar_kwargs,
    legend_kwargs,
    legend_labels=None,
):
    """Save one bar chart per ``group_cols`` group of ``data``.

    Each chart has ``x_col`` on the x axis and one bar series per ``Lib``
    showing ``column``. Charts are written into the directory returned by
    ``get_plot_dir(file_dir)``; the pivoted frame of every group is printed.

    Args:
        colors: Mapping from ``Lib`` label to bar color.
        bar_kwargs: Extra keyword arguments for ``DataFrame.plot.bar``.
        legend_kwargs: Keyword arguments for ``Axes.legend``.
        legend_labels: Optional callable turning the ``Lib`` series into the
            legend labels; the raw labels are used when omitted.
    """
    # Imported here so the table/data helpers of this module stay usable
    # where matplotlib is not installed.
    import matplotlib.pyplot as plt

    kind = "Time" if column == "ns" else "Tps"
    used_names = {}
    for keys, sub_df in data.groupby(group_cols):
        direction = " to " if keys[0].startswith("serialize") else " from "
        title = " ".join(keys[:-1]) + direction + keys[-1]
        base_name = "_".join([filename, title.replace(" ", "_"), kind.lower()])
        save_filename = _unique_filename(base_name, used_names)

        fig, ax = plt.subplots()
        final_df = (
            sub_df[["Lib", x_col, column]]
            .reset_index(drop=True)
            .set_index(["Lib", x_col])
            .unstack("Lib")
        )
        print(final_df)
        libs = final_df.columns.to_frame()["Lib"]
        final_df.plot.bar(
            title=f"{title} ({kind})",
            color=[colors[lib] for lib in libs],
            ax=ax,
            figsize=(7, 7),
            **bar_kwargs,
        )
        for container in ax.containers:
            ax.bar_label(container)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        labels = libs if legend_labels is None else legend_labels(libs)
        ax.legend(labels, **legend_kwargs)
        fig.savefig(get_plot_dir(file_dir) + "/" + save_filename)
        # Release the figure; otherwise every group keeps one open.
        plt.close(fig)


def plot(df: pd.DataFrame, file_dir, filename, column="Tps"):
    """Render bar charts of ``column`` against ``references`` per benchmark.

    Groups by ``Benchmark``, ``objectType`` (when that column exists) and
    ``bufferType``. ``df`` itself is left unmodified.
    """
    data = _prepare_plot_data(df, file_dir, filename)
    if "objectType" in data.columns:
        group_cols = ["Benchmark", "objectType", "bufferType"]
    else:
        group_cols = ["Benchmark", "bufferType"]

    plot_color_map = dict(color_map)
    if data["Benchmark"].str.contains("compatible").any():
        # Duplicate the Jdk rows under "<benchmark>_compatible" names
        # (presumably so Jdk also shows up in the compatible-mode charts).
        jdk = data[data["Lib"].str.contains("Jdk")].copy()
        jdk["Benchmark"] = jdk["Benchmark"] + "_compatible"
        data = pd.concat([data, jdk])
        # Fury and Furymetashared trade colors when compatible runs exist.
        plot_color_map["Fury"], plot_color_map["Furymetashared"] = (
            plot_color_map["Furymetashared"],
            plot_color_map["Fury"],
        )

    ylabel = _scale_column(data, column)
    _render_bar_charts(
        data,
        group_cols,
        x_col="references",
        column=column,
        ylabel=ylabel,
        colors=plot_color_map,
        file_dir=file_dir,
        filename=filename,
        xlabel="enable_references",
        bar_kwargs={"width": 0.7},
        legend_kwargs={"loc": "upper right", "prop": {"size": 13}},
        # Break the long label over two lines in the legend.
        legend_labels=lambda libs: libs.str.replace("metashared", "meta\nshared"),
    )


def plot_zero_copy(df: pd.DataFrame, file_dir, filename, column="Tps"):
    """Render bar charts of ``column`` against ``array_size`` per benchmark.

    Groups by ``Benchmark``, ``dataType`` (when that column exists) and
    ``bufferType``. ``df`` itself is left unmodified.
    """
    data = _prepare_plot_data(df, file_dir, filename)
    if "dataType" in data.columns:
        group_cols = ["Benchmark", "dataType", "bufferType"]
    else:
        group_cols = ["Benchmark", "bufferType"]

    ylabel = _scale_column(data, column)
    _render_bar_charts(
        data,
        group_cols,
        x_col="array_size",
        column=column,
        ylabel=ylabel,
        colors=color_map,
        file_dir=file_dir,
        filename=filename,
        xlabel="array_size",
        bar_kwargs={},
        legend_kwargs={"bbox_to_anchor": (0.23, 0.99), "prop": {"size": 13}},
    )


# Computed once at import time so all charts of a run share one directory.
time_str = datetime.datetime.now().strftime("%m%d_%H%M_%S")


def get_plot_dir(_file_dir):
    """Return ``<_file_dir>/<time_str>``, creating the directory if needed."""
    plot_dir = _file_dir + "/" + time_str
    os.makedirs(plot_dir, exist_ok=True)
    return plot_dir


def camel_to_snake(name):
    """Convert a CamelCase identifier to snake_case."""
    name = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub(r"([a-z\d])([A-Z])", r"\1_\2", name).lower()


def get_datasize_markdown(size_log):
    """Turn a serialized-size log into a markdown table.

    Only lines containing ``|`` are used. For each, the text after the last
    ``===>`` is split on ``|`` into Lib, objectType, references, bufferType
    and size (the piece after the final ``|`` is discarded). Rows whose
    bufferType contains ``directBuffer`` are excluded; the remaining rows are
    sorted by objectType, references, bufferType and size.
    """
    records = [
        [item.strip() for item in payload.split("|")][:-1]
        for payload in (line.rsplit("===>", 1)[-1] for line in size_log.split("\n"))
        if "|" in payload
    ]
    df = pd.DataFrame(
        records, columns=["Lib", "objectType", "references", "bufferType", "size"]
    )
    df["size"] = df["size"].astype(int)
    df = df.sort_values(["objectType", "references", "bufferType", "size"])
    df = df[~df["bufferType"].str.contains("directBuffer")]
    return _to_markdown(df[["objectType", "references", "Lib", "size"]])


def main(argv=None):
    """Generate markdown tables and charts for one benchmark CSV.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument is the CSV file name inside
            ``docs/benchmarks/data``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    file_name = args[0] if args else "jmh-jdk-11-deserialization.csv"
    file_dir = f"{dir_path}/../../docs/benchmarks/data"
    zero_copy_bench, bench = process_data(os.path.join(file_dir, file_name))
    # The markdown paths are derived from file_name only, so a relative name
    # resolves against the current working directory.
    if zero_copy_bench.shape[0] > 0:
        to_markdown(zero_copy_bench, str(Path(file_name).with_suffix(".zero_copy.md")))
        plot_zero_copy(zero_copy_bench, file_dir, "zero_copy_bench", column="Tps")
    if bench.shape[0] > 0:
        to_markdown(bench, str(Path(file_name).with_suffix(".bench.md")))
        plot(bench, file_dir, "bench", column="Tps")


if __name__ == "__main__":
    # To print a serialized-size table instead, pass the size log text to
    # get_datasize_markdown() and print the result.
    main()

java/benchmark/analyze.py (216 lines of code) (raw):