extra/dashboard/app.py (175 lines of code) (raw):
from contextlib import ExitStack
from dataclasses import dataclass
from typing import List, Optional
import click
import gradio as gr
import pandas as pd
from parse_results import build_results
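# Plot configuration for one metric: axis titles, plot title and, when applicable, the percentiles available for it.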
@dataclass
class PlotConfig:
x_title: str
y_title: str
title: str
percentiles: Optional[List[float]] = None
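# Build and launch the Gradio dashboard for a benchmark Parquet datasource.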
def run(from_results_dir, datasource, port):
css = '''
.summary span {
font-size: 10px;
padding-top:0;
padding-bottom:0;
}
'''
summary_desc = '''
## Summary
This table shows the average of the metrics for each model and QPS rate.
The metrics are:
* Inter token latency: Time to generate a new output token for each user querying the system.
It translates to the “speed” perceived by the end user. We aim for at least 300 words per minute (average reading speed), so ITL < 150ms.
* Time to First Token: Time the user has to wait before seeing the first token of the answer.
Low waiting times are essential for real-time interactions, less so for offline workloads.
* End-to-end latency: The overall time the system took to generate the full response to the user.
* Throughput: The number of tokens per second the system can generate across all requests.
* Successful requests: The number of requests the system was able to honor within the benchmark timeframe.
* Error rate: The percentage of requests that ended in error because the system could not process them in time or failed to process them.
'''
df_bench = pd.DataFrame()
line_plots_bench = []
column_mappings = {'inter_token_latency_ms_p90': 'ITL P90 (ms)', 'time_to_first_token_ms_p90': 'TTFT P90 (ms)',
'e2e_latency_ms_p90': 'E2E P90 (ms)', 'token_throughput_secs': 'Throughput (tokens/s)',
'successful_requests': 'Successful requests', 'error_rate': 'Error rate (%)', 'model': 'Model',
'rate': 'QPS', 'run_id': 'Run ID'}
default_df = pd.DataFrame.from_dict(
{"rate": [1, 2], "inter_token_latency_ms_p90": [10, 20],
"version": ["default", "default"],
"model": ["default", "default"]})
def load_demo(model_bench, percentiles):
return update_bench(model_bench, percentiles)
def update_bench(model, percentiles):
res = []
for plot in line_plots_bench:
if plot['config'].percentiles:
k = plot['metric'] + '_' + str(percentiles)
df_bench[plot['metric']] = df_bench[k] if k in df_bench.columns else 0
res.append(df_bench[(df_bench['model'] == model)])
return res + [summary_table()]
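# Average the metrics per model, run and QPS rate for the summary table shown at the top of the page.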
def summary_table() -> pd.DataFrame:
data = df_bench.groupby(['model', 'run_id', 'rate']).agg(
{'inter_token_latency_ms_p90': 'mean', 'time_to_first_token_ms_p90': 'mean',
'e2e_latency_ms_p90': 'mean', 'token_throughput_secs': 'mean',
'successful_requests': 'mean', 'error_rate': 'mean'}).reset_index()
data = data[
['run_id', 'model', 'rate', 'inter_token_latency_ms_p90', 'time_to_first_token_ms_p90',
'e2e_latency_ms_p90', 'token_throughput_secs', 'successful_requests', 'error_rate']]
for metric in ['inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 'e2e_latency_ms_p90',
'token_throughput_secs']:
data[metric] = data[metric].apply(lambda x: f"{x:.2f}")
data = data.rename(
columns=column_mappings)
return data
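# Read the Parquet datasource, dropping warmup/throughput passes and keeping only constant-arrival-rate benchmarks.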
def load_bench_results(source) -> pd.DataFrame:
data = pd.read_parquet(source)
# remove warmup and throughput
data = data[(data['id'] != 'warmup') & (data['id'] != 'throughput')]
# only keep constant rate
data = data[data['executor_type'] == 'ConstantArrivalRate']
return data
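# Zoom every plot to the QPS range selected on one of them, fitting the y-axis to the data in that range.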
def select_region(selection: gr.SelectData, model):
min_w, max_w = selection.index
data = df_bench[(df_bench['model'] == model) & (df_bench['rate'] >= min_w) & (
df_bench['rate'] <= max_w)]
res = []
for plot in line_plots_bench:
# find the y values for the selected region
metric = plot["metric"]
y_min = data[metric].min()
y_max = data[metric].max()
res.append(gr.LinePlot(x_lim=[min_w, max_w], y_lim=[y_min, y_max]))
return res
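# Clear any zoom by resetting the axis limits on all plots.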
def reset_region():
res = []
for _ in line_plots_bench:
res.append(gr.LinePlot(x_lim=None, y_lim=None))
return res
def load_datasource(datasource, fn):
# local file:// paths and s3:// URIs are read the same way (pandas resolves the scheme via fsspec)
if datasource.startswith('file://') or datasource.startswith('s3://'):
return fn(datasource)
else:
raise ValueError(f"Unknown datasource: {datasource}")
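# Optionally rebuild the local Parquet file from a raw results directory before loading the datasource.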
if from_results_dir is not None:
build_results(from_results_dir, 'benchmarks.parquet', None)
# Load data
df_bench = load_datasource(datasource, load_bench_results)
# Define metrics
metrics = {
"inter_token_latency_ms": PlotConfig(title="Inter Token Latency (lower is better)", x_title="QPS",
y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]),
"time_to_first_token_ms": PlotConfig(title="TTFT (lower is better)", x_title="QPS",
y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]),
"e2e_latency_ms": PlotConfig(title="End to End Latency (lower is better)", x_title="QPS",
y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]),
"token_throughput_secs": PlotConfig(title="Request Output Throughput (higher is better)", x_title="QPS",
y_title="Tokens/s"),
"successful_requests": PlotConfig(title="Successful requests (higher is better)", x_title="QPS",
y_title="Count"),
"error_rate": PlotConfig(title="Error rate", x_title="QPS", y_title="%"),
"prompt_tokens": PlotConfig(title="Prompt tokens", x_title="QPS", y_title="Count"),
"decoded_tokens": PlotConfig(title="Decoded tokens", x_title="QPS", y_title="Count")
}
models = df_bench["model"].unique()
run_ids = df_bench["run_id"].unique()
# get all available percentiles
percentiles = set()
for k, v in metrics.items():
if v.percentiles:
percentiles.update(v.percentiles)
percentiles = map(lambda p: f'p{int(float(p) * 100)}', percentiles)
percentiles = sorted(list(percentiles))
percentiles.append('avg')
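# Build the page: summary table, model and percentile selectors, then one line plot per metric.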
with gr.Blocks(css=css, title="Inference Benchmarker") as demo:
with gr.Row():
gr.Markdown("# Inference-benchmarker 🤗\n## Benchmarks results")
with gr.Row():
gr.Markdown(summary_desc)
with gr.Row():
table = gr.DataFrame(
pd.DataFrame(),
elem_classes=["summary"],
)
with gr.Row():
details_desc = gr.Markdown("## Details")
with gr.Row():
model = gr.Dropdown(list(models), label="Select model", value=models[0])
with gr.Row():
percentiles_bench = gr.Radio(percentiles, label="", value="avg")
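# Lay the plots out two per row: every other iteration closes the current gr.Row() through the ExitStack and opens a new one.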
i = 0
with ExitStack() as stack:
for k, v in metrics.items():
if i % 2 == 0:
stack.close()
gs = stack.enter_context(gr.Row())
line_plots_bench.append(
{"component": gr.LinePlot(default_df, label=f'{v.title}', x="rate", y=k,
y_title=v.y_title, x_title=v.x_title,
color="run_id"
),
"model": model.value,
"metric": k,
"config": v
},
)
i += 1
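# Wire the interactions: model/percentile changes redraw the plots and table, drag-select zooms all plots to the chosen QPS range, double-click resets them.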
for component in [model, percentiles_bench]:
component.change(update_bench, [model, percentiles_bench],
[item["component"] for item in line_plots_bench] + [table])
gr.on([plot["component"].select for plot in line_plots_bench], select_region, [model],
outputs=[item["component"] for item in line_plots_bench])
gr.on([plot["component"].double_click for plot in line_plots_bench], reset_region, None,
outputs=[item["component"] for item in line_plots_bench])
demo.load(load_demo, [model, percentiles_bench],
[item["component"] for item in line_plots_bench] + [table])
demo.launch(server_port=port, server_name="0.0.0.0")
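# CLI entry point; thanks to auto_envvar_prefix below, each option can also be set through a DASHBOARD_* environment variable.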
@click.command()
@click.option('--from-results-dir', default=None, help='Load inference-benchmarker results from a directory')
@click.option('--datasource', default='file://benchmarks.parquet', help='Parquet datasource to load, as a file:// or s3:// URI')
@click.option('--port', default=7860, help='Port to run the dashboard on')
def main(from_results_dir, datasource, port):
run(from_results_dir, datasource, port)
if __name__ == '__main__':
main(auto_envvar_prefix='DASHBOARD')
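# Example invocations (a sketch; the results directory and S3 bucket below are placeholders,
# and reading s3:// URIs assumes s3fs is installed):
#   python app.py --from-results-dir results/ --port 7860
#   python app.py --datasource s3://my-bucket/benchmarks.parquet
#   DASHBOARD_PORT=8080 python app.py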