# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import logging
import time
from typing import Callable, Tuple

import click
import torch
from torch import Tensor

logging.basicConfig(level=logging.DEBUG)

try:
# pyre-ignore[21]
from fbgemm_gpu import open_source # noqa: F401
except Exception:
torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")


def benchmark_hbc_function(
func: Callable[[Tensor], Tuple[Tensor, Tensor]],
input: Tensor,
) -> Tuple[float, Tensor]:
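    """Run ``func`` once on ``input`` and return (elapsed seconds, output).

    CUDA inputs are timed with CUDA events around the single call; CPU inputs
    fall back to wall-clock timing.
    """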
if input.is_cuda:
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
        # Time a single call to func between the two CUDA events.
        output, _ = func(input)
end_event.record()
torch.cuda.synchronize()
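        # elapsed_time() reports milliseconds; convert to seconds.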
elapsed_time = start_event.elapsed_time(end_event) * 1.0e-3
else:
start_time = time.time()
output, _ = func(input)
elapsed_time = time.time() - start_time
return float(elapsed_time), output


@click.command()
@click.option("--iters", default=100)
@click.option("--warmup-runs", default=2)
def main(
iters: int,
warmup_runs: int,
) -> None:
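    """Benchmark fbgemm's histogram binning calibration operators.

    Runs the plain, by-feature, and generic by-feature variants on CPU (and on
    GPU when CUDA is available) for half, float, and double inputs, e.g.:

        python histogram_binning_calibration_benchmark.py --iters 100 --warmup-runs 2
    """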
data_types = [torch.half, torch.float, torch.double]
total_time = {
"hbc": {
"cpu": {
torch.half: 0.0,
torch.float: 0.0,
torch.double: 0.0,
},
"gpu": {
torch.half: 0.0,
torch.float: 0.0,
torch.double: 0.0,
},
},
"hbc_by_feature": {
"cpu": {
torch.half: 0.0,
torch.float: 0.0,
torch.double: 0.0,
},
"gpu": {
torch.half: 0.0,
torch.float: 0.0,
torch.double: 0.0,
},
},
"generic_hbc_by_feature": {
"cpu": {
torch.half: 0.0,
torch.float: 0.0,
torch.double: 0.0,
},
"gpu": {
torch.half: 0.0,
torch.float: 0.0,
torch.double: 0.0,
},
},
}
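
    # Synthetic input: num_logits random scores in [0, 1), calibrated with
    # num_bins equal-width bins; each logit carries zero or one segment value
    # drawn from num_segments possible segments.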
num_bins: int = 5000
num_segments: int = 42
    num_logits: int = 5000
input_data_cpu = torch.rand(num_logits, dtype=torch.float)
segment_lengths: Tensor = torch.randint(0, 2, (num_logits,))
num_values: int = int(torch.sum(segment_lengths).item())
segment_values: Tensor = torch.randint(
0,
num_segments,
(num_values,),
)
lower_bound: float = 0.0
upper_bound: float = 1.0
w: float = (upper_bound - lower_bound) / num_bins
bin_num_examples: Tensor = torch.empty([num_bins], dtype=torch.float64).fill_(0.0)
bin_num_positives: Tensor = torch.empty([num_bins], dtype=torch.float64).fill_(0.0)
bin_boundaries: Tensor = torch.arange(
lower_bound + w, upper_bound - w / 2, w, dtype=torch.float64
)
by_feature_bin_num_examples: Tensor = torch.empty(
[num_bins * (num_segments + 1)], dtype=torch.float64
).fill_(0.0)
by_feature_bin_num_positives: Tensor = torch.empty(
[num_bins * (num_segments + 1)], dtype=torch.float64
).fill_(0.0)
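
    # Thin wrappers around the three fbgemm calibration ops, closing over the
    # histogram state above so they can be passed to benchmark_hbc_function.
    # The scalar arguments (0.4, 0, 0.9995) are the op's positive-weight and
    # bin-counter settings.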
def fbgemm_hbc_cpu(input: Tensor) -> Tuple[Tensor, Tensor]:
return torch.ops.fbgemm.histogram_binning_calibration(
input,
bin_num_examples,
bin_num_positives,
0.4,
lower_bound,
upper_bound,
0,
0.9995,
)

    def fbgemm_hbc_by_feature_cpu(input: Tensor) -> Tuple[Tensor, Tensor]:
return torch.ops.fbgemm.histogram_binning_calibration_by_feature(
input,
segment_values,
segment_lengths,
num_segments,
by_feature_bin_num_examples,
by_feature_bin_num_positives,
num_bins,
0.4,
lower_bound,
upper_bound,
0,
0.9995,
)

    def fbgemm_generic_hbc_by_feature_cpu(input: Tensor) -> Tuple[Tensor, Tensor]:
return torch.ops.fbgemm.generic_histogram_binning_calibration_by_feature(
input,
segment_values,
segment_lengths,
num_segments,
by_feature_bin_num_examples,
by_feature_bin_num_positives,
bin_boundaries,
0.4,
0,
0.9995,
)
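
    # CPU benchmark loop: the first warmup_runs passes are executed but
    # excluded from the accumulated per-op, per-dtype totals.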
for step in range(iters + warmup_runs):
for data_type in data_types:
curr_input = input_data_cpu.to(data_type)
hbc_time, _ = benchmark_hbc_function(
fbgemm_hbc_cpu,
curr_input,
)
hbc_by_feature_time, _ = benchmark_hbc_function(
fbgemm_hbc_by_feature_cpu, curr_input
)
generic_hbc_by_feature_time, _ = benchmark_hbc_function(
fbgemm_generic_hbc_by_feature_cpu, curr_input
)
if step >= warmup_runs:
total_time["hbc"]["cpu"][data_type] += hbc_time
total_time["hbc_by_feature"]["cpu"][data_type] += hbc_by_feature_time
total_time["generic_hbc_by_feature"]["cpu"][
data_type
] += generic_hbc_by_feature_time

    if torch.cuda.is_available():
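        # Repeat the benchmark on GPU: copy the histogram state to the current
        # CUDA device and wrap the same three calibration ops.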
bin_num_examples_gpu: Tensor = bin_num_examples.cuda()
bin_num_positives_gpu: Tensor = bin_num_positives.cuda()

        def fbgemm_hbc_gpu(input: Tensor) -> Tuple[Tensor, Tensor]:
return torch.ops.fbgemm.histogram_binning_calibration(
input,
bin_num_examples_gpu,
bin_num_positives_gpu,
0.4,
lower_bound,
upper_bound,
0,
0.9995,
)
segment_values_gpu: Tensor = segment_values.cuda()
segment_lengths_gpu: Tensor = segment_lengths.cuda()
by_feature_bin_num_examples_gpu: Tensor = by_feature_bin_num_examples.cuda()
by_feature_bin_num_positives_gpu: Tensor = (
by_feature_bin_num_positives.cuda()
)

        def fbgemm_hbc_by_feature_gpu(input: Tensor) -> Tuple[Tensor, Tensor]:
return torch.ops.fbgemm.histogram_binning_calibration_by_feature(
input,
segment_values_gpu,
segment_lengths_gpu,
num_segments,
by_feature_bin_num_examples_gpu,
by_feature_bin_num_positives_gpu,
num_bins,
0.4,
lower_bound,
upper_bound,
0,
0.9995,
)
bin_boundaries_gpu: Tensor = bin_boundaries.cuda()

        def fbgemm_generic_hbc_by_feature_gpu(
input: Tensor,
) -> Tuple[Tensor, Tensor]:
return (
torch.ops.fbgemm.generic_histogram_binning_calibration_by_feature(
input,
segment_values_gpu,
segment_lengths_gpu,
num_segments,
by_feature_bin_num_examples_gpu,
by_feature_bin_num_positives_gpu,
bin_boundaries_gpu,
0.4,
0,
0.9995,
)
)
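
        # GPU benchmark loop for each dtype, again excluding warm-up runs
        # from the totals.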
for data_type in data_types:
curr_input_gpu = input_data_cpu.cuda().to(data_type)
hbc_time, _ = benchmark_hbc_function(
fbgemm_hbc_gpu,
curr_input_gpu,
)
hbc_by_feature_time, _ = benchmark_hbc_function(
fbgemm_hbc_by_feature_gpu,
curr_input_gpu,
)
generic_hbc_by_feature_time, _ = benchmark_hbc_function(
fbgemm_generic_hbc_by_feature_gpu,
curr_input_gpu,
)
if step >= warmup_runs:
total_time["hbc"]["gpu"][data_type] += hbc_time
total_time["hbc_by_feature"]["gpu"][
data_type
] += hbc_by_feature_time
total_time["generic_hbc_by_feature"]["gpu"][
data_type
] += generic_hbc_by_feature_time
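
    # Report the average time per iteration for every op/platform/dtype combination.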
for op, curr_items in total_time.items():
for platform, data_items in curr_items.items():
for dtype, t_time in data_items.items():
logging.info(
f"{op}_{platform}_{dtype} time per iter: {t_time / iters * 1.0e6:.0f}us"
)
if __name__ == "__main__":
main()