fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging
import time
from typing import Callable, Tuple

import click
import torch
from torch import Tensor

logging.basicConfig(level=logging.DEBUG)

try:
    # pyre-ignore[21]
    from fbgemm_gpu import open_source  # noqa: F401
except Exception:
    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")


def benchmark_hbc_function(
    func: Callable[[Tensor], Tuple[Tensor, Tensor]],
    input: Tensor,
) -> Tuple[float, Tensor]:
    if input.is_cuda:
        torch.cuda.synchronize()
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        # Time a single call to func; the caller accumulates across iterations.
        output, _ = func(input)
        end_event.record()
        torch.cuda.synchronize()
        elapsed_time = start_event.elapsed_time(end_event) * 1.0e-3
    else:
        start_time = time.time()
        output, _ = func(input)
        elapsed_time = time.time() - start_time
    return float(elapsed_time), output


@click.command()
@click.option("--iters", default=100)
@click.option("--warmup-runs", default=2)
def main(
    iters: int,
    warmup_runs: int,
) -> None:
    data_types = [torch.half, torch.float, torch.double]

    total_time = {
        "hbc": {
            "cpu": {
                torch.half: 0.0,
                torch.float: 0.0,
                torch.double: 0.0,
            },
            "gpu": {
                torch.half: 0.0,
                torch.float: 0.0,
                torch.double: 0.0,
            },
        },
        "hbc_by_feature": {
            "cpu": {
                torch.half: 0.0,
                torch.float: 0.0,
                torch.double: 0.0,
            },
            "gpu": {
                torch.half: 0.0,
                torch.float: 0.0,
                torch.double: 0.0,
            },
        },
        "generic_hbc_by_feature": {
            "cpu": {
                torch.half: 0.0,
                torch.float: 0.0,
                torch.double: 0.0,
            },
            "gpu": {
                torch.half: 0.0,
                torch.float: 0.0,
                torch.double: 0.0,
            },
        },
    }

    num_bins: int = 5000
    num_segments: int = 42

    num_logits = 5000
    input_data_cpu = torch.rand(num_logits, dtype=torch.float)

    segment_lengths: Tensor = torch.randint(0, 2, (num_logits,))
    num_values: int = int(torch.sum(segment_lengths).item())
    segment_values: Tensor = torch.randint(
        0,
        num_segments,
        (num_values,),
    )

    lower_bound: float = 0.0
    upper_bound: float = 1.0
    w: float = (upper_bound - lower_bound) / num_bins

    bin_num_examples: Tensor = torch.empty([num_bins], dtype=torch.float64).fill_(0.0)
    bin_num_positives: Tensor = torch.empty([num_bins], dtype=torch.float64).fill_(0.0)
    bin_boundaries: Tensor = torch.arange(
        lower_bound + w, upper_bound - w / 2, w, dtype=torch.float64
    )

    by_feature_bin_num_examples: Tensor = torch.empty(
        [num_bins * (num_segments + 1)], dtype=torch.float64
    ).fill_(0.0)
    by_feature_bin_num_positives: Tensor = torch.empty(
        [num_bins * (num_segments + 1)], dtype=torch.float64
    ).fill_(0.0)

    def fbgemm_hbc_cpu(input: Tensor) -> Tuple[Tensor, Tensor]:
        return torch.ops.fbgemm.histogram_binning_calibration(
            input,
            bin_num_examples,
            bin_num_positives,
            0.4,
            lower_bound,
            upper_bound,
            0,
            0.9995,
        )

    def fbgemm_hbc_by_feature_cpu(input: Tensor) -> Tuple[Tensor, Tensor]:
        return torch.ops.fbgemm.histogram_binning_calibration_by_feature(
            input,
            segment_values,
            segment_lengths,
            num_segments,
            by_feature_bin_num_examples,
            by_feature_bin_num_positives,
            num_bins,
            0.4,
            lower_bound,
            upper_bound,
            0,
            0.9995,
        )

    def fbgemm_generic_hbc_by_feature_cpu(input: Tensor) -> Tuple[Tensor, Tensor]:
        return torch.ops.fbgemm.generic_histogram_binning_calibration_by_feature(
            input,
            segment_values,
            segment_lengths,
            num_segments,
            by_feature_bin_num_examples,
            by_feature_bin_num_positives,
            bin_boundaries,
            0.4,
            0,
            0.9995,
        )

    for step in range(iters + warmup_runs):
        for data_type in data_types:
            curr_input = input_data_cpu.to(data_type)

            hbc_time, _ = benchmark_hbc_function(
                fbgemm_hbc_cpu,
                curr_input,
            )

            hbc_by_feature_time, _ = benchmark_hbc_function(
                fbgemm_hbc_by_feature_cpu, curr_input
            )

            generic_hbc_by_feature_time, _ = benchmark_hbc_function(
                fbgemm_generic_hbc_by_feature_cpu, curr_input
            )
            if step >= warmup_runs:
                total_time["hbc"]["cpu"][data_type] += hbc_time
                total_time["hbc_by_feature"]["cpu"][data_type] += hbc_by_feature_time
                total_time["generic_hbc_by_feature"]["cpu"][
                    data_type
                ] += generic_hbc_by_feature_time

        if torch.cuda.is_available():
            bin_num_examples_gpu: Tensor = bin_num_examples.cuda()
            bin_num_positives_gpu: Tensor = bin_num_positives.cuda()

            def fbgemm_hbc_gpu(input: Tensor) -> Tuple[Tensor, Tensor]:
                return torch.ops.fbgemm.histogram_binning_calibration(
                    input,
                    bin_num_examples_gpu,
                    bin_num_positives_gpu,
                    0.4,
                    lower_bound,
                    upper_bound,
                    0,
                    0.9995,
                )

            segment_values_gpu: Tensor = segment_values.cuda()
            segment_lengths_gpu: Tensor = segment_lengths.cuda()
            by_feature_bin_num_examples_gpu: Tensor = by_feature_bin_num_examples.cuda()
            by_feature_bin_num_positives_gpu: Tensor = (
                by_feature_bin_num_positives.cuda()
            )

            def fbgemm_hbc_by_feature_gpu(input: Tensor) -> Tuple[Tensor, Tensor]:
                return torch.ops.fbgemm.histogram_binning_calibration_by_feature(
                    input,
                    segment_values_gpu,
                    segment_lengths_gpu,
                    num_segments,
                    by_feature_bin_num_examples_gpu,
                    by_feature_bin_num_positives_gpu,
                    num_bins,
                    0.4,
                    lower_bound,
                    upper_bound,
                    0,
                    0.9995,
                )

            bin_boundaries_gpu: Tensor = bin_boundaries.cuda()

            def fbgemm_generic_hbc_by_feature_gpu(
                input: Tensor,
            ) -> Tuple[Tensor, Tensor]:
                return (
                    torch.ops.fbgemm.generic_histogram_binning_calibration_by_feature(
                        input,
                        segment_values_gpu,
                        segment_lengths_gpu,
                        num_segments,
                        by_feature_bin_num_examples_gpu,
                        by_feature_bin_num_positives_gpu,
                        bin_boundaries_gpu,
                        0.4,
                        0,
                        0.9995,
                    )
                )

            for data_type in data_types:
                curr_input_gpu = input_data_cpu.cuda().to(data_type)

                hbc_time, _ = benchmark_hbc_function(
                    fbgemm_hbc_gpu,
                    curr_input_gpu,
                )

                hbc_by_feature_time, _ = benchmark_hbc_function(
                    fbgemm_hbc_by_feature_gpu,
                    curr_input_gpu,
                )

                generic_hbc_by_feature_time, _ = benchmark_hbc_function(
                    fbgemm_generic_hbc_by_feature_gpu,
                    curr_input_gpu,
                )
                if step >= warmup_runs:
                    total_time["hbc"]["gpu"][data_type] += hbc_time
                    total_time["hbc_by_feature"]["gpu"][
                        data_type
                    ] += hbc_by_feature_time
                    total_time["generic_hbc_by_feature"]["gpu"][
                        data_type
                    ] += generic_hbc_by_feature_time

    for op, curr_items in total_time.items():
        for platform, data_items in curr_items.items():
            for dtype, t_time in data_items.items():
                logging.info(
                    f"{op}_{platform}_{dtype} time per iter: {t_time / iters * 1.0e6:.0f}us"
                )


if __name__ == "__main__":
    main()
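
# Usage sketch (editor's note, not part of the original benchmark): assuming
# fbgemm_gpu and its sparse ops libraries are importable/loadable, the script
# can be run directly with the click options defined above, for example:
#
#   python histogram_binning_calibration_benchmark.py --iters 200 --warmup-runs 5
#
# The flag values here are arbitrary examples. Each logged line reports the
# average per-iteration latency in microseconds for one
# (operator, device, dtype) combination.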