perfkitbenchmarker/data/mlperf_inference_dlrm_server_custom.py (365 lines of code) (raw):
# Copyright 2023 PerfKitBenchmarker Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MLPerf Inference custom system DLRM configuration."""
from . import AccuracyTarget
from . import ConfigRegistry
from . import HarnessType
from . import KnownSystem
from . import PowerSetting
from . import ServerGPUBaseConfig
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudT4x1(ServerGPUBaseConfig):
system = KnownSystem.CloudT4x1
enable_interleaved_top_mlp = True
deque_timeout_usec = 1
embedding_weights_on_gpu_part = 0.5
gpu_batch_size = 65500
gpu_num_bundles = 2
num_staging_batches = 2
num_staging_threads = 4
server_target_qps = 24000
use_jemalloc = True
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudT4x1HighAccuracy(CloudT4x1):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudT4x1Triton(CloudT4x1):
buffer_manager_thread_count = 8
use_triton = True
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudT4x1HighAccuracyTriton(CloudT4x1Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudT4x4(CloudT4x1):
system = KnownSystem.CloudT4x4
gpu_num_bundles = 1
num_staging_batches = 4
num_staging_threads = 4
use_jemalloc = True
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudT4x4HighAccuracy(CloudT4x4):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudT4x4Triton(CloudT4x4):
use_triton = True
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudT4x4HighAccuracyTriton(CloudT4x4Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudL4x1(ServerGPUBaseConfig):
system = KnownSystem.CloudL4x1
embedding_weights_on_gpu_part = 0.8
num_staging_batches = 2
num_staging_threads = 2
gpu_num_bundles = 2
gpu_batch_size = 14000
server_target_qps = 89000
use_jemalloc = False
use_small_tile_gemm_plugin = True
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudL4x1HighAccuracy(CloudL4x1):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudL4x1Triton(CloudL4x1):
buffer_manager_thread_count = 8
use_triton = True
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudL4x1HighAccuracyTriton(CloudL4x1Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudL4x2(CloudL4x1):
system = KnownSystem.CloudL4x2
gpu_batch_size = 14000 * 2
num_staging_batches = 2
num_staging_threads = 2
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudL4x2HighAccuracy(CloudL4x2):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudL4x2Triton(CloudL4x2):
use_triton = True
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudL4x2HighAccuracyTriton(CloudL4x2Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudL4x4(CloudL4x1):
system = KnownSystem.CloudL4x4
gpu_batch_size = 14000 * 4
num_staging_batches = 4
num_staging_threads = 4
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudL4x4HighAccuracy(CloudL4x4):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudL4x4Triton(CloudL4x4):
use_triton = True
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudL4x4HighAccuracyTriton(CloudL4x4Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudL4x8(CloudL4x1):
system = KnownSystem.CloudL4x8
gpu_batch_size = 14000 * 8
num_staging_batches = 8
num_staging_threads = 4
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudL4x8HighAccuracy(CloudL4x8):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class CloudL4x8Triton(CloudL4x8):
use_triton = True
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class CloudL4x8HighAccuracyTriton(CloudL4x8Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10Gx1(ServerGPUBaseConfig):
system = KnownSystem.A10Gx1
deque_timeout_usec = 1
embedding_weights_on_gpu_part = 0.8
gpu_batch_size = 65500
gpu_num_bundles = 2
num_staging_batches = 2
num_staging_threads = 4
server_target_qps = 68000
use_jemalloc = True
use_small_tile_gemm_plugin = True
gemm_plugin_fairshare_cache_size = 18
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10Gx1HighAccuracy(A10Gx1):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10Gx1Triton(A10Gx1):
buffer_manager_thread_count = 0
use_triton = True
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10Gx1HighAccuracyTriton(A10Gx1Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10Gx4(A10Gx1):
system = KnownSystem.A10Gx4
gpu_batch_size = 60000
num_staging_batches = 4
num_staging_threads = 8
use_jemalloc = False
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10Gx4HighAccuracy(A10Gx4):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10Gx4Triton(A10Gx4):
buffer_manager_thread_count = 0
use_triton = True
batch_triton_requests = True
gather_kernel_buffer_threshold = 64
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10Gx4HighAccuracyTriton(A10Gx4Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10Gx8(A10Gx1):
system = KnownSystem.A10Gx8
gpu_batch_size = 60000
num_staging_batches = 8
num_staging_threads = 8
use_jemalloc = False
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10Gx8HighAccuracy(A10Gx8):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10Gx8Triton(A10Gx8):
buffer_manager_thread_count = 0
use_triton = True
batch_triton_requests = True
gather_kernel_buffer_threshold = 64
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10Gx8HighAccuracyTriton(A10Gx8Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10x1(ServerGPUBaseConfig):
system = KnownSystem.A10x1
deque_timeout_usec = 1
embedding_weights_on_gpu_part = 0.8
gpu_batch_size = 65500
gpu_num_bundles = 2
num_staging_batches = 2
num_staging_threads = 4
server_target_qps = 68000
use_jemalloc = True
use_small_tile_gemm_plugin = True
gemm_plugin_fairshare_cache_size = 18
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10x1HighAccuracy(A10x1):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10x1Triton(A10x1):
buffer_manager_thread_count = 0
use_triton = True
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10x1HighAccuracyTriton(A10x1Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10x2(A10x1):
system = KnownSystem.A10x2
gpu_batch_size = 60000
num_staging_batches = 2
num_staging_threads = 8
use_jemalloc = False
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10x2HighAccuracy(A10x2):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class A10x2Triton(A10x2):
buffer_manager_thread_count = 0
use_triton = True
batch_triton_requests = True
gather_kernel_buffer_threshold = 64
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class A10x2HighAccuracyTriton(A10x2Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class H100x1(ServerGPUBaseConfig):
"""Base DLRM H100 config."""
system = KnownSystem.H100x1
use_small_tile_gemm_plugin = False
use_jemalloc = True
gpu_batch_size = 350000
gpu_num_bundles = 4
num_staging_batches = 2
num_staging_threads = 4
gpu_inference_streams = 1
max_pairs_per_staging_thread = 180000
complete_threads = 4
server_target_qps = 735000
compress_categorical_inputs = False
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP
)
class H100x8(H100x1):
system = KnownSystem.H100x8
server_target_qps = 720000 * 8
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class H100x1HighAccuracy(H100x1):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class H100x1Triton(H100x1):
gpu_batch_size = 200000
server_target_qps = 120000
batch_triton_requests = True
gather_kernel_buffer_threshold = 32
max_queue_delay_usec = 1000
use_triton = True
numa_config = None
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class H100x1HighAccuracyTriton(H100x1Triton):
pass
@ConfigRegistry.register(
HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class H100x8HighAccuracy(H100x8):
pass
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP
)
class H100x8Triton(H100x8):
gpu_batch_size = 200000
server_target_qps = 120000 * 8
batch_triton_requests = True
gather_kernel_buffer_threshold = 32
max_queue_delay_usec = 1000
use_triton = True
numa_config = None
@ConfigRegistry.register(
HarnessType.Triton, AccuracyTarget.k_99_9, PowerSetting.MaxP
)
class H100x8HighAccuracyTriton(H100x8Triton):
pass