in easycv/toolkit/blade/cv_blade_utils.py
def blade_optimize(speed_test_model,
                   model,
                   inputs,
                   blade_config=dict(
                       enable_fp16=True, fp16_fallback_op_ratio=0.05),
                   backend='TensorRT',
                   batch=1,
                   warmup_iters=10,
                   compute_cost=True,
                   use_profile=False,
                   check_result=False,
                   static_opt=True,
                   min_num_nodes=None,
                   check_inputs=True,
                   fp16=False):
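    """Optimize a model with PAI-Blade and optionally benchmark, profile and
    verify the optimized result.

    Note: the parameter descriptions below are inferred from how each value is
    used inside this function.

    Args:
        speed_test_model: original EasyCV model, benchmarked as the 'easycv' row.
        model: traceable/scripted model that is optimized by PAI-Blade and
            benchmarked as the 'easycv script' row.
        inputs: tuple of example input tensors used for tracing, benchmarking
            and result checking.
        blade_config: PAI-Blade TensorRT configuration applied via `opt_trt_config`.
        backend: backend name passed to `benchmark` (default 'TensorRT').
        batch: batch size passed to `benchmark`.
        warmup_iters: number of warm-up iterations for benchmarking and profiling.
        compute_cost: if True, benchmark the original, scripted and optimized models.
        use_profile: if True, profile the optimized model with torch.autograd.profiler.
        check_result: if True, compare outputs of `model` and the optimized model.
        static_opt: if True, use torch_blade static-shape optimization; otherwise
            the exported model supports dynamic input shapes.
        min_num_nodes: optional minimum number of nodes per PAI-Blade fusion group.
        check_inputs: if True, permute HWC example inputs to NCHW before the
            'easycv' benchmark run.
        fp16: whether the benchmark runs use fp16.

    Returns:
        The PAI-Blade optimized model.
    """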
    if not static_opt:
        logging.info(
            'PAI-Blade uses dynamic optimization for the input model; the exported model is built for dynamic shape inputs'
        )
        optimize_op = optimize
    else:
        logging.info(
            'PAI-Blade uses static optimization for the input model; the exported model must be used with static shape inputs'
        )
        from torch_blade.optimization import _static_optimize
        optimize_op = _static_optimize
    if min_num_nodes is not None:
        import torch_blade.clustering.support_fusion_group as blade_fusion
        with blade_fusion.min_group_nodes(min_num_nodes=min_num_nodes):
            with opt_trt_config(blade_config):
                opt_model = optimize_op(
                    model,
                    allow_tracing=True,
                    model_inputs=tuple(copy.deepcopy(inputs)),
                )
    else:
        with opt_trt_config(blade_config):
            opt_model = optimize_op(
                model,
                allow_tracing=True,
                model_inputs=tuple(copy.deepcopy(inputs)),
            )
    if compute_cost:
        logging.info('Running benchmark...')
        results = []
        inputs_t = copy.deepcopy(inputs)

        # The end2end model and the scripted model expect different channel
        # orders (HWC vs. NCHW); this only matters for end2end export.
        if check_inputs and (inputs_t[0].shape[-1] == 3):
            shape_length = len(inputs_t[0].shape)
            if shape_length == 4:
                inputs_t = inputs_t[0].permute(0, 3, 1, 2)
                inputs_t = [inputs_t]
            if shape_length == 3:
                inputs_t = inputs_t[0].permute(2, 0, 1)
                inputs_t = (torch.unsqueeze(inputs_t, 0), )
        results.append(
            benchmark(
                speed_test_model,
                inputs_t,
                backend,
                batch,
                'easycv',
                warmup_iters=warmup_iters,
                fp16=fp16))
        results.append(
            benchmark(
                model,
                copy.deepcopy(inputs),
                backend,
                batch,
                'easycv script',
                warmup_iters=warmup_iters,
                fp16=fp16))
        results.append(
            benchmark(
                opt_model,
                copy.deepcopy(inputs),
                backend,
                batch,
                'blade',
                warmup_iters=warmup_iters,
                fp16=fp16))

        logging.info('Model Summary:')
        summary = pd.DataFrame(results)
        print(summary.to_markdown())
    if use_profile:
        torch.cuda.empty_cache()
        # warm-up
        for k in range(warmup_iters):
            test_result = opt_model(*copy.deepcopy(inputs))
            torch.cuda.synchronize()

        torch.cuda.synchronize()
        cu_prof_start()
        for k in range(warmup_iters):
            test_result = opt_model(*copy.deepcopy(inputs))
            torch.cuda.synchronize()
        cu_prof_stop()

        import torch.autograd.profiler as profiler
        # The first profiler pass only serves as a warm-up; `prof` is rebound
        # by the second pass, so only the second pass is reported below.
        with profiler.profile(use_cuda=True) as prof:
            for k in range(warmup_iters):
                test_result = opt_model(*copy.deepcopy(inputs))
                torch.cuda.synchronize()

        with profiler.profile(use_cuda=True) as prof:
            for k in range(warmup_iters):
                test_result = opt_model(*copy.deepcopy(inputs))
                torch.cuda.synchronize()

        prof_str = prof.key_averages().table(sort_by='cuda_time_total')
        print(f'{prof_str}')
    if check_result:
        output = model(*copy.deepcopy(inputs))
        test_result = opt_model(*copy.deepcopy(inputs))
        check_results(output, test_result)

    return opt_model
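
A minimal usage sketch (not part of the original file), assuming a CUDA-capable environment with PAI-Blade and TensorRT installed; `my_model`, `scripted_model` and `example_inputs` are hypothetical placeholders, not names from this module:

import torch

# Hypothetical EasyCV model in eval mode on the GPU; any scriptable module works.
scripted_model = torch.jit.script(my_model)
example_inputs = (torch.randn(1, 3, 224, 224).cuda(), )

opt_model = blade_optimize(
    speed_test_model=my_model,      # benchmarked as the 'easycv' row
    model=scripted_model,           # optimized and benchmarked as 'easycv script'
    inputs=example_inputs,
    blade_config=dict(enable_fp16=True, fp16_fallback_op_ratio=0.05),
    static_opt=True,                # exported model then expects static input shapes
    compute_cost=True,
    check_result=True)

# The optimized model is a TorchScript module, so it can typically be saved
# for deployment with torch.jit.save.
torch.jit.save(opt_model, 'blade_model.pt')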