def blade_optimize()

in easycv/toolkit/blade/cv_blade_utils.py [0:0]
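Optimizes a (scripted) PyTorch model with PAI-Blade, then optionally benchmarks the baseline, scripted and optimized models, profiles the optimized model on CUDA, and checks that the optimized outputs match the originals.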


# Module-level imports used below (from the top of cv_blade_utils.py);
# opt_trt_config, benchmark, check_results, cu_prof_start and cu_prof_stop
# are helpers defined elsewhere in this same module.
import copy
import logging

import pandas as pd
import torch
from torch_blade import optimize


def blade_optimize(speed_test_model,
                   model,
                   inputs,
                   blade_config=dict(
                       enable_fp16=True, fp16_fallback_op_ratio=0.05),
                   backend='TensorRT',
                   batch=1,
                   warmup_iters=10,
                   compute_cost=True,
                   use_profile=False,
                   check_result=False,
                   static_opt=True,
                   min_num_nodes=None,
                   check_inputs=True,
                   fp16=False):
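    """Optimize `model` with PAI-Blade and return the optimized model.

    Parameter roles below are inferred from how they are used in the body:

    Args:
        speed_test_model: baseline (e.g. end2end) model, benchmarked as 'easycv'.
        model: scripted/traceable model handed to PAI-Blade for optimization.
        inputs: example input tensors used for tracing and benchmarking.
        blade_config (dict): options forwarded to the opt_trt_config context.
        backend (str): inference backend label, default 'TensorRT'.
        batch (int): batch size reported by the benchmark.
        warmup_iters (int): warm-up iterations for benchmark/profiling loops.
        compute_cost (bool): if True, benchmark baseline/scripted/blade models.
        use_profile (bool): if True, profile the optimized model on CUDA.
        check_result (bool): if True, compare optimized outputs with originals.
        static_opt (bool): use static-shape instead of dynamic optimization.
        min_num_nodes (int, optional): minimum fusion group size for clustering.
        check_inputs (bool): permute NHWC inputs to NCHW before benchmarking.
        fp16 (bool): benchmark in fp16.
    """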

    if not static_opt:
        logging.info(
            'PAI-Blade uses dynamic optimization for the input model; the '
            'exported model is built for dynamic shape inputs')
        optimize_op = optimize
    else:
        logging.info(
            'PAI-Blade uses static optimization for the input model; the '
            'exported model must be used with static shape inputs')
        from torch_blade.optimization import _static_optimize
        optimize_op = _static_optimize
    if min_num_nodes is not None:
        # constrain the minimum number of nodes per blade fusion group
        import torch_blade.clustering.support_fusion_group as blade_fusion
        with blade_fusion.min_group_nodes(min_num_nodes=min_num_nodes):
            with opt_trt_config(blade_config):
                opt_model = optimize_op(
                    model,
                    allow_tracing=True,
                    model_inputs=tuple(copy.deepcopy(inputs)),
                )
    else:
        with opt_trt_config(blade_config):
            opt_model = optimize_op(
                model,
                allow_tracing=True,
                model_inputs=tuple(copy.deepcopy(inputs)),
            )
    if compute_cost:
        logging.info('Running benchmark...')
        results = []
        inputs_t = copy.deepcopy(inputs)

        # The end2end model and the scripted model need different channel
        # layouts, so permute NHWC inputs (e.g. (1, H, W, 3)) to NCHW here;
        # this situation only arises with end2end export.
        if check_inputs and (inputs_t[0].shape[-1] == 3):
            shape_length = len(inputs_t[0].shape)
            if shape_length == 4:
                inputs_t = inputs_t[0].permute(0, 3, 1, 2)
                inputs_t = [inputs_t]

            if shape_length == 3:
                inputs_t = inputs_t[0].permute(2, 0, 1)
                inputs_t = (torch.unsqueeze(inputs_t, 0), )

        results.append(
            benchmark(
                speed_test_model,
                inputs_t,
                backend,
                batch,
                'easycv',
                warmup_iters=warmup_iters,
                fp16=fp16))
        results.append(
            benchmark(
                model,
                copy.deepcopy(inputs),
                backend,
                batch,
                'easycv script',
                warmup_iters=warmup_iters,
                fp16=fp16))
        results.append(
            benchmark(
                opt_model,
                copy.deepcopy(inputs),
                backend,
                batch,
                'blade',
                warmup_iters=warmup_iters,
                fp16=fp16))

        logging.info('Model Summary:')
        summary = pd.DataFrame(results)
        print(summary.to_markdown())

    if use_profile:
        torch.cuda.empty_cache()
        # warm-up
        for _ in range(warmup_iters):
            test_result = opt_model(*copy.deepcopy(inputs))
            torch.cuda.synchronize()

        # region captured by an external CUDA profiler (cu_prof_start/stop)
        torch.cuda.synchronize()
        cu_prof_start()
        for _ in range(warmup_iters):
            test_result = opt_model(*copy.deepcopy(inputs))
            torch.cuda.synchronize()
        cu_prof_stop()

        import torch.autograd.profiler as profiler

        # the first profiler pass warms up the profiler itself; its results
        # are discarded when `prof` is rebound by the second pass below
        with profiler.profile(use_cuda=True) as prof:
            for _ in range(warmup_iters):
                test_result = opt_model(*copy.deepcopy(inputs))
                torch.cuda.synchronize()

        # the second pass is the one whose averages are reported
        with profiler.profile(use_cuda=True) as prof:
            for _ in range(warmup_iters):
                test_result = opt_model(*copy.deepcopy(inputs))
                torch.cuda.synchronize()

        prof_str = prof.key_averages().table(sort_by='cuda_time_total')
        print(f'{prof_str}')

    if check_result:
        # verify that the optimized model reproduces the original outputs
        output = model(*copy.deepcopy(inputs))
        test_result = opt_model(*copy.deepcopy(inputs))
        check_results(output, test_result)

    return opt_model
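
A minimal usage sketch. The ResNet-50 model and the 1x3x224x224 input here are hypothetical placeholders, and torch_blade plus a CUDA device are assumed to be available:

import torch
import torchvision

from easycv.toolkit.blade.cv_blade_utils import blade_optimize

# hypothetical example: optimize a scripted ResNet-50 for a fixed input shape
model = torchvision.models.resnet50().eval().cuda()
scripted = torch.jit.script(model)
inputs = (torch.rand(1, 3, 224, 224).cuda(), )

opt_model = blade_optimize(
    speed_test_model=model,   # baseline benchmarked as 'easycv'
    model=scripted,           # scripted model handed to PAI-Blade
    inputs=inputs,
    blade_config=dict(enable_fp16=True, fp16_fallback_op_ratio=0.05),
    static_opt=True,          # exported model is bound to this input shape
    check_result=True,        # compare optimized outputs with the original
)

torch.jit.save(opt_model, 'resnet50_blade.pt')

With static_opt=True the optimized artifact is specialized to the example input shape; pass static_opt=False when the deployment input shape varies, per the log messages in the function body.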