def _alter_conv2d_layout()

in python/tvm/topi/cuda/conv2d_alter_op.py

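For context, the module-level imports this function relies on are not part of this extract; the following is reconstructed from the names used below (approximate, check the source file for the exact form):

import logging

import tvm
from tvm import te, relay, autotvm

from ..utils import get_const_tuple
from .conv2d_winograd import _infer_tile_size

logger = logging.getLogger("topi")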

def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
    target = tvm.target.Target.current(allow_none=False)
    dispatch_ctx = autotvm.task.DispatchContext.current

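    # Copy the attributes into a mutable dict and unpack the convolution
    # parameters; new_attrs is rewritten below whenever a layout change is made.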
    new_attrs = {k: attrs[k] for k in attrs.keys()}
    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    data_layout = attrs["data_layout"]
    kernel_layout = attrs["kernel_layout"]
    data, kernel = tinfos
    out_dtype = out_type.dtype

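    # Select the best implementation for this conv2d on the current target and
    # recover the AutoTVM workload it corresponds to.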
    impl, outs = relay.backend.compile_engine.select_implementation(
        relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
    )
    workload = autotvm.task.get_workload(outs)
    if workload is None:
        # The best implementation is not an AutoTVM template.
        # It may be from the auto-scheduler

        if impl.name.find("winograd") != -1:
            if dilation != (1, 1):
                logger.warning("Does not support weight pre-transform for dilated convolution.")
                return None

            assert data_layout == "NHWC" and kernel_layout == "HWIO"
            N, H, W, CI = get_const_tuple(data.shape)
            KH, KW, _, CO = get_const_tuple(kernel.shape)

            # Pre-compute weight transformation in winograd
            tile_size = _infer_tile_size(tinfos[0], tinfos[1], layout="NHWC")

            # HWIO -> OIHW
            kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
            # alpha, alpha, CO, CI
            weight = relay.nn.contrib_conv2d_winograd_weight_transform(
                kernel_transform, tile_size=tile_size
            )
            new_attrs["tile_size"] = tile_size
            new_attrs["channels"] = CO
            return relay.nn.contrib_conv2d_winograd_without_weight_transform(
                inputs[0], weight, **new_attrs
            )

        return None

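    # Look up the tuned config for the selected workload.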
    cfg = dispatch_ctx.query(target, workload)
    if cfg.is_fallback:  # no tuned config found: clear the fallback cache and keep the original layout
        autotvm.task.clear_fallback_cache(target, workload)
        return None

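    # Dispatch on the TOPI template name recorded in the workload.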
    topi_tmpl = workload[0]
    if topi_tmpl == "conv2d_NCHWc_int8.cuda":
        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        N, CI, H, W = get_const_tuple(data.shape)
        CO, _, KH, KW = get_const_tuple(kernel.shape)

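        # Repack data to NCHW4c and the kernel to OIHW4o4i, the layouts the
        # int8 NCHWc schedule expects.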
        new_layout = "NCHW4c"
        new_attrs["channels"] = CO
        new_attrs["data_layout"] = new_layout
        new_attrs["out_layout"] = new_layout
        new_attrs["kernel_layout"] = "OIHW4o4i"
        ic_block_factor = oc_block_factor = 4

        # Store the same config for the altered operator (workload)
        new_data = te.placeholder(
            (N, CI // ic_block_factor, H, W, ic_block_factor), dtype=data.dtype
        )
        new_kernel = te.placeholder(
            (
                CO // oc_block_factor,
                CI // ic_block_factor,
                KH,
                KW,
                oc_block_factor,
                ic_block_factor,
            ),
            dtype=kernel.dtype,
        )
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype],
            "conv2d_NCHWc_int8.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.conv2d(*inputs, **new_attrs)

    if topi_tmpl == "conv2d_nchw_winograd.cuda":
        if dilation != (1, 1):
            logger.warning("Does not support weight pre-transform for dilated convolution.")
            return None

        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        N, CI, H, W = get_const_tuple(data.shape)
        CO, _, KH, KW = get_const_tuple(kernel.shape)

        # Pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(tinfos[0], tinfos[1])

        weight = relay.nn.contrib_conv2d_winograd_weight_transform(inputs[1], tile_size=tile_size)
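        # (alpha, alpha, CO, CI) -> (alpha, alpha, CI, CO)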
        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
        new_attrs["tile_size"] = tile_size
        new_attrs["channels"] = CO

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = te.placeholder(
            (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel.dtype
        )
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_weight, strides, padding, dilation, out_dtype],
            "conv2d_nchw_winograd_without_weight_transform.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
            inputs[0], weight, **new_attrs
        )

    if topi_tmpl in ("conv2d_nhwc_winograd_direct.cuda", "conv2d_nhwc_winograd_tensorcore.cuda"):
        if dilation != (1, 1):
            logger.warning("Does not support weight pre-transform for dilated convolution.")
            return None

        assert data_layout == "NHWC" and kernel_layout == "HWIO"
        N, H, W, CI = get_const_tuple(data.shape)
        KH, KW, _, CO = get_const_tuple(kernel.shape)

        # Pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(data, kernel, layout="NHWC")
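        # HWIO -> OIHW for the weight transform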
        kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
        weight = relay.nn.contrib_conv2d_winograd_weight_transform(
            kernel_transform, tile_size=tile_size
        )
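        # (alpha, alpha, CO, CI) -> (alpha, alpha, CI, CO)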
        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
        new_attrs["tile_size"] = tile_size
        new_attrs["channels"] = CO
        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = te.placeholder(
            (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel.dtype
        )
        if topi_tmpl == "conv2d_nhwc_winograd_direct.cuda":
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_weight, strides, padding, dilation, out_dtype],
                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )
        elif topi_tmpl == "conv2d_nhwc_winograd_tensorcore.cuda":
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_weight, strides, padding, dilation, out_dtype],
                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
            inputs[0], weight, **new_attrs
        )

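    # Grouped int8 convolution: same NCHW4c / OIHW4o4i repacking as above, but
    # the kernel's input-channel dimension is split per group.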
    if topi_tmpl == "group_conv2d_NCHWc_int8.cuda":
        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        N, CI, H, W = get_const_tuple(data.shape)
        CO, _, KH, KW = get_const_tuple(kernel.shape)

        new_layout = "NCHW4c"
        new_attrs["channels"] = CO
        new_attrs["data_layout"] = new_layout
        new_attrs["out_layout"] = new_layout
        new_attrs["kernel_layout"] = "OIHW4o4i"
        ic_block_factor = oc_block_factor = 4

        # Store the same config for the altered operator (workload)
        new_data = te.placeholder(
            (N, CI // ic_block_factor, H, W, ic_block_factor), dtype=data.dtype
        )
        new_kernel = te.placeholder(
            (
                CO // oc_block_factor,
                CI // ic_block_factor // groups,
                KH,
                KW,
                oc_block_factor,
                ic_block_factor,
            ),
            dtype=kernel.dtype,
        )
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, groups, out_dtype],
            "group_conv2d_NCHWc_int8.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.conv2d(*inputs, **new_attrs)

    if topi_tmpl == "conv2d_HWNCnc_tensorcore.cuda":
        assert data_layout == "HWNC" and kernel_layout == "HWOI"
        assert float(tvm.cuda(0).compute_version) >= 7.5
        H, W, N, CI = get_const_tuple(data.shape)
        KH, KW, CO, _ = get_const_tuple(kernel.shape)

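        # Keep the original layout when the channel counts cannot be evenly
        # blocked into the tensor core kernel layouts chosen below.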
        if (
            kernel.dtype in ["int4", "uint4"]
            and (CI % 32 != 0 or CO % 8 != 0)
            or kernel.dtype in ["int8", "uint8"]
            and (CI % 16 != 0 or CO % 32 != 0)
        ):
            return relay.nn.conv2d(*inputs, **new_attrs)

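        # Pick blocking factors by kernel dtype: int4/uint4 -> HWOI8o32i,
        # int8/uint8 -> HWOI32o16i.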
        new_attrs["channels"] = CO
        if kernel.dtype in ["int4", "uint4"]:
            new_attrs["kernel_layout"] = "HWOI8o32i"
            ic_block_factor = 32
            oc_block_factor = 8
        else:
            new_attrs["kernel_layout"] = "HWOI32o16i"
            ic_block_factor = 16
            oc_block_factor = 32

        new_kernel = te.placeholder(
            (
                KH,
                KW,
                CO // oc_block_factor,
                CI // ic_block_factor,
                oc_block_factor,
                ic_block_factor,
            ),
            dtype=kernel.dtype,
        )

        new_workload = autotvm.task.args_to_workload(
            [data, new_kernel, strides, padding, dilation, out_dtype],
            "conv2d_HWNCnc_tensorcore.cuda",
        )

        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.conv2d(*inputs, **new_attrs)

    return None
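
In TVM this function is registered as the conv2d alter-op hook for CUDA targets (the registration decorator is not shown in this extract), so it runs during Relay's AlterOpLayout pass. A rough sketch of how it is typically triggered; the log file name and shapes below are illustrative, not from the source:

import tvm
from tvm import relay, autotvm

# A small NCHW / OIHW conv2d workload.
x = relay.var("data", shape=(1, 64, 56, 56), dtype="float32")
w = relay.var("weight", shape=(64, 64, 3, 3), dtype="float32")
y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1))
mod = tvm.IRModule.from_expr(relay.Function([x, w], y))

target = tvm.target.Target("cuda")
# "conv2d_tuning.log" is a hypothetical file of tuned AutoTVM records;
# without tuned records cfg.is_fallback is True and no rewrite happens.
with autotvm.apply_history_best("conv2d_tuning.log"):
    with target:
        mod = relay.transform.AlterOpLayout()(mod)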