# python/tvm/topi/cuda/conv2d_alter_op.py
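# A minimal module header is sketched below so this excerpt is self-contained.
# The import list, logger name, and the @nn.conv2d_alter_layout registration
# follow the usual TOPI conventions but are assumptions here, not copied from
# the upstream file.
import logging

import tvm
from tvm import autotvm, relay, te

from .. import nn
from ..utils import get_const_tuple
from .conv2d_winograd import _infer_tile_size

logger = logging.getLogger("topi")


@nn.conv2d_alter_layout.register(["cuda", "gpu"])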
def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
    target = tvm.target.Target.current(allow_none=False)
    dispatch_ctx = autotvm.task.DispatchContext.current
    new_attrs = {k: attrs[k] for k in attrs.keys()}
    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    data_layout = attrs["data_layout"]
    kernel_layout = attrs["kernel_layout"]
    data, kernel = tinfos
    out_dtype = out_type.dtype

    impl, outs = relay.backend.compile_engine.select_implementation(
        relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
    )
    workload = autotvm.task.get_workload(outs)
    if workload is None:
        # The best implementation is not an AutoTVM template.
        # It may be from the auto-scheduler
        if impl.name.find("winograd") != -1:
            if dilation != (1, 1):
                logger.warning("Does not support weight pre-transform for dilated convolution.")
                return None

            assert data_layout == "NHWC" and kernel_layout == "HWIO"
            N, H, W, CI = get_const_tuple(data.shape)
            KH, KW, _, CO = get_const_tuple(kernel.shape)

            # Pre-compute weight transformation in winograd
            tile_size = _infer_tile_size(tinfos[0], tinfos[1], layout="NHWC")
            # HWIO -> OIHW
            kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
            # alpha, alpha, CO, CI
            weight = relay.nn.contrib_conv2d_winograd_weight_transform(
                kernel_transform, tile_size=tile_size
            )
            new_attrs["tile_size"] = tile_size
            new_attrs["channels"] = CO
            return relay.nn.contrib_conv2d_winograd_without_weight_transform(
                inputs[0], weight, **new_attrs
            )
        return None
    cfg = dispatch_ctx.query(target, workload)
    if cfg.is_fallback:  # fallback config: clear the query cache and return None
        autotvm.task.clear_fallback_cache(target, workload)
        return None

    topi_tmpl = workload[0]
if topi_tmpl == "conv2d_NCHWc_int8.cuda":
assert data_layout == "NCHW" and kernel_layout == "OIHW"
N, CI, H, W = get_const_tuple(data.shape)
CO, _, KH, KW = get_const_tuple(kernel.shape)
new_layout = "NCHW4c"
new_attrs["channels"] = CO
new_attrs["data_layout"] = new_layout
new_attrs["out_layout"] = new_layout
new_attrs["kernel_layout"] = "OIHW4o4i"
ic_block_factor = oc_block_factor = 4
# Store the same config for the altered operator (workload)
new_data = te.placeholder(
(N, CI // ic_block_factor, H, W, ic_block_factor), dtype=data.dtype
)
new_kernel = te.placeholder(
(
CO // oc_block_factor,
CI // ic_block_factor,
KH,
KW,
oc_block_factor,
ic_block_factor,
),
dtype=kernel.dtype,
)
new_workload = autotvm.task.args_to_workload(
[new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype],
"conv2d_NCHWc_int8.cuda",
)
dispatch_ctx.update(target, new_workload, cfg)
return relay.nn.conv2d(*inputs, **new_attrs)
if topi_tmpl == "conv2d_nchw_winograd.cuda":
if dilation != (1, 1):
logger.warning("Does not support weight pre-transform for dilated convolution.")
return None
assert data_layout == "NCHW" and kernel_layout == "OIHW"
N, CI, H, W = get_const_tuple(data.shape)
CO, _, KH, KW = get_const_tuple(kernel.shape)
# pre-compute weight transformation in winograd
tile_size = _infer_tile_size(tinfos[0], tinfos[1])
weight = relay.nn.contrib_conv2d_winograd_weight_transform(inputs[1], tile_size=tile_size)
weight = relay.transpose(weight, axes=[0, 1, 3, 2])
new_attrs["tile_size"] = tile_size
new_attrs["channels"] = CO
# Store the same config for the altered operator (workload)
new_data = data
new_weight = te.placeholder(
(KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel.dtype
)
new_workload = autotvm.task.args_to_workload(
[new_data, new_weight, strides, padding, dilation, out_dtype],
"conv2d_nchw_winograd_without_weight_transform.cuda",
)
dispatch_ctx.update(target, new_workload, cfg)
return relay.nn.contrib_conv2d_winograd_without_weight_transform(
inputs[0], weight, **new_attrs
)
    if topi_tmpl in ("conv2d_nhwc_winograd_direct.cuda", "conv2d_nhwc_winograd_tensorcore.cuda"):
        if dilation != (1, 1):
            logger.warning("Does not support weight pre-transform for dilated convolution.")
            return None

        assert data_layout == "NHWC" and kernel_layout == "HWIO"
        N, H, W, CI = get_const_tuple(data.shape)
        KH, KW, _, CO = get_const_tuple(kernel.shape)

        # Pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(data, kernel, layout="NHWC")
        # HWIO -> OIHW
        kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
        # alpha, alpha, CO, CI
        weight = relay.nn.contrib_conv2d_winograd_weight_transform(
            kernel_transform, tile_size=tile_size
        )
        # alpha, alpha, CI, CO
        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
        new_attrs["tile_size"] = tile_size
        new_attrs["channels"] = CO

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = te.placeholder(
            (KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel.dtype
        )
        if topi_tmpl == "conv2d_nhwc_winograd_direct.cuda":
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_weight, strides, padding, dilation, out_dtype],
                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )
        elif topi_tmpl == "conv2d_nhwc_winograd_tensorcore.cuda":
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_weight, strides, padding, dilation, out_dtype],
                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
            inputs[0], weight, **new_attrs
        )
if topi_tmpl == "group_conv2d_NCHWc_int8.cuda":
assert data_layout == "NCHW" and kernel_layout == "OIHW"
N, CI, H, W = get_const_tuple(data.shape)
CO, _, KH, KW = get_const_tuple(kernel.shape)
new_layout = "NCHW4c"
new_attrs["channels"] = CO
new_attrs["data_layout"] = new_layout
new_attrs["out_layout"] = new_layout
new_attrs["kernel_layout"] = "OIHW4o4i"
ic_block_factor = oc_block_factor = 4
# Store the same config for the altered operator (workload)
new_data = te.placeholder(
(N, CI // ic_block_factor, H, W, ic_block_factor), dtype=data.dtype
)
new_kernel = te.placeholder(
(
CO // oc_block_factor,
CI // ic_block_factor // groups,
KH,
KW,
oc_block_factor,
ic_block_factor,
),
dtype=kernel.dtype,
)
new_workload = autotvm.task.args_to_workload(
[new_data, new_kernel, strides, padding, dilation, groups, out_dtype],
"group_conv2d_NCHWc_int8.cuda",
)
dispatch_ctx.update(target, new_workload, cfg)
return relay.nn.conv2d(*inputs, **new_attrs)
if topi_tmpl == "conv2d_HWNCnc_tensorcore.cuda":
assert data_layout == "HWNC" and kernel_layout == "HWOI"
assert float(tvm.cuda(0).compute_version) >= 7.5
H, W, N, CI = get_const_tuple(data.shape)
KH, KW, CO, _ = get_const_tuple(kernel.shape)
if (
kernel.dtype in ["int4", "uint4"]
and (CI % 32 != 0 or CO % 8 != 0)
or kernel.dtype in ["int8", "uint8"]
and (CI % 16 != 0 or CO % 32 != 0)
):
return relay.nn.conv2d(*inputs, **new_attrs)
new_attrs["channels"] = CO
if kernel.dtype in ["int4", "uint4"]:
new_attrs["kernel_layout"] = "HWOI8o32i"
ic_block_factor = 32
oc_block_factor = 8
else:
new_attrs["kernel_layout"] = "HWOI32o16i"
ic_block_factor = 16
oc_block_factor = 32
new_kernel = te.placeholder(
(
KH,
KW,
CO // oc_block_factor,
CI // ic_block_factor,
oc_block_factor,
ic_block_factor,
),
dtype=kernel.dtype,
)
new_workload = autotvm.task.args_to_workload(
[data, new_kernel, strides, padding, dilation, out_dtype],
"conv2d_HWNCnc_tensorcore.cuda",
)
dispatch_ctx.update(target, new_workload, cfg)
return relay.nn.conv2d(*inputs, **new_attrs)
return None
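

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the upstream file): the
# relay.transform.AlterOpLayout pass invokes the hook registered above for
# every nn.conv2d while a CUDA target is current. The shapes, kernel size,
# and opt level below are arbitrary assumptions chosen for demonstration.
def _example_alter_conv2d_layout():
    data = relay.var("data", shape=(1, 64, 56, 56), dtype="float32")
    weight = relay.var("weight", shape=(64, 64, 3, 3), dtype="float32")
    conv = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1))
    mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))

    # Run type inference first, then let AlterOpLayout rewrite the conv2d.
    seq = tvm.transform.Sequential(
        [relay.transform.InferType(), relay.transform.AlterOpLayout()]
    )
    with tvm.target.Target("cuda"):
        with tvm.transform.PassContext(opt_level=3):
            mod = seq(mod)
    return mod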