in tensorflow/tensorflow/core/kernels/conv_ops.cc [607:1054]
void LaunchConv2DOp<GPUDevice, T>::operator()(
OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
const Tensor& input_param, const Tensor& filter, int row_dilation,
int col_dilation, int row_stride, int col_stride, const Padding& padding,
const std::vector<int64>& explicit_paddings, Tensor* output,
TensorFormat data_format) {
using se::dnn::AlgorithmConfig;
using se::dnn::AlgorithmDesc;
using se::dnn::ProfileResult;
auto* stream = ctx->op_device_context()->stream();
OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
if (!use_cudnn) {
ctx->SetStatus(
errors::Unimplemented("Conv2D for GPU is not currently supported "
"without cudnn"));
return;
}
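// Read the input dimensions according to data_format (NHWC or NCHW). The
// filter tensor is laid out as HWIO:
// [filter_rows, filter_cols, in_depth, out_depth].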
Tensor input = input_param;
const int64 in_batch = GetTensorDim(input, data_format, 'N');
int64 in_rows = GetTensorDim(input, data_format, 'H');
int64 in_cols = GetTensorDim(input, data_format, 'W');
const int64 in_depths = GetTensorDim(input, data_format, 'C');
const int64 patch_rows = filter.dim_size(0);
const int64 patch_cols = filter.dim_size(1);
const int64 patch_depths = filter.dim_size(2);
// If the filter in-depth (patch_depths) is 1 and smaller than the input
// depth, it's a depthwise convolution. More generally, if the filter in-depth
// divides but is smaller than the input depth, it is a grouped convolution.
bool is_grouped_convolution = patch_depths != in_depths;
if (patch_rows == 1 && patch_cols == 1 && !is_grouped_convolution &&
row_dilation == 1 && col_dilation == 1 && row_stride == 1 &&
col_stride == 1 && data_format == FORMAT_NHWC &&
(padding == VALID || padding == SAME)) {
// 1x1 filter, so call cublas directly.
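// A 1x1, stride-1, undilated convolution over NHWC data is a plain matrix
// multiply: the input is viewed as an (N*H*W) x C matrix and the filter as a
// C x out_depth matrix. ThenBlasGemm follows the column-major BLAS
// convention, so the row-major product C = A * B is issued as
// C^T = B^T * A^T, which is why the filter (b_ptr) and n come first below.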
const uint64 m = in_batch * in_rows * in_cols;
const uint64 k = patch_depths;
const uint64 n = filter.dim_size(3);
auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
input.template flat<T>().size());
auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
filter.template flat<T>().size());
auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
output->template flat<T>().size());
auto no_transpose = se::blas::Transpose::kNoTranspose;
bool blas_launch_status =
stream
->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
a_ptr, k, 0.0f, &c_ptr, n)
.ok();
if (!blas_launch_status) {
ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
", n=", n, ", k=", k));
}
return;
} else if (patch_rows == in_rows && patch_cols == in_cols &&
!is_grouped_convolution && row_dilation == 1 &&
col_dilation == 1 && padding == VALID &&
data_format == FORMAT_NHWC) {
// The input data and filter have the same height/width, so call cublas
// directly.
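// With VALID padding and a filter that spans the whole input, every example
// yields a single output pixel, so the convolution reduces to a
// (batch x H*W*C) by (H*W*C x out_depth) matrix multiply.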
const uint64 m = in_batch;
const uint64 k = patch_rows * patch_cols * patch_depths;
const uint64 n = filter.dim_size(3);
auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
input.template flat<T>().size());
auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
filter.template flat<T>().size());
auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
output->template flat<T>().size());
auto no_transpose = se::blas::Transpose::kNoTranspose;
bool blas_launch_status =
stream
->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
a_ptr, k, 0.0f, &c_ptr, n)
.ok();
if (!blas_launch_status) {
ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
", n=", n, ", k=", k));
}
return;
}
// Tensor Core (NVIDIA Volta+ GPUs) supports efficient convolution with fp16
// in NHWC data layout. In all other configurations it's more efficient to
// run computation in NCHW data format.
const bool compute_in_nhwc =
DataTypeToEnum<T>::value == DT_HALF && IsVoltaOrLater(*stream->parent());
// We only convert in one direction, NHWC->NCHW, never the reverse: the
// Grappler layout optimizer selects the preferred layout and adds the
// necessary annotations to the graph.
// TODO(ezhulenev): Convert in other direction for fp16?
const TensorFormat compute_data_format =
(compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC
: FORMAT_NCHW;
VLOG(3) << "Compute Conv2D with cuDNN:"
<< " data_format=" << ToString(data_format)
<< " compute_data_format=" << ToString(compute_data_format);
const int64 out_batch = GetTensorDim(*output, data_format, 'N');
const int64 out_rows = GetTensorDim(*output, data_format, 'H');
const int64 out_cols = GetTensorDim(*output, data_format, 'W');
const int64 out_depths = GetTensorDim(*output, data_format, 'C');
int64 padding_top = -1, padding_bottom = -1;
int64 padding_left = -1, padding_right = -1;
if (padding == EXPLICIT) {
GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
&padding_bottom);
GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
&padding_right);
}
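// Re-derive the output size and the per-side padding as a sanity check
// against the pre-allocated output tensor. For SAME/VALID the paddings are
// computed here; for EXPLICIT the values read above are used as-is.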
int64 out_rows_check, out_cols_check;
Status status = GetWindowedOutputSizeVerboseV2(
in_rows, patch_rows, row_dilation, row_stride, padding, &out_rows_check,
&padding_top, &padding_bottom);
// The status is guaranteed to be OK, because the output shape and the
// padding were already validated earlier.
TF_CHECK_OK(status);
DCHECK_EQ(out_rows, out_rows_check);
status = GetWindowedOutputSizeVerboseV2(in_cols, patch_cols, col_dilation,
col_stride, padding, &out_cols_check,
&padding_left, &padding_right);
TF_CHECK_OK(status);
DCHECK_EQ(out_cols, out_cols_check);
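// cuDNN takes one padding value per spatial dimension, applied to both
// sides, so only the symmetric portion (the min of the two sides) is passed
// to cuDNN; any asymmetric remainder is materialized below by padding the
// input explicitly.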
const int64 common_padding_rows = std::min(padding_top, padding_bottom);
const int64 common_padding_cols = std::min(padding_left, padding_right);
if (padding_top != padding_bottom || padding_left != padding_right) {
// cuDNN only supports padding the same amount on the left and right sides,
// and on the top and bottom sides. So we manually create a new padded
// input tensor such that we can pass it to cuDNN.
VLOG(4) << "Pad input tensor:"
<< " padding_top=" << padding_top
<< " padding_bottom=" << padding_bottom
<< " padding_left=" << padding_left
<< " padding_right=" << padding_right;
// TODO(reedwm): In some cases, we can avoid an allocation even if the two
// padding sides are different. For example, if the input is 2x2, the filter
// is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the result is
// equivalent to as if the padding is (1, 1, 1, 1). Changing the padding in
// such a way would allow us to avoid the allocation.
Tensor transformed_input;
const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
const int64 padding_cols_diff = std::abs(padding_right - padding_left);
const int64 new_in_rows = in_rows + padding_rows_diff;
const int64 new_in_cols = in_cols + padding_cols_diff;
OP_REQUIRES_OK(ctx, ctx->allocate_temp(
DataTypeToEnum<T>::value,
ShapeFromFormat(data_format, in_batch, new_in_rows,
new_in_cols, in_depths),
&transformed_input));
const int64 input_pad_top = padding_top - common_padding_rows;
const int64 input_pad_bottom = padding_bottom - common_padding_rows;
const int64 input_pad_left = padding_left - common_padding_cols;
const int64 input_pad_right = padding_right - common_padding_cols;
bool in_bounds =
FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
if (!in_bounds) {
ctx->SetStatus(errors::InvalidArgument("Padding is too large."));
return;
}
functor::PadInput<GPUDevice, T, int, 4>()(
ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
{{static_cast<int>(input_pad_bottom),
static_cast<int>(input_pad_right)}},
To32Bit(transformed_input.tensor<T, 4>()), data_format);
input = transformed_input;
in_rows = new_in_rows;
in_cols = new_in_cols;
}
if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
VLOG(4) << "Convert the input tensor from NHWC to NCHW.";
TensorShape nchw_shape =
ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
if (in_depths > 1) {
Tensor transformed_input;
OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
nchw_shape, &transformed_input));
functor::NHWCToNCHW<GPUDevice, T, 4>()(
ctx->eigen_device<GPUDevice>(),
const_cast<const Tensor&>(input).tensor<T, 4>(),
transformed_input.tensor<T, 4>());
input = transformed_input;
} else {
// If depth <= 1, then just reshape.
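// (With a single channel, NHWC and NCHW describe the same memory layout,
// so CopyFrom simply reinterprets the existing buffer with the NCHW shape.)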
CHECK(input.CopyFrom(input, nchw_shape));
}
} else {
CHECK(data_format == compute_data_format) // Crash OK
<< "Illegal data and compute format pair:"
<< " data_format=" << ToString(data_format)
<< " compute_data_format=" << ToString(compute_data_format);
}
CHECK(common_padding_rows >= 0 && common_padding_cols >= 0) // Crash OK
<< "Negative row or col paddings: (" << common_padding_rows << ", "
<< common_padding_cols << ")";
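// Map the compute format to StreamExecutor's DNN layout enums:
// kBatchYXDepth / kOutputYXInput correspond to NHWC / OHWI, and
// kBatchDepthYX / kOutputInputYX to NCHW / OIHW.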
constexpr auto kComputeInNHWC =
std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
se::dnn::FilterLayout::kOutputYXInput);
constexpr auto kComputeInNCHW =
std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
se::dnn::FilterLayout::kOutputInputYX);
se::dnn::DataLayout compute_data_layout;
se::dnn::FilterLayout filter_layout;
std::tie(compute_data_layout, filter_layout) =
compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW;
se::dnn::BatchDescriptor input_desc;
input_desc.set_count(in_batch)
.set_feature_map_count(in_depths)
.set_height(in_rows)
.set_width(in_cols)
.set_layout(compute_data_layout);
se::dnn::BatchDescriptor output_desc;
output_desc.set_count(out_batch)
.set_height(out_rows)
.set_width(out_cols)
.set_feature_map_count(out_depths)
.set_layout(compute_data_layout);
se::dnn::FilterDescriptor filter_desc;
filter_desc.set_input_filter_height(patch_rows)
.set_input_filter_width(patch_cols)
.set_input_feature_map_count(patch_depths)
.set_output_feature_map_count(filter.dim_size(3))
.set_layout(filter_layout);
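// The convolution descriptor carries dilation, stride, the symmetric padding
// computed above, and the group count: in_depths / patch_depths is 1 for a
// regular convolution, in_depths for a depthwise convolution, and something
// in between for grouped convolutions.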
se::dnn::ConvolutionDescriptor conv_desc;
conv_desc.set_vertical_dilation_rate(row_dilation)
.set_horizontal_dilation_rate(col_dilation)
.set_vertical_filter_stride(row_stride)
.set_horizontal_filter_stride(col_stride)
.set_zero_padding_height(common_padding_rows)
.set_zero_padding_width(common_padding_cols)
.set_group_count(in_depths / patch_depths);
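// TensorFlow stores Conv2D filters as HWIO, but cuDNN expects OIHW when
// computing in NCHW and OHWI when computing in NHWC, so transpose the filter
// into a temporary tensor first.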
Tensor transformed_filter;
const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
<< " to " << ToString(dst_format);
TensorShape dst_shape =
dst_format == FORMAT_OIHW
? TensorShape({filter.dim_size(3), filter.dim_size(2),
filter.dim_size(0), filter.dim_size(1)})
: TensorShape({filter.dim_size(3), filter.dim_size(0),
filter.dim_size(1), filter.dim_size(2)});
TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape,
&transformed_filter));
functor::TransformFilter<GPUDevice, T, int, 4>()(
ctx->eigen_device<GPUDevice>(), dst_format,
To32Bit(filter.tensor<T, 4>()),
To32Bit(transformed_filter.tensor<T, 4>()));
return Status::OK();
};
if (compute_data_format == FORMAT_NCHW) {
OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW));
} else if (compute_data_format == FORMAT_NHWC) {
OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI));
} else {
ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: ",
ToString(compute_data_format)));
return;
}
Tensor transformed_output;
if (data_format != compute_data_format) {
VLOG(4) << "Allocate temporary memory for output in compute data format";
OP_REQUIRES_OK(
ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
ShapeFromFormat(compute_data_format, out_batch,
out_rows, out_cols, out_depths),
&transformed_output));
} else {
transformed_output = *output;
}
auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
input.template flat<T>().size());
auto filter_ptr =
AsDeviceMemory(transformed_filter.template flat<T>().data(),
transformed_filter.template flat<T>().size());
auto output_ptr =
AsDeviceMemory(transformed_output.template flat<T>().data(),
transformed_output.template flat<T>().size());
static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
// default value is in bytes despite the name of the environment variable
"TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB
);
int device_id = stream->parent()->device_ordinal();
DataType dtype = input.dtype();
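// ConvParameters is the key into the process-wide autotune cache
// (AutoTuneConv): a convolution with the same shapes, strides, dtype and
// device reuses the algorithm selected by an earlier autotuning run.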
ConvParameters conv_parameters = {in_batch, // batch
in_depths, // in_depths
{{in_rows, // in_rows
in_cols}}, // in_cols
compute_data_format, // compute_data_format
out_depths, // out_depths
{{patch_rows, // filter_rows
patch_cols, // filter_cols
patch_depths}}, // filter_depths
{{row_dilation, // dilation_rows
col_dilation}}, // dilation_cols
{{row_stride, // stride_rows
col_stride}}, // stride_cols
{{common_padding_rows, // padding_rows
common_padding_cols}}, // padding_cols
dtype, // tensor datatype
device_id, // device_id
conv_desc.group_count()};
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune &&
!AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) {
#if GOOGLE_CUDA
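// CUDA autotuning: enumerate the candidate cuDNN algorithms, profile each
// one once, and let BestCudnnConvAlgorithm pick the fastest valid result.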
std::vector<AlgorithmDesc> algorithms;
OP_REQUIRES(
ctx,
stream->parent()->GetConvolveAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
stream->parent()),
&algorithms),
errors::Unknown("Failed to get convolution algorithm. This is probably "
"because cuDNN failed to initialize, so try looking to "
"see if a warning log message was printed above."));
se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
stream);
se::cuda::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
se::cuda::PtxCompilationOptions());
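// The redzone allocator surrounds each allocation with guard buffers so
// that a candidate algorithm that writes out of bounds is caught by
// CheckRedzones below and excluded from selection.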
se::DeviceMemory<T> output_tensor(
WrapRedzoneBestEffort(&rz_allocator, output_ptr));
std::vector<tensorflow::AutotuneResult> results;
for (auto profile_algorithm : algorithms) {
// TODO(zhengxq): profile each algorithm multiple times for better
// accuracy.
se::cuda::RedzoneAllocator rz_scratch_allocator(
stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions(),
/*memory_limit=*/ConvolveScratchSize);
DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
se::ScratchAllocator* allocator_used =
!RedzoneCheckDisabled()
? static_cast<se::ScratchAllocator*>(&rz_scratch_allocator)
: static_cast<se::ScratchAllocator*>(&scratch_allocator);
ProfileResult profile_result;
bool cudnn_launch_status =
stream
->ThenConvolveWithAlgorithm(
input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
output_desc, &output_tensor, allocator_used,
AlgorithmConfig(profile_algorithm), &profile_result)
.ok();
if (cudnn_launch_status && profile_result.is_valid()) {
results.emplace_back();
auto& result = results.back();
result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
result.mutable_conv()->set_tensor_ops_enabled(
profile_algorithm.tensor_ops_enabled());
result.set_scratch_bytes(
!RedzoneCheckDisabled()
? rz_scratch_allocator.TotalAllocatedBytesExcludingRedzones()
: scratch_allocator.TotalByteSize());
*result.mutable_run_time() = proto_utils::ToDurationProto(
absl::Milliseconds(profile_result.elapsed_time_in_ms()));
CheckRedzones(rz_scratch_allocator, &result);
CheckRedzones(rz_allocator, &result);
}
}
LogConvAutotuneResults(se::dnn::ConvolutionKind::FORWARD,
se::dnn::ToDataType<T>::value, input_ptr, filter_ptr,
output_tensor, input_desc, filter_desc, output_desc,
conv_desc, stream->parent(), results);
OP_REQUIRES_OK(ctx, BestCudnnConvAlgorithm(results, &algorithm_config));
#elif TENSORFLOW_USE_ROCM
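// ROCm path: MIOpen's find mode selects the best algorithm within this
// single call, returning it together with the scratch size it needs.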
DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
ProfileResult best_result;
bool miopen_find_status =
stream
->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
filter_ptr, conv_desc, output_desc,
&output_ptr, &scratch_allocator,
AlgorithmConfig(), &best_result)
.ok();
OP_REQUIRES(ctx, miopen_find_status && best_result.is_valid(),
errors::NotFound("Failed to find conv algorithm!"));
algorithm_config.set_algorithm(best_result.algorithm());
algorithm_config.set_scratch_size(best_result.scratch_size());
#endif
AutoTuneConv::GetInstance()->Insert(conv_parameters, algorithm_config);
}
VLOG(4) << "Convolution Algorithm: "
<< algorithm_config.algorithm()->algo_id();
VLOG(4) << "tensor_ops_enabled: "
<< algorithm_config.algorithm()->tensor_ops_enabled();
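// Launch the convolution with the selected (possibly autotuned) algorithm;
// scratch memory comes from the op's scratch allocator, capped at
// ConvolveScratchSize.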
DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
bool cudnn_launch_status =
stream
->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
filter_ptr, conv_desc, output_desc,
&output_ptr, &scratch_allocator,
algorithm_config, nullptr)
.ok();
if (!cudnn_launch_status) {
ctx->SetStatus(errors::Internal(
"cuDNN launch failure : input shape(", input.shape().DebugString(),
") filter shape(", filter.shape().DebugString(), ")"));
}
if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
VLOG(4) << "Convert the output tensor back from NCHW to NHWC.";
functor::NCHWToNHWC<GPUDevice, T, 4>()(
ctx->eigen_device<GPUDevice>(),
const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
output->tensor<T, 4>());
}
}