in tensorflow/tensorflow/core/kernels/conv_ops_3d.cc [210:555]
static void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
const Tensor& input_param, const Tensor& filter,
const std::array<int64, 3>& dilations,
const std::array<int64, 3>& strides, const Padding padding,
TensorFormat data_format, Tensor* output) {
auto* stream = ctx->op_device_context()->stream();
OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
Tensor input = input_param;
const int64 in_batch = GetTensorDim(input, data_format, 'N');
int64 in_planes = GetTensorDim(input, data_format, '0');
int64 in_rows = GetTensorDim(input, data_format, '1');
int64 in_cols = GetTensorDim(input, data_format, '2');
const int64 in_depth = GetTensorDim(input, data_format, 'C');
const int64 filter_planes = filter.dim_size(0);
const int64 filter_rows = filter.dim_size(1);
const int64 filter_cols = filter.dim_size(2);
const int64 filter_depth = filter.dim_size(3);
const int64 out_depth = filter.dim_size(4);
int64 pad_planes = 0, pad_rows = 0, pad_cols = 0;
int64 out_planes = GetTensorDim(*output, data_format, '0');
int64 out_rows = GetTensorDim(*output, data_format, '1');
int64 out_cols = GetTensorDim(*output, data_format, '2');
if (padding == Padding::SAME) {
pad_planes = std::max<int64>(
0, (out_planes - 1) * strides[0] + filter_planes - in_planes);
pad_rows = std::max<int64>(
0, (out_rows - 1) * strides[1] + filter_rows - in_rows);
pad_cols = std::max<int64>(
0, (out_cols - 1) * strides[2] + filter_cols - in_cols);
}
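// Worked example for the SAME-padding formula above (hypothetical sizes):
// with in_rows = 7, filter_rows = 3 and strides[1] = 2, SAME padding gives
// out_rows = ceil(7 / 2) = 4, so pad_rows = max(0, (4 - 1) * 2 + 3 - 7) = 2,
// split as one row before and one row after.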
bool is_grouped_convolution = filter_depth != in_depth;
// NOTE: This only works in NHWC.
if (!is_grouped_convolution && filter_planes == 1 && filter_rows == 1 &&
filter_cols == 1 && dilations[0] == 1 && dilations[1] == 1 &&
dilations[2] == 1 && strides[0] == 1 && strides[1] == 1 &&
strides[2] == 1 && data_format == FORMAT_NHWC) {
// 1x1x1 filter, so call cuBLAS directly.
const uint64 m = in_batch * in_planes * in_rows * in_cols;
const uint64 k = in_depth;
const uint64 n = out_depth;
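// A 1x1x1 convolution with unit strides and dilations is a plain matmul:
// output[m, n] = sum_k input[m, k] * filter[k, n], with the NDHWC input
// viewed as a row-major [m, k] matrix (depth is innermost) and the filter
// as a row-major [k, n] matrix. cuBLAS expects column-major data, so the
// row-major product C = A * B is issued as C^T = B^T * A^T; that is why
// the filter (b_ptr) is passed before the input (a_ptr) below.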
auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
input.template flat<T>().size());
auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
filter.template flat<T>().size());
auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
output->template flat<T>().size());
auto no_transpose = se::blas::Transpose::kNoTranspose;
bool blas_launch_status =
stream
->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
n, a_ptr, k, 0.0f, &c_ptr, n)
.ok();
if (!blas_launch_status) {
ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
", n=", n, ", k=", k));
}
return;
} else if (!is_grouped_convolution && filter_planes == in_planes &&
filter_rows == in_rows && filter_cols == in_cols &&
padding == Padding::VALID && data_format == FORMAT_NHWC) {
// The input data and filter have the same planes/height/width, so call
// cuBLAS directly.
const uint64 m = in_batch;
const uint64 k = in_planes * in_rows * in_cols * in_depth;
const uint64 n = out_depth;
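// With VALID padding and filter extents equal to the input extents, every
// output element is a full dot product over planes * rows * cols * depth,
// so the convolution reduces to an [in_batch, k] x [k, out_depth] matmul,
// issued through the same column-major cuBLAS trick as above.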
auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
input.template flat<T>().size());
auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
filter.template flat<T>().size());
auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
output->template flat<T>().size());
auto no_transpose = se::blas::Transpose::kNoTranspose;
bool blas_launch_status =
stream
->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
n, a_ptr, k, 0.0f, &c_ptr, n)
.ok();
if (!blas_launch_status) {
ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
", n=", n, ", k=", k));
}
return;
}
if (padding == Padding::SAME) {
const bool rows_odd = (pad_rows % 2 != 0);
const bool cols_odd = (pad_cols % 2 != 0);
const bool planes_odd = (pad_planes % 2 != 0);
// Necessary because cuDNN only supports symmetric padding.
// TODO(mjanusz): Consider making this optional? This would save some
// overhead and would work as long as an op trained this way is only
// used on GPU.
if (rows_odd || cols_odd || planes_odd) {
const int64 new_in_rows = in_rows + rows_odd;
const int64 new_in_cols = in_cols + cols_odd;
const int64 new_in_planes = in_planes + planes_odd;
Tensor transformed_input;
TensorShape transformed_shape = ShapeFromFormat(
data_format, in_batch, {{new_in_planes, new_in_rows, new_in_cols}},
in_depth);
OP_REQUIRES_OK(
ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, transformed_shape,
&transformed_input));
functor::PadInput<GPUDevice, T, int, 5>()(
ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 5>()),
{{0, 0, 0}}, {{planes_odd, rows_odd, cols_odd}},
To32Bit(transformed_input.tensor<T, 5>()), data_format);
input = transformed_input;
in_rows = new_in_rows;
in_cols = new_in_cols;
in_planes = new_in_planes;
}
}
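// Any odd SAME padding has now been folded into the input itself: the extra
// element goes at the end of each spatial dimension (matching TF's SAME
// convention), and the remaining even amount is handled below via cuDNN's
// symmetric padding of pad / 2 per side.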
if (data_format == FORMAT_NHWC) {
const TensorShape nchw_shape = ShapeFromFormat(
FORMAT_NCHW, in_batch, {{in_planes, in_rows, in_cols}}, in_depth);
if (in_depth > 1) {
Tensor transformed_input;
OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
nchw_shape, &transformed_input));
// input: [b, x, y, z, d]
// t_input: [b, d, x, y, z]
// NCDHW is the only format universally supported by cuDNN.
functor::NHWCToNCHW<GPUDevice, T, 5>()(
ctx->eigen_device<GPUDevice>(),
const_cast<const Tensor&>(input).tensor<T, 5>(),
transformed_input.tensor<T, 5>());
input = transformed_input;
} else {
CHECK(input.CopyFrom(input, nchw_shape));
}
}
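// input now holds NCDHW data regardless of the user-supplied data_format:
// either transposed above, or simply reshaped when in_depth == 1, since the
// channel dimension is degenerate and NDHWC/NCDHW then share the same
// memory layout.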
CHECK(pad_rows >= 0 && pad_cols >= 0 && pad_planes >= 0)
<< "Negative paddings: (" << pad_rows << ", " << pad_cols << ", "
<< pad_planes << ")";
se::dnn::BatchDescriptor input_desc(3);
input_desc.set_count(in_batch)
.set_feature_map_count(in_depth)
.set_spatial_dim(DimIndex::X, in_cols)
.set_spatial_dim(DimIndex::Y, in_rows)
.set_spatial_dim(DimIndex::Z, in_planes)
.set_layout(se::dnn::DataLayout::kBatchDepthYX);
se::dnn::BatchDescriptor output_desc(3);
output_desc.set_count(in_batch)
.set_spatial_dim(DimIndex::X, out_cols)
.set_spatial_dim(DimIndex::Y, out_rows)
.set_spatial_dim(DimIndex::Z, out_planes)
.set_feature_map_count(out_depth)
.set_layout(se::dnn::DataLayout::kBatchDepthYX);
se::dnn::FilterDescriptor filter_desc(3);
filter_desc.set_spatial_dim(DimIndex::X, filter_cols)
.set_spatial_dim(DimIndex::Y, filter_rows)
.set_spatial_dim(DimIndex::Z, filter_planes)
.set_input_feature_map_count(filter_depth)
.set_output_feature_map_count(out_depth);
se::dnn::ConvolutionDescriptor conv_desc(3);
conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
.set_dilation_rate(DimIndex::Y, dilations[1])
.set_dilation_rate(DimIndex::Z, dilations[0])
.set_filter_stride(DimIndex::X, strides[2])
.set_filter_stride(DimIndex::Y, strides[1])
.set_filter_stride(DimIndex::Z, strides[0])
.set_zero_padding(DimIndex::X, pad_cols / 2)
.set_zero_padding(DimIndex::Y, pad_rows / 2)
.set_zero_padding(DimIndex::Z, pad_planes / 2)
.set_group_count(in_depth / filter_depth);
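// Strides and dilations arrive in (planes, rows, cols) order and map to
// cuDNN's Z/Y/X spatial dims. Zero padding is half of the (now even) totals
// computed above, and group_count = in_depth / filter_depth selects grouped
// convolution (1 for an ordinary convolution).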
Tensor transformed_filter;
OP_REQUIRES_OK(
ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
TensorShape({out_depth, in_depth, filter_planes,
filter_rows, filter_cols}),
&transformed_filter));
// filter: [x, y, z, in, out]
// t_filter: [out, in, x, y, z]
functor::TransformFilter<GPUDevice, T, int, 5>()(
ctx->eigen_device<GPUDevice>(), FORMAT_OIHW,
To32Bit(filter.tensor<T, 5>()),
To32Bit(transformed_filter.tensor<T, 5>()));
Tensor transformed_output;
OP_REQUIRES_OK(
ctx, ctx->allocate_temp(
DataTypeToEnum<T>::value,
ShapeFromFormat(FORMAT_NCHW, in_batch,
{{out_planes, out_rows, out_cols}}, out_depth),
&transformed_output));
auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
input.template flat<T>().size());
auto filter_ptr =
AsDeviceMemory(transformed_filter.template flat<T>().data(),
transformed_filter.template flat<T>().size());
auto output_ptr =
AsDeviceMemory(transformed_output.template flat<T>().data(),
transformed_output.template flat<T>().size());
static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
"TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32); // 4GB by default
int device_id = stream->parent()->device_ordinal();
DataType dtype = input.dtype();
ConvParameters conv_parameters = {
in_batch,
in_depth,
{{in_planes, in_rows, in_cols}},
FORMAT_NCHW,
out_depth,
{{filter_planes, filter_rows, filter_cols}},
{{dilations[0], dilations[1], dilations[2]}},
{{strides[0], strides[1], strides[2]}},
{{pad_planes, pad_rows, pad_cols}},
dtype,
device_id,
conv_desc.group_count()};
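// conv_parameters keys the AutoTuneConv3d cache below, so algorithm
// profiling runs at most once per unique shape/type/device combination in a
// process; later calls reuse the cached AlgorithmConfig.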
using se::dnn::AlgorithmConfig;
using se::dnn::AlgorithmDesc;
using se::dnn::ProfileResult;
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune && !AutoTuneConv3d::GetInstance()->Find(
conv_parameters, &algorithm_config)) {
#if GOOGLE_CUDA
se::TfAllocatorAdapter tf_allocator_adapter(
ctx->device()->GetAllocator({}), stream);
se::cuda::RedzoneAllocator rz_allocator(
stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions());
se::DeviceMemory<T> output_ptr_rz(
WrapRedzoneBestEffort(&rz_allocator, output_ptr));
std::vector<AlgorithmDesc> algorithms;
OP_REQUIRES(ctx,
stream->parent()->GetConvolveAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
stream->parent()),
&algorithms),
errors::Unknown(
"Failed to get convolution algorithm. This is probably "
"because cuDNN failed to initialize, so try looking to "
"see if a warning log message was printed above."));
std::vector<tensorflow::AutotuneResult> results;
for (auto profile_algorithm : algorithms) {
// TODO(zhengxq): profile each algorithm multiple times for better
// accuracy.
DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
se::cuda::RedzoneAllocator rz_scratch_allocator(
stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions(),
/*memory_limit=*/ConvolveScratchSize);
se::ScratchAllocator* allocator_used =
!RedzoneCheckDisabled()
? static_cast<se::ScratchAllocator*>(&rz_scratch_allocator)
: static_cast<se::ScratchAllocator*>(&scratch_allocator);
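// When redzone checking is enabled, scratch is drawn from the redzone
// allocator so that out-of-bounds writes by a candidate algorithm can be
// flagged by the CheckRedzones calls below.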
ProfileResult profile_result;
bool cudnn_launch_status =
stream
->ThenConvolveWithAlgorithm(
input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
output_desc, &output_ptr_rz, allocator_used,
AlgorithmConfig(profile_algorithm), &profile_result)
.ok();
if (cudnn_launch_status) {
if (profile_result.is_valid()) {
results.emplace_back();
auto& result = results.back();
result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
result.mutable_conv()->set_tensor_ops_enabled(
profile_algorithm.tensor_ops_enabled());
result.set_scratch_bytes(
!RedzoneCheckDisabled()
? rz_scratch_allocator
.TotalAllocatedBytesExcludingRedzones()
: scratch_allocator.TotalByteSize());
*result.mutable_run_time() = proto_utils::ToDurationProto(
absl::Milliseconds(profile_result.elapsed_time_in_ms()));
CheckRedzones(rz_scratch_allocator, &result);
CheckRedzones(rz_allocator, &result);
}
}
}
LogConvAutotuneResults(se::dnn::ConvolutionKind::FORWARD,
se::dnn::ToDataType<T>::value, input_ptr,
filter_ptr, output_ptr, input_desc, filter_desc,
output_desc, conv_desc, stream->parent(), results);
OP_REQUIRES_OK(ctx, BestCudnnConvAlgorithm(results, &algorithm_config));
#elif TENSORFLOW_USE_ROCM
ProfileResult best_result;
DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
bool miopen_find_status =
stream
->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
filter_ptr, conv_desc, output_desc,
&output_ptr, &scratch_allocator,
AlgorithmConfig(), &best_result)
.ok();
OP_REQUIRES(ctx, miopen_find_status && best_result.is_valid(),
errors::NotFound("Failed to find conv algorithm!"));
algorithm_config.set_algorithm(best_result.algorithm());
algorithm_config.set_scratch_size(best_result.scratch_size());
#endif
AutoTuneConv3d::GetInstance()->Insert(conv_parameters, algorithm_config);
}
DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
bool cudnn_launch_status =
stream
->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
filter_ptr, conv_desc, output_desc,
&output_ptr, &scratch_allocator,
algorithm_config, nullptr)
.ok();
if (!cudnn_launch_status) {
ctx->SetStatus(errors::Internal(
"cuDNN launch failure : input shape(", input.shape().DebugString(),
") filter shape(", filter.shape().DebugString(), ")"));
}
if (data_format == FORMAT_NHWC) {
// t_output: [b, out, x, y, z]
// output: [b, x, y, z, out]
functor::NCHWToNHWC<GPUDevice, T, 5>()(
ctx->eigen_device<GPUDevice>(),
const_cast<const Tensor&>(transformed_output).tensor<T, 5>(),
output->tensor<T, 5>());
} else {
*output = transformed_output;
}
}