static void launch()

in tensorflow/tensorflow/core/kernels/conv_ops_3d.cc [210:555]


  static void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
                     const Tensor& input_param, const Tensor& filter,
                     const std::array<int64, 3>& dilations,
                     const std::array<int64, 3>& strides, const Padding padding,
                     TensorFormat data_format, Tensor* output) {
    auto* stream = ctx->op_device_context()->stream();
    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

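    // Work on a local handle to the input tensor; it may be replaced below by
    // padded or layout-transposed temporaries.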
    Tensor input = input_param;

    const int64 in_batch = GetTensorDim(input, data_format, 'N');
    int64 in_planes = GetTensorDim(input, data_format, '0');
    int64 in_rows = GetTensorDim(input, data_format, '1');
    int64 in_cols = GetTensorDim(input, data_format, '2');
    const int64 in_depth = GetTensorDim(input, data_format, 'C');

    const int64 filter_planes = filter.dim_size(0);
    const int64 filter_rows = filter.dim_size(1);
    const int64 filter_cols = filter.dim_size(2);
    const int64 filter_depth = filter.dim_size(3);
    const int64 out_depth = filter.dim_size(4);

    int64 pad_planes = 0, pad_rows = 0, pad_cols = 0;
    int64 out_planes = GetTensorDim(*output, data_format, '0');
    int64 out_rows = GetTensorDim(*output, data_format, '1');
    int64 out_cols = GetTensorDim(*output, data_format, '2');

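    // For SAME padding, the total padding required in each spatial dimension
    // is max(0, (out - 1) * stride + filter - in); it is split between the
    // two sides further below.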
    if (padding == Padding::SAME) {
      pad_planes = std::max<int64>(
          0, (out_planes - 1) * strides[0] + filter_planes - in_planes);
      pad_rows = std::max<int64>(
          0, (out_rows - 1) * strides[1] + filter_rows - in_rows);
      pad_cols = std::max<int64>(
          0, (out_cols - 1) * strides[2] + filter_cols - in_cols);
    }

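    // If the filter's input-channel count differs from the input depth this is
    // a grouped convolution (group count = in_depth / filter_depth), which
    // disqualifies the cuBLAS fast paths below.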
    bool is_grouped_convolution = filter_depth != in_depth;

    // NOTE: The two cuBLAS fast paths below apply only to the NHWC
    // (channels-last) layout.
    if (!is_grouped_convolution && filter_planes == 1 && filter_rows == 1 &&
        filter_cols == 1 && dilations[0] == 1 && dilations[1] == 1 &&
        dilations[2] == 1 && strides[0] == 1 && strides[1] == 1 &&
        strides[2] == 1 && data_format == FORMAT_NHWC) {
      // 1x1x1 filter with unit strides and dilations: the convolution reduces
      // to a matmul over channels, so call cuBLAS directly.
      const uint64 m = in_batch * in_planes * in_rows * in_cols;
      const uint64 k = in_depth;
      const uint64 n = out_depth;

      auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                  input.template flat<T>().size());
      auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                  filter.template flat<T>().size());
      auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                  output->template flat<T>().size());

      auto no_transpose = se::blas::Transpose::kNoTranspose;
      bool blas_launch_status =
          stream
              ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
                             n, a_ptr, k, 0.0f, &c_ptr, n)
              .ok();
      if (!blas_launch_status) {
        ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
                                        ", n=", n, ", k=", k));
      }
      return;
    } else if (!is_grouped_convolution && filter_planes == in_planes &&
               filter_rows == in_rows && filter_cols == in_cols &&
               padding == Padding::VALID && data_format == FORMAT_NHWC) {
      // The filter spans the entire input volume and padding is VALID, so each
      // batch element reduces to one dot product per output channel; call
      // cuBLAS directly.
      const uint64 m = in_batch;
      const uint64 k = in_planes * in_rows * in_cols * in_depth;
      const uint64 n = out_depth;

      auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                  input.template flat<T>().size());
      auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                  filter.template flat<T>().size());
      auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                  output->template flat<T>().size());

      auto no_transpose = se::blas::Transpose::kNoTranspose;
      bool blas_launch_status =
          stream
              ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
                             n, a_ptr, k, 0.0f, &c_ptr, n)
              .ok();
      if (!blas_launch_status) {
        ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
                                        ", n=", n, ", k=", k));
      }
      return;
    }

    if (padding == Padding::SAME) {
      const bool rows_odd = (pad_rows % 2 != 0);
      const bool cols_odd = (pad_cols % 2 != 0);
      const bool planes_odd = (pad_planes % 2 != 0);

      // Necessary because cuDNN only supports symmetric padding.
      // TODO(mjanusz): Consider making this optional? This would save some
      // overhead and would work as long as an op trained this way is only
      // used on GPU.
      if (rows_odd || cols_odd || planes_odd) {
        const int64 new_in_rows = in_rows + rows_odd;
        const int64 new_in_cols = in_cols + cols_odd;
        const int64 new_in_planes = in_planes + planes_odd;

        Tensor transformed_input;
        TensorShape transformed_shape = ShapeFromFormat(
            data_format, in_batch, {{new_in_planes, new_in_rows, new_in_cols}},
            in_depth);
        OP_REQUIRES_OK(
            ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, transformed_shape,
                                    &transformed_input));

        functor::PadInput<GPUDevice, T, int, 5>()(
            ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 5>()),
            {{0, 0, 0}}, {{planes_odd, rows_odd, cols_odd}},
            To32Bit(transformed_input.tensor<T, 5>()), data_format);
        input = transformed_input;
        in_rows = new_in_rows;
        in_cols = new_in_cols;
        in_planes = new_in_planes;
      }
    }

    if (data_format == FORMAT_NHWC) {
      const TensorShape nchw_shape = ShapeFromFormat(
          FORMAT_NCHW, in_batch, {{in_planes, in_rows, in_cols}}, in_depth);
      if (in_depth > 1) {
        Tensor transformed_input;
        OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                               nchw_shape, &transformed_input));
        // input: [b, x, y, z, d]
        // t_input: [b, d, x, y, z]
        // NCDHW is the only format universally supported by cuDNN.
        functor::NHWCToNCHW<GPUDevice, T, 5>()(
            ctx->eigen_device<GPUDevice>(),
            const_cast<const Tensor&>(input).tensor<T, 5>(),
            transformed_input.tensor<T, 5>());
        input = transformed_input;
      } else {
        CHECK(input.CopyFrom(input, nchw_shape));
      }
    }

    CHECK(pad_rows >= 0 && pad_cols >= 0 && pad_planes >= 0)
        << "Negative paddings: (" << pad_rows << ", " << pad_cols << ", "
        << pad_planes << ")";
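    // Describe the convolution to StreamExecutor: spatial dimensions map as
    // X = cols, Y = rows, Z = planes, and tensors use the NCDHW
    // (kBatchDepthYX) layout.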
    se::dnn::BatchDescriptor input_desc(3);
    input_desc.set_count(in_batch)
        .set_feature_map_count(in_depth)
        .set_spatial_dim(DimIndex::X, in_cols)
        .set_spatial_dim(DimIndex::Y, in_rows)
        .set_spatial_dim(DimIndex::Z, in_planes)
        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
    se::dnn::BatchDescriptor output_desc(3);
    output_desc.set_count(in_batch)
        .set_spatial_dim(DimIndex::X, out_cols)
        .set_spatial_dim(DimIndex::Y, out_rows)
        .set_spatial_dim(DimIndex::Z, out_planes)
        .set_feature_map_count(out_depth)
        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
    se::dnn::FilterDescriptor filter_desc(3);
    filter_desc.set_spatial_dim(DimIndex::X, filter_cols)
        .set_spatial_dim(DimIndex::Y, filter_rows)
        .set_spatial_dim(DimIndex::Z, filter_planes)
        .set_input_feature_map_count(filter_depth)
        .set_output_feature_map_count(out_depth);
    se::dnn::ConvolutionDescriptor conv_desc(3);
    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
        .set_dilation_rate(DimIndex::Y, dilations[1])
        .set_dilation_rate(DimIndex::Z, dilations[0])
        .set_filter_stride(DimIndex::X, strides[2])
        .set_filter_stride(DimIndex::Y, strides[1])
        .set_filter_stride(DimIndex::Z, strides[0])
        .set_zero_padding(DimIndex::X, pad_cols / 2)
        .set_zero_padding(DimIndex::Y, pad_rows / 2)
        .set_zero_padding(DimIndex::Z, pad_planes / 2)
        .set_group_count(in_depth / filter_depth);

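    // cuDNN expects the filter with output channels outermost, so transpose it
    // into a temporary buffer.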
    Tensor transformed_filter;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                TensorShape({out_depth, in_depth, filter_planes,
                                             filter_rows, filter_cols}),
                                &transformed_filter));
    // filter: [x, y, z, in, out]
    // t_filter: [out, in, x, y, z]
    functor::TransformFilter<GPUDevice, T, int, 5>()(
        ctx->eigen_device<GPUDevice>(), FORMAT_OIHW,
        To32Bit(filter.tensor<T, 5>()),
        To32Bit(transformed_filter.tensor<T, 5>()));

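    // Compute the convolution into an NCDHW temporary; it is transposed back
    // to NDHWC at the end if the caller uses channels-last.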
    Tensor transformed_output;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(
                 DataTypeToEnum<T>::value,
                 ShapeFromFormat(FORMAT_NCHW, in_batch,
                                 {{out_planes, out_rows, out_cols}}, out_depth),
                 &transformed_output));

    auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                    input.template flat<T>().size());
    auto filter_ptr =
        AsDeviceMemory(transformed_filter.template flat<T>().data(),
                       transformed_filter.template flat<T>().size());
    auto output_ptr =
        AsDeviceMemory(transformed_output.template flat<T>().data(),
                       transformed_output.template flat<T>().size());

    static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
        "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);  // 4GB by default

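    // Parameters that uniquely identify this convolution; used as the key into
    // the autotune cache.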
    int device_id = stream->parent()->device_ordinal();
    DataType dtype = input.dtype();
    ConvParameters conv_parameters = {
        in_batch,
        in_depth,
        {{in_planes, in_rows, in_cols}},
        FORMAT_NCHW,
        out_depth,
        {{filter_planes, filter_rows, filter_cols}},
        {{dilations[0], dilations[1], dilations[2]}},
        {{strides[0], strides[1], strides[2]}},
        {{pad_planes, pad_rows, pad_cols}},
        dtype,
        device_id,
        conv_desc.group_count()};

    using se::dnn::AlgorithmConfig;
    using se::dnn::AlgorithmDesc;
    using se::dnn::ProfileResult;

    AlgorithmConfig algorithm_config;

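    // If autotuning is enabled and no algorithm is cached for these
    // parameters, profile the available algorithms and remember the best one.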
    if (cudnn_use_autotune && !AutoTuneConv3d::GetInstance()->Find(
                                  conv_parameters, &algorithm_config)) {
#if GOOGLE_CUDA
      se::TfAllocatorAdapter tf_allocator_adapter(
          ctx->device()->GetAllocator({}), stream);
      se::cuda::RedzoneAllocator rz_allocator(
          stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions());
      se::DeviceMemory<T> output_ptr_rz(
          WrapRedzoneBestEffort(&rz_allocator, output_ptr));
      std::vector<AlgorithmDesc> algorithms;
      OP_REQUIRES(ctx,
                  stream->parent()->GetConvolveAlgorithms(
                      conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
                          stream->parent()),
                      &algorithms),
                  errors::Unknown(
                      "Failed to get convolution algorithm. This is probably "
                      "because cuDNN failed to initialize, so try looking to "
                      "see if a warning log message was printed above."));

      std::vector<tensorflow::AutotuneResult> results;
      for (auto profile_algorithm : algorithms) {
        // TODO(zhengxq): profile each algorithm multiple times for better
        // accuracy.
        DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
        se::cuda::RedzoneAllocator rz_scratch_allocator(
            stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions(),
            /*memory_limit=*/ConvolveScratchSize);
        se::ScratchAllocator* allocator_used =
            !RedzoneCheckDisabled()
                ? static_cast<se::ScratchAllocator*>(&rz_scratch_allocator)
                : static_cast<se::ScratchAllocator*>(&scratch_allocator);
        ProfileResult profile_result;
        bool cudnn_launch_status =
            stream
                ->ThenConvolveWithAlgorithm(
                    input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
                    output_desc, &output_ptr_rz, allocator_used,
                    AlgorithmConfig(profile_algorithm), &profile_result)
                .ok();
        if (cudnn_launch_status) {
          if (profile_result.is_valid()) {
            results.emplace_back();
            auto& result = results.back();
            result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
            result.mutable_conv()->set_tensor_ops_enabled(
                profile_algorithm.tensor_ops_enabled());
            result.set_scratch_bytes(
                !RedzoneCheckDisabled()
                    ? rz_scratch_allocator
                          .TotalAllocatedBytesExcludingRedzones()
                    : scratch_allocator.TotalByteSize());
            *result.mutable_run_time() = proto_utils::ToDurationProto(
                absl::Milliseconds(profile_result.elapsed_time_in_ms()));
            CheckRedzones(rz_scratch_allocator, &result);
            CheckRedzones(rz_allocator, &result);
          }
        }
      }
      LogConvAutotuneResults(se::dnn::ConvolutionKind::FORWARD,
                             se::dnn::ToDataType<T>::value, input_ptr,
                             filter_ptr, output_ptr, input_desc, filter_desc,
                             output_desc, conv_desc, stream->parent(), results);
      OP_REQUIRES_OK(ctx, BestCudnnConvAlgorithm(results, &algorithm_config));
#elif TENSORFLOW_USE_ROCM
      ProfileResult best_result;
      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
      bool miopen_find_status =
          stream
              ->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
                                          filter_ptr, conv_desc, output_desc,
                                          &output_ptr, &scratch_allocator,
                                          AlgorithmConfig(), &best_result)
              .ok();
      OP_REQUIRES(ctx, miopen_find_status && best_result.is_valid(),
                  errors::NotFound("Failed to find conv algorithm!"));
      algorithm_config.set_algorithm(best_result.algorithm());
      algorithm_config.set_scratch_size(best_result.scratch_size());
#endif
      AutoTuneConv3d::GetInstance()->Insert(conv_parameters, algorithm_config);
    }

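    // Launch the convolution with the autotuned (or default) algorithm.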
    DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
    bool cudnn_launch_status =
        stream
            ->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
                                        filter_ptr, conv_desc, output_desc,
                                        &output_ptr, &scratch_allocator,
                                        algorithm_config, nullptr)
            .ok();

    if (!cudnn_launch_status) {
      ctx->SetStatus(errors::Internal(
          "cuDNN launch failure : input shape(", input.shape().DebugString(),
          ") filter shape(", filter.shape().DebugString(), ")"));
    }

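    // Transpose the NCDHW result back to NDHWC if the caller uses
    // channels-last; otherwise hand over the NCDHW tensor directly.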
    if (data_format == FORMAT_NHWC) {
      // t_output: [b, out, x, y, z]
      // output: [b, x, y, z, out]
      functor::NCHWToNHWC<GPUDevice, T, 5>()(
          ctx->eigen_device<GPUDevice>(),
          const_cast<const Tensor&>(transformed_output).tensor<T, 5>(),
          output->tensor<T, 5>());
    } else {
      *output = transformed_output;
    }
  }