void fn_impl()

in src/core/tensor/tensor_math_cuda.h [303:344]


  /* Runs the binary `kernel` over in1 and in2, writing the result to out.    \
   *                                                                          \
   * `kernel` is called as kernel(num, lhs, rhs, dst, ctx->stream) on raw     \
   * buffers, so an operand that is transposed or broadcast is first copied   \
   * with Transform<T, lang::Cuda> (assumed to yield a dense copy in the      \
   * layout of the destination; confirm). In the non-broadcast path `out`     \
   * doubles as the scratch buffer for one of the operands.                   \
   *                                                                          \
   * NOTE(review): staging into `out` would clobber an input that shares      \
   * its block -- confirm callers never alias out with in1/in2 here.          \
   */                                                                         \
  void fn_impl(const Tensor& in1, const Tensor& in2, Tensor* out,             \
               Context* ctx) {                                                \
    const T* inPtr1 = static_cast<const T*>(in1.block()->data());             \
    const T* inPtr2 = static_cast<const T*>(in2.block()->data());             \
    T* outPtr = static_cast<T*>(out->block()->mutable_data());                \
    const size_t num = out->Size();                                           \
    const bool bc1 = in1.broadcasted();                                       \
    const bool bc2 = in2.broadcasted();                                       \
    const bool tr1 = in1.transpose();                                         \
    const bool tr2 = in2.transpose();                                         \
                                                                              \
    if (!bc1 && !bc2) {                                                       \
      if (!tr1 && !tr2 && in1.stride() == in2.stride()) {                     \
        /* Matching non-transposed layouts: read both inputs directly. */     \
        kernel(num, inPtr1, inPtr2, outPtr, ctx->stream);                     \
      } else if (tr1 && !tr2) {                                               \
        /* Only in1 is transposed: stage its copy in out. */                  \
        Transform<T, lang::Cuda>(in1, out, ctx);                              \
        kernel(num, outPtr, inPtr2, outPtr, ctx->stream);                     \
      } else if (!tr1 && tr2) {                                               \
        /* Only in2 is transposed: stage its copy in out. */                  \
        Transform<T, lang::Cuda>(in2, out, ctx);                              \
        kernel(num, inPtr1, outPtr, outPtr, ctx->stream);                     \
      } else {                                                                \
        /* Both transposed, or neither transposed but with differing          \
         * strides (previously this second case launched nothing and left     \
         * out unwritten): normalise both operands before the kernel. */      \
        Tensor t(in1.shape(), in1.device(), in1.data_type());                 \
        Transform<T, lang::Cuda>(in1, &t, ctx);                               \
        Transform<T, lang::Cuda>(in2, out, ctx);                              \
                                                                              \
        T* tPtr = static_cast<T*>(t.block()->mutable_data());                 \
        kernel(num, tPtr, outPtr, outPtr, ctx->stream);                       \
      }                                                                       \
    } else {                                                                  \
      /* At least one operand is broadcast. Every operand that is broadcast   \
       * or transposed is expanded into its own temporary; previously a       \
       * transposed-only operand was passed to the kernel untransformed. */   \
      Tensor in1bc, in2bc;                                                    \
      if (bc1 || tr1) {                                                       \
        in1bc = Tensor(in1.shape(), in1.device(), in1.data_type());           \
        Transform<T, lang::Cuda>(in1, &in1bc, ctx);                           \
        inPtr1 = static_cast<const T*>(in1bc.block()->data());                \
      }                                                                       \
      if (bc2 || tr2) {                                                       \
        in2bc = Tensor(in2.shape(), in2.device(), in2.data_type());           \
        Transform<T, lang::Cuda>(in2, &in2bc, ctx);                           \
        inPtr2 = static_cast<const T*>(in2bc.block()->data());                \
      }                                                                       \
      kernel(num, inPtr1, inPtr2, outPtr, ctx->stream);                       \
    }                                                                         \
  }                                                                           \