in src/core/tensor/tensor_math_cuda.h [303:344]
void fn_impl(const Tensor& in1, const Tensor& in2, Tensor* out, \
Context* ctx) { \
const T* inPtr1 = static_cast<const T*>(in1.block()->data()); \
const T* inPtr2 = static_cast<const T*>(in2.block()->data()); \
T* outPtr = static_cast<T*>(out->block()->mutable_data()); \
const size_t num = out->Size(); \
\
if (!in1.broadcasted() && !in2.broadcasted()) { \
if (!in1.transpose() && !in2.transpose() && \
(in1.stride() == in2.stride())) { \
kernel(num, inPtr1, inPtr2, outPtr, ctx->stream); \
} else { \
if (in1.transpose() && in2.transpose()) { \
Tensor t(in1.shape(), in1.device(), in1.data_type()); \
Transform<T, lang::Cuda>(in1, &t, ctx); \
Transform<T, lang::Cuda>(in2, out, ctx); \
\
T* tPtr = static_cast<T*>(t.block()->mutable_data()); \
kernel(num, tPtr, outPtr, outPtr, ctx->stream); \
} else if (in1.transpose()) { \
Transform<T, lang::Cuda>(in1, out, ctx); \
kernel(num, outPtr, inPtr2, outPtr, ctx->stream); \
} else if (in2.transpose()) { \
Transform<T, lang::Cuda>(in2, out, ctx); \
kernel(num, inPtr1, outPtr, outPtr, ctx->stream); \
} \
} \
} else { \
Tensor in1bc, in2bc; \
if (in1.broadcasted()) { \
in1bc = Tensor(in1.shape(), in1.device(), in1.data_type()); \
Transform<T, lang::Cuda>(in1, &in1bc, ctx); \
inPtr1 = static_cast<const T*>(in1bc.block()->data()); \
} \
if (in2.broadcasted()) { \
in2bc = Tensor(in2.shape(), in2.device(), in2.data_type()); \
Transform<T, lang::Cuda>(in2, &in2bc, ctx); \
inPtr2 = static_cast<const T*>(in2bc.block()->data()); \
} \
kernel(num, inPtr1, inPtr2, outPtr, ctx->stream); \
} \
} \