in functorch/csrc/PointwiseOperatorCompileCache.cpp [374:447]
void call(at::Tensor *args) {
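  // Shape guards for the cached kernel: each check compares the size of one
  // recorded (tensor, dim) pair against another and throws if they no longer
  // match, i.e. these inputs do not broadcast the way the cached kernel assumed.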
  for (const auto &ck : shapeChecks_) {
    if (args[std::get<0>(ck)].size(std::get<1>(ck)) !=
        args[std::get<2>(ck)].size(std::get<3>(ck))) {
      // TODO(jansel): make this error message match aten
      throw std::runtime_error("The size of tensor A must match the size of "
                               "tensor B at non-singleton dimension X");
    }
  }
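  // Flat argument vector handed to the generated kernel: data pointers for the
  // caller-provided tensors first, then data pointers for the outputs allocated
  // below, then pointers to the requested stride values, then pointers to the
  // shape values.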
  // NOLINTNEXTLINE: C-style arrays
  void *callArgs[Counts::numBuffers + (Counts::numKeys + 1) * MAX_DIMS];
  constexpr int allocatedArgsOffset = Counts::numKeys;
  for (int i = 0; i < allocatedArgsOffset; ++i) {
    callArgs[i] = args[i].data_ptr();
  }
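  // Stride arguments: each entry of strideArgsFrom_ is a (tensor index, dim)
  // pair, and the kernel receives a pointer directly into that tensor's
  // strides() array.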
  constexpr int strideArgsOffset =
      allocatedArgsOffset + Counts::numOutAllocated;
  for (int i : c10::irange(strideArgsFrom_.size())) {
    auto &item = strideArgsFrom_[i];
    callArgs[strideArgsOffset + i] =
        // NOLINTNEXTLINE: const_cast
        const_cast<int64_t *>(&args[item.first].strides()[item.second]);
  }
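  // Shape arguments: gather the iteration-space sizes from the recorded
  // (tensor, dim) pairs in shapeFrom_, accumulate the total element count,
  // and pass a pointer to each size to the kernel.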
  int shapeArgsOffset = strideArgsOffset + strideArgsFrom_.size();
  size_t numel = 1;
  // NOLINTNEXTLINE: C-style arrays
  int64_t shapes[MAX_DIMS];
  int ndims = shapeFrom_.size();
  for (int i = 0; i < ndims; ++i) {
    shapes[i] = args[shapeFrom_[i].first].size(shapeFrom_[i].second);
    numel *= shapes[i];
    callArgs[shapeArgsOffset + i] = &shapes[i];
  }
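  // Allocate the outputs owned by this kernel. outputOrder lists dimensions
  // from innermost to outermost, so each listed dim's stride is the running
  // product of the sizes seen before it; dtype/device options are copied from
  // the tensor at index optionsFrom.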
  for (int i = 0; i < Counts::numOutAllocated; ++i) {
    int optionsFrom = allocatedOutputs_[i].first;
    auto &outputOrder = allocatedOutputs_[i].second;
    // NOLINTNEXTLINE: C-style arrays
    int64_t strides[MAX_DIMS];
    int64_t nextStride = 1;
    for (int j : outputOrder) {
      strides[j] = nextStride;
      nextStride *= shapes[j];
    }
    args[allocatedArgsOffset + i] =
        at::empty_strided(c10::IntArrayRef(shapes, shapes + ndims),
                          c10::IntArrayRef(strides, strides + ndims),
                          args[optionsFrom].options());
    callArgs[allocatedArgsOffset + i] =
        args[allocatedArgsOffset + i].data_ptr();
  }
  // Release the GIL before calling the kernel, unless the kernel is tiny
  // (too little work to be worth dropping and reacquiring the GIL).
  if (numel < 128) {
    // TODO(jansel): should we also skip releasing the GIL on GPU?
    cg_->call_with_numel(callArgs, numel);
  } else {
    py::gil_scoped_release release;
    cg_->call_with_numel(callArgs, numel);
  }
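  // If compiled backwards functions exist, wire the outputs into the autograd
  // graph: a single CompiledAutoGradNode owns the backwards functions, and
  // every output tensor gets a gradient edge pointing at that node.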
  if (backwards_functions_.size() > 0) {
    std::shared_ptr<CompiledAutoGradNode> node(new CompiledAutoGradNode(),
                                               torch::autograd::deleteNode);
    node->setup(backwards_functions_, args, Counts::numIn);
    for (int i = 0; i < Counts::numOut; ++i) {
      torch::autograd::create_gradient_edge(args[Counts::numIn + i], node);
    }
  }
}