void call(at::Tensor *args)

in functorch/csrc/PointwiseOperatorCompileCache.cpp [374:447]


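  // Runs a previously compiled pointwise kernel. The first Counts::numKeys
  // entries of `args` are existing tensors; the next Counts::numOutAllocated
  // slots are filled below with freshly allocated outputs. The call re-checks
  // the broadcast-shape assumptions, packs data/stride/shape pointers into
  // callArgs, invokes the generated kernel, and attaches an autograd node
  // when backward functions were recorded.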
  void call(at::Tensor *args) {
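    // shapeChecks_ holds (tensor, dim, tensor, dim) tuples whose two sizes
    // must agree, presumably the broadcast equalities assumed when the
    // kernel was compiled.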
    for (const auto &ck : shapeChecks_) {
      if (args[std::get<0>(ck)].size(std::get<1>(ck)) !=
          args[std::get<2>(ck)].size(std::get<3>(ck))) {
        // TODO(jansel): make this error message match aten
        throw std::runtime_error("The size of tensor A must match the size of "
                                 "tensor B at non-singleton dimension X");
      }
    }

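    // Argument buffer handed to the generated kernel: data pointers for the
    // key tensors, then data pointers for the outputs allocated below, then
    // pointers to selected strides, then pointers to the dimension sizes.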
    // NOLINTNEXTLINE: C-style arrays
    void *callArgs[Counts::numBuffers + (Counts::numKeys + 1) * MAX_DIMS];
    constexpr int allocatedArgsOffset = Counts::numKeys;
    for (int i = 0; i < allocatedArgsOffset; ++i) {
      callArgs[i] = args[i].data_ptr();
    }

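    // Stride arguments: each strideArgsFrom_ entry is a (tensor, dim) pair
    // whose stride the kernel reads through a pointer.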
    constexpr int strideArgsOffset =
        allocatedArgsOffset + Counts::numOutAllocated;
    for (const auto i : c10::irange(strideArgsFrom_.size())) {
      auto &item = strideArgsFrom_[i];
      callArgs[strideArgsOffset + i] =
          // NOLINTNEXTLINE: const_cast
          const_cast<int64_t *>(&args[item.first].strides()[item.second]);
    }

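    // Shape arguments: read each dimension's size from the (tensor, dim)
    // recorded in shapeFrom_, accumulate the total element count, and pass
    // pointers to the sizes through callArgs.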
    int shapeArgsOffset = strideArgsOffset + strideArgsFrom_.size();
    size_t numel = 1;
    // NOLINTNEXTLINE: C-style arrays
    int64_t shapes[MAX_DIMS];
    int ndims = shapeFrom_.size();
    for (int i = 0; i < ndims; ++i) {
      shapes[i] = args[shapeFrom_[i].first].size(shapeFrom_[i].second);
      numel *= shapes[i];
      callArgs[shapeArgsOffset + i] = &shapes[i];
    }

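    // Allocate outputs not supplied by the caller. outputOrder lists the
    // dimensions innermost-first, so walking it yields strides that are
    // contiguous in that order; allocation options come from the tensor at
    // index optionsFrom.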
    for (int i = 0; i < Counts::numOutAllocated; ++i) {
      int optionsFrom = allocatedOutputs_[i].first;
      auto &outputOrder = allocatedOutputs_[i].second;
      // NOLINTNEXTLINE: C-style arrays
      int64_t strides[MAX_DIMS];
      int64_t nextStride = 1;
      for (int j : outputOrder) {
        strides[j] = nextStride;
        nextStride *= shapes[j];
      }
      args[allocatedArgsOffset + i] =
          at::empty_strided(c10::IntArrayRef(shapes, shapes + ndims),
                            c10::IntArrayRef(strides, strides + ndims),
                            args[optionsFrom].options());
      callArgs[allocatedArgsOffset + i] =
          args[allocatedArgsOffset + i].data_ptr();
    }

    // Release the GIL before calling the kernel, unless the launch is tiny
    // (numel < 128) and the release/acquire round-trip would likely cost
    // more than the kernel itself.
    if (numel < 128) {
      // TODO(jansel): should we also skip releasing the GIL on GPU?
      cg_->call_with_numel(callArgs, numel);
    } else {
      py::gil_scoped_release release;
      cg_->call_with_numel(callArgs, numel);
    }

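    // If backward functions were compiled, build a CompiledAutoGradNode over
    // the inputs and hook every output into the autograd graph.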
    if (!backwards_functions_.empty()) {
      std::shared_ptr<CompiledAutoGradNode> node(new CompiledAutoGradNode(),
                                                 torch::autograd::deleteNode);
      node->setup(backwards_functions_, args, Counts::numIn);
      for (int i = 0; i < Counts::numOut; ++i) {
        torch::autograd::create_gradient_edge(args[Counts::numIn + i], node);
      }
    }
  }
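
For illustration, here is a minimal standalone sketch of the stride construction used when allocating outputs above: each dimension in the order list (innermost first) gets the running product of the sizes placed before it. The helper name makeStrides and its signature are hypothetical, not part of the cache.

  #include <cstdint>
  #include <vector>

  // Hypothetical helper mirroring the loop over outputOrder above: `order`
  // lists dimensions innermost-first, so dimension order[0] gets stride 1.
  std::vector<int64_t> makeStrides(const std::vector<int64_t> &shapes,
                                   const std::vector<int> &order) {
    std::vector<int64_t> strides(shapes.size(), 0);
    int64_t nextStride = 1;
    for (int j : order) {
      strides[j] = nextStride;
      nextStride *= shapes[j];
    }
    return strides;
  }

  // Example: shapes {2, 3, 4} with order {2, 1, 0} gives the contiguous
  // row-major strides {12, 4, 1}; order {0, 1, 2} gives {1, 2, 6}.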