EIGEN_DONT_INLINE void operator()

in backends/common/include/tfrt/common/compat/eigen/spatial_convolution_data_mapper.h [985:1156]


  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
                                    Index depth, Index num_cols,
                                    Index stride = 0, Index offset = 0) const {
    assert(stride == 0);
    assert(offset == 0);

    const Index packet_cols4 = (num_cols / 4) * 4;
    const Index peeled_k = (depth / packet_size) * packet_size;
    const bool non_standard_patches = rhs.nonStandardPatches();

    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

      Index k = 0;
      if ((packet_size % 4) == 0 && !non_standard_patches) {
        // FAST PATH: Iterate over patch columns and rows, if we know that a
        // single packet does not span across multiple rows or columns.
        if ((rhs.patchDepth() % packet_size) == 0) {
          const Index start_col = rhs.colOffset();
          const Index max_col = rhs.maxCol(peeled_k);

          for (Index c = start_col; c < max_col; ++c) {
            assert(k <= peeled_k);

            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
            const Index max_row = rhs.maxRow(peeled_k, c);

            const bool pad_col0 = dm0.padCol(c);
            const bool pad_col1 = dm1.padCol(c);
            const bool pad_col2 = dm2.padCol(c);
            const bool pad_col3 = dm3.padCol(c);

            // Check if we can squeeze reads along the `row` and `depth`
            // dimensions (two innermost dimensions).
            if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&
                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&
                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&
                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&
                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
              // Compute how many elements we can squeeze read.
              const Index start_depth =
                  (c == start_col) ? rhs.depthOffset() : 0;

              // Upper bound for the number of elements in the depth dimension
              // that we can squeeze read.
              const Index squeeze_length =
                  (max_row - start_row) * rhs.patchDepth() - start_depth;

              // Do not overshoot beyond the block size.
              const Index max_depth =
                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
              assert((max_depth - start_depth) % packet_size == 0);

              const Index idx0 = dm0.baseIndex(start_row, c);
              const Index idx1 = dm1.baseIndex(start_row, c);
              const Index idx2 = dm2.baseIndex(start_row, c);
              const Index idx3 = dm3.baseIndex(start_row, c);

              for (Index d = start_depth; d < max_depth; d += packet_size) {
                assert(k < peeled_k);
                PacketBlock<Packet, 4> kernel;
                kernel.packet[0] = rhs.packetNoPadding(d, idx0);
                kernel.packet[1] = rhs.packetNoPadding(d, idx1);
                kernel.packet[2] = rhs.packetNoPadding(d, idx2);
                kernel.packet[3] = rhs.packetNoPadding(d, idx3);
                ptranspose(kernel);
                pstoreu(block + 0 * packet_size, kernel.packet[0]);
                pstoreu(block + 1 * packet_size, kernel.packet[1]);
                pstoreu(block + 2 * packet_size, kernel.packet[2]);
                pstoreu(block + 3 * packet_size, kernel.packet[3]);
                block += 4 * packet_size;
                k += packet_size;
              }

              // Go to the next column.
              continue;
            }

            // If we can't squeeze reads, process rows one by one.
            for (Index r = start_row; r < max_row; ++r) {
              assert(k <= peeled_k);

              const bool pad0 = pad_col0 || dm0.padRow(r);
              const bool pad1 = pad_col1 || dm1.padRow(r);
              const bool pad2 = pad_col2 || dm2.padRow(r);
              const bool pad3 = pad_col3 || dm3.padRow(r);

              const Index idx0 = dm0.baseIndex(r, c);
              const Index idx1 = dm1.baseIndex(r, c);
              const Index idx2 = dm2.baseIndex(r, c);
              const Index idx3 = dm3.baseIndex(r, c);

              const Index start_depth = ((c == start_col) && (r == start_row))
                                            ? rhs.depthOffset()
                                            : 0;
              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
              assert((max_depth - start_depth) % packet_size == 0);

              for (Index d = start_depth; d < max_depth; d += packet_size) {
                assert(k < peeled_k);
                PacketBlock<Packet, 4> kernel;
                kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
                                        : rhs.packetNoPadding(d, idx0);
                kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
                                        : rhs.packetNoPadding(d, idx1);
                kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0))
                                        : rhs.packetNoPadding(d, idx2);
                kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0))
                                        : rhs.packetNoPadding(d, idx3);
                ptranspose(kernel);
                pstoreu(block + 0 * packet_size, kernel.packet[0]);
                pstoreu(block + 1 * packet_size, kernel.packet[1]);
                pstoreu(block + 2 * packet_size, kernel.packet[2]);
                pstoreu(block + 3 * packet_size, kernel.packet[3]);
                block += 4 * packet_size;
                k += packet_size;
              }
            }
          }

          // The loop above should fill peeled_k elements.
          assert(peeled_k == k);

        } else {
          for (; k < peeled_k; k += packet_size) {
            PacketBlock<Packet, 4> kernel;
            kernel.packet[0] = dm0.loadPacketStandard(k);
            kernel.packet[1] = dm1.loadPacketStandard(k);
            kernel.packet[2] = dm2.loadPacketStandard(k);
            kernel.packet[3] = dm3.loadPacketStandard(k);
            ptranspose(kernel);
            pstoreu(block + 0 * packet_size, kernel.packet[0]);
            pstoreu(block + 1 * packet_size, kernel.packet[1]);
            pstoreu(block + 2 * packet_size, kernel.packet[2]);
            pstoreu(block + 3 * packet_size, kernel.packet[3]);
            block += 4 * packet_size;
          }
        }
      }

      // Copy the remaining coefficients of the column block after the peeled_k.
      if (!rhs.nonStandardPatches()) {
        for (; k < depth; k++) {
          block[0] = dm0.loadCoeffStandard(k);
          block[1] = dm1.loadCoeffStandard(k);
          block[2] = dm2.loadCoeffStandard(k);
          block[3] = dm3.loadCoeffStandard(k);
          block += 4;
        }
      } else {
        for (; k < depth; k++) {
          block[0] = dm0(k);
          block[1] = dm1(k);
          block[2] = dm2(k);
          block[3] = dm3(k);
          block += 4;
        }
      }
    }

    // Copy the remaining columns one at a time (nr==1).
    for (Index j2 = packet_cols4; j2 < num_cols; ++j2) {
      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
      for (Index k = 0; k < depth; k++) {
        *block = dm0(k);
        block += 1;
      }
    }
  }