// packStandardPatches()
// Extracted from:
// backends/common/include/tfrt/common/compat/eigen/spatial_convolution_data_mapper.h [1596:1771]
  // Packs a `num_rows x num_cols` right-hand-side block for the convolution
  // GEMM, fast path for "standard" patches (rhs.nonStandardPatches() must be
  // false), where input values can be read with direct packet loads.
  //
  // Layout: one column at a time. The first `peeled_k` rows of each column are
  // filled with whole packets; when a [start_row, max_row) range touches no
  // padding, reads are "squeezed" across the row and depth dimensions so depth
  // can be copied in long unrolled runs. Rows that may touch padding are
  // processed one by one, substituting zero packets for padded positions. The
  // trailing `num_rows - peeled_k` rows fall back to scalar loads.
  //
  // NOTE: all debug checks use eigen_assert (not plain assert) so that
  // EIGEN_NO_DEBUG and custom eigen_assert definitions apply uniformly.
  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
                                               const DataMapper rhs,
                                               IndexType num_rows,
                                               IndexType num_cols) {
    eigen_assert(!rhs.nonStandardPatches());

    // Give vectorized_rows the name used in all other gemm_pack_rhs above.
    // `peeled_k` is the largest multiple of packet_size not exceeding
    // num_rows: the portion of each output column we can fill with packets.
    const IndexType peeled_k = (num_rows / packet_size) * packet_size;

    const IndexType start_col = rhs.colOffset();
    const IndexType max_col = rhs.maxCol(peeled_k);
    const IndexType depth_offset = rhs.depthOffset();

    for (IndexType col = 0; col < num_cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      // `k` counts scalars written into `block` for this column; the packet
      // loop below must produce exactly peeled_k of them.
      IndexType k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        // Only the first patch column starts at the row/depth offset; later
        // columns always start from row 0 / depth 0.
        const IndexType start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const IndexType max_row = rhs.maxRow(peeled_k, c);
        const bool pad_col = has_padding && lm.padCol(c);

        eigen_assert(has_padding || !lm.padCol(c));
        eigen_assert(has_padding || !lm.padAnyRow(start_row, max_row - 1));

        // We can squeeze reads for all rows in [start_row, max_row) range.
        if (!has_padding ||
            (!pad_col && !lm.padAnyRow(start_row, max_row - 1))) {
          const IndexType start_depth = (c == start_col) ? depth_offset : 0;

          // Clamp the contiguous run so we never write past peeled_k scalars
          // nor read past the (row, depth) extent of this patch column.
          const IndexType max_depth =
              std::min<IndexType>(start_depth + (peeled_k - k),
                                  (max_row - start_row) * rhs.patchDepth());

          const IndexType base_idx = lm.baseIndex(start_row, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed that
            // we can process all values in depth dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            IndexType d = start_depth;

            // 4x unrolled packet copy, then a single-packet tail loop.
            const IndexType unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

          } else {
            // Patch depth is not packet-aligned: unrolled and vectorized loops
            // handle the aligned prefix, CoeffFinalizer handles the remainder.
            IndexType d = start_depth;

            const IndexType unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            const IndexType vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            // Scalar (or partial-packet) tail in [d, max_depth).
            const Index num_coeffs =
                CoeffFinalizer::finalize(block, rhs, base_idx, d, max_depth);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }

          // Go to the next column.
          continue;
        }

        // If we are not allowed to squeeze reads along the `row` and `depth`
        // dimensions, we must process rows one by one.
        for (IndexType r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const IndexType start_depth =
              ((c == start_col) && (r == start_row)) ? depth_offset : 0;
          const IndexType max_depth = rhs.maxDepth(peeled_k - k, start_depth);

          // A padded (row, col) position is filled with zero packets below.
          const bool pad = has_padding && (pad_col || lm.padRow(r));
          eigen_assert(has_padding || !lm.padRow(r));

          const IndexType base_idx = lm.baseIndex(r, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed that
            // we can process all values in depth dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            IndexType d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            IndexType d = start_depth;

            const IndexType vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            // Scalar (or partial-packet) tail; zero-fills when `pad` is set.
            const Index num_coeffs = CoeffFinalizer::finalize(
                block, rhs, base_idx, d, max_depth, has_padding && pad);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }
        }
      }

      // The loop above should fill peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill remaining elements using loadCoeffStandard.
      for (; k < num_rows; ++k) {
        *block = lm.loadCoeffStandard(k);
        ++block;
      }
    }
  }