in backends/common/include/tfrt/common/compat/eigen/spatial_convolution_data_mapper.h [1596:1771]
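// Packs a block of the rhs (the patch matrix of the convolution expressed as
// a GEMM) for the standard-patch case: rhs.nonStandardPatches() must be
// false, so patch values can be copied with vectorized loads along the depth
// dimension wherever padding allows.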
EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
const DataMapper rhs,
IndexType num_rows,
IndexType num_cols) {
assert(!rhs.nonStandardPatches());
// `peeled_k` is the name used for the vectorized row count in all other
// gemm_pack_rhs implementations above.
const IndexType peeled_k = (num_rows / packet_size) * packet_size;
const IndexType start_col = rhs.colOffset();
const IndexType max_col = rhs.maxCol(peeled_k);
const IndexType depth_offset = rhs.depthOffset();
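// For each column of the block: pack the first `peeled_k` rows with packet
// loads by walking the patch (col, row, depth) coordinates, then finish the
// scalar tail with loadCoeffStandard at the bottom of the loop.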
for (IndexType col = 0; col < num_cols; ++col) {
SubMapper lm = rhs.getLinearMapper(0, col);
IndexType k = 0;
for (IndexType c = start_col; c < max_col; ++c) {
assert(k <= peeled_k);
const IndexType start_row = (c == start_col) ? rhs.rowOffset() : 0;
const IndexType max_row = rhs.maxRow(peeled_k, c);
const bool pad_col = has_padding && lm.padCol(c);
eigen_assert(has_padding || !lm.padCol(c));
eigen_assert(has_padding || !lm.padAnyRow(start_row, max_row - 1));
// We can squeeze reads for all rows in the [start_row, max_row) range.
if (!has_padding ||
(!pad_col && !lm.padAnyRow(start_row, max_row - 1))) {
const IndexType start_depth = (c == start_col) ? depth_offset : 0;
const IndexType max_depth =
std::min<IndexType>(start_depth + (peeled_k - k),
(max_row - start_row) * rhs.patchDepth());
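// Starting from base_idx, the values in [start_depth, max_depth) are
// contiguous in memory (reads are squeezed across rows), so they can be
// loaded with packetNoPadding.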
const IndexType base_idx = lm.baseIndex(start_row, c);
if (patch_depth_is_multiple_of_packet_size) {
// If the patch depth is a multiple of the packet size, it's guaranteed that
// all values in the depth dimension can be processed with packets.
assert((max_depth - start_depth) % packet_size == 0);
IndexType d = start_depth;
const IndexType unrolled_depth = max_depth - 4 * packet_size;
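// Copy four packets per iteration while at least 4 * packet_size values
// remain.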
for (; d <= unrolled_depth; d += 4 * packet_size) {
assert(k < peeled_k);
Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);
internal::pstoreu(block + 0 * packet_size, p0);
internal::pstoreu(block + 1 * packet_size, p1);
internal::pstoreu(block + 2 * packet_size, p2);
internal::pstoreu(block + 3 * packet_size, p3);
block += 4 * packet_size;
k += 4 * packet_size;
}
for (; d < max_depth; d += packet_size) {
assert(k < peeled_k);
internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
block += packet_size;
k += packet_size;
}
} else {
IndexType d = start_depth;
const IndexType unrolled_depth = max_depth - 4 * packet_size;
for (; d <= unrolled_depth; d += 4 * packet_size) {
eigen_assert(k < peeled_k);
Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);
internal::pstoreu(block + 0 * packet_size, p0);
internal::pstoreu(block + 1 * packet_size, p1);
internal::pstoreu(block + 2 * packet_size, p2);
internal::pstoreu(block + 3 * packet_size, p3);
block += 4 * packet_size;
k += 4 * packet_size;
}
const IndexType vectorized_depth = max_depth - packet_size;
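// Pack the remaining whole packets that did not fit in the unrolled loop.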
for (; d <= vectorized_depth; d += packet_size) {
assert(k < peeled_k);
internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
block += packet_size;
k += packet_size;
}
assert(k <= peeled_k);
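// CoeffFinalizer handles the remaining coefficients in [d, max_depth) that
// do not fill a whole packet and returns how many values it wrote.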
const IndexType num_coeffs =
CoeffFinalizer::finalize(block, rhs, base_idx, d, max_depth);
k += num_coeffs;
block += num_coeffs;
assert(k <= peeled_k);
}
// Go to the next patch column.
continue;
}
// If we are not allowed to squeeze reads along the `row` and `depth`
// dimensions, we must process rows one by one.
for (IndexType r = start_row; r < max_row; ++r) {
assert(k <= peeled_k);
const IndexType start_depth =
((c == start_col) && (r == start_row)) ? depth_offset : 0;
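// Clamp the depth range so that at most `peeled_k - k` values are packed
// for this row.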
const IndexType max_depth = rhs.maxDepth(peeled_k - k, start_depth);
const bool pad = has_padding && (pad_col || lm.padRow(r));
eigen_assert(has_padding || !lm.padRow(r));
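// Padded rows and columns are written as zeros below instead of being
// loaded from the input.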
const IndexType base_idx = lm.baseIndex(r, c);
if (patch_depth_is_multiple_of_packet_size) {
// If the patch depth is a multiple of the packet size, it's guaranteed that
// all values in the depth dimension can be processed with packets.
assert((max_depth - start_depth) % packet_size == 0);
IndexType d = start_depth;
for (; d < max_depth; d += packet_size) {
assert(k < peeled_k);
const Packet p = (has_padding && pad)
? pset1<Packet>(Scalar(0))
: rhs.packetNoPadding(d, base_idx);
internal::pstoreu(block, p);
block += packet_size;
k += packet_size;
}
} else {
IndexType d = start_depth;
const IndexType vectorized_depth = max_depth - packet_size;
for (; d <= vectorized_depth; d += packet_size) {
assert(k < peeled_k);
const Packet p = (has_padding && pad)
? pset1<Packet>(Scalar(0))
: rhs.packetNoPadding(d, base_idx);
internal::pstoreu(block, p);
block += packet_size;
k += packet_size;
}
assert(k <= peeled_k);
const IndexType num_coeffs = CoeffFinalizer::finalize(
block, rhs, base_idx, d, max_depth, has_padding && pad);
k += num_coeffs;
block += num_coeffs;
assert(k <= peeled_k);
}
}
}
// The loop above should have filled the first peeled_k elements.
assert(peeled_k == k);
// Fill remaining elements using loadCoeffStandard.
for (; k < num_rows; ++k) {
*block = lm.loadCoeffStandard(k);
++block;
}
}
}