in backends/common/include/tfrt/common/compat/eigen/spatial_convolution_data_mapper.h [985:1156]
EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
                                  Index depth, Index num_cols,
                                  Index stride = 0, Index offset = 0) const {
  assert(stride == 0);
  assert(offset == 0);
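  // `stride` and `offset` belong to Eigen's gemm_pack_rhs interface; this
  // specialization does not support them, hence the asserts above.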
  const Index packet_cols4 = (num_cols / 4) * 4;
  const Index peeled_k = (depth / packet_size) * packet_size;
  const bool non_standard_patches = rhs.nonStandardPatches();
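  // Columns are processed in blocks of 4 (the `nr` register blocking) and
  // depth in multiples of `packet_size`; both loop tails are handled by the
  // scalar code at the end.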
  for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
    const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
    const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
    const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
    const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

    Index k = 0;
    if ((packet_size % 4) == 0 && !non_standard_patches) {
      // FAST PATH: Iterate over patch columns and rows, if we know that a
      // single packet does not span across multiple rows or columns.
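      // When the patch depth is a multiple of the packet size, a packet read
      // never crosses from one patch row into the next, so padding can be
      // decided per row rather than per element.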
      if ((rhs.patchDepth() % packet_size) == 0) {
        const Index start_col = rhs.colOffset();
        const Index max_col = rhs.maxCol(peeled_k);

        for (Index c = start_col; c < max_col; ++c) {
          assert(k <= peeled_k);

          const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
          const Index max_row = rhs.maxRow(peeled_k, c);

          const bool pad_col0 = dm0.padCol(c);
          const bool pad_col1 = dm1.padCol(c);
          const bool pad_col2 = dm2.padCol(c);
          const bool pad_col3 = dm3.padCol(c);

          // Check if we can squeeze reads along the `row` and `depth`
          // dimensions (two innermost dimensions).
          if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&
              !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&
              !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&
              !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&
              !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
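            // Padding rows form a prefix and/or suffix of the row range, so
            // checking the first and last rows is enough to rule out padding
            // for every row in between.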
            // Compute how many elements we can squeeze-read at once.
            const Index start_depth =
                (c == start_col) ? rhs.depthOffset() : 0;

            // Upper bound on the number of contiguous elements (the row and
            // depth dimensions flattened together) that we can squeeze-read.
            const Index squeeze_length =
                (max_row - start_row) * rhs.patchDepth() - start_depth;
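            // E.g. with patchDepth() == 8, start_depth == 2 and three rows
            // in range: squeeze_length == 3 * 8 - 2 == 22 elements.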
            // Do not overshoot beyond the block size.
            const Index max_depth =
                start_depth + std::min<Index>(peeled_k - k, squeeze_length);
            assert((max_depth - start_depth) % packet_size == 0);

            const Index idx0 = dm0.baseIndex(start_row, c);
            const Index idx1 = dm1.baseIndex(start_row, c);
            const Index idx2 = dm2.baseIndex(start_row, c);
            const Index idx3 = dm3.baseIndex(start_row, c);
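            // With no padding in range, the (row, depth) elements of each
            // column are contiguous in memory, so one base index per column
            // serves all the packet loads below.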
            for (Index d = start_depth; d < max_depth; d += packet_size) {
              assert(k < peeled_k);
              PacketBlock<Packet, 4> kernel;
              kernel.packet[0] = rhs.packetNoPadding(d, idx0);
              kernel.packet[1] = rhs.packetNoPadding(d, idx1);
              kernel.packet[2] = rhs.packetNoPadding(d, idx2);
              kernel.packet[3] = rhs.packetNoPadding(d, idx3);
              ptranspose(kernel);
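              // After the transpose, the four stores below lay out
              // rhs(k, j2+0..3) contiguously for each depth index k, which
              // is the interleaved layout the gemm kernel consumes.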
              pstoreu(block + 0 * packet_size, kernel.packet[0]);
              pstoreu(block + 1 * packet_size, kernel.packet[1]);
              pstoreu(block + 2 * packet_size, kernel.packet[2]);
              pstoreu(block + 3 * packet_size, kernel.packet[3]);
              block += 4 * packet_size;
              k += packet_size;
            }

            // Go to the next column.
            continue;
          }

          // If we can't squeeze reads, process rows one by one.
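          // Each row carries its own padding flags and base indices; padded
          // lanes are filled with zeros instead of being loaded.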
          for (Index r = start_row; r < max_row; ++r) {
            assert(k <= peeled_k);

            const bool pad0 = pad_col0 || dm0.padRow(r);
            const bool pad1 = pad_col1 || dm1.padRow(r);
            const bool pad2 = pad_col2 || dm2.padRow(r);
            const bool pad3 = pad_col3 || dm3.padRow(r);

            const Index idx0 = dm0.baseIndex(r, c);
            const Index idx1 = dm1.baseIndex(r, c);
            const Index idx2 = dm2.baseIndex(r, c);
            const Index idx3 = dm3.baseIndex(r, c);

            const Index start_depth =
                ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
            const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
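            // maxDepth() clamps the read so it goes neither past the patch
            // depth nor past the `peeled_k - k` elements still missing from
            // the block.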
            assert((max_depth - start_depth) % packet_size == 0);

            for (Index d = start_depth; d < max_depth; d += packet_size) {
              assert(k < peeled_k);
              PacketBlock<Packet, 4> kernel;
              kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
                                      : rhs.packetNoPadding(d, idx0);
              kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
                                      : rhs.packetNoPadding(d, idx1);
              kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0))
                                      : rhs.packetNoPadding(d, idx2);
              kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0))
                                      : rhs.packetNoPadding(d, idx3);
              ptranspose(kernel);
              pstoreu(block + 0 * packet_size, kernel.packet[0]);
              pstoreu(block + 1 * packet_size, kernel.packet[1]);
              pstoreu(block + 2 * packet_size, kernel.packet[2]);
              pstoreu(block + 3 * packet_size, kernel.packet[3]);
              block += 4 * packet_size;
              k += packet_size;
            }
          }
        }

        // The loop above should fill peeled_k elements.
        assert(peeled_k == k);
      } else {
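        // Here a packet may straddle patch rows or columns;
        // loadPacketStandard() handles the boundary and padding logic on a
        // per-packet basis.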
        for (; k < peeled_k; k += packet_size) {
          PacketBlock<Packet, 4> kernel;
          kernel.packet[0] = dm0.loadPacketStandard(k);
          kernel.packet[1] = dm1.loadPacketStandard(k);
          kernel.packet[2] = dm2.loadPacketStandard(k);
          kernel.packet[3] = dm3.loadPacketStandard(k);
          ptranspose(kernel);
          pstoreu(block + 0 * packet_size, kernel.packet[0]);
          pstoreu(block + 1 * packet_size, kernel.packet[1]);
          pstoreu(block + 2 * packet_size, kernel.packet[2]);
          pstoreu(block + 3 * packet_size, kernel.packet[3]);
          block += 4 * packet_size;
        }
      }
    }

    // Copy the remaining coefficients of the column block after peeled_k.
    if (!non_standard_patches) {
      for (; k < depth; k++) {
        block[0] = dm0.loadCoeffStandard(k);
        block[1] = dm1.loadCoeffStandard(k);
        block[2] = dm2.loadCoeffStandard(k);
        block[3] = dm3.loadCoeffStandard(k);
        block += 4;
      }
    } else {
      for (; k < depth; k++) {
        block[0] = dm0(k);
        block[1] = dm1(k);
        block[2] = dm2(k);
        block[3] = dm3(k);
        block += 4;
      }
    }
  }

  // Copy the remaining columns one at a time (nr == 1).
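  // These are the trailing num_cols % 4 columns; coefficients are loaded one
  // at a time through the mapper's generic operator().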
  for (Index j2 = packet_cols4; j2 < num_cols; ++j2) {
    const SubMapper dm0 = rhs.getLinearMapper(0, j2);
    for (Index k = 0; k < depth; k++) {
      *block = dm0(k);
      block += 1;
    }
  }
}
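// Reference-only sketch (not part of the original header): for the first
// packet_cols4 columns, every path above must produce the same interleaved
// layout as this scalar loop, where `rhs(k, j)` stands for the logical patch
// coefficient at depth k of column j:
//
//   for (Index j2 = 0; j2 < packet_cols4; j2 += 4)
//     for (Index k = 0; k < depth; ++k)
//       for (Index c = 0; c < 4; ++c) *block++ = rhs(k, j2 + c);
//
// The vectorized paths are progressively cheaper ways of producing exactly
// this byte pattern.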