in source/backend/cpu/compute/CommonOptFunction.cpp [1595:2239]
// Computes TILE consecutive output columns of the block-sparse product for all h output rows.
//
// blockC     : output position of the tile's first column in the first 4-row pack.
// a          : activation pointer for the tile's first column; returned after every consumed
//              offset from dataOffset has been added to it.
// w          : packed non-zero weights (4 floats per non-zero for 4-row groups, 1 float per
//              non-zero for the leftover rows).
// nnz        : non-zero count per 4-row group, followed by one count per leftover row.
// dataOffset : float-element offsets added to `a`; the first entry is applied before anything
//              is read, then one entry is consumed after each non-zero.
template <int TILE>
static const float* MNNSparseMatMulTileEpx4(float* blockC, const float* a, const float* w, size_t h, size_t cStride,
                                            const float* bias, float minValue, float maxValue,
                                            const unsigned int* nnz, const int* dataOffset) {
    a += *dataOffset++;

    // Rows handled four at a time: each non-zero carries four weights, one per output row.
    const size_t hAligned = h & ~static_cast<size_t>(0x03);
    size_t ih = 0;
    for (; ih < hAligned; ih += 4) {
        float* c = blockC + (ih >> 2) * cStride;
        float acc[TILE][4];
        for (int col = 0; col < TILE; ++col) {
            for (int lane = 0; lane < 4; ++lane) {
                acc[col][lane] = (nullptr != bias) ? bias[ih + lane] : 0.0f;
            }
        }
        const int lElement = *nnz++;
        for (int il = 0; il < lElement; ++il) {
            const int diff = *dataOffset++;
            for (int col = 0; col < TILE; ++col) {
                const float av = a[col];
                for (int lane = 0; lane < 4; ++lane) {
                    acc[col][lane] += av * w[lane];
                }
            }
            w += 4;
            a += diff;
        }
        // Each column owns 4 contiguous floats (one per row of the pack).
        for (int col = 0; col < TILE; ++col) {
            for (int lane = 0; lane < 4; ++lane) {
                c[4 * col + lane] = std::max(std::min(maxValue, acc[col][lane]), minValue);
            }
        }
    }

    // Leftover rows (h % 4): one weight per non-zero, written into lane (ih & 3) of the last pack.
    float* tailC = blockC + (ih >> 2) * cStride;
    for (; ih < h; ++ih) {
        float* c = tailC + (ih & 0x03);
        const float initValue = (nullptr != bias) ? bias[ih] : 0.0f;
        float acc[TILE];
        for (int col = 0; col < TILE; ++col) {
            acc[col] = initValue;
        }
        const int lElement = *nnz++;
        for (int il = 0; il < lElement; ++il) {
            const int diff = *dataOffset++;
            const float oneW = *w++;
            for (int col = 0; col < TILE; ++col) {
                acc[col] += a[col] * oneW;
            }
            a += diff;
        }
        for (int col = 0; col < TILE; ++col) {
            c[4 * col] = std::max(std::min(maxValue, acc[col]), minValue);
        }
    }
    return a;
}

// Reference (scalar) sparse matmul: C = clamp(bias + A * sparse(B)).
//
// C              : output; the value for column e and row r is written to
//                  C[(r / 4) * cStride + 4 * e + (r % 4)].
// A              : packed activations, walked via dataOffsetMap.
// B              : packed non-zero weights.
// eSize          : number of output columns to compute.
// parameter      : [0] eP in bytes, [1] l, [2] h, [3] cStride in bytes. Other entries are not read.
// postParameters : optional; [2] is the lower clamp bound and [3] the upper one. When null the
//                  result is only limited to the finite float range.
// bias           : optional per-row bias indexed by output row; treated as 0 when null.
// NNZMap         : non-zero counts (per 4-row group, then per leftover row).
// dataOffsetMap  : offsets applied to the activation pointer (see helper above).
//
// Columns are processed as full tiles followed by 8/4/2/1-column tails selected from the low
// bits of eSize. The full-tile kernel always covers 16 columns, whatever eP is.
// NOTE(review): the A pointer is carried from one tile to the next, which presumably relies on
// the entries of dataOffsetMap netting to zero over one pass - confirm against the map builder.
void MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap) {
    const size_t eP = parameter[0] / sizeof(float);
    MNN_ASSERT((eP & 0x03) == 0); // In sparse calculate, eP should be evenly divided by 4
    const size_t l = parameter[1];
    const size_t h = parameter[2];
    const size_t cStride = parameter[3] / sizeof(float);
    const size_t aStride = eP * l;

    float minValue = -std::numeric_limits<float>::max();
    float maxValue = std::numeric_limits<float>::max();
    if (nullptr != postParameters) {
        minValue = postParameters[2];
        maxValue = postParameters[3];
    }

    const float* a = A;
    size_t ie = 0;
    // Only complete tiles go through the 16-wide kernel. Comparing ie + eP against eSize keeps a
    // partial last tile out of this loop (it is handled by the tails below), and the eP != 0
    // guard avoids a loop that would never advance.
    for (; eP != 0 && ie + eP <= eSize; ie += eP) {
        a = MNNSparseMatMulTileEpx4<16>(C + (ie << 2), a, B, h, cStride, bias, minValue, maxValue, NNZMap, dataOffsetMap);
        a += aStride;
    }
    if (eSize & 0x08) {
        a = MNNSparseMatMulTileEpx4<8>(C + (ie << 2), a, B, h, cStride, bias, minValue, maxValue, NNZMap, dataOffsetMap);
        ie += 8;
        a += 8;
    }
    if (eSize & 0x04) {
        a = MNNSparseMatMulTileEpx4<4>(C + (ie << 2), a, B, h, cStride, bias, minValue, maxValue, NNZMap, dataOffsetMap);
        ie += 4;
        a += 4;
    }
    if (eSize & 0x02) {
        a = MNNSparseMatMulTileEpx4<2>(C + (ie << 2), a, B, h, cStride, bias, minValue, maxValue, NNZMap, dataOffsetMap);
        ie += 2;
        a += 2;
    }
    if (eSize & 0x01) {
        MNNSparseMatMulTileEpx4<1>(C + (ie << 2), a, B, h, cStride, bias, minValue, maxValue, NNZMap, dataOffsetMap);
    }
    return;
}