static WEBP_INLINE void ExportRowShrink_0()

in Extended/libwebp/src/dsp/rescaler_msa.c [265:349]


// Exports one output row for the "shrink" rescaler path, MSA-vectorized.
//
// For every index x in [0, length) the scalar tail at the bottom of this
// function performs:
//   frac    = MULT_FIX_FLOOR(frow[x], yscale)
//   dst[x]  = min(255, MULT_FIX(irow[x] - frac, wrk->fxy_scale))
//   irow[x] = frac
// The vector paths above it follow the same load / subtract / scale / store
// sequence on 16, 12, 8 or 4 elements at a time; they are presumably exact
// vector equivalents of the scalar formula (the CALC_MULT_FIX* macros are
// defined outside this function -- confirm against their definitions).
//
// Parameters:
//   frow   - input row, read only.
//   irow   - input/output row: read to compute dst, then overwritten with the
//            'frac' values.
//   dst    - output row, one byte written per element.
//   length - number of elements to process.
//   yscale - scalar multiplier applied to frow[] to obtain 'frac'.
//   wrk    - rescaler state; only wrk->fxy_scale is read here.
static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
                                          uint8_t* dst, int length,
                                          const uint32_t yscale,
                                          WebPRescaler* const wrk) {
  // Vector copies of the scalar constants, built with __msa_fill_w.
  const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
  const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
  const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
  // NOTE(review): 'zero' is not referenced directly in this function; it is
  // presumably picked up by name inside the CALC_MULT_FIX* macros. Do not
  // rename or remove it without checking those macro definitions.
  const v4i32 zero = { 0 };

  // Main loop: 16 elements (four v4u32 vectors) per iteration.
  while (length >= 16) {
    v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
    v16u8 out;
    // frac = frow scaled by y_scale.
    LD_UW4(frow, 4, src0, src1, src2, src3);
    CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
                      frac0, frac1, frac2, frac3);
    // src = irow - frac, then scaled by fxyscale into 16 output bytes.
    LD_UW4(irow, 4, src0, src1, src2, src3);
    SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
         src0, src1, src2, src3);
    CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
    ST_UB(out, dst);
    // irow is updated only after its old values have been loaded above.
    ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
    frow   += 16;
    irow   += 16;
    dst    += 16;
    length -= 16;
  }
  // Tail: fewer than 16 elements remain. At most one of the 12/8/4 branches
  // runs, leaving 0..3 elements for the scalar loop at the end.
  if (length > 0) {
    int x_out;
    if (length >= 12) {
      // 12 elements: three vectors. Each valN_m is a 32-bit word stored to
      // dst with a 4-byte stride (presumably four packed output bytes).
      uint32_t val0_m, val1_m, val2_m;
      v4u32 src0, src1, src2, frac0, frac1, frac2;
      LD_UW3(frow, 4, src0, src1, src2);
      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
      CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
      LD_UW3(irow, 4, src0, src1, src2);
      SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
      CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
      SW3(val0_m, val1_m, val2_m, dst, 4);
      ST_UW3(frac0, frac1, frac2, irow, 4);
      frow   += 12;
      irow   += 12;
      dst    += 12;
      length -= 12;
    } else if (length >= 8) {
      // 8 elements: two vectors, same sequence as above.
      uint32_t val0_m, val1_m;
      v4u32 src0, src1, frac0, frac1;
      LD_UW2(frow, 4, src0, src1);
      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
      LD_UW2(irow, 4, src0, src1);
      SUB2(src0, frac0, src1, frac1, src0, src1);
      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
      SW2(val0_m, val1_m, dst, 4);
      ST_UW2(frac0, frac1, irow, 4);
      frow   += 8;
      irow   += 8;
      dst    += 8;
      length -= 8;
    } else if (length >= 4) {
      // 4 elements: a single vector.
      uint32_t val0_m;
      v4u32 frac0;
      v4u32 src0 = LD_UW(frow);
      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
      src0 = LD_UW(irow);
      src0 = src0 - frac0;
      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
      SW(val0_m, dst);
      ST_UW(frac0, irow);
      frow   += 4;
      irow   += 4;
      dst    += 4;
      length -= 4;
    }
    // Scalar remainder (0..3 elements); pointers were already advanced past
    // the vector-processed part, so indexing restarts at 0.
    for (x_out = 0; x_out < length; ++x_out) {
      const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
      // Saturate to the 8-bit output range.
      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
      irow[x_out] = frac;
    }
  }
}