static void porter_stemmer()

in mailnews/extensions/fts3/fts3_porter.c [570:777]


/*
** Stem the word zIn[0..nIn) by running the Porter steps (1a, 1b, 1c, 2, 3,
** 4, 5a, 5b) below and write the result to zOut.
**
**   zIn    Start of the input word.
**   nIn    Length of the input word in bytes.
**   zOut   Receives the stemmed word followed by a NUL terminator.
**   pnOut  Receives the length of the text written to zOut, not counting
**          the terminator.
**
** The input is handed unchanged to copy_stemmer() (which then owns both
** outputs) when either
**   - nIn is below 3 or at least sizeof(zReverse)-7 (i.e. 21) bytes, or
**   - any decoded character, after normalize_character(), is outside
**     'a'..'z'.
**
** Internally the word is stored back-to-front in zReverse, so z[0] is the
** LAST letter of the word, z[1] the one before it, and so on.  For that
** reason every suffix literal passed to stem() is spelled backwards
** ("sess" is the suffix "sses", "gni" is "ing", ...).
**
** NOTE(review): stem(), m_gt_0(), m_gt_1(), m_eq_1(), hasVowel(),
** doubleConsonant() and star_oh() are defined outside this block.  From
** their use here stem() takes the reversed suffix, a replacement and an
** optional condition and returns non-zero on a match; confirm the exact
** contract against their definitions.
*/
static void porter_stemmer(const unsigned char* zIn, unsigned int nIn,
                           unsigned char* zOut, int* pnOut) {
  unsigned int i, j, c;
  char zReverse[28];
  char *z, *z2;
  const unsigned char* zTerm = zIn + nIn;
  const unsigned char* zTmp = zIn;

  /* Only inputs of 3..20 bytes go through the stemming steps. */
  if (nIn < 3 || nIn >= sizeof(zReverse) - 7) {
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Fill zReverse from index sizeof(zReverse)-6 (22) downwards, one slot
  ** per decoded character, so the word ends up reversed in the buffer.
  **
  ** NOTE(review): the guard above counts bytes (nIn) while this loop stores
  ** one letter per READ_UTF8 call.  If READ_UTF8 can consume more than one
  ** byte per call, fewer than 3 letters may reach the steps below; confirm
  ** that this is intended.
  */
  for (j = sizeof(zReverse) - 6; zTmp < zTerm; j--) {
    READ_UTF8(zTmp, zTerm, c);
    c = normalize_character(c);
    if (c >= 'a' && c <= 'z') {
      zReverse[j] = c;
    } else {
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  /* Zero the last 5 bytes of the buffer: this terminates the reversed word
  ** and, presumably, gives the fixed-offset peeks below (z[1]..z[3]) zeros
  ** to read when the remaining word is short.
  */
  memset(&zReverse[sizeof(zReverse) - 5], 0, 5);
  /* j was decremented once past the last slot written, so j+1 is the slot
  ** holding the final letter of the word. */
  z = &zReverse[j + 1];

  /* Step 1a */
  if (z[0] == 's') {
    /* If none of the three suffix rules matches, drop the trailing 's'. */
    if (!stem(&z, "sess", "ss", 0) && !stem(&z, "sei", "i", 0) &&
        !stem(&z, "ss", "ss", 0)) {
      z++;
    }
  }

  /* Step 1b */
  z2 = z; /* remembered so we can tell below whether z actually moved */
  if (stem(&z, "dee", "ee", m_gt_0)) {
    /* Do nothing.  The work was all in the test */
  } else if ((stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel)) &&
             z != z2) {
    /* Reached only when one of the two calls above returned non-zero AND z
    ** changed.  Tidy up what is left of the word. */
    if (stem(&z, "ta", "ate", 0) || stem(&z, "lb", "ble", 0) ||
        stem(&z, "zi", "ize", 0)) {
      /* Do nothing.  The work was all in the test */
    } else if (doubleConsonant(z) && (*z != 'l' && *z != 's' && *z != 'z')) {
      /* Drop the last letter unless it is 'l', 's' or 'z'. */
      z++;
    } else if (m_eq_1(z) && star_oh(z)) {
      /* Stepping z back one slot appends an 'e' to the forward word. */
      *(--z) = 'e';
    }
  }

  /* Step 1c */
  if (z[0] == 'y' && hasVowel(z + 1)) {
    z[0] = 'i';
  }

  /* Step 2 */
  /* Dispatch on the second-to-last letter of the word.  Within each case
  ** the || chain stops at the first stem() call that returns non-zero; the
  ** overall result is deliberately discarded via the (void) cast. */
  switch (z[1]) {
    case 'a':
      (void)(stem(&z, "lanoita", "ate", m_gt_0) ||
             stem(&z, "lanoit", "tion", m_gt_0));
      break;
    case 'c':
      (void)(stem(&z, "icne", "ence", m_gt_0) ||
             stem(&z, "icna", "ance", m_gt_0));
      break;
    case 'e':
      (void)(stem(&z, "rezi", "ize", m_gt_0));
      break;
    case 'g':
      (void)(stem(&z, "igol", "log", m_gt_0));
      break;
    case 'l':
      (void)(stem(&z, "ilb", "ble", m_gt_0) || stem(&z, "illa", "al", m_gt_0) ||
             stem(&z, "iltne", "ent", m_gt_0) || stem(&z, "ile", "e", m_gt_0) ||
             stem(&z, "ilsuo", "ous", m_gt_0));
      break;
    case 'o':
      (void)(stem(&z, "noitazi", "ize", m_gt_0) ||
             stem(&z, "noita", "ate", m_gt_0) ||
             stem(&z, "rota", "ate", m_gt_0));
      break;
    case 's':
      (void)(stem(&z, "msila", "al", m_gt_0) ||
             stem(&z, "ssenevi", "ive", m_gt_0) ||
             stem(&z, "ssenluf", "ful", m_gt_0) ||
             stem(&z, "ssensuo", "ous", m_gt_0));
      break;
    case 't':
      (void)(stem(&z, "itila", "al", m_gt_0) ||
             stem(&z, "itivi", "ive", m_gt_0) ||
             stem(&z, "itilib", "ble", m_gt_0));
      break;
  }

  /* Step 3 */
  /* Dispatch on the last letter of the word as left by step 2. */
  switch (z[0]) {
    case 'e':
      (void)(stem(&z, "etaci", "ic", m_gt_0) || stem(&z, "evita", "", m_gt_0) ||
             stem(&z, "ezila", "al", m_gt_0));
      break;
    case 'i':
      (void)(stem(&z, "itici", "ic", m_gt_0));
      break;
    case 'l':
      (void)(stem(&z, "laci", "ic", m_gt_0) || stem(&z, "luf", "", m_gt_0));
      break;
    case 's':
      (void)(stem(&z, "ssen", "", m_gt_0));
      break;
  }

  /* Step 4 */
  /* Dispatch on the second-to-last letter again.  Most cases test the
  ** suffix letters directly instead of calling stem(); "z += k" removes
  ** the last k letters of the word when m_gt_1() accepts what remains. */
  switch (z[1]) {
    case 'a':
      /* "al" */
      if (z[0] == 'l' && m_gt_1(z + 2)) {
        z += 2;
      }
      break;
    case 'c':
      /* "ance" / "ence" */
      if (z[0] == 'e' && z[2] == 'n' && (z[3] == 'a' || z[3] == 'e') &&
          m_gt_1(z + 4)) {
        z += 4;
      }
      break;
    case 'e':
      /* "er" */
      if (z[0] == 'r' && m_gt_1(z + 2)) {
        z += 2;
      }
      break;
    case 'i':
      /* "ic" */
      if (z[0] == 'c' && m_gt_1(z + 2)) {
        z += 2;
      }
      break;
    case 'l':
      /* "able" / "ible" */
      if (z[0] == 'e' && z[2] == 'b' && (z[3] == 'a' || z[3] == 'i') &&
          m_gt_1(z + 4)) {
        z += 4;
      }
      break;
    case 'n':
      if (z[0] == 't') {
        if (z[2] == 'a') {
          /* "ant" */
          if (m_gt_1(z + 3)) {
            z += 3;
          }
        } else if (z[2] == 'e') {
          /* "ement", "ment", "ent": longest suffix is tried first. */
          (void)(stem(&z, "tneme", "", m_gt_1) ||
                 stem(&z, "tnem", "", m_gt_1) || stem(&z, "tne", "", m_gt_1));
        }
      }
      break;
    case 'o':
      if (z[0] == 'u') {
        /* "ou" */
        if (m_gt_1(z + 2)) {
          z += 2;
        }
      } else if (z[3] == 's' || z[3] == 't') {
        /* "ion" is only attempted when the letter before it is 's' or 't'. */
        (void)(stem(&z, "noi", "", m_gt_1));
      }
      break;
    case 's':
      /* "ism" */
      if (z[0] == 'm' && z[2] == 'i' && m_gt_1(z + 3)) {
        z += 3;
      }
      break;
    case 't':
      /* "ate" / "iti" */
      (void)(stem(&z, "eta", "", m_gt_1) || stem(&z, "iti", "", m_gt_1));
      break;
    case 'u':
      /* "ous" */
      if (z[0] == 's' && z[2] == 'o' && m_gt_1(z + 3)) {
        z += 3;
      }
      break;
    case 'v':
    case 'z':
      /* "ive" / "ize" */
      if (z[0] == 'e' && z[2] == 'i' && m_gt_1(z + 3)) {
        z += 3;
      }
      break;
  }

  /* Step 5a */
  /* Drop a final 'e' when m_gt_1() accepts the rest, or when m_eq_1()
  ** accepts it and star_oh() rejects it. */
  if (z[0] == 'e') {
    if (m_gt_1(z + 1)) {
      z++;
    } else if (m_eq_1(z + 1) && !star_oh(z + 1)) {
      z++;
    }
  }

  /* Step 5b */
  /* Reduce a final "ll" to a single 'l' when m_gt_1() accepts the word. */
  if (m_gt_1(z) && z[0] == 'l' && z[1] == 'l') {
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0; /* terminator first, then fill zOut from the back */
  while (*z) {
    zOut[--i] = *(z++);
  }
}