std::vector clean_ngram()

in scripts/clean_training_data/janitor_util.cpp [24:104]


// Splits `input` into whitespace-delimited grams and returns every run of
// `ngram_n` consecutive grams, joined by single spaces, in order of
// appearance.
//
// Cleaning applied while scanning:
//   * characters contained in `ignore` are dropped;
//   * every other character is lower-cased with tolower();
//   * a run of whitespace (as classified by is_whitespace()) counts as one
//     separator;
//   * a gram that already holds more than 10 characters is cut when the next
//     character is seen, so a gram never exceeds 11 characters.
//     NOTE: the character at which the cut happens is consumed by the cut and
//     does not appear in any gram. That behaviour is deliberately left
//     unchanged so the n-grams emitted for long words stay the same.
//
// The gram still being built when `input` ends is closed as well, so the
// result does not depend on whether `input` ends with whitespace.
//
// Returns an empty list when `ngram_n` is 0 or when `input` contains fewer
// than `ngram_n` grams.
std::vector<std::string> clean_ngram(std::string const &input,
                                     std::string const &ignore,
                                     size_t ngram_n) noexcept {
  std::vector<std::string> ngram_list;

  // A window of zero grams is meaningless, and the sliding step below would
  // index past the end of the buffer; bail out instead of terminating.
  if (ngram_n == 0) {
    return ngram_list;
  }

  // A gram is cut once it already holds more than this many characters.
  constexpr uint8_t kSplitThreshold = 10;

  size_t num_grams = 0;               // grams completed so far
  std::vector<uint8_t> gram_lengths;  // length of each gram in current_ngram
  std::string current_ngram;          // the window: "gram gram ... gram"

  // Room for ngram_n grams of at most (threshold + 1) characters plus the
  // single-space separators between them.
  current_ngram.reserve((static_cast<size_t>(kSplitThreshold) + 2) * ngram_n);
  gram_lengths.reserve(ngram_n);
  gram_lengths.push_back(0);

  bool started_gram = false;

  // Closes the gram currently being built: emits the window if it now holds
  // enough grams, then prepares the buffers for the next gram.
  const auto finish_gram = [&]() {
    num_grams += 1;

    if (ngram_n == 1) {
      // 1-grams: the window is exactly the gram that was just completed.
      ngram_list.push_back(current_ngram);
      current_ngram.clear();
      gram_lengths.back() = 0;
    } else if (num_grams >= ngram_n) {
      ngram_list.push_back(current_ngram);

      // Slide the window: drop the oldest gram together with its trailing
      // space, then open a slot for the next gram.
      current_ngram.erase(0, static_cast<size_t>(gram_lengths.front()) + 1);
      current_ngram += ' ';
      gram_lengths.erase(gram_lengths.begin());
      gram_lengths.push_back(0);
    } else {
      // Not enough grams yet; keep growing the window.
      current_ngram += ' ';
      gram_lengths.push_back(0);
    }

    started_gram = false;
  };

  for (auto iter = input.begin(); iter != input.end(); ++iter) {
    if (is_whitespace(*iter) || gram_lengths.back() > kSplitThreshold) {
      // Consume the current character and any whitespace that follows it,
      // then step back one so the loop increment lands on the first
      // character after the separator.
      while (++iter != input.end() && is_whitespace(*iter)) {
      }
      --iter;

      if (started_gram) {
        finish_gram();
      }
    } else if (ignore.find(*iter) != std::string::npos) {
      // Ignored characters contribute nothing to the gram.
      continue;
    } else {
      // tolower() requires a value representable as unsigned char; passing a
      // negative char (any non-ASCII byte where char is signed) is undefined.
      current_ngram +=
          static_cast<char>(tolower(static_cast<unsigned char>(*iter)));
      gram_lengths.back() += 1;
      started_gram = true;
    }
  }

  // Input ended in the middle of a gram: close it so the last n-gram is not
  // silently lost.
  if (started_gram) {
    finish_gram();
  }

  return ngram_list;
}