in scripts/clean_training_data/janitor_util.cpp [24:104]
/// Tokenize `input` into lowercased grams and return every run of `ngram_n`
/// consecutive grams, joined by single spaces, in order of appearance.
///
/// Grams are separated by runs of characters for which is_whitespace() is
/// true. Characters contained in `ignore` are dropped: they are never copied
/// into a gram and do not separate grams.
///
/// A gram is also force-split once it already holds more than kMaxGramLength
/// characters (so a gram can reach kMaxGramLength + 1 characters).
/// NOTE(review): the character that triggers a forced split is consumed like
/// a separator and does not appear in any gram. This is kept as-is so the
/// tokenization stays compatible with existing output -- confirm whether it
/// is intended.
///
/// @param input   Text to tokenize.
/// @param ignore  Set of characters to strip from the text.
/// @param ngram_n Number of grams per n-gram. 0 yields an empty result.
/// @return The list of n-grams; empty when fewer than `ngram_n` grams exist.
std::vector<std::string> clean_ngram(std::string const &input,
                                     std::string const &ignore,
                                     size_t ngram_n) noexcept {
  std::vector<std::string> ngram_list;

  // A 0-gram is meaningless. Previously this reached an out-of-range
  // substr(), whose exception terminated the program under noexcept.
  if (ngram_n == 0) {
    return ngram_list;
  }

  // A gram is split as soon as its length exceeds this value.
  constexpr uint8_t kMaxGramLength = 10;

  // Number of grams completed so far.
  size_t num_grams = 0;
  // Lengths of the grams currently held in current_ngram (oldest first); the
  // last entry is the length of the gram being built.
  std::vector<uint8_t> gram_lengths;
  // Sliding window holding the grams of the n-gram under construction.
  std::string current_ngram;

  // Each gram holds at most kMaxGramLength + 1 characters plus a separator.
  current_ngram.reserve((static_cast<size_t>(kMaxGramLength) + 2) * ngram_n);
  gram_lengths.reserve(ngram_n);

  bool started_gram = false;
  gram_lengths.push_back(0);

  for (auto iter = input.begin(); iter != input.end(); ++iter) {
    if (is_whitespace(*iter) || gram_lengths.back() > kMaxGramLength) {
      // Skip the separator and any whitespace that follows it, then step back
      // one position so the loop increment lands on the next unread character.
      while (++iter != input.end() && is_whitespace(*iter))
        ;
      --iter;

      // Leading or repeated separators do not end a gram.
      if (!started_gram) {
        continue;
      }
      num_grams += 1;

      if (ngram_n == 1) {
        // 1-grams: the window holds exactly one gram, so emit it and empty
        // the window without inserting a separator.
        ngram_list.push_back(current_ngram);
        current_ngram.erase(0, gram_lengths.front());
        gram_lengths.back() = 0;
      } else if (num_grams >= ngram_n) {
        // The window is full: emit it, then slide it forward by dropping the
        // oldest gram together with the space that follows it.
        ngram_list.push_back(current_ngram);
        current_ngram.erase(0, static_cast<size_t>(gram_lengths.front()) + 1);
        current_ngram += ' ';
        gram_lengths.erase(gram_lengths.begin());
        gram_lengths.push_back(0);
      } else {
        // Not enough grams yet: keep growing the window.
        current_ngram += ' ';
        gram_lengths.push_back(0);
      }
      started_gram = false;
    } else if (ignore.find(*iter) != std::string::npos) {
      // Ignored characters are dropped without affecting gram boundaries.
      continue;
    } else {
      // tolower() has undefined behaviour for negative arguments other than
      // EOF, so bytes >= 0x80 must be passed as unsigned char.
      current_ngram +=
          static_cast<char>(tolower(static_cast<unsigned char>(*iter)));
      gram_lengths.back() += 1;
      started_gram = true;
    }
  }

  // Flush the gram still in progress when the input does not end with a
  // separator; otherwise the final n-gram would be silently lost.
  if (started_gram && num_grams + 1 >= ngram_n) {
    ngram_list.push_back(current_ngram);
  }

  return ngram_list;
}