in src/utils/normalize.cpp [17:56]
/**
 * Normalizes a string in place so that near-duplicate tokens collapse together.
 *
 * Strings are categorized into the following buckets:
 *
 * 1. All punctuation-and-numeric (no ASCII letters, no non-ASCII bytes, and
 *    at least one digit). Things in this bucket get every digit replaced
 *    with '0', to prevent combinatorial explosions. They might be specific
 *    numbers, prices, etc.
 *
 * 2. All letters: case-flattened.
 *
 * 3. Mixed letters and numbers: a product ID? Case is flattened and the
 *    numbers are left alone.
 *
 * Bytes outside the ASCII range (e.g. the bytes of a multi-byte UTF-8
 * sequence) are never modified; their presence only keeps the string out of
 * bucket 1.
 *
 * @param str Text to normalize; modified in place. Must not contain NUL
 *            bytes (checked with assert).
 */
void normalize_text(std::string& str) {
    bool allNumeric = true;
    bool containsDigits = false;
    for (const char c : str) {
        assert(c); // don't shove binary data through this.
        // The <cctype> classifiers are only defined for values representable
        // as unsigned char (or EOF), so never hand them a raw, possibly
        // negative, char.
        const auto byte = static_cast<unsigned char>(c);
        if (byte > 0x7F) {
            // Non-ASCII byte: the string is not purely punctuation/numeric.
            allNumeric = false;
            continue;
        }
        if (std::isdigit(byte)) {
            containsDigits = true;
        } else if (std::isalpha(byte)) {
            allNumeric = false;
        }
    }

    // Digits are flattened only for bucket 1; case is always flattened.
    const bool flattenNum = allNumeric && containsDigits;
    std::transform(str.begin(), str.end(), str.begin(),
        [flattenNum](char c) -> char {
            const auto byte = static_cast<unsigned char>(c);
            // Pass non-ASCII bytes through untouched so that a
            // locale-dependent isalpha/tolower cannot corrupt multi-byte
            // sequences.
            if (byte > 0x7F) return c;
            if (flattenNum && std::isdigit(byte)) return '0';
            if (std::isalpha(byte)) return static_cast<char>(std::tolower(byte));
            return c;
        });
}