in include/model/CTokenListDataCategorizer.h [95:154]
//! Tokenise \p str, writing the sequence of token IDs to \p tokenIds and
//! the unique token ID -> count map to \p tokenUniqueIds.  Tokens are
//! runs of [a-zA-Z0-9]+ characters, extended by the class's policy
//! flags: underscores, dots and dashes may continue a token (never start
//! one), and forward slashes are allowed anywhere when
//! ALLOW_FORWARD_SLASH is set.  Tokenisation stops at the first newline
//! when TRUNCATE_AT_NEWLINE is set.  The three weight outputs are reset
//! here and then accumulated by considerToken() for each token found.
void tokeniseString(const TStrStrUMap& fields,
                    const std::string& str,
                    TSizeSizePrVec& tokenIds,
                    TSizeSizeMap& tokenUniqueIds,
                    std::size_t& totalWeight,
                    std::size_t& minReweightedTotalWeight,
                    std::size_t& maxReweightedTotalWeight) override {
    tokenIds.clear();
    tokenUniqueIds.clear();
    totalWeight = 0;
    // Zero the min/max reweighted totals too: like totalWeight they are
    // pure output parameters accumulated via considerToken(), so a stale
    // value left over from the caller would corrupt the result.
    minReweightedTotalWeight = 0;
    maxReweightedTotalWeight = 0;

    std::string temp;

    // TODO - make more efficient
    // Position within the current token of the last character that could
    // NOT be part of a hex number; npos means the token so far could be
    // entirely hex (dots and dashes are tolerated as "numeric" here).
    std::string::size_type nonHexPos(std::string::npos);
    for (const char curChar : str) {
        if (TRUNCATE_AT_NEWLINE && curChar == '\n') {
            break;
        }

        // Basically tokenise into [a-zA-Z0-9/]+ strings, possibly
        // allowing underscores, dots and dashes in the middle.
        // The cast to unsigned char is required: passing a negative char
        // to std::isalnum/std::isxdigit is undefined behaviour.
        if (std::isalnum(static_cast<unsigned char>(curChar)) ||
            (!temp.empty() && ((ALLOW_UNDERSCORE && curChar == '_') ||
                               (ALLOW_DOT && curChar == '.') ||
                               (ALLOW_DASH && curChar == '-'))) ||
            (ALLOW_FORWARD_SLASH && curChar == '/')) {
            temp += curChar;
            if (IGNORE_HEX) {
                // Count dots and dashes as numeric
                if (!std::isxdigit(static_cast<unsigned char>(curChar)) &&
                    curChar != '.' && curChar != '-') {
                    nonHexPos = temp.length() - 1;
                }
            }
        } else {
            // Delimiter character: flush the token accumulated so far.
            if (!temp.empty()) {
                this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds,
                                    totalWeight, minReweightedTotalWeight,
                                    maxReweightedTotalWeight);
                temp.clear();
            }

            if (IGNORE_HEX) {
                nonHexPos = std::string::npos;
            }
        }
    }

    // Flush the final token - the input need not end with a delimiter.
    if (!temp.empty()) {
        this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds, totalWeight,
                            minReweightedTotalWeight, maxReweightedTotalWeight);
    }

    LOG_TRACE(<< str << " tokenised to " << tokenIds.size() << " tokens with total weight "
              << totalWeight << ": " << SIdTranslater(*this, tokenIds, ' '));

    m_DictionaryWeightFunc.reset();
}