void tokeniseString()

in include/model/CTokenListDataCategorizer.h [95:154]


    //! Tokenise \p str, translating each token found into a numeric ID and
    //! accumulating per-token weights.
    //!
    //! Tokens are runs of [a-zA-Z0-9/]+ characters; underscores, dots and
    //! dashes may additionally be accepted in the middle of a token,
    //! depending on the class's configuration flags (ALLOW_UNDERSCORE,
    //! ALLOW_DOT, ALLOW_DASH, ALLOW_FORWARD_SLASH).  If TRUNCATE_AT_NEWLINE
    //! is set, tokenisation stops at the first newline.
    //!
    //! \param fields Input fields, forwarded to considerToken() (presumably
    //!               used there to influence weighting - confirm in that
    //!               method).
    //! \param str The string to tokenise.
    //! \param[out] tokenIds Populated with the IDs of the tokens found, in
    //!                      order of occurrence.
    //! \param[out] tokenUniqueIds Populated by considerToken() for each
    //!                            distinct token.
    //! \param[out] totalWeight Accumulated by considerToken() across all
    //!                         tokens found.
    //! \param[out] minReweightedTotalWeight Accumulated by considerToken().
    //! \param[out] maxReweightedTotalWeight Accumulated by considerToken().
    void tokeniseString(const TStrStrUMap& fields,
                        const std::string& str,
                        TSizeSizePrVec& tokenIds,
                        TSizeSizeMap& tokenUniqueIds,
                        std::size_t& totalWeight,
                        std::size_t& minReweightedTotalWeight,
                        std::size_t& maxReweightedTotalWeight) override {
        tokenIds.clear();
        tokenUniqueIds.clear();
        totalWeight = 0;
        // Fix: these two outputs are accumulated into by considerToken() in
        // exactly the same way as totalWeight, so they must be reset here
        // too - otherwise stale values from a previous call leak into the
        // result when the caller reuses the same variables.
        minReweightedTotalWeight = 0;
        maxReweightedTotalWeight = 0;

        std::string temp;

        // TODO - make more efficient
        // Position within the current token of the most recent character
        // that proves the token cannot be a pure hex number
        // (npos => token could be entirely hex so far).
        std::string::size_type nonHexPos(std::string::npos);
        for (const char curChar : str) {

            if (TRUNCATE_AT_NEWLINE && curChar == '\n') {
                break;
            }

            // Basically tokenise into [a-zA-Z0-9/]+ strings, possibly
            // allowing underscores, dots and dashes in the middle
            if (std::isalnum(static_cast<unsigned char>(curChar)) ||
                (!temp.empty() && ((ALLOW_UNDERSCORE && curChar == '_') ||
                                   (ALLOW_DOT && curChar == '.') ||
                                   (ALLOW_DASH && curChar == '-'))) ||
                (ALLOW_FORWARD_SLASH && curChar == '/')) {
                temp += curChar;
                if (IGNORE_HEX) {
                    // Count dots and dashes as numeric
                    if (!std::isxdigit(static_cast<unsigned char>(curChar)) &&
                        curChar != '.' && curChar != '-') {
                        nonHexPos = temp.length() - 1;
                    }
                }
            } else {
                // Delimiter character: flush the token accumulated so far,
                // if any
                if (!temp.empty()) {
                    this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds,
                                        totalWeight, minReweightedTotalWeight,
                                        maxReweightedTotalWeight);
                    temp.clear();
                }

                if (IGNORE_HEX) {
                    nonHexPos = std::string::npos;
                }
            }
        }

        // Flush a final token that ran to the end of the input (or to the
        // truncation point)
        if (!temp.empty()) {
            this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds, totalWeight,
                                minReweightedTotalWeight, maxReweightedTotalWeight);
        }

        LOG_TRACE(<< str << " tokenised to " << tokenIds.size() << " tokens with total weight "
                  << totalWeight << ": " << SIdTranslater(*this, tokenIds, ' '));

        m_DictionaryWeightFunc.reset();
    }