in torchaudio/csrc/decoder/src/dictionary/Utils.cpp [58:82]
/**
 * Splits a UTF-8 encoded word into its individual characters.
 *
 * Each returned token holds the bytes of exactly one UTF-8 sequence
 * (1 to 4 bytes long), so pure ASCII input yields one token per byte.
 *
 * @param word UTF-8 encoded string to split.
 * @return The characters of `word` in order; empty if `word` is empty.
 * @throws std::runtime_error if a byte is not a valid UTF-8 lead byte, if a
 *         sequence is cut short by the end of the string, or if a multi-byte
 *         sequence contains a byte that is not a continuation byte
 *         (bit pattern 10xxxxxx). Overlong encodings and surrogate code
 *         points are not rejected.
 */
std::vector<std::string> splitWrd(const std::string& word) {
  using size_type = std::string::size_type;

  std::vector<std::string> tokens;
  // Upper bound on the token count: one token per byte (pure ASCII input).
  tokens.reserve(word.size());

  const size_type len = word.size();
  for (size_type i = 0; i < len;) {
    const auto c = static_cast<unsigned char>(word[i]);

    // Length of the sequence as announced by the lead byte; 0 means the byte
    // cannot start a UTF-8 sequence. ASCII falls out of the first branch.
    size_type curTknBytes = 0;
    if ((c & 0x80) == 0) {
      curTknBytes = 1;
    } else if ((c & 0xE0) == 0xC0) {
      curTknBytes = 2;
    } else if ((c & 0xF0) == 0xE0) {
      curTknBytes = 3;
    } else if ((c & 0xF8) == 0xF0) {
      curTknBytes = 4;
    }

    // `len - i` is the number of bytes left (always >= 1 here), so this
    // comparison cannot overflow the way `i + curTknBytes` could.
    if (curTknBytes == 0 || curTknBytes > len - i) {
      throw std::runtime_error("splitWrd: invalid UTF-8 : " + word);
    }

    // Every byte after the lead byte must be a continuation byte; otherwise
    // an unrelated character would be silently swallowed into this token.
    for (size_type k = 1; k < curTknBytes; ++k) {
      const auto next = static_cast<unsigned char>(word[i + k]);
      if ((next & 0xC0) != 0x80) {
        throw std::runtime_error("splitWrd: invalid UTF-8 : " + word);
      }
    }

    tokens.emplace_back(word, i, curTknBytes);
    i += curTknBytes;
  }
  return tokens;
}