std::vector splitWrd()

in torchaudio/csrc/decoder/src/dictionary/Utils.cpp [58:82]


std::vector<std::string> splitWrd(const std::string& word) {
  std::vector<std::string> tokens;
  tokens.reserve(word.size());
  int len = word.length();
  for (int i = 0; i < len;) {
    auto c = static_cast<unsigned char>(word[i]);
    int curTknBytes = -1;
    // UTF-8 checks, works for ASCII automatically
    if ((c & 0x80) == 0) {
      curTknBytes = 1;
    } else if ((c & 0xE0) == 0xC0) {
      curTknBytes = 2;
    } else if ((c & 0xF0) == 0xE0) {
      curTknBytes = 3;
    } else if ((c & 0xF8) == 0xF0) {
      curTknBytes = 4;
    }
    if (curTknBytes == -1 || i + curTknBytes > len) {
      throw std::runtime_error("splitWrd: invalid UTF-8 : " + word);
    }
    tokens.emplace_back(word.begin() + i, word.begin() + i + curTknBytes);
    i += curTknBytes;
  }
  return tokens;
}