void FastWordpieceTokenizer::TokenizeTextImpl()

in tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc [184:292]


template <bool kGetPieces, bool kGetIds, bool kGetOffsets>
void FastWordpieceTokenizer::TokenizeTextImpl(
    absl::string_view input_text, std::vector<std::string>* output_pieces,
    std::vector<int>* output_ids, std::vector<int>* output_start_offsets,
    std::vector<int>* output_end_offsets) const {
  static_assert(kGetPieces || kGetIds,
                "At least one of `kGetPieces` and `kGetIds` should be true.");
  if (input_text.empty()) {
    return;
  }
  const int input_size = input_text.size();
  int next_pos = 0;
  int cur_pos = 0;
  int original_num_tokens =
      GetCurrentOutputSize<kGetPieces>(output_pieces, output_ids);
  UChar32 prev_unicode_char = 0;
  UChar32 cur_unicode_char = 0;
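  // Note: `prev_unicode_char` is read at the word-boundary check further
  // below only when `cur_pos > 0`, i.e., after at least one character has
  // been consumed, so it holds a real code point whenever it is examined.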
  while (cur_pos < input_size) {
    int cur_offset_in_input_word = 0;
    // Tokenize the word starting at the current position.
    auto cur_node = trie_->CreateTraversalCursorPointToRoot();
    int word_byte_length_so_far = 0;
    int input_word_offset_in_text = cur_pos;
    absl::string_view input_substr = input_text.substr(cur_pos);
    // The trie matching loop below tokenizes and recognizes word pieces until
    //  1. it steps over the input boundary, or
    //  2. consuming the next character would push the word's length past
    //     'max_bytes_per_token', or
    //  3. it sees a whitespace / punctuation / unknown character.
    while (cur_pos < input_size) {
      prev_unicode_char = cur_unicode_char;
      next_pos = cur_pos;
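      // ICU's U8_NEXT macro decodes the UTF-8 code point starting at
      // `next_pos`, advances `next_pos` past it, and stores the code point
      // in `cur_unicode_char` (a negative value on malformed input).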
      U8_NEXT(input_text, next_pos, input_text.length(), cur_unicode_char);

      if (word_byte_length_so_far + next_pos - cur_pos >
          config_->max_bytes_per_token())
        break;
      // Try matching one Unicode character from here.
      while (!trie_->TryTraverseSeveralSteps(
          cur_node, input_text.substr(cur_pos, next_pos - cur_pos))) {
        // The trie cannot consume the whole Unicode character. We need to pop
        // one or more longest-matching tokens off the beginning of the string
        // represented by the current node. We then transition to the node
        // pointed to by the failure link, which represents the suffix string
        // remaining after those matching prefix tokens are popped.
        //
        // For example, if the current node is "abcdef", and we need to pop
        // "ab", and "##cd" off the beginning, the failure link points to the
        // node that represents "##ef".
        if (!TryFollowFailureLinkAndCollectTokens<kGetPieces, kGetIds,
                                                  kGetOffsets>(
                input_substr, input_word_offset_in_text,
                cur_offset_in_input_word, cur_node, output_pieces, output_ids,
                output_start_offsets, output_end_offsets)) {
          goto outside_trie_match_loop;
        }
      }
      // The trie consumed the whole Unicode char and traversed to a new node.
      // Move the cursor forward to match the next character.
      word_byte_length_so_far += next_pos - cur_pos;
      cur_pos = next_pos;
    }
  outside_trie_match_loop:
    if (cur_pos >= input_size) {
      // Collect the remaining tokens stored along the current trie path.
      HandleTheRemainingStringOnTriePath<kGetPieces, kGetIds, kGetOffsets>(
          input_substr, input_word_offset_in_text, cur_node,
          original_num_tokens, cur_offset_in_input_word, output_pieces,
          output_ids, output_start_offsets, output_end_offsets);
      // Break as we've finished all characters.
      break;
    }
    bool is_white_space = u_isUWhiteSpace(cur_unicode_char);
    if (is_white_space ||
        fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar(
            cur_unicode_char) ||
        (cur_pos && fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar(
                        prev_unicode_char))) {
      // If the current Unicode character is a valid word boundary, collect
      // the remaining tokens stored along the current trie path.
      HandleTheRemainingStringOnTriePath<kGetPieces, kGetIds, kGetOffsets>(
          absl::string_view(input_substr.data(),
                            cur_pos - input_word_offset_in_text),
          input_word_offset_in_text, cur_node, original_num_tokens,
          cur_offset_in_input_word, output_pieces, output_ids,
          output_start_offsets, output_end_offsets);
      // Skip the whitespace.
      if (is_white_space) cur_pos = next_pos;
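      // Unlike whitespace, a punctuation boundary is not skipped: cur_pos is
      // left in place, so the next outer-loop iteration starts a fresh word
      // there and the punctuation is emitted as a single-character token.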
      // Continue in the outer while loop to process the remaining input.
      continue;
    }

    // Note that the code is still correct with the following line removed
    // (i.e., Mutants is right). We keep the line for efficiency: we have
    // already tested the current char and know it is neither a whitespace
    // nor a punctuation char, so it is safe to skip it; we don't want to
    // test it again in the function called below.
    cur_pos = next_pos;
    int end_of_word =
        SkipTheRemainingOfWordAndTrailingWhiteSpaces(input_text, cur_pos);
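    // `end_of_word` marks the end of the current (unmatchable) word; it is
    // used below to compute the byte span covered by the single unk_token.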

    // The current character is not a word boundary: we are at the start or
    // in the middle of a word that contains unknown characters or exceeds
    // the length limit. We map the entire word to unk_token, skip the rest
    // of the word, and continue.
    ResetOutputAppendUnknownToken<kGetPieces, kGetIds, kGetOffsets>(
        input_word_offset_in_text, (end_of_word - input_word_offset_in_text),
        original_num_tokens, output_pieces, output_ids, output_start_offsets,
        output_end_offsets);
  }
}
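
For context, the trie walk above implements the same longest-match-first
(MaxMatch) semantics as classic WordPiece, but in a single left-to-right
pass: the failure links pop exactly the pieces a greedy re-scan would emit,
so each input byte is inspected a constant number of times. The sketch below
is a minimal, self-contained reference for those semantics only; it is the
naive quadratic greedy matcher, not the library's linear-time algorithm, and
`kVocab` and `NaiveWordpiece` are hypothetical names with a made-up
vocabulary, used purely for illustration.

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Toy vocabulary: word-initial pieces appear as-is; word-internal pieces
// carry the "##" continuation prefix, mirroring WordPiece conventions.
const std::unordered_set<std::string> kVocab = {"un", "##aff", "##able", "a"};

// Greedy longest-match-first WordPiece over a single word. Returns false if
// some suffix cannot be matched, in which case the caller would emit
// unk_token for the whole word (as TokenizeTextImpl does above).
bool NaiveWordpiece(const std::string& word, std::vector<std::string>* out) {
  size_t start = 0;
  while (start < word.size()) {
    size_t end = word.size();
    std::string piece;
    // Scan backwards to find the longest vocabulary entry starting at
    // `start`; this re-scanning is what makes the naive version quadratic.
    for (; end > start; --end) {
      std::string candidate = word.substr(start, end - start);
      if (start > 0) candidate = "##" + candidate;
      if (kVocab.count(candidate)) {
        piece = candidate;
        break;
      }
    }
    if (piece.empty()) return false;  // Unmatchable suffix.
    out->push_back(piece);
    start = end;
  }
  return true;
}

int main() {
  std::vector<std::string> pieces;
  if (NaiveWordpiece("unaffable", &pieces)) {
    // Prints: un ##aff ##able
    for (const auto& p : pieces) std::cout << p << " ";
    std::cout << "\n";
  }
  return 0;
}

In the linear-time routine above, the backward re-scan disappears: when the
trie cannot extend by the next character, TryFollowFailureLinkAndCollectTokens
pops the longest matching prefix pieces in one step and resumes from the
suffix node, which is why cur_pos never moves backwards.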