in tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc [184:292]
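// Tokenizes `input_text` into wordpieces in a single left-to-right pass,
// using the trie's failure links to avoid re-scanning characters. Results are
// appended to the output vectors; which outputs are populated is controlled
// by the `kGetPieces`, `kGetIds`, and `kGetOffsets` template parameters.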
template <bool kGetPieces, bool kGetIds, bool kGetOffsets>
void FastWordpieceTokenizer::TokenizeTextImpl(
absl::string_view input_text, std::vector<std::string>* output_pieces,
std::vector<int>* output_ids, std::vector<int>* output_start_offsets,
std::vector<int>* output_end_offsets) const {
static_assert(kGetPieces || kGetIds,
"At least one of `kGetPieces` and `kGetIds` should be true.");
if (input_text.empty()) {
return;
}
const int input_size = input_text.size();
int next_pos = 0;
int cur_pos = 0;
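  // Number of tokens already present in the outputs; used to roll the outputs
  // back when a word cannot be tokenized and must be replaced by unk_token.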
int original_num_tokens =
GetCurrentOutputSize<kGetPieces>(output_pieces, output_ids);
  // Zero-initialized so that the `prev_unicode_char = cur_unicode_char`
  // assignment below never reads an indeterminate value on the very first
  // iteration.
  UChar32 prev_unicode_char = 0;
  UChar32 cur_unicode_char = 0;
while (cur_pos < input_size) {
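    // Per-word state: the current offset within the word (used to compute
    // token offsets), the trie traversal cursor, the number of word bytes
    // consumed so far, and the byte offset in `input_text` where the word
    // starts.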
int cur_offset_in_input_word = 0;
// Tokenize the word starting at the current position.
auto cur_node = trie_->CreateTraversalCursorPointToRoot();
int word_byte_length_so_far = 0;
int input_word_offset_in_text = cur_pos;
absl::string_view input_substr = input_text.substr(cur_pos);
    // The trie matching loop below tokenizes and recognizes word pieces until
    // 1. it steps over the end of the input, or
    // 2. the current word would exceed `max_bytes_per_token`, or
    // 3. it sees a whitespace, punctuation, or unknown character.
while (cur_pos < input_size) {
prev_unicode_char = cur_unicode_char;
next_pos = cur_pos;
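      // Decode one UTF-8 character starting at `cur_pos`; U8_NEXT advances
      // `next_pos` past it and stores the code point in `cur_unicode_char`.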
U8_NEXT(input_text, next_pos, input_text.length(), cur_unicode_char);
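      // Stop matching if appending this character would make the word exceed
      // `max_bytes_per_token`; how to finish the word is decided after the
      // loop.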
if (word_byte_length_so_far + next_pos - cur_pos >
config_->max_bytes_per_token())
break;
// Try matching one Unicode character from here.
while (!trie_->TryTraverseSeveralSteps(
cur_node, input_text.substr(cur_pos, next_pos - cur_pos))) {
        // The trie cannot consume the whole Unicode character. We need to pop
        // one or more longest-matching tokens off the beginning of the string
        // represented by the current node, and then transition to the node
        // pointed to by the failure link, which represents the remaining
        // suffix string after those prefix tokens have been popped.
        //
        // For example, if the current node is "abcdef" and we need to pop
        // "ab" and "##cd" off the beginning, the failure link points to the
        // node that represents "##ef".
if (!TryFollowFailureLinkAndCollectTokens<kGetPieces, kGetIds,
kGetOffsets>(
input_substr, input_word_offset_in_text,
cur_offset_in_input_word, cur_node, output_pieces, output_ids,
output_start_offsets, output_end_offsets)) {
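          // No failure link to follow: trie matching cannot make further
          // progress on this word. Exit the matching loop and decide below
          // whether we stopped at a word boundary or at an unknown word.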
goto outside_trie_match_loop;
}
}
      // The trie consumed the whole Unicode character and traversed to a new
      // node. Move the cursor forward to match the next character.
word_byte_length_so_far += next_pos - cur_pos;
cur_pos = next_pos;
}
outside_trie_match_loop:
if (cur_pos >= input_size) {
// Collect the remaining tokens stored on a path on the trie.
HandleTheRemainingStringOnTriePath<kGetPieces, kGetIds, kGetOffsets>(
input_substr, input_word_offset_in_text, cur_node,
original_num_tokens, cur_offset_in_input_word, output_pieces,
output_ids, output_start_offsets, output_end_offsets);
// Break as we've finished all characters.
break;
}
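    // The trie match stopped before the end of the input. Check whether it
    // stopped at a word boundary: the current character is whitespace,
    // punctuation, or a Chinese character, or the previous character was a
    // punctuation / Chinese character (so the current character starts a new
    // word).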
bool is_white_space = u_isUWhiteSpace(cur_unicode_char);
if (is_white_space ||
fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar(
cur_unicode_char) ||
(cur_pos && fast_wordpiece_tokenizer_utils::IsPunctuationOrChineseChar(
prev_unicode_char))) {
      // The current Unicode character is a word boundary: collect the
      // remaining tokens stored along the current path on the trie.
HandleTheRemainingStringOnTriePath<kGetPieces, kGetIds, kGetOffsets>(
absl::string_view(input_substr.data(),
cur_pos - input_word_offset_in_text),
input_word_offset_in_text, cur_node, original_num_tokens,
cur_offset_in_input_word, output_pieces, output_ids,
output_start_offsets, output_end_offsets);
// Skip the whitespace.
if (is_white_space) cur_pos = next_pos;
// Continue in the outer while loop to process the remaining input.
continue;
}
    // Note that the code is still correct even if the following line is
    // removed (i.e., Mutants is right to flag it as non-essential). We keep it
    // for efficiency: we have already tested the current character and it is
    // neither whitespace nor punctuation, so it is safe to skip it here rather
    // than test it again in the function called below.
cur_pos = next_pos;
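    // Skip the rest of the current word (and any trailing whitespace),
    // advancing `cur_pos`; `end_of_word` marks where the word itself ends and
    // is used below to compute the span of the unk_token.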
int end_of_word =
SkipTheRemainingOfWordAndTrailingWhiteSpaces(input_text, cur_pos);
    // The current character is not a word boundary: we are at the start or in
    // the middle of a word that contains unknown characters or exceeds the
    // length limit. Map the entire word to unk_token, skip the remaining
    // portion, and continue.
ResetOutputAppendUnknownToken<kGetPieces, kGetIds, kGetOffsets>(
input_word_offset_in_text, (end_of_word - input_word_offset_in_text),
original_num_tokens, output_pieces, output_ids, output_start_offsets,
output_end_offsets);
}
}