in tensorflow_text/core/kernels/sentence_fragmenter_v2.cc [375:442]
bool FragmentBoundaryMatch::Advance(int index, absl::string_view slice) {
int temp_offset;
// By defualt offset is the next character.
int offset = 1;
bool no_transition = false;
bool is_terminal_punc = IsTerminalPunc(slice, &temp_offset);
if (is_terminal_punc) {
offset = temp_offset;
}
bool is_ellipsis = IsEllipsis(slice, &temp_offset);
if (is_ellipsis) {
offset = temp_offset;
}
bool is_close_punc = IsClosePunc(slice, &temp_offset);
if (is_close_punc) {
offset = temp_offset;
}
bool is_acronym = IsPeriodSeparatedAcronym(slice, &temp_offset);
if (is_acronym) {
is_terminal_punc = false;
offset = temp_offset;
}
bool is_emoticon = IsEmoticon(slice, &temp_offset);
if (is_emoticon) {
is_terminal_punc = false;
offset = temp_offset;
}
switch (state_) {
case INITIAL_STATE:
if (is_terminal_punc || is_acronym || is_emoticon) {
first_terminal_punc_index_ = index;
state_ = COLLECTING_TERMINAL_PUNC;
}
break;
case COLLECTING_TERMINAL_PUNC:
if (is_terminal_punc || is_emoticon) {
// Stay in COLLECTING_TERMINAL_PUNC state.
} else if (is_close_punc) {
first_close_punc_index_ = index;
state_ = COLLECTING_CLOSE_PUNC;
} else {
no_transition = true;
}
break;
case COLLECTING_CLOSE_PUNC:
if (is_close_punc || is_ellipsis || is_emoticon) {
// Stay in COLLECTING_CLOSE_PUNC state. We effectively ignore
// emoticons and ellipses and continue to accept closing punctuation
// after them.
} else {
no_transition = true;
}
break;
}
if (no_transition) {
return false;
} else {
limit_index_ = index + offset;
if (state_ == COLLECTING_TERMINAL_PUNC) {
// We've gotten terminal punctuation, but no close punctuation yet.
first_close_punc_index_ = limit_index_;
}
return true;
}
}