in tensorflow_text/core/kernels/sentence_fragmenter.cc [80:140]
// Feeds the token at `index` of `document` to the boundary state machine.
//
// Returns the first error produced by the UnicodeUtil classification calls,
// in which case neither *result nor any member has been modified. Otherwise
// returns OK and sets *result:
//   * true  - the token was consumed: the state was kept or advanced and
//             limit_index_ now points one past `index`.
//   * false - the current state has no transition for this token; no member
//             is modified.
//
// Note that in INITIAL_STATE a token that does not start a terminal sequence
// is still consumed (*result is true and limit_index_ advances); only the
// two collecting states can reject a token.
Status Advance(const UnicodeUtil *util, const Document &document, int index,
               bool *result) {
  const Token &current = document.tokens()[index];
  const tstring &text = current.word();

  // Classify the token text up front. Any failure returns before the state
  // machine is touched.
  bool terminal_punc = false;
  bool ellipsis = false;
  bool close_punc = false;
  TF_RETURN_IF_ERROR(util->IsTerminalPunc(text, &terminal_punc));
  TF_RETURN_IF_ERROR(util->IsEllipsis(text, &ellipsis));
  TF_RETURN_IF_ERROR(util->IsClosePunc(text, &close_punc));

  // Whether the current state accepts this token.
  bool consumed = true;

  if (state_ == INITIAL_STATE) {
    // Terminal punctuation, an ellipsis, a period-separated acronym or an
    // emoticon opens the terminal sequence; anything else leaves the state
    // untouched but is still consumed.
    if (terminal_punc || ellipsis || IsPeriodSeparatedAcronym(current) ||
        TokenHasProperty(Token::EMOTICON, current)) {
      first_terminal_punc_index_ = index;
      state_ = COLLECTING_TERMINAL_PUNC;
    }
  } else if (state_ == COLLECTING_TERMINAL_PUNC) {
    const bool more_terminal = terminal_punc || ellipsis ||
                               TokenHasProperty(Token::EMOTICON, current);
    if (!more_terminal) {
      if (close_punc) {
        // First closing punctuation after the terminal sequence.
        first_close_punc_index_ = index;
        state_ = COLLECTING_CLOSE_PUNC;
      } else {
        consumed = false;
      }
    }
  } else if (state_ == COLLECTING_CLOSE_PUNC) {
    // Emoticons and ellipses are effectively ignored here: they do not end
    // the run, and closing punctuation is still accepted after them.
    consumed = close_punc || ellipsis ||
               TokenHasProperty(Token::EMOTICON, current);
  }

  if (!consumed) {
    *result = false;
    return Status::OK();
  }

  limit_index_ = index + 1;
  if (state_ == COLLECTING_TERMINAL_PUNC) {
    // Terminal punctuation seen but no closing punctuation yet, so the
    // close-punctuation start tracks the current limit.
    first_close_punc_index_ = limit_index_;
  }
  *result = true;
  return Status::OK();
}