bool FragmentBoundaryMatch::Advance()

in tensorflow_text/core/kernels/sentence_fragmenter_v2.cc [375:442]


bool FragmentBoundaryMatch::Advance(int index, absl::string_view slice) {
  int temp_offset;
  // By defualt offset is the next character.
  int offset = 1;
  bool no_transition = false;
  bool is_terminal_punc = IsTerminalPunc(slice, &temp_offset);
  if (is_terminal_punc) {
    offset = temp_offset;
  }

  bool is_ellipsis = IsEllipsis(slice, &temp_offset);
  if (is_ellipsis) {
    offset = temp_offset;
  }
  bool is_close_punc = IsClosePunc(slice, &temp_offset);
  if (is_close_punc) {
    offset = temp_offset;
  }
  bool is_acronym = IsPeriodSeparatedAcronym(slice, &temp_offset);
  if (is_acronym) {
    is_terminal_punc = false;
    offset = temp_offset;
  }
  bool is_emoticon = IsEmoticon(slice, &temp_offset);
  if (is_emoticon) {
    is_terminal_punc = false;
    offset = temp_offset;
  }

  switch (state_) {
    case INITIAL_STATE:
      if (is_terminal_punc || is_acronym || is_emoticon) {
        first_terminal_punc_index_ = index;
        state_ = COLLECTING_TERMINAL_PUNC;
      }
      break;
    case COLLECTING_TERMINAL_PUNC:
      if (is_terminal_punc || is_emoticon) {
        // Stay in COLLECTING_TERMINAL_PUNC state.
      } else if (is_close_punc) {
        first_close_punc_index_ = index;
        state_ = COLLECTING_CLOSE_PUNC;
      } else {
        no_transition = true;
      }
      break;
    case COLLECTING_CLOSE_PUNC:
      if (is_close_punc || is_ellipsis || is_emoticon) {
        // Stay in COLLECTING_CLOSE_PUNC state. We effectively ignore
        // emoticons and ellipses and continue to accept closing punctuation
        // after them.
      } else {
        no_transition = true;
      }
      break;
  }

  if (no_transition) {
    return false;
  } else {
    limit_index_ = index + offset;
    if (state_ == COLLECTING_TERMINAL_PUNC) {
      // We've gotten terminal punctuation, but no close punctuation yet.
      first_close_punc_index_ = limit_index_;
    }
    return true;
  }
}