public TokenSample next()

in opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/en/TokenSampleStream.java [57:109]


  /**
   * Builds a {@link TokenSample} from the current {@code line} and then reads the following
   * line from {@code in}.
   *
   * <p>The line is split on whitespace. The bracket escapes {@code -LRB-}, {@code -LCB-},
   * {@code -RRB-} and {@code -RCB-} are replaced by {@code (}, <code>{</code>, {@code )} and
   * <code>}</code>. The tokens are then joined back into a sentence string, with a single space
   * inserted only where the rules below ask for one, and the character span of every token in
   * that string is recorded.
   *
   * <p>If reading the next line fails, the {@link IOException} is logged and {@code line} is
   * set to {@code null}.
   *
   * @return the sample holding the rebuilt sentence text and one span per token
   */
  public TokenSample next() {
    String[] tokens = line.split("\\s+");
    if (tokens.length == 0) {
      // split() produced no tokens at all: start over with the quote state.
      evenq = true;
    }
    StringBuilder sb = new StringBuilder(line.length());
    List<Span> spans = new ArrayList<>();
    // Offset in the rebuilt text at which the current token will start.
    int length = 0;
    for (int ti = 0; ti < tokens.length; ti++) {
      String token = tokens[ti];
      // NOTE(review): this is the raw previous token, so a preceding "-LRB-"/"-LCB-" is not
      // seen as "(" / "{" by the checks below - confirm whether that is intended.
      String lastToken = ti > 0 ? tokens[ti - 1] : "";
      token = switch (token) {
        case "-LRB-" -> "(";
        case "-LCB-" -> "{";
        case "-RRB-" -> ")";
        case "-RCB-" -> "}";
        default -> token;
      };
      if (sb.length() != 0) {
        if (!alphaNumeric.matcher(token).find() || token.startsWith("'") || token.equalsIgnoreCase("n't")) {
          // Tokens without an alphaNumeric match, and clitics such as 's / n't, attach to the
          // previous token. Only the opener-like tokens listed here get a leading space, and
          // not when they directly follow an opening bracket.
          // The bracket test must use &&: with || it was always true, since lastToken
          // cannot equal both "(" and "{".
          if ((token.equals("``") || token.equals("--") || token.equals("$") ||
              token.equals("(")  || token.equals("&")  || token.equals("#") ||
              (token.equals("\"") && (evenq && ti != tokens.length - 1)))
              && (!lastToken.equals("(") && !lastToken.equals("{"))) {
            length++;
          }
        }
        else {
          // Any other token gets a leading space unless the previous token is an opener
          // (``, bracket, $, #) or a double quote seen while evenq was false.
          if (!lastToken.equals("``") && (!lastToken.equals("\"") || evenq) && !lastToken.equals("(")
              && !lastToken.equals("{") && !lastToken.equals("$") && !lastToken.equals("#")) {
            length++;
          }
        }
      }
      if (token.equals("\"")) {
        // Flip the quote state on every double quote; a quote ending the line always
        // leaves the state at true.
        evenq = ti == tokens.length - 1 || !evenq;
      }
      if (sb.length() < length) {
        // A separating space was requested above; materialise it in the text.
        sb.append(' ');
      }
      sb.append(token);
      spans.add(new Span(length, length + token.length()));
      length += token.length();
    }

    try {
      line = in.readLine();
    } catch (IOException e) {
      logger.error(e.getLocalizedMessage(), e);
      // presumably a null line is how the rest of the class detects end of input - verify
      line = null;
    }
    return new TokenSample(sb.toString(),spans.toArray(new Span[0]));
  }