protected Iterator createEvents()

in opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java [98:168]


  /**
   * Builds the split / no-split training events for a single {@link TokenSample}.
   * <p>
   * The stretch of text running from the start of the first annotated token to the end
   * of the last one is cut into whitespace-delimited candidates. Candidates that are at
   * most one character long, or that match {@code alphaNumeric} while
   * {@code skipAlphaNumerics} is set, produce no events. For every other candidate the
   * annotated tokens it contains are located, and for each of those tokens:
   * <ul>
   *   <li>a {@code TokenizerME.NO_SPLIT} event is added for every position strictly
   *       inside the token;</li>
   *   <li>a {@code TokenizerME.SPLIT} event is added at the token's end, unless that
   *       end is also the end of the candidate.</li>
   * </ul>
   * Annotated tokens that overlap a candidate without being contained in it are
   * reported through a warning and yield no events.
   *
   * @param tokenSample the sample supplying the text and its annotated token spans
   * @return an iterator over the collected events; it is empty when the sample has
   *         no token spans
   */
  protected Iterator<Event> createEvents(TokenSample tokenSample) {

    final List<Event> collected = new ArrayList<>(50);

    final Span[] goldSpans = tokenSample.getTokenSpans();
    final String fullText = tokenSample.getText();

    // Nothing annotated, nothing to learn from.
    if (goldSpans.length == 0) {
      return collected.iterator();
    }

    // Only the region covered by annotated tokens is considered.
    final int offset = goldSpans[0].getStart();
    final int limit = goldSpans[goldSpans.length - 1].getEnd();
    final String covered = fullText.substring(offset, limit);

    // Indices into goldSpans; lastGold carries over so each scan resumes after the
    // last annotated token that was matched to an earlier candidate.
    int firstGold = -1;
    int lastGold = -1;

    for (Span relative : WhitespaceTokenizer.INSTANCE.tokenizePos(covered)) {
      final String chunk = covered.substring(relative.getStart(), relative.getEnd());
      // Candidate offsets are relative to 'covered'; shift them to offsets in fullText.
      final Span candidate = new Span(relative.getStart() + offset, relative.getEnd() + offset);

      // Single characters cannot be split; alphanumeric chunks are skipped on request.
      if (chunk.length() <= 1 || (skipAlphaNumerics && alphaNumeric.matcher(chunk).matches())) {
        continue;
      }

      // Locate the run of annotated tokens lying inside this candidate.
      boolean matched = false;
      for (int g = lastGold + 1; g < goldSpans.length; g++) {
        final Span gold = goldSpans[g];

        if (candidate.contains(gold)) {
          if (!matched) {
            firstGold = g;
            matched = true;
          }
          lastGold = g;
          continue;
        }

        // The annotated token reaches past this candidate: stop scanning.
        if (candidate.getEnd() < gold.getEnd()) {
          break;
        }

        // A token ending before the candidate starts is simply passed over;
        // anything else overlaps the candidate without fitting inside it.
        if (gold.getEnd() >= candidate.getStart()) {
          logger.warn("Bad training token: {} cand: {} token={}", gold, candidate,
              fullText.substring(gold.getStart(), gold.getEnd()));
        }
      }

      if (!matched) {
        continue;
      }

      // Emit the events; context positions are relative to the candidate's start.
      final int candidateStart = candidate.getStart();
      for (int g = firstGold; g <= lastGold; g++) {
        final Span gold = goldSpans[g];

        for (int pos = gold.getStart() + 1; pos < gold.getEnd(); pos++) {
          collected.add(new Event(TokenizerME.NO_SPLIT, cg.getContext(chunk, pos - candidateStart)));
        }

        if (gold.getEnd() != candidate.getEnd()) {
          collected.add(
              new Event(TokenizerME.SPLIT, cg.getContext(chunk, gold.getEnd() - candidateStart)));
        }
      }
    }

    return collected.iterator();
  }