public NameSample read()

in opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java [100:171]


  /**
   * Returns the next buffered {@link NameSample}, refilling the buffer from the
   * underlying document stream when it is empty.
   * <p>
   * Each document read from {@code samples} is split into lines and every line
   * (other than the document open/close tag lines) becomes one sample. Documents
   * that produce no samples are skipped, so {@code null} is only returned once
   * {@code samples} itself is exhausted.
   *
   * @return the next sample, or {@code null} if {@code samples.read()} returned
   *         {@code null} while the buffer was empty
   * @throws IOException if reading from the underlying stream or from the
   *         document text fails
   */
  public NameSample read() throws IOException {

    // A document may legitimately contribute zero samples (e.g. it holds only
    // the open/close tag lines). Returning null in that case would signal end
    // of stream prematurely, so keep reading until something is buffered.
    while (nameSamples.isEmpty()) {
      String doc = samples.read();
      if (doc == null) {
        return null;
      }
      parseDocument(doc);
    }

    return nameSamples.remove(0);
  }

  /**
   * Parses one document and appends a sample per sentence line to {@code nameSamples}.
   * Only the first sample of the document is created with the clear-adaptive-data flag set.
   */
  private void parseDocument(String doc) throws IOException {
    boolean clearAdaptiveData = true;
    try (BufferedReader docIn = new BufferedReader(new StringReader(doc))) {
      String line;
      while ((line = docIn.readLine()) != null) {

        if (line.startsWith(TAG_DOC_OPEN)) {
          continue;
        }
        // Nothing after the closing document tag is processed.
        if (line.equals(TAG_DOC_CLOSE)) {
          break;
        }

        nameSamples.add(parseSentence(line, clearAdaptiveData));
        clearAdaptiveData = false;
      }
    }
  }

  /**
   * Converts one whitespace-tokenized line with inline entity markup into a sample,
   * dropping the tokens that belong to an opening entity tag and recording entity spans.
   */
  private NameSample parseSentence(String line, boolean clearAdaptiveData) {
    String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(line);
    List<Span> entities = new ArrayList<>();
    List<String> cleanedTokens = new ArrayList<>(tokens.length);

    // tokenIndex counts only tokens that are kept in cleanedTokens, so span
    // offsets refer to the cleaned sentence rather than the raw line.
    int tokenIndex = 0;
    int entityBeginIndex = -1;
    String entityType = null;
    boolean insideStartEnamexTag = false;

    for (String token : tokens) {

      // The opening tag is split by whitespace: its attributes arrive in the
      // following tokens, so this token carries no text and is dropped.
      if (token.startsWith(TAG_ENAMEX_OPEN)) {
        insideStartEnamexTag = true;
        continue;
      }

      if (insideStartEnamexTag) {
        if (token.startsWith(TYPE)) {
          // The type value runs from the end of the TYPE prefix to the next quote.
          int typeEnd = token.indexOf("\"", TYPE.length());
          entityType = StringUtil.toLowerCase(token.substring(TYPE.length(), typeEnd));
        }

        // Attribute-only token: still inside the opening tag, nothing to keep.
        if (!token.contains(SYMBOL_CLOSE)) {
          continue;
        }

        // The tag closes within this token, so the token is kept and is the
        // first token of the entity.
        entityBeginIndex = tokenIndex;
        insideStartEnamexTag = false;
      }

      if (token.endsWith(TAG_ENAMEX_CLOSE)) {
        // Span end is exclusive, hence tokenIndex + 1.
        entities.add(new Span(entityBeginIndex, tokenIndex + 1, entityType));
        entityBeginIndex = -1;
      }

      cleanedTokens.add(convertToken(token));
      tokenIndex++;
    }

    return new NameSample(cleanedTokens.toArray(new String[0]),
        entities.toArray(new Span[0]), clearAdaptiveData);
  }