public static NKJPSegmentationDocument parse()

in opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java [92:159]


  /**
   * Reads an XML segmentation document from a stream and collects its
   * segments, grouped by sentence.
   *
   * <p>Sentence nodes are selected with the XPath
   * {@code /teiCorpus/TEI/text/body/p/s}. For every sentence, its direct
   * {@code seg} and {@code choice} children are visited in document order:
   * <ul>
   *   <li>a {@code seg} child is always recorded;</li>
   *   <li>inside a {@code choice}, a child named {@code NKJP_PAREN} contributes
   *       all of its direct {@code seg} children unless
   *       {@code checkRejectedParen} reports it as rejected, and a child named
   *       {@code SEG} is recorded unless {@code checkRejected} reports it as
   *       rejected.</li>
   * </ul>
   *
   * <p>Both the sentence map and each per-sentence segment map keep insertion
   * order. A sentence without the {@code XML_ID} attribute is stored under a
   * {@code null} key, so a later id-less sentence replaces an earlier one.
   *
   * @param is the stream holding the XML document to read
   * @return a document wrapping the collected sentence-to-segments mapping
   * @throws IOException if reading, XML parsing or XPath evaluation fails; the
   *         underlying exception is attached as the cause
   */
  public static NKJPSegmentationDocument parse(InputStream is) throws IOException {

    Map<String, Map<String, Pointer>> sentences = new LinkedHashMap<>();

    try {
      Document doc = XmlUtil.createDocumentBuilder().parse(is);

      XPath xpath = XPathFactory.newInstance().newXPath();
      final XPathExpression sentenceExpr = xpath.compile("/teiCorpus/TEI/text/body/p/s");
      final XPathExpression segOrChoiceExpr = xpath.compile("./seg|./choice");
      final XPathExpression segOnlyExpr = xpath.compile("./seg");

      NodeList sentenceNodes = (NodeList) sentenceExpr.evaluate(doc, XPathConstants.NODESET);

      for (int sentIdx = 0; sentIdx < sentenceNodes.getLength(); sentIdx++) {
        Node sentence = sentenceNodes.item(sentIdx);

        // The id stays null when the sentence carries no XML_ID attribute.
        Node idAttr = sentence.getAttributes().getNamedItem(XML_ID);
        String sentenceId = (idAttr == null) ? null : idAttr.getTextContent();

        Map<String, Pointer> segments = new LinkedHashMap<>();
        NodeList children = (NodeList) segOrChoiceExpr.evaluate(sentence, XPathConstants.NODESET);

        for (int childIdx = 0; childIdx < children.getLength(); childIdx++) {
          Node child = children.item(childIdx);
          String childName = child.getNodeName();

          if (childName.equals(SEG)) {
            // Plain segment directly under the sentence.
            String id = xmlID(child);
            segments.put(id, fromSeg(child));
          } else if (childName.equals(CHOICE)) {
            NodeList alternatives = child.getChildNodes();

            for (int altIdx = 0; altIdx < alternatives.getLength(); altIdx++) {
              Node alternative = alternatives.item(altIdx);
              String altName = alternative.getNodeName();

              if (altName.equals(NKJP_PAREN)) {
                if (checkRejectedParen(alternative)) {
                  // Rejected group: none of its segments are recorded.
                  continue;
                }
                NodeList grouped = (NodeList) segOnlyExpr.evaluate(alternative,
                    XPathConstants.NODESET);

                for (int segIdx = 0; segIdx < grouped.getLength(); segIdx++) {
                  Node groupedSeg = grouped.item(segIdx);
                  String id = xmlID(groupedSeg);
                  segments.put(id, fromSeg(groupedSeg));
                }
              } else if (altName.equals(SEG) && !checkRejected(alternative)) {
                String id = xmlID(alternative);
                segments.put(id, fromSeg(alternative));
              }
            }
          }
        }
        sentences.put(sentenceId, segments);
      }

    } catch (SAXException | XPathExpressionException | IOException e) {
      throw new IOException("Failed to parse NKJP document", e);
    }

    return new NKJPSegmentationDocument(sentences);
  }