public static IrishSentenceBankDocument parse()

in opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java [178:292]


  /**
   * Parses an Irish Sentence Bank XML document from the given stream.
   * <p>
   * The root element must be named {@code sentences} (compared case-insensitively).
   * Every {@code sentence} child element is converted into an
   * {@link IrishSentenceBankSentence} and added to the returned document;
   * text and comment nodes between sentences are skipped.
   *
   * @param is the stream to read the XML document from
   * @return the document holding all parsed sentences
   * @throws IOException if the XML cannot be parsed, an unexpected node is encountered,
   *     a required attribute is missing, or a {@code slot} value is invalid
   */
  public static IrishSentenceBankDocument parse(InputStream is) throws IOException {
    IrishSentenceBankDocument document = new IrishSentenceBankDocument();

    try {
      DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
      Document doc = docBuilder.parse(is);

      String root = doc.getDocumentElement().getNodeName();
      if (!root.equalsIgnoreCase("sentences")) {
        throw new IOException("Expected root node 'sentences' but found '" + root + "'");
      }

      NodeList nl = doc.getDocumentElement().getChildNodes();
      for (int i = 0; i < nl.getLength(); i++) {
        Node sentnode = nl.item(i);
        String nodeName = sentnode.getNodeName();
        if (nodeName.equals("sentence")) {
          document.add(parseSentence(sentnode));
        } else if (!nodeName.equals("#text") && !nodeName.equals("#comment")) {
          throw new IOException("Unexpected node: " + nodeName);
        }
      }
      return document;
    } catch (SAXException e) {
      throw new IOException("Failed to parse IrishSentenceBank document", e);
    }
  }

  /**
   * Converts a single {@code sentence} element into an {@link IrishSentenceBankSentence}.
   *
   * @param sentnode the {@code sentence} element
   * @return the parsed sentence; its flex array is {@code null} when at least one
   *     token slot has no matching {@code flex} element
   * @throws IOException if the element contains unexpected or malformed content
   */
  private static IrishSentenceBankSentence parseSentence(Node sentnode) throws IOException {
    String src = requireAttribute(sentnode, "source");
    String trans = "";
    Map<Integer, String> toks = new HashMap<>();
    Map<Integer, List<String>> flx = new HashMap<>();
    List<Span> spans = new ArrayList<>();
    StringBuilder orig = new StringBuilder();
    // Highest slot number seen so far; determines the size of the flex array (at least 1).
    int flexes = 1;

    NodeList sentnl = sentnode.getChildNodes();
    for (int j = 0; j < sentnl.getLength(); j++) {
      final Node child = sentnl.item(j);
      final String name = child.getNodeName();
      switch (name) {
        case "flex": {
          int flexslot = parseSlot(child);
          flexes = Math.max(flexes, flexslot);
          // Several <flex> elements may share a slot; collect all their lemmas.
          flx.computeIfAbsent(flexslot, key -> new ArrayList<>())
              .add(requireAttribute(child, "lemma"));
          break;
        }

        case "translation": {
          // An empty <translation/> has no child node; treat it as an empty translation.
          Node first = child.getFirstChild();
          trans = first == null ? "" : first.getTextContent();
          break;
        }

        case "original": {
          // Running character offset into the reconstructed original text.
          int last = 0;
          NodeList orignl = child.getChildNodes();
          for (int k = 0; k < orignl.getLength(); k++) {
            Node part = orignl.item(k);
            switch (part.getNodeName()) {
              case "token": {
                Node content = part.getFirstChild();
                if (content == null) {
                  throw new IOException("Empty token element in sentence from source: " + src);
                }
                String tmptok = content.getTextContent();
                spans.add(new Span(last, last + tmptok.length()));

                int tokslot = parseSlot(part);
                flexes = Math.max(flexes, tokslot);

                toks.put(tokslot, tmptok);
                orig.append(tmptok);
                last += tmptok.length();
                break;
              }

              case "#text": {
                String tmptxt = part.getTextContent();
                orig.append(tmptxt);

                // A lone space only separates tokens; any other text gets its own span.
                if (!" ".equals(tmptxt)) {
                  spans.add(new Span(advanceLeft(tmptxt, last), advanceRight(tmptxt, last)));
                }

                last += tmptxt.length();
                break;
              }

              default:
                throw new IOException("Unexpected node: " + part.getNodeName());
            }
          }
          break;
        }

        case "#text":
        case "#comment":
          break;

        default:
          throw new IOException("Unexpected node: " + name);
      }
    }

    IrishSentenceBankFlex[] flexa = new IrishSentenceBankFlex[flexes];
    for (Map.Entry<Integer, String> entry : toks.entrySet()) {
      final int flexidx = entry.getKey();
      final List<String> lemmas = flx.get(flexidx);
      if (lemmas == null) {
        // A token without a matching <flex>: the sentence carries no flex information at all.
        flexa = null;
        break;
      }
      // Slots are used as 1-based array positions; anything lower cannot be stored.
      if (flexidx < 1) {
        throw new IOException("Invalid slot " + flexidx + " in sentence from source: " + src);
      }
      flexa[flexidx - 1] = new IrishSentenceBankFlex(entry.getValue(), lemmas.toArray(new String[0]));
    }

    Span[] spanout = spans.toArray(new Span[0]);
    return new IrishSentenceBankSentence(src, trans, orig.toString(), spanout, flexa);
  }

  /**
   * Returns the value of a mandatory attribute.
   *
   * @param node the node carrying the attribute
   * @param attribute the attribute name
   * @return the attribute value
   * @throws IOException if the node does not have the attribute
   */
  private static String requireAttribute(Node node, String attribute) throws IOException {
    Node attr = node.getAttributes() == null ? null : node.getAttributes().getNamedItem(attribute);
    if (attr == null) {
      throw new IOException(
          "Missing attribute '" + attribute + "' on node: " + node.getNodeName());
    }
    return attr.getNodeValue();
  }

  /**
   * Reads the mandatory {@code slot} attribute of a node as an integer.
   *
   * @param node the node carrying the {@code slot} attribute
   * @return the parsed slot number
   * @throws IOException if the attribute is missing or is not a valid integer
   */
  private static int parseSlot(Node node) throws IOException {
    String raw = requireAttribute(node, "slot");
    try {
      return Integer.parseInt(raw);
    } catch (NumberFormatException e) {
      throw new IOException(
          "Invalid slot value '" + raw + "' on node: " + node.getNodeName(), e);
    }
  }