in opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java [178:292]
/**
 * Parses an Irish Sentence Bank XML document from the given stream.
 * <p>
 * The root element must be named {@code sentences} (compared case-insensitively).
 * Every {@code sentence} child is turned into one {@code IrishSentenceBankSentence}
 * built from:
 * <ul>
 *   <li>its {@code source} attribute,</li>
 *   <li>the text of the first child of its {@code translation} element
 *       (empty string if the element is absent or empty),</li>
 *   <li>the concatenated text of its {@code original} element, together with one
 *       span per {@code token} and per text run that is not a single space,</li>
 *   <li>its {@code flex} elements, grouped by their {@code slot} attribute and paired
 *       with the token carrying the same slot. If any token has no matching
 *       {@code flex}, the sentence is added with a {@code null} flex array.</li>
 * </ul>
 * Text and comment nodes between these elements are ignored.
 *
 * @param is the stream to read the XML from
 * @return the document holding all parsed sentences
 * @throws IOException if the XML is not well-formed, the root element is not
 *         {@code sentences}, an unexpected node is encountered, a required attribute
 *         is missing, a {@code token} is empty, or a {@code slot} is not a usable
 *         positive integer
 */
public static IrishSentenceBankDocument parse(InputStream is) throws IOException {
  IrishSentenceBankDocument document = new IrishSentenceBankDocument();

  try {
    DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
    Document doc = docBuilder.parse(is);

    String root = doc.getDocumentElement().getNodeName();
    if (!root.equalsIgnoreCase("sentences")) {
      throw new IOException("Expected root node 'sentences' but found: " + root);
    }

    NodeList nl = doc.getDocumentElement().getChildNodes();
    for (int i = 0; i < nl.getLength(); i++) {
      Node sentnode = nl.item(i);
      String sentname = sentnode.getNodeName();

      if (sentname.equals("sentence")) {
        String src = requireAttribute(sentnode, "source");
        String trans = "";
        // slot -> surface form of the token occupying that slot
        Map<Integer, String> toks = new HashMap<>();
        // slot -> lemmas of all <flex> elements declared for that slot
        Map<Integer, List<String>> flx = new HashMap<>();
        List<Span> spans = new ArrayList<>();
        // Highest slot number seen so far; sizes the flex array (never below 1).
        int flexes = 1;
        StringBuilder orig = new StringBuilder();

        NodeList sentnl = sentnode.getChildNodes();
        for (int j = 0; j < sentnl.getLength(); j++) {
          final Node child = sentnl.item(j);
          final String name = child.getNodeName();
          switch (name) {
            case "flex": {
              int flexslot = parseSlot(child);
              if (flexslot > flexes) {
                flexes = flexslot;
              }
              String lemma = requireAttribute(child, "lemma");
              flx.computeIfAbsent(flexslot, key -> new ArrayList<>()).add(lemma);
              break;
            }
            case "translation": {
              // An empty <translation/> has no child node; fall back to the empty string
              // instead of dereferencing null.
              Node transText = child.getFirstChild();
              trans = transText == null ? "" : transText.getTextContent();
              break;
            }
            case "original": {
              // Running character offset into the reconstructed original text.
              int last = 0;
              NodeList orignl = child.getChildNodes();
              for (int k = 0; k < orignl.getLength(); k++) {
                Node part = orignl.item(k);
                switch (part.getNodeName()) {
                  case "token": {
                    Node tokText = part.getFirstChild();
                    if (tokText == null) {
                      throw new IOException("Empty token element in sentence: " + src);
                    }
                    String tmptok = tokText.getTextContent();
                    spans.add(new Span(last, last + tmptok.length()));
                    int tokslot = parseSlot(part);
                    if (tokslot > flexes) {
                      flexes = tokslot;
                    }
                    // A later token with the same slot replaces the earlier one.
                    toks.put(tokslot, tmptok);
                    orig.append(tmptok);
                    last += tmptok.length();
                    break;
                  }
                  case "#text": {
                    String tmptxt = part.getTextContent();
                    orig.append(tmptxt);
                    // A lone single space gets no span of its own; any other text run
                    // gets a span whose bounds are computed by advanceLeft/advanceRight.
                    if (!" ".equals(tmptxt)) {
                      spans.add(new Span(advanceLeft(tmptxt, last), advanceRight(tmptxt, last)));
                    }
                    last += tmptxt.length();
                    break;
                  }
                  default:
                    throw new IOException("Unexpected node: " + part.getNodeName());
                }
              }
              break;
            }
            case "#text":
            case "#comment":
              break;
            default:
              throw new IOException("Unexpected node: " + name);
          }
        }

        IrishSentenceBankFlex[] flexa = new IrishSentenceBankFlex[flexes];
        for (Entry<Integer, String> entry : toks.entrySet()) {
          final Integer flexidx = entry.getKey();
          final List<String> lemmas = flx.get(flexidx);
          if (lemmas == null) {
            // A token without any <flex> for its slot: the sentence carries no flex data.
            flexa = null;
            break;
          }
          // Slots are 1-based; anything lower would index before the start of the array.
          if (flexidx < 1) {
            throw new IOException("Invalid slot " + flexidx + " (must be >= 1) in sentence: " + src);
          }
          flexa[flexidx - 1] = new IrishSentenceBankFlex(entry.getValue(), lemmas.toArray(new String[0]));
        }

        Span[] spanout = spans.toArray(new Span[0]);
        document.add(new IrishSentenceBankSentence(src, trans, orig.toString(), spanout, flexa));
      } else if (!sentname.equals("#text") && !sentname.equals("#comment")) {
        throw new IOException("Unexpected node: " + sentname);
      }
    }
    return document;
  } catch (SAXException e) {
    throw new IOException("Failed to parse IrishSentenceBank document", e);
  }
}

/**
 * Returns the value of a mandatory attribute, reporting its absence as malformed input.
 *
 * @param node the node expected to carry the attribute
 * @param attribute the attribute name
 * @return the attribute value
 * @throws IOException if the node has no such attribute
 */
private static String requireAttribute(Node node, String attribute) throws IOException {
  // getAttributes() is null for non-element nodes; getNamedItem() is null when absent.
  Node item = node.getAttributes() == null ? null : node.getAttributes().getNamedItem(attribute);
  if (item == null) {
    throw new IOException("Missing attribute '" + attribute + "' on node: " + node.getNodeName());
  }
  return item.getNodeValue();
}

/**
 * Reads the mandatory {@code slot} attribute of a node as an integer.
 *
 * @param node the node carrying the {@code slot} attribute
 * @return the parsed slot number
 * @throws IOException if the attribute is missing or is not a valid integer
 */
private static int parseSlot(Node node) throws IOException {
  String raw = requireAttribute(node, "slot");
  try {
    return Integer.parseInt(raw);
  } catch (NumberFormatException e) {
    throw new IOException("Invalid slot '" + raw + "' on node: " + node.getNodeName(), e);
  }
}