in opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java [89:184]
/**
 * Parses the XML file at the given path into a list of {@link Sentence}
 * objects.
 *
 * <p>Every {@code ELEMENT_PARAGRAPH} element in the document is visited. Each
 * element child of a paragraph is treated as a sentence, and each
 * {@code ELEMENT_WORDFORM} or {@code ELEMENT_PUNCTUATION} child of a sentence
 * becomes a {@link Word}. Words are numbered from 0 within their sentence, in
 * document order; other child elements and non-element nodes are ignored and
 * do not consume a word number.
 *
 * <p>Any exception raised while reading or parsing (I/O failure, malformed
 * XML, a paragraph/sentence number attribute that is not an integer, ...) is
 * printed to standard error and parsing stops. The sentences that were
 * completed before the failure are still returned.
 *
 * @param file path of the XML file to read
 * @return the sentences read so far; empty if nothing could be parsed, never
 *         {@code null}
 */
private ArrayList<Sentence> readFile(String file) {
  ArrayList<Sentence> result = new ArrayList<>();
  try {
    File xmlFile = new File(file);
    // NOTE(review): the factory is used with its default configuration, so
    // DTDs and external entities are processed. If this reader can ever be
    // pointed at untrusted files, harden the factory against XXE; confirm
    // first that the corpus files do not depend on a DOCTYPE declaration.
    DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
    DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
    Document doc = dBuilder.parse(xmlFile);
    doc.getDocumentElement().normalize();

    NodeList paragraphs = doc.getElementsByTagName(ELEMENT_PARAGRAPH);
    for (int i = 0; i < paragraphs.getLength(); i++) {
      Node nParagraph = paragraphs.item(i);
      if (nParagraph.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }
      Element eParagraph = (Element) nParagraph;
      // THE PARAGRAPH ID
      int paragraphID = Integer.parseInt(
          eParagraph.getAttribute(ATTRIBUTE_PARAGRAPHNUM));

      NodeList nSentences = nParagraph.getChildNodes();
      // Start at index 0: the first child is only a whitespace text node when
      // the file happens to be pretty-printed. Starting at 1 would drop the
      // first sentence of a paragraph whenever it directly follows the
      // opening tag. Non-element children are filtered out just below.
      for (int j = 0; j < nSentences.getLength(); j++) {
        Node nSentence = nSentences.item(j);
        if (nSentence.getNodeType() != Node.ELEMENT_NODE) {
          continue;
        }
        Element eSentence = (Element) nSentence;
        // THE SENTENCE ID
        int sentenceID = Integer.parseInt(
            eSentence.getAttribute(ATTRIBUTE_SENTENCENUM));
        Sentence isentence = new Sentence(paragraphID, sentenceID);

        NodeList nWords = nSentence.getChildNodes();
        // Position of the next word or punctuation mark inside the sentence.
        int wnum = 0;
        for (int k = 0; k < nWords.getLength(); k++) {
          Node nWord = nWords.item(k);
          if (nWord.getNodeType() != Node.ELEMENT_NODE) {
            continue;
          }
          Element eWord = (Element) nWord;
          String nodeName = nWord.getNodeName();
          Word iword;
          if (nodeName.equals(ELEMENT_WORDFORM)) {
            String word = eWord.getTextContent();
            String cmd = eWord.getAttribute(ATTRIBUTE_CMD);
            String pos = eWord.getAttribute(ATTRIBUTE_POS);
            if ("done".equals(cmd)) {
              // the word is already disambiguated: keep its sense annotations
              String lemma = eWord.getAttribute(ATTRIBUTE_LEMMA);
              String wnsn = eWord.getAttribute(ATTRIBUTE_WNSN);
              String lexsn = eWord.getAttribute(ATTRIBUTE_LEXSN);
              iword = new Word(paragraphID, sentenceID, wnum,
                  Word.Type.WORD, word, cmd, pos, lemma, wnsn, lexsn);
            } else {
              // the word is not disambiguated
              iword = new Word(paragraphID, sentenceID, wnum,
                  Word.Type.WORD, word, cmd, pos);
            }
          } else if (nodeName.equals(ELEMENT_PUNCTUATION)) {
            iword = new Word(paragraphID, sentenceID, wnum,
                Word.Type.PUNCTUATIONMARK, eWord.getTextContent());
          } else {
            // Unknown child element: skipped without consuming a word number.
            continue;
          }
          isentence.addIword(iword);
          wnum++;
        }
        result.add(isentence);
      }
    }
  } catch (Exception e) {
    // Best effort: report the problem and return whatever was parsed so far.
    e.printStackTrace();
  }
  return result;
}