in opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java [171:291]
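  /**
   * Extracts all annotated instances of the given target word from the
   * Senseval data file and converts each one into a {@link WSDSample}.
   *
   * @param wordTag the lexical item to look up, as given in the
   *                {@code item} attribute of a {@code <lexelt>} element.
   * @return the list of extracted samples; empty if the word is not found.
   */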
  public ArrayList<WSDSample> getSensevalData(String wordTag) {
    ArrayList<WSDSample> setInstances = new ArrayList<>();
    final InputStream resource;
    try {
      if (data.endsWith(".train.gz")) {
        resource = new GZIPInputStream(new FileInputStream(data));
      } else {
        resource = new FileInputStream(data);
      }
    } catch (IOException e) {
      throw new RuntimeException("Error opening or loading Senseval data from the specified resource file!", e);
    }
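    // Parse the complete XML file into an in-memory DOM tree.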
    try (InputStream xmlFileInputStream = new BufferedInputStream(resource)) {
      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
      Document doc = dBuilder.parse(xmlFileInputStream);
      doc.getDocumentElement().normalize();
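      // Each <lexelt> element groups all annotated instances of one lexical item.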
      NodeList lexelts = doc.getElementsByTagName("lexelt");
      for (int i = 0; i < lexelts.getLength(); i++) {
        Node nLexelt = lexelts.item(i);
        if (nLexelt.getNodeType() == Node.ELEMENT_NODE) {
          Element eLexelt = (Element) nLexelt;
          if (eLexelt.getAttribute("item").equals(wordTag)) {
            NodeList nInstances = nLexelt.getChildNodes();
            // The ELEMENT_NODE check below skips the whitespace text nodes
            // between <instance> elements.
            for (int j = 0; j < nInstances.getLength(); j++) {
              Node nInstance = nInstances.item(j);
              if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
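                // An <instance> carries one or more <answer> elements (the
                // gold sense ids) and a <context> element with the text.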
                ArrayList<String> senseIDs = new ArrayList<>();
                String rawWord;
                String[] finalText = null;
                int index = 0;
                NodeList nChildren = nInstance.getChildNodes();
                for (int k = 0; k < nChildren.getLength(); k++) {
                  Node nChild = nChildren.item(k);
                  if (nChild.getNodeName().equals("answer")) {
                    // Look the sense id up by attribute name; relying on the
                    // positional order of getAttributes() is not portable.
                    String senseid = ((Element) nChild).getAttribute("senseid");
                    senseIDs.add(senseid);
                  }
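                  // <context> holds three children: the text before the
                  // target word, the target (<head>) itself, and the text after.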
                  if (nChild.getNodeName().equals("context")) {
                    if (nChild.hasChildNodes()) {
                      String textBefore = nChild.getChildNodes().item(0).getTextContent();
                      rawWord = nChild.getChildNodes().item(1).getTextContent();
                      String textAfter = nChild.getChildNodes().item(2).getTextContent();
                      // Arrays.asList returns a fixed-size list, so wrap the
                      // tokens in mutable lists before filtering empty entries.
                      List<String> textBeforeTokenized =
                          new ArrayList<>(Arrays.asList(textBefore.split("\\s")));
                      List<String> textAfterTokenized =
                          new ArrayList<>(Arrays.asList(textAfter.split("\\s")));
                      textBeforeTokenized.removeAll(Collections.singleton(null));
                      textBeforeTokenized.removeAll(Collections.singleton(""));
                      textAfterTokenized.removeAll(Collections.singleton(null));
                      textAfterTokenized.removeAll(Collections.singleton(""));
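                      // Re-assemble the token array around the (lower-cased)
                      // target word; 'index' records the target's position.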
                      finalText = new String[textBeforeTokenized.size() + 1
                          + textAfterTokenized.size()];
                      int l = 0;
                      for (String tempWord : textBeforeTokenized) {
                        finalText[l] = tempWord;
                        l++;
                      }
                      index = l;
                      finalText[l] = rawWord.toLowerCase();
                      l++;
                      for (String tempWord : textAfterTokenized) {
                        finalText[l] = tempWord;
                        l++;
                      }
                    }
                  }
                }
                if (finalText == null) {
                  // No <context> element was found; skip this instance instead
                  // of failing with a NullPointerException below.
                  continue;
                }
                final Lemmatizer lemmatizer = WSDHelper.getLemmatizer();
                final POSTagger tagger = WSDHelper.getTagger();
                final String[] words = finalText;
                final String[] tags = tagger.tag(finalText);
                String[] lemmas = lemmatizer.lemmatize(words, tags);
                WSDSample wtd = new WSDSample(words, tags, lemmas, index,
                    senseIDs.toArray(new String[0]));
                setInstances.add(wtd);
              }
            }
          }
        }
      }
    } catch (Exception e) {
      // Propagate parsing failures instead of silently returning partial data.
      throw new RuntimeException("Error parsing Senseval data from the specified resource file!", e);
    }
    return setInstances;
  }