public List&lt;WSDSample&gt; getSensevalData(String wordTag)

in opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java [217:308]


  /**
   * Extracts all Senseval training instances for the given target word from
   * the training document ({@code trainDoc}).
   * <p>
   * For every {@code <instance>} under the matching {@code <lexelt>}, the
   * {@code <context>} element is tokenized on whitespace, the head word is
   * lower-cased and its token position recorded, and the sentence is POS-tagged
   * and lemmatized to build a {@link WSDSample}.
   *
   * @param wordTag the lexical item identifier to match against the
   *                {@code item} attribute of {@code <lexelt>} elements
   *                (e.g. {@code "activate.v"})
   * @return a list of {@link WSDSample} instances for {@code wordTag}; empty
   *         if no matching {@code <lexelt>} exists. Never {@code null}.
   */
  public List<WSDSample> getSensevalData(String wordTag) {

    final Lemmatizer lemmatizer = WSDHelper.getLemmatizer();
    final POSTagger tagger = WSDHelper.getTagger();
    final List<WSDSample> setInstances = new ArrayList<>();
    final NodeList lexelts = trainDoc.getElementsByTagName("lexelt");

    for (int i = 0; i < lexelts.getLength(); i++) {

      Node nLexelt = lexelts.item(i);
      if (nLexelt.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }
      Element eLexelt = (Element) nLexelt;
      if (!eLexelt.getAttribute("item").equals(wordTag)) {
        continue;
      }

      NodeList nInstances = nLexelt.getChildNodes();
      for (int j = 1; j < nInstances.getLength(); j++) {

        Node nInstance = nInstances.item(j);
        if (nInstance.getNodeType() != Node.ELEMENT_NODE) {
          continue;
        }

        List<String> senseIDs = new ArrayList<>();
        String[] finalText = null;
        int index = 0;

        NodeList nChildren = nInstance.getChildNodes();
        for (int k = 1; k < nChildren.getLength(); k++) {
          Node nChild = nChildren.item(k);

          if (nChild.getNodeName().equals("answer")) {
            // Look the sense id up by attribute name rather than by position:
            // NamedNodeMap does not guarantee attribute order, so item(1) is
            // fragile against parser/serialization differences.
            Node senseAttr = nChild.getAttributes().getNamedItem("senseid");
            if (senseAttr != null) {
              senseIDs.add(senseAttr.getTextContent());
            }
          } else if (nChild.getNodeName().equals("context") && nChild.hasChildNodes()) {
            // Expected layout: text-before, <head>word</head>, text-after.
            NodeList children = nChild.getChildNodes();
            String textBefore = children.item(0).getTextContent();
            String rawWord = children.item(1).getTextContent();
            String textAfter = children.item(2).getTextContent();

            // Copy into mutable lists: Arrays.asList returns a fixed-size
            // view whose element removal throws UnsupportedOperationException.
            List<String> before = new ArrayList<>(Arrays.asList(textBefore.split("\\s")));
            List<String> after = new ArrayList<>(Arrays.asList(textAfter.split("\\s")));
            // Drop empty tokens produced by leading or consecutive whitespace
            // (split never yields null, so the old null-removal was a no-op).
            before.removeIf(String::isEmpty);
            after.removeIf(String::isEmpty);

            finalText = new String[before.size() + 1 + after.size()];
            int l = 0;
            for (String token : before) {
              finalText[l++] = token;
            }
            index = l; // position of the target word in finalText
            finalText[l++] = rawWord.toLowerCase();
            for (String token : after) {
              finalText[l++] = token;
            }
          }
        }

        // Skip malformed instances with no parsable <context>: previously
        // finalText stayed null here and tagger.tag(null) threw an NPE.
        if (finalText == null) {
          continue;
        }

        final String[] tags = tagger.tag(finalText);
        final String[] lemmas = lemmatizer.lemmatize(finalText, tags);

        setInstances.add(new WSDSample(finalText, tags, lemmas, index,
            senseIDs.toArray(new String[0])));
      }
    }

    return setInstances;
  }