public ArrayList getSensevalData()

in opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java [171:291]


  /**
   * Parses the Senseval XML resource referenced by {@code data} and builds one
   * {@link WSDSample} for every element found below each {@code lexelt} whose
   * {@code item} attribute equals {@code wordTag}.
   * <p>
   * The resource is read through a {@link GZIPInputStream} when its name ends
   * with {@code .train.gz}, otherwise it is read as a plain file.
   * <p>
   * Failures that occur while parsing or processing the document are not
   * propagated: the stack trace is printed and the samples collected so far
   * are returned. Instances without a usable {@code context} are skipped.
   *
   * @param wordTag the value of the {@code item} attribute of the
   *                {@code lexelt} elements to read.
   * @return the samples that were read; never {@code null}, possibly empty.
   * @throws RuntimeException if the resource cannot be opened.
   */
  public ArrayList<WSDSample> getSensevalData(String wordTag) {

    ArrayList<WSDSample> setInstances = new ArrayList<>();

    // Opening happens outside the parsing try-block on purpose: an unreadable
    // resource is reported to the caller, whereas parse errors are only logged.
    final InputStream resource;
    try {
      resource = openSensevalResource();
    } catch (IOException e) {
      throw new RuntimeException("Error opening or loading Senseval data from specified resource file!", e);
    }

    try (InputStream xmlFileInputStream = new BufferedInputStream(resource)) {
      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
      Document doc = dBuilder.parse(xmlFileInputStream);

      doc.getDocumentElement().normalize();

      NodeList lexelts = doc.getElementsByTagName("lexelt");

      for (int i = 0; i < lexelts.getLength(); i++) {
        Node nLexelt = lexelts.item(i);

        if (nLexelt.getNodeType() != Node.ELEMENT_NODE) {
          continue;
        }
        if (!((Element) nLexelt).getAttribute("item").equals(wordTag)) {
          continue;
        }

        NodeList nInstances = nLexelt.getChildNodes();

        // Start at 0: the first child is only a whitespace text node when the
        // XML is indented; the node type check below filters it out anyway.
        for (int j = 0; j < nInstances.getLength(); j++) {
          Node nInstance = nInstances.item(j);

          if (nInstance.getNodeType() != Node.ELEMENT_NODE) {
            continue;
          }

          WSDSample sample = readSensevalInstance(nInstance);
          if (sample != null) {
            setInstances.add(sample);
          }
        }
      }

    } catch (Exception e) {
      // Best effort, as before: report the problem and return what was read.
      e.printStackTrace();
    }

    return setInstances;

  }

  /**
   * Opens the file referenced by {@code data}, wrapping it in a
   * {@link GZIPInputStream} when the name ends with {@code .train.gz}.
   * The underlying file stream is closed if the GZIP wrapper cannot be created.
   */
  private InputStream openSensevalResource() throws IOException {
    final boolean gzipped = data.endsWith(".train.gz");
    final InputStream fileStream = new FileInputStream(data);
    if (!gzipped) {
      return fileStream;
    }
    try {
      return new GZIPInputStream(fileStream);
    } catch (IOException e) {
      // The GZIP header could not be read: do not leak the file handle.
      try {
        fileStream.close();
      } catch (IOException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }
  }

  /**
   * Builds a {@link WSDSample} from the {@code answer} and {@code context}
   * children of one instance node.
   *
   * @return the sample, or {@code null} if no usable {@code context} was found.
   */
  private static WSDSample readSensevalInstance(Node nInstance) {
    ArrayList<String> senseIDs = new ArrayList<>();
    String[] finalText = null;
    int index = 0;

    NodeList nChildren = nInstance.getChildNodes();

    // Start at 0 so that a leading <answer>/<context> is not lost when the
    // XML contains no whitespace before it; text nodes never match the names.
    for (int k = 0; k < nChildren.getLength(); k++) {
      Node nChild = nChildren.item(k);
      String childName = nChild.getNodeName();

      if (childName.equals("answer")) {
        String senseid = readSenseId(nChild);
        if (senseid != null) {
          senseIDs.add(senseid);
        }
      } else if (childName.equals("context")) {
        NodeList parts = nChild.getChildNodes();

        // The context is read positionally: text before, target word, text after.
        // NOTE(review): contexts with a different child layout are skipped here.
        if (parts.getLength() >= 3) {
          List<String> textBeforeTokenized = splitOnWhitespace(parts.item(0).getTextContent());
          String rawWord = parts.item(1).getTextContent();
          List<String> textAfterTokenized = splitOnWhitespace(parts.item(2).getTextContent());

          List<String> tokens = new ArrayList<>(
              textBeforeTokenized.size() + 1 + textAfterTokenized.size());
          tokens.addAll(textBeforeTokenized);
          // Position of the target word inside the token array.
          index = tokens.size();
          // Locale.ROOT keeps the lower-casing independent of the default locale.
          tokens.add(rawWord.toLowerCase(java.util.Locale.ROOT));
          tokens.addAll(textAfterTokenized);

          finalText = tokens.toArray(new String[0]);
        }
      }
    }

    if (finalText == null) {
      // Nothing to tag or lemmatize; passing null on would abort the whole read.
      return null;
    }

    final Lemmatizer lemmatizer = WSDHelper.getLemmatizer();
    final POSTagger tagger = WSDHelper.getTagger();

    final String[] words = finalText;
    final String[] tags = tagger.tag(words);
    final String[] lemmas = lemmatizer.lemmatize(words, tags);

    return new WSDSample(words, tags, lemmas, index, senseIDs.toArray(new String[0]));
  }

  /**
   * Returns the sense id of an {@code answer} node. The {@code senseid}
   * attribute is looked up by name because the order of a DOM attribute map
   * is not guaranteed; the second attribute is used as a fallback to stay
   * compatible with the former positional lookup.
   *
   * @return the sense id, or {@code null} if none can be determined.
   */
  private static String readSenseId(Node nAnswer) {
    if (nAnswer instanceof Element && ((Element) nAnswer).hasAttribute("senseid")) {
      return ((Element) nAnswer).getAttribute("senseid");
    }
    if (nAnswer.getAttributes() != null && nAnswer.getAttributes().getLength() > 1) {
      return nAnswer.getAttributes().item(1).getTextContent();
    }
    return null;
  }

  /**
   * Splits {@code text} on single whitespace characters and drops the empty
   * tokens produced by leading or consecutive whitespace.
   *
   * @return a mutable list of the non-empty tokens.
   */
  private static List<String> splitOnWhitespace(String text) {
    List<String> tokens = new ArrayList<>();
    for (String token : text.split("\\s")) {
      if (!token.isEmpty()) {
        tokens.add(token);
      }
    }
    return tokens;
  }