private ArrayList readFile()

in opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java [89:184]


  /**
   * Parses the XML file at the given path into a list of sentences.
   *
   * <p>Every {@code ELEMENT_PARAGRAPH} element of the document is visited. Each
   * element child of a paragraph is treated as a sentence and is turned into a
   * {@link Sentence} identified by the paragraph number
   * ({@code ATTRIBUTE_PARAGRAPHNUM}) and the sentence number
   * ({@code ATTRIBUTE_SENTENCENUM}). Within a sentence, {@code ELEMENT_WORDFORM}
   * children become {@code Word.Type.WORD} entries and
   * {@code ELEMENT_PUNCTUATION} children become
   * {@code Word.Type.PUNCTUATIONMARK} entries; any other child is ignored. Both
   * kinds of entries share one running position counter that starts at 0 for
   * each sentence.
   *
   * <p>Error handling is best-effort: if anything goes wrong (unreadable file,
   * malformed XML, non-numeric paragraph or sentence number, ...), the stack
   * trace is printed and the sentences collected up to that point are returned.
   *
   * @param file path of the XML file to read
   * @return the sentences read from the file, in document order; never
   *         {@code null}, possibly empty or incomplete if reading failed
   */
  private ArrayList<Sentence> readFile(String file) {

    ArrayList<Sentence> result = new ArrayList<>();

    try {

      File xmlFile = new File(file);
      // NOTE(review): the parser is used with its default configuration (DTDs and
      // external entities are not explicitly disabled). Harden it if this reader
      // can ever be pointed at files from an untrusted source.
      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
      Document doc = dBuilder.parse(xmlFile);

      doc.getDocumentElement().normalize();

      NodeList paragraphs = doc.getElementsByTagName(ELEMENT_PARAGRAPH);

      for (int i = 0; i < paragraphs.getLength(); i++) {

        Node nParagraph = paragraphs.item(i);
        if (nParagraph.getNodeType() != Node.ELEMENT_NODE) {
          continue;
        }

        Element eParagraph = (Element) nParagraph;
        // THE PARAGRAPH ID
        int paragraphID = Integer.parseInt(eParagraph
            .getAttribute(ATTRIBUTE_PARAGRAPHNUM));

        NodeList nSentences = nParagraph.getChildNodes();

        // Start at index 0: the first child is only a whitespace text node when
        // the XML is pretty-printed. Non-element children are filtered out by the
        // node type check below, so no sentence is skipped in compact XML.
        for (int j = 0; j < nSentences.getLength(); j++) {

          Node nSentence = nSentences.item(j);
          if (nSentence.getNodeType() != Node.ELEMENT_NODE) {
            continue;
          }

          Element eSentence = (Element) nSentence;
          // THE SENTENCE ID
          int sentenceID = Integer.parseInt(eSentence
              .getAttribute(ATTRIBUTE_SENTENCENUM));
          Sentence isentence = new Sentence(paragraphID, sentenceID);

          NodeList nWords = nSentence.getChildNodes();

          // Position of the next word or punctuation mark within the sentence.
          int wnum = 0;
          for (int k = 0; k < nWords.getLength(); k++) {

            Node nWord = nWords.item(k);
            if (nWord.getNodeType() != Node.ELEMENT_NODE) {
              continue;
            }

            Element eWord = (Element) nWord;
            String nodeName = nWord.getNodeName();

            if (nodeName.equals(ELEMENT_WORDFORM)) {

              String word = eWord.getTextContent();
              String cmd = eWord.getAttribute(ATTRIBUTE_CMD);
              String pos = eWord.getAttribute(ATTRIBUTE_POS);

              Word iword;
              if ("done".equals(cmd)) {
                // the word is already disambiguated: keep its sense annotations
                String lemma = eWord.getAttribute(ATTRIBUTE_LEMMA);
                String wnsn = eWord.getAttribute(ATTRIBUTE_WNSN);
                String lexsn = eWord.getAttribute(ATTRIBUTE_LEXSN);

                iword = new Word(paragraphID, sentenceID, wnum,
                    Word.Type.WORD, word, cmd, pos, lemma, wnsn, lexsn);
              } else {
                // the word is not disambiguated
                iword = new Word(paragraphID, sentenceID, wnum,
                    Word.Type.WORD, word, cmd, pos);
              }
              isentence.addIword(iword);
              wnum++;

            } else if (nodeName.equals(ELEMENT_PUNCTUATION)) {

              String word = eWord.getTextContent();
              Word iword = new Word(paragraphID, sentenceID, wnum,
                  Word.Type.PUNCTUATIONMARK, word);
              isentence.addIword(iword);
              wnum++;
            }
          }

          result.add(isentence);
        }
      }
    } catch (Exception e) {
      // Deliberately best-effort: report the problem and return what was read so far.
      e.printStackTrace();
    }

    return result;
  }