public MascDocumentStream()

in opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocumentStream.java [107:162]


  public MascDocumentStream(File mascCorpusDirectory,
                            boolean searchRecursive, FileFilter fileFilter) throws IOException {

    saxParser = XmlUtil.createSaxParser();

    if (!mascCorpusDirectory.isDirectory()) {
      throw new IOException("Input corpus directory must be a directory " +
          "according to File.isDirectory()!");
    }

    int failedLoads = 0;
    Stack<File> directoryStack = new Stack<>();
    directoryStack.add(mascCorpusDirectory);

    while (!directoryStack.isEmpty()) {
      for (File file : directoryStack.pop().listFiles(fileFilter)) {
        if (file.isFile()) {
          String hdrFilePath = file.getAbsolutePath();

          // look for the header files
          if (hdrFilePath.endsWith(".hdr")) {

            HashMap<String, File> fileGroup = checkAnnotations(hdrFilePath);
            InputStream f_primary = new BufferedInputStream(
                new FileInputStream(fileGroup.get("f.text")));
            InputStream f_seg = (fileGroup.containsKey("f.seg")) ?
                new BufferedInputStream(new FileInputStream(fileGroup.get("f.seg"))) : null;
            InputStream f_penn = (fileGroup.containsKey("f.penn")) ?
                new BufferedInputStream(new FileInputStream(fileGroup.get("f.penn"))) : null;
            InputStream f_s = (fileGroup.containsKey("f.s")) ?
                new BufferedInputStream(new FileInputStream(fileGroup.get("f.s"))) : null;
            InputStream f_ne = (fileGroup.containsKey("f.ne")) ?
                new BufferedInputStream(new FileInputStream(fileGroup.get("f.ne"))) : null;

            try {
              documents.add(MascDocument.parseDocument(hdrFilePath, f_primary, f_seg,
                  f_penn, f_s, f_ne));
            } catch (IOException e) {
              logger.error("Failed to parse the file: {}", hdrFilePath, e);
              failedLoads++;
            }
          }

        } else if (searchRecursive && file.isDirectory()) {
          directoryStack.push(file);
        }
      }
    }

    logger.info("Documents loaded: {}", documents.size());
    if (failedLoads > 0) {
      logger.info("Failed loading {} documents.", failedLoads);
    }
    reset();

  }