in opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocumentStream.java [107:162]
public MascDocumentStream(File mascCorpusDirectory,
boolean searchRecursive, FileFilter fileFilter) throws IOException {
saxParser = XmlUtil.createSaxParser();
if (!mascCorpusDirectory.isDirectory()) {
throw new IOException("Input corpus directory must be a directory " +
"according to File.isDirectory()!");
}
int failedLoads = 0;
Stack<File> directoryStack = new Stack<>();
directoryStack.add(mascCorpusDirectory);
while (!directoryStack.isEmpty()) {
for (File file : directoryStack.pop().listFiles(fileFilter)) {
if (file.isFile()) {
String hdrFilePath = file.getAbsolutePath();
// look for the header files
if (hdrFilePath.endsWith(".hdr")) {
HashMap<String, File> fileGroup = checkAnnotations(hdrFilePath);
InputStream f_primary = new BufferedInputStream(
new FileInputStream(fileGroup.get("f.text")));
InputStream f_seg = (fileGroup.containsKey("f.seg")) ?
new BufferedInputStream(new FileInputStream(fileGroup.get("f.seg"))) : null;
InputStream f_penn = (fileGroup.containsKey("f.penn")) ?
new BufferedInputStream(new FileInputStream(fileGroup.get("f.penn"))) : null;
InputStream f_s = (fileGroup.containsKey("f.s")) ?
new BufferedInputStream(new FileInputStream(fileGroup.get("f.s"))) : null;
InputStream f_ne = (fileGroup.containsKey("f.ne")) ?
new BufferedInputStream(new FileInputStream(fileGroup.get("f.ne"))) : null;
try {
documents.add(MascDocument.parseDocument(hdrFilePath, f_primary, f_seg,
f_penn, f_s, f_ne));
} catch (IOException e) {
logger.error("Failed to parse the file: {}", hdrFilePath, e);
failedLoads++;
}
}
} else if (searchRecursive && file.isDirectory()) {
directoryStack.push(file);
}
}
}
logger.info("Documents loaded: {}", documents.size());
if (failedLoads > 0) {
logger.info("Failed loading {} documents.", failedLoads);
}
reset();
}