in tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java [93:190]
/**
 * Reads an extract file from disk and converts it into a list of {@link Metadata} objects.
 * <p>
 * The file name's suffixes (via {@code parseSuffixes}) determine the extract format and the
 * optional compression wrapper. JSON extracts are parsed with {@code JsonMetadataList};
 * every other recognized format is handed to {@code generateListFromTextFile}. For JSON
 * extracts the resulting list may then be reduced according to {@code alterMetadataList}:
 * either only the first entry is kept, or the content of all entries is concatenated into
 * the first entry and the remaining entries are dropped.
 *
 * @param extractFile path of the extract file to load
 * @return the metadata list read from the file, or {@code null} if the file uses a
 *         compression suffix this method does not handle
 * @throws ExtractReaderException if the file is missing or not a regular file, has an
 *         unrecognized suffix, is zero bytes long, falls outside the configured
 *         min/max length limits, or cannot be read
 */
public List<Metadata> loadExtract(Path extractFile) throws ExtractReaderException {
    if (extractFile == null || !Files.isRegularFile(extractFile)) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
    }
    FileSuffixes fileSuffixes = parseSuffixes(extractFile
            .getFileName()
            .toString());
    if (fileSuffixes.format == null) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
    }

    long length;
    try {
        length = Files.size(extractFile);
    } catch (IOException e) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION, e);
    }
    if (length == 0L) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE);
    }
    // The length limits are only enforced when they are set above IGNORE_LENGTH.
    if (minExtractLength > IGNORE_LENGTH && length < minExtractLength) {
        LOG.info("minExtractLength {} > IGNORE_LENGTH {} and length {} < minExtractLength {} for file '{}'",
                minExtractLength, IGNORE_LENGTH, length, minExtractLength, extractFile);
        throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_SHORT);
    }
    if (maxExtractLength > IGNORE_LENGTH && length > maxExtractLength) {
        LOG.info("maxExtractLength {} > IGNORE_LENGTH {} and length {} > maxExtractLength {} for file '{}'",
                maxExtractLength, IGNORE_LENGTH, length, maxExtractLength, extractFile);
        throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_LONG);
    }

    List<Metadata> metadataList = null;
    Reader reader = null;
    InputStream is = null;
    // A single try/finally covers opening, decompressing and parsing so that the stream is
    // released on every exit path, including the early return for unsupported compression
    // and a failure inside a decompressor constructor.
    try {
        is = Files.newInputStream(extractFile);
        if (fileSuffixes.compression != null) {
            switch (fileSuffixes.compression) {
                case "bz2":
                    is = new BZip2CompressorInputStream(is);
                    break;
                case "gz":
                case "gzip":
                    is = new GzipCompressorInputStream(is);
                    break;
                case "zip":
                    // NOTE(review): ZCompressorInputStream looks like the Unix compress (.Z)
                    // decoder rather than a ZIP archive reader -- confirm that "zip" is the
                    // intended suffix for this branch.
                    is = new ZCompressorInputStream(is);
                    break;
                default:
                    LOG.warn("Can't yet process compression of type: {}", fileSuffixes.compression);
                    return metadataList;
            }
        }
        reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));

        if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) {
            metadataList = JsonMetadataList.fromJson(reader);
            if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
                // Keep only the first entry.
                metadataList.subList(1, metadataList.size()).clear();
            } else if (alterMetadataList.equals(ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST)
                    && metadataList.size() > 1) {
                // Join the content of every entry (first one included), separated by spaces,
                // store it on the first entry, then drop the others.
                StringBuilder sb = new StringBuilder();
                Metadata containerMetadata = metadataList.get(0);
                for (Metadata m : metadataList) {
                    String c = m.get(TikaCoreProperties.TIKA_CONTENT);
                    if (c != null) {
                        sb.append(c);
                        sb.append(" ");
                    }
                }
                containerMetadata.set(TikaCoreProperties.TIKA_CONTENT, sb.toString());
                metadataList.subList(1, metadataList.size()).clear();
            }
        } else {
            metadataList = generateListFromTextFile(reader, fileSuffixes);
        }
    } catch (IOException e) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION, e);
    } finally {
        IOUtils.closeQuietly(reader);
        IOUtils.closeQuietly(is);
    }
    return metadataList;
}