public List loadExtract()

in tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java [93:190]


    public List<Metadata> loadExtract(Path extractFile) throws ExtractReaderException {

        List<Metadata> metadataList = null;
        if (extractFile == null || !Files.isRegularFile(extractFile)) {
            throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
        }

        FileSuffixes fileSuffixes = parseSuffixes(extractFile
                .getFileName()
                .toString());
        if (fileSuffixes.format == null) {
            throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
        }
        if (!Files.isRegularFile(extractFile)) {
            throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
        }

        long length = -1L;
        try {
            length = Files.size(extractFile);
        } catch (IOException e) {
            throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION, e);
        }

        if (length == 0L) {
            throw new ExtractReaderException(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE);
        }

        if (minExtractLength > IGNORE_LENGTH && length < minExtractLength) {
            LOG.info("minExtractLength {} > IGNORE_LENGTH {} and length {} < minExtractLength {} for file '{}'", 
                    minExtractLength, IGNORE_LENGTH, length, minExtractLength, extractFile);
            throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_SHORT);
        }
        if (maxExtractLength > IGNORE_LENGTH && length > maxExtractLength) {
            LOG.info("maxExtractLength {} > IGNORE_LENGTH {} and length {} > maxExtractLength {} for file '{}'", 
                    maxExtractLength, IGNORE_LENGTH, length, maxExtractLength, extractFile);
            throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_LONG);
        }

        Reader reader = null;
        InputStream is = null;
        try {
            is = Files.newInputStream(extractFile);
            if (fileSuffixes.compression != null) {
                switch (fileSuffixes.compression) {
                    case "bz2":
                        is = new BZip2CompressorInputStream(is);
                        break;
                    case "gz":
                    case "gzip":
                        is = new GzipCompressorInputStream(is);
                        break;
                    case "zip":
                        is = new ZCompressorInputStream(is);
                        break;
                    default:
                        LOG.warn("Can't yet process compression of type: {}", fileSuffixes.compression);
                        return metadataList;
                }
            }
            reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
        } catch (IOException e) {
            throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION, e);
        }

        try {
            if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) {
                metadataList = JsonMetadataList.fromJson(reader);
                if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
                    while (metadataList.size() > 1) {
                        metadataList.remove(metadataList.size() - 1);
                    }
                } else if (alterMetadataList.equals(ALTER_METADATA_LIST.AS_IS.CONCATENATE_CONTENT_INTO_FIRST) && metadataList.size() > 1) {
                    StringBuilder sb = new StringBuilder();
                    Metadata containerMetadata = metadataList.get(0);
                    for (Metadata m : metadataList) {
                        String c = m.get(TikaCoreProperties.TIKA_CONTENT);
                        if (c != null) {
                            sb.append(c);
                            sb.append(" ");
                        }
                    }
                    containerMetadata.set(TikaCoreProperties.TIKA_CONTENT, sb.toString());
                    while (metadataList.size() > 1) {
                        metadataList.remove(metadataList.size() - 1);
                    }
                }
            } else {
                metadataList = generateListFromTextFile(reader, fileSuffixes);
            }
        } catch (IOException e) {
            throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION, e);
        } finally {
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(is);
        }
        return metadataList;
    }