protected void compareFiles()

in tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java [161:309]


    /**
     * Compares the extracts of one source file as produced by two runs ("A" and "B")
     * and writes the results through {@code writer}.
     * <p>
     * Steps, in order:
     * <ol>
     *   <li>Load the metadata list for each side via {@code extractReader}; a failed load
     *       leaves that side's list {@code null} and its exception type is remembered.</li>
     *   <li>Write one row to {@code COMPARISON_CONTAINERS} (container id, source path,
     *       source length, file extension, and both extract file lengths).</li>
     *   <li>Write any remembered load failure to the matching extract exception table.</li>
     *   <li>Return early when neither side could be loaded.</li>
     *   <li>For every entry in A: write its tag, profile, exception and content rows, look
     *       for a not-yet-handled counterpart in B via {@code getMatch}, write the B rows
     *       when one is found, and write a {@code CONTENT_COMPARISONS} row when the two
     *       sides together have more than 10 tokens.</li>
     *   <li>For every entry in B that was not matched above: write its B-side rows.</li>
     * </ol>
     *
     * @param fpsA file paths for the "A" extract
     * @param fpsB file paths for the "B" extract
     * @throws IOException propagated from the helper and writer calls made here; note that an
     *                     {@code IOException} raised while calculating or writing content
     *                     statistics is rethrown wrapped in a {@link RuntimeException}
     */
    protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOException {

        ExtractReaderException.TYPE extractExceptionA = null;
        ExtractReaderException.TYPE extractExceptionB = null;

        //a failed load is reported through the extract exception table below,
        //so it is handled the same (quiet) way for A and B
        List<Metadata> metadataListA = null;
        try {
            metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
        } catch (ExtractReaderException e) {
            extractExceptionA = e.getType();
        }

        List<Metadata> metadataListB = null;
        try {
            metadataListB = extractReader.loadExtract(fpsB.getExtractFile());
        } catch (ExtractReaderException e) {
            extractExceptionB = e.getType();
        }

        //array indices for those metadata items handled in B
        Set<Integer> handledB = new HashSet<>();
        String containerID = Integer.toString(ID.getAndIncrement());
        //container table
        Map<Cols, String> contData = new HashMap<>();
        contData.put(Cols.CONTAINER_ID, containerID);
        contData.put(Cols.FILE_PATH, fpsA
                .getRelativeSourceFilePath()
                .toString());
        //lengths that are not greater than NON_EXISTENT_FILE_LENGTH are written as empty strings
        long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
        contData.put(Cols.LENGTH, srcFileLength > NON_EXISTENT_FILE_LENGTH ? Long.toString(srcFileLength) : "");
        contData.put(Cols.FILE_EXTENSION, FilenameUtils.getExtension(fpsA
                .getRelativeSourceFilePath()
                .getFileName()
                .toString()));

        long extractFileLengthA = getFileLength(fpsA.getExtractFile());
        contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthA) : "");

        long extractFileLengthB = getFileLength(fpsB.getExtractFile());
        contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthB) : "");

        writer.writeRow(COMPARISON_CONTAINERS, contData);

        if (extractExceptionA != null) {
            writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA
                    .getRelativeSourceFilePath()
                    .toString(), extractExceptionA);
        }
        if (extractExceptionB != null) {
            writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB
                    .getRelativeSourceFilePath()
                    .toString(), extractExceptionB);
        }

        //nothing to compare if neither extract could be loaded
        if (metadataListA == null && metadataListB == null) {
            return;
        }
        List<Integer> numAttachmentsA = countAttachments(metadataListA);
        List<Integer> numAttachmentsB = countAttachments(metadataListB);

        String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB);
        //now get that metadata
        if (metadataListA != null) {
            for (int i = 0; i < metadataListA.size(); i++) {
                //the first file should have the same id as the container id
                String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
                Metadata metadataA = metadataListA.get(i);
                ContentTags contentTagsA = getContent(fpsA, metadataA);
                ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
                Metadata metadataB = null;

                //TODO: shouldn't be fileA!!!!
                writeTagData(fileId, contentTagsA, TAGS_TABLE_A);

                writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
                writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
                int matchIndex = getMatch(i, sharedDigestKey, handledB, metadataListA, metadataListB);

                //claim the B entry so that it is paired with at most one A entry
                if (matchIndex > -1 && !handledB.contains(matchIndex)) {
                    metadataB = metadataListB.get(matchIndex);
                    handledB.add(matchIndex);
                }
                if (metadataB != null) {
                    contentTagsB = getContent(fpsB, metadataB);
                    writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
                    writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
                    writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
                }
                writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
                //write content
                //the stats are scoped to this iteration so that values from an
                //earlier entry can never leak into a later comparison
                Map<Class, Object> tokenStatsA;
                Map<Class, Object> tokenStatsB;
                try {
                    tokenStatsA = calcTextStats(contentTagsA);
                    writeContentData(fileId, tokenStatsA, CONTENTS_TABLE_A);
                    tokenStatsB = calcTextStats(contentTagsB);
                    if (metadataB != null) {
                        writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
                if (metadataB != null) {
                    TokenCounts tokenCountsA = (TokenCounts) tokenStatsA.get(BasicTokenCountStatsCalculator.class);
                    TokenCounts tokenCountsB = (TokenCounts) tokenStatsB.get(BasicTokenCountStatsCalculator.class);
                    //arbitrary decision...only run the comparisons if there are > 10 tokens total
                    //We may want to bump that value a bit higher?
                    //now run comparisons
                    if (tokenCountsA.getTotalTokens() + tokenCountsB.getTotalTokens() > 10) {
                        Map<Cols, String> data = new HashMap<>();
                        data.put(Cols.ID, fileId);

                        ContrastStatistics contrastStatistics = tokenContraster.calculateContrastStatistics(tokenCountsA, tokenCountsB);

                        writeContrasts(data, contrastStatistics);
                        writer.writeRow(CONTENT_COMPARISONS, data);
                    }
                }
            }
        }
        //now try to get any Metadata objects in B
        //that haven't yet been handled.
        if (metadataListB != null) {
            for (int i = 0; i < metadataListB.size(); i++) {
                if (handledB.contains(i)) {
                    continue;
                }
                Metadata metadataB = metadataListB.get(i);
                ContentTags contentTagsB = getContent(fpsB, metadataB);
                //the first file should have the same id as the container id
                String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
                writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
                writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
                writeEmbeddedFilePathData(i, fileId, null, metadataB);
                writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);

                //write content
                try {
                    Map<Class, Object> tokenStatsB = calcTextStats(contentTagsB);
                    writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        }
    }