in tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java [161:309]
/**
 * Compares the extracts of one source file as loaded from two locations
 * ("A" and "B") and writes the results through {@code writer}.
 * <p>
 * Steps, in order:
 * <ol>
 *     <li>Load both extracts with {@code extractReader}; a failed load is
 *     remembered as an {@link ExtractReaderException.TYPE} and leaves the
 *     corresponding metadata list {@code null}.</li>
 *     <li>Write one row to {@code COMPARISON_CONTAINERS} and, for each side
 *     that failed to load, one extract-exception row.</li>
 *     <li>If neither extract could be loaded, stop.</li>
 *     <li>For every metadata item in A: write its tag, profile, exception,
 *     embedded-path and content rows; if {@code getMatch} yields an item in B
 *     that has not been handled yet, write the same rows for B under the same
 *     file id and, when both sides together have more than 10 tokens, a
 *     {@code CONTENT_COMPARISONS} row.</li>
 *     <li>For every metadata item in B that was not matched above, write its
 *     rows on their own.</li>
 * </ol>
 *
 * @param fpsA paths for the A side of the comparison
 * @param fpsB paths for the B side of the comparison
 * @throws IOException propagated from the helper and writer calls that are not
 *                     wrapped below
 * @throws RuntimeException wrapping any {@link IOException} raised while
 *                          calculating or writing the content statistics
 */
protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOException {
    ExtractReaderException.TYPE extractExceptionA = null;
    ExtractReaderException.TYPE extractExceptionB = null;

    List<Metadata> metadataListA = null;
    try {
        metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
    } catch (ExtractReaderException e) {
        //handled the same way as B: the type is written to
        //EXTRACT_EXCEPTION_TABLE_A below, so nothing is dumped to stderr here
        extractExceptionA = e.getType();
    }

    List<Metadata> metadataListB = null;
    try {
        metadataListB = extractReader.loadExtract(fpsB.getExtractFile());
    } catch (ExtractReaderException e) {
        extractExceptionB = e.getType();
    }

    //array indices for those metadata items handled in B
    Set<Integer> handledB = new HashSet<>();
    String containerID = Integer.toString(ID.getAndIncrement());

    //container table
    Map<Cols, String> contData = new HashMap<>();
    contData.put(Cols.CONTAINER_ID, containerID);
    contData.put(Cols.FILE_PATH, fpsA
            .getRelativeSourceFilePath()
            .toString());
    //lengths that are not greater than NON_EXISTENT_FILE_LENGTH are written as ""
    long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
    contData.put(Cols.LENGTH, srcFileLength > NON_EXISTENT_FILE_LENGTH ? Long.toString(srcFileLength) : "");
    contData.put(Cols.FILE_EXTENSION, FilenameUtils.getExtension(fpsA
            .getRelativeSourceFilePath()
            .getFileName()
            .toString()));

    long extractFileLengthA = getFileLength(fpsA.getExtractFile());
    contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthA) : "");

    long extractFileLengthB = getFileLength(fpsB.getExtractFile());
    contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthB) : "");

    writer.writeRow(COMPARISON_CONTAINERS, contData);

    if (extractExceptionA != null) {
        writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA
                .getRelativeSourceFilePath()
                .toString(), extractExceptionA);
    }
    if (extractExceptionB != null) {
        writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB
                .getRelativeSourceFilePath()
                .toString(), extractExceptionB);
    }

    //nothing to profile or compare if neither extract could be loaded
    if (metadataListA == null && metadataListB == null) {
        return;
    }

    //either list may still be null at this point
    List<Integer> numAttachmentsA = countAttachments(metadataListA);
    List<Integer> numAttachmentsB = countAttachments(metadataListB);

    String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB);

    //now get that metadata
    if (metadataListA != null) {
        for (int i = 0; i < metadataListA.size(); i++) {
            //the first file should have the same id as the container id
            String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
            Metadata metadataA = metadataListA.get(i);
            ContentTags contentTagsA = getContent(fpsA, metadataA);
            ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
            Metadata metadataB = null;

            //TODO: shouldn't be fileA!!!!
            writeTagData(fileId, contentTagsA, TAGS_TABLE_A);

            writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
            writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);

            //pair this item with an item in B, using each B item at most once
            int matchIndex = getMatch(i, sharedDigestKey, handledB, metadataListA, metadataListB);
            if (matchIndex > -1 && !handledB.contains(matchIndex)) {
                metadataB = metadataListB.get(matchIndex);
                handledB.add(matchIndex);
            }

            if (metadataB != null) {
                contentTagsB = getContent(fpsB, metadataB);
                writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
                writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
                writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
            }
            writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);

            //write content
            Map<Class, Object> tokenStatsA;
            Map<Class, Object> tokenStatsB;
            try {
                tokenStatsA = calcTextStats(contentTagsA);
                writeContentData(fileId, tokenStatsA, CONTENTS_TABLE_A);
                //stats for B are calculated even without a match
                //(on EMPTY_CONTENT_TAGS), but only written when there is one
                tokenStatsB = calcTextStats(contentTagsB);
                if (metadataB != null) {
                    writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }

            if (metadataB != null) {
                TokenCounts tokenCountsA = (TokenCounts) tokenStatsA.get(BasicTokenCountStatsCalculator.class);
                TokenCounts tokenCountsB = (TokenCounts) tokenStatsB.get(BasicTokenCountStatsCalculator.class);
                //arbitrary decision...only run the comparisons if there are > 10 tokens total
                //We may want to bump that value a bit higher?
                //now run comparisons
                if (tokenCountsA.getTotalTokens() + tokenCountsB.getTotalTokens() > 10) {
                    Map<Cols, String> data = new HashMap<>();
                    data.put(Cols.ID, fileId);

                    ContrastStatistics contrastStatistics = tokenContraster.calculateContrastStatistics(tokenCountsA, tokenCountsB);

                    writeContrasts(data, contrastStatistics);
                    writer.writeRow(CONTENT_COMPARISONS, data);
                }
            }
        }
    }

    //now try to get any Metadata objects in B
    //that haven't yet been handled.
    if (metadataListB != null) {
        for (int i = 0; i < metadataListB.size(); i++) {
            if (handledB.contains(i)) {
                continue;
            }
            Metadata metadataB = metadataListB.get(i);
            ContentTags contentTagsB = getContent(fpsB, metadataB);
            //the first file should have the same id as the container id
            String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
            writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
            writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
            //there is no A counterpart for this item
            writeEmbeddedFilePathData(i, fileId, null, metadataB);
            writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);

            //write content
            try {
                Map<Class, Object> tokenStatsB = calcTextStats(contentTagsB);
                writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
}