in tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java [189:436]
/**
 * Combines the magic-byte candidates with the extension-based and metadata-based
 * type hints and returns the media type with the highest posterior probability.
 *
 * <p>For every magic candidate three posteriors are computed: one for the magic
 * type, one for the metadata type and one for the extension type. Each posterior
 * starts from that source's prior and is weighted by the trust values of the
 * sources that agree or disagree with it. A trust value of exactly {@code 1} is
 * used throughout as the marker for "this source gave no usable hint".
 *
 * <p>The supplied list is only read; it is never modified.
 *
 * @param possibleTypes    candidates found by magic detection, in order of
 *                         preference; may be {@code null} or empty
 * @param extMimeType      type derived from the resource name, or {@code null}
 * @param metadataMimeType type derived from the metadata hint, or {@code null}
 * @return the best estimate, or the root media type when there are no magic
 *         candidates, when the best probability is below {@code threshold}, or
 *         when the winning source carries no type; never {@code null}
 */
private MediaType applyProbilities(final List<MimeType> possibleTypes,
                                   final MimeType extMimeType,
                                   final MimeType metadataMimeType) {
    // Without magic candidates nothing is scored, so the answer is the root type.
    if (possibleTypes == null || possibleTypes.isEmpty()) {
        return rootMediaType;
    }

    /* initialize some probability variables */
    MediaType extensionMediaType_ = extMimeType == null ? null : extMimeType.getType();
    MediaType metaMediaType_ = metadataMimeType == null ? null : metadataMimeType.getType();
    float mag_trust = magic_trust;
    float mag_neg = magic_neg;
    float ext_trust = extension_trust;
    float ext_neg = extension_neg;
    float met_trust = meta_trust;
    float met_neg = meta_neg;

    /*
     * A missing or root type means the corresponding method failed to identify
     * anything; neutralise its weights so it does not influence the result.
     */
    if (extensionMediaType_ == null || extensionMediaType_.compareTo(rootMediaType) == 0) {
        ext_trust = 1;
        ext_neg = 1;
    }
    if (metaMediaType_ == null || metaMediaType_.compareTo(rootMediaType) == 0) {
        met_trust = 1;
        met_neg = 1;
    }

    float maxProb = -1f;
    MediaType bestEstimate = rootMediaType;
    final MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
    final int n = possibleTypes.size();

    for (int i = 0; i < n; i++) {
        // Local stand-in for the i-th candidate; it may be replaced by a more
        // specific hint below without touching the caller's list.
        MimeType candidate = possibleTypes.get(i);
        MediaType magictype = candidate.getType();

        if (magictype != null && magictype.equals(rootMediaType)) {
            // Magic detection found nothing useful. Note that this override
            // stays in effect for later candidates (subject to the decay below).
            mag_trust = 1;
            mag_neg = 1;
        } else {
            // check if each identified type belongs to the same class;
            if (extensionMediaType_ != null) {
                if (extensionMediaType_.equals(magictype)
                        || registry.isSpecializationOf(extensionMediaType_, magictype)) {
                    // The extension hint is the same or more specific: use it.
                    candidate = extMimeType;
                } else if (registry.isSpecializationOf(magictype, extensionMediaType_)) {
                    // The magic type is more specific: refine the extension hint.
                    extensionMediaType_ = magictype;
                }
            }
            if (metaMediaType_ != null) {
                if (metaMediaType_.equals(magictype)
                        || registry.isSpecializationOf(metaMediaType_, magictype)) {
                    // The metadata hint is the same or more specific: use it.
                    candidate = metadataMimeType;
                } else if (registry.isSpecializationOf(magictype, metaMediaType_)) {
                    // The magic type is more specific: refine the metadata hint.
                    metaMediaType_ = magictype;
                }
            }
        }
        magictype = candidate.getType();

        if (i > 0) {
            /*
             * Later magic candidates are trusted less: the positive weight
             * decays while the negative weight grows.
             */
            mag_trust = mag_trust * (1 - changeRate);
            mag_neg = mag_neg * (1 + changeRate);
        }

        final boolean magicUsable = magictype != null && mag_trust != 1;
        final boolean metaUsable = metaMediaType_ != null && met_trust != 1;
        final boolean extUsable = extensionMediaType_ != null && ext_trust != 1;

        /*
         * Posterior for each source's type. A source without a usable hint gets
         * the fixed placeholder score 0.1. Slot 0 = magic, 1 = metadata,
         * 2 = extension.
         */
        float magicPosterior = 0.1f;
        if (magicUsable) {
            float[] trust = new float[3];
            float[] negTrust = new float[3];
            trust[0] = mag_trust;
            negTrust[0] = mag_neg;
            fillTrustSlot(trust, negTrust, 1, metaUsable,
                    metaUsable && magictype.equals(metaMediaType_), met_trust, met_neg);
            fillTrustSlot(trust, negTrust, 2, extUsable,
                    extUsable && magictype.equals(extensionMediaType_), ext_trust, ext_neg);
            magicPosterior = computePosterior(priorMagicFileType, trust, negTrust);
        }

        float metaPosterior = 0.1f;
        if (metaUsable) {
            float[] trust = new float[3];
            float[] negTrust = new float[3];
            trust[1] = met_trust;
            negTrust[1] = met_neg;
            fillTrustSlot(trust, negTrust, 0, magicUsable,
                    magicUsable && metaMediaType_.equals(magictype), mag_trust, mag_neg);
            fillTrustSlot(trust, negTrust, 2, extUsable,
                    extUsable && metaMediaType_.equals(extensionMediaType_), ext_trust, ext_neg);
            metaPosterior = computePosterior(priorMetaFileType, trust, negTrust);
        }

        float extPosterior = 0.1f;
        if (extUsable) {
            float[] trust = new float[3];
            float[] negTrust = new float[3];
            trust[2] = ext_trust;
            negTrust[2] = ext_neg;
            fillTrustSlot(trust, negTrust, 0, magicUsable,
                    magicUsable && magictype.equals(extensionMediaType_), mag_trust, mag_neg);
            fillTrustSlot(trust, negTrust, 1, metaUsable,
                    metaUsable && metaMediaType_.equals(extensionMediaType_), met_trust, met_neg);
            extPosterior = computePosterior(priorExtensionFileType, trust, negTrust);
        }

        // Keep the best score seen so far; ties keep the earlier winner.
        if (maxProb < magicPosterior) {
            maxProb = magicPosterior;
            bestEstimate = magictype;
        }
        if (maxProb < metaPosterior) {
            maxProb = metaPosterior;
            bestEstimate = metaMediaType_;
        }
        if (maxProb < extPosterior) {
            maxProb = extPosterior;
            bestEstimate = extensionMediaType_;
        }
    }

    // A source with the placeholder score can win while carrying no type at all;
    // fall back to the root type instead of handing null to the caller.
    if (bestEstimate == null || maxProb < threshold) {
        return this.rootMediaType;
    }
    return bestEstimate;
}

/**
 * Writes the weights contributed by one supporting source into the given slot:
 * neutral (1, 1) when the source is unusable, its own weights when it agrees
 * with the type being scored, and the complements otherwise.
 */
private void fillTrustSlot(float[] trust, float[] negTrust, int slot, boolean usable,
                           boolean agrees, float sourceTrust, float sourceNeg) {
    if (!usable) {
        trust[slot] = 1f;
        negTrust[slot] = 1f;
    } else if (agrees) {
        trust[slot] = sourceTrust;
        negTrust[slot] = sourceNeg;
    } else {
        trust[slot] = 1 - sourceTrust;
        negTrust[slot] = 1 - sourceNeg;
    }
}

/**
 * Computes {@code p / (p + q)} where {@code p} is the prior multiplied by every
 * trust weight and {@code q} is {@code 1 - prior} multiplied by the negative
 * weight of every slot whose trust weight is not the neutral value 1.
 */
private float computePosterior(float prior, float[] trust, float[] negTrust) {
    float pPrime = prior;
    float deno = 1 - prior;
    for (int j = 0; j < trust.length; j++) {
        pPrime *= trust[j];
        if (trust[j] != 1) {
            deno *= negTrust[j];
        }
    }
    return pPrime / (pPrime + deno);
}