private MediaType applyProbilities()

in tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java [189:436]


    /**
     * Chooses the most probable media type from the magic candidates, the extension-based
     * type and the metadata-based type.
     * <p>
     * For every entry of {@code possibleTypes}, in list order, three scores are computed: one
     * for the magic type, one for the metadata type and one for the extension type. The
     * non-null candidate with the highest score seen so far is remembered. The magic trust is
     * multiplied by {@code (1 - changeRate)} and the magic negative trust by
     * {@code (1 + changeRate)} for every candidate after the first.
     * <p>
     * Side effect: an entry of {@code possibleTypes} is replaced in place by
     * {@code extMimeType} / {@code metadataMimeType} when that type equals the magic type or
     * is a specialization of it, so the list must support {@code set}.
     * <p>
     * The method name (including its spelling) is kept as is so existing callers still
     * resolve.
     *
     * @param possibleTypes    candidate types predicted by magic; entries may be replaced
     * @param extMimeType      type derived from the extension, or {@code null} if none
     * @param metadataMimeType type derived from the metadata, or {@code null} if none
     * @return the best non-null estimate, or {@code rootMediaType} when no candidate scored
     *         at least {@code threshold}
     */
    private MediaType applyProbilities(final List<MimeType> possibleTypes,
                                       final MimeType extMimeType,
                                       final MimeType metadataMimeType) {

        /* initialize some probability variables */
        MediaType extensionMediaType_ = extMimeType == null ? null : extMimeType.getType();
        MediaType metaMediaType_ = metadataMimeType == null ? null : metadataMimeType.getType();

        int n = possibleTypes.size();
        float mag_trust = magic_trust;
        float mag_neg = magic_neg;
        float ext_trust = extension_trust;
        float ext_neg = extension_neg;
        float met_trust = meta_trust;
        float met_neg = meta_neg;
        /* ************************** */

        /*
         * pre-process some probability variables: a trust value of exactly 1 is used
         * throughout this method as the marker for "this source gives no information".
         */
        if (extensionMediaType_ == null || extensionMediaType_.compareTo(rootMediaType) == 0) {
            /*
             * this is a root type, that means the extension method fails to
             * identify any type.
             */
            ext_trust = 1;
            ext_neg = 1;
        }
        if (metaMediaType_ == null || metaMediaType_.compareTo(rootMediaType) == 0) {
            /* same as above, for the metadata method. */
            met_trust = 1;
            met_neg = 1;
        }

        float maxProb = -1f;
        MediaType bestEstimate = rootMediaType;

        if (!possibleTypes.isEmpty()) {
            // The registry does not depend on the loop variable, so it is looked up once.
            final MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
            for (int i = 0; i < n; i++) {
                MediaType magictype = possibleTypes.get(i).getType();
                if (magictype != null && magictype.equals(rootMediaType)) {
                    // NOTE(review): for i > 0 the decay step below turns this 1 into
                    // (1 - changeRate), so the "mag_trust != 1" checks no longer see the
                    // marker; mag_trust is also never reset to magic_trust afterwards.
                    // Left unchanged here - confirm whether that is intended.
                    mag_trust = 1;
                    mag_neg = 1;
                } else {
                    // check if each identified type belongs to the same class;
                    if (extensionMediaType_ != null) {
                        if (extensionMediaType_.equals(magictype) ||
                                registry.isSpecializationOf(extensionMediaType_, magictype)) {
                            // Use just this type
                            possibleTypes.set(i, extMimeType);
                        } else if (registry.isSpecializationOf(magictype, extensionMediaType_)) {
                            // the magic type is the more specific one; adopt it
                            extensionMediaType_ = magictype;
                        }
                    }
                    if (metaMediaType_ != null) {
                        if (metaMediaType_.equals(magictype) ||
                                registry.isSpecializationOf(metaMediaType_, magictype)) {
                            // Use just this type
                            possibleTypes.set(i, metadataMimeType);
                        } else if (registry.isSpecializationOf(magictype, metaMediaType_)) {
                            // the magic type is the more specific one; adopt it
                            metaMediaType_ = magictype;
                        }
                    }
                }

                /*
                 * prepare the conditional probability for file type prediction.
                 * In every trust/negtrust array index 0 is the magic source, index 1 the
                 * metadata source and index 2 the extension source. results[k] holds the
                 * score of source k; 0 means "still to be computed".
                 */

                float[] results = new float[3];
                float[] trust1 = new float[3];
                float[] negtrust1 = new float[3];
                // re-read: the entry may have been replaced just above
                magictype = possibleTypes.get(i).getType();

                if (i > 0) {
                    /*
                     * decay as our trust goes down with next type predicted by
                     * magic
                     */
                    mag_trust = mag_trust * (1 - changeRate);
                    /*
                     * grow as our trust goes down
                     */
                    mag_neg = mag_neg * (1 + changeRate);

                }

                /* hypothesis 1: the magic type is the right one */
                if (magictype != null && mag_trust != 1) {
                    trust1[0] = mag_trust;
                    negtrust1[0] = mag_neg;
                    if (metaMediaType_ != null && met_trust != 1) {
                        if (magictype.equals(metaMediaType_)) {
                            trust1[1] = met_trust;
                            negtrust1[1] = met_neg;
                        } else {
                            trust1[1] = 1 - met_trust;
                            negtrust1[1] = 1 - met_neg;
                        }
                    } else {
                        trust1[1] = 1;
                        negtrust1[1] = 1;
                    }
                    if (extensionMediaType_ != null && ext_trust != 1) {
                        if (magictype.equals(extensionMediaType_)) {
                            trust1[2] = ext_trust;
                            negtrust1[2] = ext_neg;
                        } else {
                            trust1[2] = 1 - ext_trust;
                            negtrust1[2] = 1 - ext_neg;
                        }
                    } else {
                        trust1[2] = 1;
                        negtrust1[2] = 1;
                    }
                } else {
                    // no usable magic evidence: fixed placeholder score, skips the computation
                    results[0] = 0.1f;
                }

                /* hypothesis 2: the metadata type is the right one */
                float[] trust2 = new float[3];
                float[] negtrust2 = new float[3];
                // Tested on metaMediaType_ like the sibling branches; met_trust != 1 already
                // implies the metadata type was non-null, so this matches the former
                // metadataMimeType check.
                if (metaMediaType_ != null && met_trust != 1) {
                    trust2[1] = met_trust;
                    negtrust2[1] = met_neg;
                    if (magictype != null && mag_trust != 1) {
                        if (metaMediaType_.equals(magictype)) {
                            trust2[0] = mag_trust;
                            negtrust2[0] = mag_neg;
                        } else {
                            trust2[0] = 1 - mag_trust;
                            negtrust2[0] = 1 - mag_neg;
                        }

                    } else {
                        trust2[0] = 1f;
                        negtrust2[0] = 1f;
                    }
                    if (extensionMediaType_ != null && ext_trust != 1) {
                        if (metaMediaType_.equals(extensionMediaType_)) {
                            trust2[2] = ext_trust;
                            negtrust2[2] = ext_neg;
                        } else {
                            trust2[2] = 1 - ext_trust;
                            negtrust2[2] = 1 - ext_neg;
                        }
                    } else {
                        trust2[2] = 1f;
                        negtrust2[2] = 1f;
                    }
                } else {
                    // no usable metadata evidence: fixed placeholder score
                    results[1] = 0.1f;
                }

                /* hypothesis 3: the extension type is the right one */
                float[] trust3 = new float[3];
                float[] negtrust3 = new float[3];
                if (extensionMediaType_ != null && ext_trust != 1) {
                    trust3[2] = ext_trust;
                    negtrust3[2] = ext_neg;
                    if (magictype != null && mag_trust != 1) {
                        if (magictype.equals(extensionMediaType_)) {
                            trust3[0] = mag_trust;
                            negtrust3[0] = mag_neg;
                        } else {
                            trust3[0] = 1 - mag_trust;
                            negtrust3[0] = 1 - mag_neg;
                        }
                    } else {
                        trust3[0] = 1f;
                        negtrust3[0] = 1f;
                    }

                    if (metaMediaType_ != null && met_trust != 1) {
                        if (metaMediaType_.equals(extensionMediaType_)) {
                            trust3[1] = met_trust;
                            negtrust3[1] = met_neg;
                        } else {
                            trust3[1] = 1 - met_trust;
                            negtrust3[1] = 1 - met_neg;
                        }
                    } else {
                        trust3[1] = 1f;
                        negtrust3[1] = 1f;
                    }
                } else {
                    // no usable extension evidence: fixed placeholder score
                    results[2] = 0.1f;
                }

                /*
                 * compute the posterior probability for each predicted file
                 * type and store them into the "results" array.
                 *
                 * A null candidate is never allowed to become the best estimate: its
                 * placeholder score of 0.1 could otherwise beat a lower real score and make
                 * this method return null whenever threshold <= 0.1.
                 */
                if (results[0] == 0) {
                    results[0] = computePosterior(priorMagicFileType, trust1, negtrust1);
                }
                if (magictype != null && maxProb < results[0]) {
                    maxProb = results[0];
                    bestEstimate = magictype;
                }

                if (results[1] == 0) {
                    results[1] = computePosterior(priorMetaFileType, trust2, negtrust2);
                }
                if (metaMediaType_ != null && maxProb < results[1]) {
                    maxProb = results[1];
                    bestEstimate = metaMediaType_;
                }

                if (results[2] == 0) {
                    results[2] = computePosterior(priorExtensionFileType, trust3, negtrust3);
                }
                if (extensionMediaType_ != null && maxProb < results[2]) {
                    maxProb = results[2];
                    bestEstimate = extensionMediaType_;
                }
            }

        }
        return maxProb < threshold ? this.rootMediaType : bestEstimate;

    }

    /**
     * Computes {@code p / (p + q)} where {@code p} is {@code prior} multiplied by every
     * entry of {@code trust}, and {@code q} is {@code (1 - prior)} multiplied by every entry
     * of {@code negTrust} whose matching {@code trust} entry is not exactly 1.
     * <p>
     * The result is NaN when {@code p + q} is zero; callers compare it with {@code <}, which
     * is false for NaN.
     *
     * @param prior    prior probability of the hypothesis
     * @param trust    per-source trust factors
     * @param negTrust per-source negative trust factors, same length as {@code trust}
     * @return the posterior score
     */
    private static float computePosterior(final float prior, final float[] trust,
                                          final float[] negTrust) {
        float pPrime = prior;
        float deno = 1 - prior;
        for (int j = 0; j < trust.length; j++) {
            pPrime *= trust[j];
            if (trust[j] != 1) {
                deno *= negTrust[j];
            }
        }
        return pPrime / (pPrime + deno);
    }