private void compareFileMagic()

in tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java [858:969]


    private void compareFileMagic(String magicDir) throws Exception {
        Set<String> tikaLacking = new TreeSet<>();
        Set<String> tikaNoMagic = new TreeSet<>();

        // Plausibility check
        File dir = new File(magicDir);
        if ((new File(dir, "elf")).exists() && (new File(dir, "mime")).exists() && (new File(dir, "vorbis")).exists()) {
            // Looks plausible
        } else {
            throw new IllegalArgumentException(magicDir + " doesn't seem to hold uncompressed file magic entries");
        }

        // Find all the mimetypes in the directory
        Set<String> fileMimes = new HashSet<>();
        for (File mf : dir.listFiles()) {
            if (mf.isFile()) {
                BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(mf), UTF_8));
                String line;
                while ((line = r.readLine()) != null) {
                    if (line.startsWith("!:mime") || line.startsWith("#!:mime")) {
                        String mime = line
                                .substring(7)
                                .trim();
                        fileMimes.add(mime);
                    }
                }
                r.close();
            }
        }

        // See how those compare to the Tika ones
        TikaConfig config = TikaConfig.getDefaultConfig();
        MimeTypes mimeTypes = config.getMimeRepository();
        MediaTypeRegistry registry = config.getMediaTypeRegistry();
        for (String mime : fileMimes) {
            try {
                final MimeType type = mimeTypes.getRegisteredMimeType(mime);

                if (type == null) {
                    // Tika doesn't know about this one
                    tikaLacking.add(mime);
                } else {
                    // Tika knows about this one!

                    // Does Tika have magic for it?
                    boolean hasMagic = type.hasMagic();

                    // How about the children?
                    if (!hasMagic) {
                        for (MediaType child : registry.getChildTypes(type.getType())) {
                            MimeType childType = mimeTypes.getRegisteredMimeType(child.toString());
                            if (childType != null && childType.hasMagic()) {
                                hasMagic = true;
                            }
                        }
                    }

                    // How about the parents?
                    MimeType parentType = type;
                    while (parentType != null && !hasMagic) {
                        if (parentType.hasMagic()) {
                            // Has magic, fine
                            hasMagic = true;
                        } else {
                            // Check the parent next
                            MediaType parent = registry.getSupertype(type.getType());
                            if (parent == MediaType.APPLICATION_XML || parent == MediaType.TEXT_PLAIN || parent == MediaType.OCTET_STREAM) {
                                // Stop checking parents if we hit a top level type
                                parent = null;
                            }
                            if (parent != null) {
                                parentType = mimeTypes.getRegisteredMimeType(parent.toString());
                            } else {
                                parentType = null;
                            }
                        }
                    }
                    if (!hasMagic) {
                        tikaNoMagic.add(mime);
                    }
                }
            } catch (MimeTypeException e) {
                // Broken entry in the file magic directory
                // Silently skip
            }
        }

        // Check how many tika knows about
        int tikaTypes = 0;
        int tikaAliases = 0;
        for (MediaType type : registry.getTypes()) {
            tikaTypes++;
            tikaAliases += registry
                    .getAliases(type)
                    .size();
        }

        // Report
        System.out.println("Tika knows about " + tikaTypes + " unique mime types");
        System.out.println("Tika knows about " + (tikaTypes + tikaAliases) + " mime types including aliases");
        System.out.println("The File Magic directory knows about " + fileMimes.size() + " unique mime types");
        System.out.println();
        System.out.println("The following mime types are known to File but not Tika:");
        for (String mime : tikaLacking) {
            System.out.println("  " + mime);
        }
        System.out.println();
        System.out.println("The following mime types from File have no Tika magic (but their children might):");
        for (String mime : tikaNoMagic) {
            System.out.println("  " + mime);
        }
    }