public void parsePE()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java [113:254]


    /**
     * Reads the MS-DOS stub and COFF file header of a Windows PE executable and
     * records what it finds (creation time, machine type, endianness and word size)
     * on the supplied metadata.
     *
     * <p>The content type and platform are set unconditionally at the start, so they
     * remain set even when the method returns early because the PE header offset is
     * implausible or the {@code PE\0\0} signature is missing (an old-style MS-DOS
     * executable).
     *
     * <p>The method reads {@code 0x3c - 4} bytes to reach the header-offset field at
     * 0x3c, i.e. it assumes the first four bytes of the file have already been
     * consumed from {@code stream} by the caller.
     *
     * @param xhtml    handler for the document body; not used by this method
     * @param metadata metadata object that receives the extracted values
     * @param stream   executable content, positioned just after the first four bytes
     * @param first4   the first four bytes of the file; not used by this method
     * @throws TikaException declared for the header-reading helpers
     * @throws IOException   if the stream cannot be read, or ends before the
     *                       headers have been read completely
     */
    public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream,
                        byte[] first4) throws TikaException, IOException {
        metadata.set(Metadata.CONTENT_TYPE, PE_EXE.toString());
        metadata.set(PLATFORM, PLATFORM_WINDOWS);

        // Skip over the MS-DOS bit, up to the PE header offset field at 0x3c
        byte[] msdosSection = new byte[0x3c - 4];
        IOUtils.readFully(stream, msdosSection);

        // Grab the PE header offset; afterwards the stream is positioned at 0x40
        int peOffset = EndianUtils.readIntLE(stream);

        // Reasonability check - while it may go anywhere, it's normally in the first few kb.
        // It also cannot point before 0x40, as that would overlap the bytes already read
        // and would make the skip distance below negative.
        if (peOffset > 4096 || peOffset < 0x40) {
            return;
        }

        // Skip the rest of the MS-DOS stub (if PE), until we reach what should
        //  be the PE header (if this is a PE executable). InputStream.skip() may
        //  skip fewer bytes than asked for, which would misalign the header read,
        //  so insist on the full distance.
        IOUtils.skipFully(stream, peOffset - 0x40);

        // Read the PE signature (4 bytes) plus the COFF file header (20 bytes)
        byte[] pe = new byte[24];
        IOUtils.readFully(stream, pe);

        // Check it really is a PE header; if not, this is an old style MS-DOS executable
        boolean hasPeSignature =
                pe[0] == (byte) 'P' && pe[1] == (byte) 'E' && pe[2] == 0 && pe[3] == 0;
        if (!hasPeSignature) {
            return;
        }

        // Header layout (offsets into pe): 4 = machine, 6 = number of sections,
        //  8 = timestamp, 12 = symbol table offset, 16 = number of symbols,
        //  20 = optional header size, 22 = characteristics.
        // Only the machine type and the timestamp are used here.
        int machine = EndianUtils.getUShortLE(pe, 4);

        // The timestamp is an unsigned 32 bit count of seconds since the epoch, so
        //  mask off any sign extension to keep values with the top bit set positive
        long createdAt = EndianUtils.getIntLE(pe, 8) & 0xFFFFFFFFL;

        // Turn this into helpful metadata
        Date createdAtD = new Date(createdAt * 1000L);
        metadata.set(TikaCoreProperties.CREATED, createdAtD);

        switch (machine) {
            case 0x14c:
                metadata.set(MACHINE_TYPE, MACHINE_x86_32);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;
            case 0x8664:
                metadata.set(MACHINE_TYPE, MACHINE_x86_64);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "64");
                break;
            case 0x200:
                metadata.set(MACHINE_TYPE, MACHINE_IA_64);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "64");
                break;

            case 0x184:
                metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;
            case 0x284:
                metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "64");
                break;

            case 0x1c0:
            case 0x1c4:
                metadata.set(MACHINE_TYPE, MACHINE_ARM);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            case 0x268:
                metadata.set(MACHINE_TYPE, MACHINE_M68K);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            case 0x266:
            case 0x366:
            case 0x466:
                metadata.set(MACHINE_TYPE, MACHINE_MIPS);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "16");
                break;
            // NOTE(review): these look like the R3000/R4000/R10000/WCE v2 MIPS codes,
            //  which are presumably not 16 bit - confirm against the PE specification
            case 0x162:
            case 0x166:
            case 0x168:
            case 0x169:
                metadata.set(MACHINE_TYPE, MACHINE_MIPS);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "16");
                break;

            case 0x1f0:
            case 0x1f1:
                metadata.set(MACHINE_TYPE, MACHINE_PPC);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            case 0x1a2:
            case 0x1a3:
                metadata.set(MACHINE_TYPE, MACHINE_SH3);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;
            case 0x1a6:
                metadata.set(MACHINE_TYPE, MACHINE_SH4);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;
            // NOTE(review): 0x1a8 is presumably the SH5 code, yet it is reported
            //  as SH3 here - confirm whether that is intentional
            case 0x1a8:
                metadata.set(MACHINE_TYPE, MACHINE_SH3);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            case 0x9041:
                metadata.set(MACHINE_TYPE, MACHINE_M32R);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            case 0xebc:
                // Only the machine type is recorded for EFI byte code
                metadata.set(MACHINE_TYPE, MACHINE_EFI);
                break;

            default:
                metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
                break;
        }
    }