in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java [113:254]
/**
 * Reads the headers of a Windows PE executable from the stream and records
 * what they describe (content type, platform, creation date, machine type,
 * endianness and word size) in the supplied metadata.
 * <p>
 * The content type and platform are set unconditionally before any header
 * is validated. If the PE header offset is implausible, or no
 * <code>PE\0\0</code> signature is found at that offset (old style MS-DOS
 * executable), the method returns without setting anything further.
 *
 * @param xhtml    content handler; not used by this method
 * @param metadata metadata object that receives the extracted values
 * @param stream   stream to read from; the offsets used below assume that
 *                 the first four bytes of the file have already been
 *                 consumed by the caller
 * @param first4   presumably the four bytes already consumed from the
 *                 stream; not used by this method
 * @throws TikaException declared for callers; not thrown directly here
 * @throws IOException   if reading from the stream fails
 */
public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream,
                    byte[] first4) throws TikaException, IOException {
    metadata.set(Metadata.CONTENT_TYPE, PE_EXE.toString());
    metadata.set(PLATFORM, PLATFORM_WINDOWS);

    // Skip over the MS-DOS bit, up to the PE header offset field at 0x3c
    // (four bytes of the file are taken to be consumed already)
    byte[] msdosSection = new byte[0x3c - 4];
    IOUtils.readFully(stream, msdosSection);

    // Grab the PE header offset
    int peOffset = EndianUtils.readIntLE(stream);

    // Reasonability check - while it may go anywhere, it's normally in the first few kb.
    // At this point 0x40 bytes of the file have been consumed, so a PE header
    // that starts any earlier than that cannot be reached (or be valid).
    if (peOffset > 4096 || peOffset < 0x40) {
        return;
    }

    // Skip the rest of the MS-DOS stub (if PE), until we reach what should
    // be the PE header (if this is a PE executable). InputStream.skip may
    // legitimately skip fewer bytes than requested, so keep going until the
    // whole gap has been consumed.
    long toSkip = peOffset - 0x40;
    while (toSkip > 0) {
        long skipped = stream.skip(toSkip);
        if (skipped > 0) {
            toSkip -= skipped;
        } else if (stream.read() >= 0) {
            // skip() made no progress but the stream still has data
            toSkip--;
        } else {
            // End of stream: leave it to the header read below to deal with
            // the truncated input, as it did before
            break;
        }
    }

    // Read the PE header: 4 byte signature followed by the 20 byte COFF header
    byte[] pe = new byte[24];
    IOUtils.readFully(stream, pe);

    // Check it really is a PE header
    boolean hasPESignature =
            pe[0] == (byte) 'P' && pe[1] == (byte) 'E' && pe[2] == 0 && pe[3] == 0;
    if (!hasPESignature) {
        // Old style MS-DOS
        return;
    }

    // Read the header values we report on. The remaining fields of the
    // header (number of sections at 6, symbol table offset at 12, number of
    // symbols at 16, optional header size at 20, characteristics at 22) are
    // not currently turned into metadata, so they are not decoded.
    int machine = EndianUtils.getUShortLE(pe, 4);
    // The timestamp is an unsigned 32 bit count of seconds since the epoch;
    // mask it so that values with the top bit set don't turn into dates
    // before 1970
    long createdAt = EndianUtils.getIntLE(pe, 8) & 0xFFFFFFFFL;

    // Turn this into helpful metadata
    Date createdAtD = new Date(createdAt * 1000L);
    metadata.set(TikaCoreProperties.CREATED, createdAtD);

    switch (machine) {
        case 0x14c:
            metadata.set(MACHINE_TYPE, MACHINE_x86_32);
            metadata.set(ENDIAN, Endian.LITTLE.getName());
            metadata.set(ARCHITECTURE_BITS, "32");
            break;
        case 0x8664:
            metadata.set(MACHINE_TYPE, MACHINE_x86_64);
            metadata.set(ENDIAN, Endian.LITTLE.getName());
            metadata.set(ARCHITECTURE_BITS, "64");
            break;
        case 0x200:
            metadata.set(MACHINE_TYPE, MACHINE_IA_64);
            metadata.set(ENDIAN, Endian.LITTLE.getName());
            metadata.set(ARCHITECTURE_BITS, "64");
            break;

        case 0x184:
            metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
            metadata.set(ENDIAN, Endian.LITTLE.getName());
            metadata.set(ARCHITECTURE_BITS, "32");
            break;
        case 0x284:
            metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
            metadata.set(ENDIAN, Endian.LITTLE.getName());
            metadata.set(ARCHITECTURE_BITS, "64");
            break;

        case 0x1c0:
        case 0x1c4:
            metadata.set(MACHINE_TYPE, MACHINE_ARM);
            metadata.set(ENDIAN, Endian.LITTLE.getName());
            metadata.set(ARCHITECTURE_BITS, "32");
            break;

        case 0x268:
            metadata.set(MACHINE_TYPE, MACHINE_M68K);
            metadata.set(ENDIAN, Endian.BIG.getName());
            metadata.set(ARCHITECTURE_BITS, "32");
            break;

        case 0x266:
        case 0x366:
        case 0x466:
            metadata.set(MACHINE_TYPE, MACHINE_MIPS);
            metadata.set(ENDIAN, Endian.BIG.getName());
            metadata.set(ARCHITECTURE_BITS, "16");
            break;
        case 0x162:
        case 0x166:
        case 0x168:
        case 0x169:
            metadata.set(MACHINE_TYPE, MACHINE_MIPS);
            metadata.set(ENDIAN, Endian.LITTLE.getName());
            metadata.set(ARCHITECTURE_BITS, "16");
            break;

        case 0x1f0:
        case 0x1f1:
            metadata.set(MACHINE_TYPE, MACHINE_PPC);
            metadata.set(ENDIAN, Endian.LITTLE.getName());
            metadata.set(ARCHITECTURE_BITS, "32");
            break;

        case 0x1a2:
        case 0x1a3:
            metadata.set(MACHINE_TYPE, MACHINE_SH3);
            metadata.set(ENDIAN, Endian.BIG.getName());
            metadata.set(ARCHITECTURE_BITS, "32");
            break;
        case 0x1a6:
            metadata.set(MACHINE_TYPE, MACHINE_SH4);
            metadata.set(ENDIAN, Endian.BIG.getName());
            metadata.set(ARCHITECTURE_BITS, "32");
            break;
        case 0x1a8:
            // NOTE(review): the PE specification lists 0x1a8 as Hitachi SH5,
            // but it is reported as SH3 here - confirm before changing
            metadata.set(MACHINE_TYPE, MACHINE_SH3);
            metadata.set(ENDIAN, Endian.BIG.getName());
            metadata.set(ARCHITECTURE_BITS, "32");
            break;

        case 0x9041:
            metadata.set(MACHINE_TYPE, MACHINE_M32R);
            metadata.set(ENDIAN, Endian.BIG.getName());
            metadata.set(ARCHITECTURE_BITS, "32");
            break;

        case 0xebc:
            // No endianness or word size is recorded for EFI byte code
            metadata.set(MACHINE_TYPE, MACHINE_EFI);
            break;

        default:
            metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
            break;
    }
}