tools/hprof/dump_classes_from_hprof.py (1,094 lines of code) (raw):

#!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. # Parses Android hprof dumps. # Example usage: # In [1]: import hprof # In [2]: hprof.parse_filename('/Users/tcm/Documents/com.facebook.crudo.hprof') # Out[3]: <HprofData TAG="JAVA PROFILE 1.0.3" id-size=4 timestamp=1406233374264> # In [4]: hp = _ # In [5]: hprof.open_bitmaps(hp.lookup_instances_of_class('android.graphics.Bitmap')) # Writing 281 bitmaps to /var/folders/1g/1dxc_1rd1jg5l2csn_tck2085w5qnj/T/tmpwMwbMCbitmaps. # 281 of 281 complete import argparse import enum import logging import os.path import struct import subprocess import sys import tempfile from array import array from collections import defaultdict # Allow missing IDs in some cases to work around seemingly broken hprofs. allow_missing_ids = False def parse_hprof_dump(instream): # Read the tag - a null-terminated string tag = b"" while True: byte = instream.read(1) if not byte: break if byte == b"\x00": break tag += byte tag = tag.decode("utf-8") # UTF8 should be close enough to modified UTF8. big_endian_unsigned_4byte_integer = struct.Struct(b">I") sizeof_id = big_endian_unsigned_4byte_integer.unpack(instream.read(4))[0] high_timestamp = big_endian_unsigned_4byte_integer.unpack(instream.read(4))[0] low_timestamp = big_endian_unsigned_4byte_integer.unpack(instream.read(4))[0] timestamp = (high_timestamp << 32) | low_timestamp hprof_data = HprofData(tag, sizeof_id, timestamp) while True: record = Record.read_from_stream(hprof_data, instream) if record.tag == HprofTag.HEAP_DUMP_END: break hprof_data.resolve() return hprof_data def parse_file(instream): return parse_hprof_dump(instream) def parse_filename(filename): return parse_hprof_dump(open(filename, "rb")) class HprofTag(enum.Enum): STRING = 0x01 LOAD_CLASS = 0x02 UNLOAD_CLASS = 0x03 STACK_FRAME = 0x04 STACK_TRACE = 0x05 ALLOC_SITES = 0x06 HEAP_SUMMARY = 0x07 START_THREAD = 0x0A END_THREAD = 0x0B HEAP_DUMP = 0x0C HEAP_DUMP_SEGMENT = 0x1C HEAP_DUMP_END = 0x2C CPU_SAMPLES = 0x0D CONTROL_SETTINGS = 0x0E class HeapTag(enum.Enum): # standard ROOT_UNKNOWN = 0xFF ROOT_JNI_GLOBAL = 0x01 ROOT_JNI_LOCAL = 0x02 ROOT_JAVA_FRAME = 0x03 ROOT_NATIVE_STACK = 0x04 ROOT_STICKY_CLASS = 0x05 ROOT_THREAD_BLOCK = 0x06 ROOT_MONITOR_USED = 0x07 ROOT_THREAD_OBJECT = 0x08 CLASS_DUMP = 0x20 INSTANCE_DUMP = 0x21 OBJECT_ARRAY_DUMP = 0x22 PRIMITIVE_ARRAY_DUMP = 0x23 # Android HEAP_DUMP_INFO = 0xFE ROOT_INTERNED_STRING = 0x89 ROOT_FINALIZING = 0x8A # obsolete ROOT_DEBUGGER = 0x8B ROOT_REFERENCE_CLEANUP = 0x8C # obsolete ROOT_VM_INTERNAL = 0x8D ROOT_JNI_MONITOR = 0x8E UNREACHABLE = 0x90 # obsolete PRIMITIVE_ARRAY_NODATA_DUMP = 0xC3 class HprofBasic(enum.Enum): OBJECT = 2 BOOLEAN = 4 CHAR = 5 FLOAT = 6 DOUBLE = 7 BYTE = 8 SHORT = 9 INT = 10 LONG = 11 def size(self): if self is HprofBasic.OBJECT: return 4 elif self is HprofBasic.BOOLEAN: return 1 elif self is HprofBasic.CHAR: return 2 elif self is HprofBasic.FLOAT: return 4 elif self is HprofBasic.DOUBLE: return 8 elif self is HprofBasic.BYTE: return 1 elif self is HprofBasic.SHORT: return 2 elif self is HprofBasic.INT: return 4 elif self is HprofBasic.LONG: return 8 else: raise Exception("Invalid HprofBasic type: %s" % self) def parse(self, byte_stream): if self is HprofBasic.OBJECT: return byte_stream.next_id() elif self is HprofBasic.BOOLEAN: return byte_stream.next_byte() != 0 elif self is HprofBasic.CHAR: return byte_stream.next_two_bytes() elif self is HprofBasic.FLOAT: return byte_stream.next_four_bytes() elif self is HprofBasic.DOUBLE: return byte_stream.next_eight_bytes() elif self is HprofBasic.BYTE: return byte_stream.next_byte() elif self is HprofBasic.SHORT: return byte_stream.next_two_bytes() elif self is HprofBasic.INT: return byte_stream.next_four_bytes() elif self is HprofBasic.LONG: return byte_stream.next_eight_bytes() else: raise Exception("Invalid HprofBasic type: %s" % self) class Record(object): record_struct_format = b">BII" record_struct = struct.Struct(record_struct_format) def __init__(self, tag, time_offset_us): self.tag = tag self.time_offset_us = time_offset_us @staticmethod def read_from_stream(hprof_data, instream): (tag, time_offset_us, length) = Record.record_struct.unpack( instream.read(struct.calcsize(Record.record_struct_format)) ) data = array("B") data.frombytes(instream.read(length)) if tag == HprofTag.STRING.value: return hprof_data.parse_string_record( tag=tag, time_offset_us=time_offset_us, data=data ) elif tag == HprofTag.LOAD_CLASS.value: return hprof_data.parse_load_class_record( tag=tag, time_offset_us=time_offset_us, data=data ) elif tag == HprofTag.HEAP_DUMP_SEGMENT.value: return hprof_data.parse_heap_dump_segment_record( tag=tag, time_offset_us=time_offset_us, data=data ) # default record = Record(tag=HprofTag(tag), time_offset_us=time_offset_us) record.length = length record.data = data return record def __str__(self): return "Record { %s %dus %d }" % (self.tag, self.time_offset_us, self.length) class StringRecord(Record): def __init__(self, tag, time_offset_us, string_id, string): super(StringRecord, self).__init__(tag, time_offset_us) self.string_id = string_id self.string = string def __str__(self): return "StringRecord { %dus %s }" % (self.time_offset_us, self.string) @staticmethod def create(tag, time_offset_us, data): byte_stream = ByteStream(data) heap_id = byte_stream.next_four_bytes() string = byte_stream.remainder().tobytes().decode("utf-8") return StringRecord(tag, time_offset_us, heap_id, string) class LoadClassRecord(Record): def __init__( self, tag, time_offset_us, class_serial, object_id, stack_serial, class_string_id, ): super(LoadClassRecord, self).__init__(tag, time_offset_us) self.class_serial = class_serial self.object_id = object_id self.stack_serial = stack_serial self.class_string_id = class_string_id def __str__(self): return "LoadClassRecord { %dus %d %d %d %d }" % ( self.time_offset_us, self.class_serial, self.object_id, self.stack_serial, self.class_string_id, ) @staticmethod def create(tag, time_offset_us, data): byte_stream = ByteStream(data) class_serial = byte_stream.next_four_bytes() object_id = byte_stream.next_id() stack_serial = byte_stream.next_four_bytes() class_string_id = byte_stream.next_id() assert not byte_stream.has_more() return LoadClassRecord( tag=tag, time_offset_us=time_offset_us, class_serial=class_serial, object_id=object_id, stack_serial=stack_serial, class_string_id=class_string_id, ) class ByteStream(object): def __init__(self, data): self.data = data self.index = 0 def next_byte(self): byte = self.data[self.index] self.index += 1 return byte def next_two_bytes(self): two_bytes = struct.unpack(b">H", self.data[self.index : self.index + 2])[0] self.index += 2 return two_bytes def next_four_bytes(self): four_bytes = struct.unpack(b">I", self.data[self.index : self.index + 4])[0] self.index += 4 return four_bytes def next_eight_bytes(self): eight_bytes = struct.unpack(b">Q", self.data[self.index : self.index + 8])[0] self.index += 8 return eight_bytes # TODO: support 64-bit def next_id(self): return self.next_four_bytes() def next_byte_array(self, length): byte_array = self.data[self.index : self.index + length] self.index += length return byte_array def remainder(self): index = self.index self.index = len(self.data) return self.data[index:] def has_more(self): return self.index < len(self.data) - 1 class HeapDumpSegmentRecord(Record): def __init__(self, tag, time_offset_us): super(HeapDumpSegmentRecord, self).__init__(tag, time_offset_us) def __str__(self): return "HeapDumpSegmentRecord" class HprofHeap(object): def __init__(self, heap_id, name_string_id): self.heap_id = heap_id self.name_string_id = name_string_id def resolve(self, hprof_data): self.name = hprof_data.lookup_string(self.name_string_id) del self.name_string_id def __str__(self): return "<HprofHeap %s>" % self.name def __repr__(self): return str(self) class SimpleSegment(object): def __init__(self, heap_tag, object_id): self.heap_tag = heap_tag self.object_id = object_id class HprofRoot(SimpleSegment): def __init__(self, heap_tag, object_id): super(HprofRoot, self).__init__(heap_tag, object_id) def resolve(self, hprof_data): self.obj = hprof_data.resolve_object_id(self.object_id) del self.object_id def __str__(self): return "<HprofRoot %s %s>" % (self.heap_tag, self.obj) def __repr__(self): return str(self) class StaticField(object): def __init__(self, string_id, hprof_basic, value): self.string_id = string_id self.hprof_basic = hprof_basic self.value = value @staticmethod def parse(byte_stream): static_field_name_string_id = byte_stream.next_four_bytes() hprof_basic = HprofBasic(byte_stream.next_byte()) value = hprof_basic.parse(byte_stream) return StaticField(static_field_name_string_id, hprof_basic, value) def resolve(self, hprof_data): self.name = hprof_data.lookup_string(self.string_id) del self.string_id class InstanceField(object): def __init__(self, string_id, hprof_basic): self.string_id = string_id self.hprof_basic = hprof_basic @staticmethod def parse(byte_stream): field_name_string_id = byte_stream.next_four_bytes() hprof_basic = HprofBasic(byte_stream.next_byte()) return InstanceField(field_name_string_id, hprof_basic) def resolve(self, hprof_data): self.name = hprof_data.lookup_string(self.string_id) del self.string_id class ReferenceType(enum.Enum): SUPER_CLASS = 2 CLASS_LOADER = 3 CLASS = 4 FIELD = 5 ARRAY = 6 class Reference(object): def __init__(self, reference_type, referer, referee): assert isinstance(referer, HprofObject) assert isinstance(referee, HprofObject) self.reference_type = reference_type self.referer = referer self.referee = referee def __str__(self): return "<Reference %s %s %s>" % ( self.reference_type, self.referer, self.referee, ) def __repr__(self): return str(self) class FieldReference(Reference): def __init__(self, referer, referee, class_name, field_name): super(FieldReference, self).__init__(ReferenceType.FIELD, referer, referee) self.class_name = class_name self.field_name = field_name def __str__(self): return "<FieldReference %s.%s %s>" % ( self.referer, self.field_name, self.referee, ) class HprofObject(SimpleSegment): def __init__(self, heap_tag, object_id, heap_id): super(HprofObject, self).__init__(heap_tag, object_id) self.heap_id = heap_id def resolve(self, hprof_data): self.hprof_data = hprof_data if self.heap_id is None: self.heap = None else: self.heap = hprof_data.lookup_heap(self.heap_id) del self.heap_id def incoming_references(self): return self.hprof_data.lookup_references(self) def __str__(self): return "<%s 0x%x>" % (self.__class__.__name__, self.object_id) def __repr__(self): return str(self) class MergedFields(object): def __init__(self): pass class HprofInstance(HprofObject): def __init__(self, heap_tag, object_id, heap_id): super(HprofInstance, self).__init__(heap_tag, object_id, heap_id) def resolve(self, hprof_data): super(HprofInstance, self).resolve(hprof_data) # load the class of this instance self.clazz = hprof_data.resolve_object_id(self.class_object_id) del self.class_object_id # To avoid over-writing shadowed fields, we have nested dicts, one for each # class in the hierarchy self.class_fields = defaultdict(dict) # for convenience merged_fields_builder = defaultdict(dict) self.fields = MergedFields() byte_stream = ByteStream(self.instance_field_data) del self.instance_field_data # Instance field data consists of current class data, followed by super # class data, and so on. clazz = self.clazz while clazz is not None: for field in clazz.instance_fields: value = field.hprof_basic.parse(byte_stream) name = field.name if field.hprof_basic is HprofBasic.OBJECT: value = hprof_data.resolve_object_id(value) self.class_fields[clazz.name][name] = value merged_fields_builder[name][clazz.name] = value clazz = clazz.super_class for key, value in merged_fields_builder.items(): # Avoid over-writing python internals, like __dict__ if key in self.fields.__dict__: key = "__hprof_" + key assert key not in self.fields.__dict__ if len(value) == 1: setattr(self.fields, key, next(iter(value.values()))) else: # There is a conflict in the class hierarchy (e.g. privates with the # same name), so we need to store a dictionary. setattr(self.fields, key, value) if byte_stream.has_more(): raise Exception("Extra data in %d" % self.object_id) def outgoing_references_to(self, obj): return self.outgoing_references(lambda x: x is obj) def outgoing_references(self, filter_function=lambda x: True): # we are not walking the references from the class even though an instance # can be thought as referencing its class. That ends up being too intrusive # and attribute weight the wrong way. # Classes should be walked explicitly refs = [] for class_name, fields in self.class_fields.items(): for name, value in fields.items(): if isinstance(value, HprofObject) and filter_function(value): refs.append(FieldReference(self, value, class_name, name)) return refs def __str__(self): return "<%s %s 0x%x>" % ( self.__class__.__name__, self.clazz.name, self.object_id, ) def __repr__(self): return str(self) def shallow_size(self): # This should be pretty exact return self.clazz.instance_size class HprofClass(HprofObject): def __init__(self, object_id, heap_id): super(HprofClass, self).__init__(HeapTag.CLASS_DUMP, object_id, heap_id) self.children = [] @staticmethod def parse(byte_stream, heap_id): segment = HprofClass(byte_stream.next_id(), heap_id) segment.stack_serial = byte_stream.next_four_bytes() segment.super_class_id = byte_stream.next_id() segment.class_loader_id = byte_stream.next_id() segment.signer = byte_stream.next_id() # always zero on dalvik segment.prot_domain = byte_stream.next_id() # always zero on dalvik # reserved byte_stream.next_id() byte_stream.next_id() segment.instance_size = byte_stream.next_four_bytes() segment.const_pool_count = ( byte_stream.next_two_bytes() ) # always empty on dalvik if segment.const_pool_count > 0: raise Exception("Cannot handle const_pools.") static_field_count = byte_stream.next_two_bytes() segment.static_fields = [] for _ in range(static_field_count): segment.static_fields.append(StaticField.parse(byte_stream)) instance_field_count = byte_stream.next_two_bytes() segment.instance_fields = [] for _ in range(instance_field_count): segment.instance_fields.append(InstanceField.parse(byte_stream)) return segment def resolve(self, hprof_data): super(HprofClass, self).resolve(hprof_data) load_class_record = hprof_data.lookup_load_class_record(self.object_id) self.name = hprof_data.lookup_string(load_class_record.class_string_id) if self.super_class_id > 0: self.super_class = hprof_data.resolve_object_id(self.super_class_id) self.super_class.children.append(self) else: self.super_class = None del self.super_class_id if self.class_loader_id > 0: self.class_loader = hprof_data.resolve_object_id(self.class_loader_id) else: self.class_loader = None del self.class_loader_id for field in self.instance_fields: field.resolve(hprof_data) self.fields = MergedFields() for static_field in self.static_fields: static_field.resolve(hprof_data) if static_field.hprof_basic == HprofBasic.OBJECT: static_field.value = hprof_data.resolve_object_id(static_field.value) name = static_field.name # Don't want to overwrite Python internal fields - like __dict__ if name in self.fields.__dict__: name = "__hprof_" + name assert name not in self.fields.__dict__ setattr(self.fields, name, static_field.value) def outgoing_references_to(self, obj): return self.outgoing_references(lambda x: x is obj) def outgoing_references(self, filter_function=lambda x: True): refs = [] if self.super_class is not None and filter_function(self.super_class): refs.append(Reference(ReferenceType.SUPER_CLASS, self, self.super_class)) if self.class_loader is not None and filter_function(self.class_loader): refs.append(Reference(ReferenceType.CLASS_LOADER, self, self.class_loader)) for static_field in self.static_fields: if isinstance(static_field.value, HprofObject) and filter_function( static_field.value ): refs.append( FieldReference( self, static_field.value, self.name, static_field.name ) ) return refs def __str__(self): return "<Class %s>" % self.name def shallow_size(self): # This is an estimate # One id for the class (i.e. java.lang.Class) # One id for the lock # I counted 34 id members before the static array # 5 ids for each static field return 36 * self.hprof_data.sizeof_id + 5 * self.hprof_data.sizeof_id * len( self.static_fields ) class HprofPrimitiveArray(HprofObject): def __init__(self, object_id, heap_id): super(HprofPrimitiveArray, self).__init__( HeapTag.PRIMITIVE_ARRAY_DUMP, object_id, heap_id ) @staticmethod def parse(byte_stream, heap_id): segment = HprofPrimitiveArray(byte_stream.next_id(), heap_id) segment.stack_serial = byte_stream.next_four_bytes() segment.num_elements = byte_stream.next_four_bytes() segment.prim_type = HprofBasic(byte_stream.next_byte()) # Parsing primitive data is slow, and not always so interesting, so we defer segment.array_data = byte_stream.next_byte_array( segment.num_elements * segment.prim_type.size() ) return segment def resolve(self, hprof_data): super(HprofPrimitiveArray, self).resolve(hprof_data) self.clazz = hprof_data.class_name_dict[self.prim_type.name.lower() + "[]"] # Resolving large arrays is expensive and pointless def fully_resolve(self): self.array_values = [] byte_stream = ByteStream(self.array_data) count = int(len(self.array_data) / self.prim_type.size()) for _ in range(count): self.array_values.append(self.prim_type.parse(byte_stream)) # We keep around array_data since it's sometimes handy - like when loading bitmaps def outgoing_references_to(self, obj): return [] def outgoing_references(self, filter_function=lambda x: True): return [] def shallow_size(self): # This is an estimate # One id for the pointer to the class object # One id for the lock # One prim for each of the array slots return 2 * self.hprof_data.sizeof_id + self.num_elements * self.prim_type.size() class HprofObjectArray(HprofObject): def __init__(self, object_id, heap_id): super(HprofObjectArray, self).__init__( HeapTag.OBJECT_ARRAY_DUMP, object_id, heap_id ) @staticmethod def parse(byte_stream, heap_id): segment = HprofObjectArray(byte_stream.next_id(), heap_id) segment.stack_serial = byte_stream.next_four_bytes() num_elements = byte_stream.next_four_bytes() segment.array_class_object_id = byte_stream.next_four_bytes() segment.array_values = [] for _ in range(num_elements): segment.array_values.append(byte_stream.next_id()) return segment def resolve(self, hprof_data): super(HprofObjectArray, self).resolve(hprof_data) self.clazz = hprof_data.resolve_object_id(self.array_class_object_id) for i, obj in enumerate(self.array_values): # Resolve non-null value to parsed instance if obj != 0: self.array_values[i] = hprof_data.resolve_object_id( obj, "No object for %x (index %d in %x)", obj, i, self.object_id ) else: self.array_values[i] = None def outgoing_references_to(self, obj): return self.outgoing_references(lambda x: x is obj) def outgoing_references(self, filter_function=lambda x: True): refs = [] for value in self.array_values: if value is not None and filter_function(value): refs.append(Reference(ReferenceType.ARRAY, self, value)) return refs def shallow_size(self): # This is an estimate # One id for the pointer to the class object # One id for the lock # One id for each of the array slots return 2 * self.hprof_data.sizeof_id + self.hprof_data.sizeof_id * len( self.array_values ) class HprofString(HprofInstance): def __init__(self, heap_tag, object_id, heap_id): super(HprofString, self).__init__(heap_tag, object_id, heap_id) def string(self): char_array = self.class_fields["java.lang.String"]["value"] char_array.fully_resolve() return "".join( [chr(char_array.array_values[i]) for i in range(self.fields.count)] ) def __str__(self): return "<String %s 0x%x count=%d>" % ( self.clazz.name, self.object_id, self.fields.count, ) class HprofData(object): def __init__(self, tag, sizeof_id, timestamp): self.tag = tag self.sizeof_id = sizeof_id self.timestamp = timestamp self.object_id_dict = {} self.string_id_dict = {} self.class_object_id_to_load_class_record = {} self.roots = [] self.heap_dict = {} # Populated in resolve step self.class_name_dict = {} self.dupe_class_dict = defaultdict(list) self.current_heap_id = None self.string_class_object_id = None self.inverted_references = None self.gc_done = False def __str__(self): return '<HprofData TAG="%s" id-size=%d timestamp=%d>' % ( self.tag, self.sizeof_id, self.timestamp, ) def __repr__(self): return str(self) def parse_class_dump(self, byte_stream): clazz = HprofClass.parse(byte_stream, self.current_heap_id) # Kind of hacky, but allows derived instance types load_class_record = self.lookup_load_class_record(clazz.object_id) name = self.lookup_string(load_class_record.class_string_id) if name == "java.lang.String": assert self.string_class_object_id is None self.string_class_object_id = clazz.object_id self.object_id_dict[clazz.object_id] = clazz return clazz def parse_instance_dump(self, byte_stream): object_id = byte_stream.next_id() stack_serial = byte_stream.next_four_bytes() class_object_id = byte_stream.next_id() instance_field_values_size = byte_stream.next_four_bytes() instance_field_data = byte_stream.next_byte_array(instance_field_values_size) if class_object_id == self.string_class_object_id: segment = HprofString( HeapTag.INSTANCE_DUMP, object_id, self.current_heap_id ) else: segment = HprofInstance( HeapTag.INSTANCE_DUMP, object_id, self.current_heap_id ) segment.stack_serial = stack_serial segment.class_object_id = class_object_id segment.instance_field_data = instance_field_data if segment.object_id in self.object_id_dict: raise Exception("Duplicate object_id: %d" % segment.object_id) self.object_id_dict[segment.object_id] = segment return segment def parse_primitive_array_dump(self, byte_stream): primitive_array = HprofPrimitiveArray.parse(byte_stream, self.current_heap_id) if primitive_array.object_id in self.object_id_dict: raise Exception("Duplicate object_id: %d" % primitive_array.object_id) self.object_id_dict[primitive_array.object_id] = primitive_array return primitive_array def parse_object_array_dump(self, byte_stream): object_array = HprofObjectArray.parse(byte_stream, self.current_heap_id) if object_array.object_id in self.object_id_dict: raise Exception("Duplicate object_id: %d" % object_array.object_id) self.object_id_dict[object_array.object_id] = object_array return object_array def parse_string_record(self, tag, time_offset_us, data): string_record = StringRecord.create(tag, time_offset_us, data) self.string_id_dict[string_record.string_id] = string_record return string_record def parse_load_class_record(self, tag, time_offset_us, data): load_class_record = LoadClassRecord.create(tag, time_offset_us, data) self.class_object_id_to_load_class_record[ load_class_record.object_id ] = load_class_record return load_class_record def parse_heap_dump_segment_record(self, tag, time_offset_us, data): byte_stream = ByteStream(data) while byte_stream.has_more(): heap_tag = HeapTag(byte_stream.next_byte()) if heap_tag in ( HeapTag.ROOT_UNKNOWN, HeapTag.ROOT_STICKY_CLASS, HeapTag.ROOT_MONITOR_USED, HeapTag.ROOT_INTERNED_STRING, HeapTag.ROOT_FINALIZING, HeapTag.ROOT_DEBUGGER, HeapTag.ROOT_REFERENCE_CLEANUP, HeapTag.ROOT_VM_INTERNAL, ): object_id = byte_stream.next_four_bytes() hprof_root = HprofRoot(heap_tag, object_id) self.add_root(hprof_root) elif heap_tag is HeapTag.ROOT_JNI_GLOBAL: object_id = byte_stream.next_id() hprof_root = HprofRoot(heap_tag, object_id) hprof_root.jni_global_ref_id = byte_stream.next_id() self.add_root(hprof_root) elif heap_tag is HeapTag.ROOT_THREAD_OBJECT: thread_object_id = byte_stream.next_id() hprof_root = HprofRoot(heap_tag, thread_object_id) hprof_root.thread_serial = byte_stream.next_four_bytes() hprof_root.stack_serial = byte_stream.next_four_bytes() self.add_root(hprof_root) elif heap_tag in ( HeapTag.ROOT_JNI_LOCAL, HeapTag.ROOT_JNI_MONITOR, HeapTag.ROOT_JAVA_FRAME, ): object_id = byte_stream.next_id() hprof_root = HprofRoot(heap_tag, object_id) hprof_root.thread_serial = byte_stream.next_four_bytes() hprof_root.stack_serial = byte_stream.next_four_bytes() self.add_root(hprof_root) elif heap_tag in (HeapTag.ROOT_NATIVE_STACK, HeapTag.ROOT_THREAD_BLOCK): object_id = byte_stream.next_id() hprof_root = HprofRoot(heap_tag, object_id) hprof_root.thread_serial = byte_stream.next_four_bytes() self.add_root(hprof_root) elif heap_tag is HeapTag.HEAP_DUMP_INFO: heap_id = byte_stream.next_id() name_string_id = byte_stream.next_id() self.add_heap(HprofHeap(heap_id, name_string_id)) elif heap_tag is HeapTag.PRIMITIVE_ARRAY_DUMP: self.parse_primitive_array_dump(byte_stream) elif heap_tag is HeapTag.CLASS_DUMP: # print("skipping class dump") self.parse_class_dump(byte_stream) elif heap_tag is HeapTag.INSTANCE_DUMP: # print("skipping instance dump") self.parse_instance_dump(byte_stream) elif heap_tag is HeapTag.OBJECT_ARRAY_DUMP: # print("skipping obj array dump") self.parse_object_array_dump(byte_stream) else: raise Exception("Unrecognized tag: %s" % heap_tag) return HeapDumpSegmentRecord(tag, time_offset_us) def add_heap(self, hprof_heap): if hprof_heap.heap_id in self.heap_dict: existing_heap = self.heap_dict[hprof_heap.heap_id] assert hprof_heap.heap_id == existing_heap.heap_id assert hprof_heap.name_string_id == existing_heap.name_string_id else: self.heap_dict[hprof_heap.heap_id] = hprof_heap self.current_heap_id = hprof_heap.heap_id def add_root(self, hprof_root): self.roots.append(hprof_root) def resolve(self): # First resolve heaps for heap in self.heap_dict.values(): heap.resolve(self) # Then resolve classes for obj in self.object_id_dict.values(): if isinstance(obj, HprofClass): clazz = obj clazz.resolve(self) if clazz.name in self.class_name_dict: if ( self.class_name_dict[clazz.name] not in self.dupe_class_dict[clazz.name] ): self.dupe_class_dict[clazz.name].append( self.class_name_dict[clazz.name] ) self.dupe_class_dict[clazz.name].append(clazz) print("Warning: duplicate class: %s" % clazz.name) else: self.class_name_dict[clazz.name] = clazz # Fix up all classes to derive from java.lang.Class # at the time we create every HprofClass 'java.lang.Class' may have # not be parsed yet and thus unavailable clsCls = self.class_name_dict["java.lang.Class"] for cls in self.class_name_dict.values(): cls.clazz = clsCls # Then other objects for obj in self.object_id_dict.values(): if not isinstance(obj, HprofClass): obj.resolve(self) obj.is_root = False # Fixed up for root objects below # Then roots for root in self.roots: root.resolve(self) root.obj.is_root = True def resolve_object_id(self, obj_id, fmt=None, *args): if obj_id == 0: return None if obj_id in self.object_id_dict: return self.object_id_dict[obj_id] if allow_missing_ids: if fmt is not None: logging.warning(fmt, *args) return None if fmt is not None: raise RuntimeError(fmt % args) raise RuntimeError(f"No object for {obj_id:x}") def lookup_string(self, string_id): return self.string_id_dict[string_id].string def lookup_load_class_record(self, class_object_id): return self.class_object_id_to_load_class_record[class_object_id] def lookup_instances_of_class(self, class_name): return [ obj for obj in self.object_id_dict.values() if isinstance(obj, HprofInstance) and obj.clazz.name == class_name ] def load_inverted_references(self): if self.inverted_references is None: # Will be much faster for later invocations self.inverted_references = defaultdict(list) for heap_obj in self.object_id_dict.values(): for ref in heap_obj.outgoing_references(): self.inverted_references[ref.referee].append(ref) def gc(self): if not self.gc_done: self.gc_done = True live_instances = {} incoming_references = defaultdict(list) current_list = set() for root in self.roots: current_list.add(root.obj) live_instances[root.obj.object_id] = root.obj while len(current_list) > 0: new_current = set() for obj in current_list: for ref in obj.outgoing_references(): curr = ref.referee incoming_references[curr].append(ref) if curr.object_id not in live_instances: new_current.add(curr) live_instances[curr.object_id] = curr current_list = new_current self.object_id_dict = live_instances self.inverted_references = incoming_references def lookup_references(self, obj): self.load_inverted_references() return self.inverted_references[obj] def lookup_heap(self, heap_id): return self.heap_dict[heap_id] def strings(hprof_data): string_instances = hprof_data.lookup_instances_of_class("java.lang.String") return [instance.string() for instance in string_instances] def app_strings(hprof_data): return [ instance.string() for instance in app_string_instances(hprof_data) if instance.heap.name != "zygote" ] def app_string_instances(hprof_data): return hprof_data.lookup_instances_of_class("java.lang.String") def app_interned_string_instances(hprof_data): return [ root.obj for root in hprof_data.roots if root.heap_tag is HeapTag.ROOT_INTERNED_STRING and root.obj.heap.name != "zygote" ] def app_non_interned_string_instances(hprof_data): interned = set(app_interned_string_instances(hprof_data)) return [ s for s in hprof_data.lookup_instances_of_class("java.lang.String") if s not in interned and s.heap.name != "zygote" ] def app_roots(hprof_data): return [root for root in hprof_data.roots if root.obj.heap.name != "zygote"] def roots_of_obj(hprof_data, obj): roots = [] if obj.is_root: roots.append(obj) visited = set() references = hprof_data.lookup_references(obj) while len(references) > 0: new_references = [] for reference in references: if reference.referer not in visited: visited.add(reference.referer) new_references.append(reference) if reference.referer.is_root: roots.append(reference.referer) references = new_references return roots def zygote_references_to_app_objects(hprof_data): references = [] for obj in hprof_data.object_id_dict.values(): if obj.heap.name == "zygote": for reference in obj.outgoing_references(): if reference.referee.heap.name != "zygote": references.append(reference) return references def bitmap_instances(hprof_data): return hprof_data.lookup_instances_of_class("android.graphics.Bitmap") def app_bitmap_instances(hprof_data): return [x for x in bitmap_instances(hprof_data) if x.heap.name != "zygote"] def write_bitmap(bitmap_instance, filename): # Need to install PIL # sudo pip install pillow from PIL import Image bitmap_bytes = bitmap_instance.fields.mBuffer.array_data image = Image.frombytes( "RGBA", (bitmap_instance.fields.mWidth, bitmap_instance.fields.mHeight), bitmap_bytes, ) image.save(filename) def open_bitmaps(bitmap_instances): tmp_dir = tempfile.mkdtemp(suffix="bitmaps") subprocess.call(["open", tmp_dir]) # this only works in Mac - sorry! print("Writing %d bitmaps to %s." % (len(bitmap_instances), tmp_dir)) for i, bitmap in enumerate(bitmap_instances): write_bitmap(bitmap, os.path.join(tmp_dir, "bitmap_%s.png" % bitmap.object_id)) sys.stdout.write("\r%d of %d complete" % (i + 1, len(bitmap_instances))) sys.stdout.flush() print("") # terminate line def view_roots(hprof_data): return hprof_data.lookup_instances_of_class("android.view.ViewRootImpl") def print_view_tree(view_root=None): if isinstance(view_root, HprofData): all_view_roots = view_roots(view_root) if len(all_view_roots) != 1: raise Exception("Please specify view root explicitly: %s" % all_view_roots) else: view_root = all_view_roots[0] else: print("not an hprofdata: %s" % view_root.__class__) print("%s" % view_root) def print_view_node(view_node, indent): print("%s%s" % (indent, view_node)) if "android.view.ViewGroup" in view_node.class_fields: children = view_node.class_fields["android.view.ViewGroup"]["mChildren"] for child in children.array_values: if child is not None: print_view_node(child, indent + " ") print_view_node(view_root.fields.mView, " ") def reachable(instance, filter_function=lambda x: True): if isinstance(instance, (list, set)): instances = instance if len(instances) == 0: return [] else: instances = [instance] seen = set(instances) reachable = set(instances) referees = list(reachable) while len(referees) > 0: new_referees = [] for referee in referees: for reference in referee.outgoing_references(): if reference.referee not in seen: if filter_function(reference.referee): reachable.add(reference.referee) new_referees.append(reference.referee) seen.add(reference.referee) referees = new_referees return reachable def reachable_size(instance): return sum(x.shallow_size() for x in reachable(instance)) def retained(instance): if isinstance(instance, (list, set)): instances = instance if len(instances) == 0: return [] else: instances = [instance] reachable_set = reachable(instances) return retained_in_set(instances, reachable_set) def retained_size(instance): return sum(x.shallow_size() for x in retained(instance)) def retained_in_set(instances, reachable_set): if isinstance(instances, (list, set)): if instances: hprof_data = next(iter(instances)).hprof_data else: return [] else: hprof_data = instances.hprof_data # keep the initial set around so nothing from that ever gets deleted initial_set = set(instances) # the set of instances on which to compute the retained set retained_set = set(instances) # objects under investigation, usually a breadth-first walk current = set(instances) # the subset of reachable_set that has incoming references outside of reachable_set escaped = set() # walk the incoming references to the given object returning true if # there is a reference outside the reachable_set set def reference_escapes(obj): visited = set() objects = [obj] while len(objects) > 0: new_objects = [] for o in objects: if o in retained_set: continue # by definition if o in escaped: # 'in escaped' is equivalent to 'not in reachable_set' # because to be in the escaped set means that (transitively) a # reference was 'not in reachable_set' return True if o.is_root: # a gc root is not in the retained set by definition # unless it was part of the initial set but that was # already accounted for above return True for reference in hprof_data.lookup_references(o): referer = reference.referer if referer not in visited: if referer not in reachable_set or referer in escaped: return True visited.add(referer) new_objects.append(referer) objects = new_objects return False # walk the outgoing references of an object adding all elements to # the escaped set. Notice: given that the object in input is in the reachable_set # set all outgoing references must be in the reachable_set set. # Effectively this operation trims the reachable_set set and possibly # the current retained set def remove_escaped_ref(obj): current = set() if obj not in escaped and obj not in initial_set: escaped.add(obj) retained_set.discard(obj) current.add(obj) while len(current) > 0: new_current = set() for reference in obj.outgoing_references(): ref = reference.referee if ref not in escaped and ref not in initial_set: escaped.add(ref) retained_set.discard(ref) new_current.add(ref) current = new_current # walk the graph from the set of roots down, and for every object check # if it has incoming references outside of the reachable_set set. If so the object # and all its outgoing references are *not* in the retained set while len(current) > 0: new_current = set() for obj in current: if reference_escapes(obj): remove_escaped_ref(obj) continue # add it to the retained set, check the children retained_set.add(obj) for reference in obj.outgoing_references(): ref = reference.referee if ( ref not in escaped and ref not in retained_set and ref in reachable_set ): new_current.add(ref) current = new_current return retained_set def wasted_segments(char_array): strings = [ref.referer for ref in char_array.incoming_references()] for s in strings: if not isinstance(s, HprofString): # Don't know anything about non-string references return [] if len(strings) == 0: # It's garbage-collectible, so don't count it as wasted return [] def forward_comparator(x, y): if x.fields.offset != y.fields.offset: return x.fields.offset - y.fields.offset else: return x.fields.count - y.fields.count sorted_forward = sorted(strings, forward_comparator) segments = [] current_segment_start = 0 for s in sorted_forward: if s.fields.offset > current_segment_start: segments.append((current_segment_start, s.fields.offset)) current_segment_start = max( current_segment_start, s.fields.offset + s.fields.count ) # Look at remaining if char_array.num_elements > current_segment_start: segments.append((current_segment_start, char_array.num_elements)) return segments # substring can result in wasted char arrays # This isn't exact - need to figure out way of determining unused chars in the middle def wasted_string_char_arrays(hprof_data): char_arrays = filter( lambda v: isinstance(v, HprofPrimitiveArray) and v.prim_type is HprofBasic.CHAR, hprof_data.object_id_dict.values(), ) with_wasted = map(lambda x: (x, wasted_segments(x)), char_arrays) return filter(lambda x: len(x[1]) > 0, with_wasted) def wasted_string_char_count(hprof_data): wasted_char_array_info = wasted_string_char_arrays(hprof_data) def segment_length(segments): return sum(map(lambda x: x[1] - x[0], segments)) return sum(map(lambda x: segment_length(x[1]), wasted_char_array_info)) def app_heap_objects(hprof_data): return [o for o in hprof_data.object_id_dict.values() if o.heap.name != "zygote"] # return a set of containing 'clazz' and all its subclasses def subclasses_of(hprof_data, clazz): classes = {clazz} children = clazz.children while len(children) > 0: classes = classes.union(children) new_children = [] for child in children: new_children.extend(child.children) children = new_children return classes # return all instances of the given class and all subclasses. def instances_of(hprof_data, clazz): return instances_in(hprof_data, subclasses_of(hprof_data, clazz)) # return a set of all instances from a sequence of classes def instances_in(hprof_data, classes): if not isinstance(classes, set): if isinstance(classes, list): classes = set(classes) else: classes = {classes} return {obj for obj in hprof_data.object_id_dict.values() if obj.clazz in classes} # return a map of class => {instances} for the given sequence of instances def group_by_class(instances): by_class = {} for obj in instances: if obj.clazz not in by_class: by_class[obj.clazz] = set() by_class[obj.clazz].add(obj) return by_class # return a map from thread to locals for every thread def java_locals(hprof_data): locs = [ root for root in hprof_data.roots if root.heap_tag == HeapTag.ROOT_JAVA_FRAME or root.heap_tag == HeapTag.ROOT_JNI_LOCAL ] threads = { root.thread_serial: root.obj for root in hprof_data.roots if root.heap_tag == HeapTag.ROOT_THREAD_OBJECT } thread_locals = {thread: set() for thread in threads.values()} for loc in locs: thread_locals[threads[loc.thread_serial]].add(loc.obj) return thread_locals if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--hprof", help="heap dump to generate class list from", required=True ) parser.add_argument( "--allow_missing_ids", help="Unresolvable IDs result in only warnings, not errors", action="store_true", ) args = parser.parse_args() allow_missing_ids = args.allow_missing_ids hp = parse_filename(args.hprof) classes = [] for cls_name, cls in hp.class_name_dict.items(): classes.append( ( cls_name, hp.class_object_id_to_load_class_record[cls.object_id].class_serial, ) ) seen = set() class_serials = [] for tup in classes: if tup[0] not in seen: seen.add(tup[0]) if not tup[0].endswith("[]"): class_serials.append(tup) # On Dalvik these serial numbers correspond to classload order, # so it's useful to sort by them. class_serials.sort(key=lambda x: x[1]) for cls in class_serials: class_name = str(cls[0]).replace(".", "/") + ".class" print(class_name)