analytics/cubinsizes.py (102 lines of code) (raw):

#!/usr/bin/env python3
# Tool for analyzing sizes of CUDA kernels for various GPU architectures
import os
import struct
import subprocess
import sys
from tempfile import TemporaryDirectory
from typing import Dict


def _get_elffile_class():
    """Return elftools' ``ELFFile`` class, auto-installing pyelftools if absent.

    Imported lazily (only when an ELF actually needs parsing) so that merely
    importing this module never triggers a ``pip install`` side effect.
    """
    try:
        from elftools.elf.elffile import ELFFile
    except ModuleNotFoundError:
        print('elftools module not found, trying to install it from pip')
        from pip._internal import main as pip_main
        try:
            pip_main(["install", "pyelftools", "--user"])
        except SystemExit:
            print(f'PIP installation failed, please install it manually by invoking "{sys.executable} -mpip install pyelftools --user"')
            sys.exit(-1)
        from elftools.elf.elffile import ELFFile
    return ELFFile


# From https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num: float, suffix: str = 'B') -> str:
    """Render a byte count as a human-readable string, e.g. 1536 -> '1.5KiB'."""
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


def compute_cubin_sizes(file_name, section_name='.nv_fatbin', debug=False):
    """Sum the sizes of cubin/PTX blobs embedded in an ELF fatbin section.

    Walks every fatbin container in ``section_name`` of ``file_name`` and
    accumulates the embedded ELF sizes per architecture.

    Returns a dict mapping 'sm_NN' (cubin) / 'ptx_NN' (PTX) to total bytes;
    an empty dict when the section is missing.
    Raises RuntimeError on an unexpected fatbin/cubin header.
    """
    ELFFile = _get_elffile_class()
    with open(file_name, 'rb') as f:
        elf_file = ELFFile(f)
        nv_fatbin = elf_file.get_section_by_name(section_name)
        if nv_fatbin is None:
            return {}
        data = nv_fatbin.data()
        idx, offs = 0, 0
        elf_sizes: Dict[str, int] = {}
        while offs < len(data):
            # 16-byte fatbin container header (native-sized fields: I+H+H+L)
            (magic, version, header_size, fatbin_size) = struct.unpack('IHHL', data[offs: offs + 16])
            if magic != 0xba55ed50 or version != 1:
                raise RuntimeError(f"Unexpected fatbin magic {hex(magic)} or version {version}")
            if debug:
                print(f"Found fatbin at {offs} header_size={header_size} fatbin_size={fatbin_size}")
            offs += header_size
            fatbin_end = offs + fatbin_size
            while offs < fatbin_end:
                # 30-byte per-cubin header; kind 1 == PTX, kind 2 == cubin (ELF)
                (kind, version, hdr_size, elf_size, empty, code_ver, sm_ver) = struct.unpack('HHILLIH', data[offs: offs + 30])
                if version != 0x0101 or kind not in [1, 2]:
                    raise RuntimeError(f"Unexpected cubin version {hex(version)} or kind {kind}")
                sm_ver = f'{"ptx" if kind == 1 else "sm"}_{sm_ver}'
                if debug:
                    print(f"  {idx}: elf_size={elf_size} code_ver={hex(code_ver)} sm={sm_ver}")
                elf_sizes[sm_ver] = elf_sizes.get(sm_ver, 0) + elf_size
                idx, offs = idx + 1, offs + hdr_size + elf_size
            offs = fatbin_end
        return elf_sizes


class ArFileCtx:
    """Context manager that extracts a static archive into a temp directory.

    On enter, switches cwd into a fresh temporary directory and runs
    ``ar x`` there so the extracted object files don't pollute the caller's
    working directory; on exit, restores the previous cwd and cleans up.
    """

    def __init__(self, ar_name: str) -> None:
        self.ar_name = os.path.abspath(ar_name)
        self._tmpdir = TemporaryDirectory()

    def __enter__(self) -> str:
        self._pwd = os.getcwd()
        rc = self._tmpdir.__enter__()
        # BUGFIX: chdir into the temp dir before extracting; previously
        # `ar x` dumped object files into the caller's cwd and the temp
        # directory went unused (making __exit__'s chdir restore moot).
        os.chdir(rc)
        subprocess.check_call(['ar', 'x', self.ar_name])
        return rc

    def __exit__(self, ex, value, tb) -> None:
        os.chdir(self._pwd)
        return self._tmpdir.__exit__(ex, value, tb)


def dict_add(rc: Dict[str, int], b: Dict[str, int]) -> Dict[str, int]:
    """Accumulate the values of ``b`` into ``rc`` in place; return ``rc``."""
    for key, val in b.items():
        rc[key] = rc.get(key, 0) + val
    return rc


def main():
    """Locate the target ELF/archive and print per-architecture cubin sizes."""
    if sys.platform != 'linux':
        print('This script only works with Linux ELF files')
        return
    if len(sys.argv) < 2:
        print(f"{sys.argv[0]} invoked without any arguments trying to infer location of libtorch_cuda")
        import torch
        fname = os.path.join(os.path.dirname(torch.__file__), 'lib', 'libtorch_cuda.so')
    else:
        fname = sys.argv[1]

    if not os.path.exists(fname):
        print(f"Can't find {fname}")
        sys.exit(-1)

    section_names = ['.nv_fatbin', '__nv_relfatbin']
    results = {name: {} for name in section_names}
    print(f"Analyzing {fname}")
    if os.path.splitext(fname)[1] == '.a':
        # Static archive: extract members into a temp dir and analyze each .o.
        # (Loop variable renamed so it no longer shadows `fname`.)
        with ArFileCtx(fname):
            for obj_name in os.listdir("."):
                if not obj_name.endswith(".o"):
                    continue
                for section_name in section_names:
                    dict_add(results[section_name], compute_cubin_sizes(obj_name, section_name))
    else:
        # Consistency fix: reuse `section_names` instead of re-hardcoding
        # the same list a second time.
        for section_name in section_names:
            dict_add(results[section_name], compute_cubin_sizes(fname, section_name))

    for section_name in section_names:
        elf_sizes = results[section_name]
        print(f"{section_name} size {sizeof_fmt(sum(elf_sizes.values()))}")
        for sm_ver, total_size in elf_sizes.items():
            print(f"  {sm_ver}: {sizeof_fmt(total_size)}")


if __name__ == '__main__':
    main()