perfkitbenchmarker/linux_packages/cuda_toolkit.py (357 lines of code) (raw):

# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module containing CUDA toolkit installation and cleanup functions.

This module installs CUDA toolkit from NVIDIA, configures gpu clock speeds
and autoboost settings, and exposes a method to collect gpu metadata. Currently
Tesla K80 and P100 gpus are supported, provided that there is only a single
type of gpu per system.

https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver
"""

import posixpath
import re

from absl import flags
from perfkitbenchmarker import virtual_machine
from perfkitbenchmarker.linux_packages import nvidia_driver

# There is no way to tell the apt-get installation
# method what dir to install the cuda toolkit to
CUDA_HOME = '/usr/local/cuda'

flags.DEFINE_enum(
    'cuda_toolkit_version',
    '11.6',
    [
        '9.0',
        '10.0',
        '10.1',
        '10.2',
        '11.0',
        '11.1',
        '11.2',
        '11.3',
        '11.4',
        '11.5',
        '11.6',
        '11.7',
        '11.8',
        '12.0',
        '12.1',
        '12.2',
        '12.6',
        'None',
        '',
    ],
    (
        'Version of CUDA Toolkit to install. '
        'Input "None" or empty string to skip installation'
    ),
    module_name=__name__,
)
_KEY = flags.DEFINE_string(
    'cuda_toolkit_key',
    '7fa2af80',
    'The new GPG keys for the CUDA repository. This is Debian-based distros.',
)

FLAGS = flags.FLAGS

# URL templates. `{os}` is filled with _CudaOs(vm.OS_TYPE); `{cpu_arch}` is
# filled with GetCpuArchPath(vm) for CUDA_PIN / GPG_KEY and with
# _GetCpuArch(vm) for the toolkit and patch packages.
CUDA_PIN = 'https://developer.download.nvidia.com/compute/cuda/repos/{os}/{cpu_arch}/cuda-{os}.pin'

CUDA_12_6_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda-repo-{os}-12-6-local_12.6.0-560.28.03-1_{cpu_arch}.deb'
CUDA_12_2_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda-repo-{os}-12-2-local_12.2.2-535.104.05-1_{cpu_arch}.deb'
CUDA_12_1_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda-repo-{os}-12-1-local_12.1.1-530.30.02-1_{cpu_arch}.deb'
CUDA_12_0_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda-repo-{os}-12-0-local_12.0.1-525.85.12-1_{cpu_arch}.deb'
CUDA_11_8_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-{os}-11-8-local_11.8.0-520.61.05-1_{cpu_arch}.deb'
CUDA_11_7_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda-repo-{os}-11-7-local_11.7.1-515.65.01-1_{cpu_arch}.deb'
CUDA_11_6_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-{os}-11-6-local_11.6.2-510.47.03-1_{cpu_arch}.deb'
CUDA_11_5_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-{os}-11-5-local_11.5.2-495.29.05-1_{cpu_arch}.deb'
CUDA_11_4_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda-repo-{os}-11-4-local_11.4.4-470.82.01-1_{cpu_arch}.deb'
CUDA_11_3_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda-repo-{os}-11-3-local_11.3.1-465.19.01-1_{cpu_arch}.deb'
CUDA_11_2_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda-repo-{os}-11-2-local_11.2.2-460.32.03-1_{cpu_arch}.deb'
CUDA_11_1_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda-repo-{os}-11-1-local_11.1.1-455.32.00-1_{cpu_arch}.deb'
CUDA_11_0_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda-repo-{os}-11-0-local_11.0.3-450.51.06-1_{cpu_arch}.deb'
CUDA_10_2_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda-repo-{os}-10-2-local-10.2.89-440.33.01_1.0-1_{cpu_arch}.deb'
CUDA_10_1_TOOLKIT = 'https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-{os}-10-1-local-10.1.243-418.87.00_1.0-1_{cpu_arch}.deb'
CUDA_10_0_TOOLKIT = 'https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda-repo-{os}-10-0-local-10.0.130-410.48_1.0-1_{cpu_arch}'
CUDA_9_0_TOOLKIT = 'https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-{os}-9-0-local_9.0.176-1_{cpu_arch}-deb'
CUDA_9_0_PATCH = 'https://developer.nvidia.com/compute/cuda/9.0/Prod/patches/1/cuda-repo-{os}-9-0-local-cublas-performance-update_1.0-1_{cpu_arch}-deb'

# The GPG key for the CUDA repository; used on Debian-based distros for
# toolkit versions that are not listed in _LOCAL_KEYRING_VERSIONS.
# NOTE(review): this key is fetched over plain http while every other NVIDIA
# URL in this module uses https -- consider switching once the https path is
# confirmed to serve the same key.
GPG_KEY = 'http://developer.download.nvidia.com/compute/cuda/repos/{os}/{cpu_arch}/{key}.pub'

# Toolkit versions for which EnrollSigningKey copies the keyring out of the
# local repo directory under /var instead of fetching a key with apt-key.
_LOCAL_KEYRING_VERSIONS = frozenset(
    ('11.7', '11.8', '12.0', '12.1', '12.2', '12.6')
)

# Where the CUDA apt pin file is installed.
_CUDA_PIN_DEST = '/etc/apt/preferences.d/cuda-repository-pin-600'


def _CudaOs(os_type: str) -> str:
  """Returns os_type with everything from the first underscore on removed."""
  return re.sub('_.*$', '', os_type)


def GetCpuArchPath(vm):
  """Returns the CPU architecture name used in NVIDIA repository URL paths.

  Args:
    vm: virtual machine whose `cpu_arch` attribute is inspected.

  Returns:
    virtual_machine.CPUARCH_X86_64 for x86_64 VMs, 'sbsa' for aarch64 VMs.

  Raises:
    NotImplementedError: if the VM has any other CPU architecture.
  """
  if vm.cpu_arch == virtual_machine.CPUARCH_X86_64:
    return virtual_machine.CPUARCH_X86_64
  if vm.cpu_arch == virtual_machine.CPUARCH_AARCH64:
    return 'sbsa'
  raise NotImplementedError(f'Unsupported CPU architecture: {vm.cpu_arch}')


def _GetCpuArch(vm):
  """Returns the CPU architecture name used in .deb package file names.

  Args:
    vm: virtual machine whose `cpu_arch` attribute is inspected.

  Returns:
    'amd64' for x86_64 VMs, 'arm64' for aarch64 VMs.

  Raises:
    NotImplementedError: if the VM has any other CPU architecture.
  """
  if vm.cpu_arch == virtual_machine.CPUARCH_X86_64:
    return 'amd64'
  if vm.cpu_arch == virtual_machine.CPUARCH_AARCH64:
    return 'arm64'
  raise NotImplementedError(f'Unsupported CPU architecture: {vm.cpu_arch}')


class UnsupportedCudaVersionError(Exception):
  """Raised by AptInstall when no installer exists for the requested version."""


class NvccParseOutputError(Exception):
  """Raised when the output of `nvcc --version` cannot be parsed."""


def GetMetadata(vm):
  """Returns gpu-specific metadata as a dict.

  Args:
    vm: virtual machine to operate on

  Returns:
    A dict of gpu- and CUDA- specific metadata: the nvidia_driver metadata
    extended with the requested toolkit version, CUDA_HOME and the vm name.
  """
  metadata = nvidia_driver.GetMetadata(vm)
  metadata['cuda_toolkit_version'] = FLAGS.cuda_toolkit_version
  metadata['cuda_toolkit_home'] = CUDA_HOME
  metadata['vm_name'] = vm.name
  return metadata


def DoPostInstallActions(vm):
  """Perform post NVIDIA driver install action on the vm.

  Delegates to nvidia_driver.DoPostInstallActions.

  Args:
    vm: the virtual machine to operate on
  """
  nvidia_driver.DoPostInstallActions(vm)


def GetCudaToolkitVersion(vm):
  """Get the CUDA toolkit version on the vm, based on nvcc.

  Args:
    vm: the virtual machine to query

  Returns:
    A string containing the active CUDA toolkit version, or None if running
    nvcc produced no output (e.g. nvcc could not be found).

  Raises:
    NvccParseOutputError: if nvcc printed output that has no
      'release <version>,' token.
  """
  # Failures are ignored on purpose: a missing nvcc simply yields empty stdout.
  stdout, _ = vm.RemoteCommand(
      posixpath.join(CUDA_HOME, 'bin/nvcc') + ' --version', ignore_failure=True
  )
  if not stdout.rstrip():
    return None
  match = re.search(r'release (\S+),', stdout)
  if match is None:
    raise NvccParseOutputError(
        'Unable to parse nvcc version output from {}'.format(stdout)
    )
  return match.group(1)


def EnrollSigningKey(vm):
  """Fetch GPG Keys for CUDA Toolkit.

  For versions in _LOCAL_KEYRING_VERSIONS the keyring is copied from the local
  repo directory under /var into /usr/share/keyrings/; for all other versions
  the key named by --cuda_toolkit_key is fetched with apt-key.

  Args:
    vm: the virtual machine to install CUDA on.
  """
  cuda_os = _CudaOs(vm.OS_TYPE)
  if FLAGS.cuda_toolkit_version in _LOCAL_KEYRING_VERSIONS:
    version_dash = FLAGS.cuda_toolkit_version.replace('.', '-')
    vm.RemoteCommand(
        'sudo cp'
        f' /var/cuda-repo-{cuda_os}-{version_dash}-local/cuda-*-keyring.gpg'
        ' /usr/share/keyrings/'
    )
    return
  key_url = GPG_KEY.format(
      os=cuda_os, cpu_arch=GetCpuArchPath(vm), key=_KEY.value
  )
  vm.RemoteCommand(f'sudo apt-key adv --fetch-keys {key_url}')


def _ToolkitUrl(vm, url_fmt):
  """Fills a toolkit/patch URL template with the vm's OS and package arch."""
  return url_fmt.format(os=_CudaOs(vm.OS_TYPE), cpu_arch=_GetCpuArch(vm))


def _DownloadAndDpkgInstall(vm, url, basename):
  """Downloads url to the file `basename` on the vm and installs it with dpkg."""
  # An explicit -O target keeps reruns from leaving "<name>.1" copies behind
  # and then installing the stale original.
  vm.RemoteCommand(f'wget -q {url} -O {basename}')
  vm.RemoteCommand(f'sudo dpkg -i {basename}')


def _InstallCudaPin(vm):
  """Downloads the CUDA apt pin file and moves it into apt's preferences.d."""
  cuda_os = _CudaOs(vm.OS_TYPE)
  pin_url = CUDA_PIN.format(os=cuda_os, cpu_arch=GetCpuArchPath(vm))
  tmp_pin = f'/tmp/cuda-{cuda_os}.pin'
  # Download to a fixed path: without -O, a leftover pin file from an earlier
  # run makes wget save the new one as "<name>.1" and the stale file is moved.
  vm.RemoteCommand(f'wget -q {pin_url} -O {tmp_pin}')
  vm.RemoteCommand(f'sudo mv {tmp_pin} {_CUDA_PIN_DEST}')


def _InstallToolkitPackages(vm, version_dash):
  """Installs the toolkit/tools/libraries apt packages for version_dash."""
  vm.InstallPackages(
      f'cuda-toolkit-{version_dash} '
      f'cuda-tools-{version_dash} '
      f'cuda-libraries-{version_dash} '
      f'cuda-libraries-dev-{version_dash}'
  )


def _InstallCudaPatch(vm, patch_url):
  """Installs CUDA Toolkit patch from NVIDIA.

  Args:
    vm: VM to install patch on
    patch_url: url of the CUDA patch to install
  """
  # Need to append .deb to package name because the file downloaded from
  # NVIDIA is missing the .deb extension.
  _DownloadAndDpkgInstall(
      vm, patch_url, posixpath.basename(patch_url) + '.deb'
  )
  vm.AptUpdate()
  # Need to be extra careful on the command below because without these
  # precautions, it was bringing up a menu option about grub's menu.lst
  # on AWS Ubuntu16.04 and thus causing the RemoteCommand to hang and fail.
  vm.RemoteCommand(
      'sudo DEBIAN_FRONTEND=noninteractive apt-get upgrade -yq cuda'
  )


def _InstallCuda9Point0(vm):
  """Installs CUDA Toolkit 9.0 from NVIDIA, followed by the cublas patch.

  Args:
    vm: VM to install CUDA on
  """
  url = _ToolkitUrl(vm, CUDA_9_0_TOOLKIT)
  # The 9.0 URL has no .deb extension, so one is appended to the local name.
  _DownloadAndDpkgInstall(vm, url, posixpath.basename(url) + '.deb')
  EnrollSigningKey(vm)
  vm.AptUpdate()
  _InstallToolkitPackages(vm, '9-0')
  _InstallCudaPatch(vm, _ToolkitUrl(vm, CUDA_9_0_PATCH))


def _InstallCuda10Point0(vm):
  """Installs CUDA Toolkit 10.0 from NVIDIA.

  Args:
    vm: VM to install CUDA on
  """
  url = _ToolkitUrl(vm, CUDA_10_0_TOOLKIT)
  # The 10.0 URL has no .deb extension, so one is appended to the local name.
  _DownloadAndDpkgInstall(vm, url, posixpath.basename(url) + '.deb')
  EnrollSigningKey(vm)
  vm.AptUpdate()
  _InstallToolkitPackages(vm, '10-0')


def _InstallCuda10WithPin(vm, toolkit_fmt, version_dash):
  """Installs a 10.x toolkit that needs the apt pin file (10.1 and 10.2)."""
  url = _ToolkitUrl(vm, toolkit_fmt)
  _InstallCudaPin(vm)
  _DownloadAndDpkgInstall(vm, url, posixpath.basename(url))
  EnrollSigningKey(vm)
  vm.AptUpdate()
  _InstallToolkitPackages(vm, version_dash)


def _InstallCuda10Point1(vm):
  """Installs CUDA Toolkit 10.1 from NVIDIA.

  Args:
    vm: VM to install CUDA on
  """
  _InstallCuda10WithPin(vm, CUDA_10_1_TOOLKIT, '10-1')


def _InstallCuda10Point2(vm):
  """Installs CUDA Toolkit 10.2 from NVIDIA.

  Args:
    vm: VM to install CUDA on
  """
  _InstallCuda10WithPin(vm, CUDA_10_2_TOOLKIT, '10-2')


def _DownloadCuda(vm, toolkit_fmt):
  """Downloads the local-repo .deb for toolkit_fmt and installs it.

  Args:
    vm: VM to install CUDA on
    toolkit_fmt: format string to use for the toolkit name
  """
  toolkit = _ToolkitUrl(vm, toolkit_fmt)
  basename = posixpath.basename(toolkit)
  vm.RemoteCommand(f'wget -nv --tries=10 {toolkit} -O {basename}')
  vm.InstallPackages(f'./{basename}')


def _InstallCudaGeneric(vm, toolkit_fmt, version_dash):
  """Shared install sequence for the 11.x and 12.x toolkits."""
  _InstallCudaPin(vm)
  _DownloadCuda(vm, toolkit_fmt)
  EnrollSigningKey(vm)
  vm.AptUpdate()
  _InstallToolkitPackages(vm, version_dash)


def _InstallCuda12Generic(vm, toolkit_fmt, version_dash):
  """Installs CUDA Toolkit 12.x from NVIDIA.

  Args:
    vm: VM to install CUDA on
    toolkit_fmt: format string to use for the toolkit name
    version_dash: Version (ie 12-1) to install
  """
  _InstallCudaGeneric(vm, toolkit_fmt, version_dash)


def _InstallCuda11Generic(vm, toolkit_fmt, version_dash):
  """Installs CUDA Toolkit 11.x from NVIDIA.

  Args:
    vm: VM to install CUDA on
    toolkit_fmt: format string to use for the toolkit name
    version_dash: Version (ie 11-1) to install
  """
  _InstallCudaGeneric(vm, toolkit_fmt, version_dash)


def _InstallCuda12Point0(vm):
  _InstallCuda12Generic(vm, CUDA_12_0_TOOLKIT, '12-0')


def _InstallCuda12Point1(vm):
  _InstallCuda12Generic(vm, CUDA_12_1_TOOLKIT, '12-1')


def _InstallCuda12Point2(vm):
  _InstallCuda12Generic(vm, CUDA_12_2_TOOLKIT, '12-2')


def _InstallCuda12Point6(vm):
  _InstallCuda12Generic(vm, CUDA_12_6_TOOLKIT, '12-6')


def _InstallCuda11Point0(vm):
  _InstallCuda11Generic(vm, CUDA_11_0_TOOLKIT, '11-0')


def _InstallCuda11Point1(vm):
  _InstallCuda11Generic(vm, CUDA_11_1_TOOLKIT, '11-1')


def _InstallCuda11Point2(vm):
  _InstallCuda11Generic(vm, CUDA_11_2_TOOLKIT, '11-2')


def _InstallCuda11Point3(vm):
  _InstallCuda11Generic(vm, CUDA_11_3_TOOLKIT, '11-3')


def _InstallCuda11Point4(vm):
  _InstallCuda11Generic(vm, CUDA_11_4_TOOLKIT, '11-4')


def _InstallCuda11Point5(vm):
  _InstallCuda11Generic(vm, CUDA_11_5_TOOLKIT, '11-5')


def _InstallCuda11Point6(vm):
  _InstallCuda11Generic(vm, CUDA_11_6_TOOLKIT, '11-6')


def _InstallCuda11Point7(vm):
  _InstallCuda11Generic(vm, CUDA_11_7_TOOLKIT, '11-7')


def _InstallCuda11Point8(vm):
  _InstallCuda11Generic(vm, CUDA_11_8_TOOLKIT, '11-8')


# Maps each installable --cuda_toolkit_version value to its installer.
_INSTALLERS = {
    '9.0': _InstallCuda9Point0,
    '10.0': _InstallCuda10Point0,
    '10.1': _InstallCuda10Point1,
    '10.2': _InstallCuda10Point2,
    '11.0': _InstallCuda11Point0,
    '11.1': _InstallCuda11Point1,
    '11.2': _InstallCuda11Point2,
    '11.3': _InstallCuda11Point3,
    '11.4': _InstallCuda11Point4,
    '11.5': _InstallCuda11Point5,
    '11.6': _InstallCuda11Point6,
    '11.7': _InstallCuda11Point7,
    '11.8': _InstallCuda11Point8,
    '12.0': _InstallCuda12Point0,
    '12.1': _InstallCuda12Point1,
    '12.2': _InstallCuda12Point2,
    '12.6': _InstallCuda12Point6,
}


def AptInstall(vm):
  """Installs CUDA toolkit on the VM if not already installed.

  Does nothing when --cuda_toolkit_version is 'None' or empty, or when nvcc
  already reports the requested version. If a versioned install directory
  for the requested version already exists next to CUDA_HOME, CUDA_HOME is
  re-pointed at it with a symlink instead of reinstalling.

  Args:
    vm: VM to install CUDA on

  Raises:
    UnsupportedCudaVersionError: if there is no installer for the requested
      version.
  """
  version_to_install = FLAGS.cuda_toolkit_version
  if version_to_install == 'None' or not version_to_install:
    return

  if GetCudaToolkitVersion(vm) == version_to_install:
    return

  cuda_path = f'{CUDA_HOME}-{version_to_install}'
  if vm.TryRemoteCommand(f'stat {cuda_path}'):
    vm.RemoteCommand(f'sudo rm -rf {CUDA_HOME}', ignore_failure=True)
    vm.RemoteCommand(f'sudo ln -s {cuda_path} {CUDA_HOME}')
    return

  # Fail before installing any prerequisites if the version is unknown.
  installer = _INSTALLERS.get(version_to_install)
  if installer is None:
    raise UnsupportedCudaVersionError(
        f'CUDA toolkit version {version_to_install!r} is not supported.'
    )

  vm.Install('build_tools')
  vm.Install('wget')
  vm.Install('nvidia_driver')
  installer(vm)
  DoPostInstallActions(vm)

  # NVIDIA CUDA Profile Tools Interface.
  # This library provides advanced profiling support
  if version_to_install in ('9.0', '10.0'):
    # cupti is part of cuda>=10.1, and installed as cuda-cupti-10-1/2
    vm.RemoteCommand('sudo apt-get install -y libcupti-dev')


def YumInstall(vm):
  """Installs CUDA toolkit on the VM if not already installed.

  TODO: PKB currently only supports the installation of CUDA toolkit on Ubuntu.

  Args:
    vm: VM to install CUDA on

  Raises:
    NotImplementedError: always.
  """
  del vm  # unused
  raise NotImplementedError()


def CheckPrerequisites():
  """Verifies that the required resources are present.

  Currently a no-op: nothing is checked and nothing is raised.
  """
  pass


def Uninstall(vm):
  """Removes the CUDA toolkit.

  Deletes the downloaded cuda-repo-* files from the working directory and
  removes CUDA_HOME.

  Args:
    vm: VM that installed CUDA

  Note that reinstallation does not work correctly, i.e. you cannot reinstall
  CUDA by calling _Install() again.
  """
  vm.RemoteCommand(f'rm -f cuda-repo-{_CudaOs(vm.OS_TYPE)}*')
  vm.RemoteCommand('sudo rm -rf {cuda_home}'.format(cuda_home=CUDA_HOME))