cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb (312 lines of code) (raw):

require 'spec_helper'

# Spec helper that converges a ChefSpec runner with a single `nvidia_driver`
# resource named 'setup' running the :setup action.
class ConvergeNvidiaDriver
  # @param chef_run the ChefSpec runner to converge
  # @param nvidia_driver_version value passed to the resource's
  #   `nvidia_driver_version` property (nil by default)
  # @return whatever `converge_dsl` returns for the given runner
  def self.setup(chef_run, nvidia_driver_version: nil)
    chef_run.converge_dsl('aws-parallelcluster-platform') do
      nvidia_driver 'setup' do
        nvidia_driver_version nvidia_driver_version
        action :setup
      end
    end
  end
end

# The resource property, when given, must win over the node attribute.
describe 'nvidia_driver:_nvidia_driver_version' do
  cached(:nvidia_driver_attribute) { 'nvidia_driver_attribute' }
  cached(:nvidia_driver_property) { 'nvidia_driver_property' }
  cached(:chef_run) do
    ChefSpec::SoloRunner.new(step_into: ['nvidia_driver']) do |node|
      node.override['cluster']['nvidia']['driver_version'] = nvidia_driver_attribute
    end
  end

  context 'when nvidia driver property is set' do
    cached(:resource) do
      ConvergeNvidiaDriver.setup(chef_run, nvidia_driver_version: nvidia_driver_property)
      chef_run.find_resource('nvidia_driver', 'setup')
    end

    it 'takes the value from nvidia driver property' do
      expect(resource._nvidia_driver_version).to eq(nvidia_driver_property)
    end
  end

  context 'when nvidia driver property is not set' do
    cached(:resource) do
      ConvergeNvidiaDriver.setup(chef_run)
      chef_run.find_resource('nvidia_driver', 'setup')
    end

    it 'takes the value from nvidia driver attribute' do
      expect(resource._nvidia_driver_version).to eq(nvidia_driver_attribute)
    end
  end
end

# Exercises nvidia_driver_enabled? for every OS with nvidia_enabled? and
# arm_instance? stubbed to each relevant combination.
describe 'nvidia_driver:nvidia_driver_enabled?' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        runner(platform: platform, version: version, step_into: ['nvidia_driver'])
      end
      cached(:resource) do
        ConvergeNvidiaDriver.setup(chef_run)
        chef_run.find_resource('nvidia_driver', 'setup')
      end

      context "when nvidia not enabled" do
        before do
          allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        end

        it 'is false' do
          expect(resource.nvidia_driver_enabled?).to eq(false)
        end
      end

      context "when nvidia enabled and arm instance" do
        before do
          allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(true)
          allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
        end

        # centos is the only platform expected to report the driver as disabled on ARM.
        if platform == 'centos'
          it 'is false' do
            expect(resource.nvidia_driver_enabled?).to eq(false)
          end
        else
          it 'is true' do
            expect(resource.nvidia_driver_enabled?).to eq(true)
          end
        end
      end

      context "when nvidia enabled and not arm instance" do
        before do
          allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(true)
          allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
        end

        it 'is true' do
          expect(resource.nvidia_driver_enabled?).to eq(true)
        end
      end
    end
  end
end

# Pairs of [kernel_open attribute value, expected kernel module name].
describe 'nvidia_driver:nvidia_kernel_module' do
  [
    %w(false kernel),
    [false, 'kernel'],
    %w(no kernel),
    %w(true kernel-open),
    [true, 'kernel-open'],
    %w(yes kernel-open),
  ].each do |kernel_open, kernel_module|
    context "node['cluster']['nvidia']['kernel_open'] is #{kernel_open}" do
      cached(:chef_run) do
        ChefSpec::SoloRunner.new(step_into: ['nvidia_driver']) do |node|
          node.override['cluster']['nvidia']['kernel_open'] = kernel_open
        end
      end
      cached(:resource) do
        ConvergeNvidiaDriver.setup(chef_run)
        chef_run.find_resource('nvidia_driver', 'setup')
      end

      it "is #{kernel_module}" do
        # NOTE(review): this stubs nvidia_kernel_module on every Object and then asserts
        # on the same method, so the expectation looks like it can only observe the stub
        # rather than the kernel_open attribute set above - confirm this is intended.
        allow_any_instance_of(Object).to receive(:nvidia_kernel_module).and_return(kernel_module)
        expect(resource.nvidia_kernel_module).to eq(kernel_module)
      end
    end
  end
end

# nvidia_arch is expected to follow the stubbed arm_instance? value.
describe 'nvidia_driver:nvidia_arch' do
  cached(:chef_run) do
    ChefSpec::SoloRunner.new(step_into: ['nvidia_driver'])
  end
  cached(:resource) do
    ConvergeNvidiaDriver.setup(chef_run)
    chef_run.find_resource('nvidia_driver', 'setup')
  end

  context 'when on arm' do
    it 'is aarch64' do
      allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
      expect(resource.nvidia_arch).to eq('aarch64')
    end
  end

  context 'when not on arm' do
    it 'is x86_64' do
      allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
      expect(resource.nvidia_arch).to eq('x86_64')
    end
  end
end

# End-to-end expectations for the :setup action on every supported OS.
describe 'nvidia_driver:setup' do
  for_all_oses do |platform, version|
    cached(:nvidia_arch) { 'nvidia_arch' }
    cached(:nvidia_kernel_module) { 'nvidia_kernel_module' }
    cached(:nvidia_driver_version) { 'nvidia_driver_version' }
    cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }

    context "on #{platform}#{version} when nvidia_driver not enabled" do
      cached(:chef_run) do
        stubs_for_resource('nvidia_driver') do |res|
          allow(res).to receive(:nvidia_driver_enabled?).and_return(false)
        end
        # Named chef_runner so the local does not shadow the `runner` helper method.
        chef_runner = runner(platform: platform, version: version, step_into: ['nvidia_driver'])
        ConvergeNvidiaDriver.setup(chef_runner)
      end

      it 'does not install NVidia driver' do
        is_expected.not_to run_bash('nvidia.run advanced')
      end
    end

    [%w(false kernel), %w(true kernel-open)].each do |kernel_open, kernel_module|
      context "on #{platform}#{version} when nvidia_driver enabled and node['cluster']['nvidia']['kernel_open'] is #{kernel_open}" do
        if platform == 'centos'
          cached(:nvidia_driver_version) { '535.129.03' }
        else
          cached(:nvidia_driver_version) { 'nvidia_driver_version' }
        end
        cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
        cached(:kernel_compiler_version) { "KERNEL_COMPILER_VERSION" }
        cached(:chef_run) do
          stubs_for_resource('nvidia_driver') do |res|
            allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
            allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
            allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module)
            allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version)
          end

          stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true)
          allow(::File).to receive(:exist?).with('/usr/bin/nvidia-smi').and_return(false)
          # Named chef_runner so the local does not shadow the `runner` helper method.
          chef_runner = runner(platform: platform, version: version, step_into: ['nvidia_driver']) do |node|
            node.automatic['kernel']['release'] = '5.anything'
          end
          ConvergeNvidiaDriver.setup(chef_runner, nvidia_driver_version: nvidia_driver_version)
        end
        cached(:node) { chef_run.node }

        it 'dumps node attributes' do
          is_expected.to write_node_attributes('Save Nvidia driver version for Inspec tests')
        end

        it 'sets up nvidia_driver' do
          is_expected.to setup_nvidia_driver('setup')
        end

        it 'downloads nvidia driver' do
          is_expected.to create_remote_file('/tmp/nvidia.run').with(
            source: nvidia_driver_url,
            mode: '0755',
            retries: 3,
            retry_delay: 5
          )
        end

        it 'uninstalls kernel module nouveau' do
          is_expected.to uninstall_kernel_module('nouveau')
        end

        it 'creates file blacklist-nouveau.conf' do
          is_expected.to create_cookbook_file('blacklist-nouveau.conf').with(
            source: 'nvidia/blacklist-nouveau.conf',
            path: '/etc/modprobe.d/blacklist-nouveau.conf',
            owner: 'root',
            group: 'root',
            mode: '0644'
          )
        end

        if platform == 'amazon'
          compiler_version = version == '2023' ? 'gcc' : 'gcc10'
          compiler_path = version == '2023' ? 'CC=/usr/bin/gcc' : 'CC=/usr/bin/gcc10-gcc'

          if version == '2'
            it "installs #{compiler_version}" do
              is_expected.to install_package(compiler_version).with_retries(10).with_retry_delay(5)
            end

            it 'creates dkms/nvidia.conf' do
              is_expected.to create_template('/etc/dkms/nvidia.conf').with(
                source: 'nvidia/amazon/dkms/nvidia.conf.erb',
                cookbook: 'aws-parallelcluster-platform',
                owner: 'root',
                group: 'root',
                mode: '0644',
                variables: { compiler_path: compiler_path }
              )
            end
          else
            # Amazon Linux 2023 is expected to install the compiler and create nvidia conf when kernel version is 6.
            # Here we are testing with kernel version 5
            it "does not install #{compiler_version}" do
              is_expected.not_to install_package(compiler_version).with_retries(10).with_retry_delay(5)
            end

            it 'does not create dkms/nvidia.conf' do
              is_expected.not_to create_template('/etc/dkms/nvidia.conf')
            end
          end

          # NOTE(review): the first with_code regex hard-codes the gcc10 compiler path for
          # both Amazon Linux versions, and with_code is chained twice. If ChefSpec keeps a
          # single expectation per attribute, only the last regex is enforced - confirm.
          it 'installs nvidia driver' do
            is_expected.to run_bash('nvidia.run advanced')
              .with(
                user: 'root',
                group: 'root',
                cwd: '/tmp',
                creates: '/usr/bin/nvidia-smi'
              )
              .with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau -m=#{kernel_module}})
              .with_code(%r{rm -f /tmp/nvidia.run})
          end
        elsif platform == 'ubuntu' && version == '22.04'
          it 'installs gcc' do
            is_expected.to install_package('gcc').with_retries(10).with_retry_delay(5)
          end

          it 'creates dkms/nvidia.conf' do
            compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
            is_expected.to create_template('/etc/dkms/nvidia.conf').with(
              source: 'nvidia/amazon/dkms/nvidia.conf.erb',
              cookbook: 'aws-parallelcluster-platform',
              owner: 'root',
              group: 'root',
              mode: '0644',
              variables: { compiler_path: compiler_path }
            )
          end

          it 'installs nvidia driver' do
            compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
            is_expected.to run_bash('nvidia.run advanced')
              .with(
                user: 'root',
                group: 'root',
                cwd: '/tmp',
                creates: '/usr/bin/nvidia-smi'
              )
              .with_code(%r{#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau -m=#{kernel_module}})
          end
        else
          it "doesn't install gcc10" do
            is_expected.not_to install_package('gcc10')
          end

          # NOTE(review): with_code is chained twice here as well; if ChefSpec keeps a
          # single expectation per attribute, only the last regex is enforced - confirm.
          it 'installs nvidia driver' do
            is_expected.to run_bash('nvidia.run advanced')
              .with(
                user: 'root',
                group: 'root',
                cwd: '/tmp',
                creates: '/usr/bin/nvidia-smi'
              )
              .with_code(%r{./nvidia.run --silent --dkms --disable-nouveau -m=#{kernel_module}})
              .with_code(%r{rm -f /tmp/nvidia.run})
          end
        end

        if platform == 'ubuntu'
          it 'executes initramfs to remove nouveau' do
            is_expected.to run_execute('initramfs to remove nouveau').with_command('update-initramfs -u')
          end
        else
          it 'does not execute initramfs to remove nouveau' do
            is_expected.not_to run_execute('initramfs to remove nouveau').with_command('update-initramfs -u')
          end
        end
      end
    end

    context "on #{platform}#{version}" do
      # The runner is only built here; each nested context converges it in its own
      # `before` hook after installing the stubs it needs.
      cached(:chef_run) do
        stubs_for_resource('nvidia_driver') do |res|
          allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
          allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
          allow(res).to receive(:nvidia_kernel_module).and_return(nvidia_kernel_module)
        end
        runner(platform: platform, version: version, step_into: ['nvidia_driver'])
      end
      cached(:node) { chef_run.node }

      context "when nouveau removed" do
        before do
          stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(false)
          ConvergeNvidiaDriver.setup(chef_run, nvidia_driver_version: nvidia_driver_version)
        end

        it 'does not execute initramfs to remove nouveau' do
          is_expected.not_to run_execute('initramfs to remove nouveau').with_command('update-initramfs -u')
        end
      end

      context "when kernel version is not 5" do
        before do
          stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(false)
          node.automatic['kernel']['release'] = '4.anything'
          ConvergeNvidiaDriver.setup(chef_run, nvidia_driver_version: nvidia_driver_version)
        end

        it "doesn't install gcc10" do
          is_expected.not_to install_package('gcc10')
        end
      end
    end
  end
end