# cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb (312 lines of code) (raw):
require 'spec_helper'
# Helper that converges a single `gdrcopy` resource through the given runner.
# Each public class method declares one resource, named after the action it
# triggers, inside the 'aws-parallelcluster-platform' cookbook and returns
# whatever `converge_dsl` returns.
class ConvergeGdrcopy
  class << self
    # Converges `gdrcopy 'setup'` with action :setup.
    def setup(chef_run)
      converge(chef_run, :setup)
    end

    # Converges `gdrcopy 'verify'` with action :verify.
    def verify(chef_run)
      converge(chef_run, :verify)
    end

    # Converges `gdrcopy 'configure'` with action :configure.
    def configure(chef_run)
      converge(chef_run, :configure)
    end

    private

    # Shared implementation: the resource name is the action name as a string.
    # `step` is a closure local, so it stays visible inside the DSL blocks
    # regardless of how they are evaluated.
    def converge(chef_run, step)
      chef_run.converge_dsl('aws-parallelcluster-platform') do
        gdrcopy step.to_s do
          action step
        end
      end
    end
  end
end
# Exercises `gdrcopy_enabled?` on every OS for the combinations of the stubbed
# `nvidia_enabled?` and `arm_instance?` helpers.
describe 'gdrcopy:gdrcopy_enabled?' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:sources_dir) { 'sources_dir' }
      cached(:gdrcopy_version) { 'gdrcopy_version' }

      cached(:chef_run) do
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        runner(platform: platform, version: version, step_into: %w(gdrcopy)) do |node|
          node.override['cluster']['sources_dir'] = sources_dir
        end
      end

      # Converge first, then look the resource up on the same runner.
      cached(:resource) do
        chef_run.tap { |run| ConvergeGdrcopy.setup(run) }.find_resource('gdrcopy', 'setup')
      end

      context 'when nvidia not enabled' do
        before do
          allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        end

        it 'is not enabled' do
          expect(resource.gdrcopy_enabled?).to eq(false)
        end
      end

      context 'when nvidia enabled' do
        context 'on arm instance' do
          # The spec expects ARM to be enabled on every platform except centos.
          arm_supported = platform != 'centos'

          it(arm_supported ? 'is enabled' : 'is not enabled') do
            allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(true)
            allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
            expect(resource.gdrcopy_enabled?).to eq(arm_supported)
          end
        end

        context 'not on arm instance' do
          before do
            allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(true)
            allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
          end

          it 'is enabled' do
            expect(resource.gdrcopy_enabled?).to eq(true)
          end
        end
      end
    end
  end
end
# Checks the architecture label returned by `gdrcopy_arch` on every OS, for
# both values of the stubbed `arm_instance?` helper.
describe 'gdrcopy:gdrcopy_arch' do
  for_all_oses do |platform, version|
    # This context wraps both the ARM and the non-ARM cases, so its name does
    # not carry an architecture suffix.
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        runner = runner(platform: platform, version: version, step_into: ['gdrcopy'])
        ConvergeGdrcopy.setup(runner)
      end

      cached(:resource) do
        chef_run.find_resource('gdrcopy', 'setup')
      end

      context 'on arm instance' do
        # Expected ARM label per platform family, as asserted by this spec.
        cached(:expected_arch) do
          case platform
          when 'amazon', 'redhat', 'rocky'
            'aarch64'
          else
            'arm64'
          end
        end

        it 'returns arch value for arm architecture' do
          allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
          expect(resource.gdrcopy_arch).to eq(expected_arch)
        end
      end

      context 'not on arm instance' do
        # Expected non-ARM label: 'amd64' on ubuntu, 'x86_64' elsewhere.
        cached(:expected_arch) do
          platform == 'ubuntu' ? 'amd64' : 'x86_64'
        end

        # The description names the non-ARM case actually exercised here.
        it 'returns arch value for non-arm architecture' do
          allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
          expect(resource.gdrcopy_arch).to eq(expected_arch)
        end
      end
    end
  end
end
# Checks the GDRCopy version reported by the resource on every OS:
# "2.3.1" on centos, "2.4.4" on all other platforms.
describe 'gdrcopy:gdrcopy_version' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        ConvergeGdrcopy.setup(runner(platform: platform, version: version, step_into: %w(gdrcopy)))
      end

      cached(:resource) { chef_run.find_resource('gdrcopy', 'setup') }

      it 'returns the expected gdrcopy version' do
        expected_gdrcopy_version = platform == 'centos' ? '2.3.1' : '2.4.4'
        expect(resource.gdrcopy_version).to eq(expected_gdrcopy_version)
      end
    end
  end
end
# Checks the tarball checksum reported by the resource on every OS; centos is
# expected to use a different checksum from the other platforms.
describe 'gdrcopy:gdrcopy_checksum' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        ConvergeGdrcopy.setup(runner(platform: platform, version: version, step_into: %w(gdrcopy)))
      end

      cached(:resource) { chef_run.find_resource('gdrcopy', 'setup') }

      it 'returns the expected gdrcopy checksum' do
        centos_checksum = '59b3cc97a4fc6008a5407506d9e67ecc4144cfad61c261217fabcb671cd30ca8'
        default_checksum = '8802f7bc4a589a610118023bdcdd83c10a56dea399acf6eeaac32e8cc10739a8'
        expected_gdrcopy_checksum = platform == 'centos' ? centos_checksum : default_checksum
        expect(resource.gdrcopy_checksum).to eq(expected_gdrcopy_checksum)
      end
    end
  end
end
# Covers the :setup action of the gdrcopy resource on every OS, both with
# `gdrcopy_enabled?` stubbed to false (nothing is installed) and stubbed to
# true (download, dependency install, build/install script, service disable).
describe 'gdrcopy:setup' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version} when gdrcopy not enabled" do
      cached(:chef_run) do
        stubs_for_resource('gdrcopy') do |res|
          allow(res).to receive(:gdrcopy_enabled?).and_return(false)
        end
        runner = runner(platform: platform, version: version, step_into: ['gdrcopy'])
        ConvergeGdrcopy.setup(runner)
      end

      it 'does not install gdrcopy' do
        is_expected.not_to run_bash('Install NVIDIA GDRCopy')
      end
    end

    context "on #{platform}#{version} when gdrcopy enabled" do
      cached(:sources_dir) { 'sources_dir' }
      cached(:gdrcopy_version) { platform == 'centos' ? '2.3.1' : '2.4.4' }
      cached(:gdrcopy_checksum) do
        if platform == 'centos'
          '59b3cc97a4fc6008a5407506d9e67ecc4144cfad61c261217fabcb671cd30ca8'
        else
          '8802f7bc4a589a610118023bdcdd83c10a56dea399acf6eeaac32e8cc10739a8'
        end
      end
      cached(:gdrcopy_service) { platform == 'ubuntu' ? 'gdrdrv' : 'gdrcopy' }
      cached(:gdrcopy_tarball) { "#{sources_dir}/gdrcopy-#{gdrcopy_version}.tar.gz" }
      cached(:gdrcopy_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/gdr_copy/v#{gdrcopy_version}.tar.gz" }

      # Build dependencies expected per platform (Amazon Linux 2023 omits the
      # subunit packages).
      cached(:gdrcopy_dependencies) do
        case platform
        when 'ubuntu'
          %w(build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms)
        when 'amazon'
          if version == '2023'
            %w(dkms rpm-build make check check-devel)
          else
            %w(dkms rpm-build make check check-devel subunit subunit-devel)
          end
        else
          %w(dkms rpm-build make check check-devel subunit subunit-devel)
        end
      end

      # Placeholder architecture; the resource's `gdrcopy_arch` is stubbed to
      # return this value in chef_run below.
      cached(:gdrcopy_arch) { 'gdrcopy_arch' }

      # Platform tag expected in the generated package file names.
      cached(:gdrcopy_platform) do
        platforms = {
          'amazon2' => 'amzn-2',
          'amazon2023' => 'amzn-2023',
          'centos7' => 'el7',
          'rhel8' => 'el8',
          'rocky8' => 'el8',
          'rhel9' => 'el9',
          'rocky9' => 'el9',
          'ubuntu22.04' => 'Ubuntu22_04',
          'ubuntu24.04' => 'Ubuntu24_04',
        }
        platforms["#{platform}#{version}"]
      end

      cached(:chef_run) do
        stubs_for_resource('gdrcopy') do |res|
          allow(res).to receive(:gdrcopy_enabled?).and_return(true)
          allow(res).to receive(:gdrcopy_arch).and_return(gdrcopy_arch)
        end
        runner = runner(platform: platform, version: version, step_into: ['gdrcopy']) do |node|
          node.override['cluster']['sources_dir'] = sources_dir
        end
        ConvergeGdrcopy.setup(runner)
      end
      cached(:node) { chef_run.node }

      it 'sets up gdrcopy' do
        is_expected.to setup_gdrcopy('setup')
      end

      it 'shares gdrcopy service and version with InSpec tests' do
        expect(node['cluster']['nvidia']['gdrcopy']['version']).to eq(gdrcopy_version)
        expect(node['cluster']['nvidia']['gdrcopy']['service']).to eq(gdrcopy_service)
        is_expected.to write_node_attributes('dump node attributes')
      end

      it 'downloads gdrcopy tarball' do
        is_expected.to create_if_missing_remote_file(gdrcopy_tarball).with(
          source: gdrcopy_url,
          mode: '0644',
          retries: 3,
          retry_delay: 5,
          checksum: gdrcopy_checksum
        )
      end

      it 'builds dependencies' do
        is_expected.to install_package(gdrcopy_dependencies).with_retries(3).with_retry_delay(5)
      end

      cached(:installation_code) { chef_run.bash('Install NVIDIA GDRCopy').code }

      it 'installs NVIDIA GDRCopy' do
        is_expected.to run_bash('Install NVIDIA GDRCopy').with(
          user: 'root',
          group: 'root',
          cwd: Chef::Config[:file_cache_path]
        ).with_code(/tar -xf #{gdrcopy_tarball}/)
          .with_code(%r{cd gdrcopy-#{gdrcopy_version}/packages})

        # In the rpm expectations below the shell `||` is escaped as `\|\|`.
        # Left unescaped, a Regexp treats it as alternation with an empty
        # middle branch, which matches every string and makes the check vacuous.
        if platform == 'ubuntu'
          expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-deb-packages.sh})
          expect(installation_code).to match(/dpkg -i gdrdrv-dkms_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/)
          expect(installation_code).to match(/dpkg -i libgdrapi_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/)
          expect(installation_code).to match(/dpkg -i gdrcopy-tests_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}\+cuda\*.deb/)
          expect(installation_code).to match(/dpkg -i gdrcopy_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/)
        elsif platform == 'centos'
          expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-rpm-packages.sh})
          expect(installation_code).to match(/rpm -q gdrcopy-kmod-#{gdrcopy_version}-1dkms \|\| rpm -Uvh gdrcopy-kmod-#{gdrcopy_version}-1dkms.noarch.#{gdrcopy_platform}.rpm/)
          expect(installation_code).to match(/rpm -q gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch} \|\| rpm -Uvh gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch}.#{gdrcopy_platform}.rpm/)
          expect(installation_code).to match(/rpm -q gdrcopy-devel-#{gdrcopy_version}-1.noarch \|\| rpm -Uvh gdrcopy-devel-#{gdrcopy_version}-1.noarch.#{gdrcopy_platform}.rpm/)
        else
          expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-rpm-packages.sh})
          expect(installation_code).to match(/rpm -q gdrcopy-kmod-#{gdrcopy_version}-1dkms \|\| rpm -Uvh gdrcopy-kmod-#{gdrcopy_version}-1dkms.#{gdrcopy_platform}.noarch.rpm/)
          expect(installation_code).to match(/rpm -q gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch} \|\| rpm -Uvh gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_platform}.#{gdrcopy_arch}.rpm/)
          expect(installation_code).to match(/rpm -q gdrcopy-devel-#{gdrcopy_version}-1.noarch \|\| rpm -Uvh gdrcopy-devel-#{gdrcopy_version}-1.#{gdrcopy_platform}.noarch.rpm/)
        end
      end

      it 'disables gdrcopy service' do
        is_expected.to disable_service(gdrcopy_service).with_action(%i(disable stop))
      end
    end
  end
end
# Covers the :verify action: the resource must run the copybw bash check as
# root from Chef's file cache directory.
describe 'gdrcopy:verify' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        ConvergeGdrcopy.verify(runner(platform: platform, version: version, step_into: %w(gdrcopy)))
      end

      it 'verifies gdrcopy' do
        expected_attributes = {
          user: 'root',
          group: 'root',
          cwd: Chef::Config[:file_cache_path],
        }

        is_expected.to verify_gdrcopy('verify')
        is_expected.to run_bash('Verify NVIDIA GDRCopy: copybw').with(expected_attributes).with_code(/copybw/)
      end
    end
  end
end
# Covers the :configure action with `graphic_instance?` and
# `is_service_installed?` stubbed to true: the platform-specific service must
# be enabled through systemctl and started.
describe 'gdrcopy:configure' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      # Service name expected by this spec: 'gdrdrv' on ubuntu, 'gdrcopy' elsewhere.
      cached(:gdrcopy_service) do
        if platform == 'ubuntu'
          'gdrdrv'
        else
          'gdrcopy'
        end
      end

      cached(:chef_run) do
        allow_any_instance_of(Object).to receive(:graphic_instance?).and_return(true)
        allow_any_instance_of(Object).to receive(:is_service_installed?).with(gdrcopy_service).and_return(true)
        ConvergeGdrcopy.configure(runner(platform: platform, version: version, step_into: %w(gdrcopy)))
      end

      it 'configures gdrcopy' do
        is_expected.to configure_gdrcopy('configure')
      end

      it 'enables gdrcopy service' do
        enable_command = "systemctl enable #{gdrcopy_service}"
        is_expected.to run_execute("enable #{gdrcopy_service} service").with_command(enable_command)
      end

      it 'starts gdrcopy service' do
        is_expected.to start_service(gdrcopy_service).with(supports: { status: true })
      end
    end
  end
end