require 'spec_helper'

# Spec helper that converges a single `gdrcopy` resource with a given action.
#
# Each public method takes a ChefSpec runner, declares one `gdrcopy` resource named
# after the action inside the 'aws-parallelcluster-platform' cookbook context, and
# returns whatever `converge_dsl` returns.
class ConvergeGdrcopy
  class << self
    # Converges `gdrcopy 'setup'` with action :setup.
    def setup(chef_run)
      converge_with(chef_run, :setup)
    end

    # Converges `gdrcopy 'verify'` with action :verify.
    def verify(chef_run)
      converge_with(chef_run, :verify)
    end

    # Converges `gdrcopy 'configure'` with action :configure.
    def configure(chef_run)
      converge_with(chef_run, :configure)
    end

    private

    # Declares a gdrcopy resource whose name is the action's string form and converges it.
    def converge_with(chef_run, gdrcopy_action)
      chef_run.converge_dsl('aws-parallelcluster-platform') do
        gdrcopy gdrcopy_action.to_s do
          action gdrcopy_action
        end
      end
    end
  end
end

# Specs for the gdrcopy resource's `gdrcopy_enabled?` helper, run once per supported OS.
describe 'gdrcopy:gdrcopy_enabled?' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:sources_dir) { 'sources_dir' }
      cached(:chef_run) do
        # Stubbed to false while the runner is built and converged.
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        runner(platform: platform, version: version, step_into: ['gdrcopy']) do |node|
          node.override['cluster']['sources_dir'] = sources_dir
        end
      end
      cached(:resource) do
        ConvergeGdrcopy.setup(chef_run)
        chef_run.find_resource('gdrcopy', 'setup')
      end

      # Building chef_run installs a `nvidia_enabled? => false` stub. Build the resource here,
      # ahead of the per-context stubs below, so that stub can never be installed after (and
      # therefore override) them, whichever example happens to run first.
      before { resource }

      context 'when nvidia not enabled' do
        before do
          allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        end

        it 'is not enabled' do
          expect(resource.gdrcopy_enabled?).to eq(false)
        end
      end

      context 'when nvidia enabled' do
        before do
          allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(true)
        end

        context 'on arm instance' do
          before do
            allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
          end

          # centos is the only platform expected to report gdrcopy as disabled on arm.
          if platform == 'centos'
            it 'is not enabled' do
              expect(resource.gdrcopy_enabled?).to eq(false)
            end
          else
            it 'is enabled' do
              expect(resource.gdrcopy_enabled?).to eq(true)
            end
          end
        end

        context 'not on arm instance' do
          before do
            allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
          end

          it 'is enabled' do
            expect(resource.gdrcopy_enabled?).to eq(true)
          end
        end
      end
    end
  end
end

# Specs for the gdrcopy resource's `gdrcopy_arch` helper, run once per supported OS.
# Both the arm and the non-arm result are checked for every platform.
describe 'gdrcopy:gdrcopy_arch' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        runner = runner(platform: platform, version: version, step_into: ['gdrcopy'])
        ConvergeGdrcopy.setup(runner)
      end
      cached(:resource) do
        chef_run.find_resource('gdrcopy', 'setup')
      end

      context 'on arm instance' do
        # amazon, redhat and rocky are expected to report 'aarch64'; every other platform 'arm64'.
        cached(:expected_arch) do
          case platform
          when 'amazon', 'redhat', 'rocky'
            'aarch64'
          else
            'arm64'
          end
        end

        before do
          allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
        end

        it 'returns arch value for arm architecture' do
          expect(resource.gdrcopy_arch).to eq(expected_arch)
        end
      end

      context 'not on arm instance' do
        # ubuntu is expected to report 'amd64'; every other platform 'x86_64'.
        cached(:expected_arch) do
          platform == 'ubuntu' ? 'amd64' : 'x86_64'
        end

        before do
          allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
        end

        it 'returns arch value for non-arm architecture' do
          expect(resource.gdrcopy_arch).to eq(expected_arch)
        end
      end
    end
  end
end

# Specs for the gdrcopy resource's `gdrcopy_version` helper, run once per supported OS.
describe 'gdrcopy:gdrcopy_version' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        ConvergeGdrcopy.setup(runner(platform: platform, version: version, step_into: ['gdrcopy']))
      end
      cached(:resource) { chef_run.find_resource('gdrcopy', 'setup') }

      it 'returns the expected gdrcopy version' do
        # centos is expected to stay on 2.3.1; all other platforms use 2.4.4.
        pinned_version = platform == 'centos' ? '2.3.1' : '2.4.4'
        expect(resource.gdrcopy_version).to eq(pinned_version)
      end
    end
  end
end

# Specs for the gdrcopy resource's `gdrcopy_checksum` helper, run once per supported OS.
describe 'gdrcopy:gdrcopy_checksum' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        ConvergeGdrcopy.setup(runner(platform: platform, version: version, step_into: ['gdrcopy']))
      end
      cached(:resource) { chef_run.find_resource('gdrcopy', 'setup') }

      it 'returns the expected gdrcopy checksum' do
        # centos is expected to use a different checksum from every other platform.
        centos_checksum = '59b3cc97a4fc6008a5407506d9e67ecc4144cfad61c261217fabcb671cd30ca8'
        default_checksum = '8802f7bc4a589a610118023bdcdd83c10a56dea399acf6eeaac32e8cc10739a8'
        pinned_checksum = platform == 'centos' ? centos_checksum : default_checksum

        expect(resource.gdrcopy_checksum).to eq(pinned_checksum)
      end
    end
  end
end

# Specs for the gdrcopy resource's :setup action, run once per supported OS.
# Covers both the disabled case (nothing is installed) and the enabled case
# (node attributes, tarball download, build dependencies, install script, service).
describe 'gdrcopy:setup' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version} when gdrcopy not enabled" do
      cached(:chef_run) do
        stubs_for_resource('gdrcopy') do |res|
          allow(res).to receive(:gdrcopy_enabled?).and_return(false)
        end
        runner = runner(platform: platform, version: version, step_into: ['gdrcopy'])
        ConvergeGdrcopy.setup(runner)
      end

      it 'does not install gdrcopy' do
        is_expected.not_to run_bash('Install NVIDIA GDRCopy')
      end
    end

    context "on #{platform}#{version} when gdrcopy enabled" do
      cached(:sources_dir) { 'sources_dir' }
      cached(:gdrcopy_version) { platform == 'centos' ? '2.3.1' : '2.4.4' }
      cached(:gdrcopy_checksum) do
        if platform == 'centos'
          '59b3cc97a4fc6008a5407506d9e67ecc4144cfad61c261217fabcb671cd30ca8'
        else
          '8802f7bc4a589a610118023bdcdd83c10a56dea399acf6eeaac32e8cc10739a8'
        end
      end
      cached(:gdrcopy_service) { platform == 'ubuntu' ? 'gdrdrv' : 'gdrcopy' }
      cached(:gdrcopy_tarball) { "#{sources_dir}/gdrcopy-#{gdrcopy_version}.tar.gz" }
      cached(:gdrcopy_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/gdr_copy/v#{gdrcopy_version}.tar.gz" }
      cached(:gdrcopy_dependencies) do
        if platform == 'ubuntu'
          %w(build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms)
        elsif platform == 'amazon' && version == '2023'
          # Amazon Linux 2023 is expected to install without the subunit packages.
          %w(dkms rpm-build make check check-devel)
        else
          %w(dkms rpm-build make check check-devel subunit subunit-devel)
        end
      end
      # Placeholder returned by the stubbed `gdrcopy_arch` resource helper (see chef_run below).
      cached(:gdrcopy_arch) { 'gdrcopy_arch' }
      # Platform tag expected in the package file names. Keys are "#{platform}#{version}" and use
      # the same platform names as the rest of this file (e.g. 'redhat', not 'rhel').
      # `fetch` raises on an unmapped OS instead of silently interpolating nil into the patterns.
      cached(:gdrcopy_platform) do
        platforms = {
          'amazon2' => 'amzn-2',
          'amazon2023' => 'amzn-2023',
          'centos7' => 'el7',
          'redhat8' => 'el8',
          'rocky8' => 'el8',
          'redhat9' => 'el9',
          'rocky9' => 'el9',
          'ubuntu22.04' => 'Ubuntu22_04',
          'ubuntu24.04' => 'Ubuntu24_04',
        }
        platforms.fetch("#{platform}#{version}")
      end
      cached(:chef_run) do
        stubs_for_resource('gdrcopy') do |res|
          allow(res).to receive(:gdrcopy_enabled?).and_return(true)
          allow(res).to receive(:gdrcopy_arch).and_return(gdrcopy_arch)
        end
        runner = runner(platform: platform, version: version, step_into: ['gdrcopy']) do |node|
          node.override['cluster']['sources_dir'] = sources_dir
        end
        ConvergeGdrcopy.setup(runner)
      end
      cached(:node) { chef_run.node }
      cached(:installation_code) { chef_run.bash('Install NVIDIA GDRCopy').code }

      it 'sets up gdrcopy' do
        is_expected.to setup_gdrcopy('setup')
      end

      it 'shares gdrcopy service and version with InSpec tests' do
        expect(node['cluster']['nvidia']['gdrcopy']['version']).to eq(gdrcopy_version)
        expect(node['cluster']['nvidia']['gdrcopy']['service']).to eq(gdrcopy_service)
        is_expected.to write_node_attributes('dump node attributes')
      end

      it 'downloads gdrcopy tarball' do
        is_expected.to create_if_missing_remote_file(gdrcopy_tarball).with(
          source: gdrcopy_url,
          mode: '0644',
          retries: 3,
          retry_delay: 5,
          checksum: gdrcopy_checksum
        )
      end

      it 'builds dependencies' do
        is_expected.to install_package(gdrcopy_dependencies).with_retries(3).with_retry_delay(5)
      end

      it 'installs NVIDIA GDRCopy' do
        is_expected.to run_bash('Install NVIDIA GDRCopy').with(
          user: 'root',
          group: 'root',
          cwd: Chef::Config[:file_cache_path]
        ).with_code(/tar -xf #{gdrcopy_tarball}/).with_code(%r{cd gdrcopy-#{gdrcopy_version}/packages})

        # NOTE: the shell `||` must be written as `\|\|` inside these patterns. Unescaped, `|` is
        # regex alternation and `||` adds an empty alternative that matches any string, which
        # would make the rpm expectations pass no matter what the install script contains.
        if platform == 'ubuntu'
          expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-deb-packages.sh})
          expect(installation_code).to match(/dpkg -i gdrdrv-dkms_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/)
          expect(installation_code).to match(/dpkg -i libgdrapi_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/)
          expect(installation_code).to match(/dpkg -i gdrcopy-tests_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}\+cuda\*.deb/)
          expect(installation_code).to match(/dpkg -i gdrcopy_#{gdrcopy_version}_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/)
        elsif platform == 'centos'
          # For centos the expected rpm file names place the platform tag last (after the arch).
          expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-rpm-packages.sh})
          expect(installation_code).to match(/rpm -q gdrcopy-kmod-#{gdrcopy_version}-1dkms \|\| rpm -Uvh gdrcopy-kmod-#{gdrcopy_version}-1dkms.noarch.#{gdrcopy_platform}.rpm/)
          expect(installation_code).to match(/rpm -q gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch} \|\| rpm -Uvh gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch}.#{gdrcopy_platform}.rpm/)
          expect(installation_code).to match(/rpm -q gdrcopy-devel-#{gdrcopy_version}-1.noarch \|\| rpm -Uvh gdrcopy-devel-#{gdrcopy_version}-1.noarch.#{gdrcopy_platform}.rpm/)
        else
          # For the remaining rpm platforms the platform tag comes before the arch.
          expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-rpm-packages.sh})
          expect(installation_code).to match(/rpm -q gdrcopy-kmod-#{gdrcopy_version}-1dkms \|\| rpm -Uvh gdrcopy-kmod-#{gdrcopy_version}-1dkms.#{gdrcopy_platform}.noarch.rpm/)
          expect(installation_code).to match(/rpm -q gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch} \|\| rpm -Uvh gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_platform}.#{gdrcopy_arch}.rpm/)
          expect(installation_code).to match(/rpm -q gdrcopy-devel-#{gdrcopy_version}-1.noarch \|\| rpm -Uvh gdrcopy-devel-#{gdrcopy_version}-1.#{gdrcopy_platform}.noarch.rpm/)
        end
      end

      it 'disables gdrcopy service' do
        is_expected.to disable_service(gdrcopy_service).with_action(%i(disable stop))
      end
    end
  end
end

# Specs for the gdrcopy resource's :verify action, run once per supported OS.
describe 'gdrcopy:verify' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        ConvergeGdrcopy.verify(runner(platform: platform, version: version, step_into: ['gdrcopy']))
      end

      it 'verifies gdrcopy' do
        is_expected.to verify_gdrcopy('verify')

        # The verification script must run as root from Chef's file cache and invoke copybw.
        expected_attributes = { user: 'root', group: 'root', cwd: Chef::Config[:file_cache_path] }
        is_expected.to run_bash("Verify NVIDIA GDRCopy: copybw").with(expected_attributes).with_code(/copybw/)
      end
    end
  end
end

# Specs for the gdrcopy resource's :configure action, run once per supported OS.
describe 'gdrcopy:configure' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      # The service is expected to be called 'gdrdrv' on ubuntu and 'gdrcopy' elsewhere.
      cached(:gdrcopy_service) do
        if platform == 'ubuntu'
          'gdrdrv'
        else
          'gdrcopy'
        end
      end
      cached(:chef_run) do
        # Pretend to be on a graphic instance with the service already installed.
        allow_any_instance_of(Object).to receive(:graphic_instance?).and_return(true)
        allow_any_instance_of(Object).to receive(:is_service_installed?).with(gdrcopy_service).and_return(true)
        ConvergeGdrcopy.configure(runner(platform: platform, version: version, step_into: ['gdrcopy']))
      end

      it 'configures gdrcopy' do
        is_expected.to configure_gdrcopy('configure')
      end

      it 'enables gdrcopy service' do
        enable_command = "systemctl enable #{gdrcopy_service}"
        is_expected.to run_execute("enable #{gdrcopy_service} service").with_command(enable_command)
      end

      it 'starts gdrcopy service' do
        is_expected.to start_service(gdrcopy_service).with_supports({ status: true })
      end
    end
  end
end
