in src/hpcadvisor/batch_handler.py [0:0]
def _build_pool_start_task_command(anf_ip):
    """Return the single-line shell command used as the pool StartTask.

    The generated script mounts the ANF volume ``env["ANFVOLUMENAME"]``
    exported by ``anf_ip`` at ``env["ANFMOUNTDIR"]``, then installs and
    configures CVMFS and mounts the EESSI repositories.

    Args:
        anf_ip: Address of the ANF export; interpolated verbatim into the
            ``mount`` command.

    Returns:
        The result of ``wrap_commands_in_shell`` applied to the non-blank
        script lines.
    """
    anfmountdir = env["ANFMOUNTDIR"]
    anfvolume = env["ANFVOLUMENAME"]

    # TODO: need to move to another place
    # TODO: find alternatives for this sleep 60 to prevent rpm lock error
    #
    # https://www.eessi.io/docs/getting_access/native_installation/
    script = f"""
    sleep 60
    sudo mkdir {anfmountdir} ; mount {anf_ip}:/{anfvolume} {anfmountdir}
    sudo chown _azbatch:_azbatchgrp {anfmountdir}
    sudo df -Tha
    sudo rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux
    sudo yum upgrade -y almalinux-release
    sudo yum install -y https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm
    sudo yum install -y cvmfs
    sudo yum install -y https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm
    sudo bash -c "echo 'CVMFS_CLIENT_PROFILE=\"single\"' > /etc/cvmfs/default.local"
    sudo bash -c "echo 'CVMFS_QUOTA_LIMIT=10000' >> /etc/cvmfs/default.local"
    sudo cvmfs_config setup
    sudo mount -t cvmfs software.eessi.io /cvmfs/software.eessi.io
    sudo mount -t cvmfs pilot.eessi-hpc.org /cvmfs/pilot.eessi-hpc.org
    """
    # Drop blank lines and the source-level indentation so that each entry
    # is exactly one shell command.
    commands = [line.strip() for line in script.split("\n") if line.strip()]
    return wrap_commands_in_shell(commands)


def _add_pool_with_retries(new_pool, pool_id, attempts=5, delay_seconds=5):
    """Submit ``new_pool`` through ``batch_client``, retrying on Batch errors.

    Args:
        new_pool: The ``PoolAddParameter`` to submit.
        pool_id: Pool identifier, used only in log messages.
        attempts: Maximum number of submissions to try.
        delay_seconds: Pause between two failed submissions.

    Returns:
        True if the pool was created or already exists, False if every
        attempt failed (or ``attempts`` is not positive).
    """
    for attempt in range(1, attempts + 1):
        try:
            batch_client.pool.add(new_pool)
        except batchmodels.BatchErrorException as err:
            if err.error.code == "PoolExists":
                log.warning("Pool %s already exists", pool_id)
                return True

            # Report the attempt number 1-based so "5/5" is the last try.
            log.error(
                f"Cannot create pool {pool_id}. Error code: {err.error.code} attempts: {attempt}/{attempts}"
            )
            # Format instead of concatenating: a missing key/value must not
            # raise inside the error handler and hide the real failure.
            for detail in err.error.values or []:
                log.error(f"{detail.key}: {detail.value}")

            if attempt == attempts:
                log.error(f"Cannot create pool {pool_id}. Max attempts reached")
                return False
            time.sleep(delay_seconds)
        else:
            log.info(f"Pool {pool_id} created!")
            return True
    return False


def create_pool(sku, number_of_nodes):
    """Create an Azure Batch pool and return its name.

    The pool is named ``pool-<random code>``, is placed in the subnet given
    by ``env["SUBVNETID"]`` without public IP addresses, and runs a start
    task (as pool-scoped admin auto-user) that mounts the ANF volume and
    sets up CVMFS/EESSI. Pool submission is retried up to five times with a
    five second pause between failures.

    Args:
        sku: VM size for the pool nodes (passed as ``vm_size``).
        number_of_nodes: Number of dedicated nodes to request.

    Returns:
        The pool name when the pool was created or already existed; None if
        ``batch_client`` is not set or all submission attempts failed.
    """
    # Fail fast before generating names or reading configuration.
    if not batch_client:
        log.critical("batch_client is None")
        return None

    poolname = f"pool-{utils.get_random_code()}"
    log.info(f"Create pool: {poolname}")

    subnetid = env["SUBVNETID"]

    anf_ip = ""
    if anfenabled:
        anf_ip = get_anf_ip()
        log.debug(f"ANF IP address={anf_ip}")
    # NOTE(review): the start task below runs the ANF mount even when
    # anfenabled is false (anf_ip stays "") - confirm this is intended.

    # TODO: put back the option for blob on nfs
    # storage_account = env["STORAGEACCOUNT"]
    # storage_file_dir = "data"
    # nfs_fileshare = storage_file_dir
    # nfs_share_hostname = f"{storage_account}.file.core.windows.net"
    # nfs_share_directory = f"/{storage_account}/{nfs_fileshare}"
    #
    # mount_configuration = batchmodels.MountConfiguration(
    #     nfs_mount_configuration=batchmodels.NFSMountConfiguration(
    #         source=f"{nfs_share_hostname}:{nfs_share_directory}",
    #         relative_mount_path=nfs_fileshare,
    #         mount_options="-o rw,hard,rsize=65536,wsize=65536,vers=4,minorversion=1,tcp,sec=sys",
    #     )
    # )
    # sudo chown _azbatch:_azbatchgrp /mnt/batch/tasks/fsmounts/data

    single_line_script = _build_pool_start_task_command(anf_ip)
    log.debug(f"Batch StartTask single_line_script: {single_line_script}")

    user = batchmodels.AutoUserSpecification(
        scope=batchmodels.AutoUserScope.pool,
        elevation_level=batchmodels.ElevationLevel.admin,
    )
    start_task = batchmodels.StartTask(
        command_line=single_line_script,
        user_identity=batchmodels.UserIdentity(auto_user=user),
        wait_for_success=True,
        max_task_retry_count=1,
    )

    network_configuration = batchmodels.NetworkConfiguration(
        subnet_id=subnetid,
        public_ip_address_configuration=batchmodels.PublicIPAddressConfiguration(
            provision="noPublicIPAddresses"
        ),
    )

    publisher, offer, image_sku, version = _get_image_info(VMIMAGE)

    new_pool = batchmodels.PoolAddParameter(
        id=poolname,
        virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
            image_reference=batchmodels.ImageReference(
                publisher=publisher,
                offer=offer,
                sku=image_sku,
                version=version,
            ),
            node_agent_sku_id=env["NODEAGENTSKU"],
        ),
        vm_size=sku,
        target_dedicated_nodes=number_of_nodes,
        enable_inter_node_communication=True,
        target_node_communication_mode="simplified",
        # mount_configuration=[mount_configuration],
        start_task=start_task,
        network_configuration=network_configuration,
        task_scheduling_policy=batchmodels.TaskSchedulingPolicy(
            node_fill_type=batchmodels.ComputeNodeFillType.pack
        ),
    )

    if not _add_pool_with_retries(new_pool, poolname, attempts=5, delay_seconds=5):
        return None
    return poolname