bicep/files-to-load/create_cc_param.py (170 lines of code) (raw):

#!/usr/bin/env python import argparse import hashlib import json import os import shutil from subprocess import check_output import sys import typing def get_json_dict(file_name): abs_path = os.path.abspath(file_name) with open(abs_path) as fr: return json.load(fr) def set_slurm_params(params, dbPassword, outputs): params['Region'] = outputs['location']['value'] if outputs['vnet']['value']['type'] == 'new': subnetID = outputs['vnet']['value']['computeSubnetId'] subnet_toks = subnetID.split("/") if len(subnet_toks) >= 11: params['SubnetId'] = "/".join([subnet_toks[4], subnet_toks[8], subnet_toks[10]]) else: print(f"Unexpected subnet id {subnetID} - passing as SubnetId directly instead of resource_group/vnet_name/subnet_name", file=sys.stderr) params['SubnetId'] = subnetID else: params['SubnetId'] = '/'.join([outputs['vnet']['value']['rg'], outputs['vnet']['value']['name'], outputs['vnet']['value']['computeSubnetName']]) #HTC params['HTCMachineType'] = outputs['partitions']['value']['htc']['sku'] params['MaxHTCExecuteNodeCount'] = int(outputs['partitions']['value']['htc']['maxNodes']) params['HTCImageName'] = outputs['partitions']['value']['htc']['osImage'] params['HTCUseLowPrio'] = outputs['partitions']['value']['htc']['useSpot'] #HPC params['HPCMachineType'] = outputs['partitions']['value']['hpc']['sku'] params['MaxHPCExecuteNodeCount'] = int(outputs['partitions']['value']['hpc']['maxNodes']) params['HPCImageName'] = outputs['partitions']['value']['hpc']['osImage'] #GPU params['GPUMachineType'] = outputs['partitions']['value']['gpu']['sku'] params['MaxGPUExecuteNodeCount'] = int(outputs['partitions']['value']['gpu']['maxNodes']) params['GPUImageName'] = outputs['partitions']['value']['gpu']['osImage'] #scheduler node #params['slurm'] #is this the slurm version??? no, so what is it? params['SchedulerMachineType'] = outputs['schedulerNode']['value']['sku'] params['SchedulerImageName'] = outputs['schedulerNode']['value']['osImage'] params['configuration_slurm_version'] = outputs['slurmSettings']['value']['version'] # if outputs['slurmSettings']['value']['canUseSlurmHA']: # params['configuration_slurm_ha_enabled'] = outputs['slurmSettings']['value']['slurmHA'] params['configuration_slurm_accounting_enabled'] = bool(outputs['databaseInfo']['value']) if params['configuration_slurm_accounting_enabled']: params['configuration_slurm_accounting_user'] = outputs['databaseInfo']['value']['databaseUser'] if params['configuration_slurm_accounting_enabled']: params['configuration_slurm_accounting_password'] = dbPassword if params['configuration_slurm_accounting_enabled']: params['configuration_slurm_accounting_url'] = outputs['databaseInfo']['value']['url'] #params['configuration_slurm_accounting_certificate_url'] #login node(s) params['loginMachineType'] = (outputs['loginNodes']['value']['sku']).strip() params['NumberLoginNodes'] = int(outputs['loginNodes']['value']['initialNodes']) params['LoginImageName'] = outputs['loginNodes']['value']['osImage'] params['EnableNodeHealthChecks'] = outputs['slurmSettings']['value']['healthCheckEnabled'] #Execute node tags params['NodeTags'] = outputs['nodeArrayTags']['value'] #Network Attached Storage params['UseBuiltinShared'] = outputs['filerInfoFinal']['value']['home']['type'] == 'nfs-new' if params['UseBuiltinShared']: params['FilesystemSize'] = outputs['filerInfoFinal']['value']['home']['nfsCapacityInGb'] else: params['NFSType'] = 'nfs' if outputs['filerInfoFinal']['value']['home']['type'] in ['nfs-existing','anf-new'] else 'lustre' # We no longer need to handle these differently based on the fs type, as each # fs module's common outputs map to these. params['NFSSharedExportPath'] = outputs['filerInfoFinal']['value']['home']['exportPath'] params['NFSSharedMountOptions'] = outputs['filerInfoFinal']['value']['home']['mountOptions'] params['NFSAddress'] = outputs['filerInfoFinal']['value']['home']['ipAddress'] params['AdditionalNFS'] = outputs['filerInfoFinal']['value']['additional']['type'] != 'disabled' if params['AdditionalNFS']: params['AdditionalNFSType'] = 'nfs' if outputs['filerInfoFinal']['value']['additional']['type'] in ['nfs-existing','anf-new'] else 'lustre' params['AdditionalNFSMountPoint'] = outputs['filerInfoFinal']['value']['additional']['mountPath'] params['AdditionalNFSExportPath'] = outputs['filerInfoFinal']['value']['additional']['exportPath'] params['AdditionalNFSMountOptions'] = outputs['filerInfoFinal']['value']['additional']['mountOptions'] params['AdditionalNFSAddress'] = outputs['filerInfoFinal']['value']['additional']['ipAddress'] def set_ood_params(params, outputs): slurm_params = get_json_dict('initial_params.json') # We want to essentially inherit certain settings from the slurm cluster. set_slurm_params(slurm_params, "", outputs) params['NFSAddress'] = slurm_params.get('NFSAddress') or 'ccw-scheduler' params['NFSSharedExportPath'] = slurm_params.get('NFSSharedExportPath') or '/shared' params['NFSSharedMountOptions'] = slurm_params.get('NFSSharedMountOptions') params['SubnetId'] = slurm_params["SubnetId"] params['Region'] = slurm_params['Region'] params['Credentials'] = slurm_params['Credentials'] params['MachineType'] = outputs['ood']['value'].get('sku') params['ManagedIdentity'] = outputs['ood']['value'].get('managedIdentity') params['BootDiskSize'] = outputs['ood']['value'].get('BootDiskSize') params['ImageName'] = outputs['ood']['value'].get('osImage') params['ood_server_name'] = outputs['ood']['value'].get('fqdn','') params['ood_entra_user_map_match'] = outputs['ood']['value'].get('userDomain') params['ood_entra_client_id'] = outputs['ood']['value'].get('clientId') params['ood_entra_tenant_id'] = outputs['ood']['value'].get('tenantId') params['ood_nic'] = outputs['ood']['value'].get('nic') class ClusterInitSpec: def __init__(self, project: str, version: str, spec: str, targets: typing.List[str]): self.project = project self.version = version self.spec = spec self.targets = targets self.cluster_init_key = f"{self.project}:{self.spec}:{self.version}" def download_cluster_init(outputs, root_folder, locker) -> typing.List[ClusterInitSpec]: ret = [] for record in (outputs['clusterInitSpecs'].get("value") or []): url = _strip_tags_from_github_url(record) url_hash = hashlib.sha256(url.encode()) folder = os.path.join(root_folder, url_hash.hexdigest()) if not os.path.exists(folder): # download and move to avoid repeated failures with partial downloads/uploads check_output(["/usr/local/bin/cyclecloud", "project", "fetch", url, folder + ".tmp"]) check_output(["/usr/local/bin/cyclecloud", "project", "upload", locker], cwd=folder + ".tmp") shutil.move(folder + ".tmp", folder) with open(os.path.join(folder, "download-url"), "w") as fw: fw.write(url) proj_info_raw = check_output(["/usr/local/bin/cyclecloud", "project", "info"], cwd=folder).decode() proj_info = {} for line in proj_info_raw.splitlines(): key, rest = line.split(":", 1) proj_info[key.lower()] = rest.strip() ret.append(ClusterInitSpec(proj_info["name"], proj_info["version"], record.get("spec") or "default", record["target"])) return ret def _strip_tags_from_github_url(record): url = record["gitHubReleaseURL"] if "/tag/" in url: return url.replace("/tag", "") return url def _version_from_url(record): if record.get("version"): return record["version"] return record["gitHubReleaseURL"].split("/")[-1] def set_cluster_init_params(params: dict, specs: typing.List[ClusterInitSpec], cluster_name: str, target_params: dict) -> None: order = 10000 for spec in specs: for target in spec.targets: target_key = f"{target_params[target.lower()]}" if not params.get(target_key): params[target_key] = {} params[target_key][spec.cluster_init_key] = { "Order": order, "Spec": spec.spec, "Name": spec.cluster_init_key, "Project": spec.project, "Locker": "azure-storage", "Version": spec.version } order += 100 def main(): parser = argparse.ArgumentParser(description="TODO RDH") parser.add_argument("--locker", default="azure-storage") parser.add_argument("--cluster-init-working-dir", default="cluster-init") subparsers = parser.add_subparsers() ccw_parser = subparsers.add_parser("slurm") # TODO this needs to be by cluster type target_params = { "login": "LoginClusterInitSpecs", "gpu": "GPUClusterInitSpecs", "hpc": "HPCClusterInitSpecs", "htc": "HTCClusterInitSpecs", "scheduler": "SchedulerClusterInitSpecs", "dynamic": "DynamicClusterInitSpecs", "ood": "ClusterInitSpecs" } ccw_parser.set_defaults(cluster_type="slurm", target_params=target_params) ccw_parser.add_argument("--dbPassword", dest="dbPassword", default="", help="MySQL database password") ood_parser = subparsers.add_parser("ood") ood_parser.set_defaults(cluster_type="ood", target_params=target_params) args = parser.parse_args() if args.cluster_type == "slurm": output_params = get_json_dict('initial_params.json') else: output_params = {} ccw_outputs = get_json_dict('ccwOutputs.json') specs = download_cluster_init(ccw_outputs, os.path.join(os.getcwd(), args.cluster_init_working_dir), args.locker) set_cluster_init_params(output_params, specs, args.cluster_type, args.target_params) if args.cluster_type == "slurm": set_slurm_params(output_params, args.dbPassword, ccw_outputs) else: set_ood_params(output_params, ccw_outputs) print(json.dumps(output_params, indent=4)) if __name__ == '__main__': main()