packer/docker/build.py (157 lines of code) (raw):

#!/usr/bin/env python3
# Copyright (C) SchedMD LLC.
# Copyright 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build the base and per-tf_version docker images with packer.

A temporary packer var-file is generated from the command line options, the
base image is built first and then one tensorflow image per requested
tf_version. With --docker_push every successfully built image is pushed with
`docker push`.
"""

import argparse
import json
import os
import shlex
import subprocess
import sys
import tempfile

# Docker image every packer build starts from.
BASE_DOCKER_IMAGE = "ubuntu:22.04"
# YAML file (relative to this script) listing the supported tf_versions.
TF_VERSIONS_FILE = "../../ansible/roles/tpu/vars/main.yml"
# Manifest read after each tensorflow build to find the image to push.
# NOTE(review): presumably written by a packer manifest post-processor;
# the packer template is not part of this file -- confirm.
DOCKER_MANIFEST_FILE = "docker-manifest.json"


def run(
    args,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    shell=False,
    timeout=None,
    check=True,
    universal_newlines=True,
    **kwargs,
):
    """Wrapper for subprocess.run() with convenient defaults.

    Args:
        args: Command as a string or as a list of strings. In a list, None
            entries are dropped and the rest is joined with spaces, so a list
            element may itself contain several shell-style tokens.
        stdout, stderr, shell, timeout, check, universal_newlines, **kwargs:
            Forwarded to subprocess.run(). By default output is captured as
            text and a non-zero exit status raises.

    Returns:
        The subprocess.CompletedProcess of the finished command.

    Raises:
        subprocess.CalledProcessError: If check is true and the command
            exits with a non-zero status.
    """
    if isinstance(args, list):
        args = " ".join(arg for arg in args if arg is not None)
    # Without a shell, subprocess needs an argv list: tokenize the string.
    if not shell and isinstance(args, str):
        args = shlex.split(args)
    return subprocess.run(
        args,
        stdout=stdout,
        stderr=stderr,
        shell=shell,
        timeout=timeout,
        check=check,
        universal_newlines=universal_newlines,
        **kwargs,
    )


def dict_to_conf(conf, delim=" "):
    """Convert dict to delimited slurm-style key-value pairs.

    Each retained item is rendered as `key = "value"`. List values are joined
    with commas (skipping None elements). Items whose value is falsy are
    dropped, except values equal to 0 which are kept.

    Args:
        conf: Mapping of option names to values.
        delim: String placed between the rendered pairs.

    Returns:
        The rendered pairs joined by delim ("" for an empty result).
    """

    def filter_conf(pair):
        k, v = pair
        if isinstance(v, list):
            v = ",".join(el for el in v if el is not None)
        # Keep 0 (a meaningful setting) but drop None, "" and empty lists.
        return k, (v if bool(v) or v == 0 else None)

    return delim.join(
        f'{k} = "{v}"' for k, v in map(filter_conf, conf.items()) if v is not None
    )


def calculate_python310(tf_version):
    """Return True when tf_version is at least 2.12.1.

    Args:
        tf_version: Version string of the form "major.minor.patch".

    Returns:
        True for versions >= 2.12.1 (including any later major version),
        False otherwise.

    Raises:
        ValueError: If tf_version does not have exactly three dot-separated
            parts or a part that has to be compared is not an integer.
    """
    major, minor, patch = tf_version.split(".")
    major_minor = (int(major), int(minor))
    if major_minor != (2, 12):
        # The patch level only matters for 2.12.x, so it is not parsed here
        # (this keeps suffixed patch levels such as "0rc1" working).
        return major_minor > (2, 12)
    return int(patch) >= 1


def print_exception(e, *, fatal=True):
    """Print the details of a failed subprocess and optionally exit.

    Args:
        e: The subprocess.CalledProcessError to report.
        fatal: When true (the default) the script exits with the failed
            process' return code after printing. Pass False to report the
            failure and let the caller carry on.
    """
    print(f"process {e.cmd} failed with exitcode {e.returncode}")
    print(f"stdout: \n=================\n{e.stdout}")
    print(f"stderr: \n=================\n{e.stderr}")
    if fatal:
        sys.exit(e.returncode)


def get_tf_versions(yaml_file_path):
    """Read the supported tf_versions from a YAML file.

    Args:
        yaml_file_path: Path of a YAML file with a top-level
            "tf_versions_to_tpu_mapping" mapping.

    Returns:
        The list of keys of that mapping, or None when the file is not valid
        YAML (the parse error is printed).
    """
    # Imported lazily: PyYAML is the only third-party dependency and is only
    # needed here, so --help and the pure helpers work without it.
    import yaml

    with open(yaml_file_path, "r") as file:
        try:
            yaml_data = yaml.safe_load(file)
        except yaml.YAMLError as exc:
            print(f"Error while parsing YAML file({yaml_file_path}):", exc)
            return None
    return list(yaml_data["tf_versions_to_tpu_mapping"])


def _build_parser():
    """Create the command line parser of this script."""
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--project_id",
        "-p",
        dest="project_id",
        default="schedmd-slurm-public",
        help="The project id to use for pushing the docker images.",
    )
    parser.add_argument(
        "--slurm_version",
        "-s",
        dest="slurm_version",
        default="24.11.2",
        help="The Slurm version to use for the image.",
    )
    parser.add_argument(
        "--gcp_version",
        "-g",
        dest="slurmgcp_version",
        default="6.4",
        help="The slurm_gcp version to use for the image.",
    )
    parser.add_argument(
        "-t",
        "--tf_versions",
        nargs="+",
        default=[],
        help="The tf_versions to use",
    )
    parser.add_argument(
        "-d",
        "--docker_push",
        help="set this flag to automatically push the images with docker",
        action="store_true",
    )
    return parser


def _pop_manifest_image_name(manifest_path):
    """Return the first build's artifact_id from the manifest and delete it.

    The manifest is removed even when it cannot be read so a stale file is
    never picked up by a later build.

    Returns:
        The artifact id, or None when the manifest is missing or malformed.
    """
    try:
        with open(manifest_path, "r") as f:
            manifest = json.load(f)
        return manifest["builds"][0]["artifact_id"]
    except (OSError, ValueError, LookupError, TypeError):
        return None
    finally:
        try:
            os.remove(manifest_path)
        except FileNotFoundError:
            pass


def main(argv=None):
    """Build the base image and the requested tensorflow images.

    Args:
        argv: Command line arguments; defaults to sys.argv[1:].

    Exits with packer's return code when the base image build fails, and
    with status 1 when the tf_versions are invalid or any tensorflow image
    failed to build or push.
    """
    # The packer template directory and the relative paths below are
    # resolved from the directory containing this script.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    args = _build_parser().parse_args(argv)

    all_tf_versions = get_tf_versions(TF_VERSIONS_FILE)
    if all_tf_versions is None:
        # get_tf_versions already printed the parse error.
        sys.exit(1)

    tf_versions = args.tf_versions
    if not tf_versions:
        tf_versions = all_tf_versions
    elif not set(tf_versions).issubset(all_tf_versions):
        print(
            f"Argument tf_versions list {tf_versions} is not a valid subset of the supported tf_versions {all_tf_versions}"
        )
        sys.exit(1)

    file_params = {
        "install_lustre": "false",
        "source_image_project_id": "irrelevant",
        "zone": "irrelevant",
        "tf_version": "overridden",
        "project_id": args.project_id,
        "slurm_version": args.slurm_version,
        "slurmgcp_version": args.slurmgcp_version,
    }
    var_file_text = dict_to_conf(file_params, delim="\n") + "\n"

    # delete=False: packer has to open the file by name after it is closed.
    with tempfile.NamedTemporaryFile(
        mode="w+t", delete=False, suffix=".pkvars.hcl"
    ) as tmp_file:
        tmp_file.write(var_file_text)
    # run() re-tokenizes its command with shlex, so quote the path.
    var_file = shlex.quote(tmp_file.name)

    failed_versions = []
    try:
        print(f"Building base_image {BASE_DOCKER_IMAGE}")
        try:
            run(
                f'packer build -var-file={var_file} -var "docker_image={BASE_DOCKER_IMAGE}" -only "base.*" .'
            )
        except subprocess.CalledProcessError as e:
            # Nothing else can be built without the base image: exit.
            print_exception(e)

        for tf_version in tf_versions:
            print(f"Build tf image {tf_version}")
            try:
                run(
                    f'packer build -var-file={var_file} -var "docker_image={BASE_DOCKER_IMAGE}" -var "tf_version={tf_version}" -only "tensorflow.*" .'
                )
            except subprocess.CalledProcessError as e:
                print_exception(e, fatal=False)
                print("Skipping to next tf_version")
                failed_versions.append(tf_version)
                continue

            if not args.docker_push:
                continue

            image_name = _pop_manifest_image_name(DOCKER_MANIFEST_FILE)
            if not image_name:
                print(
                    f"Error retrieving the docker image name for tf_version={tf_version}"
                )
                failed_versions.append(tf_version)
                continue

            print("Pushing the image")
            try:
                run(f"docker push {shlex.quote(image_name)}")
            except subprocess.CalledProcessError as e:
                print_exception(e, fatal=False)
                print("Skipping to next tf_version")
                failed_versions.append(tf_version)
    finally:
        # Also runs on sys.exit(), so the var-file never outlives the script.
        os.remove(tmp_file.name)

    if failed_versions:
        # Skipped versions must still be visible to callers/automation.
        print(f"Failed tf_versions: {failed_versions}")
        sys.exit(1)


if __name__ == "__main__":
    main()