in pai/job/_local_training_job.py [0:0]
def run(self):
    """Run estimator job in local with docker.

    Stages the user code and the job configuration files under
    ``self.tmp_dir``, builds the host-path -> container-path volume mapping,
    and hands everything to ``run_container``. The value returned by
    ``run_container`` is stored on ``self._container_run``.

    Raises:
        RuntimeError: If ``self.estimator.source_dir`` is an OSS URI, which
            is not supported for local training.
    """
    output_model_path = self.output_path()
    os.makedirs(output_model_path, exist_ok=True)
    volumes = {}

    # Use one staging directory for both the user code and the config files.
    # Creating an additional ``tempfile.mkdtemp()`` here would scatter the
    # job's files over two unrelated directories and leave a directory
    # behind on every call, since nothing in this method removes it.
    staging_dir = self.tmp_dir

    # 1. Prepare source code to directory /ml/usercode
    if is_oss_uri(self.estimator.source_dir):
        raise RuntimeError("OSS source code is not supported in local training.")
    user_code_dir = os.path.join(staging_dir, "user_code")
    shutil.copytree(self.estimator.source_dir, user_code_dir)
    volumes[user_code_dir] = {
        "bind": _TrainingJobConfig.WORKING_DIR,
        "mode": "rw",
    }

    # 2. Prepare input data for training job.
    input_data = self.prepare_input_data()
    for host_path, container_path in input_data.items():
        volumes[host_path] = {
            "bind": container_path,
            "mode": "rw",
        }

    # 3. Prepare input config files, such as hyperparameters.json,
    # training-job.json, etc.
    input_config_path = os.path.join(staging_dir, "config")
    os.makedirs(input_config_path, exist_ok=True)
    self.prepare_input_config(input_config_path=input_config_path)
    volumes[input_config_path] = {
        "bind": _TrainingJobConfig.INPUT_CONFIG_DIR,
        "mode": "rw",
    }

    # The user command lives in <config>/execution/command.sh; because the
    # whole config directory is mounted, it is visible in the container at
    # INPUT_CONFIG_DIR/execution/command.sh.
    execution_dir = os.path.join(input_config_path, "execution")
    os.makedirs(execution_dir, exist_ok=True)
    command_path = os.path.join(execution_dir, "command.sh")
    # Both scripts are run by /bin/sh inside the container, so write them as
    # UTF-8 with LF line endings regardless of the host platform's defaults.
    with open(command_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(self.estimator.command)

    # launch.sh is the container entry point; the template receives the
    # in-container path of command.sh.
    launch_script_path = os.path.join(input_config_path, "launch.sh")
    container_command_path = posixpath.join(
        _TrainingJobConfig.INPUT_CONFIG_DIR, "execution/command.sh"
    )
    with open(launch_script_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(_TRAINING_LAUNCH_SCRIPT_TEMPLATE.format(container_command_path))

    # 4. Config output model channel
    volumes[output_model_path] = {
        "bind": posixpath.join(_TrainingJobConfig.OUTPUT_DIR, "model"),
        "mode": "rw",
    }

    # gpu_count is -1 for the local GPU instance type and None otherwise;
    # the interpretation of these values is up to run_container.
    gpu_count = (
        -1 if self.instance_type.strip() == INSTANCE_TYPE_LOCAL_GPU else None
    )
    self._container_run = run_container(
        environment_variables=self.prepare_env(),
        image_uri=self.estimator.image_uri,
        entry_point=[
            "/bin/sh",
            posixpath.join(_TrainingJobConfig.INPUT_CONFIG_DIR, "launch.sh"),
        ],
        volumes=volumes,
        working_dir=_TrainingJobConfig.WORKING_DIR,
        gpu_count=gpu_count,
    )