pai/modelscope/estimator.py (103 lines of code) (raw):
# Copyright 2023 Alibaba, Inc. or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional
from ..api.image import ImageLabel
from ..common.logging import get_logger
from ..common.utils import to_semantic_version
from ..estimator import Estimator
from ..session import Session
logger = get_logger(__name__)
class ModelScopeEstimator(Estimator):
"""Handle training of custom ModelScope model.
The ModelScope Estimator is optimized to run a ModelScope training
script in the PAI Training Service with a specific image.
Example::
est = ModelScopeEstimator(
source_dir="./train/src/",
command="python train.py",
modelscope_version = 'latest',
instance_type="ecs.c6.xlarge",
)
est.fit()
print(est.model_data())
"""
def __init__(
self,
command: str,
source_dir: Optional[str] = None,
git_config: Optional[Dict[str, str]] = None,
image_uri: Optional[str] = None,
modelscope_version: Optional[str] = None,
hyperparameters: Optional[Dict[str, Any]] = None,
base_job_name: Optional[str] = None,
checkpoints_path: Optional[str] = None,
output_path: Optional[str] = None,
instance_type: Optional[str] = None,
instance_count: int = 1,
session: Optional[Session] = None,
**kwargs,
):
"""Initialize a ModelScope Estimator.
Args:
command (str): The command used to run the training job.
source_dir (str, optional): The local source code directory used in the
training job. The directory will be packaged and uploaded to an OSS
bucket, then downloaded to the `/ml/usercode` directory in the training
job container. If there is a `requirements.txt` file in the source code
directory, the corresponding dependencies will be installed before the
training script runs.
If 'git_config' is provided, 'source_dir' should be a relative location
to a directory in the Git repo. With the following GitHub repo directory
structure:
.. code::
|----- README.md
|----- src
|----- train.py
|----- test.py
if you need 'src' directory as the source code directory, you can assign
source_dir='./src/'.
git_config (Dict[str, str]): Git configuration used to clone the repo.
Including ``repo``, ``branch``, ``commit``, ``username``, ``password``
and ``token``. The ``repo`` is required. All other fields are optional.
``repo`` specifies the Git repository. If you don't provide ``branch``,
the default value 'master' is used. If you don't provide ``commit``, the
latest commit in the specified branch is used. ``username``, ``password``
and ``token`` are for authentication purpose.
For example, the following config:
.. code:: python
git_config = {
'repo': 'https://github.com/modelscope/modelscope.git',
'branch': 'master',
'commit': '9bfc4a9d83c4beaf8378d0a186261ffc1cd9f960'
}
results in cloning the git repo specified in 'repo', then checking out
the 'master' branch, and checking out the specified commit.
image_uri (str, optional): If specified, the estimator will use this image
in the training job, instead of selecting the appropriate PAI official
image based on modelscope_version. It can be an image provided by PAI
or a user customized image. To view the images provided by PAI, please
refer to the document:
https://help.aliyun.com/document_detail/202834.htm.
If ``modelscope_version`` is ``None``, then ``image_uri`` is required.
If also ``None``, then a ``ValueError`` will be raised.
modelscope_version (str, optional): Modelscope version you want to use for
executing your model training code. Defaults to ``None``. Required unless
``image_uri`` is provided.
hyperparameters (dict, optional): A dictionary that represents the
hyperparameters used in the training job. The hyperparameters will be
stored in the `/ml/input/config/hyperparameters.json` as a JSON
dictionary in the training container.
base_job_name (str, optional): The base name used to generate the training
job name.
checkpoints_path (str, optional): An OSS URI that stores the checkpoint of the
training job. If provided, the OSS URI will be mounted to the directory
`/ml/output/checkpoints/`.
output_path (str, optional): An OSS URI to store the outputs of the training
jobs. If not provided, an OSS URI will be generated using the default
OSS bucket in the session. When the `estimator.fit` method is called,
a specific OSS URI under the output_path for each channel is generated
and mounted to the training container.
A completed training container directory structure example::
/ml
|-- usercode // User source code directory.
| |-- requirements.txt
| `-- train.py
|-- input // TrainingJob input
| `-- config
| |-- hyperparameters.json // Hyperparameters in JSON
| | // dictionary format for the
| | // TrainingJob
| |
| `-- data // TrainingJob input channels
| | // `/ml/input/data/` is a input
| | // channel, and the directory
| | // name is the channel name.
| | // Each directory under the
| |-- test-data
| | `-- test.csv
| `-- train-data
| `-- train.csv
`-- output // TrainingJob output channels.
| // Each directory under the
| // `/ml/output/` is an output
| // channel, and the directory
| // name is the channel name.
`-- model
`-- checkpoints
instance_type (str): The machine instance type used to run the training job.
To view the supported machine instance types, please refer to the
document:
https://help.aliyun.com/document_detail/171758.htm#section-55y-4tq-84y.
If the instance_type is "local", the training job is executed locally
using docker.
instance_count (int): The number of machines used to run the training job.
session (:class:`pai.session.Session`, optional): A pai session object manages
interactions with PAI REST API.
**kwargs: Additional kwargs passed to the :class:`~pai.estimator.Estimator`
constructor.
.. tip::
You can find additional parameters for initializing this class at
:class:`~pai.estimator.Estimator`.
"""
self._validate_image_uri(
image_uri=image_uri, modelscope_version=modelscope_version
)
self.image_uri = image_uri
self.modelscope_version = modelscope_version
super(ModelScopeEstimator, self).__init__(
image_uri=self.image_uri,
command=command,
source_dir=source_dir,
git_config=git_config,
hyperparameters=hyperparameters,
base_job_name=base_job_name,
checkpoints_path=checkpoints_path,
output_path=output_path,
instance_type=instance_type,
instance_count=instance_count,
session=session,
**kwargs,
)
# Check image_uri and modelscope_version
self.training_image_uri()
def _validate_image_uri(self, image_uri: str, modelscope_version: str) -> None:
"""Check if image_uri or modelscope_version arguments are specified."""
if not image_uri and not modelscope_version:
raise ValueError(
"modelscope_version, and image_uri are both None. "
"Specify either modelscope_version or image_uri."
)
def training_image_uri(self) -> str:
"""Return the Docker image to use for training.
The :meth:`~pai.estimator.Estimator.fit` method, which does the model training,
calls this method to find the image to use for model training.
Returns:
str: The URI of the Docker image.
"""
if self.image_uri:
return self.image_uri
labels = [
ImageLabel.OFFICIAL_LABEL,
ImageLabel.DSW_LABEL,
ImageLabel.DEVICE_TYPE_GPU,
]
# Filter images by ModelScope version
if self.modelscope_version == "latest":
latest_version = self._get_latest_ms_version_for_training()
labels.append(ImageLabel.framework_version("ModelScope", latest_version))
else:
labels.append(
ImageLabel.framework_version("ModelScope", self.modelscope_version)
)
resp = self.session.image_api.list(
labels=labels,
workspace_id=0,
verbose=True,
)
if resp.total_count == 0:
raise ValueError(
"No official image found for modelscope version:"
f" {self.modelscope_version}. Currently supported versions are:"
f" {self._get_supported_ms_versions_for_training()}"
)
image = resp.items[0]["ImageUri"]
return image
def _get_supported_ms_versions_for_training(self) -> List[str]:
"""Return the list of supported ModelScope versions for training."""
label_keys = "system.framework.ModelScope"
label_filter = [
ImageLabel.OFFICIAL_LABEL,
ImageLabel.DSW_LABEL,
ImageLabel.DEVICE_TYPE_GPU,
ImageLabel.framework_version("ModelScope", "*"),
]
list_image_labels = self.session.image_api.list_labels(
label_keys=label_keys,
label_filter=label_filter,
workspace_id=0,
)
res = []
for label in list_image_labels:
if label["Value"] not in res:
res.append(label["Value"])
res.sort(key=lambda x: to_semantic_version(x))
return res
def _get_latest_ms_version_for_training(self) -> str:
"""Return the latest ModelScope version for training."""
res = self._get_supported_ms_versions_for_training()
return max(
res,
key=lambda x: to_semantic_version(x),
)