notebooks/utils.py [56:147]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    display_name: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
    image_uri: str,
    run_mode: str,
    gin_files: List[str],
    model_dir: str,
    gin_search_paths: Optional[List[str]] = None,
    tfds_data_dir: Optional[str] = None,
    replica_count: int = 1,
    gin_overwrites: Optional[List[str]] = None,
    base_output_dir: Optional[str] = None,
) -> CustomJob:
    """Creates a Vertex AI custom T5X training job.
    It copies the configuration files (.gin) to GCS, creates a worker_pool_spec 
    and returns an aiplatform.CustomJob.

    Args:
        display_name (str):
            Required. User defined display name for the Vertex AI custom T5X job.
        machine_type (str):
            Required. The type of machine for running the custom training job on
            dedicated resources. For TPUs, use `cloud-tpu`.
        accelerator_type (str):
            Required. The type of accelerator(s) that may be attached
            to the machine as per `accelerator_count`. Only used if
            `machine_type` is set. Options: `TPU_V2` or `TPU_V3`.
        accelerator_count (int):
            Required. The number of accelerators to attach to the `machine_type`. 
            Only used if `machine_type` is set. For TPUs, this is the number of
            cores to be provisioned.
            Example: 8, 128, 512, etc.
        image_uri (str):
            Required. Full image path to be used as the execution environment of the 
            custom T5X training job.
            Example:
                'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'
        run_mode (str):
            Required. The mode to run T5X under. Options: `train`, `eval`, `infer`.
        gin_files (List[str]):
            Required. Full paths to gin configuration files on the local filesystem.
            Multiple paths may be passed and will be imported in the given 
            order, with later configurations overriding earlier ones.
        gin_search_paths (List[str]):
            List of gin config path prefixes to be prepended to gin suffixes in gin includes and gin_files
        model_dir (str):
            Required. Path on Google Cloud Storage to store all the artifacts generated
            by the custom T5X training job. The path must be in this format:
            `gs://{bucket name}/{your folder}/...`.
            Example:
                gs://my_bucket/experiments/model1/
        tfds_data_dir (Optional[str] = None):
            Optional. If set, this directory will be used to store datasets prepared by 
            TensorFlow Datasets that are not available in the public TFDS GCS 
            bucket. Note that this flag overrides the `tfds_data_dir` attribute of 
            all `Task`s. This path must be a valid GCS path.
            Example:
                gs://my_bucket/datasets/my_dataset/
        replica_count (int = 1):
            Optional. The number of worker replicas. If replica count = 1 then one chief
            replica will be provisioned. For TPUs this must be set to 1.
        gin_overwrites (Optional[List[str]] = None):
            Optional. List of arguments to override gin configurations. String values
            must be enclosed in (escaped) double quotes, as in the example below.
            Example:
                --gin.TRAIN_PATH=\"gs://my_bucket/folder\"
        base_output_dir (Optional[str] = None):
            Optional. GCS base output directory for the job (presumably forwarded to
            the Vertex AI CustomJob `base_output_dir` argument — its use is outside
            this excerpt; confirm against the full source).

    Returns:
        (aiplatform.CustomJob):
            Return an instance of a Vertex AI training CustomJob.
    """

    local_fs = fsspec.filesystem('file')
    gcs_fs = gcsfs.GCSFileSystem()

    # Check if gin files exists
    if not gin_files or not all([local_fs.isfile(f) for f in gin_files]):
        raise FileNotFoundError(
            'Provide a list of valid gin files.'
        )

    # Try to copy files to GCS bucket
    try:
        gcs_gin_files = []
        for gin_file in gin_files:
            gcs_path = os.path.join(model_dir, gin_file.split(sep='/')[-1])
            gcs_fs.put(gin_file, gcs_path)
            gcs_gin_files.append(gcs_path.replace('gs://', '/gcs/'))
    except:
        raise RuntimeError('Could not copy gin files to GCS.')
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



scripts/run.py [61:150]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    display_name: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
    image_uri: str,
    run_mode: str,
    gin_files: List[str],
    model_dir: str,
    gin_search_paths: Optional[List[str]] = None,
    tfds_data_dir: Optional[str] = None,
    replica_count: int = 1,
    gin_overwrites: Optional[List[str]] = None,
    base_output_dir: Optional[str] = None,
) -> CustomJob:
    """Creates a Vertex AI custom T5X training job.
    It copies the configuration files (.gin) to GCS, creates a worker_pool_spec 
    and returns an aiplatform.CustomJob.
    Args:
        display_name (str):
            Required. User defined display name for the Vertex AI custom T5X job.
        machine_type (str):
            Required. The type of machine for running the custom training job on
            dedicated resources. For TPUs, use `cloud-tpu`.
        accelerator_type (str):
            Required. The type of accelerator(s) that may be attached
            to the machine as per `accelerator_count`. Only used if
            `machine_type` is set. Options: `TPU_V2` or `TPU_V3`.
        accelerator_count (int):
            Required. The number of accelerators to attach to the `machine_type`. 
            Only used if `machine_type` is set. For TPUs, this is the number of
            cores to be provisioned.
            Example: 8, 128, 512, etc.
        image_uri (str):
            Required. Full image path to be used as the execution environment of the 
            custom T5X training job.
            Example:
                'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'
        run_mode (str):
            Required. The mode to run T5X under. Options: `train`, `eval`, `infer`.
        gin_files (List[str]):
            Required. Full paths to gin configuration files on the local filesystem.
            Multiple paths may be passed and will be imported in the given 
            order, with later configurations overriding earlier ones.
        gin_search_paths (List[str]):
            List of gin config path prefixes to be prepended to gin suffixes in gin includes and gin_files
        model_dir (str):
            Required. Path on Google Cloud Storage to store all the artifacts generated
            by the custom T5X training job. The path must be in this format:
            `gs://{bucket name}/{your folder}/...`.
            Example:
                gs://my_bucket/experiments/model1/
        tfds_data_dir (Optional[str] = None):
            Optional. If set, this directory will be used to store datasets prepared by 
            TensorFlow Datasets that are not available in the public TFDS GCS 
            bucket. Note that this flag overrides the `tfds_data_dir` attribute of 
            all `Task`s. This path must be a valid GCS path.
            Example:
                gs://my_bucket/datasets/my_dataset/
        replica_count (int = 1):
            Optional. The number of worker replicas. If replica count = 1 then one chief
            replica will be provisioned. For TPUs this must be set to 1.
        gin_overwrites (Optional[List[str]] = None):
            Optional. List of arguments to override gin configurations. String values
            must be enclosed in (escaped) double quotes, as in the example below.
            Example:
                --gin.TRAIN_PATH=\"gs://my_bucket/folder\"
        base_output_dir (Optional[str] = None):
            Optional. GCS base output directory for the job (presumably forwarded to
            the Vertex AI CustomJob `base_output_dir` argument — its use is outside
            this excerpt; confirm against the full source).
    Returns:
        (aiplatform.CustomJob):
            Return an instance of a Vertex AI training CustomJob.
    """

    local_fs = fsspec.filesystem('file')
    gcs_fs = gcsfs.GCSFileSystem()

    # Check if gin files exists
    if not gin_files or not all([local_fs.isfile(f) for f in gin_files]):
        raise FileNotFoundError(
            'Provide a list of valid gin files.'
        )

    # Try to copy files to GCS bucket
    try:
        gcs_gin_files = []
        for gin_file in gin_files:
            gcs_path = os.path.join(model_dir, gin_file.split(sep='/')[-1])
            gcs_fs.put(gin_file, gcs_path)
            gcs_gin_files.append(gcs_path.replace('gs://', '/gcs/'))
    except:
        raise RuntimeError('Could not copy gin files to GCS.')
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



