in sdks/python/apache_beam/options/pipeline_options.py [0:0]
def _add_argparse_args(cls, parser):
  """Registers Dataflow worker-pool command-line flags on ``parser``.

  Invoked by the pipeline-options framework so this options class can
  contribute its arguments. Almost every flag defaults to None, meaning
  "unset" — the Dataflow service then picks its own default; the one
  exception is --max_cache_memory_usage_mb, which defaults to 0
  (cache disabled).

  Args:
    parser: an argparse-compatible parser to add the arguments to.
  """
  # --- Worker count and autoscaling -----------------------------------
  parser.add_argument(
      '--num_workers',
      type=int,
      default=None,
      help=(
          'Number of workers to use when executing the Dataflow job. If not '
          'set, the Dataflow service will use a reasonable default.'))
  parser.add_argument(
      '--max_num_workers',
      type=int,
      default=None,
      help=(
          'Maximum number of workers to use when executing the Dataflow job.'
      ))
  parser.add_argument(
      '--autoscaling_algorithm',
      type=str,
      choices=['NONE', 'THROUGHPUT_BASED'],
      default=None,  # Meaning unset, distinct from 'NONE' meaning don't scale
      help=('If and how to autoscale the workerpool.'))
  # --- Worker VM shape (machine type and disks) -----------------------
  # Two spellings are accepted for the flag; both land in dest='machine_type'.
  parser.add_argument(
      '--worker_machine_type',
      '--machine_type',
      dest='machine_type',
      default=None,
      help=(
          'Machine type to create Dataflow worker VMs as. See '
          'https://cloud.google.com/compute/docs/machine-types '
          'for a list of valid options. If not set, '
          'the Dataflow service will choose a reasonable '
          'default.'))
  parser.add_argument(
      '--disk_size_gb',
      type=int,
      default=None,
      help=(
          'Remote worker disk size, in gigabytes, or 0 to use the default '
          'size. If not set, the Dataflow service will use a reasonable '
          'default.'))
  # Same aliasing pattern as machine_type: both spellings map to 'disk_type'.
  parser.add_argument(
      '--worker_disk_type',
      '--disk_type',
      dest='disk_type',
      default=None,
      help=('Specifies what type of persistent disk should be used.'))
  # --- Worker placement (region / zone) -------------------------------
  # worker_region and worker_zone are mutually exclusive; --zone is the
  # deprecated predecessor of worker_zone (see help texts below).
  parser.add_argument(
      '--worker_region',
      default=None,
      help=(
          'The Compute Engine region (https://cloud.google.com/compute/docs/'
          'regions-zones/regions-zones) in which worker processing should '
          'occur, e.g. "us-west1". Mutually exclusive with worker_zone. If '
          'neither worker_region nor worker_zone is specified, default to '
          'same value as --region.'))
  parser.add_argument(
      '--worker_zone',
      default=None,
      help=(
          'The Compute Engine zone (https://cloud.google.com/compute/docs/'
          'regions-zones/regions-zones) in which worker processing should '
          'occur, e.g. "us-west1-a". Mutually exclusive with worker_region. '
          'If neither worker_region nor worker_zone is specified, the '
          'Dataflow service will choose a zone in --region based on '
          'available capacity.'))
  parser.add_argument(
      '--zone',
      default=None,
      help=(
          'GCE availability zone for launching workers. Default is up to the '
          'Dataflow service. This flag is deprecated, and will be replaced '
          'by worker_zone.'))
  # --- Worker networking ----------------------------------------------
  parser.add_argument(
      '--network',
      default=None,
      help=(
          'GCE network for launching workers. Default is up to the Dataflow '
          'service.'))
  parser.add_argument(
      '--subnetwork',
      default=None,
      help=(
          'GCE subnetwork for launching workers. Default is up to the '
          'Dataflow service. Expected format is '
          'regions/REGION/subnetworks/SUBNETWORK or the fully qualified '
          'subnetwork name. For more information, see '
          'https://cloud.google.com/compute/docs/vpc/'))
  # --- Container images -----------------------------------------------
  # worker_harness_container_image is deprecated in favor of
  # sdk_container_image (see help texts).
  parser.add_argument(
      '--worker_harness_container_image',
      default=None,
      help=(
          'Docker registry location of container image to use for the '
          'worker harness. If not set, an appropriate approved Google Cloud '
          'Dataflow image will be used based on the version of the '
          'SDK. Note: This flag is deprecated and only supports '
          'approved Google Cloud Dataflow container images. To provide a '
          'custom container image, use sdk_container_image instead.'))
  parser.add_argument(
      '--sdk_container_image',
      default=None,
      help=(
          'Docker registry location of container image to use for the '
          'worker harness. If not set, an appropriate approved Google Cloud '
          'Dataflow image will be used based on the version of the '
          'SDK. If set for a non-portable pipeline, only official '
          'Google Cloud Dataflow container images may be used here.'))
  # action='append' with default=None: argparse starts a fresh list on the
  # first occurrence, so repeated flags accumulate override entries.
  parser.add_argument(
      '--sdk_harness_container_image_overrides',
      action='append',
      default=None,
      help=(
          'Overrides for SDK harness container images. Could be for the '
          'local SDK or for a remote SDK that pipeline has to support due '
          'to a cross-language transform. Each entry consist of two values '
          'separated by a comma where first value gives a regex to '
          'identify the container image to override and the second value '
          'gives the replacement container image.'))
  # --- SDK harness logging --------------------------------------------
  parser.add_argument(
      '--default_sdk_harness_log_level',
      default=None,
      help=(
          'Controls the default log level of all loggers without a log level '
          'override. Values can be either a labeled level or a number '
          '(See https://docs.python.org/3/library/logging.html#levels). '
          'Default log level is INFO.'))
  # Value is parsed as JSON into a dict; _DictUnionAction (defined elsewhere
  # in this module) presumably merges dicts across repeated occurrences of
  # the flag — verify against its definition.
  parser.add_argument(
      '--sdk_harness_log_level_overrides',
      type=json.loads,
      action=_DictUnionAction,
      default=None,
      help=(
          'Controls the log levels for specifically named loggers. The '
          'expected format is a json string: \'{"module":"log_level",...}\'. '
          'For example, by specifying the value \'{"a.b.c":"DEBUG"}\', '
          'the logger underneath the module "a.b.c" will be configured to '
          'output logs at the DEBUG level. Similarly, by specifying the '
          'value \'{"a.b.c":"WARNING"}\' all loggers underneath the "a.b.c" '
          'module will be configured to output logs at the WARNING level. '
          'Also, note that when multiple overrides are specified, the exact '
          'name followed by the closest parent takes precedence.'))
  # --- Public vs. private worker IPs ----------------------------------
  # Tri-state flag pair sharing dest='use_public_ips':
  #   unset -> None, --use_public_ips -> True, --no_use_public_ips -> False.
  parser.add_argument(
      '--use_public_ips',
      default=None,
      action='store_true',
      help='Whether to assign public IP addresses to the worker VMs.')
  parser.add_argument(
      '--no_use_public_ips',
      dest='use_public_ips',
      default=None,
      action='store_false',
      help='Whether to assign only private IP addresses to the worker VMs.')
  # --- Misc worker settings -------------------------------------------
  # No explicit default here; argparse supplies None when the flag is unset.
  parser.add_argument(
      '--min_cpu_platform',
      dest='min_cpu_platform',
      type=str,
      help='GCE minimum CPU platform. Default is determined by GCP.')
  # Default of 0 disables the cache (per help text below).
  parser.add_argument(
      '--max_cache_memory_usage_mb',
      dest='max_cache_memory_usage_mb',
      type=int,
      default=0,
      help=(
          'Size of the SDK Harness cache to store user state and side '
          'inputs in MB. The cache is disabled by default. Increasing '
          'cache size might improve performance of some pipelines, such as '
          'pipelines that use iterable side input views, but can '
          'lead to an increase in memory consumption and OOM errors if '
          'workers are not appropriately provisioned. '
          'Using the cache might decrease performance pipelines using '
          'materialized side inputs. '
          'If the cache is full, least '
          'recently used elements will be evicted. This cache is per '
          'each SDK Harness instance. SDK Harness is a component '
          'responsible for executing the user code and communicating with '
          'the runner. Depending on the runner, there may be more than one '
          'SDK Harness process running on the same worker node.'))