horovod/runner/common/util/config_parser.py (131 lines of code) (raw):

# Copyright 2020 Uber Technologies, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import logging

# Parameter knobs
HOROVOD_FUSION_THRESHOLD = 'HOROVOD_FUSION_THRESHOLD'
HOROVOD_CYCLE_TIME = 'HOROVOD_CYCLE_TIME'
HOROVOD_CACHE_CAPACITY = 'HOROVOD_CACHE_CAPACITY'
HOROVOD_HIERARCHICAL_ALLREDUCE = 'HOROVOD_HIERARCHICAL_ALLREDUCE'
HOROVOD_HIERARCHICAL_ALLGATHER = 'HOROVOD_HIERARCHICAL_ALLGATHER'

# Autotune knobs
HOROVOD_AUTOTUNE = 'HOROVOD_AUTOTUNE'
HOROVOD_AUTOTUNE_LOG = 'HOROVOD_AUTOTUNE_LOG'
HOROVOD_AUTOTUNE_WARMUP_SAMPLES = 'HOROVOD_AUTOTUNE_WARMUP_SAMPLES'
HOROVOD_AUTOTUNE_STEPS_PER_SAMPLE = 'HOROVOD_AUTOTUNE_STEPS_PER_SAMPLE'
HOROVOD_AUTOTUNE_BAYES_OPT_MAX_SAMPLES = 'HOROVOD_AUTOTUNE_BAYES_OPT_MAX_SAMPLES'
HOROVOD_AUTOTUNE_GAUSSIAN_PROCESS_NOISE = 'HOROVOD_AUTOTUNE_GAUSSIAN_PROCESS_NOISE'

# Timeline knobs
HOROVOD_TIMELINE = 'HOROVOD_TIMELINE'
HOROVOD_TIMELINE_MARK_CYCLES = 'HOROVOD_TIMELINE_MARK_CYCLES'

# Stall check knobs
HOROVOD_STALL_CHECK_DISABLE = 'HOROVOD_STALL_CHECK_DISABLE'
HOROVOD_STALL_CHECK_TIME_SECONDS = 'HOROVOD_STALL_CHECK_TIME_SECONDS'
HOROVOD_STALL_SHUTDOWN_TIME_SECONDS = 'HOROVOD_STALL_SHUTDOWN_TIME_SECONDS'

# Library options knobs
HOROVOD_MPI_THREADS_DISABLE = 'HOROVOD_MPI_THREADS_DISABLE'
HOROVOD_NUM_NCCL_STREAMS = 'HOROVOD_NUM_NCCL_STREAMS'
NCCL_IB_DISABLE = 'NCCL_IB_DISABLE'
HOROVOD_CCL_BGT_AFFINITY = 'HOROVOD_CCL_BGT_AFFINITY'
HOROVOD_GLOO_TIMEOUT_SECONDS = 'HOROVOD_GLOO_TIMEOUT_SECONDS'

# Logging knobs
HOROVOD_LOG_LEVEL = 'HOROVOD_LOG_LEVEL'
HOROVOD_LOG_HIDE_TIME = 'HOROVOD_LOG_HIDE_TIME'
LOG_LEVELS = ['TRACE', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'FATAL']


def _set_arg_from_config(args, arg_base_name, override_args, config, arg_prefix=''):
    """Copy one value from a config section onto ``args``.

    The attribute written is ``arg_prefix + arg_base_name``; the key read
    from ``config`` is ``arg_base_name`` alone.

    Nothing is written when the attribute name is listed in
    ``override_args`` or when the section has no value (or ``None``) for
    the key.
    """
    arg_name = arg_prefix + arg_base_name
    if arg_name in override_args:
        return

    value = config.get(arg_base_name)
    if value is not None:
        setattr(args, arg_name, value)


def set_args_from_config(args, config, override_args):
    """Populate ``args`` in place from the sections of ``config``.

    Args:
        args: Object whose attributes are set (read for ``use_gloo`` /
            ``use_mpi`` and, when overridden, left untouched).
        config: Mapping with the optional sections ``controller``,
            ``params``, ``autotune``, ``timeline``, ``stall_check``,
            ``library_options`` and ``logging``.
        override_args: Container of attribute names that must not be
            overwritten by values from ``config``.

    Raises:
        ValueError: If ``controller`` is set to something other than
            ``gloo`` or ``mpi`` (case-insensitive) while neither
            ``args.use_gloo`` nor ``args.use_mpi`` is truthy.
    """
    # Controller: only consulted when no controller flag is already set on args.
    controller = config.get('controller')
    if controller and not args.use_gloo and not args.use_mpi:
        controller_name = controller.lower()
        if controller_name == 'gloo':
            args.use_gloo = True
        elif controller_name == 'mpi':
            args.use_mpi = True
        else:
            raise ValueError('No such controller supported: {}'.format(controller))

    # Params
    params = config.get('params')
    if params:
        _set_arg_from_config(args, 'fusion_threshold_mb', override_args, params)
        _set_arg_from_config(args, 'cycle_time_ms', override_args, params)
        _set_arg_from_config(args, 'cache_capacity', override_args, params)
        _set_arg_from_config(args, 'hierarchical_allreduce', override_args, params)
        _set_arg_from_config(args, 'hierarchical_allgather', override_args, params)

    # Autotune
    autotune = config.get('autotune')
    if autotune:
        if 'autotune' not in override_args:
            args.autotune = autotune.get('enabled', False)
        _set_arg_from_config(args, 'log_file', override_args, autotune, arg_prefix='autotune_')
        _set_arg_from_config(args, 'warmup_samples', override_args, autotune, arg_prefix='autotune_')
        _set_arg_from_config(args, 'steps_per_sample', override_args, autotune, arg_prefix='autotune_')
        _set_arg_from_config(args, 'bayes_opt_max_samples', override_args, autotune, arg_prefix='autotune_')
        _set_arg_from_config(args, 'gaussian_process_noise', override_args, autotune, arg_prefix='autotune_')

    # Timeline
    timeline = config.get('timeline')
    if timeline:
        _set_arg_from_config(args, 'filename', override_args, timeline, arg_prefix='timeline_')
        _set_arg_from_config(args, 'mark_cycles', override_args, timeline, arg_prefix='timeline_')

    # Stall Check
    stall_check = config.get('stall_check')
    if stall_check:
        if 'no_stall_check' not in override_args:
            # The config expresses "enabled"; the argument is the negation.
            args.no_stall_check = not stall_check.get('enabled', True)
        _set_arg_from_config(args, 'warning_time_seconds', override_args, stall_check, arg_prefix='stall_check_')
        _set_arg_from_config(args, 'shutdown_time_seconds', override_args, stall_check, arg_prefix='stall_check_')

    # Library Options
    library_options = config.get('library_options')
    if library_options:
        _set_arg_from_config(args, 'mpi_threads_disable', override_args, library_options)
        _set_arg_from_config(args, 'num_nccl_streams', override_args, library_options)
        _set_arg_from_config(args, 'ccl_bgt_affinity', override_args, library_options)
        _set_arg_from_config(args, 'gloo_timeout_seconds', override_args, library_options)

    # Logging (named logging_config so the imported logging module is not shadowed)
    logging_config = config.get('logging')
    if logging_config:
        _set_arg_from_config(args, 'level', override_args, logging_config, arg_prefix='log_')
        _set_arg_from_config(args, 'hide_timestamp', override_args, logging_config, arg_prefix='log_')


def _validate_arg_nonnegative(args, arg_name):
    """Raise ``ValueError`` if ``args.<arg_name>`` is set and below zero.

    A value of ``None`` is treated as "not set" and passes.
    """
    value = getattr(args, arg_name)
    if value is not None and value < 0:
        raise ValueError('{}={} must be >= 0'.format(arg_name, value))


def validate_config_args(args):
    """Check the numeric settings on ``args``.

    Raises:
        ValueError: If any of the checked settings is negative, or if
            ``autotune_gaussian_process_noise`` lies outside ``[0, 1]``.
            Settings that are ``None`` are skipped.
    """
    _validate_arg_nonnegative(args, 'fusion_threshold_mb')
    _validate_arg_nonnegative(args, 'cycle_time_ms')
    _validate_arg_nonnegative(args, 'cache_capacity')
    _validate_arg_nonnegative(args, 'autotune_warmup_samples')
    _validate_arg_nonnegative(args, 'autotune_steps_per_sample')
    _validate_arg_nonnegative(args, 'autotune_bayes_opt_max_samples')

    noise = args.autotune_gaussian_process_noise
    if noise is not None and (noise < 0 or noise > 1):
        raise ValueError('{}={} must be in [0, 1]'.format('autotune_gaussian_process_noise',
                                                          args.autotune_gaussian_process_noise))

    _validate_arg_nonnegative(args, 'stall_check_warning_time_seconds')
    _validate_arg_nonnegative(args, 'stall_check_shutdown_time_seconds')
    _validate_arg_nonnegative(args, 'num_nccl_streams')
    _validate_arg_nonnegative(args, 'ccl_bgt_affinity')
    _validate_arg_nonnegative(args, 'gloo_timeout_seconds')


def _add_arg_to_env(env, env_key, arg_value, transform_fn=None):
    """Store ``arg_value`` in ``env[env_key]`` as a string.

    Nothing is stored when ``arg_value`` is ``None``. When ``transform_fn``
    is given it is applied to the value before the conversion to ``str``.
    """
    if arg_value is not None:
        value = arg_value
        if transform_fn:
            value = transform_fn(value)
        env[env_key] = str(value)


def set_env_from_args(env, args):
    """Write the settings held on ``args`` into the ``env`` mapping.

    Settings whose value is ``None`` are not written. Autotune variables
    are only written when ``args.autotune`` is truthy, and timeline
    variables only when ``args.timeline_filename`` is truthy.

    Args:
        env: Mutable mapping that receives the string values.
        args: Object providing the setting attributes.

    Returns:
        The same ``env`` object that was passed in.
    """
    def _as_int_flag(value):
        # Boolean-like settings are exported as 1 / 0.
        return 1 if value else 0

    # Params
    # The threshold attribute is named in MB; it is scaled by 1024 * 1024 for the env value.
    _add_arg_to_env(env, HOROVOD_FUSION_THRESHOLD, args.fusion_threshold_mb, lambda v: v * 1024 * 1024)
    _add_arg_to_env(env, HOROVOD_CYCLE_TIME, args.cycle_time_ms)
    _add_arg_to_env(env, HOROVOD_CACHE_CAPACITY, args.cache_capacity)
    _add_arg_to_env(env, HOROVOD_HIERARCHICAL_ALLREDUCE, args.hierarchical_allreduce, _as_int_flag)
    _add_arg_to_env(env, HOROVOD_HIERARCHICAL_ALLGATHER, args.hierarchical_allgather, _as_int_flag)

    # Autotune
    if args.autotune:
        _add_arg_to_env(env, HOROVOD_AUTOTUNE, args.autotune, _as_int_flag)
        _add_arg_to_env(env, HOROVOD_AUTOTUNE_LOG, args.autotune_log_file)
        _add_arg_to_env(env, HOROVOD_AUTOTUNE_WARMUP_SAMPLES, args.autotune_warmup_samples)
        _add_arg_to_env(env, HOROVOD_AUTOTUNE_STEPS_PER_SAMPLE, args.autotune_steps_per_sample)
        _add_arg_to_env(env, HOROVOD_AUTOTUNE_BAYES_OPT_MAX_SAMPLES, args.autotune_bayes_opt_max_samples)
        _add_arg_to_env(env, HOROVOD_AUTOTUNE_GAUSSIAN_PROCESS_NOISE, args.autotune_gaussian_process_noise)

    # Timeline
    if args.timeline_filename:
        _add_arg_to_env(env, HOROVOD_TIMELINE, args.timeline_filename)
        _add_arg_to_env(env, HOROVOD_TIMELINE_MARK_CYCLES, args.timeline_mark_cycles, _as_int_flag)

    # Stall Check
    _add_arg_to_env(env, HOROVOD_STALL_CHECK_DISABLE, args.no_stall_check, _as_int_flag)
    _add_arg_to_env(env, HOROVOD_STALL_CHECK_TIME_SECONDS, args.stall_check_warning_time_seconds)
    _add_arg_to_env(env, HOROVOD_STALL_SHUTDOWN_TIME_SECONDS, args.stall_check_shutdown_time_seconds)

    # Library Options
    _add_arg_to_env(env, HOROVOD_MPI_THREADS_DISABLE, args.mpi_threads_disable, _as_int_flag)
    _add_arg_to_env(env, HOROVOD_NUM_NCCL_STREAMS, args.num_nccl_streams)
    # NCCL_IB_DISABLE is only exported (as 1) when tcp_flag is truthy.
    _add_arg_to_env(env, NCCL_IB_DISABLE, 1 if args.tcp_flag else None)
    _add_arg_to_env(env, HOROVOD_CCL_BGT_AFFINITY, args.ccl_bgt_affinity)
    _add_arg_to_env(env, HOROVOD_GLOO_TIMEOUT_SECONDS, args.gloo_timeout_seconds)

    # Logging
    _add_arg_to_env(env, HOROVOD_LOG_LEVEL, args.log_level)
    _add_arg_to_env(env, HOROVOD_LOG_HIDE_TIME, args.log_hide_timestamp, _as_int_flag)

    return env