core/src/autogluon/core/models/ensemble/bagged_ensemble_model.py
def _fit_folds(self,
               X,
               y,
               model_base,
               X_pseudo=None,
               y_pseudo=None,
               k_fold=None,
               k_fold_start=0,
               k_fold_end=None,
               n_repeats=1,
               n_repeat_start=0,
               time_limit=None,
               sample_weight=None,
               save_folds=True,
               groups=None,
               **kwargs):
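    # Resolve the fold fitting strategy: 'auto' defers to the model's default,
    # and models flagged as unable to fit in parallel fall back to sequential fitting.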
    fold_fitting_strategy = self.params.get('fold_fitting_strategy', 'auto')
    if fold_fitting_strategy == 'auto':
        fold_fitting_strategy = self._get_default_fold_fitting_strategy()
    num_folds_parallel = self.params.get('num_folds_parallel', 'auto')
    disable_parallel_fitting = self.params.get('_disable_parallel_fitting', False)
    if fold_fitting_strategy == 'parallel_local':
        if disable_parallel_fitting:
            fold_fitting_strategy = SequentialLocalFoldFittingStrategy
            logger.log(20, f'{model_base.__class__.__name__} does not support parallel folding yet. Will use sequential folding instead.')
        else:
            fold_fitting_strategy = ParallelLocalFoldFittingStrategy
    elif fold_fitting_strategy == 'sequential_local':
        fold_fitting_strategy = SequentialLocalFoldFittingStrategy
    else:
        raise ValueError(
            f'{fold_fitting_strategy} is not a valid option for fold_fitting_strategy. '
            'Valid options are: parallel_local and sequential_local'
        )
    # TODO: Preprocess data here instead of repeatedly
    # FIXME: Raise exception if multiclass/binary and a single val fold contains all instances of a class. (Can happen if custom groups is specified)
    time_start = time.time()
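    # Choose the CV splitter. When resuming a partially fit repeat (k_fold_start != 0),
    # reuse the splitter saved for that repeat so fold boundaries stay consistent.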
    if k_fold_start != 0:
        cv_splitter = self._cv_splitters[n_repeat_start]
    else:
        cv_splitter = self._get_cv_splitter(n_splits=k_fold, n_repeats=n_repeats, groups=groups)
    if k_fold != cv_splitter.n_splits:
        k_fold = cv_splitter.n_splits
    if k_fold_end is None:
        k_fold_end = k_fold
    if cv_splitter.n_repeats < n_repeats:
        # If the current cv_splitter doesn't have enough repeats for the request, create a new one.
        cv_splitter = self._get_cv_splitter(n_splits=k_fold, n_repeats=n_repeats, groups=groups)
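    # Build one fold context per (repeat, fold) pair in the requested range.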
    fold_fit_args_list, n_repeats_started, n_repeats_finished = self._generate_fold_configs(
        X=X,
        y=y,
        cv_splitter=cv_splitter,
        k_fold_start=k_fold_start,
        k_fold_end=k_fold_end,
        n_repeat_start=n_repeat_start,
        n_repeat_end=n_repeats,
    )
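    # Wrap each fold context so it can be passed as keyword arguments to schedule_fold_model_fit below.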
    fold_fit_args_list = [dict(fold_ctx=fold_ctx) for fold_ctx in fold_fit_args_list]
    logger.log(20, f'\tFitting {len(fold_fit_args_list)} child models '
                   f'({fold_fit_args_list[0]["fold_ctx"]["model_name_suffix"]} - {fold_fit_args_list[-1]["fold_ctx"]["model_name_suffix"]})')
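    # Allocate empty out-of-fold prediction buffers; each fold fills in the rows
    # of its validation split as it finishes.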
    oof_pred_proba, oof_pred_model_repeats = self._construct_empty_oof(X=X, y=y)
    models = []
    if num_folds_parallel == 'auto':
        num_folds_parallel = len(fold_fit_args_list)
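    # Arguments shared by both strategies. `models` and the OOF buffers are mutated
    # in place by the strategy as individual folds finish.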
    fold_fitting_strategy_args = dict(
        model_base=model_base, model_base_kwargs=kwargs,
        bagged_ensemble_model=self, X=X, y=y, X_pseudo=X_pseudo, y_pseudo=y_pseudo, sample_weight=sample_weight,
        time_limit=time_limit, time_start=time_start, models=models,
        oof_pred_proba=oof_pred_proba, oof_pred_model_repeats=oof_pred_model_repeats,
        save_folds=save_folds
    )
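    # Only the parallel strategy accepts num_folds_parallel; instantiate whichever strategy was selected.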
    # noinspection PyCallingNonCallable
    if fold_fitting_strategy == ParallelLocalFoldFittingStrategy:
        fold_fitting_strategy_args['num_folds_parallel'] = num_folds_parallel
    fold_fitting_strategy = fold_fitting_strategy(**fold_fitting_strategy_args)
    if isinstance(fold_fitting_strategy, ParallelLocalFoldFittingStrategy) and not fold_fitting_strategy.is_mem_sufficient(num_folds_parallel):
        # If memory is insufficient, fall back to the sequential fold fitting strategy.
        fold_fitting_strategy_args.pop('num_folds_parallel', None)
        fold_fitting_strategy: AbstractFoldFittingStrategy = SequentialLocalFoldFittingStrategy(**fold_fitting_strategy_args)
        logger.log(20, f'Not enough memory to fit {model_base.__class__.__name__} folds in parallel. Will fit folds sequentially instead.')
        logger.log(20, 'Consider decreasing the number of folds trained in parallel by passing num_folds_parallel to ag_args_ensemble when calling tabular.fit')
    else:
        logger.log(20, f'{fold_fitting_strategy.__class__.__name__} is used to fit folds')
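    # Schedule every fold; after_all_folds_scheduled() returns once all scheduled
    # folds have been fit (the `models` list is consumed immediately below).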
    # noinspection PyCallingNonCallable
    for fold_fit_args in fold_fit_args_list:
        fold_fitting_strategy.schedule_fold_model_fit(**fold_fit_args)
    fold_fitting_strategy.after_all_folds_scheduled()
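    # Record results: accumulate OOF predictions and bagging bookkeeping across repeats.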
    self.models += models
    self._bagged_mode = True
    if self._oof_pred_proba is None:
        self._oof_pred_proba = oof_pred_proba
        self._oof_pred_model_repeats = oof_pred_model_repeats
    else:
        self._oof_pred_proba += oof_pred_proba
        self._oof_pred_model_repeats += oof_pred_model_repeats
    self._cv_splitters += [cv_splitter for _ in range(n_repeats_started)]
    self._k_per_n_repeat += [k_fold for _ in range(n_repeats_finished)]
    self._n_repeats = n_repeats
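    # k_fold == k_fold_end means the final repeat completed all of its folds;
    # otherwise the last repeat is only partially fit and can be resumed later.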
    if k_fold == k_fold_end:
        self._k = None
        self._k_fold_end = 0
        self._n_repeats_finished = self._n_repeats
    else:
        self._k = k_fold
        self._k_fold_end = k_fold_end
        self._n_repeats_finished = self._n_repeats - 1