core/src/autogluon/core/trainer/abstract_trainer.py (74 lines):
	- line 29: # FIXME: Below is major defect!
	- line 34: # TODO: Dynamic model loading for ensemble models during prediction, only load more models if prediction is uncertain. This dynamically reduces inference time.
	- line 35: # TODO: Try midstack Semi-Supervised. Just take final models and re-train them, use bagged preds for SS rows. This would be very cheap and easy to try.
	- line 36: # TODO: Move to autogluon.core
	- line 52: self.sample_weight = sample_weight  # TODO: consider redesign where Trainer doesn't need sample_weight column name and weights are separate from X
	- line 78: self.models = {}  # Dict of model name -> model object. A key, value pair only exists if a model is persisted in memory.  # TODO: v0.1 Rename and consider making private
	- line 102: # self._exceptions_list = []  # TODO: Keep exceptions list for debugging during benchmarking.
	- line 182: # TODO: can_infer is technically more complicated, if an ancestor can't infer then the model can't infer.
	- line 225: # TODO: Enable easier re-mapping of trained models -> hyperparameters input (They don't share a key since name can change)
	- line 394: # FIXME: TODO: v0.1 X_unlabeled isn't cached so it won't be available during refit_full or fit_extra.
	- line 398: # TODO: Consider making level be auto-determined based off of max(base_model_levels)+1
	- line 399: # TODO: Remove name_suffix, hacked in
	- line 400: # TODO: X can be optional because it isn't needed if fit=True
	- line 410: X, w = extract_column(X, self.sample_weight)  # TODO: consider redesign with w as separate arg instead of bundled inside X
	- line 442: # TODO: Remove unnecessary load when no stacking
	- line 450: model_set = [m for m in model_set if m != model.name]  # TODO: Can probably be faster, get this result from graph
	- line 473: # TODO: Consider adding persist to disk functionality for pred_proba dictionary to lessen memory burden on large multiclass problems.
	- line 476: # TODO: Add memory optimal topological ordering -> Minimize amount of pred_probas in memory at a time, delete pred probas that are no longer required
	- line 543: # TODO: Remove _get_inputs_to_stacker_legacy eventually, move logic internally into this function instead
	- line 551: # TODO: After _get_inputs_to_stacker_legacy is removed, this if/else is not necessary, instead pass fit param to get_model_pred_proba_dict()
	- line 559: # TODO: Legacy code, still used during training because it is technically slightly faster and more memory efficient than get_model_pred_proba_dict()
	- line 639: # TODO: Do it for all models in the level at once to avoid repeated processing of data?
	- line 643: # TODO: Technically we don't need to re-train the weighted ensemble, we could just copy the original and re-use the weights.
	- line 653: X_stack_preds = self.get_inputs_to_stacker(X_val, base_models=base_model_names, fit=False, use_orig_features=False)  # TODO: May want to cache this during original fit, as we do with OOF preds
	- line 667: # TODO: stack_name=REFIT_FULL_NAME_AUX?
	- line 671: # TODO: Do the below more elegantly, ideally as a parameter to the trainer train function to disable recording scores/pred time.
	- line 694: self.save()  # TODO: This could be more efficient by passing in arg to not save if called by refit_ensemble_full since it saves anyways later.
	- line 720: # TODO: Consider moving base model info to a separate pkl file so that it can be edited without having to load/save the model again
	- line 735: model_loaded.save()  # TODO: Avoid this!
	- line 748: # TODO: Take best performance model with lowest inference
	- line 766: # TODO: In future perhaps give option for the reduce_memory_size arguments, perhaps trainer level variables specified by user?
	- line 825: # TODO: Move this to model code
	- line 832: # TODO: model_name change to model in params
	- line 918: fit_kwargs=dict(num_classes=self.num_classes, groups=None),  # FIXME: Is this the right way to do this?
	- line 1009: # TODO: Add recursive=True to avoid repeatedly loading models each time this is called for bagged ensembles (especially during repeated bagging)
	- line 1077: # TODO: Add to HPO
	- line 1099: # TODO: raise exception if no base models and level != 1?
	- line 1111: # TODO: Split this to avoid confusion, HPO should go elsewhere?
	- line 1175: # TODO: How to deal with models that fail during this? They have trained valid models before, but should we still use those models or remove the entire model? Currently we still use models.
	- line 1176: # TODO: Time allowance can be made better by only using time taken during final model training and not during HPO and feature pruning.
	- line 1177: # TODO: Time allowance not accurate if running from fit_continue
	- line 1178: # TODO: Remove level and stack_name arguments, can get them automatically
	- line 1179: # TODO: Make sure that pretraining on X_unlabeled only happens 1 time rather than every fold of bagging. (Do during pretrain API work?)
	- line 1311: # TODO: Ban KNN from being a Stacker model outside of aux. Will need to ensemble select on all stack layers ensemble selector to make it work
	- line 1312: # TODO: Robert dataset, LightGBM is super good but RF and KNN take all the time away from it on 1h despite being much worse
	- line 1313: # TODO: Add time_limit_per_model
	- line 1314: # TODO: Rename for v0.1
	- line 1338: # TODO: Only update scores when finished, only update model as part of final models if finished!
	- line 1557: # TODO: Enable raw=True for bagged models when X=None
	- line 1559: # TODO: Consider limiting X to 10k rows here instead of inside the model call
	- line 1609: # TODO: Can get feature importances of all children of model at no extra cost, requires scoring the values after predict_proba on each model
	- line 1612: # TODO: Can skip features which were pruned on all models that model depends on (Complex to implement, requires graph representation)
	- line 1613: # TODO: Note that raw importance will not equal non-raw importance for bagged models, even if raw features are identical to the model features.
	- line 1682: # TODO: v0.1 Proper error catching
	- line 1748: # TODO: feature_metadata
	- line 1749: # TODO: disk size
	- line 1750: # TODO: load time
	- line 1751: # TODO: Add persist_if_mem_safe() function to persist in memory all models if reasonable memory size (or a specific model+ancestors)
	- line 1752: # TODO: Add is_persisted() function to check which models are persisted in memory
	- line 1753: # TODO: package_dependencies, package_dependencies_full
	- line 1865: # TODO:
	- line 1946: # TODO: Also enable deletion of models which didn't succeed in training (files may still be persisted)
	- line 2002: # TODO: Delete from all the other model dicts
	- line 2072: To apply label-smoothing: teacher_preds='onehot' will use original training data labels converted to one-hots for multiclass (no data augmentation).  # TODO: expose smoothing-hyperparameter.
	- line 2088: hyperparameter_tune = False  # TODO: add as argument with scheduler options.
	- line 2186: hyperparameters = self._process_hyperparameters(hyperparameters=hyperparameters)  # TODO: consider exposing ag_args_fit, excluded_model_types as distill() arguments.
	- line 2199: # self.bagged_mode = True  # TODO: Add options for bagging
	- line 2206: time_limit=time_limit,  # FIXME: Also limit augmentation time
	- line 2220: model_obj.save()  # TODO: consider omitting for sake of efficiency
	- line 2230: self.bagged_mode = og_bagged_mode  # TODO: Confirm if safe to train future models after training models in both bagged and non-bagged modes
	- line 2243: # TODO: consider moving weight normalization into AbstractModel.fit()
	- line 2256: # FIXME: This section is a hack, compute genuine feature_metadata for each stack level instead
	- line 2259: # FIXME: Sample weight `extract_column` is a hack, have to compute feature_metadata here because sample weight column could be in X upstream, extract sample weight column upstream instead.
	- line 2260: # FIXME: This doesn't assign proper special types to stack features, relying on a hack in StackerEnsembleModel to assign S_STACK to feature metadata, don't do this.


tabular/src/autogluon/tabular/predictor/predictor.py (62 lines):
	- line 40: # TODO: num_bag_sets -> ag_args
	- line 43: # TODO: make core_kwargs a kwargs argument to predictor.fit
	- line 44: # TODO: add aux_kwargs to predictor.fit
	- line 45: # TODO: add pip freeze + python version output after fit + log file, validate that same pip freeze on load as cached
	- line 46: # TODO: predictor.clone()
	- line 47: # TODO: Add logging comments that models are serialized on disk after fit
	- line 48: # TODO: consider adding kwarg option for data which has already been preprocessed by feature generator to skip feature generation.
	- line 49: # TODO: Resolve raw text feature usage in default feature generator
	- line 52: # TODO: Remove all `time_limits` in project, replace with `time_limit`
	- line 197: if sample_weight == AUTO_WEIGHT:  # TODO: update auto_weight strategy and make it the default
	- line 201: self.weight_evaluation = weight_evaluation  # TODO: sample_weight and weight_evaluation can both be properties that link to self._learner.sample_weight, self._learner.weight_evaluation
	- line 738: # TODO: Hyperparam could have non-serializable objects. Save as pkl and load on demand
	- line 743: # FIXME: v0.1 This section is a hack
	- line 881: # TODO: Note that temperature scaling is known to worsen calibration in the face of shifted test data.
	- line 883: # FIXME: Avoid depending on torch for temp scaling
	- line 966: # TODO: Allow disable aux (default to disabled)
	- line 967: # TODO: num_bag_sets
	- line 970: # save_bag_folds = kwargs['save_bag_folds']  # TODO: Enable
	- line 978: # TODO: Since data preprocessor is fitted on original train_data it cannot account for if
	- line 1005: fit_new_weighted_ensemble = False  # TODO: Add as option
	- line 1006: aux_kwargs = None  # TODO: Add as option
	- line 1019: # TODO: make core_kwargs a kwargs argument to predictor.fit, add aux_kwargs to predictor.fit
	- line 1027: # TODO: Add special error message if called and training/val data was not cached.
	- line 1082: # TODO: This is a hack! self.predict_prob does not update to use weighted ensemble
	- line 1084: # TODO: There should also be PL added to weighted ensemble model name to notify
	- line 1635: # TODO: uncomment once feature_prune is functional:  self._summarize('feature_prune', 'feature-selection used', results)
	- line 1669: pass # TODO: print detailed bagging info
	- line 1671: pass # TODO: print detailed stacking info, like how much it improves validation performance
	- line 1673: pass # TODO: print detailed feature-selection info once feature-selection is functional.
	- line 2058: # TODO: Add data argument
	- line 2059: # TODO: Add option to disable OOF generation of newly fitted models
	- line 2060: # TODO: Move code logic to learner/trainer
	- line 2061: # TODO: Add fit() arg to perform this automatically at end of training
	- line 2062: # TODO: Consider adding cutoff arguments such as top-k models
	- line 2169: # TODO: Improve error messages when trying to get oof from refit_full and distilled models.
	- line 2170: # TODO: v0.1 add tutorial related to this method, as it is very powerful.
	- line 2171: # TODO: Remove train_data argument once we start caching the raw original data: Can just load that instead.
	- line 2223: # FIXME: This is a hack, add refit tag in a nicer way than via the _model_full_dict_val_score
	- line 2224: # TODO: bagged-with-holdout refit to bagged-no-holdout should still be able to return out-of-fold predictions
	- line 2422: # TODO: v0.1 add documentation for arguments
	- line 2787: # TODO:
	- line 2793: # TODO: Remove features from models option for fit_extra
	- line 2794: # TODO: Constructor?
	- line 2797: holdout_frac=None,  # TODO: Potentially error if num_bag_folds is also specified
	- line 2799: # TODO: Potentially move to fit_extra, raise exception if value too large / invalid in fit_extra.
	- line 2996: # TODO: What about datasets that are 100k+? At a certain point should we not bag?
	- line 2997: # TODO: What about time_limit? Metalearning can tell us expected runtime of each model, then we can select optimal folds + stack levels to fit time constraint
	- line 3018: num_bag_sets = 20  # TODO: v0.1 Reduce to 5 or 3 as 20 is unnecessarily extreme as a default.
	- line 3039: # TODO: Documentation, flesh out capabilities
	- line 3040: # TODO: Rename feature_generator -> feature_pipeline for users?
	- line 3041: # TODO: Return transformed data?
	- line 3042: # TODO: feature_generator_kwargs?
	- line 3047: # TODO: rename to `advice`
	- line 3048: # TODO: Add documentation
	- line 3070: # TODO: Advice on unused features (if no model uses a feature)
	- line 3071: # TODO: Advice on fit_extra
	- line 3072: # TODO: Advice on distill
	- line 3073: # TODO: Advice on leaderboard
	- line 3074: # TODO: Advice on persist
	- line 3075: # TODO: Advice on refit_full
	- line 3076: # TODO: Advice on feature_importance
	- line 3077: # TODO: Advice on dropping poor models


core/src/autogluon/core/models/ensemble/bagged_ensemble_model.py (31 lines):
	- line 28: # TODO: Add metadata object with info like score on each model, train time on each model, etc.
	- line 52: # TODO: Consider moving `_child_oof` logic to a separate class / refactor OOF logic.
	- line 53: # FIXME: Avoid unnecessary refit during refit_full on `_child_oof=True` models, just re-use the original model.
	- line 72: drop_unique=False,  # TODO: Get the value from child instead
	- line 99: # TODO: Require is_valid == True (add option param to ignore is_valid)
	- line 148: # TODO: We may want to throw an exception instead and avoid calling fit more than once
	- line 173: kwargs['num_classes'] = self.num_classes  # TODO: maybe don't pass num_classes to children
	- line 199: # FIXME: Don't save folds except for refit
	- line 200: # FIXME: Cleanup self
	- line 201: # FIXME: Don't add `_FULL` to name
	- line 242: # TODO: Remove this limitation if n_repeats > 1
	- line 245: # TODO: Remove this limitation
	- line 301: # TODO: Consider moving this into end of abstract model fit for all models.
	- line 327: self._oof_pred_proba = model_base.predict_proba(X=X)  # TODO: Cheater value, will be overfit to valid set
	- line 345: # TODO: re-enable macos once this issue is addressed
	- line 403: # TODO: Preprocess data here instead of repeatedly
	- line 404: # FIXME: Raise exception if multiclass/binary and a single val fold contains all instances of a class. (Can happen if custom groups is specified)
	- line 541: # TODO: Augment to generate OOF after shuffling each column in X (Batching), this is the fastest way.
	- line 542: # TODO: Reduce logging clutter during OOF importance calculation (Currently logs separately for each child)
	- line 553: # FIXME: use FULL features (children can have different features)
	- line 601: # TODO: DON'T THROW AWAY SAMPLES! USE LARGER N
	- line 604: val = val['importance'].to_dict()  # TODO: Don't throw away stddev information of children
	- line 632: # TODO: Multiply epochs/n_iterations by some value (such as 1.1) to account for having more training data than bagged models
	- line 915: # TODO: Currently double disk usage, saving model in HPO and also saving model in bag
	- line 916: # FIXME: with use_bag_holdout=True, the fold-1 scores that are logged are of the inner validation score, not the holdout score.
	- line 923: kwargs['num_classes'] = self.num_classes  # TODO: maybe don't pass num_classes to children
	- line 926: # TODO: Preprocess data here instead of repeatedly
	- line 953: scheduler_options[1]['time_out'] = orig_time * 0.8  # TODO: Scheduler doesn't early stop on final model, this is a safety net. Scheduler should be updated to early stop
	- line 962: # TODO: Create new Ensemble Here
	- line 974: bag._child_oof = True  # TODO: Consider a separate tag for refit_folds vs efficient OOF
	- line 1009: # TODO: hpo_results likely not correct because no renames


core/src/autogluon/core/models/abstract/abstract_model.py (28 lines):
	- line 89: self.name = name  # TODO: v0.1 Consider setting to self._name and having self.name be a property so self.name can't be set outside of self.rename()
	- line 97: # TODO: Would be ideal to not create dir, but still track that it is unique. However, this isn't possible to do without a global list of used dirs or using UUID.
	- line 101: self.path = self.create_contexts(self.path_root + self.path_suffix)  # TODO: Make this path a function for consistency.
	- line 134: self._user_params_aux = hyperparameters.pop(AG_ARGS_FIT)  # TODO: Delete after initialization?
	- line 139: self._user_params = hyperparameters  # TODO: Delete after initialization?
	- line 192: # TODO: v0.1 update to be aligned with _set_default_auxiliary_params(), add _get_default_params()
	- line 201: # TODO: Consider adding to get_info() output
	- line 206: # TODO: v0.1 consider adding documentation to each model highlighting which feature dtypes are valid
	- line 214: # TODO: Add more params
	- line 225: # TODO: add option for only top-k ngrams
	- line 229: # TODO: v0.1 Document get_features_kwargs_extra in task.fit
	- line 261: # TODO: v0.1 Change this to update path_root only, path change to property
	- line 287: # TODO: Remove kwargs?
	- line 304: # TODO: Remove kwargs?
	- line 314: # TODO: In online-inference this becomes expensive, add option to remove it (only safe in controlled environment where it is already known features are present
	- line 327: # TODO: Consider changing how this works or where it is done
	- line 350: # FIXME: Consider counting NaNs as unique values, if unique_counts == 2 (including NaN), then treat as boolean
	- line 352: # TODO: Could this be optimized to be faster? This might be a bit slow for large data.
	- line 527: kwargs = self.initialize(**kwargs)  # FIXME: This might have to go before self._preprocess_fit_args, but then time_limit might be incorrect in **kwargs init to initialize
	- line 578: # TODO: This is expensive to convert at inference time, try to avoid in future
	- line 880: # TODO: This will break on S3. Use tabular/utils/savers for datasets, add new function
	- line 935: # TODO: ignore models which were killed early by scheduler (eg. in Hyperband). How to ID these?
	- line 956: # TODO: Experimental, currently unused
	- line 967: # TODO: This results in a doubling of memory usage of the model to calculate its size.
	- line 1029: # TODO: Report errors?
	- line 1048: 'hyperparameters_fit': self.params_trained,  # TODO: Explain in docs that this is for hyperparameters that differ in final model from original hyperparameters, such as epochs (from early stopping)
	- line 1089: # TODO: v0.1 Add reference link to all valid keys and their usage or keep full docs here and reference elsewhere?
	- line 1138: # TODO: Add documentation for valid args for each model. Currently only `ag.early_stop`


tabular/src/autogluon/tabular/models/tabular_nn/mxnet/tabular_nn_mxnet.py (17 lines):
	- line 38: # TODO: Gets stuck after inferring feature types near infinitely in nyc-jiashenliu-515k-hotel-reviews-data-in-europe dataset, 70 GB of memory, c5.9xlarge
	- line 76: self.features_to_drop = []  # may change between different bagging folds. TODO: consider just removing these from self._features_internal
	- line 139: if sample_weight is not None:  # TODO: support
	- line 155: # self._save_preprocessor()  # TODO: should save these things for hyperparam tuning. Need one HP tuner for network-specific HPs, another for preprocessing HPs.
	- line 173: # TODO: if we don't want to save intermediate network parameters, need to do something like saving in temp directory to clean up after training:
	- line 195: # TODO: Below should not occur until at time of saving
	- line 310: # FIXME: Switch to adaptive ES
	- line 338: self.summary_writer.add_scalar(tag='train_loss', value=train_loss.asscalar(), global_step=e)  # TODO: do we want to keep mxboard support?
	- line 340: # TODO: Ensure reporter/scheduler properly handle None/nan values after refactor
	- line 341: if val_dataset is not None and (not np.isnan(val_metric)):  # TODO: This might work without the if statement
	- line 391: def _predict_tabular_data(self, new_data, process=True, predict_proba=True):  # TODO ensure API lines up with tabular.Model class.
	- line 486: # TODO no label processing for now
	- line 487: # TODO: add time/ngram features
	- line 488: # TODO: no filtering of data-frame columns based on statistics, e.g. categorical columns with all unique variables or zero-variance features.
	- line 545: elif params['optimizer'] == 'adam':  # TODO: Can we try AdamW?
	- line 579: # TODO: Don't use os.makedirs here, have save_parameters function in tabular_nn_model that checks if local path or S3 path
	- line 591: # TODO: maybe need to initialize/hybridize?


tabular/src/autogluon/tabular/models/image_prediction/image_predictor.py (16 lines):
	- line 18: # FIXME: Avoid hard-coding 'image' column name
	- line 19: # TODO: Handle multiple image columns?
	- line 20: # TODO: Handle multiple images in a single image column?
	- line 21: # TODO: Add regression support
	- line 22: # TODO: refit_full does not work as expected: It won't use all data, will just split train data internally.
	- line 92: if sample_weight is not None:  # TODO: support
	- line 106: # TODO: Consider some kind of weighting of the two options so there isn't a harsh cutoff at 50
	- line 107: # FIXME: What if all rows in a class are null? Will probably crash.
	- line 109: self._dummy_pred_proba = self._compute_dummy_pred_proba(y[null_indices])  # FIXME: Do this one for better results
	- line 122: # TODO: ImagePredictor doesn't use problem_type in any way at present.
	- line 127: # eval_metric=self.eval_metric,  # TODO: multiclass/binary vision problem works only with accuracy, regression with rmse
	- line 133: # FIXME: ImagePredictor crashes if given float time_limit
	- line 142: # self.model.set_verbosity(verbosity)  # TODO: How to set verbosity of fit predictor?
	- line 146: # TODO: Add option to crash if null is present for faster predict_proba
	- line 164: # TODO: Consider moving to AbstractModel or as a separate function
	- line 165: # TODO: Test softclass


tabular/src/autogluon/tabular/learner/default_learner.py (15 lines):
	- line 22: # TODO: Add functionality for advanced feature generators such as gl_code_matrix_generator (inter-row dependencies, apply to train differently than test, etc., can only run after train/test split, rerun for each cv fold)
	- line 23: # TODO: - Differentiate between advanced generators that require fit (stateful, gl_code_matrix) and those that do not (bucket label averaging in SCOT GC 2019)
	- line 24: # TODO: - Those that do not could be added to preprocessing function of model, but would then have to be recomputed on each model.
	- line 25: # TODO: Add cv / OOF generator option, so that AutoGluon can be used as a base model in an ensemble stacker
	- line 37: # TODO: v0.1 Document trainer_fit_kwargs
	- line 49: # TODO: if provided, feature_types in X, X_val are ignored right now, need to pass to Learner/trainer and update this documentation.
	- line 88: k_fold=num_bag_folds,  # TODO: Consider moving to fit call
	- line 89: n_repeats=num_bag_sets,  # TODO: Consider moving to fit call
	- line 109: # TODO: Add default values to X_val, X_unlabeled, holdout_frac, and num_bag_folds
	- line 114: # TODO: We should probably uncomment the below lines, NaN label should be treated as just another value in multiclass classification -> We will have to remove missing, compute problem type, and add back missing if multiclass
	- line 151: X = self.cleaner.fit_transform(X)  # TODO: Consider merging cleaner into label_cleaner
	- line 177: # TODO: Move this up to top of data before removing data, this way our feature generator is better
	- line 219: X, X_val = self.bundle_weights(X, w, X_val, w_val)  # TODO: consider not bundling sample-weights inside X, X_val
	- line 250: elif self.sample_weight == AUTO_WEIGHT:  # TODO: support more sophisticated auto_weight strategy
	- line 252: X[self.sample_weight] = w  # TODO: consider not bundling sample weights inside X


features/src/autogluon/features/generators/abstract.py (15 lines):
	- line 19: # TODO: Add option to minimize memory usage of feature names by making them integers / strings of integers
	- line 20: # TODO: Add ability to track which input features created which output features.
	- line 21: # TODO: Add log of # of observation counts to high cardinality categorical features
	- line 123: # TODO: Consider merging feature_metadata and feature_metadata_real, have FeatureMetadata contain exact dtypes, grouped raw dtypes, and special dtypes all at once.
	- line 212: X = X.reset_index(drop=True)  # TODO: Theoretically inplace=True avoids data copy, but can lead to altering of original DataFrame outside of method context.
	- line 214: y = y.reset_index(drop=True)  # TODO: this assumes y and X had matching indices prior
	- line 239: # TODO: Add option to return feature_metadata instead to avoid data copy
	- line 287: X = X.reset_index(drop=True)  # TODO: Theoretically inplace=True avoids data copy, but can lead to altering of original DataFrame outside of method context.
	- line 386: # TODO: Find way to increase flexibility here, possibly through init args
	- line 405: # TODO: Use code from problem type detection for column types. Ints/Floats could be Categorical through this method. Maybe try both?
	- line 480: # TODO: Ensure arbitrary feature removal does not result in inconsistencies (add unit test)
	- line 545: # TODO: Move to a generator
	- line 554: # TODO: Consider adding _log and verbosity methods to mixin
	- line 575: # TODO: Handle cases where self.features_in or self.feature_metadata_in was already set at init.
	- line 652: # TODO: Unit test this


tabular/src/autogluon/tabular/models/rf/rf_model.py (15 lines):
	- line 38: # TODO: Use sklearnex instead once a suitable toggle option is provided that won't impact future models
	- line 39: # FIXME: DAAL OOB score is broken, returns biased predictions. Without this optimization, can't compute Efficient OOF.
	- line 54: # TODO: X.fillna -inf? Add extra is_missing column?
	- line 68: # TODO: 600 is much better, but increases info leakage in stacking -> therefore 300 is ~equal in stack ensemble final quality.
	- line 74: # TODO: min_samples_leaf=5 is too large on most problems, however on some datasets it helps a lot (airlines likes >40 min_samples_leaf, adult likes 2 much better than 1)
	- line 83: # TODO: Add in documentation that Categorical default is the first index
	- line 84: # TODO: enable HPO for RF models
	- line 209: # TODO: Remove this after simplifying _predict_proba to reduce code duplication. This is only present for SOFTCLASS support.
	- line 239: # TODO: Remove `_set_oob_score` after sklearn version requirement is >=1.0
	- line 242: # FIXME: Unknown if this works with quantile regression
	- line 251: # TODO: This can also be done via setting `oob_score=True` in model params,
	- line 269: # TODO: Can instead do `_compute_oob_predictions` but requires post-processing. Skips scoring func.
	- line 273: # TODO: Remove once sklearn < 1.0 support is dropped
	- line 288: # TODO: Regression does not return NaN for missing rows, instead it sets them to 0. This makes life hard.
	- line 316: # TODO: Add HPO


features/src/autogluon/features/generators/text_ngram.py (14 lines):
	- line 19: # TODO: Add argument to define the text preprocessing logic
	- line 20: # TODO: Add argument to output ngrams as a sparse matrix
	- line 21: # TODO: Add HashingVectorizer support
	- line 22: # TODO: Add TFIDF support
	- line 23: # TODO: Documentation
	- line 50: # TODO: 0.20 causes OOM error with 64 GB ram on NN with several datasets. LightGBM and CatBoost succeed
	- line 51: # TODO: Finetune this, or find a better way to ensure stability
	- line 52: # TODO: adjust max_memory_ratio correspondingly if prefilter_tokens==True
	- line 97: # TODO: Optimize for inference
	- line 128: # TODO: Preprocess text?
	- line 210: X_nlp_features = pd.DataFrame(transform_array, columns=self._feature_names_dict[nlp_feature], index=X.index)  # TODO: Consider keeping sparse
	- line 223: # TODO: REMOVE NEED FOR text_data input!
	- line 244: # TODO: This doesn't have to be done twice, can update transform matrix based on new vocab instead of calling .transform
	- line 252: transform_matrix = vectorizer.fit_transform(text_data)  # TODO: Consider upgrading to pandas 0.25.0 to benefit from sparse attribute improvements / bug fixes! https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.25.0.html


tabular/src/autogluon/tabular/learner/abstract_learner.py (13 lines):
	- line 29: # TODO: - Semi-supervised learning
	- line 30: # TODO: - Minimize memory usage of DataFrames (convert int64 -> uint8 when possible etc.)
	- line 32: # TODO: Loading learner from S3 on Windows may cause issues due to os.path.sep
	- line 97: # TODO: Possibly rename to features_in or consider refactoring all feature_generators features_in -> features
	- line 334: ensemble_selection.fit(predictions=pred_probas, labels=y_internal, identifiers=None, sample_weight=w)  # TODO: Only fit non-nan
	- line 381: # TODO: Add support for calculating pred_time_test_full for oracle_ensemble, need to copy graph from trainer and add oracle_ensemble to it with proper edges.
	- line 578: # sample_weight=sample_weight,  # TODO: add sample_weight support
	- line 666: # TODO: cache_data must be set to True to be able to pass X and y as None in this function, otherwise it will error.
	- line 722: # TODO: Potentially set reset_paths=False inside load function if it is the same path to avoid re-computing paths on all models
	- line 723: # TODO: path_context -> path for v0.1
	- line 733: # TODO: Still have to change paths of models in trainer + trainer object path variables
	- line 818: # TODO: Add data info gathering at beginning of .fit() that is used by all learners to add to get_info output
	- line 819: # TODO: Add feature inference / feature engineering info to get_info output


tabular/src/autogluon/tabular/models/catboost/catboost_model.py (12 lines):
	- line 23: # TODO: Consider having CatBoost variant that converts all categoricals to numerical as done in RFModel, was showing improved results in some problems.
	- line 41: if self.problem_type != SOFTCLASS:  # TODO: remove this after catboost 0.24
	- line 64: approx_mem_size_req = data_mem_uasge * 7 + data_mem_uasge / 4 * num_classes  # TODO: Extremely crude approximation, can be vastly improved
	- line 67: # TODO: Use Pool in preprocess, optimize bagging to do Pool.split() to avoid re-computing pool for each fold! Requires stateful + y
	- line 87: # FIXME: This is extremely slow due to unoptimized metric / objective sent to CatBoost
	- line 115: # TODO: What if path is in S3?
	- line 122: # TODO: Add more control over these params (specifically early_stopping_rounds)
	- line 139: # TODO: Confirm if GPU is used in HPO (Probably not)
	- line 140: # TODO: Adjust max_bins to 254?
	- line 178: # TODO: Custom metrics don't seem to work anymore
	- line 179: # TODO: Custom metrics not supported in GPU mode
	- line 180: # TODO: Callbacks not supported in GPU mode


tabular/src/autogluon/tabular/trainer/model_presets/presets.py (11 lines):
	- line 55: DEFAULT_QUANTILE_MODEL = ['RF', 'XT', 'FASTAI', 'NN_TORCH', 'ENS_WEIGHTED']  # TODO: OTHERS will be added
	- line 121: # TODO: Consider making hyperparameters arg in fit() accept lists, concatenate hyperparameter sets together.
	- line 122: # TODO: Consider adding special optional AG args for #cores,#gpus,num_early_stopping_iterations,etc.
	- line 124: # TODO: Consider adding special optional AG args for use_original_features,features_to_use,etc.
	- line 125: # TODO: Consider adding optional AG args to dynamically disable models such as valid_num_classes_range, valid_row_count_range, valid_feature_count_range, etc.
	- line 126: # TODO: Args such as max_repeats, num_folds
	- line 128: # TODO: Add option to update hyperparameters with only added keys, so disabling CatBoost would just be {'CAT': []}, which keeps the other models as is.
	- line 129: # TODO: special optional AG arg for only training model if eval_metric in list / not in list. Useful for F1 and 'is_unbalanced' arg in LGBM.
	- line 342: # TODO: v0.1 cleanup and avoid hardcoded logic with model names
	- line 344: # TODO v0.1: This import depends on mxnet, consider refactoring to avoid mxnet
	- line 368: model.normalize_pred_probas = True  # FIXME: Do we need to do this for child models too?


tabular/src/autogluon/tabular/models/lr/lr_model.py (11 lines):
	- line 22: # TODO: Can Bagged LinearModels be combined during inference to 1 model by averaging their weights?
	- line 43: # TODO: Add more granular switch, currently this affects all future LR models even if they had `use_daal=False`
	- line 69: TODO: ensure features with zero variance have already been removed before this function is called.
	- line 75: language_featnames = []  # TODO: Disabled currently, have to pass raw text data features here to function properly
	- line 80: self._features_internal = list(df.columns)  # FIXME: Don't edit _features_internal
	- line 93: # TODO: handle collinear features - they will impact results quality
	- line 143: # TODO: It could be possible to adaptively set max_iter [1] to approximately respect time_limit based on sample-size, feature-dimensionality, and the solver used.
	- line 165: # TODO: copy_X=True currently set during regression problem type, could potentially set to False to avoid unnecessary data copy.
	- line 177: # TODO: Add HPO
	- line 185: one_hot_threshold = 10000  # FIXME research memory constraints
	- line 210: one_hot_threshold = 10000  # FIXME research memory constraints


core/src/autogluon/core/utils/utils.py (10 lines):
	- line 34: # FIXME: update to use only torch for TIMM or find a better GPU detection strategy
	- line 35: # FIXME: get_gpu_count by itself doesn't always work for Windows
	- line 45: # FIXME: Sometimes doesn't detect GPU on Windows
	- line 46: # FIXME: Doesn't ensure the GPUs are actually usable by the model (MXNet, PyTorch, etc.)
	- line 113: # FIXME: There is a bug in sklearn that causes an incorrect ValueError if performing stratification and all classes have fewer than n_splits samples.
	- line 469: problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
	- line 479: # TODO: Move this outside of this function so it is visible even if problem type was not inferred.
	- line 543: # TODO: Improve time estimate (Currently pessimistic)
	- line 681: # TODO: Can speed up shuffle_repeats by incorporating into X_raw (do multiple repeats in a single predict call)
	- line 687: # TODO: Stratify? We currently don't know in this function the problem_type (could pass as additional arg).


tabular/src/autogluon/tabular/models/knn/knn_model.py (10 lines):
	- line 18: # TODO: Normalize data!
	- line 30: # TODO: Add more granular switch, currently this affects all future KNN models even if they had `use_daal=False`
	- line 64: ignored_type_group_raw=[R_BOOL, R_CATEGORY, R_OBJECT],  # TODO: Eventually use category features
	- line 84: # TODO: Enable HPO for KNN
	- line 102: if sample_weight is not None:  # TODO: support
	- line 106: # FIXME: v0.1 Must store final num rows for refit_full or else will use everything! Worst case refit_full could train far longer than the original model.
	- line 128: # TODO: Won't work for RAPIDS without modification
	- line 129: # TODO: Technically isn't OOF, but can be used inplace of OOF. Perhaps rename to something more accurate?
	- line 166: # TODO: Consider making this fully generic and available to all models
	- line 259: # TODO: Add HPO


tabular/src/autogluon/tabular/models/fastainn/tabular_nn_fastai.py (9 lines):
	- line 23: # FIXME: Has a leak somewhere, training additional models in a single python script will slow down training for each additional model. Gets very slow after 20+ models (10x+ slowdown)
	- line 40: # TODO: Takes extremely long time prior to training start if many (10000) continuous features from ngrams, debug - explore TruncateSVD option to reduce input dimensionality
	- line 41: # TODO: currently fastai automatically detects and uses CUDA if available - add code to honor autogluon settings
	- line 150: # FIXME: Consider representing categories as int
	- line 175: if sample_weight is not None:  # TODO: support
	- line 190: # TODO: Control CPU vs GPU usage during inference
	- line 194: # TODO: respect CUDA_VISIBLE_DEVICES to select proper GPU
	- line 208: # TODO: calculate max emb concat layer size and use 1st layer as that value and 2nd in between number of classes and the value
	- line 383: # TODO: This call has very high fixed cost with many features (0.7s for a single row with 3k features)


tabular/src/autogluon/tabular/models/lgb/lgb_model.py (8 lines):
	- line 30: # TODO: Save dataset to binary and reload for HPO. This will avoid the memory spike overhead when training each model and instead it will only occur once upon saving the dataset.
	- line 68: approx_mem_size_req = data_mem_uasge * 7 + data_mem_uasge / 4 * num_classes  # TODO: Extremely crude approximation, can be vastly improved
	- line 108: # TODO: lightgbm must have a special install to support GPU: https://github.com/Microsoft/LightGBM/tree/master/python-package#build-gpu-version
	- line 129: # TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
	- line 143: # Note: Don't use self.params_aux['max_memory_usage_ratio'] here as LightGBM handles memory per iteration optimally.  # TODO: Consider using when ratio < 1.
	- line 220: # FIXME This is a HACK. Passing in value -1, 0, or None will only use 1 cores. Need to pass in a large number instead
	- line 222: # TODO Avoid using psutil when lgb fixed the mem leak.
	- line 281: # TODO: Try creating multiple Datasets for subsets of features, then combining with Dataset.add_features_from(), this might avoid memory spike


core/src/autogluon/core/models/ensemble/stacker_ensemble_model.py (7 lines):
	- line 21: # TODO: Currently, if this is a stacker above level 1, it will be very slow taking raw input due to each stacker needing to repeat computation on the base models.
	- line 56: # TODO: Consider deleting these variables after initialization
	- line 107: compute_base_preds = False  # TODO: Consider removing, this can be dangerous but the code to make this work otherwise is complex (must rewrite predict_proba)
	- line 121: X_stacker.append(y_pred_proba)  # TODO: This could get very large on a high class count problem. Consider capping to top N most frequent classes and merging least frequent
	- line 149: # TODO: This could be preprocess_nonadaptive=True in general, just have preprocess_nonadaptive=False for child models
	- line 218: if self.feature_metadata is None:  # TODO: This is probably not the best way to do this
	- line 221: # FIXME: This is a hack, stack feature special types should be already present in feature_metadata, not added here


tabular/src/autogluon/tabular/models/fasttext/fasttext_model.py (7 lines):
	- line 44: # TODO: Investigate allowing categorical features as well
	- line 112: # TODO: move logic to self._preprocess_nonadaptive()
	- line 113: # TODO: text features: alternate text preprocessing steps
	- line 114: # TODO: categorical features: special encoding:  <feature name>_<feature value>
	- line 162: # TODO: s3 support
	- line 178: # TODO: hack to suppress a deprecation warning from fasttext
	- line 189: # TODO: Add HPO


core/src/autogluon/core/models/ensemble/fold_fitting_strategy.py (6 lines):
	- line 241: # TODO: Add support for sample_weight when pseudo is present
	- line 466: # TODO: Add support for sample_weight when pseudo is present
	- line 507: # TODO: We need to handle user-provided custom num_cpus
	- line 518: # FIXME: Avoid hardcoding model names.
	- line 555: # FIXME: Avoid hardcoding model names.
	- line 564: # FIXME: Avoid hardcoding model names.


features/src/autogluon/features/generators/astype.py (6 lines):
	- line 15: # TODO: Add int fillna input value options: 0, set value, mean, mode, median
	- line 37: # TODO: consider returning self._transform(X) if we allow users to specify real dtypes as input
	- line 82: # TODO: Consider imputing to mode? This is tricky because training data had no missing values.
	- line 83: # TODO: Add unit test for this situation, to confirm it is handled properly.
	- line 91: # TODO: Confirm this works with sparse and other feature types!
	- line 92: # FIXME: Address situation where test-time invalid type values cause crash:


common/src/autogluon/common/features/infer_types.py (6 lines):
	- line 97: # TODO: Expand to int64 -> date features (milli from epoch etc)
	- line 98: # TODO: This takes a surprisingly long time to run, ~30 seconds on a laptop for 50,000 rows of datetime_as_object for a single column. Try to optimize.
	- line 101: # TODO: Check if low numeric numbers, could be categorical encoding!
	- line 102: # TODO: If low numeric, potentially it is just numeric instead of date
	- line 105: if type_family != 'object':  # TODO: seconds from epoch support
	- line 108: # TODO: pd.Series(['20170204','20170205','20170206']) is incorrectly not detected as datetime_as_object


tabular/src/autogluon/tabular/tuning/feature_pruner.py (6 lines):
	- line 9: # TODO: currently is buggy
	- line 46: # TODO: CV5 instead of holdout? Should be better
	- line 47: # TODO: Add holdout here, it is overfitting with Logistic Regression
	- line 49: objective_goal_is_negative = False  # Fixed to false if using sklearn scorers # self.model_base.problem_type == REGRESSION  # TODO: if objective function goal = lower (logloss, MAE, etc.)
	- line 123: # TODO: Save gain_df, banned_features
	- line 149: if threshold < -100000000:  # FIXME: Hacked for regression


tabular/src/autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py (6 lines):
	- line 25: # TODO: QuantileTransformer in pipelines accounts for majority of online inference time
	- line 40: self.features_to_drop = []  # may change between different bagging folds. TODO: consider just removing these from self._features_internal
	- line 157: if sample_weight is not None:  # TODO: support
	- line 166: self.num_dataloading_workers = 0  # TODO: verify 0 is typically faster and uses less memory than 1 in pytorch
	- line 167: self.num_dataloading_workers = 0  # TODO: >0 crashes on MacOS
	- line 560: # TODO: Don't use os.makedirs here, have save_parameters function in tabular_nn_model that checks if local path or S3 path


vision/src/autogluon/vision/predictor/predictor.py (6 lines):
	- line 274: # FIXME: imagenet does not work, crashes in validating data due to empty DataFrames.
	- line 355: # needed for gluon-cv TODO: remove after gluon-cv is updated https://github.com/dmlc/gluon-cv/issues/1633
	- line 400: # TODO: remove this once mxnet is deprecated
	- line 416: # TODO: MXNetErrorCatcher was removed because it didn't return traceback
	- line 584: # TODO: This crashes if a feature is already named 'index'.
	- line 646: # TODO: remove the switch if mxnet is deprecated


core/src/autogluon/core/constants.py (5 lines):
	- line 18: # TODO: Have documentation for all AG_ARGS values
	- line 21: AG_ARGS_ENSEMBLE = 'ag_args_ensemble'  # Contains arguments that impact model ensembling, such as if an ensemble model is allowed to use the original features.  # TODO: v0.1 add to documentation
	- line 28: # TODO: Add docs to dedicated page, or should it live in AbstractModel?
	- line 29: # TODO: How to reference correct version of docs?
	- line 30: # TODO: Add error in AG_ARGS if unknown key present


tabular/src/autogluon/tabular/models/catboost/catboost_utils.py (5 lines):
	- line 9: # TODO: Add weight support?
	- line 10: # TODO: Can these be optimized? What computational cost do they have compared to the default catboost versions?
	- line 40: # TODO: Binary log_loss doesn't work for some reason
	- line 92: # TODO: Refactor as a dictionary mapping as done in LGBM
	- line 98: # SoftclassCustomMetric = make_softclass_metric()  # TODO: remove after catboost 0.24


features/src/autogluon/features/generators/pipeline.py (5 lines):
	- line 19: # TODO: Documentation
	- line 47: # TODO: Consider adding final check of validity/that features are reasonable.
	- line 79: self._ensure_no_duplicate_column_names(X=X)  # TODO: Remove this, move pre_memory_usage and post_memory_usage into super().
	- line 107: self._log(log_level, f'\t\tThese features carry no predictive signal and should be manually investigated.')  # TODO: What about features with 1 unique value but also np.nan?
	- line 111: # TODO: Consider highlighting why a feature was unused (complex to implement, can check if was valid input to any generator in a generator group through feature chaining)


tabular/src/autogluon/tabular/models/lgb/callbacks.py (5 lines):
	- line 16: # TODO: Add option to stop if current run's metric value is X% lower, such as min 30%, current 40% -> Stop
	- line 133: if i == indices_to_check[0]:  # TODO: documentation needs to note that we assume 0th index is the 'official' validation performance metric.
	- line 181: # TODO: Add toggle parameter to early_stopping to disable this
	- line 182: # TODO: Identify optimal threshold values for early_stopping based on lack of memory
	- line 204: # TODO: We will want to track size of model as well, even if we early stop before OOM, we will still crash when saving if the model is large enough


tabular/src/autogluon/tabular/models/xgboost/xgboost_model.py (4 lines):
	- line 80: # TODO: utilize sample_weight_val in early-stopping if provided
	- line 147: # TODO: Investigate speed-ups from GPU inference
	- line 171: elif self.problem_type == SOFTCLASS:  # TODO: delete this elif if it's unnecessary.
	- line 183: approx_mem_size_req = data_mem_uasge * 7 + data_mem_uasge / 4 * num_classes  # TODO: Extremely crude approximation, can be vastly improved


core/src/autogluon/core/data/label_cleaner.py (4 lines):
	- line 127: # TODO: Unused? There are not many reasonable situations that seem to require this method.
	- line 173: # TODO: Expand print statement to multiclass as well
	- line 188: # TODO: Clean this code, for loop
	- line 282: # TODO: Expand functionality if necessary


features/src/autogluon/features/binning.py (4 lines):
	- line 14: # TODO: Rewrite with normalized value counts as binning technique, will be more performant and optimal
	- line 38: # TODO: max_desired_bins and min_desired_bins are currently equivalent, but in future they will be parameterized to allow for flexibility.
	- line 65: # TODO: Clean code
	- line 66: # TODO: Consider re-using bins variable instead of making bins_2-7 variables


tabular/src/autogluon/tabular/models/rf/rf_rapids_model.py (4 lines):
	- line 11: # TODO: Improve memory safety
	- line 12: # TODO: Respect time limit
	- line 13: # TODO: Depending on max_depth parameter, RFRapidsModel is slower than RFModel.
	- line 52: # FIXME: Efficient OOF doesn't work in RAPIDS


core/src/autogluon/core/scheduler/seq_scheduler.py (4 lines):
	- line 123: # FIXME: Hack to be compatible with gluoncv
	- line 159: # TODO: Add special exception type when there are no more new configurations to try (exhausted search space)
	- line 161: logger.exception('Detailed Traceback:')  # TODO: Avoid logging if verbosity=0
	- line 283: # TODO: Consider passing the metadata search space to searcher to avoid having to do this


core/src/autogluon/core/models/greedy_ensemble/greedy_weighted_ensemble_model.py (3 lines):
	- line 34: # TODO: Consider moving convert_pred_probas_df_to_list into inner model to ensure X remains a dataframe after preprocess is called
	- line 36: # TODO: super() call?
	- line 51: # TODO: Check memory after loading best model predictions, only load top X model predictions that fit in memory


core/src/autogluon/core/models/ensemble/weighted_ensemble_model.py (3 lines):
	- line 13: # TODO: v0.1 see if this can be removed and logic moved to greedy weighted ensemble model -> Use StackerEnsembleModel as stacker instead
	- line 14: # TODO: Optimize predict speed when fit on kfold, can simply sum weights
	- line 66: # TODO: Rewrite preprocess() in greedy_weighted_ensemble_model to enable


tabular/src/autogluon/tabular/models/lgb/hyperparameters/searchspaces.py (3 lines):
	- line 25: 'min_data_in_leaf': Int(lower=2, upper=60, default=20),  # TODO: Use size of dataset to set upper, if row count is small upper should be small
	- line 26: 'num_leaves': Int(lower=16, upper=96, default=31),  # TODO: Use row count and feature count to set this, the higher feature count the higher num_leaves upper
	- line 33: # TODO: Bin size max increase


tabular/src/autogluon/tabular/models/tabular_nn/mxnet/embednet.py (3 lines):
	- line 70: class EmbedNet(gluon.Block): # TODO: hybridize?
	- line 148: # TODO: Remove below lines or write logic to switch between using these lines and the multithreaded version once multithreaded version is optimized
	- line 154: # TODO: Optimize below to perform better before using


features/src/autogluon/features/generators/category.py (3 lines):
	- line 16: # TODO: Add hashing trick if minimize_memory=True to avoid storing full original mapping
	- line 17: # TODO: fill_nan add additional options: group_rares, possibly percentile based
	- line 134: X_category[column] = X_category[column].astype(CategoricalDtype(categories=category_list))  # TODO: Remove columns if all NaN after this?


core/src/autogluon/core/models/abstract/abstract_nn_model.py (3 lines):
	- line 16: # TODO: v0.1 clean method
	- line 20: TODO: ensure features with zero variance have already been removed before this function is called.
	- line 29: language_featnames = [] # TODO: not implemented. This should fetch text features present in the data


features/src/autogluon/features/generators/datetime.py (3 lines):
	- line 57: # TODO: Improve handling of missing datetimes
	- line 61: # TODO: Be aware: When converted to float32 by downstream models, the seconds value will be up to 3 seconds off the true time due to rounding error. If seconds matter, find a separate way to generate (Possibly subtract smallest datetime from all values).
	- line 62: # TODO: could also return an extra boolean column is_nan which could provide predictive signal.


features/src/autogluon/features/generators/fillna.py (3 lines):
	- line 13: # TODO: Add fillna_special_map, fillna_combined_map to increase options
	- line 14: # TODO: Add options to specify mean/median/mode for int/float
	- line 15: # TODO: Add fillna_features for feature specific fill values


tabular/src/autogluon/tabular/models/knn/_knn_loo_variants.py (3 lines):
	- line 12: # TODO: Consider contributing to sklearn officially
	- line 13: # TODO: This uses private methods in sklearn, could potentially break without warning in future sklearn releases
	- line 14: # TODO: Code is largely identical to `predict` and `predict_proba` methods, but due to how those methods are coded, we can't call them directly.


common/src/autogluon/common/savers/save_pd.py (3 lines):
	- line 10: # TODO: Update so verbose prints at level 20, and adjust calls to save accordingly
	- line 53: df.to_parquet(path, compression=compression, engine='fastparquet')  # TODO: Might be slower than pyarrow in multiprocessing
	- line 60: s3_utils.delete_s3_prefix(bucket=bucket, prefix=prefix)  # TODO: Might only delete the first 1000!


core/src/autogluon/core/augmentation/distill_utils.py (3 lines):
	- line 53: # return postprocess_augmented(X_aug, X)  # TODO: dropping duplicates is much more efficient, but may skew distribution for entirely-categorical data with few categories.
	- line 67: # TODO: This can easily be optimized heavily
	- line 122: # TODO: Remove or fix, likely doesn't work anymore


features/src/autogluon/features/generators/bulk.py (3 lines):
	- line 14: # TODO: Add parameter to add prefix to each generator to guarantee no name collisions: 'G1_', 'G2_', etc.
	- line 15: # TODO: Add argument keep_unused, which creates an identity feature generator at each stage to pipe unused input features into the next stage instead of dropping them.
	- line 145: # TODO: consider moving to self._remove_features_out


core/src/autogluon/core/utils/feature_selection.py (3 lines):
	- line 121: # TODO: Make this work with unlabelled data
	- line 343: # FIXME: Right now the upper bound on the number of features we evaluate feature importance at once is determined by our expected feature
	- line 413: Assumes baseline validation score has already been computed. TODO: Take into account speedup from parallel feature


core/src/autogluon/core/_setup_utils.py (2 lines):
	- line 22: 'psutil': '>=5.7.3,<5.9',  # TODO: Consider capping to <6.0 instead, capping to 5.9 to avoid possible issues.
	- line 29: # TODO: Use DOCS_PACKAGES and TEST_PACKAGES


tabular/src/autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py (2 lines):
	- line 242: # TODO: this is using unique on X again. Ideally we should integrate
	- line 298: # TODO: maybe integrate this part with the one above


core/src/autogluon/core/utils/early_stopping.py (2 lines):
	- line 37: # TODO: Add time component
	- line 39: # TODO: Incorporate score, rolling window


tabular/src/autogluon/tabular/models/tabular_nn/utils/nn_architecture_utils.py (2 lines):
	- line 44: base_size = max(1, min(num_net_outputs, 100) / 50)  # TODO: Updated because it improved model quality and made training far faster
	- line 46: layer_expansion_factor = 1  # TODO: consider scaling based on num_rows, eg: layer_expansion_factor = 2-np.exp(-max(0,train_dataset.num_examples-10000))


vision/src/autogluon/vision/detector/detector.py (2 lines):
	- line 242: # FIXME: Use ImagePredictor's tuning_data split logic when None, currently this does not perform an ideal split.
	- line 303: # TODO: MXNetErrorCatcher was removed because it didn't return traceback,


core/src/autogluon/core/searcher/local_searcher.py (2 lines):
	- line 40: # FIXME: Don't do this, fix the outer code to not require this
	- line 48: # FIXME: Consider removing


tabular/src/autogluon/tabular/models/catboost/catboost_softclass_utils.py (2 lines):
	- line 30: # TODO: inefficient copy of approxes, targets to np.array from provided UniTuple (required for JIT to work)
	- line 64: # TODO: Consider replacing with C++ implementation (but requires building catboost from source).


core/src/autogluon/core/task/base/base_task.py (2 lines):
	- line 32: # FIXME: REMOVE THIS, first GluonCV needs to stop depending on AG, as it imports this class
	- line 154: )  # TODO: use 'auto' downstream


features/src/autogluon/features/generators/label_encoder.py (2 lines):
	- line 13: # TODO: LabelEncoderTransformer
	- line 32: # TODO: add inplace option?


core/src/autogluon/core/metrics/classification_metrics.py (2 lines):
	- line 274: TODO : Add dedicated confusion_matrix function to AbstractLearner
	- line 338: # TODO Add the "labels" option to metrics that will require the label map.


tabular/src/autogluon/tabular/trainer/auto_trainer.py (2 lines):
	- line 62: # TODO: User could be intending to blend instead. Add support for blend stacking.
	- line 80: # TODO: QUANTILE VERSION?


tabular/src/autogluon/tabular/models/vowpalwabbit/vowpalwabbit_model.py (2 lines):
	- line 114: # TODO: Add Early Stopping support via validation
	- line 211: # TODO: Can be improved further to make it more accurate


common/src/autogluon/common/loaders/load_pd.py (2 lines):
	- line 56: sample_count=sample_count, worker_count=worker_count, multiprocessing_method=multiprocessing_method)  # TODO: Add arguments!
	- line 69: df = pd.read_parquet(path, columns=columns_to_keep, engine='fastparquet')  # TODO: Deal with extremely strange issue resulting from torch being present in package, will cause read_parquet to either freeze or Segmentation Fault when performing multiprocessing


tabular/src/autogluon/tabular/models/catboost/callbacks.py (2 lines):
	- line 128: # FIXME: Avoid using private API! (https://github.com/awslabs/autogluon/issues/1381)
	- line 134: # FIXME: Unsure if this works for custom metrics!


common/src/autogluon/common/features/feature_metadata.py (2 lines):
	- line 68: # TODO: Add valid_names, invalid_names arguments which override all other arguments for the features listed?
	- line 262: # TODO: Add documentation on shared_raw_features usage


tabular/src/autogluon/tabular/models/fastainn/hyperparameters/parameters.py (2 lines):
	- line 4: # TODO this method is generalizable and potentially should be moved out into framework
	- line 19: # TODO: explore/add other hyperparameters like weight decay, use of batch-norm, activation-function choice, etc.


common/src/autogluon/common/savers/save_json.py (2 lines):
	- line 1: # TODO: Standardize / unify this code with ag.save()
	- line 9: # TODO: Support S3 paths


tabular/src/autogluon/tabular/models/tabular_nn/mxnet/tabular_nn_dataset.py (2 lines):
	- line 11: logger = logging.getLogger(__name__)  # TODO: Currently unused
	- line 62: # TODO: The code fixes the crash on mxnet gluon interpreting a single value in a batch incorrectly.


tabular/src/autogluon/tabular/models/tab_transformer/tab_transformer_model.py (2 lines):
	- line 22: TODO: Fix Mac OS X warning spam.
	- line 267: scheduler = optim.lr_scheduler.ExponentialLR(optimizer_embeds, gamma=base_exp_decay) # TODO: Should we be using this in _epoch()?


common/src/autogluon/common/features/types.py (2 lines):
	- line 9: R_BOOL = 'bool'  # TODO: R_BOOL/R_BOOLEAN?
	- line 10: # TODO: R_FLOAT_SPARSE/R_INT_SPARSE/R_CATEGORY_SPARSE?


tabular/src/autogluon/tabular/models/text_prediction/text_prediction_v1_model.py (2 lines):
	- line 131: if sample_weight is not None:  # TODO: support
	- line 200: # TODO: use get_gpu_count_torch() or some better way once torch models are available.


tabular/src/autogluon/tabular/models/knn/knn_rapids_model.py (2 lines):
	- line 11: # FIXME: Benchmarks show that CPU KNN can be trained in ~3 seconds with 0.2 second validation time for CoverType on automlbenchmark (m5.2xlarge)
	- line 15: # TODO: Given this is so fast, consider doing rapid feature pruning


features/src/autogluon/features/generators/drop_unique.py (2 lines):
	- line 13: # TODO: Not necessary to exist after fitting, can just update outer context feature_out/feature_in and then delete this
	- line 33: # TODO: Consider NaN?


common/src/autogluon/common/savers/save_pkl.py (2 lines):
	- line 1: # TODO: Standardize / unify this code with ag.save()
	- line 10: # TODO: object -> obj?


tabular/src/autogluon/tabular/models/tab_transformer/pretexts.py (1 line):
	- line 10: possible TODO: although there is a supervised pretext option below, i.e. pretrain using


tabular/src/autogluon/tabular/models/xgboost/xgboost_utils.py (1 line):
	- line 23: mean_squared_error='rmse', # TODO: not supported from default eval metric. Firstly, use `rmse` referenced by catboost model.


tabular/src/autogluon/tabular/models/lr/lr_rapids_model.py (1 line):
	- line 14: # FIXME: If rapids is installed, normal CPU LinearModel crashes.


core/src/autogluon/core/scheduler/scheduler_factory.py (1 line):
	- line 112: # TODO: re-enable bayesopt after it's been implemented


core/src/autogluon/core/models/_utils.py (1 line):
	- line 5: # TODO: Add more strategies


core/src/autogluon/core/calibrate/temperature_scaling.py (1 line):
	- line 39: # TODO: Could alternatively add epsilon to y_val_probs in order to avoid.


tabular/src/autogluon/tabular/models/tab_transformer/hyperparameters/searchspaces.py (1 line):
	- line 3: # TODO: May have to split search space's by problem type. Not necessary right now.


features/src/autogluon/features/generators/one_hot_encoder.py (1 line):
	- line 104: # TODO: Replace XGBoost, NN, and Linear Model OHE logic with this


core/src/autogluon/core/models/greedy_ensemble/ensemble_selection.py (1 line):
	- line 71: # TODO: Consider having a removal stage, remove each model and see if score is affected, if improves or not affected, remove it.


features/src/autogluon/features/generators/drop_duplicates.py (1 line):
	- line 14: # TODO: Not necessary to exist after fitting, can just update outer context feature_out/feature_in and then delete this


text/src/autogluon/text/text_prediction/mx/models.py (1 line):
	- line 436: # TODO Dynamically cache the preprocessor that has been fitted.


tabular/src/autogluon/tabular/models/tabular_nn/utils/data_preprocessor.py (1 line):
	- line 26: # TODO: Consider avoiding converting to string for improved memory efficiency


features/src/autogluon/features/generators/auto_ml_pipeline.py (1 line):
	- line 16: # TODO: write out in English the full set of transformations that are applied (and eventually host page on website). Also explicitly write out all of the feature-generator "hyperparameters" that might affect the results from the AutoML FeatureGenerator


features/src/autogluon/features/generators/memory_minimize.py (1 line):
	- line 60: # TODO: What about nulls / unknowns?


common/src/autogluon/common/savers/save_pointer.py (1 line):
	- line 7: # TODO: Add S3 support


core/src/autogluon/core/__init__.py (1 line):
	- line 6: # TODO: v0.1 Identify why distributed logs are spammed if not suppressed via the below code


features/src/autogluon/features/generators/binned.py (1 line):
	- line 16: # TODO: Add more parameters (possibly pass in binning function as an argument for full control)


tabular/src/autogluon/tabular/models/vowpalwabbit/vowpalwabbit_utils.py (1 line):
	- line 14: # TODO: Add support for different namespaces


core/src/autogluon/core/models/abstract/model_trial.py (1 line):
	- line 85: # TODO: use sample_weight?


tabular/src/autogluon/tabular/models/lgb/hyperparameters/parameters.py (1 line):
	- line 40: # TODO: Bin size max increase


features/src/autogluon/features/generators/isnan.py (1 line):
	- line 46: # TODO: Try returning bool type instead of uint8


tabular/setup.py (1 line):
	- line 54: # TODO: Consider adding 'skex' to 'all'


tabular/src/autogluon/tabular/configs/presets_configs.py (1 line):
	- line 32: # TODO: Consider HPO-enabled configs if training time doesn't matter but inference latency does.


features/src/autogluon/features/utils.py (1 line):
	- line 38: # TODO: Consider NaN values as a separate value?


vision/src/autogluon/vision/predictor/losses.py (1 line):
	- line 6: #TODO: abstract general loss shared across tasks


features/src/autogluon/features/generators/rename.py (1 line):
	- line 72: 'allow_post_generators': False,  # TODO: This might not be necessary anymore


tabular/src/autogluon/tabular/models/lr/hyperparameters/parameters.py (1 line):
	- line 36: # TODO explore using liblinear for smaller datasets


tabular/src/autogluon/tabular/models/tabular_nn/hyperparameters/parameters.py (1 line):
	- line 17: # TODO: Epochs could take a very long time, we may want smarter logic than simply # of epochs without improvement (slope, difference in score, etc.)