in src/sagemaker_sklearn_extension/preprocessing/data.py [0:0]
def _build_combinations(self, n_features, random_state):
    """Choose the column index pairs whose products are appended to the input data.

    The number of pairs returned is capped by ``max_n_features`` minus the original columns (and minus one more
    when ``include_bias`` is set). Squared pairs ``(i, i)`` are served first, unless ``interaction_only`` is
    ``True``, in which case none are produced. When every squared pair fits they are emitted in column order;
    when only some fit, a random subset is drawn without replacement. Whatever budget is left over goes to the
    interaction pairs ``(i, j)``: all of them in ``itertools.combinations`` order when they fit, otherwise a
    random subset obtained by shuffling the full list and keeping its head.

    Parameters
    ----------
    n_features : int
        The number of columns in the input vector.

    random_state : RandomState
        The prepared (using ``check_random_state``) ``RandomState`` instance. It is only consulted when a
        sub-sample has to be drawn.

    Returns
    -------
    list of tuple
        Squared pairs followed by interaction pairs; each entry is a 2-tuple of column indices.

    Raises
    ------
    ValueError
        If ``max_n_features`` leaves no room for any new feature beyond the original columns (and the bias
        column, when requested).
    """
    # Room left for brand-new columns once the originals and the optional bias column are accounted for.
    budget = self.max_n_features - n_features - int(self.include_bias)
    if budget <= 0:
        bias_note = " and bias column" if self.include_bias else ""
        raise ValueError(
            "max_n_features must be large enough for the output to contain more than the original dataset" + bias_note
        )

    # Squared features get first claim on the budget; interactions receive the remainder.
    if self.interaction_only:
        n_squares = 0
    else:
        n_squares = min(budget, n_features)
    n_interactions = max(0, budget - n_squares)

    # Decide which columns get squared.
    if n_squares == n_features:
        # Everything fits, so keep the natural column order and leave the random state untouched.
        squared_columns = range(n_features)
    elif n_squares > 0:
        # Only part of them fits: draw distinct columns at random.
        squared_columns = random_state.choice(range(n_features), size=n_squares, replace=False)
    else:
        squared_columns = ()
    pairs = [(column, column) for column in squared_columns]

    # Append interaction pairs, sub-sampling them when they exceed their share of the budget.
    if n_interactions > 0:
        cross_pairs = list(combinations(range(n_features), 2))
        if len(cross_pairs) > n_interactions:
            random_state.shuffle(cross_pairs)
            del cross_pairs[n_interactions:]
        pairs.extend(cross_pairs)

    return pairs