in src/sagemaker_sklearn_extension/feature_extraction/sequences.py [0:0]
def _extract_tsfresh_features(self, X):
    """Extract tsfresh features from ``X`` according to ``self.extraction_type``.

    The minimal feature set is always extracted first; for the "efficient" and
    "all" options the remaining features of the larger set are extracted in a
    second pass and appended as additional columns.

    Side effects: sets ``self.min_settings_card`` (number of minimal feature
    columns) and, for "efficient"/"all" only, ``self.extra_settings_card``
    (number of additional feature columns).

    Parameters
    ----------
    X : input accepted by ``self._convert_to_df``
        NOTE(review): the converted frame is used with an "id" column
        (``column_id``) and a "time" column (``column_sort``), and ids are
        assumed to be integers in ``0..max(id)`` -- confirm against
        ``_convert_to_df``.

    Returns
    -------
    tuple
        ``(tsfresh_features_imputed, X_df)``: the imputed feature frame,
        reindexed to ``RangeIndex(max(id) + 1)``, and the frame returned by
        ``self._convert_to_df(X)``.

    Raises
    ------
    ValueError
        If ``self.extraction_type`` is not one of "minimal", "efficient", "all".
    """
    # Validate the configuration before touching the data, so that a bad
    # ``extraction_type`` is reported immediately and is never masked by an
    # unrelated failure on the conversion / NaN-handling path.
    if self.extraction_type not in ["minimal", "efficient", "all"]:
        raise ValueError(
            f"{self.extraction_type} is not a supported feature extraction option. Please choose one from "
            f"the following options: [minimal, efficient, all]."
        )

    X_df = self._convert_to_df(X)
    X_df_no_nans = X_df.dropna()
    # Corner case: every row contains a NaN, so nothing survived ``dropna``.
    # Fall back to the row labelled 0 with its NaNs replaced by 0 so that the
    # extraction below still has one row to work with.
    if X_df_no_nans.shape[0] == 0:
        X_df_no_nans = X_df.loc[[0]].fillna(0)

    # Arguments shared by both extraction passes.
    # ``impute`` replaces any ``NaNs`` and ``infs`` in the extracted features
    # with median/extreme values for that column.
    extract_kwargs = {
        "column_id": "id",
        "column_sort": "time",
        "impute_function": impute,
        "n_jobs": 0,
    }

    # Minimal features are computed independently to ensure they go first in
    # the output; this is needed to ensure their survival when filtering features.
    min_settings = MinimalFCParameters()
    tsfresh_features = extract_features(X_df_no_nans, default_fc_parameters=min_settings, **extract_kwargs)
    self.min_settings_card = tsfresh_features.shape[1]

    if self.extraction_type in ["efficient", "all"]:
        if self.extraction_type == "efficient":
            settings = EfficientFCParameters()
        else:
            settings = ComprehensiveFCParameters()
        # Skip the calculators already covered by the minimal pass.
        settings = {k: v for k, v in settings.items() if k not in min_settings}
        tsfresh_features_extra = extract_features(X_df_no_nans, default_fc_parameters=settings, **extract_kwargs)
        self.extra_settings_card = tsfresh_features_extra.shape[1]
        tsfresh_features = pd.concat([tsfresh_features, tsfresh_features_extra], axis=1)

    # If X_df.dropna() dropped some observations entirely (i.e., due to all NaNs),
    # reindexing creates all-NaN rows for them; impute each tsfresh feature for
    # those observations with the median of that tsfresh feature.
    all_ids = pd.RangeIndex(X_df["id"].max() + 1)
    tsfresh_features_imputed = impute(tsfresh_features.reindex(all_ids))
    return tsfresh_features_imputed, X_df