def _extract_tsfresh_features()

in src/sagemaker_sklearn_extension/feature_extraction/sequences.py [0:0]


    def _extract_tsfresh_features(self, X):
        """Extract tsfresh features for the sequences in ``X``.

        The minimal feature set is always extracted first. For the "efficient" and "all"
        extraction types, the remaining (non-minimal) feature calculators are extracted in a
        second pass and appended as additional columns to the right of the minimal ones.

        Side effects: sets ``self.min_settings_card`` to the number of minimal feature columns
        and, for "efficient" / "all" only, ``self.extra_settings_card`` to the number of
        additional feature columns.

        Parameters
        ----------
        X : input accepted by ``self._convert_to_df``

        Returns
        -------
        tuple
            ``(tsfresh_features_imputed, X_df)``: the imputed feature dataframe reindexed to
            ``0 .. X_df["id"].max()``, and the dataframe produced by ``self._convert_to_df``.

        Raises
        ------
        ValueError
            If ``self.extraction_type`` is not one of "minimal", "efficient" or "all".
        """
        X_df = self._convert_to_df(X)

        # Only rows without NaNs are handed to tsfresh.
        complete_rows = X_df.dropna()
        if complete_rows.shape[0] == 0:
            # Corner case: every row contains a NaN. Fall back to the row labelled 0
            # with its NaNs replaced by zeros.
            complete_rows = X_df.loc[[0]].fillna(0)

        supported_types = ("minimal", "efficient", "all")
        if self.extraction_type not in supported_types:
            raise ValueError(
                f"{self.extraction_type} is not a supported feature extraction option. Please choose one from "
                f"the following options: [minimal, efficient, all]."
            )

        def run_extraction(fc_parameters):
            # Extract time series features from the NaN-free dataframe. Passing ``impute`` replaces any
            # NaNs and infs in the extracted features with median/extreme values for that column.
            return extract_features(
                complete_rows,
                default_fc_parameters=fc_parameters,
                column_id="id",
                column_sort="time",
                impute_function=impute,
                n_jobs=0,
            )

        # Minimal features are computed independently so that they come first in the output;
        # this is needed to ensure their survival when features are filtered later.
        minimal_params = MinimalFCParameters()
        features = run_extraction(minimal_params)
        self.min_settings_card = features.shape[1]

        if self.extraction_type in ("efficient", "all"):
            full_params = EfficientFCParameters() if self.extraction_type == "efficient" else ComprehensiveFCParameters()
            # Keep only the calculators that the minimal pass has not already covered.
            extra_params = {name: args for name, args in full_params.items() if name not in minimal_params}
            extra_features = run_extraction(extra_params)
            self.extra_settings_card = extra_features.shape[1]
            features = pd.concat([features, extra_features], axis=1)

        # If dropna() removed some observations entirely (i.e., all of their rows had NaNs), reindexing
        # re-creates them as NaN rows, and impute() then fills each tsfresh feature for those
        # observations with the median of that feature.
        observation_count = X_df["id"].max() + 1
        all_observations = pd.RangeIndex(observation_count)
        tsfresh_features_imputed = impute(features.reindex(all_observations))
        return tsfresh_features_imputed, X_df