in graspologic/cluster/gclust.py [0:0]
def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "GaussianCluster":
"""
Fits a Gaussian mixture model to the data.
Estimate model parameters with the EM algorithm.

Parameters
----------
X : array-like, shape (n_samples, n_features)
List of n_features-dimensional data points. Each row
corresponds to a single data point.
y : array-like, shape (n_samples,), optional (default=None)
List of labels for X if available. Used to compute
ARI scores.

Returns
-------
self
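
Examples
--------
A minimal sketch of the intended call pattern, assuming the
``min_components``/``max_components`` constructor arguments used
elsewhere in this class; the data here is synthetic.

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> X = rng.normal(size=(100, 2))
>>> gc = GaussianCluster(min_components=1, max_components=3).fit(X)
>>> gc.n_components_  # best number of components by BIC  # doctest: +SKIP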
"""
# Deal with number of clusters
if self.max_components is None:
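# With max_components unset, sweep from a single component up to
# min_components, which doubles as the upper bound of the search.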
lower_ncomponents = 1
upper_ncomponents = self.min_components
else:
lower_ncomponents = self.min_components
upper_ncomponents = self.max_components
n_mixture_components = upper_ncomponents - lower_ncomponents + 1
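# Guard against asking for more mixture components than there are samples.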
if upper_ncomponents > X.shape[0]:
if self.max_components is None:
msg = "if max_components is None then min_components must be >= "
msg += "n_samples, but min_components = {}, n_samples = {}".format(
upper_ncomponents, X.shape[0]
)
else:
msg = "max_components must be >= n_samples, but max_components = "
msg += "{}, n_samples = {}".format(upper_ncomponents, X.shape[0])
raise ValueError(msg)
elif lower_ncomponents > X.shape[0]:
msg = "min_components must be <= n_samples, but min_components = "
msg += "{}, n_samples = {}".format(upper_ncomponents, X.shape[0])
raise ValueError(msg)
# Get parameters
random_state = self.random_state
param_grid_values = dict(
covariance_type=self.covariance_type,
n_components=range(lower_ncomponents, upper_ncomponents + 1),
tol=[self.tol],
reg_covar=[self.reg_covar],
max_iter=[self.max_iter],
n_init=[self.n_init],
init_params=[self.init_params],
random_state=[random_state],
)
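# The grid is effectively the Cartesian product of covariance types and
# candidate component counts; every other GMM setting is held fixed.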
param_grid = list(ParameterGrid(param_grid_values))
models: List[List[GaussianMixture]] = [[] for _ in range(n_mixture_components)]
bics: List[List[float]] = [[] for _ in range(n_mixture_components)]
aris: List[List[float]] = [[] for _ in range(n_mixture_components)]
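# Fit one GaussianMixture per grid point. i % n_mixture_components maps
# the flat grid index back to the candidate component count, so each row
# of models/bics/aris collects one entry per covariance type.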
for i, params in enumerate(param_grid):
model = GaussianMixture(**params)
model.fit(X)
models[i % n_mixture_components].append(model)
bics[i % n_mixture_components].append(model.bic(X))
if y is not None:
predictions = model.predict(X)
aris[i % n_mixture_components].append(
adjusted_rand_score(y, predictions)
)
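# Tabulate the selection criteria: rows are candidate component counts,
# columns are covariance types.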
self.bic_ = pd.DataFrame(
bics,
index=np.arange(lower_ncomponents, upper_ncomponents + 1),
columns=self.covariance_type,
)
if y is not None:
self.ari_ = pd.DataFrame(
aris,
index=np.arange(lower_ncomponents, upper_ncomponents + 1),
columns=self.covariance_type,
)
else:
self.ari_ = None
# Get the best cov type and its index within the dataframe
best_covariance = self.bic_.min(axis=0).idxmin()
best_covariance_idx = self.covariance_type.index(best_covariance)
# Get the index best component for best_covariance
best_component = self.bic_.idxmin()[best_covariance]
self.n_components_ = best_component
self.covariance_type_ = best_covariance
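# Look up the fitted model at the winning (component count, covariance
# type) cell; rows of `models` start at lower_ncomponents.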
self.model_ = models[best_component - lower_ncomponents][best_covariance_idx]
return self