in graspologic/cluster/gclust.py [0:0]
def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "GaussianCluster":
"""
Fits a Gaussian mixture model to the data.
Estimate model parameters with the EM algorithm.

Parameters
----------
X : array-like, shape (n_samples, n_features)
List of n_features-dimensional data points. Each row
corresponds to a single data point.
y : array-like, shape (n_samples,), optional (default=None)
List of labels for X if available. Used to compute
ARI scores.

Returns
-------
self
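
Examples
--------
A minimal sketch of the intended call pattern, assuming the
``min_components``/``max_components`` constructor arguments used
elsewhere in this class; the data here is synthetic.

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> X = rng.normal(size=(100, 2))
>>> gc = GaussianCluster(min_components=1, max_components=3).fit(X)
>>> gc.n_components_  # best number of components by BIC  # doctest: +SKIP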
"""
# Deal with number of clusters
if self.max_components is None:
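# With max_components unset, sweep from a single component up to
# min_components, which doubles as the upper bound of the search.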
lower_ncomponents = 1
upper_ncomponents = self.min_components
else:
lower_ncomponents = self.min_components
upper_ncomponents = self.max_components
n_mixture_components = upper_ncomponents - lower_ncomponents + 1
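# Guard against asking for more mixture components than there are samples.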
if upper_ncomponents > X.shape[0]:
if self.max_components is None:
msg = "if max_components is None then min_components must be >= "
msg += "n_samples, but min_components = {}, n_samples = {}".format(
upper_ncomponents, X.shape[0]
)
else:
msg = "max_components must be >= n_samples, but max_components = "
msg += "{}, n_samples = {}".format(upper_ncomponents, X.shape[0])
raise ValueError(msg)
elif lower_ncomponents > X.shape[0]:
msg = "min_components must be <= n_samples, but min_components = "
msg += "{}, n_samples = {}".format(upper_ncomponents, X.shape[0])
raise ValueError(msg)
# Get parameters
random_state = self.random_state
param_grid_values = dict(
covariance_type=self.covariance_type,
n_components=range(lower_ncomponents, upper_ncomponents + 1),
tol=[self.tol],
reg_covar=[self.reg_covar],
max_iter=[self.max_iter],
n_init=[self.n_init],
init_params=[self.init_params],
random_state=[random_state],
)
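# The grid is effectively the Cartesian product of covariance types and
# candidate component counts; every other GMM setting is held fixed.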
param_grid = list(ParameterGrid(param_grid_values))
models: List[List[GaussianMixture]] = [[] for _ in range(n_mixture_components)]
bics: List[List[float]] = [[] for _ in range(n_mixture_components)]
aris: List[List[float]] = [[] for _ in range(n_mixture_components)]
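# Fit one GaussianMixture per grid point. i % n_mixture_components maps
# the flat grid index back to the candidate component count, so each row
# of models/bics/aris collects one entry per covariance type.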
for i, params in enumerate(param_grid):
model = GaussianMixture(**params)
model.fit(X)
models[i % n_mixture_components].append(model)
bics[i % n_mixture_components].append(model.bic(X))
if y is not None:
predictions = model.predict(X)
aris[i % n_mixture_components].append(
adjusted_rand_score(y, predictions)
)
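# Tabulate the selection criteria: rows are candidate component counts,
# columns are covariance types.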
self.bic_ = pd.DataFrame(
bics,
index=np.arange(lower_ncomponents, upper_ncomponents + 1),
columns=self.covariance_type,
)
if y is not None:
self.ari_ = pd.DataFrame(
aris,
index=np.arange(lower_ncomponents, upper_ncomponents + 1),
columns=self.covariance_type,
)
else:
self.ari_ = None
# Get the best cov type and its index within the dataframe
best_covariance = self.bic_.min(axis=0).idxmin()
best_covariance_idx = self.covariance_type.index(best_covariance)
# Get the index best component for best_covariance
best_component = self.bic_.idxmin()[best_covariance]
self.n_components_ = best_component
self.covariance_type_ = best_covariance
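# Look up the fitted model at the winning (component count, covariance
# type) cell; rows of `models` start at lower_ncomponents.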
self.model_ = models[best_component - lower_ncomponents][best_covariance_idx]
return self