in python/pipelines/components/python/component.py [0:0]
def _objective(trial):
    # Optuna objective: sample hyperparameters, fit a clustering model, and
    # score it with the silhouette coefficient. Relies on the enclosing scope
    # for `training_dataset_df`, `min_num_clusters`, `max_num_clusters`,
    # `_create_model`, and sklearn.metrics.silhouette_score.
    # Define the hyperparameter search space.
    # Tuning with scikit-learn and Optuna goes beyond what BigQuery ML's
    # built-in hyperparameter tuning supports:
    # https://cloud.google.com/bigquery/docs/hp-tuning-overview
    params = {
        "n_clusters": trial.suggest_int("n_clusters", min_num_clusters, max_num_clusters),
        "max_iter": trial.suggest_int("max_iter", 10, 1000, step=10),
        "tol": trial.suggest_float("tol", 1e-6, 1e-2, step=1e-6),
    }
    model = _create_model(params)
    model.fit(training_dataset_df)
    labels = model.predict(training_dataset_df)
    # Compute the silhouette score in the transformed feature space, i.e. the
    # same space in which the clusters were fit. Return both the score (to
    # maximize) and the cluster count, so the study can trade them off.
    return silhouette_score(
        X=model.named_steps['transform'].transform(training_dataset_df),
        labels=labels,
        metric='euclidean',
        # Score on the full training dataset by default, since we want an
        # accurate value. If computing the silhouette score takes too long,
        # cap it at 10,000 samples: the computation is quadratic in sample
        # size, so large datasets get expensive quickly.
        sample_size=None if len(training_dataset_df) < 10_000 else 10_000,
        random_state=42
    ), params['n_clusters']
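
For context, here is a minimal sketch of how this objective could be wired up. The `named_steps['transform']` lookup above implies that `_create_model` returns a scikit-learn Pipeline with a step named "transform" feeding the estimator; the StandardScaler, the KMeans estimator, and the trial count below are illustrative assumptions, not taken from the source.

import optuna
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def _create_model(params):
    # Hypothetical sketch: a "transform" step (assumed here to be a
    # StandardScaler) followed by a KMeans estimator configured from the
    # sampled trial parameters.
    return Pipeline([
        ("transform", StandardScaler()),
        ("estimator", KMeans(
            n_clusters=params["n_clusters"],
            max_iter=params["max_iter"],
            tol=params["tol"],
            random_state=42,
        )),
    ])

# Because _objective returns two values, the study must be multi-objective:
# maximize the silhouette score while, for example, minimizing cluster count.
study = optuna.create_study(directions=["maximize", "minimize"])
study.optimize(_objective, n_trials=50)  # n_trials is illustrative

# study.best_trials holds the Pareto front; pick, e.g., the trial with the
# fewest clusters among the Pareto-optimal ones.
best = min(study.best_trials, key=lambda t: t.values[1])
print(best.params, best.values)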