in nevergrad/functions/ml/mlfunctionlib.py
def make_dataset(self, data_dimension: tp.Optional[int], dataset: str) -> None:
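    """Fill the train / test / cross-validation attributes
    (X_train, y_train, X_test, y_test and the *_cv lists) for the requested dataset.

    data_dimension is only used by the "artificial*" datasets and must be None otherwise.
    """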
# Filling datasets.
rng = self.parametrization.random_state
if not dataset.startswith("artificial"):
assert dataset in ["boston", "diabetes", "kerasBoston", "auto-mpg", "red-wine", "white-wine"]
assert data_dimension is None
sets_url = {
"auto-mpg": "http://www-lisic.univ-littoral.fr/~teytaud/files/Cours/Apprentissage/data/auto-mpg.data",
"red-wine": "http://www-lisic.univ-littoral.fr/~teytaud/files/Cours/Apprentissage/data/winequality-red.csv",
"white-wine": "http://www-lisic.univ-littoral.fr/~teytaud/files/Cours/Apprentissage/data/winequality-white.csv",
}
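        # Name of the regression target column in each CSV dataset.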
sets_tag = {"auto-mpg": "mpg", "red-wine": "quality", "white-wine": "quality"}
if dataset == "kerasBoston":
try:
from tensorflow import keras # pylint: disable=import-outside-toplevel
except ImportError as e:
                raise ImportError(
                    "Please install tensorflow (pip install tensorflow) to use keras ml tuning"
                ) from e
data = keras.datasets.boston_housing
elif dataset in sets_tag:
data = pd.read_csv(sets_url[dataset])
else:
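            # NB: sklearn.datasets.load_boston was removed in scikit-learn 1.2;
            # this branch assumes an older scikit-learn is installed.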
data = {"boston": sklearn.datasets.load_boston, "diabetes": sklearn.datasets.load_diabetes,}[
dataset
](return_X_y=True)
# Half the dataset for training.
test_ratio = 0.5
if dataset == "kerasBoston":
(self.X_train, self.y_train), (self.X_test, self.y_test) = data.load_data(
test_split=test_ratio, seed=42
)
elif dataset in sets_url:
if dataset == "auto-mpg":
data.drop("name", 1, inplace=True)
train, test = train_test_split(data, test_size=test_ratio)
self.y_train = train[sets_tag[dataset]].to_numpy()
self.y_test = test[sets_tag[dataset]].to_numpy()
del train[sets_tag[dataset]]
del test[sets_tag[dataset]]
self.X_train = train.to_numpy()
self.X_test = test.to_numpy()
else:
rng.shuffle(data[0].T) # We randomly shuffle the columns.
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
data[0], data[1], test_size=test_ratio, random_state=42
)
        self.num_data = len(self.X_train)
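        # Build the cross-validation folds from the training set.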
        kf = KFold(n_splits=self._cross_val_num)
for train_index, valid_index in kf.split(self.X_train):
self.X_train_cv += [self.X_train[train_index]]
self.y_train_cv += [self.y_train[train_index]]
self.X_valid_cv += [self.X_train[valid_index]]
self.y_valid_cv += [self.y_train[valid_index]]
return
    assert data_dimension is not None, f"Problem with {dataset} in dimension {data_dimension}"
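    # Artificial datasets: inputs form an evenly spaced, shuffled grid on [0, 1),
    # and labels are the per-row sum of the target function over all coordinates.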
# Training set.
X = np.arange(0.0, 1.0, 1.0 / (self.num_data * data_dimension))
X = X.reshape(-1, data_dimension)
rng.shuffle(X)
target_function: tp.Callable[[np.ndarray], np.ndarray] = { # type: ignore
"artificial": np.sin,
"artificialcos": np.cos,
"artificialsquare": np.square,
}[dataset]
    y = np.sum(target_function(X), axis=1).ravel()
self.X_train = X # Training set.
self.y_train = y # Labels of the training set.
# We generate the cross-validation subsets.
for cv in range(self._cross_val_num):
# Training set.
X_train_cv = X[np.arange(self.num_data) % self._cross_val_num != cv].copy()
y_train_cv = np.sum(target_function(X_train_cv), axis=1).ravel()
self.X_train_cv += [X_train_cv]
self.y_train_cv += [y_train_cv]
# Validation set or test set (noise_free is True for test set).
X_valid_cv = X[np.arange(self.num_data) % self._cross_val_num == cv].copy()
X_valid_cv = X_valid_cv.reshape(-1, data_dimension)
y_valid_cv = np.sum(target_function(X_valid_cv), axis=1).ravel()
self.X_valid_cv += [X_valid_cv]
self.y_valid_cv += [y_valid_cv]
# We also generate the test set.
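    # NB: the reshape below requires 60000 to be a multiple of data_dimension.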
X_test = np.arange(0.0, 1.0, 1.0 / 60000)
rng.shuffle(X_test)
X_test = X_test.reshape(-1, data_dimension)
y_test = np.sum(target_function(X_test), axis=1).ravel()
self.X_test = X_test
self.y_test = y_test