in nni/algorithms/feature_engineering/gradient_selector/fginitialize.py [0:0]
def __init__(self,
path_data=None,
data_format=constants.DataFormat.NUMPY,
D=None, N=None,
classification=True,
ordinal=False,
balanced=True,
preprocess=None,
n_to_estimate=None,
MAXMEMGB=syssettings.MAXMEMGB,
set_params=True,
path_mappings=None,
X=None,
y=None,
verbose=0,
n_classes=None,
device=constants.Device.CPU):
"""
Dataset class with helpful features and functions for use in a DataLoader
and for managing memory usage.
Can read the following formats:
svm: SVMlight format (sklearn.datasets.load_svmlight_file)
numpy: pass X and y as numpy or scipy sparse arrays
Assumes:
1. if classification, y is in {-1, 1} (binary) or consecutive integers
starting at 0 (multiclass)
2. y can fit into memory
3. consecutive calls to __getitem__() have consecutive idx values
Notes:
1. this implementation is not careful about precise memory requirements;
for example, being able to store one dense row in memory is necessary,
but not sufficient.
2. for y with 4.2 billion elements, 31.3 GB of memory is necessary
at 8 bytes per scalar. Use partial fit to avoid loading the entire
dataset at once.
3. disk_size always refers to the size of the complete data file, even
after a split().
See the Examples section below for a minimal numpy-mode usage sketch.
Parameters
----------
path_data : str
Path to load data from
data_format : str
Format of the data file at path_data.
"numpy" is the default and is used when X and y are passed in directly.
D : int
Number of features.
N : int
Number of rows.
classification : bool
If True, problem is classification, else regression.
ordinal : bool
If True, problem is ordinal classification. Requires classification to be True.
balanced : bool
If True, each class is weighted equally in optimization; otherwise
weighting is done via the support of each class. Requires classification to be True.
preprocess : str
'zscore' centers the data and normalizes it to unit variance;
'center' only centers the data to zero mean.
n_to_estimate : int
Number of rows used to estimate the data statistics.
MAXMEMGB : float
Maximum allowable size of a minibatch, in GB.
set_params : bool
Whether or not to determine the statistics of the dataset
path_mappings : str
Used when streaming from disk
X : array-like
Shape = [n_samples, n_features]
The training input samples.
y : array-like
Shape = [n_samples]
The target values (class labels in classification, real numbers in
regression).
verbose : int
Controls the verbosity when fitting. Set to 0 for no printing,
or 1 or higher to print every `verbose` number of gradient steps.
n_classes : int
Number of classes.
device : str
'cpu' to run on CPU and 'cuda' to run on GPU. Runs much faster on GPU.
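
Examples
--------
A minimal numpy-mode usage sketch. It assumes this is the constructor of
the `PrepareData` dataset class defined in this module; the data below is
purely illustrative.

>>> import numpy as np
>>> X = np.random.randn(100, 10)
>>> y = np.random.randint(0, 2, size=100) * 2 - 1  # binary labels in {-1, 1}
>>> data = PrepareData(X=X, y=y,
...                    data_format=constants.DataFormat.NUMPY,
...                    classification=True, n_classes=2)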
"""
self.path_data = path_data
if self.path_data:
self.disk_size = os.path.getsize(path_data)
else:
assert X is not None, 'X must be specified if no path data'
self.disk_size = (X.data.nbytes if scipy.sparse.issparse(X)
                  else X.nbytes)
assert data_format in constants.DataFormat.ALL_FORMATS, 'Format must be in {0}.'.format(
", ".join(constants.DataFormat.ALL_FORMATS))
self.format = data_format
self.classification = classification
self.ordinal = ordinal
self.balanced = balanced
self.MAXMEMGB = MAXMEMGB
self.preprocess = preprocess
self.set_params = set_params
self.verbose = verbose
self.n_classes = n_classes
self.device = device
self.path_data_stats = None
if D is None:
assert self.disk_size / BYTESPERGB <= self.MAXMEMGB, \
'Cannot load data into memory. Supply D.'
if self.format == constants.DataFormat.SVM:
self.X, self.y = load_svmlight_file(path_data)
elif self.format == constants.DataFormat.NUMPY:
assert X is not None, 'X must be specified in numpy mode'
assert y is not None, 'y must be specified in numpy mode'
self.X = X
self.y = y
if self.n_classes is None:
self.n_classes = np.unique(y).shape[0]
elif self.classification:
assert self.n_classes >= np.unique(y).shape[0], \
'n_classes given must be greater than or equal to the number of classes in y'
else:
raise NotImplementedError
self.y = torch.as_tensor(self.y, dtype=torch.get_default_dtype())
self.N, self.D = self.X.shape
# assumes X was returned as a sparse array
self.storage_level = (constants.StorageLevel.SPARSE
if scipy.sparse.issparse(self.X)
else constants.StorageLevel.DENSE)
else:
assert N is not None, 'Supply N.'
self.N, self.D = N, D
# assume sparse matrix cannot fit into memory
self.storage_level = constants.StorageLevel.DISK
self.dense_size_gb = self.get_dense_size()
# check dense size
self.set_dense_X()
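# rows per minibatch = (MAXMEMGB GB * BYTESPERGB bytes/GB) / (BYTESPERREAL bytes/scalar * D scalars/row)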
self.max_rows = int(self.MAXMEMGB * BYTESPERGB / BYTESPERREAL / self.D)
assert self.max_rows, \
'Cannot fit one dense row into %g GB of memory.' % self.MAXMEMGB
self.max_rows = self.max_batch_size()
sys.stdout.flush()
if n_to_estimate is None:
self.n_to_estimate = self.max_batch_size()
else:
assert n_to_estimate <= self.N, 'n_to_estimate must be <= N.'
self.n_to_estimate = n_to_estimate
# initialize disk loader
if self.storage_level == constants.StorageLevel.DISK and self.set_params:
if self.format == constants.DataFormat.SVM:
raise NotImplementedError(
'Please use partial fit to train on datasets that do not fit in memory')
else:
raise NotImplementedError
# TODO: use a passed-in RNG here
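# random subsample of row indices used to estimate the data statistics below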
self.ix_statistics = np.random.permutation(self.N)[:self.n_to_estimate]
self.n_features = self.D
if self.set_params:
if self.verbose:
print('Finding data statistics...', end='')
sys.stdout.flush()
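# Xmn/Xsd and ymn/ysd are centering/scaling statistics for X and y;
# sv1 appears to be a leading singular value/vector used in preprocessing (name inferred)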
Xmn, sv1, Xsd, ymn, ysd = self.compute_data_stats()
self.set_data_stats(Xmn, sv1, Xsd, ymn, ysd)
if self.verbose:
print()
self.set_return_raw(False)
else:
self.set_return_raw(True)
self.set_return_np(False)
# this needs to occur after setting preprocessing params
if (self.storage_level == constants.StorageLevel.DISK and
self.format == constants.DataFormat.SVM and self.set_params):
self.loader.batchsize = 1