in nni/algorithms/feature_engineering/gradient_selector/fginitialize.py [0:0]
def __init__(self,
path_data=None,
data_format=constants.DataFormat.NUMPY,
D=None, N=None,
classification=True,
ordinal=False,
balanced=True,
preprocess=None,
n_to_estimate=None,
MAXMEMGB=syssettings.MAXMEMGB,
set_params=True,
path_mappings=None,
X=None,
y=None,
verbose=0,
n_classes=None,
device=constants.Device.CPU):
"""
Dataset class with helpful features and functions for use in a DataLoader
and for managing memory usage.
Can read the following formats:
svm: SVMlight format (sklearn.datasets.load_svmlight_file)
numpy: pass X and y as numpy or scipy sparse arrays
Assumes:
1. if classification, y is in {-1, 1} (binary) or consecutive integers
starting at 0 (multiclass)
2. y can fit into memory
3. consecutive calls to __getitem__() have consecutive idx values
Notes:
1. this implementation is not careful about precise memory requirements;
for example, being able to store one dense row in memory is necessary,
but not sufficient.
2. for y with 4.2 billion elements, 31.3 GB of memory is necessary
at 8 bytes per scalar. Use partial fit to avoid loading the entire
dataset at once.
3. disk_size always refers to the size of the complete data file, even
after a split().
See the Examples section below for a minimal numpy-mode usage sketch.
Parameters
----------
path_data : str
Path to load data from
data_format : str
Format of the data file at path_data.
"numpy" is the default and is used when X and y are passed in directly.
D : int
Number of features.
N : int
Number of rows.
classification : bool
If True, problem is classification, else regression.
ordinal : bool
If True, problem is ordinal classification. Requires classification to be True.
balanced : bool
If True, each class is weighted equally in optimization; otherwise
weighting is done via the support of each class. Requires classification to be True.
preprocess : str
'zscore' centers the data and normalizes it to unit variance;
'center' only centers the data to zero mean.
n_to_estimate : int
Number of rows used to estimate the data statistics.
MAXMEMGB : float
Maximum allowable size of a minibatch, in GB.
set_params : bool
Whether or not to determine the statistics of the dataset
path_mappings : str
Used when streaming from disk
X : array-like
Shape = [n_samples, n_features]
The training input samples.
y : array-like
Shape = [n_samples]
The target values (class labels in classification, real numbers in
regression).
verbose : int
Controls the verbosity when fitting. Set to 0 for no printing,
or 1 or higher to print every `verbose` number of gradient steps.
n_classes : int
Number of classes.
device : str
'cpu' to run on CPU and 'cuda' to run on GPU. Runs much faster on GPU.
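
Examples
--------
A minimal numpy-mode usage sketch. It assumes this is the constructor of
the `PrepareData` dataset class defined in this module; the data below is
purely illustrative.

>>> import numpy as np
>>> X = np.random.randn(100, 10)
>>> y = np.random.randint(0, 2, size=100) * 2 - 1  # binary labels in {-1, 1}
>>> data = PrepareData(X=X, y=y,
...                    data_format=constants.DataFormat.NUMPY,
...                    classification=True, n_classes=2)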
"""
self.path_data = path_data
if self.path_data:
self.disk_size = os.path.getsize(path_data)
else:
assert X is not None, 'X must be specified if no path data'
self.disk_size = (X.data.nbytes if scipy.sparse.issparse(X)
                  else X.nbytes)
assert data_format in constants.DataFormat.ALL_FORMATS, 'Format must be in {0}.'.format(
", ".join(constants.DataFormat.ALL_FORMATS))
self.format = data_format
self.classification = classification
self.ordinal = ordinal
self.balanced = balanced
self.MAXMEMGB = MAXMEMGB
self.preprocess = preprocess
self.set_params = set_params
self.verbose = verbose
self.n_classes = n_classes
self.device = device
self.path_data_stats = None
if D is None:
assert self.disk_size / BYTESPERGB <= self.MAXMEMGB, \
'Cannot load data into memory. Supply D.'
if self.format == constants.DataFormat.SVM:
self.X, self.y = load_svmlight_file(path_data)
elif self.format == constants.DataFormat.NUMPY:
assert X is not None, 'X must be specified in numpy mode'
assert y is not None, 'y must be specified in numpy mode'
self.X = X
self.y = y
if self.n_classes is None:
self.n_classes = np.unique(y).shape[0]
elif self.classification:
assert self.n_classes >= np.unique(y).shape[0], \
'n_classes given must be greater than or equal to the number of classes in y'
else:
raise NotImplementedError
self.y = torch.as_tensor(self.y, dtype=torch.get_default_dtype())
self.N, self.D = self.X.shape
# assumes X was returned as a sparse array
self.storage_level = (constants.StorageLevel.SPARSE
if scipy.sparse.issparse(self.X)
else constants.StorageLevel.DENSE)
else:
assert N is not None, 'Supply N.'
self.N, self.D = N, D
# assume sparse matrix cannot fit into memory
self.storage_level = constants.StorageLevel.DISK
self.dense_size_gb = self.get_dense_size()
# check dense size
self.set_dense_X()
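# rows per minibatch = (MAXMEMGB GB * BYTESPERGB bytes/GB) / (BYTESPERREAL bytes/scalar * D scalars/row)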
self.max_rows = int(self.MAXMEMGB * BYTESPERGB / BYTESPERREAL / self.D)
assert self.max_rows, \
'Cannot fit one dense row into %g GB of memory.' % self.MAXMEMGB
self.max_rows = self.max_batch_size()
sys.stdout.flush()
if n_to_estimate is None:
self.n_to_estimate = self.max_batch_size()
else:
assert n_to_estimate <= self.N, 'n_to_estimate must be <= N.'
self.n_to_estimate = n_to_estimate
# initialize disk loader
if self.storage_level == constants.StorageLevel.DISK and self.set_params:
if self.format == constants.DataFormat.SVM:
raise NotImplementedError(
'Please use partial fit to train on datasets that do not fit in memory')
else:
raise NotImplementedError
# TODO: use a passed-in RNG here
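# random subsample of row indices used to estimate the data statistics below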
self.ix_statistics = np.random.permutation(self.N)[:self.n_to_estimate]
self.n_features = self.D
if self.set_params:
if self.verbose:
print('Finding data statistics...', end='')
sys.stdout.flush()
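# Xmn/Xsd and ymn/ysd are centering/scaling statistics for X and y;
# sv1 appears to be a leading singular value/vector used in preprocessing (name inferred)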
Xmn, sv1, Xsd, ymn, ysd = self.compute_data_stats()
self.set_data_stats(Xmn, sv1, Xsd, ymn, ysd)
if self.verbose:
print()
self.set_return_raw(False)
else:
self.set_return_raw(True)
self.set_return_np(False)
# this needs to occur after setting preprocessing params
if (self.storage_level == constants.StorageLevel.DISK and
self.format == constants.DataFormat.SVM and self.set_params):
self.loader.batchsize = 1