def load_dataset_from_np()

in community-artifacts/Deep-learning/Utilities/madlib_image_loader.py


    def load_dataset_from_np(self, data_x, data_y, table_name=None,
                             append=False, label_datatype='TEXT'):
        """
        Loads a numpy array into db.  For append=False, creates a new table and
            loads the data.  For append=True, appends data to existing table.
            Throws an exception if append=False and table_name already exists,
            or if append=True and table_name does not exist.  Makes use of
            worker processes initialized during ImageLoader object creation to
            load in parallel.
        @data_x independent variable data, a numpy array of images.  Size of
            first dimension is number of images.  Rest of dimensions determined
            by image resolution and number of channels.
        @data_y dependent variable data (image classes), as an numpy array
        @table_name Name of table in db to load data into
        @append Whether to create a new table (False) or append to an existing
            one (True).  If unspecified, default is False
        @label_datatype: If set will create table with the the column 'y' set
            to the datatype specified. Default is set to TEXT
        """
        start_time = time.time()
        self.mother = True        # mark this as the parent ("mother") process
        self.from_disk = False    # loading from in-memory arrays, not files
        self.append = append
        self.label_datatype = label_datatype

        if table_name:
            self.table_name = table_name

        if not self.table_name:
            raise ValueError("Must specify table_name either in ImageLoader"
                             " constructor or in load_dataset_from_np params!")

        # Flatten labels only for arrays with shape (n,1) or (1,n), since these
        # shapes can be treated as individual labels
        if data_y.ndim == 2 and (data_y.shape[0] == 1 or data_y.shape[1] == 1):
            data_y = data_y.flatten()
        else:
            self.label_datatype = self.label_datatype + '[]'
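        # Illustrative shapes (hypothetical sizes): data_y of shape (50000, 1)
        # is flattened to (50000,) and stored in a scalar column (e.g. TEXT),
        # while data_y of shape (50000, 10), such as one-hot encoded labels, is
        # kept as-is and stored as an array column (e.g. TEXT[]).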

        self._validate_input_and_create_table(data_x, data_y)

        # zip() returns an iterator in Python 3; materialize it as a list so
        # it can be sliced into chunks and len()'d below.
        data = list(zip(data_x, data_y))

        if not self.pool:
            print("Spawning {0} workers...".format(self.num_workers))
            self.pool = Pool(processes=self.num_workers,
                             initializer=init_worker,
                             initargs=(current_process().pid,
                                       self.table_name,
                                       self.append,
                                       False,
                                       self.db_creds,
                                       False))

        datas = []

        for n in range(0, len(data), self.ROWS_PER_FILE):
            datas.append(data[n:n+self.ROWS_PER_FILE])

        #
        # Each element in datas is a list of self.ROWS_PER_FILE rows
        #
        #  Shape of datas:  ( number of files, rows per file, ( x-dim, y-dim ) )
        #
        #  ( inside, x can also be a numpy tensor with several dimensions, but
        #    y is usually a single scalar, or a small array for one-hot
        #    encoded labels )
        #
        #  The multiprocessing library will call _call_np_worker() in some
        #   worker for each file, splitting the list of files up into roughly
        #   equal chunks for each worker to handle.  For example, if there are
        #   500 files and 5 workers, each worker will handle about 100 files,
        #   and _call_np_worker() will be called about 100 times in each
        #   worker, each time with a different file full of images.
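        #
        #  A concrete illustration of the chunking above (sizes hypothetical):
        #   with ROWS_PER_FILE = 1000 and 2500 rows in data, datas will hold
        #   3 chunks of sizes 1000, 1000, and 500.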

        try:
            self.pool.map(_call_np_worker, datas)
        except Exception:
            self.terminate_workers()
            raise

        end_time = time.time()
        print("Done!  Loaded {0} images in {1}s"\
            .format(len(data), end_time - start_time))

        self.terminate_workers()
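
Below is a minimal usage sketch for this method (not part of the module).  The
variable names, table name, and DbCredentials parameters are hypothetical;
check the class definitions elsewhere in madlib_image_loader.py before use.

    import numpy as np
    from madlib_image_loader import ImageLoader, DbCredentials

    # Hypothetical connection settings; adjust for your database.
    db_creds = DbCredentials(db_name='madlib', user='gpadmin',
                             host='localhost', port=5432)

    # Create a loader that spawns 5 worker processes to load in parallel.
    iloader = ImageLoader(num_workers=5, db_creds=db_creds)

    # x_train: images, e.g. shape (num_images, 32, 32, 3)
    # y_train: labels, shape (num_images, 1); flattened internally so each
    #          row of the table gets a scalar label
    x_train = np.random.rand(100, 32, 32, 3)
    y_train = np.random.randint(0, 10, size=(100, 1))

    # First call creates the table; append=True adds rows to the same table.
    iloader.load_dataset_from_np(x_train, y_train, 'train_data', append=False)
    iloader.load_dataset_from_np(x_train, y_train, 'train_data', append=True)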