# community-artifacts/Deep-learning/Utilities/madlib_image_loader.py
def load_dataset_from_np(self, data_x, data_y, table_name=None,
append=False, label_datatype='TEXT'):
"""
Loads a numpy array into db. For append=False, creates a new table and
loads the data. For append=True, appends data to existing table.
Throws an exception if append=False and table_name already exists,
or if append=True and table_name does not exist. Makes use of
worker processes initialized during ImageLoader object creation to
load in parallel.
@data_x independent variable data, a numpy array of images. Size of
first dimension is number of images. Rest of dimensions determined
by image resolution and number of channels.
@data_y dependent variable data (image classes), as an numpy array
@table_name Name of table in db to load data into
@append Whether to create a new table (False) or append to an existing
one (True). If unspecified, default is False
@label_datatype: If set will create table with the the column 'y' set
to the datatype specified. Default is set to TEXT
"""
start_time = time.time()
self.mother = True
self.from_disk = False
self.append = append
self.label_datatype = label_datatype
if table_name:
self.table_name = table_name
if not self.table_name:
raise ValueError("Must specify table_name either in ImageLoader"
" constructor or in load_dataset_from_np params!")
    # Flatten labels only for arrays with shape (n, 1) or (1, n), since
    # these shapes can be treated as individual labels
if data_y.ndim == 2 and (data_y.shape[0] == 1 or data_y.shape[1] == 1):
data_y = data_y.flatten()
else:
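        # Each label is itself an array here, so store column 'y' with an
        # array datatype (e.g. TEXT[])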
self.label_datatype = self.label_datatype + '[]'
self._validate_input_and_create_table(data_x, data_y)
    # Materialize the pairs as a list so they can be counted and sliced
    # below (in Python 3, zip() returns an iterator)
    data = list(zip(data_x, data_y))
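    # The pool is created lazily on first use; multiprocessing runs
    # init_worker() once in each spawned process, presumably to set up
    # per-process state such as its own db connection from db_creds.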
if not self.pool:
print("Spawning {0} workers...".format(self.num_workers))
self.pool = Pool(processes=self.num_workers,
initializer=init_worker,
initargs=(current_process().pid,
self.table_name,
self.append,
False,
self.db_creds,
False))
datas = []
for n in range(0, len(data), self.ROWS_PER_FILE):
datas.append(data[n:n+self.ROWS_PER_FILE])
    #
    # Each element in datas is a list of up to self.ROWS_PER_FILE rows
    # (the last chunk may be shorter).
    #
    # Shape of datas: ( number of files, rows per file, ( x-dim, y-dim ) )
    #
    # ( x can itself be a numpy tensor with several dimensions, but y
    #   should just be a single scalar )
    #
    # The multiprocessing library calls _call_np_worker() in some worker
    # for each file, splitting the list of files into roughly equal
    # chunks for each worker to handle. For example, if there are 500
    # files and 5 workers, each worker handles about 100 files, and
    # _call_np_worker() is called 100 times in each worker, each time
    # with a different file full of images.
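    #
    # As a concrete (hypothetical) illustration: with len(data) == 2500
    # and self.ROWS_PER_FILE == 1000, the loop above yields three chunks
    # of 1000, 1000, and 500 rows, so pool.map() below spreads 3 files
    # across the workers.
    #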
try:
self.pool.map(_call_np_worker, datas)
    except Exception:
        # Shut down the workers before re-raising so none are left running
        self.terminate_workers()
        raise
end_time = time.time()
print("Done! Loaded {0} images in {1}s"\
.format(len(data), end_time - start_time))
self.terminate_workers()
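
# A minimal usage sketch (hypothetical values; assumes the ImageLoader
# and DbCredentials classes defined elsewhere in this module, plus a
# database reachable with your local credentials):
#
#     import numpy as np
#     from madlib_image_loader import ImageLoader, DbCredentials
#
#     db_creds = DbCredentials(...)  # fill in your connection info
#     iloader = ImageLoader(num_workers=5, db_creds=db_creds)
#
#     # 100 fake 32x32 RGB images, with integer class labels in (n, 1)
#     # shape that load_dataset_from_np() flattens to individual labels
#     data_x = np.random.randint(0, 256, size=(100, 32, 32, 3))
#     data_y = np.random.randint(0, 10, size=(100, 1))
#
#     iloader.load_dataset_from_np(data_x, data_y,
#                                  table_name='image_data',
#                                  append=False)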