in example/speech-demo/io_util.py [0:0]
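# This snippet assumes the module-level imports of io_util.py:
#   import sys
#   import numpy as np
#   import mxnet as mx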
def __init__(self, train_sets, batch_size,
             init_states, delay=5, feat_dim=40, label_dim=1955,
             label_mean_sets=None, data_name='data',
             label_name='softmax_label', has_label=True, load_label_mean=True):
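    """Bucketed data iterator for speech utterances.

    Reads (features, labels, utt_id) sequences from `train_sets`, groups
    utterances into buckets by length, pads each bucket to a multiple of
    `batch_size`, and (when labels are present) applies a `delay`-frame
    label shift.
    """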
    self.train_sets = train_sets
    self.label_mean_sets = label_mean_sets
    self.train_sets.initialize_read()

    self.data_name = data_name
    if has_label:
        self.label_name = label_name
    features = []
    labels = []
    utt_lens = []
    utt_ids = []
    buckets = []
    self.has_label = has_label
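    # Optionally estimate a label prior from `label_mean_sets`: normalized
    # label counts, where entries with a raw count <= 1 are overwritten
    # with 1 so later divisions by the prior stay safe.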
    if label_mean_sets is not None:
        self.label_mean_sets.initialize_read()
        (feats, tgts, utt_id) = self.label_mean_sets.load_next_seq()
        self.label_mean = feats / np.sum(feats)
        for i, v in enumerate(feats):
            if v <= 1.0:
                self.label_mean[i] = 1
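    # Scan the data source once: collect features, labels, utterance ids and
    # lengths, and record every distinct utterance length as a bucket key.
    # Labels are stored as tgts + 1 so that 0 is free to mark padding.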
sys.stderr.write("Loading data...\n")
buckets_map = {}
n = 0
while True:
(feats, tgts, utt_id) = self.train_sets.load_next_seq()
if utt_id is None:
break
if tgts is None and self.has_label:
continue
if feats.shape[0] == 0:
continue
features.append(feats)
utt_lens.append(feats.shape[0])
utt_ids.append(utt_id)
if self.has_label:
labels.append(tgts+1)
if feats.shape[0] not in buckets:
buckets_map[feats.shape[0]] = feats.shape[0]
    buckets = sorted(buckets_map)  # one bucket per distinct utterance length
    self.buckets = buckets
    self.data = [[] for k in buckets]
    self.utt_id = [[] for k in buckets]
    self.utt_lens = [[] for k in buckets]
    self.feat_dim = feat_dim
    self.default_bucket_key = max(buckets)
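    # Assign each utterance to the smallest bucket that can hold it.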
    for i, feats in enumerate(features):
        if has_label:
            tgts = labels[i]
        utt_len = utt_lens[i]
        utt_id = utt_ids[i]
        for j, bkt in enumerate(buckets):
            if bkt >= utt_len:
                i_bucket = j
                break
        if self.has_label:
            self.data[i_bucket].append((feats, tgts))
        else:
            self.data[i_bucket].append(feats)
        self.utt_id[i_bucket].append(utt_id)
        self.utt_lens[i_bucket].append(utt_len)
    self.batch_size = batch_size
    # Convert data into ndarrays for better speed during training.
    # Each bucket is padded up to the next multiple of batch_size.
    data = [np.zeros((len(x), buckets[i], self.feat_dim))
            if len(x) % self.batch_size == 0
            else np.zeros(((len(x) // self.batch_size + 1) * self.batch_size,
                           buckets[i], self.feat_dim))
            for i, x in enumerate(self.data)]
    label = [np.zeros((len(x), buckets[i]))
             if len(x) % self.batch_size == 0
             else np.zeros(((len(x) // self.batch_size + 1) * self.batch_size,
                            buckets[i]))
             for i, x in enumerate(self.data)]
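    # For example, with batch_size=32 a bucket holding 70 utterances is
    # padded to (70 // 32 + 1) * 32 = 96 rows; the extra rows stay all-zero
    # and are tagged "GAP_UTT" below.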
    utt_id = [["GAP_UTT"] * len(x) for x in data]
    utt_lens = [[0] * len(x) for x in data]
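    # Copy each utterance into its padded bucket. When labels are present
    # they are shifted right by `delay` frames (the first label is repeated
    # to fill the gap), so the network sees `delay` frames of extra context
    # before it must emit each label. Note the shift assumes delay > 0.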
    for i_bucket in range(len(self.buckets)):
        for j in range(len(self.data[i_bucket])):
            sentence = self.data[i_bucket][j]
            if self.has_label:
                sentence[1][delay:] = sentence[1][:-delay]
                sentence[1][:delay] = sentence[1][0]  # broadcast assignment
                data[i_bucket][j, :len(sentence[0])] = sentence[0]
                label[i_bucket][j, :len(sentence[1])] = sentence[1]
            else:
                data[i_bucket][j, :len(sentence)] = sentence
                # Borrow this slot to pass the sentence length.
                # TODO: use a less hacky way.
                label[i_bucket][j, :len(sentence)] += len(sentence)
            utt_id[i_bucket][j] = self.utt_id[i_bucket][j]
            utt_lens[i_bucket][j] = self.utt_lens[i_bucket][j]
    self.data = data
    self.label = label
    self.utt_id = utt_id
    self.utt_lens = utt_lens
    # Get the size of each bucket (after padding), so that we can sample
    # uniformly from the buckets.
    bucket_sizes = [len(x) for x in self.data]
    sys.stderr.write("Summary of dataset ==================\n")
    for bkt, sz in zip(buckets, bucket_sizes):
        sys.stderr.write("bucket of len %3d : %d samples\n" % (bkt, sz))
    self.bucket_sizes = bucket_sizes
    self.make_data_iter_plan()
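    # `init_states` is a list of (name, shape) pairs for the RNN's initial
    # state inputs; together with the feature placeholder they make up the
    # `provide_data` description that MXNet's module API reads from a DataIter.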
    self.init_states = init_states
    self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]

    self.provide_data = [(data_name, (batch_size, self.default_bucket_key, self.feat_dim))] + init_states
    self.provide_label = None
    if has_label:
        self.provide_label = [(label_name, (self.batch_size, self.default_bucket_key))]
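
# A minimal usage sketch (hypothetical names: the iterator class is assumed
# to be the bucketing iterator defined in this file, called
# BucketSentenceIter here, and `train_sets` any reader exposing
# initialize_read() / load_next_seq()):
#
#   batch_size = 32
#   num_hidden, num_lstm_layer = 512, 3
#   init_c = [('l%d_init_c' % l, (batch_size, num_hidden))
#             for l in range(num_lstm_layer)]
#   init_h = [('l%d_init_h' % l, (batch_size, num_hidden))
#             for l in range(num_lstm_layer)]
#   data_iter = BucketSentenceIter(train_sets, batch_size,
#                                  init_c + init_h, delay=5, feat_dim=40)
#   # data_iter.provide_data / data_iter.provide_label plug directly into
#   # mx.mod.Module.bind(...).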