in example/speech-demo/io_util.py [0:0]
def __init__(self, train_sets, buckets, batch_size,
init_states, delay=5, feat_dim=40,
data_name='data', label_name='softmax_label', has_label=True):
self.train_sets = train_sets
self.train_sets.initialize_read()
self.data_name = data_name
self.label_name = label_name
buckets.sort()
i_max_bucket = len(buckets)-1
max_bucket = buckets[i_max_bucket]
if has_label != True:
buckets = [i for i in range(1, max_bucket)]
i_max_bucket = len(buckets)-1
max_bucket = buckets[i_max_bucket]
self.buckets = buckets
self.data = [[] for k in buckets]
self.utt_id = [[] for k in buckets]
self.feat_dim = feat_dim
self.default_bucket_key = max(buckets)
self.has_label = has_label
sys.stderr.write("Loading data...\n")
T_OVERLAP = buckets[0]/2
n = 0
while True:
(feats, tgts, utt_id) = self.train_sets.load_next_seq()
if utt_id is None:
break
if tgts is None and self.has_label:
continue
if feats.shape[0] == 0:
continue
# we split sentence into overlapping segments if it is
# longer than the largest bucket
t_start = 0
t_end = feats.shape[0]
while t_start < t_end:
if t_end - t_start > max_bucket:
t_take = max_bucket
i_bucket = i_max_bucket
else:
for i, bkt in enumerate(buckets):
if bkt >= t_end-t_start:
t_take = t_end-t_start
i_bucket = i
break
n += 1
if self.has_label:
self.data[i_bucket].append((feats[t_start:t_start+t_take],
tgts[t_start:t_start+t_take]+1))
else:
self.data[i_bucket].append(feats[t_start:t_start+t_take])
self.utt_id[i_bucket].append(utt_id)
t_start += t_take
if t_start >= t_end:
# this sentence is consumed
break
t_start -= T_OVERLAP
# Get the size of each bucket, so that we could sample
# uniformly from the bucket
bucket_sizes = [len(x) for x in self.data]
self.batch_size = batch_size
# convert data into ndarrays for better speed during training
data = [np.zeros((len(x), buckets[i], self.feat_dim))
if len(x) % self.batch_size == 0
else np.zeros(((len(x)/self.batch_size + 1) * self.batch_size, buckets[i],
self.feat_dim))
for i, x in enumerate(self.data)]
label = [np.zeros((len(x), buckets[i]))
if len(x) % self.batch_size == 0
else np.zeros(((len(x)/self.batch_size + 1) * self.batch_size, buckets[i]))
for i, x in enumerate(self.data)]
utt_id = [[] for k in buckets]
for i, x in enumerate(data):
utt_id[i] = ["GAP_UTT"] * len(x)
for i_bucket in range(len(self.buckets)):
for j in range(len(self.data[i_bucket])):
sentence = self.data[i_bucket][j]
if self.has_label:
sentence[1][delay:] = sentence[1][:-delay]
sentence[1][:delay] = sentence[1][0] # broadcast assignment
data[i_bucket][j, :len(sentence[0])] = sentence[0]
label[i_bucket][j, :len(sentence[1])] = sentence[1]
else:
data[i_bucket][j, :len(sentence)] = sentence
# borrow this place to pass in sentence length. TODO: use a less hacky way.
label[i_bucket][j, :len(sentence)] += len(sentence)
utt_id[i_bucket][j] = self.utt_id[i_bucket][j]
self.data = data
self.label = label
self.utt_id = utt_id
# Get the size of each bucket, so that we could sample
# uniformly from the bucket
bucket_sizes = [len(x) for x in self.data]
sys.stderr.write("Summary of dataset ==================\n")
for bkt, sz in zip(buckets, bucket_sizes):
sys.stderr.write("bucket of len %3d : %d samples\n" % (bkt, sz))
self.bucket_sizes = bucket_sizes
self.make_data_iter_plan()
self.init_states = init_states
self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]
self.provide_data = [(data_name, (batch_size, self.default_bucket_key, self.feat_dim))] + \
init_states
self.provide_label = [(label_name, (self.batch_size, self.default_bucket_key))]