def __init__()

in example/speech-demo/io_util.py [0:0]


    def __init__(self, train_sets, buckets, batch_size,
                 init_states, delay=5, feat_dim=40,
                 data_name='data', label_name='softmax_label', has_label=True):
        """Build a bucketed iterator over (feature, target) utterance sequences.

        Reads every utterance from ``train_sets``, splits utterances longer
        than the largest bucket into overlapping segments, pads each bucket's
        segments into fixed-shape ndarrays (rounded up to a whole number of
        batches), and shifts the labels by ``delay`` frames.

        Parameters
        ----------
        train_sets : reader object
            Must provide ``initialize_read()`` and ``load_next_seq()``
            returning ``(feats, tgts, utt_id)``; ``utt_id is None`` ends
            the stream.
        buckets : list of int
            Candidate segment lengths; sorted in place here.
        batch_size : int
            Mini-batch size; each bucket array is padded to a multiple of it.
        init_states : list of (name, shape)
            RNN initial-state descriptors, appended to ``provide_data``.
        delay : int
            Frames by which targets are delayed relative to features.
        feat_dim : int
            Feature vector dimensionality per frame.
        data_name, label_name : str
            Names exported via ``provide_data`` / ``provide_label``.
        has_label : bool
            When False, buckets are regenerated and the label array carries
            sequence lengths instead of targets.
        """
        self.train_sets = train_sets
        self.train_sets.initialize_read()

        self.data_name = data_name
        self.label_name = label_name

        buckets.sort()
        i_max_bucket = len(buckets) - 1
        max_bucket = buckets[i_max_bucket]

        if not has_label:
            # NOTE(review): range(1, max_bucket) excludes max_bucket itself,
            # so the effective maximum shrinks by one frame — confirm intended.
            buckets = [i for i in range(1, max_bucket)]
            i_max_bucket = len(buckets) - 1
            max_bucket = buckets[i_max_bucket]

        self.buckets = buckets
        self.data = [[] for k in buckets]
        self.utt_id = [[] for k in buckets]
        self.feat_dim = feat_dim
        self.default_bucket_key = max(buckets)
        self.has_label = has_label

        sys.stderr.write("Loading data...\n")
        # Integer division: T_OVERLAP is used as a slice-index offset below,
        # and a float here raises TypeError on indexing under Python 3.
        T_OVERLAP = buckets[0] // 2
        n = 0
        while True:
            (feats, tgts, utt_id) = self.train_sets.load_next_seq()
            if utt_id is None:
                break
            if tgts is None and self.has_label:
                continue
            if feats.shape[0] == 0:
                continue

            # we split sentence into overlapping segments if it is
            # longer than the largest bucket
            t_start = 0
            t_end = feats.shape[0]
            while t_start < t_end:
                if t_end - t_start > max_bucket:
                    t_take = max_bucket
                    i_bucket = i_max_bucket
                else:
                    # pick the smallest bucket that fits the remaining frames
                    for i, bkt in enumerate(buckets):
                        if bkt >= t_end - t_start:
                            t_take = t_end - t_start
                            i_bucket = i
                            break

                n += 1
                if self.has_label:
                    # targets shifted by +1 so 0 can serve as padding
                    self.data[i_bucket].append((feats[t_start:t_start + t_take],
                                                tgts[t_start:t_start + t_take] + 1))
                else:
                    self.data[i_bucket].append(feats[t_start:t_start + t_take])

                self.utt_id[i_bucket].append(utt_id)
                t_start += t_take
                if t_start >= t_end:
                    # this sentence is consumed
                    break
                t_start -= T_OVERLAP

        self.batch_size = batch_size
        # Convert data into ndarrays for better speed during training.
        # Row counts are rounded up to a whole multiple of batch_size;
        # `//` keeps the computed shape dimensions integral under Python 3.
        data = [np.zeros((len(x), buckets[i], self.feat_dim))
                if len(x) % self.batch_size == 0
                else np.zeros(((len(x) // self.batch_size + 1) * self.batch_size,
                               buckets[i], self.feat_dim))
                for i, x in enumerate(self.data)]

        label = [np.zeros((len(x), buckets[i]))
                 if len(x) % self.batch_size == 0
                 else np.zeros(((len(x) // self.batch_size + 1) * self.batch_size,
                                buckets[i]))
                 for i, x in enumerate(self.data)]

        # Padded rows keep the sentinel id "GAP_UTT"; rows holding real
        # segments are overwritten with true utterance ids below.
        utt_id = [[] for k in buckets]
        for i, x in enumerate(data):
            utt_id[i] = ["GAP_UTT"] * len(x)

        for i_bucket in range(len(self.buckets)):
            for j in range(len(self.data[i_bucket])):
                sentence = self.data[i_bucket][j]
                if self.has_label:
                    # Delay targets by `delay` frames, repeating the first
                    # target to fill the gap.
                    # NOTE(review): `[:-delay]` is an empty slice when
                    # delay == 0 — confirm callers never pass delay=0.
                    sentence[1][delay:] = sentence[1][:-delay]
                    sentence[1][:delay] = sentence[1][0]  # broadcast assignment
                    data[i_bucket][j, :len(sentence[0])] = sentence[0]
                    label[i_bucket][j, :len(sentence[1])] = sentence[1]
                else:
                    data[i_bucket][j, :len(sentence)] = sentence
                    # borrow this place to pass in sentence length. TODO: use a less hacky way.
                    label[i_bucket][j, :len(sentence)] += len(sentence)

                utt_id[i_bucket][j] = self.utt_id[i_bucket][j]

        self.data = data
        self.label = label
        self.utt_id = utt_id

        # Get the size of each bucket, so that we could sample
        # uniformly from the bucket
        bucket_sizes = [len(x) for x in self.data]

        sys.stderr.write("Summary of dataset ==================\n")
        for bkt, sz in zip(buckets, bucket_sizes):
            sys.stderr.write("bucket of len %3d : %d samples\n" % (bkt, sz))

        self.bucket_sizes = bucket_sizes
        self.make_data_iter_plan()

        self.init_states = init_states
        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]

        self.provide_data = [(data_name, (batch_size, self.default_bucket_key,
                                          self.feat_dim))] + init_states
        self.provide_label = [(label_name, (self.batch_size, self.default_bucket_key))]