def __init__()

in example/speech-demo/io_util.py [0:0]


    def __init__(self, train_sets, batch_size,
            init_states, delay=5, feat_dim=40, label_dim=1955,
            label_mean_sets=None, data_name='data',
            label_name='softmax_label', has_label=True, load_label_mean=True):
        """Build a bucketed data iterator over speech utterances.

        Reads every utterance from ``train_sets``, creates one bucket per
        distinct utterance length, pads each bucket's row count up to a
        multiple of ``batch_size``, and densifies features/labels into
        zero-padded numpy arrays for training.

        Parameters
        ----------
        train_sets : reader exposing ``initialize_read()`` and
            ``load_next_seq() -> (feats, tgts, utt_id)``; ``utt_id is None``
            signals end of data.
        batch_size : int
            Number of utterances per batch; bucket sizes are padded to a
            multiple of this.
        init_states : list of (name, shape)
            Initial RNN state descriptors, appended to ``provide_data``.
        delay : int
            Number of frames to delay the targets by (0 disables the shift).
        feat_dim : int
            Feature dimensionality per frame.
        label_dim : int
            Unused here; kept for interface compatibility.
        label_mean_sets : optional reader
            If given, its first sequence is used to estimate a label prior.
        data_name, label_name : str
            Names reported in ``provide_data`` / ``provide_label``.
        has_label : bool
            Whether target labels are available.
        load_label_mean : bool
            Unused here; kept for interface compatibility.
        """
        self.train_sets = train_sets
        self.label_mean_sets = label_mean_sets
        self.train_sets.initialize_read()

        self.data_name = data_name
        if has_label:
            self.label_name = label_name
        self.has_label = has_label

        if label_mean_sets is not None:
            self.label_mean_sets.initialize_read()
            (feats, tgts, utt_id) = self.label_mean_sets.load_next_seq()

            # Normalized label prior; raw counts <= 1.0 are clamped to 1 to
            # avoid near-zero entries in the prior.
            self.label_mean = feats / np.sum(feats)
            for i, v in enumerate(feats):
                if v <= 1.0:
                    self.label_mean[i] = 1

        sys.stderr.write("Loading data...\n")
        features = []
        labels = []
        utt_lens = []
        utt_ids = []
        bucket_lens = set()
        while True:
            (feats, tgts, utt_id) = self.train_sets.load_next_seq()
            if utt_id is None:
                break
            if tgts is None and self.has_label:
                continue
            if feats.shape[0] == 0:
                continue
            features.append(feats)
            utt_lens.append(feats.shape[0])
            utt_ids.append(utt_id)
            if self.has_label:
                # Shift label ids by one; index 0 is reserved (padding).
                labels.append(tgts + 1)
            # BUG FIX: the original tested membership against the still-empty
            # ``buckets`` list (always true) and used Py2-only ``iteritems``
            # to copy the dict keys out; a set de-duplicates directly and
            # works on both Python 2 and 3.
            bucket_lens.add(feats.shape[0])

        buckets = sorted(bucket_lens)
        self.buckets = buckets
        self.data = [[] for _ in buckets]
        self.utt_id = [[] for _ in buckets]
        self.utt_lens = [[] for _ in buckets]
        self.feat_dim = feat_dim
        self.default_bucket_key = max(buckets)

        for i, feats in enumerate(features):
            if has_label:
                tgts = labels[i]
            utt_len = utt_lens[i]
            utt_id = utt_ids[i]

            # Smallest bucket that fits this utterance (buckets is sorted).
            for i_bucket, bkt in enumerate(buckets):
                if bkt >= utt_len:
                    break

            if self.has_label:
                self.data[i_bucket].append((feats, tgts))
            else:
                self.data[i_bucket].append(feats)
            self.utt_id[i_bucket].append(utt_id)
            self.utt_lens[i_bucket].append(utt_len)

        self.batch_size = batch_size

        def _padded(n):
            # Round n up to the next multiple of batch_size.
            # ``//`` (not ``/``) keeps this an int on Python 3 as well.
            if n % batch_size == 0:
                return n
            return (n // batch_size + 1) * batch_size

        # Convert data into dense ndarrays for better speed during training.
        data = [np.zeros((_padded(len(x)), buckets[i], self.feat_dim))
                for i, x in enumerate(self.data)]
        label = [np.zeros((_padded(len(x)), buckets[i]))
                 for i, x in enumerate(self.data)]

        # Padded rows keep the sentinel id "GAP_UTT" and length 0.
        utt_id = [["GAP_UTT"] * len(x) for x in data]
        utt_lens = [[0] * len(x) for x in data]

        for i_bucket in range(len(self.buckets)):
            for j in range(len(self.data[i_bucket])):
                sentence = self.data[i_bucket][j]
                if self.has_label:
                    if delay > 0:
                        # Delay the targets by `delay` frames, repeating the
                        # first target over the leading gap (broadcast
                        # assignment).  Guard fixes a crash for delay == 0,
                        # where ``[:-0]`` would be an empty slice.
                        sentence[1][delay:] = sentence[1][:-delay]
                        sentence[1][:delay] = sentence[1][0]
                    data[i_bucket][j, :len(sentence[0])] = sentence[0]
                    label[i_bucket][j, :len(sentence[1])] = sentence[1]
                else:
                    data[i_bucket][j, :len(sentence)] = sentence
                    # Borrow this place to pass in sentence length.
                    # TODO: use a less hacky way.
                    label[i_bucket][j, :len(sentence)] += len(sentence)

                utt_id[i_bucket][j] = self.utt_id[i_bucket][j]
                utt_lens[i_bucket][j] = self.utt_lens[i_bucket][j]

        self.data = data
        self.label = label
        self.utt_id = utt_id
        self.utt_lens = utt_lens

        # Size of each bucket (after padding), for uniform sampling.
        bucket_sizes = [len(x) for x in self.data]

        sys.stderr.write("Summary of dataset ==================\n")
        for bkt, sz in zip(buckets, bucket_sizes):
            sys.stderr.write("bucket of len %3d : %d samples\n" % (bkt, sz))

        self.bucket_sizes = bucket_sizes
        self.make_data_iter_plan()

        self.init_states = init_states
        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]

        self.provide_data = [(data_name, (batch_size, self.default_bucket_key, self.feat_dim))] + init_states
        self.provide_label = None
        if has_label:
            self.provide_label = [(label_name, (self.batch_size, self.default_bucket_key))]