in example/speech-demo/io_func/feat_io.py [0:0]
def __init__(self, dataset_args, n_ins):
    """Initialize a feature reader from a dataset-argument dictionary.

    Parameters
    ----------
    dataset_args : dict
        Configuration. Required key: ``lst_file`` (path to the list file,
        optionally gzip-compressed). Optional keys: ``train_stat``,
        ``file_format`` (default ``"htk"``), ``separate_lines``,
        ``has_labels``, ``offset_labels``, ``gpu_chunk``, ``max_feats``,
        ``shuffle``, ``seed``, ``_split_id``/``_num_splits`` (must be
        given together).
    n_ins : int
        Feature dimensionality; width of the preallocated chunk buffer.
    """
    # stats: optional global mean / inverse-std normalization loaded from file
    self.mean = None
    self.std = None
    if 'train_stat' in dataset_args:
        train_stat = dataset_args['train_stat']
        featureStats = stats.FeatureStats()
        featureStats.Load(train_stat)
        self.mean = featureStats.GetMean()
        # NOTE(review): despite the attribute name, this holds the
        # *inverse* std deviation (GetInvStd) — confirm downstream usage.
        self.std = featureStats.GetInvStd()

    # open lstfile (transparently handles gzip-compressed lists)
    file_path = dataset_args["lst_file"]
    if file_path.endswith('.gz'):
        file_read = gzip.open(file_path, 'r')
    else:
        file_read = open(file_path, 'r')

    separate_lines = False
    if "separate_lines" in dataset_args:
        separate_lines = to_bool(dataset_args["separate_lines"])

    self.has_labels = True
    if "has_labels" in dataset_args:
        self.has_labels = to_bool(dataset_args["has_labels"])

    # parse it, orig_file_lst is a list of (featureFile, labelFile) pairs
    # in the input set
    try:
        lines = [ln.strip() for ln in file_read]
    finally:
        # BUG FIX: the list-file handle was previously never closed.
        file_read.close()
    lines = [ln for ln in lines if ln != ""]

    self.orig_file_lst = []
    if self.has_labels:
        if separate_lines:
            # feature file and label file alternate on consecutive lines
            if len(lines) % 2 != 0:
                print("List has mis-matched number of feature files and label files")
                sys.exit(1)
            for i in range(0, len(lines), 2):
                self.orig_file_lst.append((lines[i], lines[i + 1]))
        else:
            # one "featureFile labelFile" pair per line; lines are already
            # stripped, so str.split() on whitespace matches the original
            # re.split(r"\s+") behavior without the regex machinery
            for line in lines:
                pair = line.split()
                if len(pair) != 2:
                    print(line)
                    print("Each line in the train and eval lists must contain feature file and label file separated by space character")
                    sys.exit(1)
                self.orig_file_lst.append(pair)
    else:
        # no labels
        for line in lines:
            self.orig_file_lst.append((line, None))

    # save arguments
    self.n_ins = n_ins
    # BUG FIX: previously dataset_args['file_format'] was read
    # unconditionally (KeyError when absent) and then immediately
    # overwritten by the default; a single .get() expresses the intent.
    self.file_format = dataset_args.get('file_format', "htk")

    self.offsetLabels = False
    if 'offset_labels' in dataset_args:
        self.offsetLabels = to_bool(dataset_args['offset_labels'])

    self.chunk_size = 32768
    if 'gpu_chunk' in dataset_args:
        self.chunk_size = int(dataset_args['gpu_chunk'])

    self.maxFeats = 0
    if "max_feats" in dataset_args:
        self.maxFeats = int(dataset_args["max_feats"])
    if self.maxFeats == 0:
        # PORTABILITY FIX: sys.maxint does not exist on Python 3;
        # sys.maxsize is available on both Python 2.6+ and Python 3.
        self.maxFeats = sys.maxsize

    self.shuffle = True
    if 'shuffle' in dataset_args:
        self.shuffle = to_bool(dataset_args['shuffle'])

    self.seed = None
    if "seed" in dataset_args:
        self.seed = int(dataset_args["seed"])

    # _split_id and _num_splits must be supplied together or not at all
    if ("_split_id" in dataset_args) != ("_num_splits" in dataset_args):
        raise Exception("_split_id must be used with _num_splits")
    self.num_splits = 0
    if "_num_splits" in dataset_args:
        # BUG FIX: was 'dataset_Args' (typo) — raised NameError whenever
        # _num_splits was actually used.
        self.num_splits = int(dataset_args["_num_splits"])
        self.split_id = dataset_args["_split_id"]

    # internal state
    self.split_parts = False
    self.by_matrix = False
    # preallocated chunk buffers for features (and labels, if present)
    self.x = numpy.zeros((self.chunk_size, self.n_ins), dtype=numpy.float32)
    if self.has_labels:
        self.y = numpy.zeros((self.chunk_size,), dtype=numpy.int32)
    else:
        self.y = None
    self.numpy_rng = numpy.random.RandomState(self.seed)
    #self.make_shared()
    self.initialize_read()