in src/speech_reps/featurize.py [0:0]
def _file_to_feats(self, file):
    """Compute the model representation of a single ``.wav`` file.

    The pipeline is:

    1. Run Kaldi's ``compute-mfcc-feats`` on the file (via a temporary
       config and SCP file).
    2. Compute per-file CMVN statistics with ``compute-cmvn-stats`` and apply
       them with ``apply-cmvn --norm-vars=true``.
    3. Stack every three consecutive frames into one wider frame.
    4. Feed the stacked frames to ``self._model`` as a one-entry batch and
       return the flattened output as a NumPy array.

    Args:
        file: Path-like object (must expose ``.suffix``) pointing to a
            ``.wav`` file.

    Returns:
        The result of ``self._model(data, data_len).flatten().asnumpy()``.

    Raises:
        AssertionError: If ``file`` does not have a ``.wav`` suffix.
        CalledProcessError: If any of the Kaldi commands exits non-zero.
    """
    assert file.suffix == '.wav'

    # To support CMVN files in the future
    cmvn_spec = None

    def transform(feat):
        """Transform into a sequence with every 3 frames stacked.

        ``feat`` is a 2-D array (frames x feature dim). The result has
        ``ceil(frames / 3)`` rows and three times as many columns, laid out
        as [left context | middle frame | right context]. When the frame
        count is not a multiple of 3 the input is edge-padded first.
        """
        num_frames, feat_dim = feat.shape
        out = np.zeros((int(math.ceil(num_frames / 3)), 3 * feat_dim))
        # Pad so that the frame count becomes a multiple of 3.
        if num_frames % 3 == 1:
            feat = np.pad(feat, ((1, 1), (0, 0)), 'edge')
        elif num_frames % 3 == 2:
            feat = np.pad(feat, ((1, 0), (0, 0)), 'edge')
        padded_frames = feat.shape[0]
        # left context
        out[:, :feat_dim] = feat[0:padded_frames - 2:3, :]
        # middle one
        out[:, feat_dim:2 * feat_dim] = feat[1:padded_frames - 1:3]
        # right context
        out[:, 2 * feat_dim:3 * feat_dim] = feat[2:padded_frames:3, :]
        return out

    def _run_cmd(args):
        """Run an external command given as an argument list."""
        logging.warning("Running {}".format(" ".join(args)))
        try:
            # An argument list without a shell keeps paths containing spaces
            # or shell metacharacters intact.
            check_call(args, universal_newlines=True)
        except CalledProcessError as e:
            # check_call does not capture output, so only the code is known.
            logging.error("Failed with code {}:".format(e.returncode))
            raise

    with TemporaryDirectory() as temp_dir:
        temp_dir = Path(temp_dir)

        # Create config placeholder. write_text() truncates the file on every
        # call, so all options must be written in a single call.
        conf_file = temp_dir / 'mfcc.conf'
        conf_file.write_text(
            '--use-energy=false\n'
            '--sample-frequency=8000\n'
            '--num-mel-bins=40\n'
            '--num-ceps=40\n'
            '--low-freq=40\n'
            '--high-freq=-200\n'
        )

        # Create SCP placeholder
        input_scp = temp_dir / 'input.scp'
        input_scp.write_text('file-0 {}\n'.format(file))

        # Compute speech features
        feat_ark = temp_dir / "feat.ark"
        feat_scp = temp_dir / "feat.scp"
        _run_cmd([
            "compute-mfcc-feats",
            f"--config={conf_file}",
            f"scp:{input_scp}",
            f"ark,scp:{feat_ark},{feat_scp}",
        ])

        cmvn_scp = temp_dir / "cmvn.scp"
        if cmvn_spec is not None:
            # If CMVN specifier is provided, we create a dummy scp
            cmvn_scp.write_text(f"file-0 {cmvn_spec}\n")
        else:
            # Compute CMVN stats
            cmvn_ark = temp_dir / "cmvn.ark"
            _run_cmd([
                "compute-cmvn-stats",
                f"scp:{feat_scp}",
                f"ark,scp:{cmvn_ark},{cmvn_scp}",
            ])

        # Apply CMVN
        final_ark = temp_dir / "final.ark"
        final_scp = temp_dir / "final.scp"
        _run_cmd([
            "apply-cmvn",
            "--norm-vars=true",
            f"scp:{cmvn_scp}",
            f"scp:{feat_scp}",
            f"ark,scp:{final_ark},{final_scp}",
        ])

        # Must be read while the temporary directory still exists.
        with final_scp.open('rb') as fp:
            feats = [features for _, features in kaldi_io.read_mat_scp(fp)][0]

    # Stack every three frames. This has to happen on the 2-D NumPy matrix,
    # before the batch axis is added: transform() assumes a 2-D input.
    feats_new = transform(feats)

    # Turn the audio into a one-entry batch (TC --> TNC)
    import mxnet as mx
    data = mx.nd.expand_dims(mx.nd.array(feats_new, ctx=self._ctx), axis=1)
    data_len = mx.nd.array([data.shape[0]], ctx=self._ctx)

    vecs = self._model(data, data_len).flatten()
    return vecs.asnumpy()