def _file_to

def _file_to_feats()

in src/speech_reps/featurize.py [0:0]
61 lines of code
6 McCabe index (conditional complexity)

    def _file_to_feats(self, file):
        """Compute model representations for a single ``.wav`` file.

        The file is run through the Kaldi command-line tools
        ``compute-mfcc-feats``, ``compute-cmvn-stats`` and ``apply-cmvn``
        inside a temporary directory.  The normalized features are then
        stacked three frames at a time and fed to ``self._model`` as a
        one-entry batch.

        Args:
            file: Path-like object exposing ``suffix``; it must be ``.wav``.

        Returns:
            The flattened output of ``self._model`` converted with
            ``asnumpy()``.

        Raises:
            AssertionError: If ``file`` does not have a ``.wav`` suffix.
            CalledProcessError: If one of the Kaldi commands exits with a
                non-zero status.
        """

        assert file.suffix == '.wav'
        # To support CMVN files in the future
        cmvn_spec = None

        def transform(feat):
            '''
            Transform into a sequence with every 3 frames stacked.

            Expects a 2-D (frames, dims) array and returns an array of shape
            (ceil(frames / 3), 3 * dims) laid out as [left, middle, right].
            '''
            shape = feat.shape
            out = np.zeros((int(math.ceil(float(shape[0]) / 3)), 3 * shape[1]))
            # Edge-pad so that the number of frames becomes a multiple of 3.
            if feat.shape[0] % 3 == 1:
                feat = np.pad(feat, ((1, 1), (0, 0)), 'edge')
            elif feat.shape[0] % 3 == 2:
                feat = np.pad(feat, ((1, 0), (0, 0)), 'edge')
            # middle one
            out[:, shape[1]: 2 * shape[1]] = feat[1:feat.shape[0] - 1: 3]
            # left context
            out[:, :shape[1]] = feat[0:feat.shape[0] - 2:3, :]
            # right context
            out[:, shape[1] * 2:shape[1] * 3] = feat[2:feat.shape[0]:3, :]
            return out

        def _run_cmd(cmd):
            # Run a shell command, logging it first and logging the exit code
            # on failure before re-raising.
            logging.warning("Running {}".format(cmd))
            try:
                check_call(cmd, shell=True, universal_newlines=True)
            except CalledProcessError as e:
                logging.error("Failed with code {}:".format(e.returncode))
                # check_call does not capture output, so this is normally None.
                logging.error(e.output)
                raise

        with TemporaryDirectory() as temp_dir:

            temp_dir = Path(temp_dir)

            # Create config placeholder.
            # write_text() truncates the file on every call, so all options
            # must be written in a single call to end up in the config.
            conf_file = temp_dir / 'mfcc.conf'
            conf_file.write_text(
                '--use-energy=false\n'
                '--sample-frequency=8000\n'
                '--num-mel-bins=40\n'
                '--num-ceps=40\n'
                '--low-freq=40\n'
                '--high-freq=-200\n'
            )

            # Create SCP placeholder
            input_scp = temp_dir / 'input.scp'
            input_scp.write_text('file-0 {}\n'.format(file))

            # Compute speech features
            feat_ark = temp_dir / "feat.ark"
            feat_scp = temp_dir / "feat.scp"
            cmd = f"compute-mfcc-feats --config={conf_file} scp:{input_scp} ark,scp:{feat_ark},{feat_scp}"
            _run_cmd(cmd)

            cmvn_scp = temp_dir / "cmvn.scp"
            if cmvn_spec is not None:
                # If CMVN specifier is provided, we create a dummy scp
                cmvn_scp.write_text(f"file-0 {cmvn_spec}\n")
            else:
                # Compute CMVN stats
                cmvn_ark = temp_dir / "cmvn.ark"
                cmd = f"compute-cmvn-stats scp:{feat_scp} ark,scp:{cmvn_ark},{cmvn_scp}"
                _run_cmd(cmd)

            # Apply CMVN
            final_ark = temp_dir / "final.ark"
            final_scp = temp_dir / "final.scp"
            cmd = f"apply-cmvn --norm-vars=true scp:{cmvn_scp} scp:{feat_scp} ark,scp:{final_ark},{final_scp}"
            _run_cmd(cmd)

            # Read the features while the temporary directory still exists;
            # only the first (and only expected) entry is used.
            with final_scp.open('rb') as fp:
                feats = [features for _, features in kaldi_io.read_mat_scp(fp)][0]

        # Process data
        # Stack every three frames while the features are still a 2-D
        # (frames, dims) matrix: transform() reads shape[1] as the feature
        # dimension and pads along two axes, so it has to run before the
        # batch axis is inserted.
        # NOTE(review): confirm self._model expects the stacked 3 * dims input.
        feats_new = transform(feats)
        # Turn the audio into a one-entry batch (TC --> TNC)
        import mxnet as mx
        data = mx.nd.expand_dims(mx.nd.array(feats_new, ctx=self._ctx), axis=1)

        data_len = mx.nd.array([data.shape[0]], ctx=self._ctx)

        vecs = self._model(data, data_len).flatten()
        reps = vecs.asnumpy()

        return reps