in src/models/seamless_m4t/feature_extraction_seamless_m4t.js [73:174]
async _call(audio, {
padding = true,
pad_to_multiple_of = 2,
do_normalize_per_mel_bins = true,
return_attention_mask = true,
} = {}) {
validate_audio_inputs(audio, 'SeamlessM4TFeatureExtractor');
let features = await this._extract_fbank_features(audio, this.config.max_length);
if (do_normalize_per_mel_bins) {
const [num_features, feature_size] = features.dims;
const data = features.data;
for (let i = 0; i < feature_size; ++i) {
let sum = 0;
for (let j = 0; j < num_features; ++j) {
sum += data[j * feature_size + i];
}
const mean = sum / num_features;
let variance = 0;
for (let j = 0; j < num_features; ++j) {
variance += (data[j * feature_size + i] - mean) ** 2;
}
variance /= num_features - 1; // NOTE: We use ddof=1
const std = Math.sqrt(variance + 1e-7);
for (let j = 0; j < num_features; ++j) {
const index = j * feature_size + i;
data[index] = (data[index] - mean) / std;
}
}
}
let padded_attention_mask;
if (padding) {
const [num_frames, num_channels] = features.dims;
const data = /** @type {Float32Array} */(features.data);
const pad_size = num_frames % pad_to_multiple_of;
if (pad_size > 0) {
const padded_data = new Float32Array(num_channels * (num_frames + pad_size));
padded_data.set(data)
padded_data.fill(this.config.padding_value, data.length)
const numPaddedFrames = num_frames + pad_size;
features = new Tensor(
features.type,
padded_data,
[numPaddedFrames, num_channels],
)
if (return_attention_mask) {
padded_attention_mask = new Tensor(
'int64',
new BigInt64Array(numPaddedFrames),
[1, numPaddedFrames],
);
/** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
}
}
}
const [num_frames, num_channels] = features.dims;
const stride = this.config.stride;
const remainder = num_frames % stride;
if (remainder !== 0) {
throw new Error(`The number of frames (${num_frames}) must be a multiple of the stride (${stride}).`)
}
const input_features = features.view(
1,
Math.floor(num_frames / stride),
num_channels * stride,
);
const result = { input_features }
if (return_attention_mask) {
const reshapedNumFrames = input_features.dims[1];
const attention_mask_data = new BigInt64Array(reshapedNumFrames);
if (padded_attention_mask) {
const padded_attention_mask_data = padded_attention_mask.data;
for (let i = 1, j = 0; i < num_frames; i += stride, ++j) {
attention_mask_data[j] = padded_attention_mask_data[i];
}
} else {
attention_mask_data.fill(1n);
}
result.attention_mask = new Tensor(
'int64',
attention_mask_data,
[1, reshapedNumFrames],
);
}
return result;
}