docker_images/speechbrain/app/pipelines/audio_classification.py
from typing import Dict, List

import numpy as np
import torch

from app.common import ModelType, get_type
from app.pipelines import Pipeline
from speechbrain.inference import EncoderClassifier

class AudioClassificationPipeline(Pipeline):
    def __init__(self, model_id: str):
        # Only SpeechBrain EncoderClassifier checkpoints can serve this task.
        model_type = get_type(model_id)
        if model_type != ModelType.ENCODERCLASSIFIER:
            raise ValueError(f"{model_type.value} is invalid for audio-classification")
        self.model = EncoderClassifier.from_hparams(source=model_id)
        self.top_k = 5
        # `self.sampling_rate` tells the serving layer which rate to resample
        # incoming audio to before it reaches this pipeline.
        self.sampling_rate = 16000

    def __call__(self, inputs: np.ndarray) -> List[Dict[str, float]]:
        """
        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of the received audio, at 16 kHz by default.
        Return:
            A :obj:`list` like [{"label": "text", "score": 0.9939950108528137}] containing:
            - "label": A string representing the label/class. There can be multiple labels.
            - "score": A score between 0 and 1 describing how confident the model is in this label/class.
        """
        # classify_batch expects a (batch, time) tensor and per-item relative
        # lengths in [0, 1]; a single full-length waveform is a batch of one.
        batch = torch.from_numpy(inputs).unsqueeze(0)
        rel_length = torch.tensor([1.0])
        # classify_batch returns (posteriors, scores, indices, text labels);
        # only the raw posteriors are needed here.
        probs, _, _, _ = self.model.classify_batch(batch, rel_length)
        # Normalize the single batch element into a probability distribution.
        probs = torch.softmax(probs[0], dim=0)
        labels = self.model.hparams.label_encoder.decode_ndim(range(len(probs)))
        # Keep the top_k highest-scoring (label, probability) pairs.
        results = []
        for prob, label in sorted(
            zip(probs, labels), key=lambda pair: pair[0].item(), reverse=True
        )[: self.top_k]:
            results.append({"label": label, "score": prob.item()})
        return results
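
# A minimal usage sketch under stated assumptions: the model id is one example
# EncoderClassifier checkpoint from the Hugging Face Hub, "sample.wav" is a
# hypothetical mono recording at self.sampling_rate, and `soundfile` stands in
# for whatever decoder the surrounding app actually uses. Running this also
# assumes app.common.get_type can resolve the model id to ENCODERCLASSIFIER.
if __name__ == "__main__":
    import soundfile as sf

    pipeline = AudioClassificationPipeline("speechbrain/lang-id-commonlanguage_ecapa")
    waveform, _ = sf.read("sample.wav", dtype="float32")
    for prediction in pipeline(waveform):
        print(prediction["label"], prediction["score"])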