in training/flax/distil_whisper/pipeline.py [0:0]
def preprocess_batch(self, inputs, chunk_length_s=30.0, stride_length_s=None, batch_size=None):
    """Normalise an audio input and yield model-ready items.

    Args:
        inputs: One of
            * a 1-D ``np.ndarray`` of audio samples (a warning is logged
              because no sampling-rate check can be performed),
            * a ``str``: an ``http://`` / ``https://`` URL that is downloaded,
              or otherwise a local file path that is read as bytes,
            * ``bytes``: decoded with ``ffmpeg_read`` at the feature
              extractor's sampling rate,
            * a ``dict`` with the mandatory keys ``"array"`` and
              ``"sampling_rate"`` and an optional ``"stride"`` pair
              ``(left, right)``.
        chunk_length_s: Chunk length in seconds. When truthy, the audio is
            handed to ``self.chunk_iter_with_batch``; when falsy, the whole
            audio is passed to ``self.feature_extractor`` in one go.
        stride_length_s: A single number (used for both sides) or a
            ``[left, right]`` pair, in seconds. Defaults to
            ``chunk_length_s / 6`` on each side.
        batch_size: Forwarded unchanged to ``self.chunk_iter_with_batch``.

    Yields:
        With chunking enabled, every item produced by
        ``self.chunk_iter_with_batch``. Otherwise a single feature-extractor
        output, with a ``"stride"`` entry ``(num_samples, left, right)`` added
        when the input dict carried a stride.

    Raises:
        ValueError: If a dict input lacks ``"array"`` or ``"sampling_rate"``,
            the audio is not a 1-D ``np.ndarray``, the input stride exceeds
            the audio length, or the chunk is shorter than the two strides
            combined.
        ImportError: If resampling is required but ``librosa`` is missing.
    """
    target_sampling_rate = self.feature_extractor.sampling_rate

    if isinstance(inputs, np.ndarray):
        # Adjacent literals are concatenated verbatim, so every sentence needs
        # its own trailing space to keep the message readable.
        logger.warning(
            "Numpy array passed as input - no sampling rate checks will be performed. "
            "It is strongly recommended to pass the input as a dictionary with an 'array' key "
            "containing the numpy array representing the audio, and a 'sampling_rate' key "
            "containing the sampling rate associated with the audio array. "
            "Failing to do so can result in silent errors that might be hard to debug."
        )

    if isinstance(inputs, str):
        if inputs.startswith(("http://", "https://")):
            # We need to actually check for a real protocol, otherwise it's
            # impossible to use a local file like http_huggingface_co.png
            # NOTE(review): no timeout and no HTTP status check on this request.
            inputs = requests.get(inputs).content
        else:
            with open(inputs, "rb") as f:
                inputs = f.read()

    if isinstance(inputs, bytes):
        inputs = ffmpeg_read(inputs, target_sampling_rate)

    stride = None
    # Scale factor applied to a caller-provided stride after resampling.
    ratio = 1
    if isinstance(inputs, dict):
        stride = inputs.get("stride", None)
        # Accepting `"array"` which is the key defined in `datasets` for
        # better integration
        if not ("sampling_rate" in inputs and "array" in inputs):
            raise ValueError(
                "When passing a dictionary to FlaxWhisperPipline, the dict needs to contain an 'array' key "
                "containing the numpy array representing the audio, and a 'sampling_rate' key "
                "containing the sampling rate associated with the audio array."
            )

        in_sampling_rate = inputs.get("sampling_rate")
        inputs = inputs.get("array", None)

        if in_sampling_rate != target_sampling_rate:
            # librosa is an optional dependency, only needed for resampling.
            try:
                import librosa
            except ImportError as err:
                raise ImportError(
                    "To support resampling audio files, please install 'librosa' and 'soundfile'."
                ) from err

            inputs = librosa.resample(inputs, orig_sr=in_sampling_rate, target_sr=target_sampling_rate)
            ratio = target_sampling_rate / in_sampling_rate

    if not isinstance(inputs, np.ndarray):
        raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
    if len(inputs.shape) != 1:
        raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")

    if stride is not None:
        if stride[0] + stride[1] > inputs.shape[0]:
            raise ValueError("Stride is too large for input")

        # Stride needs to get the chunk length here, it's going to get
        # swallowed by the `feature_extractor` later, and then batching
        # can add extra data in the inputs, so we need to keep track
        # of the original length in the stride so we can cut properly.
        stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))

    if chunk_length_s:
        # In this branch the input-level `stride` is validated above but not
        # forwarded; chunk strides come from `stride_length_s` only.
        if stride_length_s is None:
            stride_length_s = chunk_length_s / 6

        if isinstance(stride_length_s, (int, float)):
            stride_length_s = [stride_length_s, stride_length_s]

        # Convert seconds to sample counts at the model's sampling rate.
        chunk_len = round(chunk_length_s * target_sampling_rate)
        stride_left = round(stride_length_s[0] * target_sampling_rate)
        stride_right = round(stride_length_s[1] * target_sampling_rate)

        if chunk_len < stride_left + stride_right:
            raise ValueError("Chunk length must be superior to stride length")

        yield from self.chunk_iter_with_batch(
            inputs,
            chunk_len,
            stride_left,
            stride_right,
            batch_size,
        )
    else:
        processed = self.feature_extractor(inputs, sampling_rate=target_sampling_rate, return_tensors="np")
        if stride is not None:
            processed["stride"] = stride
        yield processed