tensorrtllm/run_eval.py (303 lines of code) (raw):

import argparse
import json
import os
import re
import time
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import evaluate
import numpy as np
import torch
from tensorrt_llm.bindings import GptJsonConfig
from tensorrt_llm.runtime import ModelRunnerCpp
from tqdm import tqdm

from normalizer import data_utils
from whisper_utils import log_mel_spectrogram, get_tokenizer

wer_metric = evaluate.load("wer")


def read_config(component, engine_dir):
    """Merge the pretrained and build configs of an engine component ('encoder' or 'decoder')."""
    engine_dir = Path(engine_dir)
    config_path = engine_dir / component / 'config.json'
    with open(config_path, 'r') as f:
        config = json.load(f)
    model_config = OrderedDict()
    model_config.update(config['pretrained_config'])
    model_config.update(config['build_config'])
    return model_config


class WhisperTRTLLM(object):

    def __init__(self, engine_dir, assets_dir="assets", batch_size=64):
        encoder_config = read_config('encoder', engine_dir)
        decoder_config = read_config('decoder', engine_dir)
        self.n_mels = encoder_config['n_mels']
        self.num_languages = encoder_config['num_languages']

        # Multilingual Whisper checkpoints have a vocabulary of at least 51865 tokens.
        is_multilingual = (decoder_config['vocab_size'] >= 51865)
        if is_multilingual:
            tokenizer_name = "multilingual"
            assert (Path(assets_dir) / "multilingual.tiktoken").exists(), \
                "multilingual.tiktoken file does not exist in assets_dir"
        else:
            tokenizer_name = "gpt2"
            assert (Path(assets_dir) / "gpt2.tiktoken").exists(), \
                "gpt2.tiktoken file does not exist in assets_dir"
        self.text_prefix = ("<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
                            if is_multilingual else "<|startoftranscript|><|notimestamps|>")
        self.tokenizer = get_tokenizer(name=tokenizer_name,
                                       num_languages=self.num_languages,
                                       tokenizer_dir=assets_dir)
        self.eot_id = self.tokenizer.encode(
            "<|endoftext|>",
            allowed_special=self.tokenizer.special_tokens_set)[0]

        json_config = GptJsonConfig.parse_file(Path(engine_dir) / 'decoder' / 'config.json')
        assert json_config.model_config.supports_inflight_batching
        runner_kwargs = dict(engine_dir=engine_dir,
                             is_enc_dec=True,
                             max_batch_size=batch_size,
                             max_input_len=3000,
                             max_output_len=96,
                             max_beam_width=1,
                             debug_mode=False,
                             kv_cache_free_gpu_memory_fraction=0.9)
        self.model_runner_cpp = ModelRunnerCpp.from_dir(**runner_kwargs)

    def process_single_batch(self, mel_batch, decoder_input_ids, mel_input_lengths, max_new_tokens):
        outputs = self.model_runner_cpp.generate(
            batch_input_ids=decoder_input_ids,
            encoder_input_features=mel_batch,
            encoder_output_lengths=mel_input_lengths // 2,
            max_new_tokens=max_new_tokens,
            end_id=self.eot_id,
            pad_id=self.eot_id,
            num_beams=1,
            output_sequence_lengths=True,
            return_dict=True)
        output_ids = outputs['output_ids'].cpu().numpy().tolist()
        texts = []
        for i in range(len(output_ids)):
            text = self.tokenizer.decode(output_ids[i][0]).strip()
            # Strip special tokens such as <|endoftext|> from the decoded text.
            text = re.sub(r'<\|.*?\|>', '', text)
            texts.append(text)
        return texts

    def process_batch(self, mel, mel_input_lengths, num_threads=4, max_new_tokens=96):
        prompt_id = self.tokenizer.encode(
            self.text_prefix, allowed_special=self.tokenizer.special_tokens_set)
        prompt_id = torch.tensor(prompt_id)
        batch_size = len(mel)
        decoder_input_ids = prompt_id.repeat(batch_size, 1)

        with torch.no_grad():
            if isinstance(mel, list):
                mel = torch.stack([m.transpose(1, 2).type(torch.float16).squeeze(0) for m in mel])
            else:
                mel = mel.transpose(1, 2)

            # Split the batch across worker threads; each thread issues its own generate() call.
            num_threads = min(num_threads, batch_size)
            mel_batches = torch.split(mel, batch_size // num_threads)
            mel_input_lengths_batches = torch.split(mel_input_lengths, batch_size // num_threads)

            texts_list = []
            with ThreadPoolExecutor(max_workers=num_threads) as executor:
                futures = []
                for i, mel_batch in enumerate(mel_batches):
                    current_length = mel_batch.size(0)
                    futures.append(
                        executor.submit(self.process_single_batch, mel_batch,
                                        decoder_input_ids[:current_length],
                                        mel_input_lengths_batches[i], max_new_tokens))
                for future in futures:
                    texts_list.extend(future.result())
        return texts_list


def longest_common_substring(s1, s2):
    """Return the longest common substring of s1 and s2 (dynamic programming)."""
    len1, len2 = len(s1), len(s2)
    dp = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    longest_length = 0
    end_index_s1 = 0
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            if s1[i - 1] == s2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > longest_length:
                    longest_length = dp[i][j]
                    end_index_s1 = i
            else:
                dp[i][j] = 0
    return s1[end_index_s1 - longest_length:end_index_s1]


def chunk_audio(audio, chunk_length, overlap_length, sample_rate):
    """Split audio into chunks of chunk_length seconds with overlap_length seconds of overlap."""
    chunk_size = int(chunk_length * sample_rate)
    overlap_size = int(overlap_length * sample_rate)
    chunks = []
    start = 0
    while start < len(audio):
        end = min(start + chunk_size, len(audio))
        chunks.append(audio[start:end])
        start += chunk_size - overlap_size
    return chunks


def main(args):
    asr_model = WhisperTRTLLM(engine_dir=args.model_id)

    def benchmark(batch, min_new_tokens=None):
        # Load audio inputs
        max_duration, sample_rate = 30, 16000
        audios_origin = [audio["array"].astype(np.float32) for audio in batch["audio"]]
        minibatch_size = len(audios_origin)

        # Split audios longer than 30 s into overlapping 25 s chunks; audio_index maps each
        # chunk back to the sample it came from.
        audios, audio_index = [], []
        chunk_length = 25
        overlap_length = 5
        for i, audio in enumerate(audios_origin):
            if len(audio) > max_duration * sample_rate:
                audio_chunks = chunk_audio(audio, chunk_length, overlap_length, sample_rate)
                for chunk in audio_chunks:
                    audios.append(chunk)
                    audio_index.append(i)
            else:
                audios.append(audio)
                audio_index.append(i)
        audios = [torch.from_numpy(audio) for audio in audios]

        # START TIMING
        start_time = time.time()
        longest_duration = int(sample_rate * max_duration)
        features = [
            log_mel_spectrogram(wave,
                                asr_model.n_mels,
                                padding=longest_duration - wave.shape[-1],
                                device='cuda').unsqueeze(0) for wave in audios
        ]
        features_input_lengths = torch.tensor([f.shape[2] for f in features],
                                              dtype=torch.int32,
                                              device='cuda')

        texts_origin = asr_model.process_batch(features, features_input_lengths, num_threads=4)

        # Merge chunk transcriptions belonging to the same sample, joining consecutive chunks
        # on their longest common substring so the overlap region is not duplicated.
        texts = []
        for i in range(minibatch_size):
            text_chunks = []
            for j in range(len(texts_origin)):
                if audio_index[j] == i:
                    text_chunks.append(texts_origin[j])
            if len(text_chunks) > 1:
                merged_text = text_chunks[0]
                for t in text_chunks[1:]:
                    lcs = longest_common_substring(merged_text, t)
                    merged_text += t[len(lcs):]
                texts.append(merged_text)
            else:
                texts.append(text_chunks[0])

        # END TIMING
        runtime = time.time() - start_time
        print(f"Batch size: {minibatch_size}, Time taken: {runtime:.2f} s, "
              f"texts_origin_len: {len(texts_origin)}, texts_len: {len(texts)}")

        # normalize by minibatch size since we want the per-sample time
        batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size]

        # normalize transcriptions with English normalizer
        batch["predictions"] = [data_utils.normalizer(pred) for pred in texts]
        batch["references"] = batch["norm_text"]

        return batch

    if args.warmup_steps is not None:
        dataset = data_utils.load_data(args)
        dataset = data_utils.prepare_data(dataset)

        num_warmup_samples = args.warmup_steps * args.batch_size
        if args.streaming:
            warmup_dataset = dataset.take(num_warmup_samples)
        else:
            warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset))))
        warmup_dataset = iter(
            warmup_dataset.map(benchmark,
                               batch_size=args.batch_size,
                               batched=True,
                               fn_kwargs={"min_new_tokens": args.max_new_tokens}))

        for _ in tqdm(warmup_dataset, desc="Warming up..."):
            continue

    dataset = data_utils.load_data(args)
    if args.max_eval_samples is not None and args.max_eval_samples > 0:
        print(f"Subsampling dataset to first {args.max_eval_samples} samples!")
        if args.streaming:
            dataset = dataset.take(args.max_eval_samples)
        else:
            dataset = dataset.select(range(min(args.max_eval_samples, len(dataset))))
    dataset = data_utils.prepare_data(dataset)

    dataset = dataset.map(
        benchmark,
        batch_size=args.batch_size,
        batched=True,
        remove_columns=["audio"],
    )

    all_results = {
        "audio_length_s": [],
        "transcription_time_s": [],
        "predictions": [],
        "references": [],
    }
    result_iter = iter(dataset)
    for result in tqdm(result_iter, desc="Samples..."):
        for key in all_results:
            all_results[key].append(result[key])

    # Write manifest results (WER and RTFX)
    manifest_path = data_utils.write_manifest(
        all_results["references"],
        all_results["predictions"],
        args.model_id,
        args.dataset_path,
        args.dataset,
        args.split,
        audio_length=all_results["audio_length_s"],
        transcription_time=all_results["transcription_time_s"],
    )
    print("Results saved at path:", os.path.abspath(manifest_path))

    wer = wer_metric.compute(
        references=all_results["references"],
        predictions=all_results["predictions"],
    )
    wer = round(100 * wer, 2)
    rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2)
    print("WER:", wer, "%", "RTFx:", rtfx)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id",
        type=str,
        required=True,
        help="Model identifier. Should be loadable with 🤗 Transformers",
    )
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="esb/datasets",
        help="Dataset path. By default, it is `esb/datasets`",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset name. *E.g.* `'librispeech_asr'` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names "
        "can be found at `https://huggingface.co/datasets/esb/datasets`",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        help="Split of the dataset. *E.g.* `'validation'` for the dev split, or `'test'` for the test split.",
    )
    parser.add_argument(
        "--device",
        type=int,
        default=-1,
        help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=16,
        help="Number of samples to go through each streamed batch.",
    )
    parser.add_argument(
        "--max_eval_samples",
        type=int,
        default=None,
        help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.",
    )
    parser.add_argument(
        "--no-streaming",
        dest="streaming",
        action="store_false",
        help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=None,
        help="Maximum number of tokens to generate (for auto-regressive models).",
    )
    parser.add_argument(
        "--warmup_steps",
        type=int,
        default=10,
        help="Number of warm-up steps to run before launching the timed runs.",
    )

    args = parser.parse_args()
    parser.set_defaults(streaming=False)

    main(args)