# TensorRT

In this notebook, we will use TensorRT to optimize a PyTorch model for inference. We will train a simple CNN model on the MNIST dataset, convert it to a TensorRT engine via ONNX, and then perform inference using the optimized TensorRT engine and evaluate the size and accuracy of the model. This notebook requires an NVIDIA GPU with CUDA support or an NVIDIA Jetson device.

## Setup TensorRT

First, install tensorrt and torch using pip and import the necessary modules.

In [None]:
%pip install torch torchvision
%pip install tensorrt==8.6.1
%pip install pycuda onnx onnxruntime
%pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com pytorch-quantization==2.1.2

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import torch.quantization
import pathlib
import numpy as np
import torch.onnx
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import onnx
import onnxruntime

from pytorch_quantization import nn as quant_nn
from pytorch_quantization import quant_modules
from pytorch_quantization import calib
from tqdm import tqdm

## Train PyTorch Model and Export to ONNX

Next, train a simple CNN model on the MNIST dataset and export it to ONNX format

In [None]:
# Preprocessing shared by both splits: convert images to tensors, then
# normalize with the mean/std values commonly quoted for MNIST.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

# The training split triggers the download; the test split reads the same
# './data' directory.
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

class Net(nn.Module):
    """Small MNIST classifier: 3x3 convolution, 2x2 max-pooling, linear head."""

    def __init__(self):
        super().__init__()
        # 1x28x28 input -> 12x26x26 after the unpadded 3x3 convolution.
        self.conv1 = nn.Conv2d(1, 12, kernel_size=3)
        # 2x2 pooling with stride 2 halves each spatial side: 26 -> 13.
        self.pool = nn.MaxPool2d(2, stride=2)
        self.fc = nn.Linear(12 * 13 * 13, 10)

    def forward(self, x):
        """Return log-probabilities over the 10 classes, one row per sample.

        The input is reshaped to (-1, 1, 28, 28), so any tensor whose element
        count is a multiple of 784 is accepted.
        """
        images = x.view(-1, 1, 28, 28)
        features = self.pool(F.relu(self.conv1(images)))
        # Keep the batch dimension, flatten everything else for the linear layer.
        flattened = features.view(features.size(0), -1)
        return F.log_softmax(self.fc(flattened), dim=1)


# Batches of 32 for both splits. The training data is reshuffled every epoch so
# that the optimizer does not see the samples in the same fixed order each time;
# the test data keeps its order because evaluation does not depend on it.
train_loader = torch.utils.data.DataLoader(train_dataset, 32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, 32)

# Training runs on the CPU; only the TensorRT inference below uses the GPU.
device = "cpu"

epochs = 1

model = Net().to(device)
optimizer = optim.Adam(model.parameters())

model.train()

for epoch in range(1, epochs+1):
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        # The network outputs log-probabilities, so NLL loss is the matching criterion.
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader), loss.item()))

# Switch to inference mode before persisting/exporting the trained weights.
model.eval()

MODEL_DIR = pathlib.Path("./models")
MODEL_DIR.mkdir(exist_ok=True)
torch.save(model.state_dict(), MODEL_DIR / "original_model.p")

# A real batch is used as the example input that the exporter traces the model with.
x, _ = next(iter(train_loader))
torch.onnx.export(model,
                  x,
                  # Plain string path, consistent with how the path is handed to the TensorRT parser.
                  str(MODEL_DIR / "mnist_model.onnx"),
                  export_params=True,        # store the trained weights inside the ONNX file
                  opset_version=10,
                  do_constant_folding=True,
                  input_names = ['input'],
                  output_names = ['output'],
                  # Leave the batch dimension symbolic so the exported graph accepts any batch size.
                  dynamic_axes={'input' : {0 : 'batch_size'},
                                'output' : {0 : 'batch_size'}})

## Convert ONNX Model to TensorRT

Convert the ONNX model to a TensorRT engine using the TensorRT Python API. First, initialize the TensorRT components: the logger, the builder, and the network. Next, define an ONNX parser to parse the model from the ONNX file into the TensorRT network. Then, create a builder configuration to set the build parameters, including a memory pool limit for the workspace. After that, create an optimization profile to handle the dynamic input shape, fixing it to a batch size of 32, a channel size of 1, and image dimensions of 28x28. Next, build and serialize the TensorRT engine using the configured network and builder, and save it to disk. Finally, clean up by deleting the builder and network objects to free up resources.

In [None]:
onnx_path = MODEL_DIR / "mnist_model.onnx"
trt_path = MODEL_DIR / 'mnist_engine_pytorch.trt'

# Initialize the TensorRT logger, builder and an explicit-batch network definition
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

# Parse the ONNX model into the network. parse_from_file reports failure through
# its return value, so surface the parser's own error messages instead of
# continuing with an incomplete network.
parser = trt.OnnxParser(network, logger)
if not parser.parse_from_file(str(onnx_path)):
    parser_errors = "\n".join(
        parser.get_error(index).desc() for index in range(parser.num_errors)
    )
    raise RuntimeError(f"Failed to parse ONNX model {onnx_path}:\n{parser_errors}")

# Set up the builder config and optimization profile
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GiB workspace

# min, opt and max shapes are identical: the engine is built for batches of exactly 32
profile = builder.create_optimization_profile()
profile.set_shape("input", (32, 1, 28, 28), (32, 1, 28, 28), (32, 1, 28, 28))
config.add_optimization_profile(profile)

# Build and serialize the engine, then save it to disk. A failed build yields
# None, which would otherwise only show up as an obscure error inside f.write.
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    raise RuntimeError("TensorRT failed to build the engine; see the logger output for details")
with open(str(trt_path), 'wb') as f:
    f.write(serialized_engine)

# Free up resources
del builder
del network

## Run Inference and Check Accuracy

Finally, run inference and then compare the TensorRT engine model accuracy with the ONNX model on the test dataset.

To test the ONNX model, load the model and check its integrity, then loop over the given data loader. For each batch, convert the input data to a NumPy array and feed it into the ONNX Runtime session. Once the output is obtained, convert it back to a PyTorch tensor. Then, calculate the accumulated negative log likelihood loss
and the number of correct predictions to measure the accuracy of the model.

To test the TensorRT model, first load the serialized engine from disk and initialize the TensorRT runtime. Then, deserialize the engine and create an execution context. Next, allocate memory for the input and output data on the GPU, set the bindings for TensorRT execution, and create a CUDA stream to manage asynchronous data transfers between the CPU and GPU. Then, loop over the given data loader and, for each batch, convert the input data to a NumPy array and transfer it to the GPU before executing the model asynchronously, and then transfer the predictions back to the CPU. Synchronize the stream to ensure that the transfers and the execution have finished before the results are read. Next, reshape the output and convert it to a PyTorch tensor to calculate the accumulated negative log likelihood loss and the number of correct predictions to measure the accuracy of the model. Finally, free up the memory and CUDA resources.

In [None]:
def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array on the host.

    Tensors that track gradients are detached first, because ``.numpy()``
    cannot be called on a tensor that requires grad.
    """
    source = tensor.detach() if tensor.requires_grad else tensor
    return source.cpu().numpy()

def test_onnx(model_name, data_loader):
    """Evaluate an ONNX classifier with ONNX Runtime and return its accuracy.

    Args:
        model_name: Path (``str`` or path-like) of the ONNX model file.
        data_loader: Iterable yielding ``(data, target)`` batches of torch
            tensors; ``target`` holds the class indices.

    Returns:
        Accuracy in percent over every sample yielded by ``data_loader``
        (``0.0`` when the loader yields nothing).

    Every batch is scored, including a final batch smaller than the others:
    the session is fed whatever batch size the loader produces. The mean
    negative log-likelihood is accumulated as well, but only the accuracy is
    part of the return value.
    """
    # Plain string path: accepted by both onnx and every onnxruntime release.
    model_path = str(model_name)
    onnx_model = onnx.load(model_path)
    onnx.checker.check_model(onnx_model)
    ort_session = onnxruntime.InferenceSession(model_path)
    # The input name does not change between batches, so look it up once.
    input_name = ort_session.get_inputs()[0].name

    test_loss = 0
    correct = 0
    total = 0
    for data, target in data_loader:
        output = ort_session.run(None, {input_name: to_numpy(data)})[0]
        output = torch.from_numpy(output)
        test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()
        total += target.shape[0]

    if total == 0:
        return 0.0
    # Normalize by the number of samples actually scored so that the reported
    # figures always match what was evaluated.
    test_loss /= total
    return 100. * correct / total

def test_tensorrt(model_name, data_loader):
    """Evaluate a serialized TensorRT engine and return its accuracy.

    Args:
        model_name: Path of the serialized TensorRT engine file.
        data_loader: Iterable yielding ``(data, target)`` batches of torch
            tensors; ``target`` holds the class indices. A batch may be
            smaller than the engine's batch size but not larger.

    Returns:
        Accuracy in percent over every sample yielded by ``data_loader``
        (``0.0`` when the loader yields nothing).

    Raises:
        RuntimeError: If the engine cannot be deserialized or its binding
            shapes cannot be resolved.
        ValueError: If a batch is larger than the engine's batch size.

    Binding 0 is treated as the network input and binding 1 as its output, and
    both are assumed to hold float32 data. Batches smaller than the engine's
    batch size are zero-padded and only the rows belonging to real samples are
    scored, so no sample is left out of the accuracy. The mean negative
    log-likelihood is accumulated as well, but only the accuracy is returned.
    """
    with open(model_name, "rb") as f:
        serialized_engine = f.read()
    runtime = trt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    if engine is None:
        raise RuntimeError(f"Failed to deserialize TensorRT engine from {model_name}")
    context = engine.create_execution_context()

    # Resolve concrete binding shapes. If the input still has a dynamic (-1)
    # dimension, pin it to the largest shape allowed by optimization profile 0.
    input_shape = tuple(context.get_binding_shape(0))
    if any(dim < 0 for dim in input_shape):
        max_shape = engine.get_profile_shape(0, 0)[2]
        context.set_binding_shape(0, max_shape)
        input_shape = tuple(context.get_binding_shape(0))
    output_shape = tuple(context.get_binding_shape(1))
    if any(dim < 0 for dim in input_shape + output_shape):
        raise RuntimeError("Could not resolve the TensorRT binding shapes")
    batch_size = input_shape[0]

    # Host staging buffers, reused for every batch
    h_input = np.zeros(input_shape, dtype=np.float32)
    h_output = np.empty(output_shape, dtype=np.float32)
    # Allocate device memory matching the host buffers
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()

    test_loss = 0
    correct = 0
    total = 0
    try:
        for data, target in data_loader:
            num_samples = target.shape[0]
            if num_samples > batch_size:
                raise ValueError(
                    f"Batch of {num_samples} samples exceeds the engine batch size {batch_size}"
                )
            # Copy the batch into the staging buffer and zero any unused rows
            h_input[:num_samples] = data.numpy().reshape((num_samples,) + input_shape[1:])
            h_input[num_samples:] = 0
            # Transfer input data to device
            cuda.memcpy_htod_async(d_input, h_input, stream)
            # Execute model
            context.execute_async_v2(bindings, stream.handle, None)
            # Transfer predictions back
            cuda.memcpy_dtoh_async(h_output, d_output, stream)
            # Wait until the copies and the inference queued on the stream are done
            stream.synchronize()
            # Only the first num_samples rows belong to real samples
            output = torch.from_numpy(h_output[:num_samples])
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += num_samples
    finally:
        # Release the device buffers and TensorRT objects. The CUDA context is
        # left untouched: this function never pushed one, so popping here would
        # remove the context that the rest of the session relies on.
        d_input.free()
        d_output.free()
        del context
        del engine

    if total == 0:
        return 0.0
    test_loss /= total
    return 100. * correct / total

# Score the exported ONNX model on the test loader; test_onnx returns a percentage
acc = test_onnx(onnx_path, test_loader)
print(f"Accuracy of the onnx model is {acc}%")

# Score the serialized TensorRT engine on the same loader for comparison
trtr_acc = test_tensorrt(trt_path, test_loader)
print(f"Accuracy of the tensorrt model is {trtr_acc}%")