From 7c6c22a7c812cef06c2202c12f93846cc512823c Mon Sep 17 00:00:00 2001
From: cyrusbehr
Date: Wed, 10 Apr 2024 12:20:32 -0400
Subject: [PATCH] V5 (#53)

* Compiling
* Fixed bug
* Added to readme
* Added to gitignore
* Added support for loading TensorRT engine file directly
* Minor fixes
* Added log message
* Added command line parser
* Updated changelog
* Added clang-format and reformatted all files
* Changed formatting
---
 .clang-format | 6 +
 .gitignore | 4 +-
 .pre-commit-config.yaml | 6 +
 README.md | 11 +-
 src/cmd_line_parser.h | 98 ++++++
 src/engine.cpp | 561 ++----------------------------
 src/engine.h | 736 ++++++++++++++++++++++++++++++++++++----
 src/main.cpp | 105 +++---
 8 files changed, 875 insertions(+), 652 deletions(-)
 create mode 100644 .clang-format
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 src/cmd_line_parser.h

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..7c40558
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -4
+ColumnLimit: 140
+IndentWidth: 4
diff --git a/.gitignore b/.gitignore
index bcebf86..8a328d5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,5 +35,7 @@ build*/
 cmake-build-*
 .idea/
+.vscode/
+
+*.onnx
-*.onnx
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..273d14b
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+- repo: https://github.com/pre-commit/mirrors-clang-format
+  rev: 'v17.0.3' # Use the sha / tag you want to point at
+  hooks:
+  - id: clang-format
+    types_or: [c++, c, cuda]
diff --git a/README.md b/README.md
index b7d3d3f..6f1858b 100644
--- a/README.md
+++ b/README.md
@@ -78,9 +78,12 @@ You will need to supply your own onnx model for this sample code or you can down
 ### Running the Executable
 - Navigate to the build directory
 - Run the executable and provide the path to your onnx model.
-- ex. `./run_inference_benchmark ../models/yolov8n.onnx`
+- ex. `./run_inference_benchmark --onnx_model ../models/yolov8n.onnx`
 - Note: See sanity check section below for instructions on how to obtain the yolov8n model.
 - The first time you run the executable for a given model and options, a TensorRT engine file will be built from your onnx model. This process is fairly slow and can take 5+ minutes for some models (ex. yolo models).
+- Alternatively, you can choose to supply your own TensorRT engine file directly:
+- ex. `./run_inference_benchmark --trt_model ../models/yolov8n.engine.NVIDIAGeForceRTX3080LaptopGPU.fp16.1.1`
+  - Note: See the V5.0 changelog below for warnings when supplying your own TensorRT engine file.
 
 ### Sanity Check
 - To perform a sanity check, download the `YOLOv8n` model from [here](https://github.com/ultralytics/ultralytics#models).
@@ -161,6 +164,12 @@ If this project was helpful to you, I would appreciate if you could give it a st
 
 ### Changelog
 
+**V5.0**
+
+- The `Engine` class has been modified to take a template parameter which specifies the model's output data type. The implementation now supports outputs of type `float`, `__half`, `int8_t`, `int32_t`, `bool`, and `uint8_t`.
+- Added support for loading a TensorRT engine file directly without needing to compile it from an onnx model. However, it is highly recommended that you use the API provided to build the engine file from the onnx model, instead of loading a TensorRT model directly.
If you choose to load a TensorRT model file directly, you must hand-check that the `Options` have been set correctly for your model (for example, if your model has been compiled for FP32 but you try running FP16 inference, it will fail, potentially without a verbose error). +- Added command line parser. + **V4.1** - Added support for fixed batch size > 1. diff --git a/src/cmd_line_parser.h b/src/cmd_line_parser.h new file mode 100644 index 0000000..3b1c0b5 --- /dev/null +++ b/src/cmd_line_parser.h @@ -0,0 +1,98 @@ +#pragma once +#include "engine.h" +#include + +struct CommandLineArguments { + std::string onnxModelPath = ""; + std::string trtModelPath = ""; +}; + +inline void showHelp(char *argv[]) { + std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl; + + std::cout << "Options:" << std::endl; + std::cout << "--onnx_model Path to the ONNX model. " + "(Either onnx_model or trt_model must be provided)" + << std::endl; + std::cout << "--trt_model Path to the TensorRT model. " + "(Either onnx_model or trt_model must be provided)" + << std::endl; + + std::cout << "Example usage:" << std::endl; + std::cout << argv[0] << " --onnx_model model.onnx" << std::endl; +}; + +inline bool tryGetNextArgument(int argc, char *argv[], int ¤tIndex, std::string &value, std::string flag, bool printErrors = true) { + if (currentIndex + 1 >= argc) { + if (printErrors) + std::cout << "Error: No arguments provided for flag '" << flag << "'" << std::endl; + return false; + } + + std::string nextArgument = argv[currentIndex + 1]; + if (nextArgument.substr(0, 2) == "--") { + if (printErrors) + std::cout << "Error: No arguments provided for flag '" << flag << "'" << std::endl; + return false; + } + + value = argv[++currentIndex]; + return true; +}; + +inline bool parseArguments(int argc, char *argv[], CommandLineArguments &arguments) { + if (argc == 1) { + showHelp(argv); + return false; + } + + for (int i = 1; i < argc; i++) { + std::string argument = argv[i]; + + if (argument.substr(0, 2) == "--") { + std::string flag = argument.substr(2); + std::string nextArgument; + + if (flag == "onnx_model") { + if (!tryGetNextArgument(argc, argv, i, nextArgument, flag)) + return false; + + if (!Util::doesFileExist(nextArgument)) { + std::cout << "Error: Unable to find model at path '" << nextArgument << "' for flag '" << flag << "'" << std::endl; + return false; + } + + arguments.onnxModelPath = nextArgument; + } + + else if (flag == "trt_model") { + if (!tryGetNextArgument(argc, argv, i, nextArgument, flag)) + return false; + + if (!Util::doesFileExist(nextArgument)) { + std::cout << "Error: Unable to find model at path '" << nextArgument << "' for flag '" << flag << "'" << std::endl; + return false; + } + + arguments.trtModelPath = nextArgument; + } + + else { + std::cout << "Error: Unknown flag '" << flag << "'" << std::endl; + showHelp(argv); + return false; + } + } else { + std::cout << "Error: Unknown argument '" << argument << "'" << std::endl; + showHelp(argv); + return false; + } + } + + if (arguments.onnxModelPath.empty() && arguments.trtModelPath.empty()) { + std::cout << "Error: Must specify either 'onnx_model' or 'trt_model'" << std::endl; + return false; + } + + return true; +} diff --git a/src/engine.cpp b/src/engine.cpp index 64c2221..28c7fc4 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -1,27 +1,26 @@ +#include "engine.h" #include -#include #include +#include #include -#include #include -#include -#include "engine.h" -#include "NvOnnxParser.h" +#include using namespace nvinfer1; 
using namespace Util; -std::vector Util::getFilesInDirectory(const std::string& dirPath) { +std::vector Util::getFilesInDirectory(const std::string &dirPath) { std::vector filepaths; - for (const auto& entry: std::filesystem::directory_iterator(dirPath)) { + for (const auto &entry : std::filesystem::directory_iterator(dirPath)) { filepaths.emplace_back(entry.path().string()); } return filepaths; } void Logger::log(Severity severity, const char *msg) noexcept { - // Would advise using a proper logging utility such as https://github.com/gabime/spdlog - // For the sake of this tutorial, will just log to the console. + // Would advise using a proper logging utility such as + // https://github.com/gabime/spdlog For the sake of this tutorial, will just + // log to the console. // Only log Warnings or more important. if (severity <= Severity::kWARNING) { @@ -29,524 +28,12 @@ void Logger::log(Severity severity, const char *msg) noexcept { } } -Engine::Engine(const Options &options) - : m_options(options) {} - -bool Engine::build(std::string onnxModelPath, const std::array& subVals, const std::array& divVals, - bool normalize) { - m_subVals = subVals; - m_divVals = divVals; - m_normalize = normalize; - - // Only regenerate the engine file if it has not already been generated for the specified options - m_engineName = serializeEngineOptions(m_options, onnxModelPath); - std::cout << "Searching for engine file with name: " << m_engineName << std::endl; - - if (doesFileExist(m_engineName)) { - std::cout << "Engine found, not regenerating..." << std::endl; - return true; - } - - if (!doesFileExist(onnxModelPath)) { - throw std::runtime_error("Could not find model at path: " + onnxModelPath); - } - - // Was not able to find the engine file, generate... - std::cout << "Engine not found, generating. This could take a while..." << std::endl; - - // Create our engine builder. - auto builder = std::unique_ptr(nvinfer1::createInferBuilder(m_logger)); - if (!builder) { - return false; - } - - // Define an explicit batch size and then create the network (implicit batch size is deprecated). - // More info here: https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#explicit-implicit-batch - auto explicitBatch = 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - auto network = std::unique_ptr(builder->createNetworkV2(explicitBatch)); - if (!network) { - return false; - } - - // Create a parser for reading the onnx file. - auto parser = std::unique_ptr(nvonnxparser::createParser(*network, m_logger)); - if (!parser) { - return false; - } - - // We are going to first read the onnx file into memory, then pass that buffer to the parser. - // Had our onnx model file been encrypted, this approach would allow us to first decrypt the buffer. - std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate); - std::streamsize size = file.tellg(); - file.seekg(0, std::ios::beg); - - std::vector buffer(size); - if (!file.read(buffer.data(), size)) { - throw std::runtime_error("Unable to read engine file"); - } - - // Parse the buffer we read into memory. 
- auto parsed = parser->parse(buffer.data(), buffer.size()); - if (!parsed) { - return false; - } - - // Ensure that all the inputs have the same batch size - const auto numInputs = network->getNbInputs(); - if (numInputs < 1) { - throw std::runtime_error("Error, model needs at least 1 input!"); - } - const auto input0Batch = network->getInput(0)->getDimensions().d[0]; - for (int32_t i = 1; i < numInputs; ++i) { - if (network->getInput(i)->getDimensions().d[0] != input0Batch) { - throw std::runtime_error("Error, the model has multiple inputs, each with differing batch sizes!"); - } - } - - // Check to see if the model supports dynamic batch size or not - bool doesSupportDynamicBatch = false; - if (input0Batch == -1) { - doesSupportDynamicBatch = true; - std::cout << "Model supports dynamic batch size" << std::endl; - } else { - std::cout << "Model only supports fixed batch size of " << input0Batch << std::endl; - // If the model supports a fixed batch size, ensure that the maxBatchSize and optBatchSize were set correctly. - if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) { - throw std::runtime_error("Error, model only supports a fixed batch size of " + std::to_string(input0Batch) + - ". Must set Options.optBatchSize and Options.maxBatchSize to 1"); - } - } - - auto config = std::unique_ptr(builder->createBuilderConfig()); - if (!config) { - return false; - } - - // Register a single optimization profile - IOptimizationProfile *optProfile = builder->createOptimizationProfile(); - for (int32_t i = 0; i < numInputs; ++i) { - // Must specify dimensions for all the inputs the model expects. - const auto input = network->getInput(i); - const auto inputName = input->getName(); - const auto inputDims = input->getDimensions(); - int32_t inputC = inputDims.d[1]; - int32_t inputH = inputDims.d[2]; - int32_t inputW = inputDims.d[3]; - - // Specify the optimization profile` - if (doesSupportDynamicBatch) { - optProfile->setDimensions(inputName, OptProfileSelector::kMIN, Dims4(1, inputC, inputH, inputW)); - } else { - optProfile->setDimensions(inputName, OptProfileSelector::kMIN, Dims4(m_options.optBatchSize, inputC, inputH, inputW)); - } - optProfile->setDimensions(inputName, OptProfileSelector::kOPT, Dims4(m_options.optBatchSize, inputC, inputH, inputW)); - optProfile->setDimensions(inputName, OptProfileSelector::kMAX, Dims4(m_options.maxBatchSize, inputC, inputH, inputW)); - } - config->addOptimizationProfile(optProfile); - - // Set the precision level - if (m_options.precision == Precision::FP16) { - // Ensure the GPU supports FP16 inference - if (!builder->platformHasFastFp16()) { - throw std::runtime_error("Error: GPU does not support FP16 precision"); - } - config->setFlag(BuilderFlag::kFP16); - } else if (m_options.precision == Precision::INT8) { - if (numInputs > 1) { - throw std::runtime_error("Error, this implementation currently only supports INT8 quantization for single input models"); - } - - // Ensure the GPU supports INT8 Quantization - if (!builder->platformHasFastInt8()) { - throw std::runtime_error("Error: GPU does not support INT8 precision"); - } - - // Ensure the user has provided path to calibration data directory - if (m_options.calibrationDataDirectoryPath.empty()) { - throw std::runtime_error("Error: If INT8 precision is selected, must provide path to calibration data directory to Engine::build method"); - } - - config->setFlag((BuilderFlag::kINT8)); - - const auto input = network->getInput(0); - const auto inputName = input->getName(); - 
const auto inputDims = input->getDimensions(); - const auto calibrationFileName = m_engineName + ".calibration"; - - m_calibrator = std::make_unique(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2], m_options.calibrationDataDirectoryPath, - calibrationFileName, inputName, subVals, divVals, normalize); - config->setInt8Calibrator(m_calibrator.get()); - } - - // CUDA stream used for profiling by the builder. - cudaStream_t profileStream; - checkCudaErrorCode(cudaStreamCreate(&profileStream)); - config->setProfileStream(profileStream); - - // Build the engine - // If this call fails, it is suggested to increase the logger verbosity to kVERBOSE and try rebuilding the engine. - // Doing so will provide you with more information on why exactly it is failing. - std::unique_ptr plan{builder->buildSerializedNetwork(*network, *config)}; - if (!plan) { - return false; - } - - // Write the engine to disk - std::ofstream outfile(m_engineName, std::ofstream::binary); - outfile.write(reinterpret_cast(plan->data()), plan->size()); - - std::cout << "Success, saved engine to " << m_engineName << std::endl; - - checkCudaErrorCode(cudaStreamDestroy(profileStream)); - return true; -} - -Engine::~Engine() { - // Free the GPU memory - for (auto & buffer : m_buffers) { - checkCudaErrorCode(cudaFree(buffer)); - } - - m_buffers.clear(); -} - -bool Engine::loadNetwork() { - // Read the serialized model from disk - std::ifstream file(m_engineName, std::ios::binary | std::ios::ate); - std::streamsize size = file.tellg(); - file.seekg(0, std::ios::beg); - - std::vector buffer(size); - if (!file.read(buffer.data(), size)) { - throw std::runtime_error("Unable to read engine file"); - } - - // Create a runtime to deserialize the engine file. - m_runtime = std::unique_ptr {createInferRuntime(m_logger)}; - if (!m_runtime) { - return false; - } - - // Set the device index - auto ret = cudaSetDevice(m_options.deviceIndex); - if (ret != 0) { - int numGPUs; - cudaGetDeviceCount(&numGPUs); - auto errMsg = "Unable to set GPU device index to: " + std::to_string(m_options.deviceIndex) + - ". Note, your device has " + std::to_string(numGPUs) + " CUDA-capable GPU(s)."; - throw std::runtime_error(errMsg); - } - - // Create an engine, a representation of the optimized model. 
- m_engine = std::unique_ptr(m_runtime->deserializeCudaEngine(buffer.data(), buffer.size())); - if (!m_engine) { - return false; - } - - // The execution context contains all of the state associated with a particular invocation - m_context = std::unique_ptr(m_engine->createExecutionContext()); - if (!m_context) { - return false; - } - - // Storage for holding the input and output buffers - // This will be passed to TensorRT for inference - m_buffers.resize(m_engine->getNbIOTensors()); - - // Create a cuda stream - cudaStream_t stream; - checkCudaErrorCode(cudaStreamCreate(&stream)); - - // Allocate GPU memory for input and output buffers - m_outputLengthsFloat.clear(); - for (int i = 0; i < m_engine->getNbIOTensors(); ++i) { - const auto tensorName = m_engine->getIOTensorName(i); - m_IOTensorNames.emplace_back(tensorName); - const auto tensorType = m_engine->getTensorIOMode(tensorName); - const auto tensorShape = m_engine->getTensorShape(tensorName); - - // The implementation currently only works with float inputs and outputs - if (m_engine->getTensorDataType(tensorName) != DataType::kFLOAT) { - throw std::runtime_error("Error, the implementation currently only supports float inputs and outputs"); - } - - if (tensorType == TensorIOMode::kINPUT) { - // Allocate memory for the input - // Allocate enough to fit the max batch size (we could end up using less later) - checkCudaErrorCode(cudaMallocAsync(&m_buffers[i], m_options.maxBatchSize * tensorShape.d[1] * tensorShape.d[2] * tensorShape.d[3] * sizeof(float), stream)); - - // Store the input dims for later use - m_inputDims.emplace_back(tensorShape.d[1], tensorShape.d[2], tensorShape.d[3]); - m_inputBatchSize = tensorShape.d[0]; - } else if (tensorType == TensorIOMode::kOUTPUT) { - // The binding is an output - uint32_t outputLenFloat = 1; - m_outputDims.push_back(tensorShape); - - for (int j = 1; j < tensorShape.nbDims; ++j) { - // We ignore j = 0 because that is the batch size, and we will take that into account when sizing the buffer - outputLenFloat *= tensorShape.d[j]; - } - - m_outputLengthsFloat.push_back(outputLenFloat); - // Now size the output buffer appropriately, taking into account the max possible batch size (although we could actually end up using less memory) - checkCudaErrorCode(cudaMallocAsync(&m_buffers[i], outputLenFloat * m_options.maxBatchSize * sizeof(float), stream)); - } else { - throw std::runtime_error("Error, IO Tensor is neither an input or output!"); - } - } - - // Synchronize and destroy the cuda stream - checkCudaErrorCode(cudaStreamSynchronize(stream)); - checkCudaErrorCode(cudaStreamDestroy(stream)); - - return true; -} - -bool Engine::runInference(const std::vector> &inputs, std::vector>>& featureVectors) { - // First we do some error checking - if (inputs.empty() || inputs[0].empty()) { - std::cout << "===== Error =====" << std::endl; - std::cout << "Provided input vector is empty!" << std::endl; - return false; - } - - const auto numInputs = m_inputDims.size(); - if (inputs.size() != numInputs) { - std::cout << "===== Error =====" << std::endl; - std::cout << "Incorrect number of inputs provided!" << std::endl; - return false; - } - - // Ensure the batch size does not exceed the max - if (inputs[0].size() > static_cast(m_options.maxBatchSize)) { - std::cout << "===== Error =====" << std::endl; - std::cout << "The batch size is larger than the model expects!" 
<< std::endl; - std::cout << "Model max batch size: " << m_options.maxBatchSize << std::endl; - std::cout << "Batch size provided to call to runInference: " << inputs[0].size() << std::endl; - return false; - } - - // Ensure that if the model has a fixed batch size that is greater than 1, the input has the correct length - if (m_inputBatchSize != -1 && inputs[0].size() != static_cast(m_inputBatchSize)) { - std::cout << "===== Error =====" << std::endl; - std::cout << "The batch size is different from what the model expects!" << std::endl; - std::cout << "Model batch size: " << m_inputBatchSize << std::endl; - std::cout << "Batch size provided to call to runInference: " << inputs[0].size() << std::endl; - return false; - } - - const auto batchSize = static_cast(inputs[0].size()); - // Make sure the same batch size was provided for all inputs - for (size_t i = 1; i < inputs.size(); ++i) { - if (inputs[i].size() != static_cast(batchSize)) { - std::cout << "===== Error =====" << std::endl; - std::cout << "The batch size needs to be constant for all inputs!" << std::endl; - return false; - } - } - - // Create the cuda stream that will be used for inference - cudaStream_t inferenceCudaStream; - checkCudaErrorCode(cudaStreamCreate(&inferenceCudaStream)); - - // Preprocess all the inputs - for (size_t i = 0; i < numInputs; ++i) { - const auto& batchInput = inputs[i]; - const auto& dims = m_inputDims[i]; - - auto &input = batchInput[0]; - if (input.channels() != dims.d[0] || - input.rows != dims.d[1] || - input.cols != dims.d[2]) { - std::cout << "===== Error =====" << std::endl; - std::cout << "Input does not have correct size!" << std::endl; - std::cout << "Expected: (" << dims.d[0] << ", " << dims.d[1] << ", " - << dims.d[2] << ")" << std::endl; - std::cout << "Got: (" << input.channels() << ", " << input.rows << ", " << input.cols << ")" << std::endl; - std::cout << "Ensure you resize your input image to the correct size" << std::endl; - return false; - } - - nvinfer1::Dims4 inputDims = {batchSize, dims.d[0], dims.d[1], dims.d[2]}; - m_context->setInputShape(m_IOTensorNames[i].c_str(), inputDims); // Define the batch size - - // OpenCV reads images into memory in NHWC format, while TensorRT expects images in NCHW format. - // The following method converts NHWC to NCHW. - // Even though TensorRT expects NCHW at IO, during optimization, it can internally use NHWC to optimize cuda kernels - // See: https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#data-layout - // Copy over the input data and perform the preprocessing - auto mfloat = blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize); - auto *dataPointer = mfloat.ptr(); - - checkCudaErrorCode(cudaMemcpyAsync(m_buffers[i], dataPointer, - mfloat.cols * mfloat.rows * mfloat.channels() * sizeof(float), - cudaMemcpyDeviceToDevice, inferenceCudaStream)); - } - - // Ensure all dynamic bindings have been defined. - if (!m_context->allInputDimensionsSpecified()) { - throw std::runtime_error("Error, not all required dimensions specified."); - } - - // Set the address of the input and output buffers - for (size_t i = 0; i < m_buffers.size(); ++i) { - bool status = m_context->setTensorAddress(m_IOTensorNames[i].c_str(), m_buffers[i]); - if (!status) { - return false; - } - } - - // Run inference. 
- bool status = m_context->enqueueV3(inferenceCudaStream); - if (!status) { - return false; - } - - // Copy the outputs back to CPU - featureVectors.clear(); - - for (int batch = 0; batch < batchSize; ++batch) { - // Batch - std::vector> batchOutputs{}; - for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbBindings(); ++outputBinding) { - // We start at index m_inputDims.size() to account for the inputs in our m_buffers - std::vector output; - auto outputLenFloat = m_outputLengthsFloat[outputBinding - numInputs]; - output.resize(outputLenFloat); - // Copy the output - checkCudaErrorCode(cudaMemcpyAsync(output.data(), static_cast(m_buffers[outputBinding]) + (batch * sizeof(float) * outputLenFloat), outputLenFloat * sizeof(float), cudaMemcpyDeviceToHost, inferenceCudaStream)); - batchOutputs.emplace_back(std::move(output)); - } - featureVectors.emplace_back(std::move(batchOutputs)); - } - - // Synchronize the cuda stream - checkCudaErrorCode(cudaStreamSynchronize(inferenceCudaStream)); - checkCudaErrorCode(cudaStreamDestroy(inferenceCudaStream)); - return true; -} - -cv::cuda::GpuMat Engine::blobFromGpuMats(const std::vector& batchInput, const std::array& subVals, const std::array& divVals, bool normalize) { - cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3); - - size_t width = batchInput[0].cols * batchInput[0].rows; - for (size_t img = 0; img < batchInput.size(); img++) { - std::vector input_channels{ - cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img])), - cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width + width * 3 * img])), - cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, - &(gpu_dst.ptr()[width * 2 + width * 3 * img])) - }; - cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW - } - - cv::cuda::GpuMat mfloat; - if (normalize) { - // [0.f, 1.f] - gpu_dst.convertTo(mfloat, CV_32FC3, 1.f / 255.f); - } else { - // [0.f, 255.f] - gpu_dst.convertTo(mfloat, CV_32FC3); - } - - // Apply scaling and mean subtraction - cv::cuda::subtract(mfloat, cv::Scalar(subVals[0], subVals[1], subVals[2]), mfloat, cv::noArray(), -1); - cv::cuda::divide(mfloat, cv::Scalar(divVals[0], divVals[1], divVals[2]), mfloat, 1, -1); - - return mfloat; -} - -std::string Engine::serializeEngineOptions(const Options &options, const std::string& onnxModelPath) { - const auto filenamePos = onnxModelPath.find_last_of('/') + 1; - std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine"; - - // Add the GPU device name to the file to ensure that the model is only used on devices with the exact same GPU - std::vector deviceNames; - getDeviceNames(deviceNames); - - if (static_cast(options.deviceIndex) >= deviceNames.size()) { - throw std::runtime_error("Error, provided device index is out of range!"); - } - - auto deviceName = deviceNames[options.deviceIndex]; - // Remove spaces from the device name - deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end()); - - engineName+= "." + deviceName; - - // Serialize the specified options into the filename - if (options.precision == Precision::FP16) { - engineName += ".fp16"; - } else if (options.precision == Precision::FP32){ - engineName += ".fp32"; - } else { - engineName += ".int8"; - } - - engineName += "." + std::to_string(options.maxBatchSize); - engineName += "." 
+ std::to_string(options.optBatchSize); - - return engineName; -} - -void Engine::getDeviceNames(std::vector& deviceNames) { - int numGPUs; - cudaGetDeviceCount(&numGPUs); - - for (int device=0; device>>& input, std::vector>& output) { - if (input.size() != 1) { - throw std::logic_error("The feature vector has incorrect dimensions!"); - } - - output = std::move(input[0]); -} - -void Engine::transformOutput(std::vector>>& input, std::vector& output) { - if (input.size() != 1 || input[0].size() != 1) { - throw std::logic_error("The feature vector has incorrect dimensions!"); - } - - output = std::move(input[0][0]); -} - -Int8EntropyCalibrator2::Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, - const std::string &calibDataDirPath, - const std::string &calibTableName, - const std::string &inputBlobName, - const std::array& subVals, - const std::array& divVals, - bool normalize, +Int8EntropyCalibrator2::Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, const std::string &calibDataDirPath, + const std::string &calibTableName, const std::string &inputBlobName, + const std::array &subVals, const std::array &divVals, bool normalize, bool readCache) - : m_batchSize(batchSize) - , m_inputW(inputW) - , m_inputH(inputH) - , m_imgIdx(0) - , m_calibTableName(calibTableName) - , m_inputBlobName(inputBlobName) - , m_subVals(subVals) - , m_divVals(divVals) - , m_normalize(normalize) - , m_readCache(readCache) { + : m_batchSize(batchSize), m_inputW(inputW), m_inputH(inputH), m_imgIdx(0), m_calibTableName(calibTableName), + m_inputBlobName(inputBlobName), m_subVals(subVals), m_divVals(divVals), m_normalize(normalize), m_readCache(readCache) { // Allocate GPU memory to hold the entire batch m_inputCount = 3 * inputW * inputH * batchSize; @@ -563,8 +50,8 @@ Int8EntropyCalibrator2::Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW } // Randomize the calibration data - auto rd = std::random_device {}; - auto rng = std::default_random_engine { rd() }; + auto rd = std::random_device{}; + auto rng = std::default_random_engine{rd()}; std::shuffle(std::begin(m_imgPaths), std::end(m_imgPaths), rng); } @@ -574,7 +61,8 @@ int32_t Int8EntropyCalibrator2::getBatchSize() const noexcept { } bool Int8EntropyCalibrator2::getBatch(void **bindings, const char **names, int32_t nbBindings) noexcept { - // This method will read a batch of images into GPU memory, and place the pointer to the GPU memory in the bindings variable. + // This method will read a batch of images into GPU memory, and place the + // pointer to the GPU memory in the bindings variable. 
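    // Per the IInt8Calibrator contract, returning false from getBatch() signals that the
    // calibration data has been exhausted and ends calibration.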
if (m_imgIdx + m_batchSize > static_cast(m_imgPaths.size())) { // There are not enough images left to satisfy an entire batch @@ -586,7 +74,7 @@ bool Int8EntropyCalibrator2::getBatch(void **bindings, const char **names, int32 for (int i = m_imgIdx; i < m_imgIdx + m_batchSize; i++) { std::cout << "Reading image " << i << ": " << m_imgPaths[i] << std::endl; auto cpuImg = cv::imread(m_imgPaths[i]); - if (cpuImg.empty()){ + if (cpuImg.empty()) { std::cout << "Fatal error: Unable to read image at path: " << m_imgPaths[i] << std::endl; return false; } @@ -596,20 +84,20 @@ bool Int8EntropyCalibrator2::getBatch(void **bindings, const char **names, int32 cv::cuda::cvtColor(gpuImg, gpuImg, cv::COLOR_BGR2RGB); // TODO: Define any preprocessing code here, such as resizing - auto resized = Engine::resizeKeepAspectRatioPadRightBottom(gpuImg, m_inputH, m_inputW); + auto resized = Engine::resizeKeepAspectRatioPadRightBottom(gpuImg, m_inputH, m_inputW); inputImgs.emplace_back(std::move(resized)); } // Convert the batch from NHWC to NCHW // ALso apply normalization, scaling, and mean subtraction - auto mfloat = Engine::blobFromGpuMats(inputImgs, m_subVals, m_divVals, m_normalize); + auto mfloat = Engine::blobFromGpuMats(inputImgs, m_subVals, m_divVals, m_normalize); auto *dataPointer = mfloat.ptr(); // Copy the GPU buffer to member variable so that it persists checkCudaErrorCode(cudaMemcpyAsync(m_deviceInput, dataPointer, m_inputCount * sizeof(float), cudaMemcpyDeviceToDevice)); - m_imgIdx+= m_batchSize; + m_imgIdx += m_batchSize; if (std::string(names[0]) != m_inputBlobName) { std::cout << "Error: Incorrect input name provided!" << std::endl; return false; @@ -634,10 +122,7 @@ void const *Int8EntropyCalibrator2::readCalibrationCache(size_t &length) noexcep void Int8EntropyCalibrator2::writeCalibrationCache(const void *ptr, std::size_t length) noexcept { std::cout << "Writing calib cache: " << m_calibTableName << " Size: " << length << " bytes" << std::endl; std::ofstream output(m_calibTableName, std::ios::binary); - output.write(reinterpret_cast(ptr), length); + output.write(reinterpret_cast(ptr), length); } -Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { - checkCudaErrorCode(cudaFree(m_deviceInput)); -}; - +Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { checkCudaErrorCode(cudaFree(m_deviceInput)); }; diff --git a/src/engine.h b/src/engine.h index ab6de35..cc0d6f7 100644 --- a/src/engine.h +++ b/src/engine.h @@ -1,42 +1,44 @@ #pragma once -#include +#include "NvInfer.h" +#include "NvOnnxParser.h" #include -#include +#include +#include +#include #include -#include #include -#include "NvInfer.h" -#include +#include +#include +#include // Utility methods namespace Util { - inline bool doesFileExist(const std::string& filepath) { - std::ifstream f(filepath.c_str()); - return f.good(); - } +inline bool doesFileExist(const std::string &filepath) { + std::ifstream f(filepath.c_str()); + return f.good(); +} - inline void checkCudaErrorCode(cudaError_t code) { - if (code != 0) { - std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + "(" + cudaGetErrorName(code) + "), with message: " + cudaGetErrorString(code); - std::cout << errMsg << std::endl; - throw std::runtime_error(errMsg); - } +inline void checkCudaErrorCode(cudaError_t code) { + if (code != 0) { + std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + "(" + cudaGetErrorName(code) + + "), with message: " + cudaGetErrorString(code); + std::cout << errMsg << std::endl; + throw 
std::runtime_error(errMsg); } - - std::vector getFilesInDirectory(const std::string& dirPath); } + +std::vector getFilesInDirectory(const std::string &dirPath); +} // namespace Util // Utility Timer -template -class Stopwatch -{ +template class Stopwatch { typename Clock::time_point start_point; + public: - Stopwatch() :start_point(Clock::now()){} + Stopwatch() : start_point(Clock::now()) {} // Returns elapsed time - template - Rep elapsedTime() const { + template Rep elapsedTime() const { std::atomic_thread_fence(std::memory_order_relaxed); auto counted_time = std::chrono::duration_cast(Clock::now() - start_point).count(); std::atomic_thread_fence(std::memory_order_relaxed); @@ -62,10 +64,12 @@ enum class Precision { struct Options { // Precision to use for GPU inference. Precision precision = Precision::FP16; - // If INT8 precision is selected, must provide path to calibration dataset directory. + // If INT8 precision is selected, must provide path to calibration dataset + // directory. std::string calibrationDataDirectoryPath; - // The batch size to be used when computing calibration data for INT8 inference. - // Should be set to as large a batch number as your GPU will support. + // The batch size to be used when computing calibration data for INT8 + // inference. Should be set to as large a batch number as your GPU will + // support. int32_t calibrationBatchSize = 128; // The batch size which should be optimized for. int32_t optBatchSize = 1; @@ -78,14 +82,17 @@ struct Options { // Class used for int8 calibration class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: - Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, const std::string& calibDataDirPath, const std::string& calibTableName, const std::string& inputBlobName, - const std::array& subVals = {0.f, 0.f, 0.f},const std::array& divVals = {1.f, 1.f, 1.f}, bool normalize = true, bool readCache = true); + Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, const std::string &calibDataDirPath, + const std::string &calibTableName, const std::string &inputBlobName, + const std::array &subVals = {0.f, 0.f, 0.f}, const std::array &divVals = {1.f, 1.f, 1.f}, + bool normalize = true, bool readCache = true); virtual ~Int8EntropyCalibrator2(); // Abstract base class methods which must be implemented - int32_t getBatchSize () const noexcept override; - bool getBatch (void *bindings[], char const *names[], int32_t nbBindings) noexcept override; - void const * readCalibrationCache (std::size_t &length) noexcept override; - void writeCalibrationCache (void const *ptr, std::size_t length) noexcept override; + int32_t getBatchSize() const noexcept override; + bool getBatch(void *bindings[], char const *names[], int32_t nbBindings) noexcept override; + void const *readCalibrationCache(std::size_t &length) noexcept override; + void writeCalibrationCache(void const *ptr, std::size_t length) noexcept override; + private: const int32_t m_batchSize; const int32_t m_inputW; @@ -99,57 +106,83 @@ class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { const std::array m_divVals; const bool m_normalize; const bool m_readCache; - void* m_deviceInput; + void *m_deviceInput; std::vector m_calibCache; }; // Class to extend TensorRT logger class Logger : public nvinfer1::ILogger { - void log (Severity severity, const char* msg) noexcept override; + void log(Severity severity, const char *msg) noexcept override; }; -class Engine { +template class Engine { public: - 
Engine(const Options& options); + Engine(const Options &options); ~Engine(); - // Build the network + + // Build the onnx model into a TensorRT engine file, cache the model to disk + // (to avoid rebuilding in future), and then load the model into memory The + // default implementation will normalize values between [0.f, 1.f] Setting the + // normalize flag to false will leave values between [0.f, 255.f] (some + // converted models may require this). If the model requires values to be + // normalized between [-1.f, 1.f], use the following params: + // subVals = {0.5f, 0.5f, 0.5f}; + // divVals = {0.5f, 0.5f, 0.5f}; + // normalize = true; + bool buildLoadNetwork(std::string onnxModelPath, const std::array &subVals = {0.f, 0.f, 0.f}, + const std::array &divVals = {1.f, 1.f, 1.f}, bool normalize = true); + + // Load a TensorRT engine file from disk into memory // The default implementation will normalize values between [0.f, 1.f] - // Setting the normalize flag to false will leave values between [0.f, 255.f] (some converted models may require this). - // If the model requires values to be normalized between [-1.f, 1.f], use the following params: + // Setting the normalize flag to false will leave values between [0.f, 255.f] + // (some converted models may require this). If the model requires values to + // be normalized between [-1.f, 1.f], use the following params: // subVals = {0.5f, 0.5f, 0.5f}; // divVals = {0.5f, 0.5f, 0.5f}; // normalize = true; - bool build(std::string onnxModelPath, const std::array& subVals = {0.f, 0.f, 0.f}, const std::array& divVals = {1.f, 1.f, 1.f}, - bool normalize = true); - // Load and prepare the network for inference - bool loadNetwork(); + bool loadNetwork(std::string trtModelPath, const std::array &subVals = {0.f, 0.f, 0.f}, + const std::array &divVals = {1.f, 1.f, 1.f}, bool normalize = true); + // Run inference. // Input format [input][batch][cv::cuda::GpuMat] // Output format [batch][output][feature_vector] - bool runInference(const std::vector>& inputs, std::vector>>& featureVectors); + bool runInference(const std::vector> &inputs, std::vector>> &featureVectors); - // Utility method for resizing an image while maintaining the aspect ratio by adding padding to smaller dimension after scaling - // While letterbox padding normally adds padding to top & bottom, or left & right sides, this implementation only adds padding to the right or bottom side - // This is done so that it's easier to convert detected coordinates (ex. YOLO model) back to the original reference frame. - static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input, size_t height, size_t width, const cv::Scalar& bgcolor = cv::Scalar(0, 0, 0)); + // Utility method for resizing an image while maintaining the aspect ratio by + // adding padding to smaller dimension after scaling While letterbox padding + // normally adds padding to top & bottom, or left & right sides, this + // implementation only adds padding to the right or bottom side This is done + // so that it's easier to convert detected coordinates (ex. YOLO model) back + // to the original reference frame. 
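    // Example (hypothetical input size): a 1920x1080 frame prepared for a 640x640 model is scaled to
    // 640x360 and then padded with 280 black rows on the bottom:
    //   auto padded = Engine<float>::resizeKeepAspectRatioPadRightBottom(gpuImg, 640, 640);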
+ static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width, + const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0)); - [[nodiscard]] const std::vector& getInputDims() const { return m_inputDims; }; - [[nodiscard]] const std::vector& getOutputDims() const { return m_outputDims ;}; + [[nodiscard]] const std::vector &getInputDims() const { return m_inputDims; }; + [[nodiscard]] const std::vector &getOutputDims() const { return m_outputDims; }; // Utility method for transforming triple nested output array into 2D array - // Should be used when the output batch size is 1, but there are multiple output feature vectors - static void transformOutput(std::vector>>& input, std::vector>& output); + // Should be used when the output batch size is 1, but there are multiple + // output feature vectors + static void transformOutput(std::vector>> &input, std::vector> &output); - // Utility method for transforming triple nested output array into single array - // Should be used when the output batch size is 1, and there is only a single output feature vector - static void transformOutput(std::vector>>& input, std::vector& output); + // Utility method for transforming triple nested output array into single + // array Should be used when the output batch size is 1, and there is only a + // single output feature vector + static void transformOutput(std::vector>> &input, std::vector &output); // Convert NHWC to NCHW and apply scaling and mean subtraction - static cv::cuda::GpuMat blobFromGpuMats(const std::vector& batchInput, const std::array& subVals, const std::array& divVals, bool normalize); + static cv::cuda::GpuMat blobFromGpuMats(const std::vector &batchInput, const std::array &subVals, + const std::array &divVals, bool normalize); + private: + // Build the network + bool build(std::string onnxModelPath, const std::array &subVals, const std::array &divVals, bool normalize); + // Converts the engine options into a string - std::string serializeEngineOptions(const Options& options, const std::string& onnxModelPath); + std::string serializeEngineOptions(const Options &options, const std::string &onnxModelPath); + + void getDeviceNames(std::vector &deviceNames); - void getDeviceNames(std::vector& deviceNames); + void clearGpuBuffers(); // Normalization, scaling, and mean subtraction of inputs std::array m_subVals{}; @@ -157,19 +190,598 @@ class Engine { bool m_normalize; // Holds pointers to the input and output GPU buffers - std::vector m_buffers; - std::vector m_outputLengthsFloat{}; + std::vector m_buffers; + std::vector m_outputLengths{}; std::vector m_inputDims; std::vector m_outputDims; std::vector m_IOTensorNames; int32_t m_inputBatchSize; - // Must keep IRuntime around for inference, see: https://forums.developer.nvidia.com/t/is-it-safe-to-deallocate-nvinfer1-iruntime-after-creating-an-nvinfer1-icudaengine-but-before-running-inference-with-said-icudaengine/255381/2?u=cyruspk4w6 + // Must keep IRuntime around for inference, see: + // https://forums.developer.nvidia.com/t/is-it-safe-to-deallocate-nvinfer1-iruntime-after-creating-an-nvinfer1-icudaengine-but-before-running-inference-with-said-icudaengine/255381/2?u=cyruspk4w6 std::unique_ptr m_runtime = nullptr; std::unique_ptr m_calibrator = nullptr; std::unique_ptr m_engine = nullptr; std::unique_ptr m_context = nullptr; const Options m_options; Logger m_logger; - std::string m_engineName; }; + +template Engine::Engine(const Options &options) : m_options(options) {} + +template Engine::~Engine() 
{ clearGpuBuffers(); } + +template void Engine::clearGpuBuffers() { + if (!m_buffers.empty()) { + // Free GPU memory of outputs + const auto numInputs = m_inputDims.size(); + for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbBindings(); ++outputBinding) { + Util::checkCudaErrorCode(cudaFree(m_buffers[outputBinding])); + } + m_buffers.clear(); + } +} + +template +bool Engine::buildLoadNetwork(std::string onnxModelPath, const std::array &subVals, const std::array &divVals, + bool normalize) { + // Only regenerate the engine file if it has not already been generated for + // the specified options, otherwise load cached version from disk + const auto engineName = serializeEngineOptions(m_options, onnxModelPath); + std::cout << "Searching for engine file with name: " << engineName << std::endl; + + if (Util::doesFileExist(engineName)) { + std::cout << "Engine found, not regenerating..." << std::endl; + } else { + if (!Util::doesFileExist(onnxModelPath)) { + throw std::runtime_error("Could not find onnx model at path: " + onnxModelPath); + } + + // Was not able to find the engine file, generate... + std::cout << "Engine not found, generating. This could take a while..." << std::endl; + + // Build the onnx model into a TensorRT engine + auto ret = build(onnxModelPath, subVals, divVals, normalize); + if (!ret) { + return false; + } + } + + // Load the TensorRT engine file into memory + return loadNetwork(engineName, subVals, divVals, normalize); +} + +template +bool Engine::loadNetwork(std::string trtModelPath, const std::array &subVals, const std::array &divVals, + bool normalize) { + m_subVals = subVals; + m_divVals = divVals; + m_normalize = normalize; + + // Read the serialized model from disk + if (!Util::doesFileExist(trtModelPath)) { + std::cout << "Error, unable to read TensorRT model at path: " + trtModelPath << std::endl; + return false; + } else { + std::cout << "Loading TensorRT engine file at path: " << trtModelPath << std::endl; + } + + std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate); + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector buffer(size); + if (!file.read(buffer.data(), size)) { + throw std::runtime_error("Unable to read engine file"); + } + + // Create a runtime to deserialize the engine file. + m_runtime = std::unique_ptr{nvinfer1::createInferRuntime(m_logger)}; + if (!m_runtime) { + return false; + } + + // Set the device index + auto ret = cudaSetDevice(m_options.deviceIndex); + if (ret != 0) { + int numGPUs; + cudaGetDeviceCount(&numGPUs); + auto errMsg = "Unable to set GPU device index to: " + std::to_string(m_options.deviceIndex) + ". Note, your device has " + + std::to_string(numGPUs) + " CUDA-capable GPU(s)."; + throw std::runtime_error(errMsg); + } + + // Create an engine, a representation of the optimized model. 
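    // Note: deserializeCudaEngine() returns nullptr (with details sent to the logger) if the serialized
    // engine was built with a different TensorRT version or for a different GPU, which is why building
    // from the onnx model is preferred over supplying a .engine file by hand.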
+ m_engine = std::unique_ptr(m_runtime->deserializeCudaEngine(buffer.data(), buffer.size())); + if (!m_engine) { + return false; + } + + // The execution context contains all of the state associated with a + // particular invocation + m_context = std::unique_ptr(m_engine->createExecutionContext()); + if (!m_context) { + return false; + } + + // Storage for holding the input and output buffers + // This will be passed to TensorRT for inference + clearGpuBuffers(); + m_buffers.resize(m_engine->getNbIOTensors()); + + m_outputLengths.clear(); + m_inputDims.clear(); + m_outputDims.clear(); + m_IOTensorNames.clear(); + + // Create a cuda stream + cudaStream_t stream; + Util::checkCudaErrorCode(cudaStreamCreate(&stream)); + + // Allocate GPU memory for input and output buffers + m_outputLengths.clear(); + for (int i = 0; i < m_engine->getNbIOTensors(); ++i) { + const auto tensorName = m_engine->getIOTensorName(i); + m_IOTensorNames.emplace_back(tensorName); + const auto tensorType = m_engine->getTensorIOMode(tensorName); + const auto tensorShape = m_engine->getTensorShape(tensorName); + const auto tensorDataType = m_engine->getTensorDataType(tensorName); + + if (tensorType == nvinfer1::TensorIOMode::kINPUT) { + // The implementation currently only supports inputs of type float + if (m_engine->getTensorDataType(tensorName) != nvinfer1::DataType::kFLOAT) { + throw std::runtime_error("Error, the implementation currently only supports float inputs"); + } + + // Don't need to allocate memory for inputs as we will be using the OpenCV + // GpuMat buffer directly. + + // Store the input dims for later use + m_inputDims.emplace_back(tensorShape.d[1], tensorShape.d[2], tensorShape.d[3]); + m_inputBatchSize = tensorShape.d[0]; + } else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) { + // Ensure the model output data type matches the template argument + // specified by the user + if (tensorDataType == nvinfer1::DataType::kFLOAT && !std::is_same::value) { + throw std::runtime_error("Error, the model has expected output of type float. Engine class " + "template parameter must be adjusted."); + } else if (tensorDataType == nvinfer1::DataType::kHALF && !std::is_same<__half, T>::value) { + throw std::runtime_error("Error, the model has expected output of type __half. Engine class " + "template parameter must be adjusted."); + } else if (tensorDataType == nvinfer1::DataType::kINT8 && !std::is_same::value) { + throw std::runtime_error("Error, the model has expected output of type int8_t. Engine class " + "template parameter must be adjusted."); + } else if (tensorDataType == nvinfer1::DataType::kINT32 && !std::is_same::value) { + throw std::runtime_error("Error, the model has expected output of type int32_t. Engine " + "class template parameter must be adjusted."); + } else if (tensorDataType == nvinfer1::DataType::kBOOL && !std::is_same::value) { + throw std::runtime_error("Error, the model has expected output of type bool. Engine class " + "template parameter must be adjusted."); + } else if (tensorDataType == nvinfer1::DataType::kUINT8 && !std::is_same::value) { + throw std::runtime_error("Error, the model has expected output of type uint8_t. 
Engine " + "class template parameter must be adjusted."); + } else if (tensorDataType == nvinfer1::DataType::kFP8) { + throw std::runtime_error("Error, model has unsupported output type"); + } + + // The binding is an output + uint32_t outputLength = 1; + m_outputDims.push_back(tensorShape); + + for (int j = 1; j < tensorShape.nbDims; ++j) { + // We ignore j = 0 because that is the batch size, and we will take that + // into account when sizing the buffer + outputLength *= tensorShape.d[j]; + } + + m_outputLengths.push_back(outputLength); + // Now size the output buffer appropriately, taking into account the max + // possible batch size (although we could actually end up using less + // memory) + Util::checkCudaErrorCode(cudaMallocAsync(&m_buffers[i], outputLength * m_options.maxBatchSize * sizeof(T), stream)); + } else { + throw std::runtime_error("Error, IO Tensor is neither an input or output!"); + } + } + + // Synchronize and destroy the cuda stream + Util::checkCudaErrorCode(cudaStreamSynchronize(stream)); + Util::checkCudaErrorCode(cudaStreamDestroy(stream)); + + return true; +} + +template +bool Engine::build(std::string onnxModelPath, const std::array &subVals, const std::array &divVals, bool normalize) { + // Create our engine builder. + auto builder = std::unique_ptr(nvinfer1::createInferBuilder(m_logger)); + if (!builder) { + return false; + } + + // Define an explicit batch size and then create the network (implicit batch + // size is deprecated). More info here: + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#explicit-implicit-batch + auto explicitBatch = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + auto network = std::unique_ptr(builder->createNetworkV2(explicitBatch)); + if (!network) { + return false; + } + + // Create a parser for reading the onnx file. + auto parser = std::unique_ptr(nvonnxparser::createParser(*network, m_logger)); + if (!parser) { + return false; + } + + // We are going to first read the onnx file into memory, then pass that buffer + // to the parser. Had our onnx model file been encrypted, this approach would + // allow us to first decrypt the buffer. + std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate); + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector buffer(size); + if (!file.read(buffer.data(), size)) { + throw std::runtime_error("Unable to read engine file"); + } + + // Parse the buffer we read into memory. 
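    // parse() returns false if the ONNX graph cannot be parsed; the specific errors can be retrieved
    // via parser->getNbErrors() and parser->getError().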
+ auto parsed = parser->parse(buffer.data(), buffer.size()); + if (!parsed) { + return false; + } + + // Ensure that all the inputs have the same batch size + const auto numInputs = network->getNbInputs(); + if (numInputs < 1) { + throw std::runtime_error("Error, model needs at least 1 input!"); + } + const auto input0Batch = network->getInput(0)->getDimensions().d[0]; + for (int32_t i = 1; i < numInputs; ++i) { + if (network->getInput(i)->getDimensions().d[0] != input0Batch) { + throw std::runtime_error("Error, the model has multiple inputs, each " + "with differing batch sizes!"); + } + } + + // Check to see if the model supports dynamic batch size or not + bool doesSupportDynamicBatch = false; + if (input0Batch == -1) { + doesSupportDynamicBatch = true; + std::cout << "Model supports dynamic batch size" << std::endl; + } else { + std::cout << "Model only supports fixed batch size of " << input0Batch << std::endl; + // If the model supports a fixed batch size, ensure that the maxBatchSize + // and optBatchSize were set correctly. + if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) { + throw std::runtime_error("Error, model only supports a fixed batch size of " + std::to_string(input0Batch) + + ". Must set Options.optBatchSize and Options.maxBatchSize to 1"); + } + } + + auto config = std::unique_ptr(builder->createBuilderConfig()); + if (!config) { + return false; + } + + // Register a single optimization profile + nvinfer1::IOptimizationProfile *optProfile = builder->createOptimizationProfile(); + for (int32_t i = 0; i < numInputs; ++i) { + // Must specify dimensions for all the inputs the model expects. + const auto input = network->getInput(i); + const auto inputName = input->getName(); + const auto inputDims = input->getDimensions(); + int32_t inputC = inputDims.d[1]; + int32_t inputH = inputDims.d[2]; + int32_t inputW = inputDims.d[3]; + + // Specify the optimization profile` + if (doesSupportDynamicBatch) { + optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, inputC, inputH, inputW)); + } else { + optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, + nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, inputW)); + } + optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, + nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, inputW)); + optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, + nvinfer1::Dims4(m_options.maxBatchSize, inputC, inputH, inputW)); + } + config->addOptimizationProfile(optProfile); + + // Set the precision level + const auto engineName = serializeEngineOptions(m_options, onnxModelPath); + if (m_options.precision == Precision::FP16) { + // Ensure the GPU supports FP16 inference + if (!builder->platformHasFastFp16()) { + throw std::runtime_error("Error: GPU does not support FP16 precision"); + } + config->setFlag(nvinfer1::BuilderFlag::kFP16); + } else if (m_options.precision == Precision::INT8) { + if (numInputs > 1) { + throw std::runtime_error("Error, this implementation currently only supports INT8 " + "quantization for single input models"); + } + + // Ensure the GPU supports INT8 Quantization + if (!builder->platformHasFastInt8()) { + throw std::runtime_error("Error: GPU does not support INT8 precision"); + } + + // Ensure the user has provided path to calibration data directory + if (m_options.calibrationDataDirectoryPath.empty()) { + throw std::runtime_error("Error: If INT8 precision is selected, must provide path 
to " + "calibration data directory to Engine::build method"); + } + + config->setFlag((nvinfer1::BuilderFlag::kINT8)); + + const auto input = network->getInput(0); + const auto inputName = input->getName(); + const auto inputDims = input->getDimensions(); + const auto calibrationFileName = engineName + ".calibration"; + + m_calibrator = std::make_unique(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2], + m_options.calibrationDataDirectoryPath, calibrationFileName, inputName, + subVals, divVals, normalize); + config->setInt8Calibrator(m_calibrator.get()); + } + + // CUDA stream used for profiling by the builder. + cudaStream_t profileStream; + Util::checkCudaErrorCode(cudaStreamCreate(&profileStream)); + config->setProfileStream(profileStream); + + // Build the engine + // If this call fails, it is suggested to increase the logger verbosity to + // kVERBOSE and try rebuilding the engine. Doing so will provide you with more + // information on why exactly it is failing. + std::unique_ptr plan{builder->buildSerializedNetwork(*network, *config)}; + if (!plan) { + return false; + } + + // Write the engine to disk + std::ofstream outfile(engineName, std::ofstream::binary); + outfile.write(reinterpret_cast(plan->data()), plan->size()); + + std::cout << "Success, saved engine to " << engineName << std::endl; + + Util::checkCudaErrorCode(cudaStreamDestroy(profileStream)); + return true; +} + +template +bool Engine::runInference(const std::vector> &inputs, + std::vector>> &featureVectors) { + // First we do some error checking + if (inputs.empty() || inputs[0].empty()) { + std::cout << "===== Error =====" << std::endl; + std::cout << "Provided input vector is empty!" << std::endl; + return false; + } + + const auto numInputs = m_inputDims.size(); + if (inputs.size() != numInputs) { + std::cout << "===== Error =====" << std::endl; + std::cout << "Incorrect number of inputs provided!" << std::endl; + return false; + } + + // Ensure the batch size does not exceed the max + if (inputs[0].size() > static_cast(m_options.maxBatchSize)) { + std::cout << "===== Error =====" << std::endl; + std::cout << "The batch size is larger than the model expects!" << std::endl; + std::cout << "Model max batch size: " << m_options.maxBatchSize << std::endl; + std::cout << "Batch size provided to call to runInference: " << inputs[0].size() << std::endl; + return false; + } + + // Ensure that if the model has a fixed batch size that is greater than 1, the + // input has the correct length + if (m_inputBatchSize != -1 && inputs[0].size() != static_cast(m_inputBatchSize)) { + std::cout << "===== Error =====" << std::endl; + std::cout << "The batch size is different from what the model expects!" << std::endl; + std::cout << "Model batch size: " << m_inputBatchSize << std::endl; + std::cout << "Batch size provided to call to runInference: " << inputs[0].size() << std::endl; + return false; + } + + const auto batchSize = static_cast(inputs[0].size()); + // Make sure the same batch size was provided for all inputs + for (size_t i = 1; i < inputs.size(); ++i) { + if (inputs[i].size() != static_cast(batchSize)) { + std::cout << "===== Error =====" << std::endl; + std::cout << "The batch size needs to be constant for all inputs!" 
<< std::endl; + return false; + } + } + + // Create the cuda stream that will be used for inference + cudaStream_t inferenceCudaStream; + Util::checkCudaErrorCode(cudaStreamCreate(&inferenceCudaStream)); + + std::vector preprocessedInputs; + + // Preprocess all the inputs + for (size_t i = 0; i < numInputs; ++i) { + const auto &batchInput = inputs[i]; + const auto &dims = m_inputDims[i]; + + auto &input = batchInput[0]; + if (input.channels() != dims.d[0] || input.rows != dims.d[1] || input.cols != dims.d[2]) { + std::cout << "===== Error =====" << std::endl; + std::cout << "Input does not have correct size!" << std::endl; + std::cout << "Expected: (" << dims.d[0] << ", " << dims.d[1] << ", " << dims.d[2] << ")" << std::endl; + std::cout << "Got: (" << input.channels() << ", " << input.rows << ", " << input.cols << ")" << std::endl; + std::cout << "Ensure you resize your input image to the correct size" << std::endl; + return false; + } + + nvinfer1::Dims4 inputDims = {batchSize, dims.d[0], dims.d[1], dims.d[2]}; + m_context->setInputShape(m_IOTensorNames[i].c_str(), + inputDims); // Define the batch size + + // OpenCV reads images into memory in NHWC format, while TensorRT expects + // images in NCHW format. The following method converts NHWC to NCHW. Even + // though TensorRT expects NCHW at IO, during optimization, it can + // internally use NHWC to optimize cuda kernels See: + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#data-layout + // Copy over the input data and perform the preprocessing + auto mfloat = blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize); + preprocessedInputs.push_back(mfloat); + m_buffers[i] = mfloat.ptr(); + } + + // Ensure all dynamic bindings have been defined. + if (!m_context->allInputDimensionsSpecified()) { + throw std::runtime_error("Error, not all required dimensions specified."); + } + + // Set the address of the input and output buffers + for (size_t i = 0; i < m_buffers.size(); ++i) { + bool status = m_context->setTensorAddress(m_IOTensorNames[i].c_str(), m_buffers[i]); + if (!status) { + return false; + } + } + + // Run inference. 
+ bool status = m_context->enqueueV3(inferenceCudaStream); + if (!status) { + return false; + } + + // Copy the outputs back to CPU + featureVectors.clear(); + + for (int batch = 0; batch < batchSize; ++batch) { + // Batch + std::vector> batchOutputs{}; + for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbBindings(); ++outputBinding) { + // We start at index m_inputDims.size() to account for the inputs in our + // m_buffers + std::vector output; + auto outputLength = m_outputLengths[outputBinding - numInputs]; + output.resize(outputLength); + // Copy the output + Util::checkCudaErrorCode(cudaMemcpyAsync(output.data(), + static_cast(m_buffers[outputBinding]) + (batch * sizeof(T) * outputLength), + outputLength * sizeof(T), cudaMemcpyDeviceToHost, inferenceCudaStream)); + batchOutputs.emplace_back(std::move(output)); + } + featureVectors.emplace_back(std::move(batchOutputs)); + } + + // Synchronize the cuda stream + Util::checkCudaErrorCode(cudaStreamSynchronize(inferenceCudaStream)); + Util::checkCudaErrorCode(cudaStreamDestroy(inferenceCudaStream)); + return true; +} + +template +cv::cuda::GpuMat Engine::blobFromGpuMats(const std::vector &batchInput, const std::array &subVals, + const std::array &divVals, bool normalize) { + cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3); + + size_t width = batchInput[0].cols * batchInput[0].rows; + for (size_t img = 0; img < batchInput.size(); img++) { + std::vector input_channels{ + cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img])), + cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width + width * 3 * img])), + cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width * 2 + width * 3 * img]))}; + cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW + } + + cv::cuda::GpuMat mfloat; + if (normalize) { + // [0.f, 1.f] + gpu_dst.convertTo(mfloat, CV_32FC3, 1.f / 255.f); + } else { + // [0.f, 255.f] + gpu_dst.convertTo(mfloat, CV_32FC3); + } + + // Apply scaling and mean subtraction + cv::cuda::subtract(mfloat, cv::Scalar(subVals[0], subVals[1], subVals[2]), mfloat, cv::noArray(), -1); + cv::cuda::divide(mfloat, cv::Scalar(divVals[0], divVals[1], divVals[2]), mfloat, 1, -1); + + return mfloat; +} + +template std::string Engine::serializeEngineOptions(const Options &options, const std::string &onnxModelPath) { + const auto filenamePos = onnxModelPath.find_last_of('/') + 1; + std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine"; + + // Add the GPU device name to the file to ensure that the model is only used + // on devices with the exact same GPU + std::vector deviceNames; + getDeviceNames(deviceNames); + + if (static_cast(options.deviceIndex) >= deviceNames.size()) { + throw std::runtime_error("Error, provided device index is out of range!"); + } + + auto deviceName = deviceNames[options.deviceIndex]; + // Remove spaces from the device name + deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end()); + + engineName += "." + deviceName; + + // Serialize the specified options into the filename + if (options.precision == Precision::FP16) { + engineName += ".fp16"; + } else if (options.precision == Precision::FP32) { + engineName += ".fp32"; + } else { + engineName += ".int8"; + } + + engineName += "." + std::to_string(options.maxBatchSize); + engineName += "." 
+ std::to_string(options.optBatchSize); + + return engineName; +} + +template void Engine::getDeviceNames(std::vector &deviceNames) { + int numGPUs; + cudaGetDeviceCount(&numGPUs); + + for (int device = 0; device < numGPUs; device++) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device); + + deviceNames.push_back(std::string(prop.name)); + } +} + +template +cv::cuda::GpuMat Engine::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width, + const cv::Scalar &bgcolor) { + float r = std::min(width / (input.cols * 1.0), height / (input.rows * 1.0)); + int unpad_w = r * input.cols; + int unpad_h = r * input.rows; + cv::cuda::GpuMat re(unpad_h, unpad_w, CV_8UC3); + cv::cuda::resize(input, re, re.size()); + cv::cuda::GpuMat out(height, width, CV_8UC3, bgcolor); + re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows))); + return out; +} + +template +void Engine::transformOutput(std::vector>> &input, std::vector> &output) { + if (input.size() != 1) { + throw std::logic_error("The feature vector has incorrect dimensions!"); + } + + output = std::move(input[0]); +} + +template void Engine::transformOutput(std::vector>> &input, std::vector &output) { + if (input.size() != 1 || input[0].size() != 1) { + throw std::logic_error("The feature vector has incorrect dimensions!"); + } + + output = std::move(input[0][0]); +} diff --git a/src/main.cpp b/src/main.cpp index 8c47466..2083cfa 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,26 +1,14 @@ +#include "cmd_line_parser.h" #include "engine.h" -#include -#include #include +#include +#include int main(int argc, char *argv[]) { - // Parse the command line arguments - // Must pass the model path as a command line argument to the executable - if (argc < 2) { - std::cout << "Error: Must specify the model path" << std::endl; - std::cout << "Usage: " << argv[0] << " /path/to/onnx/model.onnx" << std::endl; - return -1; - } + CommandLineArguments arguments; - if (argc > 3) { - std::cout << "Error: Too many arguments provided" << std::endl; - std::cout << "Usage: " << argv[0] << " /path/to/onnx/model.onnx" << std::endl; - } - - // Ensure the onnx model exists - const std::string onnxModelPath = argv[1]; - if (!Util::doesFileExist(onnxModelPath)) { - std::cout << "Error: Unable to find file at path: " << onnxModelPath << std::endl; + // Parse the command line arguments + if (!parseArguments(argc, argv, arguments)) { return -1; } @@ -29,40 +17,47 @@ int main(int argc, char *argv[]) { // Specify what precision to use for inference // FP16 is approximately twice as fast as FP32. options.precision = Precision::FP16; - // If using INT8 precision, must specify path to directory containing calibration data. + // If using INT8 precision, must specify path to directory containing + // calibration data. options.calibrationDataDirectoryPath = ""; // Specify the batch size to optimize for. options.optBatchSize = 1; // Specify the maximum batch size we plan on running. options.maxBatchSize = 1; - Engine engine(options); + Engine engine(options); // Define our preprocessing code // The default Engine::build method will normalize values between [0.f, 1.f] - // Setting the normalize flag to false will leave values between [0.f, 255.f] (some converted models may require this). + // Setting the normalize flag to false will leave values between [0.f, 255.f] + // (some converted models may require this). 
- // For our YoloV8 model, we need the values to be normalized between [0.f, 1.f] so we use the following params
- std::array<float, 3> subVals {0.f, 0.f, 0.f};
- std::array<float, 3> divVals {1.f, 1.f, 1.f};
+ // For our YoloV8 model, we need the values to be normalized between
+ // [0.f, 1.f] so we use the following params
+ std::array<float, 3> subVals{0.f, 0.f, 0.f};
+ std::array<float, 3> divVals{1.f, 1.f, 1.f};
bool normalize = true;
// Note, we could have also used the default values.
- // If the model requires values to be normalized between [-1.f, 1.f], use the following params:
+ // If the model requires values to be normalized between [-1.f, 1.f], use the
+ // following params:
// subVals = {0.5f, 0.5f, 0.5f};
// divVals = {0.5f, 0.5f, 0.5f};
// normalize = true;
-
- // Build the onnx model into a TensorRT engine file.
- bool succ = engine.build(onnxModelPath, subVals, divVals, normalize);
- if (!succ) {
- throw std::runtime_error("Unable to build TRT engine.");
- }
- // Load the TensorRT engine file from disk
- succ = engine.loadNetwork();
- if (!succ) {
- throw std::runtime_error("Unable to load TRT engine.");
+ if (!arguments.onnxModelPath.empty()) {
+ // Build the onnx model into a TensorRT engine file, and load the TensorRT
+ // engine file into memory.
+ bool succ = engine.buildLoadNetwork(arguments.onnxModelPath, subVals, divVals, normalize);
+ if (!succ) {
+ throw std::runtime_error("Unable to build or load TensorRT engine.");
+ }
+ } else {
+ // Load the TensorRT engine file directly
+ bool succ = engine.loadNetwork(arguments.trtModelPath, subVals, divVals, normalize);
+ if (!succ) {
+ throw std::runtime_error("Unable to load TensorRT engine.");
+ }
}
// Read the input image
@@ -80,25 +75,33 @@ int main(int argc, char *argv[]) {
// The model expects RGB input
cv::cuda::cvtColor(img, img, cv::COLOR_BGR2RGB);
- // In the following section we populate the input vectors to later pass for inference
- const auto& inputDims = engine.getInputDims();
+ // In the following section we populate the input vectors to later pass for
+ // inference
+ const auto &inputDims = engine.getInputDims();
std::vector<std::vector<cv::cuda::GpuMat>> inputs;
- // Let's use a batch size which matches that which we set the Options.optBatchSize option
+ // Let's use a batch size which matches the Options.optBatchSize option we
+ // set above
size_t batchSize = options.optBatchSize;
// TODO:
- // For the sake of the demo, we will be feeding the same image to all the inputs
- // You should populate your inputs appropriately.
- for (const auto & inputDim : inputDims) { // For each of the model inputs...
+ // For the sake of the demo, we will be feeding the same image to all the
+ // inputs. You should populate your inputs appropriately.
+ for (const auto &inputDim : inputDims) { // For each of the model inputs...
std::vector<cv::cuda::GpuMat> input;
for (size_t j = 0; j < batchSize; ++j) { // For each element we want to add to the batch...
// TODO:
- // You can choose to resize by scaling, adding padding, or a combination of the two in order to maintain the aspect ratio
- // You can use the Engine::resizeKeepAspectRatioPadRightBottom to resize to a square while maintain the aspect ratio (adds padding where necessary to achieve this).
- auto resized = Engine::resizeKeepAspectRatioPadRightBottom(img, inputDim.d[1], inputDim.d[2]);
- // You could also perform a resize operation without maintaining aspect ratio with the use of padding by using the following instead:
-// cv::cuda::resize(img, resized, cv::Size(inputDim.d[2], inputDim.d[1])); // TRT dims are (height, width) whereas OpenCV is (width, height)
+ // You can choose to resize by scaling, adding padding, or a combination
+ // of the two in order to maintain the aspect ratio. You can use the
+ // Engine::resizeKeepAspectRatioPadRightBottom to resize to a square while
+ // maintaining the aspect ratio (adds padding where necessary to achieve
+ // this).
+ auto resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(img, inputDim.d[1], inputDim.d[2]);
+ // You could also perform a resize operation without maintaining aspect
+ // ratio with the use of padding by using the following instead:
+ // cv::cuda::resize(img, resized, cv::Size(inputDim.d[2],
+ // inputDim.d[1])); // TRT dims are (height, width) whereas
+ // OpenCV is (width, height)
input.emplace_back(std::move(resized));
}
inputs.emplace_back(std::move(input));
@@ -108,7 +111,7 @@ int main(int argc, char *argv[]) {
std::cout << "\nWarming up the network..." << std::endl;
std::vector<std::vector<std::vector<float>>> featureVectors;
for (int i = 0; i < 100; ++i) {
- succ = engine.runInference(inputs, featureVectors);
+ bool succ = engine.runInference(inputs, featureVectors);
if (!succ) {
throw std::runtime_error("Unable to run inference.");
}
@@ -138,9 +141,10 @@ int main(int argc, char *argv[]) {
// Print the feature vectors
for (size_t batch = 0; batch < featureVectors.size(); ++batch) {
for (size_t outputNum = 0; outputNum < featureVectors[batch].size(); ++outputNum) {
- std::cout << "Batch " << batch << ", " << "output " << outputNum << std::endl;
+ std::cout << "Batch " << batch << ", "
+ << "output " << outputNum << std::endl;
int i = 0;
- for (const auto &e: featureVectors[batch][outputNum]) {
+ for (const auto &e : featureVectors[batch][outputNum]) {
std::cout << e << " ";
if (++i == 10) {
std::cout << "...";
@@ -151,7 +155,8 @@ int main(int argc, char *argv[]) {
}
}
- // TODO: If your model requires post processing (ex. convert feature vector into bounding boxes) then you would do so here.
+ // TODO: If your model requires post processing (ex. convert feature vector
+ // into bounding boxes) then you would do so here.
return 0;
}
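
The TODO above leaves post-processing to the reader. For orientation only (this sketch is not part of the patch), the following shows how the flattened output of a YOLOv8-style detector could be decoded into bounding boxes. It assumes an Engine<float>, a single output laid out channel-major as (4 + numClasses) x numAnchors, that Engine<float>::transformOutput has already reduced featureVectors to a single std::vector<float>, and that scale is the resize ratio used by resizeKeepAspectRatioPadRightBottom; the class count and thresholds are illustrative and must be adapted to your model.

// Illustrative post-processing sketch (not part of this patch): decode a
// YOLOv8-style output into bounding boxes. The layout, class count, and
// thresholds below are assumptions; adapt them to your own model.
#include <iostream>
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp> // cv::dnn::NMSBoxes

void decodeDetections(const std::vector<float> &featureVector, float scale, float confThreshold, float nmsThreshold) {
    constexpr int numClasses = 80;              // assumption: COCO-style detection head
    constexpr int numChannels = 4 + numClasses; // cx, cy, w, h followed by class scores
    const int numAnchors = static_cast<int>(featureVector.size()) / numChannels;

    std::vector<cv::Rect> boxes;
    std::vector<float> scores;
    std::vector<int> classIds;

    // The output is channel-major: the value for channel c of anchor a is at [c * numAnchors + a].
    for (int a = 0; a < numAnchors; ++a) {
        int bestClass = 0;
        float bestScore = 0.f;
        for (int c = 0; c < numClasses; ++c) {
            const float s = featureVector[(4 + c) * numAnchors + a];
            if (s > bestScore) {
                bestScore = s;
                bestClass = c;
            }
        }
        if (bestScore < confThreshold)
            continue;

        const float cx = featureVector[0 * numAnchors + a];
        const float cy = featureVector[1 * numAnchors + a];
        const float w = featureVector[2 * numAnchors + a];
        const float h = featureVector[3 * numAnchors + a];

        // Map the box back into the original image by undoing the resize ratio.
        boxes.emplace_back(static_cast<int>((cx - 0.5f * w) / scale), static_cast<int>((cy - 0.5f * h) / scale),
                           static_cast<int>(w / scale), static_cast<int>(h / scale));
        scores.push_back(bestScore);
        classIds.push_back(bestClass);
    }

    // Suppress overlapping candidates.
    std::vector<int> keep;
    cv::dnn::NMSBoxes(boxes, scores, confThreshold, nmsThreshold, keep);

    for (int idx : keep) {
        std::cout << "class " << classIds[idx] << ", score " << scores[idx] << ", box [" << boxes[idx].x << ", " << boxes[idx].y << ", "
                  << boxes[idx].width << ", " << boxes[idx].height << "]" << std::endl;
    }
}

With the single-batch, single-output configuration used in this sample, featureVectors can first be flattened with Engine<float>::transformOutput(featureVectors, featureVector), and scale would be min(inputWidth / (float)img.cols, inputHeight / (float)img.rows), mirroring the ratio computed inside resizeKeepAspectRatioPadRightBottom.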