How to save time when converting ONNX to TensorRT

Description

This is a very basic question. When I run the attached source code, the ONNX model is converted to a TensorRT engine every time, and this conversion takes a long time. How can I eliminate the time spent converting to TensorRT? I'm implementing this while referring to the sample source code below, but I don't understand how to avoid rebuilding the engine on every run.

For example, can I use the model.trt generated by running trtexec instead of ONNX?
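If that is the right approach, I imagine loading the pre-built engine would look roughly like the sketch below (just my guess from the API reference, untested; model.trt is assumed to have been produced with trtexec --onnx=model.onnx --saveEngine=model.trt on this same device):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def load_engine(engine_file_path):
    # Read the serialized engine written by trtexec (or by a previous run)
    # and deserialize it directly, skipping the ONNX parse/build step.
    with open(engine_file_path, "rb") as f:
        engine_bytes = f.read()
    runtime = trt.Runtime(TRT_LOGGER)
    return runtime.deserialize_cuda_engine(engine_bytes)

I assume the serialized engine is tied to the GPU and TensorRT version it was built with, so it would have to be regenerated after a JetPack upgrade.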

Environment

TensorRT Version: 10.3.0.30
GPU Type: NVIDIA Jetson Orin NX 8GB (VIA AMOS-9100)
Nvidia Driver Version: JetPack 6.1?
CUDA Version: 12.6.68
CUDNN Version: 9.3.0.75
Operating System + Version: JetPack 6.1 [L4T 36.4.0]
Python Version (if applicable): 3.10.12
TensorFlow Version (if applicable): None
PyTorch Version (if applicable): None
Baremetal or Container (if container which image + tag): Baremetal

Relevant Files

import tensorrt as trt
import cv2
import numpy as np
import common

ENGINE_FILE_PATH = "/home/via/sandbox/python/segmentation/model.trt"  # not used yet; this is where I would like to load a pre-built engine from
ONNX_FILE_PATH = "/home/via/sandbox/python/segmentation/model.onnx"
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class ModelData(object):
    MODEL_PATH = ONNX_FILE_PATH
    INPUT_SHAPE = (3, 288, 288)
    # We can convert TensorRT data types to numpy types with trt.nptype()
    DTYPE = trt.float32

# The Onnx path is used for Onnx models.
def build_engine_onnx(model_file):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(0)
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network, TRT_LOGGER)

    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, common.GiB(1))
    # Load the Onnx model and parse it in order to populate the TensorRT network.
    with open(model_file, "rb") as model:
        if not parser.parse(model.read()):
            print("ERROR: Failed to parse the ONNX file.")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

    engine_bytes = builder.build_serialized_network(network, config)
    runtime = trt.Runtime(TRT_LOGGER)
    return runtime.deserialize_cuda_engine(engine_bytes)

def get_input_image_tensor():
    # PreProcess
    bgr_image = cv2.imread("./dog.jpg")
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    height, width, channel = rgb_image.shape

    size = min(height, width)
    top = int((height - size) / 2)
    left = int((width - size) / 2)
    bottom = top + size
    right = left + size
    crop_img = rgb_image[top:bottom, left:right]

    rgb_ds = cv2.resize(crop_img,(288, 288))
    rgb_nchw = np.transpose(rgb_ds, (2, 0, 1))
    rgb_nchw = (rgb_nchw / 128.0) - 1.0  # scale uint8 pixels to roughly [-1, 1)
    rgb_batch = rgb_nchw[np.newaxis,:]

    return rgb_batch

def main():
    onnx_model_file = ONNX_FILE_PATH
    # The engine is rebuilt from the ONNX file on every run: this is the slow step I want to avoid.
    engine = build_engine_onnx(onnx_model_file)
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    context = engine.create_execution_context()
    input_tensor = get_input_image_tensor()
    inputs[0].host = np.array(input_tensor, dtype='<f4')
    trt_outputs = common.do_inference(
        context,
        engine=engine,
        bindings=bindings,
        inputs=inputs,
        outputs=outputs,
        stream=stream,
    )
    print(trt_outputs)
    quit()

if __name__ == "__main__":
    main()
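Since build_serialized_network() already returns the bytes that would need to be cached, my idea is to replace build_engine_onnx() with a build-or-load helper along these lines (untested sketch; ENGINE_FILE_PATH is the same path defined above):

import os

def get_engine(onnx_file_path, engine_file_path):
    runtime = trt.Runtime(TRT_LOGGER)
    if os.path.exists(engine_file_path):
        # Reuse the engine serialized on a previous run: no ONNX parsing, no build step.
        with open(engine_file_path, "rb") as f:
            return runtime.deserialize_cuda_engine(f.read())

    # First run: build from ONNX as before, then write the serialized engine to disk.
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(0)
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network, TRT_LOGGER)
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, common.GiB(1))
    with open(onnx_file_path, "rb") as model:
        if not parser.parse(model.read()):
            print("ERROR: Failed to parse the ONNX file.")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    engine_bytes = builder.build_serialized_network(network, config)
    with open(engine_file_path, "wb") as f:
        f.write(engine_bytes)
    return runtime.deserialize_cuda_engine(engine_bytes)

Is this the intended way to avoid rebuilding the engine on every run, or is there a recommended approach?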

Steps To Reproduce

The problem occurs when you unpack the above tar.gz file, go to “sandbox/python/segmentation”, and run “python trt_resnet.py”. common.py and common_runtime.py were copied from the sample source code below.