YOLO Deployment: Model Export and Multi-Platform Deployment

Model Export (17 Format Support)

Ultralytics Unified Export API

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from ultralytics import YOLO

# Load the checkpoint once; every export below is produced from this model.
model = YOLO("yolo26n.pt")

# ========== Export Various Formats ==========
# Each entry is (format name, keyword arguments handed to model.export()).
# The exports run in the order listed.
EXPORT_TARGETS = [
    # 1. ONNX (Cross-platform Universal)
    ("onnx", {"simplify": True, "dynamic": True}),
    # 2. TensorRT (Best for NVIDIA GPU)
    ("engine", {"half": True, "workspace": 4}),
    # 3. OpenVINO (Best for Intel CPU)
    ("openvino", {"half": True}),
    # 4. CoreML (Apple Devices)
    ("coreml", {"int8": True}),
    # 5. TFLite (Android/iOS Mobile)
    ("tflite", {"int8": True}),
    # 6. NCNN (Mobile)
    ("ncnn", {}),
    # 7. PaddlePaddle
    ("paddle", {}),
]

for export_format, export_options in EXPORT_TARGETS:
    model.export(format=export_format, **export_options)

Version Export Compatibility

FormatYOLOv8YOLO11YOLO26
ONNXBest
TensorRTNo NMS, Simpler
OpenVINO
TFLite
NCNN

Python Deployment Practice

ONNX Runtime Deployment

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import onnxruntime as ort
import cv2
import numpy as np

# Load ONNX model
# Two execution providers are requested: CUDA first, CPU second.
# `session` is the module-level handle used by the inference call below.
session = ort.InferenceSession(
    "yolo26n.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

def preprocess(image, imgsz=640):
    """Turn a BGR image (as returned by cv2.imread) into a model input blob.

    Args:
        image: HxWx3 uint8 array in OpenCV's BGR channel order.
        imgsz: Side length of the square the image is resized to.

    Returns:
        float32 array of shape (1, 3, imgsz, imgsz), RGB order, scaled to [0, 1].

    Raises:
        ValueError: If ``image`` is None (e.g. cv2.imread could not read the file).
    """
    if image is None:
        raise ValueError("preprocess() received None instead of an image")
    img = cv2.resize(image, (imgsz, imgsz))
    # OpenCV decodes to BGR; convert to RGB so this path matches the C++
    # (swapRB=true) and NCNN (PIXEL_BGR2RGB) examples in this document.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # HWC -> CHW, then scale pixel values from [0, 255] to [0, 1].
    img = img.transpose(2, 0, 1) / 255.0
    # Add the batch axis and cast down from float64 to the float32 the model takes.
    return img[np.newaxis].astype(np.float32)

# Inference
image = cv2.imread("test.jpg")
# cv2.imread() returns None instead of raising when the file is missing or
# unreadable; fail here with a clear message rather than deep inside resize.
if image is None:
    raise FileNotFoundError("Could not read image: test.jpg")
input_data = preprocess(image)
# "images" is the input tensor name used throughout this document's examples.
outputs = session.run(None, {"images": input_data})

# YOLO26 Special Note: No NMS post-processing needed!
# Output is already the final detection results

TensorRT Python Deployment

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time

# ========== 1. Engine Loading & Context Creation ==========
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(TRT_LOGGER)

with open("yolo26n.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

# Deserialization reports failure (corrupt file, engine built by a different
# TensorRT version or for a different GPU) by returning None, not by raising.
if engine is None:
    raise RuntimeError("Failed to deserialize TensorRT engine: yolo26n.engine")

context = engine.create_execution_context()

# ========== 2. CUDA Memory Allocation ==========
stream = cuda.Stream()
bindings = []

# One host/device buffer pair per I/O tensor, stored in engine tensor order.
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    shape = engine.get_tensor_shape(name)
    # Dynamic dimensions are reported as -1, which would make the element
    # count negative; this static allocation scheme needs a fixed-shape engine.
    if any(dim < 0 for dim in shape):
        raise RuntimeError(
            f"Tensor '{name}' has dynamic shape {tuple(shape)}; "
            "build the engine with static shapes for this example"
        )
    dtype = trt.nptype(engine.get_tensor_dtype(name))
    size = trt.volume(shape)

    host_mem = cuda.pagelocked_empty(size, dtype)   # Host pinned memory
    device_mem = cuda.mem_alloc(host_mem.nbytes)    # Device VRAM
    bindings.append({"name": name, "host": host_mem, "device": device_mem,
                     "shape": shape, "size": size, "dtype": dtype})

# ========== 3. Async Inference Loop ==========
def async_infer(input_blob):
    """Run one inference on the shared CUDA stream and return the output.

    Uses the module-level ``bindings``, ``context`` and ``stream``. The first
    binding is treated as the input and the second as the output.

    Args:
        input_blob: Array whose flattened contents are copied into the first
            binding's host buffer.

    Returns:
        A copy of the second binding's host buffer after the stream has been
        synchronized.
    """
    src = bindings[0]

    # Stage the input in the host buffer, then enqueue the host-to-device copy.
    np.copyto(src["host"], input_blob.ravel())
    cuda.memcpy_htod_async(src["device"], src["host"], stream)

    # Register both device addresses with the context and enqueue execution.
    context.set_tensor_address(src["name"], int(src["device"]))
    dst = bindings[1]
    context.set_tensor_address(dst["name"], int(dst["device"]))
    context.execute_async_v3(stream.handle)

    # Enqueue the device-to-host copy and wait for everything queued so far.
    cuda.memcpy_dtoh_async(dst["host"], dst["device"], stream)
    stream.synchronize()

    # Copy so the caller's result is not overwritten by the next call.
    result = dst["host"].copy()
    return result

# ========== 4. Performance Benchmark ==========
def benchmark(warmup=10, runs=100):
    """Time ``async_infer`` on a random input and print latency statistics.

    Args:
        warmup: Number of untimed calls made before measuring.
        runs: Number of timed calls.

    Prints mean, P50 and P99 latency in milliseconds and the FPS derived
    from the mean.
    """
    sample = np.random.randn(1, 3, 640, 640).astype(np.float32)

    for _ in range(warmup):
        async_infer(sample)

    def timed_call():
        # Wall-clock duration of a single inference, in milliseconds.
        started = time.perf_counter()
        async_infer(sample)
        return (time.perf_counter() - started) * 1000

    latencies = sorted(timed_call() for _ in range(runs))
    mean_ms = np.mean(latencies)

    print(f"TensorRT FP16 | Mean: {mean_ms:.1f}ms | "
          f"P50: {latencies[runs//2]:.1f}ms | "
          f"P99: {latencies[int(runs*0.99)]:.1f}ms | "
          f"Throughput: {1000/mean_ms:.0f} FPS")


benchmark()

OpenVINO Deployment with Benchmarking

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import openvino as ov
import cv2
import numpy as np
import time

# ========== 1. ONNX → OpenVINO Conversion ==========
# Ultralytics unified export:
#   model.export(format="openvino", half=True)

# `core` and `model` are module-level and reused by the functions below.
core = ov.Core()
# Reads the OpenVINO IR description (.xml) from the export directory.
model = core.read_model("yolo26n_openvino/yolo26n.xml")

# ========== 2. CPU Inference ==========
# Compile once for the CPU device and keep one reusable infer request.
compiled_cpu = core.compile_model(model, device_name="CPU")
infer_request = compiled_cpu.create_infer_request()

def openvino_infer(image):
    """Run one synchronous inference on the module-level CPU infer request.

    Args:
        image: HxWx3 uint8 array in OpenCV's BGR channel order.

    Returns:
        The first output of the inference result.

    Raises:
        ValueError: If ``image`` is None (e.g. cv2.imread could not read the file).
    """
    if image is None:
        raise ValueError("openvino_infer() received None instead of an image")
    img = cv2.resize(image, (640, 640))
    # OpenCV decodes to BGR; convert to RGB so this path matches the C++
    # (swapRB=true) and NCNN (PIXEL_BGR2RGB) examples in this document.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # HWC -> NCHW float32 scaled to [0, 1].
    blob = img.transpose(2, 0, 1)[np.newaxis].astype(np.float32) / 255.0
    outputs = infer_request.infer({"images": blob})
    return outputs[next(iter(outputs))]

# ========== 3. Async Pipeline (Throughput Optimized) ==========
def async_pipeline(images, num_requests=4):
    """Multi-request async inference pipeline.

    The model is compiled once and ``num_requests`` infer requests are pooled
    in an ``ov.AsyncInferQueue``. The queue hands each image to an idle
    request and blocks in ``start_async`` while all requests are busy, so any
    number of images can be processed without restarting a running request.

    Args:
        images: Sequence of HxWx3 uint8 BGR images (must support ``len``).
        num_requests: Number of infer requests kept in flight.

    Returns:
        List with one output array per input image, in input order.
    """
    def to_blob(image):
        # Resize, BGR -> RGB, HWC -> NCHW, scale to [0, 1] as float32.
        rgb = cv2.cvtColor(cv2.resize(image, (640, 640)), cv2.COLOR_BGR2RGB)
        return rgb.transpose(2, 0, 1)[np.newaxis].astype(np.float32) / 255.0

    # Compile once; the queue creates the infer requests from this model.
    compiled = core.compile_model(model, "CPU")
    infer_queue = ov.AsyncInferQueue(compiled, num_requests)
    results = [None] * len(images)

    def completion_callback(request, userdata):
        # userdata is the image index passed to start_async(); copy the
        # tensor because the request's output buffer is reused by later jobs.
        results[userdata] = request.get_output_tensor().data.copy()

    infer_queue.set_callback(completion_callback)

    for i, img in enumerate(images):
        infer_queue.start_async({"images": to_blob(img)}, userdata=i)

    # Block until every queued job has finished and its callback has run.
    infer_queue.wait_all()

    return results

# ========== 4. CPU vs AUTO Device Benchmark Comparison ==========
def benchmark_openvino():
    """Print latency statistics for the model compiled on "CPU" and on "AUTO".

    For each device: 20 untimed warmup inferences, then 200 timed ones, all
    on the same random 1x3x640x640 float32 input.
    """
    sample = np.random.randn(1, 3, 640, 640).astype(np.float32)

    def timed_infer(request):
        # Wall-clock duration of one synchronous inference, in milliseconds.
        started = time.perf_counter()
        request.infer({"images": sample})
        return (time.perf_counter() - started) * 1000

    for device in ["CPU", "AUTO"]:
        compiled = core.compile_model(model, device)
        request = compiled.create_infer_request()

        # Warmup (avoid first-inference kernel compilation overhead)
        warmups_left = 20
        while warmups_left > 0:
            request.infer({"images": sample})
            warmups_left -= 1

        times = sorted(timed_infer(request) for _ in range(200))
        mean_ms = np.mean(times)

        print(f"OpenVINO {device}: "
              f"Mean {mean_ms:.1f}ms | "
              f"P99 {times[int(199*0.99)]:.1f}ms | "
              f"{1000/mean_ms:.0f} FPS")


benchmark_openvino()

NCNN Mobile Deployment

NCNN is Tencent’s open-source mobile inference framework supporting ARM NEON and Vulkan GPU acceleration.

Model Optimization (ncnnoptimize):

bash
1
2
3
4
5
# Argument order used below:
#   ncnnoptimize <in.param> <in.bin> <out.param> <out.bin> <flag>
# The trailing flag is 0 in the FP32 example and 1 in the FP16 example.

# FP32 optimization (best compatibility)
ncnnoptimize yolo26n.param yolo26n.bin yolo26n-opt.param yolo26n-opt.bin 0

# FP16 optimization (speed first, requires FP16 support)
ncnnoptimize yolo26n.param yolo26n.bin yolo26n-fp16.param yolo26n-fp16.bin 1

Python Binding Inference:

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import ncnn
import cv2
import numpy as np

def ncnn_infer(image_path):
    """Run the optimized NCNN model on one image file.

    Args:
        image_path: Path of the image to load with OpenCV.

    Returns:
        The "output0" blob converted to a NumPy array.

    Raises:
        FileNotFoundError: If the image cannot be read.
        RuntimeError: If the model files fail to load or extraction fails.
    """
    net = ncnn.Net()

    # Options must be set BEFORE load_param()/load_model(); options changed
    # after loading are not applied to the already-created layers.
    net.opt.use_vulkan_compute = True   # GPU acceleration
    net.opt.use_bf16_storage = True     # Storage optimization

    # Both loaders return 0 on success.
    if net.load_param("yolo26n-opt.param") != 0:
        raise RuntimeError("Failed to load yolo26n-opt.param")
    if net.load_model("yolo26n-opt.bin") != 0:
        raise RuntimeError("Failed to load yolo26n-opt.bin")

    img = cv2.imread(image_path)
    # cv2.imread() returns None instead of raising on a missing/unreadable file.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.resize(img, (640, 640))

    in_mat = ncnn.Mat.from_pixels(
        img, ncnn.Mat.PixelType.PIXEL_BGR2RGB, 640, 640
    )
    # Zero mean, 1/255 scale: maps pixel values from [0, 255] to [0, 1].
    # ("substract" is the spelling used by the ncnn API.)
    in_mat.substract_mean_normalize([0, 0, 0], [1/255.0, 1/255.0, 1/255.0])

    ex = net.create_extractor()
    # NOTE(review): blob names depend on how the .param file was produced;
    # confirm "images"/"output0" against the actual .param file.
    ex.input("images", in_mat)
    # The Python binding returns a (status, Mat) pair, not the Mat alone.
    ret, out = ex.extract("output0")
    if ret != 0:
        raise RuntimeError(f"NCNN extract('output0') failed with code {ret}")

    return np.array(out)

Android JNI Integration:

cpp
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// ncnn_jni.cpp — Android JNI calling NCNN
#include <jni.h>
#include <ncnn/net.h>

// Runs the NCNN model on one RGBA frame and returns the raw "output0" blob.
//
// image_data : RGBA pixels, expected to hold at least width * height * 4 bytes
// width/height : dimensions of the frame in image_data
// param_path/bin_path : file-system paths of the NCNN .param / .bin files
//
// Returns a float array with the output blob contents. An empty array is
// returned when the model cannot be loaded or the input buffer is too small;
// nullptr is returned only when a JNI call has already raised an exception.
extern "C" JNIEXPORT jfloatArray JNICALL
Java_com_example_yolo_NCNNHelper_detect(
    JNIEnv* env, jobject thiz,
    jbyteArray image_data, jint width, jint height,
    jstring param_path, jstring bin_path) {

    ncnn::Net net;
    // Options must be set before load_param()/load_model() to take effect.
    net.opt.use_vulkan_compute = true;   // GPU acceleration

    const char* param = env->GetStringUTFChars(param_path, nullptr);
    const char* bin  = env->GetStringUTFChars(bin_path, nullptr);
    if (param == nullptr || bin == nullptr) {
        // GetStringUTFChars failed and left an exception pending; release
        // whatever was obtained and let the exception propagate to Java.
        if (param != nullptr) env->ReleaseStringUTFChars(param_path, param);
        if (bin != nullptr) env->ReleaseStringUTFChars(bin_path, bin);
        return nullptr;
    }

    // Both loaders return 0 on success.
    const bool loaded = net.load_param(param) == 0 && net.load_model(bin) == 0;

    env->ReleaseStringUTFChars(param_path, param);
    env->ReleaseStringUTFChars(bin_path, bin);

    ncnn::Mat out;

    // Guard against reading past the end of the Java array.
    const jlong needed_bytes = static_cast<jlong>(width) * height * 4;
    const bool input_ok = width > 0 && height > 0 &&
                          env->GetArrayLength(image_data) >= needed_bytes;

    if (loaded && input_ok) {
        jbyte* data = env->GetByteArrayElements(image_data, nullptr);
        if (data == nullptr) {
            return nullptr;  // exception pending
        }

        // Convert RGBA -> RGB and resize to the 640x640 model input, matching
        // the Python NCNN example in this document.
        ncnn::Mat in = ncnn::Mat::from_pixels_resize(
            reinterpret_cast<const unsigned char*>(data),
            ncnn::Mat::PIXEL_RGBA2RGB, width, height, 640, 640);

        // The pixels have been copied into `in`; JNI_ABORT releases the buffer
        // without copying the (unmodified) bytes back to the Java array.
        env->ReleaseByteArrayElements(image_data, data, JNI_ABORT);

        // Scale pixel values from [0, 255] to [0, 1]; no mean subtraction.
        const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
        in.substract_mean_normalize(nullptr, norm_vals);

        ncnn::Extractor ex = net.create_extractor();
        ex.input("images", in);
        ex.extract("output0", out);
    }

    const jsize count = static_cast<jsize>(out.total());
    jfloatArray result = env->NewFloatArray(count);
    if (result != nullptr && count > 0) {
        env->SetFloatArrayRegion(result, 0, count,
                                 static_cast<const jfloat*>(out.data));
    }
    return result;
}

Performance Reference (YOLO26n, 640×640):

| Device | Backend | Latency | Power |
|---|---|---|---|
| Snapdragon 8 Gen 3 | NCNN Vulkan | ~8ms | 3.5W |
| Snapdragon 8 Gen 3 | NCNN CPU | ~18ms | 2.1W |
| Apple A17 Pro | CoreML/ANE | ~6ms | 2.8W |
| MediaTek Dimensity 9300 | NCNN GPU | ~10ms | 3.2W |
| Raspberry Pi 5 | NCNN CPU | ~120ms | 5W |

C++ Deployment Practice

OpenCV DNN Deployment (Simplest)

cpp
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#include <iostream>
#include <vector>

#include <opencv2/dnn.hpp>
#include <opencv2/opencv.hpp>

// Minimal OpenCV DNN example: load the ONNX model, run it on one image.
// Returns 0 on success and 1 when the model or the image cannot be loaded.
int main() {
    // Load ONNX model
    cv::dnn::Net net = cv::dnn::readNetFromONNX("yolo26n.onnx");
    if (net.empty()) {
        std::cerr << "Failed to load model: yolo26n.onnx" << std::endl;
        return 1;
    }
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);

    // Preprocessing
    cv::Mat img = cv::imread("test.jpg");
    // cv::imread returns an empty Mat when the file is missing or unreadable.
    if (img.empty()) {
        std::cerr << "Failed to read image: test.jpg" << std::endl;
        return 1;
    }
    // Scale to [0, 1], resize to 640x640, no mean subtraction,
    // swapRB=true (BGR -> RGB), crop=false.
    cv::Mat blob = cv::dnn::blobFromImage(img, 1/255.0, cv::Size(640, 640),
                                          cv::Scalar(0,0,0), true, false);

    // Inference
    net.setInput(blob);
    std::vector<cv::Mat> outputs;
    net.forward(outputs, net.getUnconnectedOutLayersNames());

    // YOLO26: No NMS needed! Directly parse output

    return 0;
}

Docker Multi-Stage Deployment

Dockerfile Build

dockerfile
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Stage 1: Dependency installation
# The Debian release is pinned so the apt package names below stay valid and
# both stages share the same Python build.
FROM python:3.11-slim-bookworm AS builder

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Stage 2: Runtime image (minimized)
FROM python:3.11-slim-bookworm

# libgl1 replaces libgl1-mesa-glx, which is no longer available on current
# Debian releases and makes `apt-get install` fail.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgomp1 libglib2.0-0 libgl1 && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app
# Only the installed packages are copied from the builder stage; console
# scripts in /usr/local/bin are not, hence `python -m ...` in CMD below.
COPY --from=builder /usr/local/lib/python3.11/site-packages \
     /usr/local/lib/python3.11/site-packages

# Bake in model file
COPY yolo26n.onnx /app/models/

# Inference service code
COPY serve.py /app/

HEALTHCHECK --interval=30s --timeout=3s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"

EXPOSE 8000
# serve.py only defines the FastAPI `app` object and never starts a server, so
# `python serve.py` would exit immediately. Launch it through an ASGI server
# instead (requires `uvicorn` in requirements.txt). Binding to 0.0.0.0 makes
# the published port reachable from outside the container.
CMD ["python", "-m", "uvicorn", "serve:app", "--host", "0.0.0.0", "--port", "8000"]

docker-compose Orchestration

yaml
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Compose definition for the single inference service built from the
# Dockerfile in this directory.
version: '3.8'

services:
  yolo-server:
    build: .
    ports:
      # host:container — matches the port exposed by the Dockerfile.
      - "8000:8000"
    deploy:
      resources:
        # Hard caps for the container.
        limits:
          cpus: '2'
          memory: 2G
        # Guaranteed minimum.
        reservations:
          cpus: '1'
          memory: 1G
    environment:
      # NOTE(review): the serve.py shown in this document hard-codes
      # "models/yolo26n.onnx" and reads neither variable — confirm intent.
      - MODEL_PATH=/app/models/yolo26n.onnx
      - NUM_WORKERS=2
    healthcheck:
      # Unlike the Dockerfile HEALTHCHECK, this variant also asserts HTTP 200
      # and allows 10s instead of 3s.
      test: ["CMD", "python", "-c",
             "import urllib.request; assert urllib.request.urlopen('http://localhost:8000/health').status == 200"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

Health Check & Inference Service

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
# serve.py — FastAPI inference service
from fastapi import FastAPI, File, UploadFile
import onnxruntime as ort
import numpy as np
import cv2

# ASGI application object; routes are registered on it by the decorators below.
app = FastAPI()

# Loaded once at import time and shared by every request.
# The path is relative, so it resolves against the process working directory.
session = ort.InferenceSession("models/yolo26n.onnx")

# Probe endpoint used by the Docker / Compose health checks.
# Returns a fixed payload and does not touch the model.
@app.get("/health")
async def health():
    return {"status": "ok", "model": "yolo26n"}

@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Run the model on one uploaded image and return the raw detections.

    Args:
        file: Uploaded image file (any format OpenCV can decode).

    Returns:
        ``{"detections": ...}`` where the value is the first model output
        converted to nested lists.

    Raises:
        HTTPException: 400 when the upload is empty or is not a decodable image.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    contents = await file.read()
    if not contents:
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # cv2.imdecode() returns None (it does not raise) for undecodable data.
    img = cv2.imdecode(np.frombuffer(contents, np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=400,
                            detail="Uploaded file is not a decodable image")

    # OpenCV decodes to BGR; convert to RGB so this path matches the C++
    # (swapRB=true) and NCNN (PIXEL_BGR2RGB) examples in this document.
    rgb = cv2.cvtColor(cv2.resize(img, (640, 640)), cv2.COLOR_BGR2RGB)
    # HWC -> NCHW float32 scaled to [0, 1].
    blob = rgb.transpose(2, 0, 1)
    blob = blob[np.newaxis].astype(np.float32) / 255.0
    outputs = session.run(None, {"images": blob})
    return {"detections": outputs[0].tolist()}

Resource Limit Reference

| Resource | Minimum | Recommended |
|---|---|---|
| CPU | 2 cores | 4+ cores |
| Memory | 2 GB | 4 GB |
| GPU (Optional) | NVIDIA T4 | A10G+ |

Edge Device Deployment

Rockchip RKNN (RK3588 / RK3566)

Rockchip NPU uses the RKNN format — requires converting ONNX models:

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
import cv2
from rknn.api import RKNN

rknn = RKNN()

try:
    # Configure target platform.
    # mean 0 / std 255 makes the RKNN runtime scale inputs to [0, 1] itself,
    # so the image fed to inference() below must stay raw 0-255 pixels.
    rknn.config(mean_values=[[0, 0, 0]], std_values=[[255, 255, 255]],
                target_platform="rk3588")  # RK3566 → "rk3566"

    # Load ONNX and build; each step returns 0 on success.
    if rknn.load_onnx("yolo26n.onnx") != 0:
        raise RuntimeError("rknn.load_onnx failed for yolo26n.onnx")
    # dataset.txt lists the calibration images used for quantization.
    if rknn.build(do_quantization=True, dataset="dataset.txt") != 0:
        raise RuntimeError("rknn.build failed")
    if rknn.export_rknn("yolo26n.rknn") != 0:
        raise RuntimeError("rknn.export_rknn failed for yolo26n.rknn")

    # Inference
    if rknn.init_runtime() != 0:
        raise RuntimeError("rknn.init_runtime failed")

    img = cv2.imread("test.jpg")
    # cv2.imread() returns None instead of raising on a missing/unreadable file.
    if img is None:
        raise FileNotFoundError("Could not read image: test.jpg")
    # Resize to the model input and convert BGR -> RGB. Do NOT divide by 255
    # here: normalization is already configured in rknn.config() above.
    img = cv2.cvtColor(cv2.resize(img, (640, 640)), cv2.COLOR_BGR2RGB)
    # Add the batch axis: (H, W, C) -> (1, H, W, C).
    # NOTE(review): older toolkit releases also accept the 3-D image — confirm
    # against the installed rknn-toolkit version.
    outputs = rknn.inference(inputs=[img[None, ...]])
finally:
    # Always free the toolkit resources, even when a step above failed.
    rknn.release()

Key Parameters:

  • do_quantization=True: INT8 quantization, 3-5x throughput improvement
  • target_platform: RK3588 / RK3566 / RK3399Pro
  • NPU inference latency (RK3588): typically <10ms

NVIDIA Jetson (JetPack + TensorRT)

bash
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# Install JetPack (includes TensorRT, CUDA, cuDNN)
sudo apt-get install nvidia-jetpack

# Build TensorRT engine (native Jetson)
# --memPoolSize=workspace:<MiB> replaces the deprecated --workspace flag.
# On older JetPack releases whose trtexec rejects it, use --workspace=2048.
trtexec --onnx=yolo26n.onnx \
        --fp16 \
        --saveEngine=yolo26n.engine \
        --memPoolSize=workspace:2048

# Performance test
# Precision is fixed when the engine is built, so no precision flag
# (such as --best) is passed when benchmarking a saved engine.
trtexec --loadEngine=yolo26n.engine

Jetson Model Performance Estimate (YOLO26n, FP16, 640×640):

| Device | Inference Latency | Tier |
|---|---|---|
| Jetson Orin NX 16GB | ~6ms | Edge Flagship |
| Jetson Orin Nano 8GB | ~12ms | Best Value |
| Jetson Xavier NX | ~15ms | Previous Gen Flagship |
| Jetson Nano | ~45ms | Entry Level |

Google Coral Edge TPU (TFLite)

bash
1
2
3
# Compile quantized model for Edge TPU
# Input: the full-integer-quantized TFLite file named below.
edgetpu_compiler yolo26n_full_integer_quant.tflite
# Output: yolo26n_edgetpu.tflite
python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from pycoral.adapters import common
from pycoral.utils.edgetpu import make_interpreter
import time
import numpy as np

interpreter = make_interpreter("yolo26n_edgetpu.tflite")
interpreter.allocate_tensors()

# Build a dummy frame that matches the model's own input tensor: the leading
# batch axis is dropped from the reported shape, and the dtype is taken from
# the interpreter so the buffer fits whatever quantization the model uses.
input_shape = common.input_details(interpreter, "shape")
input_dtype = common.input_details(interpreter, "dtype")
input_data = np.zeros(tuple(input_shape[1:]), dtype=input_dtype)

# Warmup (first inference includes compilation overhead)
for _ in range(5):
    common.set_input(interpreter, input_data)
    interpreter.invoke()

# Benchmark: each sample covers copying the input plus one invoke().
times = []
for _ in range(50):
    t0 = time.perf_counter()
    common.set_input(interpreter, input_data)
    interpreter.invoke()
    times.append((time.perf_counter() - t0) * 1000)

print(f"Edge TPU: Mean {np.mean(times):.1f}ms | "
      f"P99: {sorted(times)[int(49*0.99)]:.1f}ms")

Mobile Deployment

Android Deployment (TFLite)

java
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
// Android TFLite Inference
Interpreter interpreter = new Interpreter(loadModelFile());

// Input preprocessing
float[][][][] input = preprocess(bitmap);

// Inference
// Size the output buffer from the model itself instead of hard-coding a
// shape: Interpreter.run() rejects a buffer whose shape differs from the
// output tensor. (The declared type must also match the 3-D allocation.)
// NOTE(review): assumes a rank-3 output tensor — confirm for your export.
int[] outputShape = interpreter.getOutputTensor(0).shape();
float[][][] output = new float[outputShape[0]][outputShape[1]][outputShape[2]];
interpreter.run(input, output);

// YOLO26 advantage: No NMS, Java post-processing code greatly simplified

Web Deployment

ONNX Runtime Web

javascript
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
// JavaScript Web Deployment
import * as ort from 'onnxruntime-web';

/**
 * Loads the ONNX model, runs it on the current `imageElement`, and resolves
 * with the session outputs so the caller can parse the detections.
 *
 * Relies on `preprocessImage` and `imageElement` being defined by the page.
 */
async function runYOLO() {
    const session = await ort.InferenceSession.create('yolo26n.onnx');

    // Image preprocessing
    const input = preprocessImage(imageElement);

    // Inference ("images" is the model's input name)
    const outputs = await session.run({ images: input });

    // YOLO26: No NMS, frontend code cleaner
    // Hand the results back instead of discarding them.
    return outputs;
}

Benchmark Methodology

Standardized Benchmark Script

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# benchmark.py — Unified benchmark template
import time
import numpy as np
import onnxruntime as ort
import json
import platform

def benchmark_model(infer_fn, input_data, warmup=30, runs=200,
                    model_name="model", device_info=""):
    """
    Standardized latency benchmark for a single inference callable.

    Args:
        infer_fn: Inference function (receives input_data, returns output)
        input_data: Fixed input data (numpy array)
        warmup: Number of warmup iterations
        runs: Number of benchmark iterations (must be at least 1)
        model_name: Label stored under "model" in the result
        device_info: Label stored under "device" in the result

    Returns:
        dict of plain Python values (JSON-serializable): latency statistics
        in milliseconds, FPS derived from the mean, the run configuration and
        the platform description.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    # Validate before doing any work; with no samples every statistic below
    # would be undefined.
    if runs < 1:
        raise ValueError(f"runs must be >= 1, got {runs}")

    # Warmup (exclude first-inference kernel compilation, cache filling)
    for _ in range(warmup):
        infer_fn(input_data)

    # Benchmark
    latencies = []
    for _ in range(runs):
        t0 = time.perf_counter()
        infer_fn(input_data)
        latencies.append((time.perf_counter() - t0) * 1000)

    latencies.sort()

    def nearest_rank(percent):
        # Nearest-rank percentile: the smallest sample with at least
        # `percent`% of all samples at or below it. Integer arithmetic avoids
        # float rounding, and P99 no longer collapses to the maximum when
        # runs == 100 (index 98 instead of 99).
        rank = (percent * runs + 99) // 100  # ceil(percent * runs / 100)
        return float(latencies[rank - 1])

    mean_ms = float(np.mean(latencies))
    result = {
        "model": model_name,
        "device": device_info,
        "mean_ms": mean_ms,
        "median_ms": float(np.median(latencies)),
        "p50_ms": nearest_rank(50),
        "p90_ms": nearest_rank(90),
        "p95_ms": nearest_rank(95),
        "p99_ms": nearest_rank(99),
        "min_ms": float(latencies[0]),
        "max_ms": float(latencies[-1]),
        "std_ms": float(np.std(latencies)),
        # Plain float like every other field; guard the degenerate case of a
        # timer too coarse to measure the call.
        "fps": 1000.0 / mean_ms if mean_ms > 0 else float("inf"),
        "runs": runs,
        "warmup": warmup,
        "system": platform.platform(),
        "processor": platform.processor()
    }
    return result

# Usage example
def test_ort_cpu():
    """Benchmark yolo26n.onnx on the ONNX Runtime CPU provider.

    Returns:
        The result dictionary produced by ``benchmark_model``.
    """
    cpu_session = ort.InferenceSession("yolo26n.onnx",
                                       providers=["CPUExecutionProvider"])
    sample = np.random.randn(1, 3, 640, 640).astype(np.float32)

    def run_once(blob):
        # One forward pass; all outputs are requested (None).
        return cpu_session.run(None, {"images": blob})

    report = benchmark_model(
        run_once,
        sample,
        model_name="YOLO26n ONNX Runtime CPU",
        device_info="Intel i7-12700, 16GB DDR5"
    )
    return report


result = test_ort_cpu()
print(json.dumps(result, indent=2))

Report Format

json
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
{
  "model": "YOLO26n ONNX Runtime CPU",
  "device": "Intel i7-12700, 16GB DDR5",
  "mean_ms": 12.3,
  "p50_ms": 12.1,
  "p99_ms": 15.2,
  "min_ms": 11.8,
  "max_ms": 18.7,
  "std_ms": 0.8,
  "fps": 81.3,
  "warmup": 30,
  "runs": 200,
  "system": "Linux-6.8.0-x86_64",
  "processor": "x86_64"
}

Testing Considerations

| Consideration | Description |
|---|---|
| CPU Frequency Locking | Disable scaling: `cpupower frequency-set -g performance` |
| GPU Warmup | TensorRT first inference includes kernel compilation — warm up 10+ iterations |
| Batch Size Impact | Larger batches increase throughput but add latency — choose based on SLA |
| Memory Bandwidth | CPU inference is often memory-bandwidth bound — watch DDR frequency |
| Temperature Control | Edge device latency increases with temperature — monitor SoC temp |
| Reproducibility | Fix random seed, lock CPU affinity, disable hyperthreading |

Export Troubleshooting

Common Errors & Fixes

| Platform | Common Error | Solution |
|---|---|---|
| ONNX | Opset version mismatch | Specify `opset=15` or `opset=17` |
| ONNX | Dynamic shape unsupported | Set `dynamic=False`, `imgsz=640` |
| TensorRT | Plugin not found | Upgrade TensorRT or set `workspace=8` |
| TensorRT | Half precision not supported | Use `half=False` or upgrade GPU |
| OpenVINO | FP16 not supported on device | Device doesn’t support FP16, set `half=False` |
| TFLite | Quantization not supported | Use `int8=False` or provide calibration dataset |
| NCNN | Vulkan not initialized | Check GPU driver, compile with `-DNCNN_VULKAN=ON` |
| RKNN | Platform mismatch | Check `target_platform` parameter |
| CoreML | Min deployment target | Use mlmodel format instead of mlpackage |

Debug Tips

  • Verbose export logging:
    python
    1
    
    model.export(format="onnx", verbose=True)  # verbose=True: log the export in detail
  • ONNX integrity check:
    bash
    1
    
    # Validate the exported graph with the ONNX checker (raises on an invalid model)
    python -c "import onnx; onnx.checker.check_model('yolo26n.onnx'); print('yolo26n.onnx: OK')"
  • TensorRT verbose logging:
    python
    1
    
    trt.Logger(trt.Logger.VERBOSE)  # Use in place of trt.Logger(trt.Logger.WARNING)
  • Graph visualization:
    python
    1
    2
    
    import netron
    # Pass the path of the exported model file to inspect.
    netron.start("yolo26n.onnx")  # View model structure in browser

Version Deployment Differences Summary

| Deployment Aspect | YOLOv8 | YOLO11 | YOLO26 |
|---|---|---|---|
| NMS Post-processing | Required | Required | Not Required |
| DFL Parsing | Required | Required | Not Required |
| Deployment Code Size | Baseline | Baseline | Reduced by 50% |
| CPU Inference Speed | Baseline | +25% | +43% |
| Hardware Compatibility | Good | Good | Best |
| Edge Device Adaptation | Average | Better | Designed for Edge |