YOLO 部署落地:模型导出与多平台部署

模型导出(17 种格式支持)

Ultralytics 统一导出 API

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from ultralytics import YOLO

model = YOLO("yolo26n.pt")

# ========== Export to each target format ==========
# Each entry is (format name, extra keyword arguments); the exports run in
# the listed order.
EXPORT_TARGETS = [
    ("onnx", {"simplify": True, "dynamic": True}),   # 1. ONNX (cross-platform)
    ("engine", {"half": True, "workspace": 4}),      # 2. TensorRT (NVIDIA GPU)
    ("openvino", {"half": True}),                    # 3. OpenVINO (Intel CPU)
    ("coreml", {"int8": True}),                      # 4. CoreML (Apple devices)
    ("tflite", {"int8": True}),                      # 5. TFLite (Android/iOS)
    ("ncnn", {}),                                    # 6. NCNN (mobile)
    ("paddle", {}),                                  # 7. PaddlePaddle
]

for export_format, export_options in EXPORT_TARGETS:
    model.export(format=export_format, **export_options)

各版本导出兼容性

格式YOLOv8YOLO11YOLO26
ONNX最佳
TensorRT无 NMS 更简单
OpenVINO
TFLite
NCNN

Python 部署实战

ONNX Runtime 部署

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import onnxruntime as ort
import cv2
import numpy as np

# Load the ONNX model; the CPU provider is listed as a fallback after CUDA.
session = ort.InferenceSession(
    "yolo26n.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

def preprocess(image, imgsz=640):
    """Turn an OpenCV image into a model input blob.

    Args:
        image: HxWx3 BGR image as returned by ``cv2.imread``.
        imgsz: Side length the image is resized to (plain resize, no
            letterboxing).

    Returns:
        float32 array of shape (1, 3, imgsz, imgsz), RGB, scaled to [0, 1].
    """
    img = cv2.resize(image, (imgsz, imgsz))
    # OpenCV decodes to BGR; convert to RGB like the C++ (swapRB) and NCNN
    # (PIXEL_BGR2RGB) pipelines do.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.transpose(2, 0, 1) / 255.0
    return img[np.newaxis].astype(np.float32)

# Inference
image = cv2.imread("test.jpg")
if image is None:
    # cv2.imread signals a missing/unreadable file by returning None.
    raise FileNotFoundError("test.jpg could not be read")
input_data = preprocess(image)
outputs = session.run(None, {"images": input_data})

# YOLO26 note: no NMS post-processing is needed;
# the outputs are already the final detections.

TensorRT Python 部署

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time

# ========== 1. Engine loading and execution-context creation ==========
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(TRT_LOGGER)

# Deserialize the prebuilt engine file from disk.
with open("yolo26n.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

context = engine.create_execution_context()

# ========== 2. CUDA memory allocation ==========
stream = cuda.Stream()
bindings = []

# One host/device buffer pair per I/O tensor, stored in engine tensor order.
# NOTE(review): buffer sizes come from the engine's reported shapes, so this
# presumably requires a static-shape engine — confirm for dynamic exports.
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    shape = engine.get_tensor_shape(name)
    dtype = trt.nptype(engine.get_tensor_dtype(name))
    size = trt.volume(shape)
    
    host_mem = cuda.pagelocked_empty(size, dtype)   # page-locked host memory
    device_mem = cuda.mem_alloc(host_mem.nbytes)    # device memory
    bindings.append({"name": name, "host": host_mem, "device": device_mem,
                     "shape": shape, "size": size, "dtype": dtype})

# ========== 3. 异步推理循环 ==========
def async_infer(input_blob):
    """Run one inference on the shared CUDA stream.

    Uses ``bindings[0]`` as the input buffer and ``bindings[1]`` as the
    output buffer, and returns a copy of the output host buffer.
    """
    source, sink = bindings[0], bindings[1]

    # Stage the flattened input in the host buffer, then copy host -> device.
    np.copyto(source["host"], input_blob.ravel())
    cuda.memcpy_htod_async(source["device"], source["host"], stream)

    # Register the device addresses and launch the engine on the stream.
    for binding in (source, sink):
        context.set_tensor_address(binding["name"], int(binding["device"]))
    context.execute_async_v3(stream.handle)

    # Copy device -> host and wait for all queued work to finish.
    cuda.memcpy_dtoh_async(sink["host"], sink["device"], stream)
    stream.synchronize()

    return sink["host"].copy()

# ========== 4. 性能基准测试 ==========
def benchmark(warmup=10, runs=100):
    """Time ``async_infer`` on a random 1x3x640x640 input and print a summary.

    Args:
        warmup: Untimed calls made before measuring.
        runs: Timed calls; each latency is recorded in milliseconds.
    """
    dummy = np.random.randn(1, 3, 640, 640).astype(np.float32)

    for _ in range(warmup):
        async_infer(dummy)

    latencies = []
    for _ in range(runs):
        started = time.perf_counter()
        async_infer(dummy)
        elapsed_ms = (time.perf_counter() - started) * 1000
        latencies.append(elapsed_ms)

    latencies.sort()
    mean_ms = np.mean(latencies)
    p50_ms = latencies[runs // 2]
    p99_ms = latencies[int(runs * 0.99)]
    print(f"TensorRT FP16 | 平均: {mean_ms:.1f}ms | "
          f"P50: {p50_ms:.1f}ms | "
          f"P99: {p99_ms:.1f}ms | "
          f"吞吐: {1000/mean_ms:.0f} FPS")

benchmark()

OpenVINO 部署与性能基准

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import openvino as ov
import cv2
import numpy as np
import time

# ========== 1. ONNX -> OpenVINO conversion ==========
# The IR files are produced with the unified Ultralytics exporter:
#   model.export(format="openvino", half=True)

core = ov.Core()
model = core.read_model("yolo26n_openvino/yolo26n.xml")

# ========== 2. CPU inference ==========
# Compile for the CPU device and create one reusable infer request.
compiled_cpu = core.compile_model(model, device_name="CPU")
infer_request = compiled_cpu.create_infer_request()

def openvino_infer(image):
    """Run the shared CPU infer request on one OpenCV image.

    Args:
        image: HxWx3 BGR image (OpenCV channel order).

    Returns:
        The first output of the infer request.
    """
    img = cv2.resize(image, (640, 640))
    # OpenCV images are BGR; convert to RGB like the C++ (swapRB) and NCNN
    # (PIXEL_BGR2RGB) pipelines do.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    blob = img.transpose(2, 0, 1)[np.newaxis].astype(np.float32) / 255.0
    outputs = infer_request.infer({"images": blob})
    return outputs[next(iter(outputs))]

# ========== 3. 异步推理管线(提升吞吐)==========
def async_pipeline(images, num_requests=4):
    """Run many images through a pool of asynchronous infer requests.

    Args:
        images: Sequence of HxWx3 BGR images (OpenCV channel order).
        num_requests: Number of infer requests kept in flight.

    Returns:
        A list holding one output array per input image, in input order.
    """
    results = [None] * len(images)
    if len(images) == 0:
        return results

    # Compile once; the queue owns ``num_requests`` requests on that single
    # compiled model instead of compiling the model once per request.
    compiled = core.compile_model(model, "CPU")
    infer_queue = ov.AsyncInferQueue(compiled, num_requests)

    def completion_callback(request, userdata):
        # ``userdata`` is the index of the image this request processed.
        results[userdata] = request.get_output_tensor().data.copy()

    infer_queue.set_callback(completion_callback)

    for i, img in enumerate(images):
        rgb = cv2.cvtColor(cv2.resize(img, (640, 640)), cv2.COLOR_BGR2RGB)
        blob = rgb.transpose(2, 0, 1)[np.newaxis].astype(np.float32) / 255.0
        # The queue hands the job to an idle request, waiting for one if
        # every request is still busy.
        infer_queue.start_async({"images": blob}, userdata=i)

    infer_queue.wait_all()

    return results

# ========== 4. CPU vs AUTO device benchmark ==========
def benchmark_openvino(devices=("CPU", "AUTO"), warmup=20, runs=200):
    """Benchmark synchronous inference on each device and print a summary.

    Args:
        devices: OpenVINO device names to compile the model for.
        warmup: Untimed inferences per device before measuring.
        runs: Timed inferences per device (must be at least 1).
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    dummy = np.random.randn(1, 3, 640, 640).astype(np.float32)

    for device in devices:
        compiled = core.compile_model(model, device)
        req = compiled.create_infer_request()

        # Warm-up, so first-run overhead is kept out of the timings.
        for _ in range(warmup):
            req.infer({"images": dummy})

        times = []
        for _ in range(runs):
            t0 = time.perf_counter()
            req.infer({"images": dummy})
            times.append((time.perf_counter() - t0) * 1000)

        times.sort()
        mean_ms = np.mean(times)
        # Index derived from the sample count; equals 197 for 200 runs.
        p99_ms = times[int((runs - 1) * 0.99)]
        print(f"OpenVINO {device}: "
              f"平均 {mean_ms:.1f}ms | "
              f"P99 {p99_ms:.1f}ms | "
              f"{1000/mean_ms:.0f} FPS")

benchmark_openvino()

NCNN 移动端部署

NCNN 是腾讯开源的移动端推理框架,支持 ARM NEON 和 Vulkan GPU 加速。

模型优化(ncnnoptimize):

bash
1
2
3
4
5
# FP32 optimization (best compatibility)
ncnnoptimize yolo26n.param yolo26n.bin yolo26n-opt.param yolo26n-opt.bin 0

# FP16 optimization (favors speed; requires FP16 support)
ncnnoptimize yolo26n.param yolo26n.bin yolo26n-fp16.param yolo26n-fp16.bin 1

Python 绑定推理:

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import ncnn
import cv2
import numpy as np

def ncnn_infer(image_path):
    """Run the optimized NCNN model on one image file.

    Args:
        image_path: Path of the image to read with OpenCV.

    Returns:
        The ``output0`` blob converted to a NumPy array.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    net = ncnn.Net()

    # Options have to be configured before the model is loaded so that the
    # load step already sees them.
    net.opt.use_vulkan_compute = True   # GPU acceleration
    net.opt.use_bf16_storage = True     # storage optimization

    net.load_param("yolo26n-opt.param")
    net.load_model("yolo26n-opt.bin")

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.resize(img, (640, 640))

    in_mat = ncnn.Mat.from_pixels(
        img, ncnn.Mat.PixelType.PIXEL_BGR2RGB, 640, 640
    )
    # Zero mean, 1/255 scale: maps pixel values into [0, 1].
    in_mat.substract_mean_normalize([0, 0, 0], [1/255.0, 1/255.0, 1/255.0])

    ex = net.create_extractor()
    ex.input("images", in_mat)
    out = ex.extract("output0")

    return np.array(out)

Android JNI 集成:

cpp
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// ncnn_jni.cpp — Android JNI bridge to NCNN
#include <jni.h>
#include <ncnn/net.h>

// Runs the NCNN model on an RGBA pixel buffer and returns the raw "output0"
// blob as a float array. Returns nullptr when the model cannot be loaded,
// the pixel buffer cannot be accessed, or inference fails.
// NOTE(review): the image is fed at its incoming width/height; confirm the
// caller already resized it to the model's input size.
extern "C" JNIEXPORT jfloatArray JNICALL
Java_com_example_yolo_NCNNHelper_detect(
    JNIEnv* env, jobject thiz,
    jbyteArray image_data, jint width, jint height,
    jstring param_path, jstring bin_path) {

    ncnn::Net net;
    // Options have to be configured before the model is loaded.
    net.opt.use_vulkan_compute = true;   // GPU acceleration

    const char* param = env->GetStringUTFChars(param_path, nullptr);
    if (param == nullptr) {
        return nullptr;
    }
    const char* bin = env->GetStringUTFChars(bin_path, nullptr);
    if (bin == nullptr) {
        env->ReleaseStringUTFChars(param_path, param);
        return nullptr;
    }

    const bool loaded = net.load_param(param) == 0 && net.load_model(bin) == 0;

    env->ReleaseStringUTFChars(param_path, param);
    env->ReleaseStringUTFChars(bin_path, bin);
    if (!loaded) {
        return nullptr;
    }

    jbyte* data = env->GetByteArrayElements(image_data, nullptr);
    if (data == nullptr) {
        return nullptr;
    }
    ncnn::Mat in = ncnn::Mat::from_pixels(
        reinterpret_cast<const unsigned char*>(data),
        ncnn::Mat::PIXEL_RGBA2RGB, width, height);
    // The pixels were only read, so skip the copy-back on release.
    env->ReleaseByteArrayElements(image_data, data, JNI_ABORT);

    // Scale pixels to [0, 1], matching the Python NCNN pipeline.
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in.substract_mean_normalize(nullptr, norm_vals);

    ncnn::Mat out;
    ncnn::Extractor ex = net.create_extractor();
    ex.input("images", in);
    if (ex.extract("output0", out) != 0) {
        return nullptr;
    }

    const jsize count = static_cast<jsize>(out.total());
    jfloatArray result = env->NewFloatArray(count);
    if (result == nullptr) {
        return nullptr;   // allocation failed; an exception is pending
    }
    env->SetFloatArrayRegion(result, 0, count,
                             static_cast<const jfloat*>(out.data));
    return result;
}

性能参考(YOLO26n, 640×640):

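| 设备 | 后端 | 延迟 | 功耗 |
| --- | --- | --- | --- |
| Snapdragon 8 Gen 3 | NCNN Vulkan | ~8ms | 3.5W |
| Snapdragon 8 Gen 3 | NCNN CPU | ~18ms | 2.1W |
| Apple A17 Pro | CoreML/ANE | ~6ms | 2.8W |
| MediaTek Dimensity 9300 | NCNN GPU | ~10ms | 3.2W |
| Raspberry Pi 5 | NCNN CPU | ~120ms | 5W |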

C++ 部署实战

OpenCV DNN 部署(最简单)

cpp
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#include <opencv2/dnn.hpp>
#include <opencv2/opencv.hpp>

#include <iostream>
#include <vector>

// Loads the ONNX model with OpenCV DNN and runs one CPU inference on
// test.jpg. Returns 0 on success and 1 when the image cannot be read.
int main() {
    // Load the ONNX model
    cv::dnn::Net net = cv::dnn::readNetFromONNX("yolo26n.onnx");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);

    // Preprocessing
    cv::Mat img = cv::imread("test.jpg");
    if (img.empty()) {
        // imread returns an empty Mat for a missing or unreadable file.
        std::cerr << "Failed to read test.jpg" << std::endl;
        return 1;
    }
    // Scale to [0, 1], resize to 640x640, swap B and R channels, no crop.
    cv::Mat blob = cv::dnn::blobFromImage(img, 1/255.0, cv::Size(640, 640),
                                          cv::Scalar(0,0,0), true, false);

    // Inference
    net.setInput(blob);
    std::vector<cv::Mat> outputs;
    net.forward(outputs, net.getUnconnectedOutLayersNames());

    // YOLO26: no NMS needed, the outputs can be parsed directly

    return 0;
}

Docker 多阶段部署

Dockerfile 构建

dockerfile
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Stage 1: install Python dependencies
FROM python:3.11-slim AS builder

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Stage 2: minimal runtime image
FROM python:3.11-slim

# libgl1 is used instead of libgl1-mesa-glx, a transitional package that is
# no longer available on Debian 12 and later.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgomp1 libglib2.0-0 libgl1 && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY --from=builder /usr/local/lib/python3.11/site-packages \
     /usr/local/lib/python3.11/site-packages

# Bundled model file
COPY yolo26n.onnx /app/models/

# Inference service code
COPY serve.py /app/

HEALTHCHECK --interval=30s --timeout=3s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"

EXPOSE 8000
# serve.py only defines the ASGI `app`, so it has to be started through an
# ASGI server. `python -m uvicorn` works although only site-packages (not the
# console scripts) are copied from the builder stage; uvicorn must be listed
# in requirements.txt.
CMD ["python", "-m", "uvicorn", "serve:app", "--host", "0.0.0.0", "--port", "8000"]

docker-compose 编排

yaml
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
version: "3.8"

services:
  yolo-server:
    build: .
    restart: unless-stopped
    ports:
      - "8000:8000"
    # Environment passed to the inference service.
    environment:
      MODEL_PATH: /app/models/yolo26n.onnx
      NUM_WORKERS: "2"
    # CPU / memory ceilings and guaranteed reservations.
    deploy:
      resources:
        limits:
          cpus: "2"
          memory: 2G
        reservations:
          cpus: "1"
          memory: 1G
    # Healthy only when GET /health answers with HTTP 200.
    healthcheck:
      test:
        - "CMD"
        - "python"
        - "-c"
        - "import urllib.request; assert urllib.request.urlopen('http://localhost:8000/health').status == 200"
      interval: 30s
      timeout: 10s
      retries: 3

健康检查与推理服务

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
# serve.py — FastAPI inference service
import os

from fastapi import FastAPI, File, HTTPException, UploadFile
import onnxruntime as ort
import numpy as np
import cv2

app = FastAPI()

# Honor the MODEL_PATH environment variable when it is set; otherwise fall
# back to the model path relative to the working directory.
session = ort.InferenceSession(
    os.environ.get("MODEL_PATH", "models/yolo26n.onnx")
)

@app.get("/health")
async def health():
    """Liveness probe used by the container health checks."""
    return {"status": "ok", "model": "yolo26n"}

@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Decode the uploaded image, run the model and return its first output.

    Responds with HTTP 400 when the upload is empty or is not a decodable
    image.
    """
    contents = await file.read()
    if not contents:
        raise HTTPException(status_code=400, detail="Empty upload")

    img = cv2.imdecode(np.frombuffer(contents, np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imdecode returns None for data it cannot decode.
        raise HTTPException(status_code=400,
                            detail="Upload is not a decodable image")

    # OpenCV decodes to BGR; convert to RGB before building the NCHW blob.
    rgb = cv2.cvtColor(cv2.resize(img, (640, 640)), cv2.COLOR_BGR2RGB)
    blob = rgb.transpose(2, 0, 1)
    blob = blob[np.newaxis].astype(np.float32) / 255.0
    outputs = session.run(None, {"images": blob})
    return {"detections": outputs[0].tolist()}

资源限制参考

| 资源 | 最低要求 | 推荐配置 |
| --- | --- | --- |
| CPU | 2 核 | 4 核以上 |
| 内存 | 2 GB | 4 GB |
| GPU(可选) | NVIDIA T4 | A10G 以上 |

边缘设备部署

Rockchip RKNN(RK3588 / RK3566)

Rockchip NPU 使用 RKNN 格式,需将 ONNX 模型转换:

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
import cv2
import numpy as np
from rknn.api import RKNN


def _require_ok(ret, step):
    """Raise RuntimeError when an RKNN call returns a non-zero status."""
    if ret != 0:
        raise RuntimeError(f"RKNN {step} failed with code {ret}")


rknn = RKNN()
try:
    # Configure the target platform. mean 0 / std 255 asks the toolkit to
    # scale pixels to [0, 1], so the image fed below must stay in 0-255.
    rknn.config(mean_values=[[0, 0, 0]], std_values=[[255, 255, 255]],
                target_platform="rk3588")  # use "rk3566" for RK3566

    # Load the ONNX model, build (INT8 quantization with the calibration
    # list in dataset.txt) and export the RKNN file.
    _require_ok(rknn.load_onnx("yolo26n.onnx"), "load_onnx")
    _require_ok(rknn.build(do_quantization=True, dataset="dataset.txt"),
                "build")
    _require_ok(rknn.export_rknn("yolo26n.rknn"), "export_rknn")

    # Inference
    _require_ok(rknn.init_runtime(), "init_runtime")

    img = cv2.imread("test.jpg")
    if img is None:
        raise FileNotFoundError("test.jpg could not be read")
    img = cv2.cvtColor(cv2.resize(img, (640, 640)), cv2.COLOR_BGR2RGB)
    # NOTE(review): passes a 1xHxWx3 uint8 batch, assuming the toolkit's
    # default NHWC input layout — confirm for the installed toolkit version.
    outputs = rknn.inference(inputs=[np.expand_dims(img, 0)])
finally:
    # Always free the toolkit resources, also when a step above failed.
    rknn.release()

关键参数:

  • do_quantization=True:INT8 量化,吞吐提升 3-5 倍
  • target_platform:RK3588 / RK3566 / RK3399Pro
  • NPU 推理延迟(RK3588):通常 <10ms

NVIDIA Jetson(JetPack + TensorRT)

bash
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# Install JetPack (bundles TensorRT, CUDA and cuDNN)
sudo apt-get install nvidia-jetpack

# Build the TensorRT engine natively on the Jetson
# NOTE(review): newer trtexec releases deprecate --workspace in favor of
# --memPoolSize — confirm against the installed TensorRT version.
trtexec --onnx=yolo26n.onnx \
        --fp16 \
        --saveEngine=yolo26n.engine \
        --workspace=2048

# Performance test
trtexec --loadEngine=yolo26n.engine --best

Jetson 各型号性能预估(YOLO26n, FP16, 640×640):

| 设备 | 推理延迟 | 定位 |
| --- | --- | --- |
| Jetson Orin NX 16GB | ~6ms | 边缘旗舰 |
| Jetson Orin Nano 8GB | ~12ms | 性价比之选 |
| Jetson Xavier NX | ~15ms | 上一代旗舰 |
| Jetson Nano | ~45ms | 入门级 |

Google Coral Edge TPU(TFLite)

bash
1
2
3
# Compile the quantized model for the Edge TPU
edgetpu_compiler yolo26n_full_integer_quant.tflite
# Output: yolo26n_edgetpu.tflite
# NOTE(review): the compiler presumably names its output after the input
# file (<input>_edgetpu.tflite) — confirm the file name, or rename it.
python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from pycoral.adapters import common
from pycoral.utils.edgetpu import make_interpreter
import time
import numpy as np

WARMUP_RUNS = 5
TIMED_RUNS = 50

interpreter = make_interpreter("yolo26n_edgetpu.tflite")
interpreter.allocate_tensors()

# Synthetic input with the shape (batch dimension dropped) and dtype the
# interpreter reports for its input tensor.
input_details = interpreter.get_input_details()[0]
input_data = np.zeros(input_details["shape"][1:], dtype=input_details["dtype"])

# Warm-up, so first-inference overhead is kept out of the timings
for _ in range(WARMUP_RUNS):
    common.set_input(interpreter, input_data)
    interpreter.invoke()

# Benchmark
times = []
for _ in range(TIMED_RUNS):
    t0 = time.perf_counter()
    common.set_input(interpreter, input_data)
    interpreter.invoke()
    times.append((time.perf_counter() - t0) * 1000)

# Index derived from the sample count; equals 48 for 50 runs.
p99_ms = sorted(times)[int((len(times) - 1) * 0.99)]
print(f"Edge TPU: 平均 {np.mean(times):.1f}ms | "
      f"P99: {p99_ms:.1f}ms")

移动端部署

Android 部署(TFLite)

java
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
// Android TFLite inference
Interpreter interpreter = new Interpreter(loadModelFile());

// Input preprocessing
float[][][][] input = preprocess(bitmap);

// Inference. The buffer is declared three-dimensional to match its
// allocation. Its dimensions must equal the model's output tensor shape;
// verify them with interpreter.getOutputTensor(0).shape().
float[][][] output = new float[1][8400][85];
interpreter.run(input, output);

// YOLO26 advantage: no NMS, so the Java post-processing is much simpler

Web 端部署

ONNX Runtime Web

javascript
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
// JavaScript web deployment
import * as ort from 'onnxruntime-web';

/**
 * Creates an inference session for yolo26n.onnx and runs it once on the
 * tensor produced by preprocessImage(imageElement).
 */
async function runYOLO() {
    const modelSession = await ort.InferenceSession.create('yolo26n.onnx');

    // Image preprocessing: the tensor is fed under the "images" input name.
    const feeds = { images: preprocessImage(imageElement) };

    // Inference
    const outputs = await modelSession.run(feeds);

    // YOLO26: no NMS, so the front-end code stays simpler
}

基准测试方法论

标准化基准脚本

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# benchmark.py — 统一基准测试模板
import time
import numpy as np
import onnxruntime as ort
import json
import platform

def benchmark_model(infer_fn, input_data, warmup=30, runs=200,
                    model_name="model", device_info=""):
    """Measure the latency of ``infer_fn`` in a standardized way.

    Args:
        infer_fn: Inference callable; called as ``infer_fn(input_data)``.
        input_data: Fixed input passed to every call.
        warmup: Number of untimed warm-up calls.
        runs: Number of timed calls (must be at least 1).
        model_name: Label stored in the report under ``"model"``.
        device_info: Label stored in the report under ``"device"``.

    Returns:
        A dict of plain Python values: latency statistics in milliseconds,
        throughput in FPS, the run configuration and platform information.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    # Warm-up: keeps first-call overhead out of the measurements.
    for _ in range(warmup):
        infer_fn(input_data)

    # Timed runs, each recorded in milliseconds.
    latencies = []
    for _ in range(runs):
        t0 = time.perf_counter()
        infer_fn(input_data)
        latencies.append((time.perf_counter() - t0) * 1000)

    latencies.sort()
    mean_ms = float(np.mean(latencies))

    def at_fraction(fraction):
        # Sorted latency at position int(runs * fraction).
        return float(latencies[int(runs * fraction)])

    return {
        "model": model_name,
        "device": device_info,
        "mean_ms": mean_ms,
        "median_ms": float(np.median(latencies)),
        "p50_ms": float(latencies[runs // 2]),
        "p90_ms": at_fraction(0.90),
        "p95_ms": at_fraction(0.95),
        "p99_ms": at_fraction(0.99),
        "min_ms": float(latencies[0]),
        "max_ms": float(latencies[-1]),
        "std_ms": float(np.std(latencies)),
        # Plain float like every other metric; guards a zero mean that a
        # coarse timer can produce for very fast callables.
        "fps": 1000.0 / mean_ms if mean_ms > 0 else float("inf"),
        "runs": runs,
        "warmup": warmup,
        "system": platform.platform(),
        "processor": platform.processor()
    }

# 使用示例
def test_ort_cpu():
    """Benchmark yolo26n.onnx on the ONNX Runtime CPU provider.

    Returns the report dict produced by ``benchmark_model``.
    """
    cpu_session = ort.InferenceSession(
        "yolo26n.onnx", providers=["CPUExecutionProvider"]
    )
    random_input = np.random.randn(1, 3, 640, 640).astype(np.float32)

    def run_once(blob):
        # Feed the blob under the "images" input name.
        return cpu_session.run(None, {"images": blob})

    return benchmark_model(
        run_once,
        random_input,
        model_name="YOLO26n ONNX Runtime CPU",
        device_info="Intel i7-12700, 16GB DDR5",
    )

result = test_ort_cpu()
print(json.dumps(result, indent=2))

报告格式

json
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
{
  "model": "YOLO26n ONNX Runtime CPU",
  "device": "Intel i7-12700, 16GB DDR5",
  "mean_ms": 12.3,
  "p50_ms": 12.1,
  "p99_ms": 15.2,
  "min_ms": 11.8,
  "max_ms": 18.7,
  "std_ms": 0.8,
  "fps": 81.3,
  "warmup": 30,
  "runs": 200,
  "system": "Linux-6.8.0-x86_64",
  "processor": "x86_64"
}

测试注意事项

| 注意事项 | 说明 |
| --- | --- |
| CPU 频率锁定 | 关闭频率缩放:cpupower frequency-set -g performance |
| GPU 预热 | TensorRT 首次推理含 kernel 编译,务必预热 10+ 轮 |
| 批处理影响 | 大 batch 提升吞吐但增加延迟,根据 SLA 选择最优 size |
| 内存带宽 | CPU 推理常受内存带宽瓶颈,留意 DDR 频率 |
| 温度控制 | 边缘设备延迟随温度升高而增加,需监控 SoC 温度 |
| 结果可复现 | 固定随机种子、锁定 CPU 亲和性、关闭超线程 |

导出故障排除

常见错误与修复

| 平台 | 常见错误 | 解决方案 |
| --- | --- | --- |
| ONNX | Opset version mismatch | 指定 opset=15 或 opset=17 |
| ONNX | Dynamic shape unsupported | 设置 dynamic=False, imgsz=640 |
| TensorRT | Plugin not found | 升级 TensorRT 或设置 workspace=8 |
| TensorRT | Half precision not supported | 使用 half=False 或更换 GPU |
| OpenVINO | FP16 not supported on device | 该设备不支持 FP16,设置 half=False |
| TFLite | Quantization not supported | int8=False 或提供校准集 |
| NCNN | Vulkan not initialized | 检查 GPU 驱动,编译时加 -DNCNN_VULKAN=ON |
| RKNN | Platform mismatch | 检查 target_platform 参数 |
| CoreML | Min deployment target | 使用 mlmodel 格式而非 mlpackage |

Debug 技巧

  • 详细日志导出
    python
    1
    
    model.export(format="onnx", verbose=True)
  • ONNX 完整性验证
    bash
    1
    
    python -c "import onnx; onnx.checker.check_model('yolo26n.onnx')"
  • TensorRT 详细日志
    python
    1
    
    trt.Logger(trt.Logger.VERBOSE)  # 替换 WARNING
  • 计算图可视化
    python
    1
    2
    
    import netron
    netron.start("yolo26n.onnx")  # 在浏览器中查看模型结构

各版本部署差异总结

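| 部署维度 | YOLOv8 | YOLO11 | YOLO26 |
| --- | --- | --- | --- |
| NMS 后处理 | 需要 | 需要 | 不需要 |
| DFL 解析 | 需要 | 需要 | 不需要 |
| 部署代码量 | 基准 | 基准 | 减少 50% |
| CPU 推理速度 | 基准 | +25% | +43% |
| 硬件兼容性 | 良好 | 良好 | 最佳 |
| 边缘设备适配 | 一般 | 较好 | 专为边缘设计 |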