YOLO 部署落地:模型导出与多平台部署
## 模型导出(17 种格式支持)

### Ultralytics 统一导出 API

```python
from ultralytics import YOLO

model = YOLO("yolo26n.pt")

# ========== 各种格式导出 ==========
# 1. ONNX(跨平台通用)
model.export(format="onnx", simplify=True, dynamic=True)

# 2. TensorRT(NVIDIA GPU最佳)
model.export(format="engine", half=True, workspace=4)

# 3. OpenVINO(Intel CPU最佳)
model.export(format="openvino", half=True)

# 4. CoreML(Apple设备)
model.export(format="coreml", int8=True)

# 5. TFLite(Android/iOS移动端)
model.export(format="tflite", int8=True)

# 6. NCNN(移动端)
model.export(format="ncnn")

# 7. PaddlePaddle
model.export(format="paddle")
```

### 各版本导出兼容性

| 格式 | YOLOv8 | YOLO11 | YOLO26 |
| --- | --- | --- | --- |
| ONNX | ✅ | ✅ | ✅ 最佳 |
| TensorRT | ✅ | ✅ | ✅ 无 NMS 更简单 |
| OpenVINO | ✅ | ✅ | ✅ |
| TFLite | ✅ | ✅ | ✅ |
| NCNN | ✅ | ✅ | ✅ |

## Python 部署实战

### ONNX Runtime 部署

```python
import onnxruntime as ort
import cv2
import numpy as np

# 加载ONNX模型
session = ort.InferenceSession(
    "yolo26n.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

def preprocess(image, imgsz=640):
    """图片预处理"""
    img = cv2.resize(image, (imgsz, imgsz))
    img = img.transpose(2, 0, 1) / 255.0
    return img[np.newaxis].astype(np.float32)

# 推理
image = cv2.imread("test.jpg")
input_data = preprocess(image)
outputs = session.run(None, {"images": input_data})

# YOLO26特别注意:无需NMS后处理!
# 输出已是最终检测结果
```

### TensorRT Python 部署

```python
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time

# ========== 1. 引擎加载与上下文创建 ==========
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(TRT_LOGGER)

with open("yolo26n.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

context = engine.create_execution_context()

# ========== 2. CUDA 内存分配 ==========
stream = cuda.Stream()
bindings = []

for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    shape = engine.get_tensor_shape(name)
    dtype = trt.nptype(engine.get_tensor_dtype(name))
    size = trt.volume(shape)
    host_mem = cuda.pagelocked_empty(size, dtype)  # 主机锁页内存
    device_mem = cuda.mem_alloc(host_mem.nbytes)  # 设备显存
    bindings.append({"name": name, "host": host_mem, "device": device_mem,
                     "shape": shape, "size": size, "dtype": dtype})

# ========== 3. 异步推理循环 ==========
def async_infer(input_blob):
    # 输入拷贝:主机 → 设备
    np.copyto(bindings[0]["host"], input_blob.ravel())
    cuda.memcpy_htod_async(bindings[0]["device"], bindings[0]["host"], stream)

    # 设置张量地址并执行
    context.set_tensor_address(bindings[0]["name"], int(bindings[0]["device"]))
    context.set_tensor_address(bindings[1]["name"], int(bindings[1]["device"]))
    context.execute_async_v3(stream.handle)

    # 输出拷贝:设备 → 主机
    cuda.memcpy_dtoh_async(bindings[1]["host"], bindings[1]["device"], stream)
    stream.synchronize()

    return bindings[1]["host"].copy()

# ========== 4. 性能基准测试 ==========
def benchmark(warmup=10, runs=100):
    dummy = np.random.randn(1, 3, 640, 640).astype(np.float32)

    for _ in range(warmup):
        async_infer(dummy)

    latencies = []
    for _ in range(runs):
        t0 = time.perf_counter()
        async_infer(dummy)
        latencies.append((time.perf_counter() - t0) * 1000)

    latencies.sort()
    print(f"TensorRT FP16 | 平均: {np.mean(latencies):.1f}ms | "
          f"P50: {latencies[runs//2]:.1f}ms | "
          f"P99: {latencies[int(runs*0.99)]:.1f}ms | "
          f"吞吐: {1000/np.mean(latencies):.0f} FPS")

benchmark()
```

### OpenVINO 部署与性能基准

```python
import openvino as ov
import cv2
import numpy as np
import time

# ========== 1. ONNX → OpenVINO 转换 ==========
# Ultralytics 统一导出:
# model.export(format="openvino", half=True)

core = ov.Core()
model = core.read_model("yolo26n_openvino/yolo26n.xml")

# ========== 2. CPU 推理 ==========
compiled_cpu = core.compile_model(model, device_name="CPU")
infer_request = compiled_cpu.create_infer_request()

def openvino_infer(image):
    img = cv2.resize(image, (640, 640))
    blob = img.transpose(2, 0, 1)[np.newaxis].astype(np.float32) / 255.0
    outputs = infer_request.infer({"images": blob})
    return outputs[next(iter(outputs))]

# ========== 3. 异步推理管线(提升吞吐)==========
def async_pipeline(images, num_requests=4):
    """多请求异步推理管线"""
    requests = [core.compile_model(model, "CPU").create_infer_request()
                for _ in range(num_requests)]
    results = [None] * len(images)

    def completion_callback(request, userdata):
        idx = userdata
        results[idx] = request.get_output_tensor().data.copy()

    for req in requests:
        req.set_callback(completion_callback)

    for i, img in enumerate(images):
        req = requests[i % num_requests]
        req.start_async({"images": preprocess(img)}, userdata=i)

    for req in requests:
        req.wait()

    return results

# ========== 4. CPU vs NPU 基准对比 ==========
def benchmark_openvino():
    dummy = np.random.randn(1, 3, 640, 640).astype(np.float32)

    for device in ["CPU", "AUTO"]:
        compiled = core.compile_model(model, device)
        req = compiled.create_infer_request()

        # 预热(避免首次 kernel 编译开销)
        for _ in range(20):
            req.infer({"images": dummy})

        times = []
        for _ in range(200):
            t0 = time.perf_counter()
            req.infer({"images": dummy})
            times.append((time.perf_counter() - t0) * 1000)

        times.sort()
        print(f"OpenVINO {device}: "
              f"平均 {np.mean(times):.1f}ms | "
              f"P99 {times[int(199*0.99)]:.1f}ms | "
              f"{1000/np.mean(times):.0f} FPS")

benchmark_openvino()
```

## NCNN 移动端部署

NCNN 是腾讯开源的移动端推理框架,支持 ARM NEON 和 Vulkan GPU 加速。
继续阅读 →