YOLO Quick Start: Model Loading and Inference

May 8, 2026 AI Tools YOLO, Computer Vision, Python, Inference AI Engineering Series 2121 words 10 min read

🔊

Model Loading and Inference Across Versions

Ultralytics Unified API (Works with v8/11/26)

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
from ultralytics import YOLO

# Catalogue of checkpoint names per model family and size.
# NOTE: every line inside a family rebinds the same variable, so running the
# listing as-is constructs each model in turn and keeps only the last one.
# In a real script, keep just the single line for the size you need.

# ========== YOLOv8 ==========
model_v8 = YOLO("yolov8n.pt")      # nano
model_v8 = YOLO("yolov8s.pt")      # small
model_v8 = YOLO("yolov8m.pt")      # medium
model_v8 = YOLO("yolov8l.pt")      # large
model_v8 = YOLO("yolov8x.pt")      # extra large

# ========== YOLO11 ==========
model_11 = YOLO("yolo11n.pt")      # nano
model_11 = YOLO("yolo11s.pt")      # small
model_11 = YOLO("yolo11m.pt")      # medium
model_11 = YOLO("yolo11l.pt")      # large
model_11 = YOLO("yolo11x.pt")      # extra large

# ========== YOLO26 (2026 latest) ==========
model_26 = YOLO("yolo26n.pt")      # nano, recommended for edge deployment
model_26 = YOLO("yolo26s.pt")      # small
model_26 = YOLO("yolo26m.pt")      # medium
model_26 = YOLO("yolo26l.pt")      # large
model_26 = YOLO("yolo26x.pt")      # extra large

Image Detection Hands-on Example

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
from ultralytics import YOLO

# Build the detector from the YOLO26 nano checkpoint
model = YOLO("yolo26n.pt")

# Run one image through the model with explicit confidence / IoU thresholds
results = model("test.jpg", conf=0.25, iou=0.45)

# One entry per input image
for result in results:
    boxes = result.boxes          # detection boxes
    masks = result.masks          # segmentation masks
    probs = result.probs          # classification probabilities

    # Report class name, score and corner coordinates for each detection
    for det in boxes:
        label = result.names[int(det.cls)]
        score = det.conf.item()
        corners = det.xyxy.tolist()[0]
        print(f"Class: {label}, "
              f"Confidence: {score:.3f}, "
              f"Coordinates: {corners}")

    # Write the annotated image to disk
    result.save("result.jpg")

Video Detection Hands-on Example

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
from ultralytics import YOLO

model = YOLO("yolo26n.pt")

# Options for running detection over a video file
video_options = {
    "source": "input.mp4",
    "save": True,      # write the annotated video to disk
    "conf": 0.3,
    "show": False,     # no live preview window
    "stream": True,    # yield results one at a time to save memory
}
results = model.predict(**video_options)

# Consume the stream one frame at a time
for result in results:
    # Custom post-processing logic goes here
    pass

Real-time Camera Detection

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from ultralytics import YOLO
import cv2

model = YOLO("yolo26n.pt")

# Open camera
cap = cv2.VideoCapture(0)  # 0 is default camera

# The capture loop runs inside try/finally so the camera handle and the
# preview window are always released, even when inference or display raises
# or the user interrupts the script with Ctrl+C.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No frame delivered (camera unplugged or stream ended)
            break

        # Inference
        results = model(frame, verbose=False)

        # Draw results
        annotated_frame = results[0].plot()

        # Display; press 'q' in the preview window to quit
        cv2.imshow("YOLO Real-time", annotated_frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

Version-specific Code Differences

Feature	YOLOv8	YOLO11	YOLO26	YOLOv9	YOLOv10
Unified API	✅	✅	✅	❌ Separate repo	❌ Separate repo
No NMS	❌	❌	✅	❌	✅
DFL Module	✅	✅	❌ Removed	✅	✅
MuSGD Optimizer	❌	❌	✅	❌	❌
Export Compatibility	Good	Good	Best	Fair	Fair

Results Object API Deep Dive

The model() or model.predict() call returns a list of Results objects. Each Results object encapsulates all inference outputs for a single image. Understanding its internal structure is essential for downstream processing.

Core Attributes

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from ultralytics import YOLO

# Tour of the attributes exposed by a single Results object.
model = YOLO("yolo26n.pt")
# NOTE: despite the plural name, `results` below is ONE Results object
# (element 0 of the returned list), i.e. the output for the first image.
results = model("test.jpg")[0]  # Get results for the first image

# === Detection Boxes ===
boxes = results.boxes
# Boxes object properties (N = number of detections):
print(boxes.xyxy)    # Tensor[N,4] — top-left / bottom-right corners (x1,y1,x2,y2)
print(boxes.xywh)    # Tensor[N,4] — center plus width/height (cx,cy,w,h)
print(boxes.xyxyn)   # Tensor[N,4] — xyxy normalized to 0~1
print(boxes.xywhn)   # Tensor[N,4] — xywh normalized to 0~1
print(boxes.conf)    # Tensor[N]   — confidence for each box
print(boxes.cls)     # Tensor[N]   — class index for each box
print(boxes.id)      # Tensor[N]   — tracking ID (tracking only; presumably None for plain prediction — confirm)

# === Segmentation Masks ===
# Guarded with `is not None` because this attribute is not populated by
# every model type.
masks = results.masks
if masks is not None:
    print(masks.data)     # Tensor[N,H,W] — raw mask tensor
    print(masks.xy)       # list[N] of numpy arrays — polygon contours

# === Classification Probabilities ===
# Only available for classification models (not detection)
probs = results.probs
if probs is not None:
    print(probs.top1)        # Highest probability class index
    print(probs.top1conf)    # Highest probability confidence
    print(probs.top5)        # Top-5 class indices (list)

# === Keypoints ===
# Pose estimation models (e.g., YOLO11-pose)
# NOTE(review): the "17" in the shapes below assumes a 17-keypoint skeleton;
# the count presumably follows the model's own keypoint layout — confirm.
keypoints = results.keypoints
if keypoints is not None:
    print(keypoints.data)    # Tensor[N,17,3] — (x, y, confidence)
    print(keypoints.xy)      # Tensor[N,17,2] — coordinates only
    print(keypoints.conf)    # Tensor[N,17]   — keypoint confidence

# === Original Image ===
print(results.orig_img.shape)   # (H, W, C) — original image as numpy array
print(results.orig_shape)       # (H, W) — original dimensions

# === Class Name Mapping ===
print(results.names)            # class index -> name, e.g. {0: 'person', 1: 'bicycle', ...}

# === Inference Speed ===
print(results.speed)            # per-stage timings, e.g. {'preprocess': 1.2, 'inference': 8.5, 'postprocess': 0.8}

Filtering Boxes by Condition

python
1
2
3
4
5
6
7
# Collect detections that are class 0 (person) with confidence above 0.5
results = model("test.jpg")[0]
person_boxes = []
for det in results.boxes:
    # Guard clauses: drop low-confidence boxes first, then non-person classes
    if not det.conf.item() > 0.5:
        continue
    if int(det.cls) != 0:
        continue
    person_boxes.append(det)
print(f"Detected {len(person_boxes)} people")

Post-processing Parameters Explained

The core parameters of model() and model.predict() control the quality and quantity of detection results.

Confidence Threshold (conf)

python
1
2
# conf is the minimum confidence a box needs to be kept:
# with conf=0.25, boxes scoring below 0.25 are discarded.
results = model("test.jpg", conf=0.25)

Default: 0.25
Increase (e.g., 0.5): Reduce false positives, suitable for high-precision scenarios (industrial inspection)
Decrease (e.g., 0.1): Increase recall, suitable for scenarios where missing detections is critical (security surveillance)

The confidence value represents the model’s “certainty” for each detection box, normalized to 0~1 via Sigmoid from the classification branch output.

IoU Threshold and NMS

python
1
2
# iou is the overlap threshold used by NMS: a box whose IoU with an already
# kept, higher-confidence box exceeds 0.45 is treated as a duplicate.
results = model("test.jpg", iou=0.45)

NMS (Non-Maximum Suppression) is the core post-processing technique in object detection:

1. Sort all detection boxes by confidence
2. Select the highest-confidence box
3. Remove other boxes whose IoU with it exceeds the threshold (considered duplicate detections)
4. Repeat steps 2-3 on the remaining boxes until none are left

Higher IoU (e.g., 0.7): Keep more overlapping boxes, suitable for dense scenes
Lower IoU (e.g., 0.3): Remove more overlapping boxes, reduce redundancy

YOLO26 Special Note: YOLO26 is designed as an end-to-end detector whose prediction head emits final boxes directly, so it does not require NMS during inference. With the default end-to-end head, YOLO26 skips the NMS step regardless of the iou parameter setting — this is a key architectural difference and a source of its speed advantage.

Batch Inference

When processing many images, batch inference leverages GPU parallel compute capacity efficiently.

Multiple Images Simultaneously

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
from ultralytics import YOLO

model = YOLO("yolo26n.pt")

# Hand the model several image paths in one call
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
results = model(image_paths, batch=4)

# Report the detection count per image and save each annotated copy
for i, result in enumerate(results):
    detected = len(result.boxes)
    print(f"Image {i+1}: {detected} objects detected")
    result.save(f"result_{i}.jpg")

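The batch parameter sets the inference batch size. For prediction the documented default is 1 (batch=-1 auto-sizing is a training option), and it takes effect when the source is a directory, video file, or .txt list of paths; images passed as a Python list are run together as a single batch, so the practical limit is GPU memory.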

stream=True Memory Optimization

When processing very large numbers of images (hundreds or thousands), stream=True is critical to prevent memory overflow:

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Side-by-side contrast of the two result-delivery modes.

# Normal mode — waits for ALL images to finish processing
results = model(["img1.jpg", "img2.jpg", "img3.jpg"])
# All results are in memory at once (results is a list)

# Stream mode — yields each result as it's ready
results = model(["img1.jpg", "img2.jpg", "img3.jpg"], stream=True)
# results is a generator, producing results on demand; it can be iterated
# only once, and no result exists until the loop asks for it
for result in results:
    # Memory for the current result can be reclaimed once the loop moves on
    # NOTE: process() is not defined in this snippet — presumably a
    # placeholder for your own per-result handler.
    process(result)

How stream=True works:

model.predict(stream=True) returns a generator instead of a list
The generator yields one Results object at a time
Once a result is processed, Python can reclaim its memory
Ideal for video frame sequences, large datasets, and streaming scenarios

Visualization Customization

The result.plot() method provides rich parameters for controlling visualization output.

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
from ultralytics import YOLO
import cv2

model = YOLO("yolo26n.pt")
results = model("test.jpg")

for result in results:
    # Custom visualization: plot() returns the annotated image as an array
    # NOTE(review): the upstream API reference appears to document
    # line_width and font_size as defaulting to None (auto-scaled from the
    # image size) rather than fixed 3 / 1.0 — confirm for your version.
    annotated = result.plot(
        line_width=2,        # Box border width in pixels
        font_size=1.0,       # Label font size
        conf=True,           # Show confidence on boxes (default True)
        labels=True,         # Show class labels (default True)
        boxes=True,          # Draw bounding boxes (default True)
        masks=True,          # Draw segmentation masks (default True)
        probs=True,          # Show classification probabilities (default True)
    )
    
    # Save custom visualization
    # NOTE: the filename is fixed, so with more than one input image each
    # iteration overwrites the previous file.
    cv2.imwrite("custom_result.jpg", annotated)

Parameter Reference

Parameter	Type	Default	Description
`line_width`	int | None	None (auto-scaled to image size)	Box border width, increase for high-resolution images
`font_size`	float | None	None (auto-scaled to image size)	Label font size
`conf`	bool	True	Display confidence score on boxes
`labels`	bool	True	Display class names
`boxes`	bool	True	Draw bounding box rectangles
`masks`	bool	True	Draw segmentation masks (segmentation models only)
`probs`	bool	True	Display probabilities (classification models only)

Hide All Annotations

python
1
2
3
4
5
# Switch every overlay off in one go
overlays_off = {
    "conf": False,
    "labels": False,
    "boxes": False,
    "masks": False,
    "probs": False,
}
result_clean = result.plot(**overlays_off)
# With nothing drawn, the output is equivalent to the raw image

Saving Results in Different Formats

Saving Cropped Detections

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
from ultralytics import YOLO
import cv2
import os

model = YOLO("yolo26n.pt")
results = model("test.jpg")

# Create the output directory once, before any image is processed
os.makedirs("crops", exist_ok=True)

for i, result in enumerate(results):
    orig_img = result.orig_img  # numpy array (H, W, 3)

    for j, box in enumerate(result.boxes):
        # Get box coordinates (integers, so they can be used as slice bounds)
        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
        # Crop object region (rows are y, columns are x)
        crop = orig_img[y1:y2, x1:x2]
        if crop.size == 0:
            # Truncation to int can collapse a very thin box to zero
            # width/height; cv2.imwrite raises on an empty image, so skip it.
            continue
        cls_name = result.names[int(box.cls[0])]
        # Include the image index i so crops from different input images
        # with the same class and box index do not overwrite each other.
        cv2.imwrite(f"crops/img{i}_{cls_name}_{j}.jpg", crop)

Exporting JSON / CSV

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import json
import csv

results = model("test.jpg")

# === Collect one record per detection ===
data = []
for result in results:
    img_h, img_w = result.orig_shape[0], result.orig_shape[1]
    for box in result.boxes:
        # Corner coordinates as plain floats, rounded to 2 decimals
        x1, y1, x2, y2 = (round(v, 2) for v in box.xyxy[0].tolist())
        record = {
            "class": result.names[int(box.cls[0])],
            "confidence": round(box.conf[0].item(), 3),
            "bbox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
            "image_width": img_w,
            "image_height": img_h,
        }
        data.append(record)

# === JSON Export ===
with open("detections.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

# === CSV Export ===
# Rows reuse the records gathered above: class, confidence, then the box corners
csv_rows = [
    [
        rec["class"],
        rec["confidence"],
        rec["bbox"]["x1"],
        rec["bbox"]["y1"],
        rec["bbox"]["x2"],
        rec["bbox"]["y2"],
    ]
    for rec in data
]
with open("detections.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["class", "confidence", "x1", "y1", "x2", "y2"])
    writer.writerows(csv_rows)

Inference Performance Tips

Half Precision (FP16)

Half-precision floating point can halve GPU memory usage and significantly accelerate inference on compatible hardware (NVIDIA Volta/Turing/Ampere architectures and newer):

python
1
2
3
4
5
6
7
8
model = YOLO("yolo26n.pt")

# Method 1: Specify during predict — requests FP16 for this call only
results = model.predict("test.jpg", half=True)

# Method 2: Convert model globally by casting the wrapped network in place
# NOTE(review): the predictor may re-cast the weights according to its own
# `half` argument when it is set up, in which case this manual cast would not
# take effect — confirm on your installed version. Method 1 is the route
# exposed through the predict arguments.
model.model.half()  # Enable FP16 globally
results = model("test.jpg")

Note: FP16 is not supported on some CPUs and older GPUs (e.g., GTX 10 series) — it will fall back to FP32 automatically.

Device Selection

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Specify GPU device
# (each line below rebinds `model`; keep only the one you need)
model = YOLO("yolo26n.pt").to("cuda:0")     # First GPU
model = YOLO("yolo26n.pt").to("cuda:1")     # Second GPU

# Multi-GPU inference (requires larger batch)
# NOTE(review): a multi-device string is the training-style syntax; whether
# predict actually spreads work across both GPUs (rather than using only the
# first) should be confirmed against the installed version.
model = YOLO("yolo26n.pt")
results = model.predict(["img1.jpg", "img2.jpg"], device="0,1")  # Use GPU 0 and 1

# CPU inference (when GPU memory is insufficient)
model = YOLO("yolo26n.pt").to("cpu")

Warm-up Inference

Deep models incur initialization overhead on first inference (CUDA kernel compilation, memory allocation). Warm-up eliminates cold-start latency:

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
import time

import numpy as np
from ultralytics import YOLO

model = YOLO("yolo26n.pt")

# Warm-up: push one blank 640x640 frame through the model so that one-time
# setup cost (e.g. CUDA kernel compilation) is paid before the timed call
blank_frame = np.zeros((640, 640, 3), dtype=np.uint8)
_ = model(blank_frame, verbose=False)

# Timed run on the real image
t0 = time.perf_counter()
results = model("test.jpg", verbose=False)
elapsed_ms = (time.perf_counter() - t0) * 1000
print(f"Inference time: {elapsed_ms:.1f}ms")

Batch Size Tuning

python
1
2
3
4
5
6
7
8
# Batch size can be set explicitly through the `batch` argument.
# NOTE(review): batch=-1 auto-sizing appears to be a training feature; for
# predict the documented default is presumably 1 — confirm before relying on
# auto-detection here.
results = model.predict(
    source=["img1.jpg", "img2.jpg", "img3.jpg", "img4.jpg"],
    batch=2,           # Manual batch size
    half=True,         # Combine with FP16 to reduce memory
    device="cuda:0",
)

Batch Size Memory Estimates:

Model	FP32 batch=1	FP16 batch=1	FP32 batch=4	FP16 batch=4
YOLO26n	~1.2 GB	~0.7 GB	~1.8 GB	~1.0 GB
YOLO26s	~2.0 GB	~1.1 GB	~3.2 GB	~1.7 GB
YOLO26m	~3.5 GB	~1.9 GB	~6.0 GB	~3.2 GB

Estimates for 640×640 input. Actual memory usage varies with image size and number of detections.

Additional Optimization Tips

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# With stream=True, predict() returns a generator: no frame is read or
# inferred until the generator is iterated. The result must therefore be
# kept and looped over; calling predict() alone does no work.
results = model.predict(
    source="input.mp4",
    stream=True,        # Stream processing to prevent memory overflow
    half=True,          # FP16 acceleration
    device="cuda:0",    # Explicit device selection
    verbose=False,      # Suppress logging to reduce IO overhead
    max_det=100,        # Limit maximum detections
    iou=0.5,            # Lower IoU threshold makes NMS drop more overlapping boxes
    conf=0.3,           # Higher confidence reduces post-processing load
)

# Iterating the generator is what actually runs inference, frame by frame
for result in results:
    pass  # per-frame post-processing goes here

These parameter combinations help find the optimal balance between inference speed and accuracy for your specific use case.

Part of series: AI Engineering Series

← Previous YOLO Getting Started: History, Version Comparison and Environment Setup Next → YOLO Dataset Preparation: Annotation Tools and Format Conversion