bpftrace is great for quick probing and ad-hoc debugging. For production-grade monitoring tools, you need full eBPF programs. The architecture splits into two layers:
- Kernel side: eBPF program written in C, attached to hook points, collecting event data
- User side: loader written in Go (or Rust / libbpf C), loading the eBPF program and reading events
Architecture
flowchart LR
classDef kern fill:#E3F2FD,stroke:#1565C0,color:#1565C0
classDef user fill:#FFF3E0,stroke:#E65100,color:#BF360C
classDef data fill:#E8F5E9,stroke:#2E7D32,color:#1B5E20
subgraph kernel["Kernel Space"]
hook@{ shape: rounded, label: "oom_kill_process (kprobe)" }
ebpf@{ shape: proc, label: "eBPF Program\nEvent Collection" }
ring@{ shape: cyl, label: "Ring Buffer" }
end
subgraph userspace["User Space (Go)"]
loader@{ shape: notch-rect, label: "bpf2go Loader" }
reader@{ shape: proc, label: "RingBuf Reader\nEvent Parsing" }
end
hook --> ebpf --> ring
ring --> reader
loader -.-> ebpf
class hook,ebpf,ring kern
class loader,reader user
eBPF Kernel Program (C)
Name the C file oom_kprobe.bpf.c — the bpf suffix is a cilium/ebpf convention for bpf2go code generation:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
| //go:build ignore
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
char LICENSE[] SEC("license") = "Dual BSD/GPL";
struct oom_event {
u32 pid;
u32 tgid;
u64 fpid;
long pages;
char comm[TASK_COMM_LEN];
char fcomm[TASK_COMM_LEN];
u64 timestamp;
};
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 1 << 24); // 16MB
} events SEC(".maps");
SEC("kprobe/oom_kill_process")
int BPF_KPROBE(oom_kill_process, struct oom_control *oc,
struct task_struct *p, const char *message)
{
struct oom_event *event;
event = bpf_ringbuf_reserve(&events, sizeof(*event), 0);
if (!event) return 0;
event->pid = BPF_CORE_READ(p, pid);
event->tgid = BPF_CORE_READ(p, tgid);
bpf_probe_read_kernel_str(&event->comm, sizeof(event->comm),
BPF_CORE_READ(p, comm));
struct task_struct *fp = BPF_CORE_READ(oc, chosen);
if (fp) {
event->fpid = BPF_CORE_READ(fp, tgid);
bpf_probe_read_kernel_str(&event->fcomm, sizeof(event->fcomm),
BPF_CORE_READ(fp, comm));
}
event->pages = BPF_CORE_READ(oc, totalpages);
event->timestamp = bpf_ktime_get_ns();
bpf_ringbuf_submit(event, 0);
return 0;
}
|
Key Concepts
BPF_KPROBE macro handles kprobe argument extraction automatically — no manual struct pt_regs unpacking neededBPF_CORE_READ accesses kernel data structures via BTF info, no hardcoded offsets — this is CO-RE in actionbpf_ringbuf_reserve + bpf_ringbuf_submit is a lock-free producer-consumer pattern; events are readable by user-space immediately after submissionvmlinux.h is generated by bpftool btf dump and contains definitions for all kernel data structures
Go User-Space Program
Use the bpf2go code generator from the cilium/ebpf library:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
| //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang-14 \
// -target bpfel -type oom_event oom_kprobe oom_kprobe.bpf.c \
// -- -I../headers -O2 -g -D__TARGET_ARCH_x86
package main
import (
"encoding/binary"
"fmt"
"log"
"os"
"os/signal"
"syscall"
"time"
"github.com/cilium/ebpf/link"
"github.com/cilium/ebpf/ringbuf"
"github.com/cilium/ebpf/rlimit"
)
func main() {
// 1. Raise rlimit (eBPF needs memory locking)
if err := rlimit.RemoveMemlock(); err != nil {
log.Fatal(err)
}
// 2. Load eBPF objects
objs := oom_kprobeObjects{}
if err := loadOom_kprobeObjects(&objs, nil); err != nil {
log.Fatalf("loading objects: %v", err)
}
defer objs.Close()
// 3. Attach kprobe
kp, err := link.Kprobe("oom_kill_process", objs.OomKillProcess, nil)
if err != nil {
log.Fatalf("opening kprobe: %v", err)
}
defer kp.Close()
// 4. Create ring buffer reader
rd, err := ringbuf.NewReader(objs.OomKprobeMaps.Events)
if err != nil {
log.Fatalf("opening ringbuf reader: %v", err)
}
defer rd.Close()
// 5. Signal handling
stop := make(chan os.Signal, 1)
signal.Notify(stop, os.Interrupt, syscall.SIGTERM)
go func() {
<-stop
rd.Close()
}()
log.Println("OOM monitor started. Press Ctrl+C to exit.")
// 6. Event loop
var event oom_kprobeOomEvent
for {
record, err := rd.Read()
if err != nil {
if err == ringbuf.ErrClosed {
return
}
log.Printf("reading from ringbuf: %v", err)
continue
}
if err := binary.Read(record.Reader, binary.LittleEndian, &event); err != nil {
log.Printf("parsing event: %v", err)
continue
}
fmt.Printf("\n[%s] OOM KILL DETECTED\n", time.Now().Format("15:04:05"))
fmt.Printf(" Killed: PID=%d COMM=%s\n", event.Pid, trimNull(event.Comm[:]))
fmt.Printf(" Trigger: PID=%d COMM=%s\n", event.Fpid, trimNull(event.Fcomm[:]))
fmt.Printf(" Pages: %d (%.2f MB)\n", event.Pages, float64(event.Pages)*4/1024)
}
}
func trimNull(b []byte) string {
for i, c := range b {
if c == 0 {
return string(b[:i])
}
}
return string(b)
}
|
Build and Run
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
| # Generate bpf2go Go code
go generate ./...
# Build
go build -o oom-monitor .
# Run (requires root)
sudo ./oom-monitor
# Trigger OOM in another terminal(will consume significant memory — avoid on production systems)
sudo stress-ng --vm 1 --vm-bytes 80% -t 30s
# Safer alternative: use Docker with memory limit(requires Docker)
# docker run --rm -m 64m ubuntu:22.04 bash -c "apt-get update -qq && apt-get install -y -qq stress-ng && stress-ng --vm 1 --vm-bytes 50m -t 10s"
sudo stress-ng --vm 1 --vm-bytes 80% -t 30s
|
The compiled binary is self-contained — BPF bytecode is embedded into the Go binary via go:embed.
Expected Output
When an OOM is triggered, the monitor should output something like:
1
2
3
4
5
6
7
8
9
| [14:30:25] OOM KILL DETECTED
Killed: PID=12345 COMM=stress-ng
Trigger: PID=9876 COMM=oom-monitor
Pages: 262144 (1024.00 MB)
[14:30:26] OOM KILL DETECTED
Killed: PID=12346 COMM=stress-ng
Trigger: PID=9876 COMM=oom-monitor
Pages: 131072 (512.00 MB)
|
If nothing appears, the kernel didn’t trigger OOM — increase stress-ng’s memory ratio or free up some memory first.
Extension Ideas
Add More Hook Points
Correlating data from multiple kernel hook points gives a more complete picture:
1
2
3
4
5
6
7
8
9
10
11
12
13
| // Capture container-level OOM
SEC("kprobe/mem_cgroup_out_of_memory")
int BPF_KPROBE(memcg_oom, struct mem_cgroup *memcg, gfp_t gfp_mask, int order)
{
// Collect cgroup ID, container memory limit, current usage
}
// Memory pressure events
SEC("tracepoint/psi/memory_stall")
int trace_memory_stall(struct trace_event_raw_psi_group *ctx)
{
// Early warning before OOM actually hits
}
|
User-Side Enhancements
- Prometheus Exporter: Convert OOM events to Counter and Gauge metrics
- Container mapping: Read
/proc/<pid>/cgroup to map OOM events to Kubernetes Pods - Event persistence: Write to ClickHouse / Loki for historical analysis
- Alerting: Real-time notification when OOM events occur
Summary
This article implemented a complete OOM event tracing tool: a C eBPF kernel program handles data collection, and a Go user-space program handles loading and reading, with events passed through a Ring Buffer. This architecture is the standard pattern for eBPF application development and can be reused across many observability scenarios.