eBPF (extended Berkeley Packet Filter) has revolutionized how we observe and debug systems. It allows running sandboxed programs in the Linux kernel without changing kernel source code or loading kernel modules. This post explores how eBPF enables unprecedented visibility into system behavior with minimal overhead.
What is eBPF?
eBPF lets you run custom programs in the kernel in response to events:
Application Code
↓
System Calls / Network Events / Kernel Functions
↓
eBPF Program (runs in kernel)
↓
eBPF Maps (share data with userspace)
↓
Monitoring Tools (read from maps)
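To make this flow concrete, here is a minimal BCC "hello world" sketch (assuming the BCC Python bindings are installed and the script runs as root): the kernel side fires on every clone() syscall, and the userspace side reads the output back.

from bcc import BPF

# Kernel side: runs on every clone() and writes a line to the shared trace pipe
program = """
int hello(void *ctx) {
    bpf_trace_printk("clone() called\\n");
    return 0;
}
"""

b = BPF(text=program)
# get_syscall_fnname resolves the per-architecture symbol (e.g. __x64_sys_clone)
b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")
# Userspace side: stream the kernel program's output
b.trace_print()

Everything below builds on this same pattern: a small C program compiled and loaded into the kernel, plus a Python frontend that attaches it and reads the results.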
Why eBPF Matters
Traditional monitoring approaches have limitations:
# Traditional approach: instrument application code
import time
from prometheus_client import Histogram
request_latency = Histogram('request_latency_seconds', 'Request latency')
@request_latency.time()
def handle_request(request):
# Must modify application code
# Only sees application-level metrics
# No visibility into kernel behavior
return process(request)
# eBPF approach: no application changes needed
# Observes at kernel level
# Sees everything: syscalls, network, file I/O, etc.
Getting Started with eBPF
Simple eBPF Program
// trace_openat.c - Trace file opens
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
// Define data structure for events
struct data_t {
u32 pid;
u64 ts;
char comm[TASK_COMM_LEN];
char filename[256];
};
// Create map to pass data to userspace
BPF_PERF_OUTPUT(events);
// Hook into openat syscall
int trace_openat(struct pt_regs *ctx, int dfd, const char __user *filename) {
struct data_t data = {};
// Get process info
data.pid = bpf_get_current_pid_tgid() >> 32;
data.ts = bpf_ktime_get_ns();
bpf_get_current_comm(&data.comm, sizeof(data.comm));
// Copy filename from userspace
bpf_probe_read_user_str(&data.filename, sizeof(data.filename), filename);
// Send to userspace
events.perf_submit(ctx, &data, sizeof(data));
return 0;
}
Python Frontend
from bcc import BPF
import time
# Load eBPF program
b = BPF(src_file="trace_openat.c")
# Attach to the openat syscall; get_syscall_fnname resolves the
# architecture-specific symbol (e.g. __x64_sys_openat on x86_64)
b.attach_kprobe(event=b.get_syscall_fnname("openat"), fn_name="trace_openat")
# Print header
print("%-8s %-16s %-6s %s" % ("TIME", "COMM", "PID", "FILENAME"))
# Process events
def print_event(cpu, data, size):
event = b["events"].event(data)
print("%-8s %-16s %-6d %s" % (
time.strftime("%H:%M:%S"),
event.comm.decode('utf-8', 'replace'),
event.pid,
event.filename.decode('utf-8', 'replace')
))
# Open perf buffer
b["events"].open_perf_buffer(print_event)
# Poll for events
while True:
try:
b.perf_buffer_poll()
except KeyboardInterrupt:
exit()
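Note that on newer x86_64 kernels the syscall wrapper (__x64_sys_openat receives a single nested pt_regs pointer) can prevent a kprobe written this way from seeing the filename argument. A more portable sketch, assuming a reasonably recent BCC, hooks the syscall tracepoint instead; the struct and map definitions from trace_openat.c stay the same:

// Alternative probe: the tracepoint format defines args->filename directly,
// so no architecture-specific register layout is involved
TRACEPOINT_PROBE(syscalls, sys_enter_openat) {
    struct data_t data = {};
    data.pid = bpf_get_current_pid_tgid() >> 32;
    data.ts = bpf_ktime_get_ns();
    bpf_get_current_comm(&data.comm, sizeof(data.comm));
    bpf_probe_read_user_str(&data.filename, sizeof(data.filename), args->filename);
    events.perf_submit(args, &data, sizeof(data));
    return 0;
}

BCC attaches TRACEPOINT_PROBE handlers automatically, so the explicit attach_kprobe() call in the frontend is no longer needed.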
Tracing HTTP Requests with eBPF
Approximate HTTP request latency without modifying applications by timing TCP connections from connect to close, a reasonable proxy for short-lived HTTP/1.x connections:
// http_trace.c
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>
// Store request start time
BPF_HASH(start_time, u32, u64);
// Output events
BPF_PERF_OUTPUT(events);
struct http_event_t {
u32 pid;
u64 delta_us;
u16 sport;
u16 dport;
u32 saddr;
u32 daddr;
};
// Trace TCP connect
int trace_connect(struct pt_regs *ctx, struct sock *sk) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u64 ts = bpf_ktime_get_ns();
start_time.update(&pid, &ts);
return 0;
}
// Trace TCP close
int trace_close(struct pt_regs *ctx, struct sock *sk) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u64 *tsp = start_time.lookup(&pid);
if (tsp == 0) {
return 0;
}
u64 delta = bpf_ktime_get_ns() - *tsp;
// Get connection info
u16 sport = sk->__sk_common.skc_num;
u16 dport = sk->__sk_common.skc_dport;
u32 saddr = sk->__sk_common.skc_rcv_saddr;
u32 daddr = sk->__sk_common.skc_daddr;
// Submit event
struct http_event_t event = {
.pid = pid,
.delta_us = delta / 1000,
.sport = sport,
.dport = ntohs(dport),
.saddr = saddr,
.daddr = daddr,
};
events.perf_submit(ctx, &event, sizeof(event));
start_time.delete(&pid);
return 0;
}
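A minimal Python frontend for this program might look like the following sketch; it attaches the two probes to tcp_v4_connect and tcp_close (IPv4 only) and prints one line per completed connection:

from bcc import BPF
from socket import inet_ntop, AF_INET
from struct import pack

b = BPF(src_file="http_trace.c")
# Attach to the kernel TCP entry points (IPv4 only in this sketch)
b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect")
b.attach_kprobe(event="tcp_close", fn_name="trace_close")

def print_event(cpu, data, size):
    event = b["events"].event(data)
    saddr = inet_ntop(AF_INET, pack("I", event.saddr))
    daddr = inet_ntop(AF_INET, pack("I", event.daddr))
    print(f"PID {event.pid}: {saddr}:{event.sport} -> {daddr}:{event.dport} "
          f"lasted {event.delta_us} us")

b["events"].open_perf_buffer(print_event)
while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        break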
Network Packet Analysis
Analyze network traffic at the kernel level with a socket filter program:
from bcc import BPF
from socket import inet_ntop, AF_INET
from struct import pack
# eBPF program
bpf_program = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>
struct packet_t {
u32 saddr;
u32 daddr;
u16 sport;
u16 dport;
u32 len;
};
BPF_PERF_OUTPUT(packets);
BPF_HASH(packet_count, u64, u64);
int trace_packet(struct __sk_buff *skb) {
    // BCC's packet cursor starts at the beginning of the packet
    u8 *cursor = 0;
    // Parse ethernet header
    struct ethernet_t *eth = cursor_advance(cursor, sizeof(*eth));
    if (eth->type != 0x0800) {
        return 0;   // not IPv4
    }
    // Parse IP header
    struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
if (ip->nextp != IPPROTO_TCP) {
return 0;
}
// Parse TCP header
struct tcp_t *tcp = cursor_advance(cursor, sizeof(*tcp));
// Record packet
struct packet_t pkt = {
.saddr = ip->src,
.daddr = ip->dst,
.sport = tcp->src_port,
.dport = tcp->dst_port,
.len = skb->len,
};
packets.perf_submit(skb, &pkt, sizeof(pkt));
// Count packets per connection
u64 key = ((u64)ip->src << 32) | ip->dst;
u64 *count = packet_count.lookup(&key);
if (count) {
(*count)++;
} else {
u64 init = 1;
packet_count.update(&key, &init);
}
return 0;
}
"""
b = BPF(text=bpf_program)
# Attach to network interface
function_name = b.load_func("trace_packet", BPF.SOCKET_FILTER)
BPF.attach_raw_socket(function_name, "eth0")
def print_packet(cpu, data, size):
event = b["packets"].event(data)
print(f"{inet_ntop(AF_INET, pack('I', event.saddr))}:{event.sport} -> " +
f"{inet_ntop(AF_INET, pack('I', event.daddr))}:{event.dport} " +
f"({event.len} bytes)")
b["packets"].open_perf_buffer(print_packet)
while True:
try:
b.perf_buffer_poll()
except KeyboardInterrupt:
break
# Print statistics
print("\nPacket counts by connection:")
for k, v in b["packet_count"].items():
saddr = inet_ntop(AF_INET, pack('I', k.value >> 32))
daddr = inet_ntop(AF_INET, pack('I', k.value & 0xFFFFFFFF))
print(f"{saddr} -> {daddr}: {v.value} packets")
Performance Profiling with eBPF
Profile CPU usage with negligible overhead by sampling stack traces directly in the kernel:
from bcc import BPF, PerfType, PerfSWConfig
from time import sleep
# CPU profiling eBPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct key_t {
u32 pid;
char name[TASK_COMM_LEN];
int user_stack_id;
int kernel_stack_id;
};
BPF_HASH(counts, struct key_t);
BPF_STACK_TRACE(stack_traces, 10240);
int do_perf_event(struct bpf_perf_event_data *ctx) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
struct key_t key = {};
key.pid = pid;
bpf_get_current_comm(&key.name, sizeof(key.name));
// Capture stack traces
key.user_stack_id = stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK);
key.kernel_stack_id = stack_traces.get_stackid(&ctx->regs, 0);
    u64 zero = 0, *val;
    // lookup_or_try_init returns NULL if the map is full, so check before use
    val = counts.lookup_or_try_init(&key, &zero);
    if (val) {
        (*val)++;
    }
return 0;
}
"""
b = BPF(text=bpf_text)
# Sample at 99 Hz
b.attach_perf_event(
ev_type=PerfType.SOFTWARE,
ev_config=PerfSWConfig.CPU_CLOCK,
fn_name="do_perf_event",
sample_period=0,
sample_freq=99
)
print("Profiling... Hit Ctrl-C to end.")
try:
sleep(10)
except KeyboardInterrupt:
pass
# Print profile
print("\nProfile:")
counts = b["counts"]
stack_traces = b["stack_traces"]
for k, v in sorted(counts.items(), key=lambda x: x[1].value, reverse=True)[:20]:
print(f"\n{k.name.decode('utf-8', 'replace')} (PID {k.pid}): {v.value} samples")
if k.kernel_stack_id >= 0:
print(" Kernel stack:")
for addr in stack_traces.walk(k.kernel_stack_id):
print(f" {b.ksym(addr).decode('utf-8', 'replace')}")
if k.user_stack_id >= 0:
print(" User stack:")
for addr in stack_traces.walk(k.user_stack_id):
print(f" {b.sym(addr, k.pid).decode('utf-8', 'replace')}")
Database Query Tracing
Trace database queries without modifying the application:
from bcc import BPF, USDT
# Attach to PostgreSQL USDT probes
bpf_text = """
#include <uapi/linux/ptrace.h>
struct query_t {
u64 timestamp;
u32 pid;
char query[256];
};
BPF_PERF_OUTPUT(queries);
BPF_HASH(start, u32, u64);
int trace_query_start(struct pt_regs *ctx) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u64 ts = bpf_ktime_get_ns();
start.update(&pid, &ts);
// Read query string
struct query_t q = {};
q.timestamp = ts;
q.pid = pid;
// Query string is first argument
bpf_usdt_readarg_p(1, ctx, &q.query, sizeof(q.query));
queries.perf_submit(ctx, &q, sizeof(q));
return 0;
}
int trace_query_done(struct pt_regs *ctx) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u64 *tsp = start.lookup(&pid);
if (tsp != 0) {
u64 delta = bpf_ktime_get_ns() - *tsp;
bpf_trace_printk("Query took %llu us\\n", delta / 1000);
start.delete(&pid);
}
return 0;
}
"""
# Attach to PostgreSQL process
u = USDT(pid=int(postgres_pid))
u.enable_probe(probe="query__start", fn_name="trace_query_start")
u.enable_probe(probe="query__done", fn_name="trace_query_done")
b = BPF(text=bpf_text, usdt_contexts=[u])
def print_query(cpu, data, size):
event = b["queries"].event(data)
print(f"[{event.pid}] {event.query.decode('utf-8', 'replace')}")
b["queries"].open_perf_buffer(print_query)
print("Tracing PostgreSQL queries... Ctrl-C to exit")
while True:
try:
b.perf_buffer_poll()
except KeyboardInterrupt:
break
Memory Leak Detection
Detect memory leaks without restarting applications:
// memleak.c
#include <uapi/linux/ptrace.h>
struct alloc_info_t {
u64 size;
u64 timestamp;
int stack_id;
};
BPF_HASH(sizes, u64);
BPF_HASH(allocs, u64, struct alloc_info_t);
BPF_STACK_TRACE(stack_traces, 10240);
int alloc_enter(struct pt_regs *ctx, size_t size) {
u64 pid = bpf_get_current_pid_tgid();
sizes.update(&pid, &size);
return 0;
}
int alloc_exit(struct pt_regs *ctx) {
u64 pid = bpf_get_current_pid_tgid();
u64 addr = PT_REGS_RC(ctx);
u64 *size = sizes.lookup(&pid);
if (size == 0) {
return 0;
}
struct alloc_info_t info = {
.size = *size,
.timestamp = bpf_ktime_get_ns(),
.stack_id = stack_traces.get_stackid(ctx, BPF_F_USER_STACK),
};
allocs.update(&addr, &info);
sizes.delete(&pid);
return 0;
}
int free_enter(struct pt_regs *ctx, void *address) {
u64 addr = (u64)address;
allocs.delete(&addr);
return 0;
}
The Python frontend:
from bcc import BPF
from time import sleep
import sys
# Trace a single process so user-space stacks can be symbolized (PID from argv)
target_pid = int(sys.argv[1])
b = BPF(src_file="memleak.c")
# Attach to malloc/free in the target process's libc
b.attach_uprobe(name="c", sym="malloc", fn_name="alloc_enter", pid=target_pid)
b.attach_uretprobe(name="c", sym="malloc", fn_name="alloc_exit", pid=target_pid)
b.attach_uprobe(name="c", sym="free", fn_name="free_enter", pid=target_pid)
print("Tracing memory allocations... Ctrl-C to show leaks")
try:
sleep(30)
except KeyboardInterrupt:
pass
# Analyze leaks
allocs = b["allocs"]
stack_traces = b["stack_traces"]
leaks = {}
for k, v in allocs.items():
stack_id = v.stack_id
if stack_id < 0:
continue
if stack_id not in leaks:
leaks[stack_id] = {
'count': 0,
'size': 0,
'stack': list(stack_traces.walk(stack_id))
}
leaks[stack_id]['count'] += 1
leaks[stack_id]['size'] += v.size
# Print top leaks
print("\nTop memory leaks:")
for stack_id, leak in sorted(leaks.items(),
key=lambda x: x[1]['size'],
reverse=True)[:10]:
print(f"\n{leak['count']} allocations, {leak['size']} bytes:")
for addr in leak['stack']:
print(f" {b.sym(addr, -1).decode('utf-8', 'replace')}")
Container-Aware Monitoring
Monitor containers with eBPF:
from bcc import BPF
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct event_t {
u32 pid;
    u64 cgroup_id;   // kernfs node id of the task's cgroup (u64 on kernels >= 5.5)
char comm[TASK_COMM_LEN];
u64 cpu_time;
};
BPF_PERF_OUTPUT(events);
int trace_sched(struct pt_regs *ctx, struct task_struct *prev) {
struct event_t event = {};
event.pid = prev->pid;
event.cgroup_id = prev->cgroups->dfl_cgrp->kn->id;
bpf_probe_read_kernel(&event.comm, sizeof(event.comm), prev->comm);
// Get CPU time
event.cpu_time = prev->se.sum_exec_runtime;
events.perf_submit(ctx, &event, sizeof(event));
return 0;
}
"""
b = BPF(text=bpf_text)
# On some kernels the symbol is finish_task_switch.isra.0, so match by regex
b.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$", fn_name="trace_sched")
# Map cgroup IDs to container names
cgroup_to_container = {}
def load_container_mappings():
# Read from /sys/fs/cgroup to map cgroup IDs to containers
# Implementation depends on container runtime
pass
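# One possible Docker-specific implementation (a sketch, overriding the stub
# above): on a cgroup v2 host, the cgroup ID seen in the kernel is the inode
# number of the container's cgroup directory, so walking /sys/fs/cgroup and
# recording directory inodes is enough to build the mapping.
import os

def load_container_mappings(cgroup_root="/sys/fs/cgroup"):
    for dirpath, dirnames, _ in os.walk(cgroup_root):
        for d in dirnames:
            name = None
            # systemd cgroup driver: .../docker-<64-char id>.scope
            if d.startswith("docker-") and d.endswith(".scope"):
                name = d[len("docker-"):-len(".scope")][:12]
            # cgroupfs driver: .../docker/<64-char id>
            elif os.path.basename(dirpath) == "docker":
                name = d[:12]
            if name:
                cgroup_id = os.stat(os.path.join(dirpath, d)).st_ino
                cgroup_to_container[cgroup_id] = name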
def print_event(cpu, data, size):
event = b["events"].event(data)
container = cgroup_to_container.get(event.cgroup_id, "unknown")
print(f"[{container}] {event.comm.decode('utf-8')} " +
f"(PID {event.pid}): {event.cpu_time} ns")
load_container_mappings()
b["events"].open_perf_buffer(print_event)
while True:
try:
b.perf_buffer_poll()
except KeyboardInterrupt:
break
Production eBPF Tools
Pixie for Kubernetes
# Simplified sketch of Pixie's eBPF data-collector DaemonSet
# (in practice Pixie is deployed with the px CLI or Helm)
apiVersion: v1
kind: Namespace
metadata:
name: pl
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: vizier-pem
namespace: pl
spec:
selector:
matchLabels:
name: vizier-pem
template:
metadata:
labels:
name: vizier-pem
spec:
hostNetwork: true
hostPID: true
containers:
- name: pem
image: gcr.io/pixie-oss/pixie-prod/vizier/pem_image
securityContext:
privileged: true
volumeMounts:
- name: sys
mountPath: /sys
- name: debug
mountPath: /sys/kernel/debug
volumes:
- name: sys
hostPath:
path: /sys
- name: debug
hostPath:
path: /sys/kernel/debug
Cilium for Network Policies
# eBPF-based network security
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
name: api-policy
spec:
endpointSelector:
matchLabels:
app: api
ingress:
- fromEndpoints:
- matchLabels:
app: frontend
toPorts:
- ports:
- port: "8080"
protocol: TCP
rules:
http:
- method: "GET"
path: "/api/.*"
Key Takeaways
- eBPF provides deep observability: see into the kernel without modifying application or kernel code
- Minimal overhead: Production-safe performance monitoring
- No application changes: Instrument running systems
- Security applications: Network policies, intrusion detection
- Container-aware: Native understanding of cgroups and namespaces
- Production tools: Pixie, Cilium, Falco use eBPF
eBPF is transforming observability and security. Start experimenting with BCC tools, then build custom eBPF programs for your specific needs.