eBPF (extended Berkeley Packet Filter) has revolutionized how we observe and debug systems. It allows running sandboxed programs in the Linux kernel without changing kernel source code or loading kernel modules. This post explores how eBPF enables unprecedented visibility into system behavior with minimal overhead.

What is eBPF?

eBPF lets you run custom programs in the kernel in response to events:

Application Code

System Calls / Network Events / Kernel Functions

eBPF Program (runs in kernel)

eBPF Maps (share data with userspace)

Monitoring Tools (read from maps)

Why eBPF Matters

Traditional monitoring approaches have limitations:

# Traditional approach: the application itself must be instrumented.
import time
from prometheus_client import Histogram

request_latency = Histogram('request_latency_seconds', 'Request latency')


def handle_request(request):
    # Requires editing application source; observes only application-level
    # metrics and has no visibility into kernel behavior.
    with request_latency.time():
        return process(request)

# eBPF approach: no application changes needed — it observes at the kernel
# level and sees everything: syscalls, network, file I/O, etc.

Getting Started with eBPF

Simple eBPF Program

// trace_openat.c - Trace file opens
// BCC-style eBPF program: compiled/loaded by the Python frontend, which
// attaches trace_openat() as a kprobe on the openat syscall.
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

// Define data structure for events
// One record per openat() call, streamed to userspace via a perf buffer.
struct data_t {
    u32 pid;                   // tgid (upper 32 bits of bpf_get_current_pid_tgid)
    u64 ts;                    // timestamp, ns since boot (bpf_ktime_get_ns)
    char comm[TASK_COMM_LEN];  // calling process's command name
    char filename[256];        // pathname argument, copied from user memory
};

// Create map to pass data to userspace
BPF_PERF_OUTPUT(events);

// Hook into openat syscall
// ctx is the kprobe register context; dfd/filename mirror openat()'s
// first two arguments (BCC maps extra parameters to syscall args).
int trace_openat(struct pt_regs *ctx, int dfd, const char __user *filename) {
    struct data_t data = {};

    // Get process info
    data.pid = bpf_get_current_pid_tgid() >> 32;
    data.ts = bpf_ktime_get_ns();
    bpf_get_current_comm(&data.comm, sizeof(data.comm));

    // Copy filename from userspace
    // (safe helper: truncates at 256 bytes, tolerates faulting pages)
    bpf_probe_read_user_str(&data.filename, sizeof(data.filename), filename);

    // Send to userspace
    events.perf_submit(ctx, &data, sizeof(data));

    return 0;
}

Python Frontend

from bcc import BPF
import time

# Load eBPF program
b = BPF(src_file="trace_openat.c")

# Attach to the openat syscall. get_syscall_fnname() resolves the
# arch-specific kernel symbol (e.g. __x64_sys_openat on x86-64,
# __arm64_sys_openat on arm64), so the script works on any architecture
# instead of being hard-coded to x86-64.
b.attach_kprobe(event=b.get_syscall_fnname("openat"), fn_name="trace_openat")

# Print header
print("%-8s %-16s %-6s %s" % ("TIME", "COMM", "PID", "FILENAME"))


def print_event(cpu, data, size):
    """Perf-buffer callback: decode one data_t record and print it."""
    event = b["events"].event(data)
    print("%-8s %-16s %-6d %s" % (
        time.strftime("%H:%M:%S"),
        event.comm.decode('utf-8', 'replace'),
        event.pid,
        event.filename.decode('utf-8', 'replace')
    ))


# Open perf buffer and poll until Ctrl-C.
b["events"].open_perf_buffer(print_event)
while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        break  # clean fall-through instead of exit(), so cleanup runs

Tracing HTTP Requests with eBPF

Track request latency — approximated here by TCP connection lifetime (connect to close) — without modifying applications:

// http_trace.c
// NOTE(review): despite the name, this measures TCP connection lifetime
// (connect -> close) per PID, which only approximates HTTP request
// latency for short-lived, one-request-per-connection traffic — confirm.
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>

// Store request start time
// Keyed by tgid; assumes one in-flight connection per process at a time.
BPF_HASH(start_time, u32, u64);

// Output events
BPF_PERF_OUTPUT(events);

// One record per completed connection.
struct http_event_t {
    u32 pid;       // tgid of the process that opened the connection
    u64 delta_us;  // connection lifetime in microseconds
    u16 sport;     // local port (host byte order)
    u16 dport;     // remote port (converted to host order at submit)
    u32 saddr;     // local IPv4 address
    u32 daddr;     // remote IPv4 address
};

// Trace TCP connect
// Records the start timestamp for this process.
int trace_connect(struct pt_regs *ctx, struct sock *sk) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u64 ts = bpf_ktime_get_ns();

    start_time.update(&pid, &ts);

    return 0;
}

// Trace TCP close
// Computes the elapsed time, extracts the 4-tuple, and emits the event.
int trace_close(struct pt_regs *ctx, struct sock *sk) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    u64 *tsp = start_time.lookup(&pid);
    if (tsp == 0) {
        // No matching connect recorded (e.g. started before tracing).
        return 0;
    }

    u64 delta = bpf_ktime_get_ns() - *tsp;

    // Get connection info
    // skc_num is host byte order; skc_dport is network byte order,
    // hence the ntohs() below. BCC rewrites these member reads into
    // safe probe reads.
    u16 sport = sk->__sk_common.skc_num;
    u16 dport = sk->__sk_common.skc_dport;
    u32 saddr = sk->__sk_common.skc_rcv_saddr;
    u32 daddr = sk->__sk_common.skc_daddr;

    // Submit event
    struct http_event_t event = {
        .pid = pid,
        .delta_us = delta / 1000,
        .sport = sport,
        .dport = ntohs(dport),
        .saddr = saddr,
        .daddr = daddr,
    };

    events.perf_submit(ctx, &event, sizeof(event));
    start_time.delete(&pid);

    return 0;
}

Network Packet Analysis

Analyze network traffic at kernel level:

from bcc import BPF
from socket import inet_ntop, AF_INET
from struct import pack
import sys

# eBPF socket-filter program (BCC packet-parsing style).
# FIX: cursor_advance() operates on a local packet cursor; the original
# program never declared it ("u8 *cursor = 0;"), so it failed to compile.
bpf_program = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>

// One record per observed TCP packet.
struct packet_t {
    u32 saddr;
    u32 daddr;
    u16 sport;
    u16 dport;
    u32 len;
};

BPF_PERF_OUTPUT(packets);
// Per-connection packet counter, keyed by (saddr << 32) | daddr.
BPF_HASH(packet_count, u64, u64);

int trace_packet(struct __sk_buff *skb) {
    // Packet cursor, required by BCC's cursor_advance() helper.
    u8 *cursor = 0;

    // Parse ethernet header
    struct ethernet_t *eth = cursor_advance(cursor, sizeof(*eth));

    // Parse IP header; ignore non-TCP traffic
    struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
    if (ip->nextp != IPPROTO_TCP) {
        return 0;
    }

    // Parse TCP header
    struct tcp_t *tcp = cursor_advance(cursor, sizeof(*tcp));

    // Record packet
    struct packet_t pkt = {
        .saddr = ip->src,
        .daddr = ip->dst,
        .sport = tcp->src_port,
        .dport = tcp->dst_port,
        .len = skb->len,
    };

    packets.perf_submit(skb, &pkt, sizeof(pkt));

    // Count packets per (src, dst) address pair
    u64 key = ((u64)ip->src << 32) | ip->dst;
    u64 *count = packet_count.lookup(&key);
    if (count) {
        (*count)++;
    } else {
        u64 init = 1;
        packet_count.update(&key, &init);
    }

    return 0;
}
"""

b = BPF(text=bpf_program)

# Attach the filter to a network interface. Default remains "eth0" for
# backward compatibility; an optional first CLI argument overrides it.
interface = sys.argv[1] if len(sys.argv) > 1 else "eth0"
function_name = b.load_func("trace_packet", BPF.SOCKET_FILTER)
BPF.attach_raw_socket(function_name, interface)


def print_packet(cpu, data, size):
    """Perf-buffer callback: print one packet as 'src:port -> dst:port (len)'."""
    event = b["packets"].event(data)
    print(f"{inet_ntop(AF_INET, pack('I', event.saddr))}:{event.sport} -> " +
          f"{inet_ntop(AF_INET, pack('I', event.daddr))}:{event.dport} " +
          f"({event.len} bytes)")


b["packets"].open_perf_buffer(print_packet)

while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        break

# Print statistics accumulated in the kernel-side hash map.
print("\nPacket counts by connection:")
for k, v in b["packet_count"].items():
    saddr = inet_ntop(AF_INET, pack('I', k.value >> 32))
    daddr = inet_ntop(AF_INET, pack('I', k.value & 0xFFFFFFFF))
    print(f"{saddr} -> {daddr}: {v.value} packets")

Performance Profiling with eBPF

Profile CPU usage with negligible overhead:

from time import sleep

from bcc import BPF, PerfType, PerfSWConfig
# FIX: the original imported only BPF, then referenced PerfType,
# PerfSWConfig, and sleep — all NameErrors at runtime.

# CPU profiling eBPF program: on each perf sample, bump a counter keyed
# by (pid, comm, user stack, kernel stack).
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    char name[TASK_COMM_LEN];
    int user_stack_id;
    int kernel_stack_id;
};

BPF_HASH(counts, struct key_t);
BPF_STACK_TRACE(stack_traces, 10240);

int do_perf_event(struct bpf_perf_event_data *ctx) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    struct key_t key = {};
    key.pid = pid;
    bpf_get_current_comm(&key.name, sizeof(key.name));

    // Capture stack traces
    key.user_stack_id = stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK);
    key.kernel_stack_id = stack_traces.get_stackid(&ctx->regs, 0);

    u64 zero = 0, *val;
    val = counts.lookup_or_init(&key, &zero);
    (*val)++;

    return 0;
}
"""

b = BPF(text=bpf_text)

# Sample on-CPU stacks at 99 Hz (an odd frequency avoids lockstep with
# periodic kernel timers). sample_period=0 means "use sample_freq".
b.attach_perf_event(
    ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.CPU_CLOCK,
    fn_name="do_perf_event",
    sample_period=0,
    sample_freq=99
)

print("Profiling... Hit Ctrl-C to end.")

try:
    sleep(10)
except KeyboardInterrupt:
    pass

# Print the 20 hottest (pid, stacks) keys, most samples first.
print("\nProfile:")
counts = b["counts"]
stack_traces = b["stack_traces"]

for k, v in sorted(counts.items(), key=lambda x: x[1].value, reverse=True)[:20]:
    print(f"\n{k.name.decode('utf-8', 'replace')} (PID {k.pid}): {v.value} samples")

    # Negative stack ids mean the stack capture failed; skip those.
    if k.kernel_stack_id >= 0:
        print("  Kernel stack:")
        for addr in stack_traces.walk(k.kernel_stack_id):
            print(f"    {b.ksym(addr).decode('utf-8', 'replace')}")

    if k.user_stack_id >= 0:
        print("  User stack:")
        for addr in stack_traces.walk(k.user_stack_id):
            print(f"    {b.sym(addr, k.pid).decode('utf-8', 'replace')}")

Database Query Tracing

Trace database queries without modifying the application:

import sys

from bcc import BPF, USDT

# eBPF program attached to PostgreSQL's USDT (dtrace-style) probes:
# query__start records a timestamp and emits the query text;
# query__done logs the elapsed time via bpf_trace_printk.
bpf_text = """
#include <uapi/linux/ptrace.h>

struct query_t {
    u64 timestamp;
    u32 pid;
    char query[256];
};

BPF_PERF_OUTPUT(queries);
BPF_HASH(start, u32, u64);

int trace_query_start(struct pt_regs *ctx) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u64 ts = bpf_ktime_get_ns();

    start.update(&pid, &ts);

    // Read query string
    struct query_t q = {};
    q.timestamp = ts;
    q.pid = pid;

    // Query string is first argument
    bpf_usdt_readarg_p(1, ctx, &q.query, sizeof(q.query));

    queries.perf_submit(ctx, &q, sizeof(q));

    return 0;
}

int trace_query_done(struct pt_regs *ctx) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u64 *tsp = start.lookup(&pid);

    if (tsp != 0) {
        u64 delta = bpf_ktime_get_ns() - *tsp;
        bpf_trace_printk("Query took %llu us\\n", delta / 1000);
        start.delete(&pid);
    }

    return 0;
}
"""

# FIX: postgres_pid was an undefined name in the original; take the
# target PID from the command line so the script is runnable as-is.
if len(sys.argv) != 2:
    sys.exit(f"usage: {sys.argv[0]} <postgres_pid>")
postgres_pid = int(sys.argv[1])

# Attach to PostgreSQL process
u = USDT(pid=postgres_pid)
u.enable_probe(probe="query__start", fn_name="trace_query_start")
u.enable_probe(probe="query__done", fn_name="trace_query_done")

b = BPF(text=bpf_text, usdt_contexts=[u])


def print_query(cpu, data, size):
    """Perf-buffer callback: print PID and the (truncated) query text."""
    event = b["queries"].event(data)
    print(f"[{event.pid}] {event.query.decode('utf-8', 'replace')}")


b["queries"].open_perf_buffer(print_query)

print("Tracing PostgreSQL queries... Ctrl-C to exit")

while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        break

Memory Leak Detection

Detect memory leaks without restarting applications:

// memleak.c
// Outstanding-allocation tracker: pair malloc entry/return to record each
// live allocation (size, time, stack); free removes it. Whatever remains
// in `allocs` after a while is a leak candidate.
#include <uapi/linux/ptrace.h>

// Metadata kept per live allocation, keyed by returned address.
struct alloc_info_t {
    u64 size;       // requested byte count (from alloc_enter)
    u64 timestamp;  // ns since boot at allocation time
    int stack_id;   // user stack id in stack_traces (< 0 on capture failure)
};

// Scratch map: requested size, keyed by full pid_tgid, bridging
// malloc entry -> return for the same thread.
BPF_HASH(sizes, u64);
// Live allocations keyed by the address malloc returned.
BPF_HASH(allocs, u64, struct alloc_info_t);
BPF_STACK_TRACE(stack_traces, 10240);

// uprobe on malloc entry: stash the requested size for this thread.
int alloc_enter(struct pt_regs *ctx, size_t size) {
    u64 pid = bpf_get_current_pid_tgid();

    sizes.update(&pid, &size);

    return 0;
}

// uretprobe on malloc return: pair the stashed size with the returned
// address and record the allocation.
int alloc_exit(struct pt_regs *ctx) {
    u64 pid = bpf_get_current_pid_tgid();
    u64 addr = PT_REGS_RC(ctx);  // malloc's return value

    u64 *size = sizes.lookup(&pid);
    if (size == 0) {
        // NULL lookup: no matching entry seen (missed probe), bail out.
        return 0;
    }

    struct alloc_info_t info = {
        .size = *size,
        .timestamp = bpf_ktime_get_ns(),
        .stack_id = stack_traces.get_stackid(ctx, BPF_F_USER_STACK),
    };

    allocs.update(&addr, &info);
    sizes.delete(&pid);

    return 0;
}

// uprobe on free entry: the allocation is no longer live.
int free_enter(struct pt_regs *ctx, void *address) {
    u64 addr = (u64)address;

    allocs.delete(&addr);

    return 0;
}
from time import sleep
# FIX: the original called sleep() without importing it (NameError).

from bcc import BPF

b = BPF(src_file="memleak.c")

# Attach to malloc/free in the C library ("c" resolves to libc).
b.attach_uprobe(name="c", sym="malloc", fn_name="alloc_enter")
b.attach_uretprobe(name="c", sym="malloc", fn_name="alloc_exit")
b.attach_uprobe(name="c", sym="free", fn_name="free_enter")

print("Tracing memory allocations... Ctrl-C to show leaks")

try:
    sleep(30)
except KeyboardInterrupt:
    pass

# Whatever is still in the allocs map was malloc'd but never freed:
# group the outstanding allocations by their allocation call stack.
allocs = b["allocs"]
stack_traces = b["stack_traces"]

leaks = {}
for k, v in allocs.items():
    stack_id = v.stack_id
    if stack_id < 0:
        continue  # stack capture failed for this allocation

    if stack_id not in leaks:
        leaks[stack_id] = {
            'count': 0,
            'size': 0,
            'stack': list(stack_traces.walk(stack_id))
        }

    leaks[stack_id]['count'] += 1
    leaks[stack_id]['size'] += v.size

# Print the ten stacks holding the most un-freed bytes.
print("\nTop memory leaks:")
for stack_id, leak in sorted(leaks.items(),
                             key=lambda x: x[1]['size'],
                             reverse=True)[:10]:
    print(f"\n{leak['count']} allocations, {leak['size']} bytes:")
    for addr in leak['stack']:
        # NOTE(review): pid=-1 resolves kernel symbols only; these are
        # user stacks, so pass the traced process's PID here — confirm.
        print(f"  {b.sym(addr, -1).decode('utf-8', 'replace')}")

Container-Aware Monitoring

Monitor containers with eBPF:

from bcc import BPF

# Scheduler-switch tracer: emits one event per context switch with the
# departing task's pid, cgroup id, comm, and cumulative CPU time, so
# userspace can attribute CPU usage to containers via the cgroup id.
# NOTE(review): event.cgroup_id is u32 while kernfs node ids are u64 on
# newer kernels — possible truncation; confirm against target kernel.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

struct event_t {
    u32 pid;
    u32 cgroup_id;
    char comm[TASK_COMM_LEN];
    u64 cpu_time;
};

BPF_PERF_OUTPUT(events);

int trace_sched(struct pt_regs *ctx, struct task_struct *prev) {
    struct event_t event = {};

    event.pid = prev->pid;
    event.cgroup_id = prev->cgroups->dfl_cgrp->kn->id;
    bpf_probe_read_kernel(&event.comm, sizeof(event.comm), prev->comm);

    // Get CPU time
    event.cpu_time = prev->se.sum_exec_runtime;

    events.perf_submit(ctx, &event, sizeof(event));

    return 0;
}
"""

b = BPF(text=bpf_text)
# finish_task_switch() runs just after a context switch; its `prev`
# argument is the task that just left the CPU.
b.attach_kprobe(event="finish_task_switch", fn_name="trace_sched")

# Map cgroup IDs to container names
cgroup_to_container = {}

def load_container_mappings():
    # Read from /sys/fs/cgroup to map cgroup IDs to containers
    # Implementation depends on container runtime
    pass

def print_event(cpu, data, size):
    # Perf-buffer callback: label each sample with its container name,
    # falling back to "unknown" for unmapped cgroup ids.
    event = b["events"].event(data)

    container = cgroup_to_container.get(event.cgroup_id, "unknown")

    print(f"[{container}] {event.comm.decode('utf-8')} " +
          f"(PID {event.pid}): {event.cpu_time} ns")

load_container_mappings()
b["events"].open_perf_buffer(print_event)

# Poll until Ctrl-C.
while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        break

Production eBPF Tools

Pixie for Kubernetes

# Deploy Pixie for automatic eBPF-based observability
apiVersion: v1
kind: Namespace
metadata:
  name: pl
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: vizier-pem
  namespace: pl
spec:
  selector:
    matchLabels:
      name: vizier-pem
  template:
    metadata:
      labels:
        name: vizier-pem
    spec:
      hostNetwork: true
      hostPID: true
      containers:
      - name: pem
        image: gcr.io/pixie-oss/pixie-prod/vizier/pem_image
        securityContext:
          privileged: true
        volumeMounts:
        - name: sys
          mountPath: /sys
        - name: debug
          mountPath: /sys/kernel/debug
      volumes:
      - name: sys
        hostPath:
          path: /sys
      - name: debug
        hostPath:
          path: /sys/kernel/debug

Cilium for Network Policies

# eBPF-based network security
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
  name: api-policy
spec:
  endpointSelector:
    matchLabels:
      app: api
  ingress:
  - fromEndpoints:
    - matchLabels:
        app: frontend
    toPorts:
    - ports:
      - port: "8080"
        protocol: TCP
      rules:
        http:
        - method: "GET"
          path: "/api/.*"

Key Takeaways

  1. eBPF provides deep observability: See into kernel without modifications
  2. Minimal overhead: Production-safe performance monitoring
  3. No application changes: Instrument running systems
  4. Security applications: Network policies, intrusion detection
  5. Container-aware: Native understanding of cgroups and namespaces
  6. Production tools: Pixie, Cilium, Falco use eBPF

eBPF is transforming observability and security. Start experimenting with BCC tools, then build custom eBPF programs for your specific needs.