Running applications across multiple geographic regions introduces significant complexity but is essential for global scale, disaster recovery, and regulatory compliance. Having operated systems that serve users across 6+ regions, I’ll share practical patterns for multi-region architectures.
Why Multi-Region?
Multi-region deployments solve several critical problems:
- Latency: Serve users from nearby regions (300ms cross-continent → 30ms local)
- Availability: Survive entire region failures
- Compliance: Data residency requirements (GDPR, etc.)
- Scale: Distribute load globally
Multi-Region Patterns
Pattern 1: Active-Passive (DR Focus)
Simplest approach: one active region, others for disaster recovery.
class ActivePassiveDeployment:
    """
    Primary region serves all traffic.
    Secondary regions stay ready for failover.
    """
    def __init__(self):
        self.primary_region = 'us-east-1'
        self.secondary_regions = ['eu-west-1', 'ap-southeast-1']
        self.health_checker = HealthChecker()

    def route_request(self, request):
        """
        Route all traffic to the primary unless it is unhealthy.
        """
        if self.health_checker.is_healthy(self.primary_region):
            return self.forward_to_region(request, self.primary_region)

        # Failover to a secondary
        for region in self.secondary_regions:
            if self.health_checker.is_healthy(region):
                self.trigger_failover_alert(region)
                return self.forward_to_region(request, region)

        raise AllRegionsDownError()

    def replicate_data(self):
        """
        Async replication to secondary regions.
        """
        # Stream database changes to secondaries
        for region in self.secondary_regions:
            self.replicate_to_region(region, async_mode=True)
Pros: Simple, lower cost.
Cons: Higher latency for distant users; secondary capacity often sits idle.
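The HealthChecker used above is deliberately abstract. A minimal sketch, assuming each region exposes an HTTP health endpoint at a placeholder URL, might look like this:

import requests

class HealthChecker:
    """
    Minimal sketch: a region is healthy if its health endpoint answers
    200 within a short timeout. The URL scheme is a placeholder, not a
    real service.
    """
    def __init__(self, timeout_seconds: float = 2.0):
        self.timeout_seconds = timeout_seconds

    def is_healthy(self, region: str) -> bool:
        url = f"https://api.{region}.example.com/healthz"  # hypothetical endpoint
        try:
            response = requests.get(url, timeout=self.timeout_seconds)
            return response.status_code == 200
        except requests.RequestException:
            return False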
Pattern 2: Active-Active (Performance Focus)
All regions actively serve traffic based on geography.
type ActiveActiveRouter struct {
    regions map[string]*RegionConfig
    geoIP   *GeoIPDatabase
}

type RegionConfig struct {
    Endpoint    string
    Location    GeoLocation
    HealthCheck *HealthCheck
}

func (r *ActiveActiveRouter) RouteRequest(req *http.Request) (*http.Response, error) {
    // Determine user location
    clientIP := getClientIP(req)
    userLocation := r.geoIP.Lookup(clientIP)

    // Find the nearest healthy region
    nearestRegion := r.findNearestHealthyRegion(userLocation)
    if nearestRegion == nil {
        return nil, errors.New("no healthy regions available")
    }

    // Forward to the nearest region
    return r.forwardToRegion(req, nearestRegion)
}

func (r *ActiveActiveRouter) findNearestHealthyRegion(userLoc GeoLocation) *RegionConfig {
    var nearest *RegionConfig
    minDistance := math.MaxFloat64

    for _, region := range r.regions {
        if !region.HealthCheck.IsHealthy() {
            continue
        }
        distance := calculateDistance(userLoc, region.Location)
        if distance < minDistance {
            minDistance = distance
            nearest = region
        }
    }
    return nearest
}
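The calculateDistance helper is assumed here; for geographic routing it is usually a great-circle (haversine) distance. A minimal sketch of that calculation, in Python for brevity:

import math

def haversine_distance_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points, in kilometers."""
    radius_km = 6371.0
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    d_phi = math.radians(lat2 - lat1)
    d_lambda = math.radians(lon2 - lon1)
    a = math.sin(d_phi / 2) ** 2 + \
        math.cos(phi1) * math.cos(phi2) * math.sin(d_lambda / 2) ** 2
    return 2 * radius_km * math.asin(math.sqrt(a))

# Example: New York to London is roughly 5,570 km
# haversine_distance_km(40.71, -74.01, 51.51, -0.13)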
Pattern 3: Read-Local, Write-Global
Optimize for read-heavy workloads.
public class ReadLocalWriteGlobalService {
    private final Map<String, DatabaseConnection> regionalReadReplicas;
    private final DatabaseConnection globalWritePrimary;
    private final String currentRegion;

    public User getUser(String userId) {
        // Read from the local replica
        DatabaseConnection localDB = regionalReadReplicas.get(currentRegion);
        User user = localDB.query(
            "SELECT * FROM users WHERE id = ?",
            userId
        );
        return user;
    }

    public void updateUser(User user) {
        // Write to the global primary
        globalWritePrimary.execute(
            "UPDATE users SET name = ?, email = ? WHERE id = ?",
            user.getName(), user.getEmail(), user.getId()
        );

        // Invalidate caches in all regions
        for (String region : regionalReadReplicas.keySet()) {
            invalidateCache(region, "user:" + user.getId());
        }

        // Changes replicate to the read replicas asynchronously;
        // accept eventual consistency for reads
    }

    public void updateUserWithConsistency(User user) {
        // For operations requiring read-your-writes consistency,
        // force reads from the primary for a short window after the write
        updateUser(user);
        markUserAsRecentlyUpdated(user.getId(), Duration.ofSeconds(5));
    }

    private User getUserWithConsistency(String userId) {
        if (wasRecentlyUpdated(userId)) {
            // Read from the primary to ensure consistency
            return globalWritePrimary.query(
                "SELECT * FROM users WHERE id = ?",
                userId
            );
        }
        // Use the local replica
        return getUser(userId);
    }
}
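The markUserAsRecentlyUpdated and wasRecentlyUpdated helpers above are assumed. One way to back them is a short-lived marker key in a shared cache; a minimal sketch in Python, with a hypothetical Redis key scheme:

import redis

# Hypothetical shared cache; in practice this should be visible to every
# instance that serves reads for the user.
cache = redis.Redis(host="localhost", port=6379)

def mark_user_as_recently_updated(user_id: str, ttl_seconds: int = 5) -> None:
    # Set a marker key that expires after the expected replication window
    cache.set(f"recently-updated:user:{user_id}", "1", ex=ttl_seconds)

def was_recently_updated(user_id: str) -> bool:
    # While the marker exists, route this user's reads to the primary
    return cache.exists(f"recently-updated:user:{user_id}") > 0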
Data Replication Strategies
Async Replication
import json
import time

from kafka import KafkaProducer, KafkaConsumer


class AsyncReplicationManager:
    """
    Replicate data asynchronously between regions.
    High performance, eventual consistency.
    """
    def __init__(self):
        self.kafka_producer = KafkaProducer()
        self.primary_region = 'us-east-1'

    def write_with_replication(self, table: str, record: dict):
        """
        Write to the primary, queue the change for replication.
        """
        # Write to the primary region database
        primary_db = self.get_db(self.primary_region)
        primary_db.insert(table, record)

        # Publish a change event for replication
        event = {
            'operation': 'INSERT',
            'table': table,
            'data': record,
            'timestamp': time.time(),
            'region': self.primary_region
        }

        # Kafka topic mirrored to the other regions
        self.kafka_producer.send(
            f'db-changes-{table}',
            value=json.dumps(event).encode('utf-8'),
            key=str(record['id']).encode('utf-8')
        )

    def start_replication_consumer(self, target_region: str):
        """
        Consume changes in the target region.
        """
        consumer = KafkaConsumer(group_id=f'replication-{target_region}')
        consumer.subscribe(pattern='db-changes-.*')  # all table change topics

        target_db = self.get_db(target_region)

        for message in consumer:
            event = json.loads(message.value)

            # Skip events that originated in this region (avoid loops)
            if event['region'] == target_region:
                continue

            # Apply the change to the target region
            try:
                if event['operation'] == 'INSERT':
                    target_db.insert(event['table'], event['data'])
                elif event['operation'] == 'UPDATE':
                    target_db.update(event['table'], event['data'])
                elif event['operation'] == 'DELETE':
                    target_db.delete(event['table'], event['data']['id'])

                # Track replication lag
                lag = time.time() - event['timestamp']
                metrics.record('replication.lag', lag,
                               tags={'source': event['region'],
                                     'target': target_region})
            except DuplicateKeyError:
                # Already replicated, skip
                pass
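Wiring this together, the primary region calls write_with_replication while each secondary runs a long-lived consumer. A minimal usage sketch (the record contents are illustrative):

manager = AsyncReplicationManager()

# In the primary region: write locally and publish the change event
manager.write_with_replication('users', {'id': '42', 'name': 'Ann', 'email': 'ann@example.com'})

# In each secondary region: run a long-lived consumer that applies changes locally
manager.start_replication_consumer('eu-west-1')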
Sync Replication
type SyncReplicationWriter struct {
    regions []string
    quorum  int // Number of regions that must confirm
}

func (w *SyncReplicationWriter) Write(key string, value []byte) error {
    // Write to all regions in parallel
    results := make(chan error, len(w.regions))

    for _, region := range w.regions {
        go func(r string) {
            conn := w.getConnection(r)
            results <- conn.Write(key, value)
        }(region)
    }

    // Wait for quorum
    successCount := 0
    var lastError error

    for i := 0; i < len(w.regions); i++ {
        err := <-results
        if err == nil {
            successCount++
            if successCount >= w.quorum {
                // Quorum achieved, return success
                // Remaining writes continue in the background
                return nil
            }
        } else {
            lastError = err
        }
    }

    // Quorum not achieved
    return fmt.Errorf("replication failed: %w", lastError)
}
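How large the quorum should be is a design choice; a common default is a simple majority, which tolerates a minority of regions being unreachable. A quick sketch of the arithmetic:

def majority_quorum(region_count: int) -> int:
    """Smallest quorum that still forms a majority."""
    return region_count // 2 + 1

# With 3 regions: quorum = 2, so one region can be down and writes still succeed.
# With 5 regions: quorum = 3, tolerating two unreachable regions.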
Conflict Resolution
Last-Write-Wins
class LastWriteWinsResolver:
    """
    Simple conflict resolution: the newest write wins.
    """
    def merge(self, local_value: dict, remote_value: dict) -> dict:
        """
        Merge conflicting values based on timestamp.
        """
        local_ts = local_value.get('_timestamp', 0)
        remote_ts = remote_value.get('_timestamp', 0)

        if remote_ts > local_ts:
            return remote_value
        elif local_ts > remote_ts:
            return local_value
        else:
            # Same timestamp, use a deterministic tiebreaker:
            # the alphabetically first region wins
            local_region = local_value.get('_region', '')
            remote_region = remote_value.get('_region', '')
            if remote_region < local_region:
                return remote_value
            else:
                return local_value
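A quick usage example, with hypothetical records carrying the _timestamp and _region metadata the resolver expects:

resolver = LastWriteWinsResolver()

local = {'name': 'Ann', '_timestamp': 1700000100.0, '_region': 'us-east-1'}
remote = {'name': 'Anne', '_timestamp': 1700000200.0, '_region': 'eu-west-1'}

merged = resolver.merge(local, remote)
# remote carries the newer timestamp, so merged == remote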
Application-Level Merge
public class ApplicationMergeResolver {

    public User mergeUsers(User local, User remote) {
        // Application-specific merge logic
        User merged = new User(local.getId());

        // For each field, use the newest non-null value
        merged.setName(
            getNewerValue(
                local.getName(), local.getNameUpdatedAt(),
                remote.getName(), remote.getNameUpdatedAt()
            )
        );
        merged.setEmail(
            getNewerValue(
                local.getEmail(), local.getEmailUpdatedAt(),
                remote.getEmail(), remote.getEmailUpdatedAt()
            )
        );

        // For collections, take the union
        Set<String> allPreferences = new HashSet<>();
        allPreferences.addAll(local.getPreferences());
        allPreferences.addAll(remote.getPreferences());
        merged.setPreferences(allPreferences);

        return merged;
    }

    private <T> T getNewerValue(T localVal, Instant localTime,
                                T remoteVal, Instant remoteTime) {
        if (remoteTime.isAfter(localTime)) {
            return remoteVal != null ? remoteVal : localVal;
        } else {
            return localVal != null ? localVal : remoteVal;
        }
    }
}
Traffic Management
DNS-Based Routing
Use Route 53 or a similar DNS service for geographic routing. Two illustrative record sets, one per region (the inline comments are annotations, not valid JSON):

{
  "Name": "api.example.com",
  "Type": "A",
  "SetIdentifier": "US East",
  "GeoLocation": {
    "ContinentCode": "NA"        # North America
  },
  "ResourceRecords": [
    {"Value": "54.123.45.67"}    # US East load balancer
  ],
  "TTL": 60
}
{
  "Name": "api.example.com",
  "Type": "A",
  "SetIdentifier": "EU West",
  "GeoLocation": {
    "ContinentCode": "EU"
  },
  "ResourceRecords": [
    {"Value": "52.98.76.54"}     # EU West load balancer
  ],
  "TTL": 60
}
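If you manage these records programmatically, the equivalent call through boto3 looks roughly like this (the hosted zone ID and addresses are placeholders):

import boto3

route53 = boto3.client('route53')

# Upsert the US East geolocation record; zone ID and IP are placeholders.
route53.change_resource_record_sets(
    HostedZoneId='Z0000000EXAMPLE',
    ChangeBatch={
        'Changes': [{
            'Action': 'UPSERT',
            'ResourceRecordSet': {
                'Name': 'api.example.com',
                'Type': 'A',
                'SetIdentifier': 'US East',
                'GeoLocation': {'ContinentCode': 'NA'},
                'ResourceRecords': [{'Value': '54.123.45.67'}],
                'TTL': 60,
            },
        }]
    },
)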
Application-Level Routing
type GlobalLoadBalancer struct {
    regions map[string]*Region
    metrics *MetricsCollector
}

type Region struct {
    Name      string
    Location  GeoLocation
    Endpoints []string
    Latency   time.Duration
    ErrorRate float64
}

func (glb *GlobalLoadBalancer) SelectRegion(userIP string) *Region {
    // Get the user's geographic location
    userLoc := geoIP.Lookup(userIP)

    // Find candidate regions
    candidates := glb.getCandidateRegions(userLoc, 3)

    // Select the best region based on multiple factors
    best := candidates[0]
    bestScore := glb.scoreRegion(best, userLoc)

    for _, candidate := range candidates[1:] {
        score := glb.scoreRegion(candidate, userLoc)
        if score > bestScore {
            best = candidate
            bestScore = score
        }
    }
    return best
}

func (glb *GlobalLoadBalancer) scoreRegion(region *Region, userLoc GeoLocation) float64 {
    // Weighted scoring: closer, faster, and more reliable regions score higher
    distanceScore := 1.0 / (1.0 + calculateDistance(region.Location, userLoc))
    latencyScore := 1.0 / (1.0 + float64(region.Latency.Milliseconds()))
    reliabilityScore := 1.0 - region.ErrorRate

    // Weights reflect priorities
    score := (distanceScore * 0.4) +
        (latencyScore * 0.3) +
        (reliabilityScore * 0.3)
    return score
}
Monitoring Multi-Region Systems
class MultiRegionMonitor:
    """
    Monitor health and performance across regions.
    """
    def check_region_health(self) -> dict:
        """
        Check the health of all regions.
        """
        health = {}
        for region in self.regions:
            health[region] = {
                'status': self.check_status(region),
                'latency_p99': self.get_latency_p99(region),
                'error_rate': self.get_error_rate(region),
                'replication_lag': self.get_replication_lag(region),
                'capacity_used': self.get_capacity_usage(region)
            }
        return health

    def check_cross_region_latency(self) -> dict:
        """
        Measure latency between regions.
        """
        latencies = {}
        for source in self.regions:
            latencies[source] = {}
            for target in self.regions:
                if source != target:
                    latency = self.ping_region(source, target)
                    latencies[source][target] = latency
        return latencies

    def detect_split_brain(self) -> bool:
        """
        Detect whether any pair of regions cannot communicate.
        """
        # Check that all regions can reach each other
        for source in self.regions:
            for target in self.regions:
                if source != target:
                    if not self.can_reach(source, target):
                        self.alert_split_brain(source, target)
                        return True
        return False

    def monitor_replication_lag(self):
        """
        Alert on high replication lag.
        """
        for target_region in self.regions:
            lag = self.get_replication_lag(target_region)

            if lag > 60:  # 60 seconds
                self.alert_high_replication_lag(
                    target_region,
                    lag_seconds=lag
                )

            # Record the metric
            self.metrics.gauge(
                'replication.lag',
                lag,
                tags={'region': target_region}
            )
Disaster Recovery Procedures
class DisasterRecoveryManager:
    """
    Manage failover between regions.
    """
    def failover_to_region(self, target_region: str):
        """
        Fail over to the target region.
        """
        print(f"Starting failover to {target_region}")

        # 1. Stop writes to the failed region
        self.mark_region_read_only(self.primary_region)

        # 2. Verify the target region is healthy
        if not self.verify_region_health(target_region):
            raise Exception(f"Target region {target_region} is not healthy")

        # 3. Promote the target region to primary
        self.promote_region_to_primary(target_region)

        # 4. Update DNS to point to the new primary
        self.update_dns(target_region)

        # 5. Verify the failover
        time.sleep(10)  # Wait for DNS propagation
        if not self.verify_failover(target_region):
            self.rollback_failover()
            raise Exception("Failover verification failed")

        print(f"Failover to {target_region} completed successfully")

    def automated_failover(self):
        """
        Automatic failover on primary failure.
        """
        while True:
            time.sleep(30)  # Check every 30 seconds

            if not self.check_primary_health():
                # Primary is down, initiate failover
                self.send_alert("Primary region failure detected")

                # Select the best secondary region
                target = self.select_failover_target()

                try:
                    self.failover_to_region(target)
                    self.send_alert(f"Automatic failover to {target} successful")
                except Exception as e:
                    self.send_alert(f"Automatic failover failed: {e}")
Key Takeaways
- Choose pattern based on requirements: Active-passive for DR, active-active for global performance
- Accept eventual consistency: Perfect consistency across regions is expensive
- Monitor replication lag: It’s your early warning system
- Plan for network partitions: Regions will lose connectivity
- Test failover regularly: Untested DR plans don’t work
- Use conflict resolution: Concurrent writes will happen
- Measure cross-region latency: It impacts architecture decisions
Multi-region architectures are complex but necessary at scale. Start with a single region, add a DR secondary, then gradually move to active-active as needed.