Achieving high availability in a single datacenter is challenging. Achieving it across multiple cloud providers while maintaining low latency and managing costs is exponentially harder. This post shares our architecture patterns and operational learnings from running mission-critical systems at 99.99% availability across AWS, GCP, and Azure.
Understanding Availability Requirements
First, let’s be concrete about what the different availability targets mean:
- 99.9% (3 nines): 8.76 hours downtime/year
- 99.95%: 4.38 hours downtime/year
- 99.99% (4 nines): 52.56 minutes downtime/year
- 99.999% (5 nines): 5.26 minutes downtime/year
Each additional nine requires exponentially more investment in architecture, automation, and operational discipline.
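These budgets fall straight out of the target: allowed downtime is simply (1 - availability) multiplied by the length of the period. Below is a minimal, illustrative helper (not part of our production tooling) that turns an availability target into yearly, monthly, and daily budgets:

# Illustrative sketch: convert an availability target into allowed downtime.
def downtime_budget(availability: float) -> dict:
    unavailability = 1.0 - availability
    return {
        "minutes_per_year": unavailability * 365 * 24 * 60,
        "minutes_per_month": unavailability * 30 * 24 * 60,  # 30-day month
        "seconds_per_day": unavailability * 24 * 60 * 60,
    }

# 99.99% -> roughly 52.6 minutes/year, 4.3 minutes/month, 8.6 seconds/day
print(downtime_budget(0.9999))

Seen as a daily budget of under nine seconds, a four-nines target makes it obvious why detection and failover have to be automated rather than human-driven.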
Core Principles
1. Eliminate Single Points of Failure
// Example: Multi-region service discovery
interface ServiceEndpoint {
region: string;
cloud: 'aws' | 'gcp' | 'azure';
endpoint: string;
healthStatus: 'healthy' | 'degraded' | 'unhealthy';
latency: number;
}

interface HealthCheck {
  errorRate: number; // fraction of requests failing, 0 to 1
  latency: number;   // observed latency in milliseconds
}
class MultiCloudServiceDiscovery {
private endpoints: Map<string, ServiceEndpoint[]> = new Map();
async discoverService(serviceName: string): Promise<ServiceEndpoint> {
const candidates = this.endpoints.get(serviceName) || [];
// Filter to healthy endpoints
const healthy = candidates.filter(e => e.healthStatus === 'healthy');
if (healthy.length === 0) {
// Fallback to degraded if no healthy endpoints
const degraded = candidates.filter(e => e.healthStatus === 'degraded');
if (degraded.length > 0) {
return this.selectByLatency(degraded);
}
throw new Error(`No available endpoints for ${serviceName}`);
}
// Select endpoint based on latency and load
return this.selectByLatency(healthy);
}
private selectByLatency(endpoints: ServiceEndpoint[]): ServiceEndpoint {
// Use weighted random selection favoring low latency
const weights = endpoints.map(e => 1 / (e.latency + 1));
const totalWeight = weights.reduce((a, b) => a + b, 0);
let random = Math.random() * totalWeight;
for (let i = 0; i < endpoints.length; i++) {
random -= weights[i];
if (random <= 0) {
return endpoints[i];
}
}
return endpoints[0];
}
async updateHealth(serviceName: string, endpoint: string, health: HealthCheck) {
const endpoints = this.endpoints.get(serviceName);
if (!endpoints) return;
const target = endpoints.find(e => e.endpoint === endpoint);
if (target) {
target.healthStatus = this.evaluateHealth(health);
target.latency = health.latency;
}
}
private evaluateHealth(health: HealthCheck): 'healthy' | 'degraded' | 'unhealthy' {
if (health.errorRate > 0.1) return 'unhealthy';
if (health.errorRate > 0.01 || health.latency > 1000) return 'degraded';
return 'healthy';
}
}
2. Design for Degraded Operation
use std::time::Duration;
use tokio::time::timeout;
pub enum ServiceQuality {
Full,
Degraded,
Minimal,
}
// `Cache`, `Data`, `Error`, and the backend query/enrichment helpers are
// application-specific and omitted here for brevity.
pub struct AdaptiveService {
primary_backend: String,
fallback_backend: String,
cache: Cache,
}
impl AdaptiveService {
pub async fn get_data(&self, key: &str) -> Result<Data, Error> {
// Try full-quality path with timeout
match timeout(Duration::from_millis(100), self.fetch_from_primary(key)).await {
Ok(Ok(data)) => Ok(data),
_ => {
// Fallback to degraded path
self.get_degraded_data(key).await
}
}
}
async fn fetch_from_primary(&self, key: &str) -> Result<Data, Error> {
// Full-quality: real-time data with all enrichments
let data = self.query_database(key).await?;
let enriched = self.enrich_data(data).await?;
Ok(enriched)
}
async fn get_degraded_data(&self, key: &str) -> Result<Data, Error> {
// Try cache first
if let Some(cached) = self.cache.get(key).await {
return Ok(cached);
}
// Try fallback backend
match timeout(Duration::from_millis(200), self.fetch_from_fallback(key)).await {
Ok(Ok(data)) => Ok(data),
_ => {
// Minimal mode: return stale data if available
self.get_stale_data(key).await
}
}
}
async fn get_stale_data(&self, key: &str) -> Result<Data, Error> {
// Accept stale cache entries
self.cache.get_stale(key).await
.ok_or(Error::NoDataAvailable)
}
}
3. Global Load Balancing
from typing import List, Dict
import asyncio
from dataclasses import dataclass
from enum import Enum
class CloudProvider(Enum):
AWS = "aws"
GCP = "gcp"
AZURE = "azure"
@dataclass
class RegionalEndpoint:
provider: CloudProvider
region: str
endpoint: str
capacity: int
current_load: int
health_score: float # 0.0 to 1.0
class GlobalLoadBalancer:
def __init__(self):
self.endpoints: List[RegionalEndpoint] = []
self.health_checker = HealthChecker()
async def route_request(self, request: Request) -> RegionalEndpoint:
"""Route request to optimal endpoint"""
# Get client location
client_region = self.geolocate(request.source_ip)
# Score endpoints based on multiple factors
scored = []
for endpoint in self.endpoints:
if endpoint.health_score < 0.5:
continue # Skip unhealthy
score = self.calculate_score(
endpoint,
client_region,
request.priority
)
scored.append((score, endpoint))
if not scored:
raise NoHealthyEndpointsError()
# Select highest scoring endpoint
        scored.sort(key=lambda pair: pair[0], reverse=True)  # sort by score; endpoints aren't comparable
return scored[0][1]
def calculate_score(
self,
endpoint: RegionalEndpoint,
client_region: str,
priority: str
) -> float:
"""Calculate composite score for endpoint selection"""
# Distance/latency factor (0-1, higher is better)
latency_score = self.get_latency_score(endpoint.region, client_region)
# Capacity factor (0-1, higher is better)
utilization = endpoint.current_load / endpoint.capacity
capacity_score = 1.0 - utilization
# Health factor (0-1, from health checks)
health_score = endpoint.health_score
# Provider diversity (prefer spreading across clouds)
diversity_score = self.get_diversity_score(endpoint.provider)
# Weighted combination
if priority == "latency":
return (
latency_score * 0.5 +
capacity_score * 0.2 +
health_score * 0.25 +
diversity_score * 0.05
)
elif priority == "reliability":
return (
health_score * 0.4 +
capacity_score * 0.3 +
diversity_score * 0.2 +
latency_score * 0.1
)
else: # balanced
return (
latency_score * 0.3 +
capacity_score * 0.25 +
health_score * 0.3 +
diversity_score * 0.15
)
def get_latency_score(self, endpoint_region: str, client_region: str) -> float:
"""Score based on network latency"""
latency_ms = self.estimate_latency(endpoint_region, client_region)
        # Convert latency to a score with a linear falloff:
        # 0 ms -> 1.0, 100 ms -> 0.5, 200 ms or more -> 0.0
        return max(0.0, 1.0 - (latency_ms / 200.0))
async def health_check_loop(self):
"""Continuously monitor endpoint health"""
while True:
tasks = [
self.check_endpoint_health(endpoint)
for endpoint in self.endpoints
]
await asyncio.gather(*tasks)
await asyncio.sleep(10) # Check every 10 seconds
async def check_endpoint_health(self, endpoint: RegionalEndpoint):
"""Perform comprehensive health check"""
checks = await asyncio.gather(
self.health_checker.check_http(endpoint.endpoint),
self.health_checker.check_latency(endpoint.endpoint),
self.health_checker.check_error_rate(endpoint.endpoint),
return_exceptions=True
)
# Calculate composite health score
scores = [c.score if isinstance(c, HealthResult) else 0.0 for c in checks]
endpoint.health_score = sum(scores) / len(scores)
Data Replication Strategies
Asynchronous Multi-Region Replication
package replication
import (
	"context"
	"log"
	"sync"
	"time"
)
type ReplicationConfig struct {
PrimaryRegion string
ReplicaRegions []string
ReplicationLag time.Duration
ConsistencyMode string // "eventual" or "strong"
}
// Database, LagTracker, and ErrNoHealthyReplicas are application-specific
// types omitted here for brevity.
type MultiRegionStore struct {
primary Database
replicas map[string]Database
config ReplicationConfig
// Track replication lag
lagTracker *LagTracker
}
func (s *MultiRegionStore) Write(ctx context.Context, key string, value []byte) error {
// Write to primary
if err := s.primary.Write(ctx, key, value); err != nil {
return err
}
// Async replication to replicas
go s.replicateAsync(key, value)
return nil
}
func (s *MultiRegionStore) replicateAsync(key string, value []byte) {
var wg sync.WaitGroup
for region, replica := range s.replicas {
wg.Add(1)
go func(r string, db Database) {
defer wg.Done()
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
start := time.Now()
if err := db.Write(ctx, key, value); err != nil {
				log.Printf("Replication to %s failed: %v", r, err)
s.lagTracker.RecordFailure(r, key)
} else {
lag := time.Since(start)
s.lagTracker.RecordSuccess(r, lag)
}
}(region, replica)
}
wg.Wait()
}
func (s *MultiRegionStore) Read(ctx context.Context, key string) ([]byte, error) {
// Try primary first
value, err := s.primary.Read(ctx, key)
if err == nil {
return value, nil
}
// Fallback to nearest healthy replica
replica := s.selectHealthyReplica()
if replica == nil {
return nil, ErrNoHealthyReplicas
}
return replica.Read(ctx, key)
}
func (s *MultiRegionStore) selectHealthyReplica() Database {
// Select replica with lowest lag
bestRegion := ""
lowestLag := time.Hour
for region := range s.replicas {
lag := s.lagTracker.GetAverageLag(region)
if lag < lowestLag {
lowestLag = lag
bestRegion = region
}
}
if bestRegion == "" {
return nil
}
return s.replicas[bestRegion]
}
Conflict Resolution for Multi-Master
from typing import Any, Dict, List
from datetime import datetime
from dataclasses import dataclass
@dataclass
class Version:
timestamp: datetime
region: str
value: Any
vector_clock: Dict[str, int]
class ConflictResolver:
def resolve(self, versions: List[Version]) -> Version:
"""Resolve conflicts using vector clocks and timestamps"""
# Check for concurrent writes (conflicting)
concurrent = self.find_concurrent_versions(versions)
if not concurrent:
# No conflict, return latest
return max(versions, key=lambda v: v.timestamp)
# Conflict exists, use application-specific resolution
return self.application_resolve(concurrent)
def find_concurrent_versions(self, versions: List[Version]) -> List[Version]:
"""Find versions that are concurrent (neither causally precedes the other)"""
concurrent = []
for i, v1 in enumerate(versions):
is_concurrent = False
for j, v2 in enumerate(versions):
if i != j and self.is_concurrent(v1, v2):
is_concurrent = True
break
if is_concurrent:
concurrent.append(v1)
return concurrent
    def is_concurrent(self, v1: Version, v2: Version) -> bool:
        """Check if two versions are concurrent using vector clocks"""
        # Compare over the union of regions so entries missing from one
        # clock are treated as zero.
        regions = set(v1.vector_clock) | set(v2.vector_clock)
        v1_precedes_v2 = all(
            v1.vector_clock.get(r, 0) <= v2.vector_clock.get(r, 0)
            for r in regions
        )
        v2_precedes_v1 = all(
            v2.vector_clock.get(r, 0) <= v1.vector_clock.get(r, 0)
            for r in regions
        )
        # Concurrent if neither causally precedes the other
        return not (v1_precedes_v2 or v2_precedes_v1)
def application_resolve(self, versions: List[Version]) -> Version:
"""Application-specific conflict resolution"""
# Strategy 1: Last-write-wins by timestamp
# return max(versions, key=lambda v: v.timestamp)
# Strategy 2: Merge values (for counter/set types)
if isinstance(versions[0].value, int):
            # Counter-style merge: sum the concurrent writes (assumes each
            # version carries only its region's local increments)
merged_value = sum(v.value for v in versions)
return Version(
timestamp=max(v.timestamp for v in versions),
region="merged",
value=merged_value,
vector_clock=self.merge_vector_clocks(versions)
)
# Strategy 3: Application-defined merge function
# return self.custom_merge(versions)
# Default: last-write-wins
return max(versions, key=lambda v: v.timestamp)
def merge_vector_clocks(self, versions: List[Version]) -> Dict[str, int]:
"""Merge multiple vector clocks"""
merged = {}
for version in versions:
for region, counter in version.vector_clock.items():
merged[region] = max(merged.get(region, 0), counter)
return merged
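To make the mechanics concrete, here is a toy walk-through (the regions, timestamps, and clock values are made up for illustration): two regions increment the same counter without seeing each other's write, so neither vector clock dominates and the resolver falls back to the counter merge.

# Illustrative usage of ConflictResolver; all values are invented.
from datetime import datetime

resolver = ConflictResolver()
v_us = Version(
    timestamp=datetime(2024, 1, 1, 12, 0, 0),
    region="us-east-1",
    value=3,  # this region's local increments
    vector_clock={"us-east-1": 2, "eu-west-1": 1},
)
v_eu = Version(
    timestamp=datetime(2024, 1, 1, 12, 0, 1),
    region="eu-west-1",
    value=4,  # this region's local increments
    vector_clock={"us-east-1": 1, "eu-west-1": 2},
)

resolved = resolver.resolve([v_us, v_eu])
# Neither clock dominates, so the writes are concurrent and get summed:
# resolved.value == 7
# resolved.vector_clock == {"us-east-1": 2, "eu-west-1": 2}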
Circuit Breaking and Failover
class CircuitBreaker {
private state: 'closed' | 'open' | 'half-open' = 'closed';
private failureCount: number = 0;
private successCount: number = 0;
private lastFailureTime: number = 0;
constructor(
private failureThreshold: number = 5,
private successThreshold: number = 2,
private timeout: number = 60000, // 60 seconds
) {}
async execute<T>(fn: () => Promise<T>): Promise<T> {
if (this.state === 'open') {
if (Date.now() - this.lastFailureTime > this.timeout) {
this.state = 'half-open';
this.successCount = 0;
} else {
throw new Error('Circuit breaker is open');
}
}
try {
const result = await fn();
this.onSuccess();
return result;
} catch (error) {
this.onFailure();
throw error;
}
}
private onSuccess(): void {
this.failureCount = 0;
if (this.state === 'half-open') {
this.successCount++;
if (this.successCount >= this.successThreshold) {
this.state = 'closed';
this.successCount = 0;
}
}
}
private onFailure(): void {
this.failureCount++;
this.lastFailureTime = Date.now();
if (this.failureCount >= this.failureThreshold) {
this.state = 'open';
}
}
getState(): string {
return this.state;
}
}
// Multi-cloud failover with circuit breakers.
// CloudHttpClient is an illustrative abstraction over each provider's SDK.
interface CloudHttpClient {
  request(data: any): Promise<Response>;
}

class MultiCloudClient {
  private awsBreaker = new CircuitBreaker();
  private gcpBreaker = new CircuitBreaker();
  private azureBreaker = new CircuitBreaker();

  constructor(
    private awsClient: CloudHttpClient,
    private gcpClient: CloudHttpClient,
    private azureClient: CloudHttpClient,
  ) {}
async request(data: any): Promise<Response> {
// Try AWS first
try {
return await this.awsBreaker.execute(() => this.awsClient.request(data));
} catch (error) {
console.log('AWS failed, trying GCP');
}
// Fallback to GCP
try {
return await this.gcpBreaker.execute(() => this.gcpClient.request(data));
} catch (error) {
console.log('GCP failed, trying Azure');
}
// Last resort: Azure
return await this.azureBreaker.execute(() => this.azureClient.request(data));
}
}
Monitoring and Alerting
from prometheus_client import Counter, Histogram, Gauge
import asyncio
class HAMetrics:
def __init__(self):
# Request metrics
self.requests_total = Counter(
'requests_total',
'Total requests',
['region', 'cloud', 'status']
)
# Latency metrics
self.request_duration = Histogram(
'request_duration_seconds',
'Request duration',
['region', 'cloud']
)
# Availability metrics
self.endpoint_health = Gauge(
'endpoint_health_score',
'Endpoint health score',
['region', 'cloud']
)
# Replication lag
self.replication_lag = Gauge(
'replication_lag_seconds',
'Replication lag',
['source_region', 'target_region']
)
# Failover events
self.failovers_total = Counter(
'failovers_total',
'Total failover events',
['from_region', 'to_region', 'reason']
)
def record_request(self, region: str, cloud: str, duration: float, success: bool):
status = 'success' if success else 'error'
self.requests_total.labels(region=region, cloud=cloud, status=status).inc()
self.request_duration.labels(region=region, cloud=cloud).observe(duration)
def update_health(self, region: str, cloud: str, score: float):
self.endpoint_health.labels(region=region, cloud=cloud).set(score)
def record_failover(self, from_region: str, to_region: str, reason: str):
self.failovers_total.labels(
from_region=from_region,
to_region=to_region,
reason=reason
).inc()
class AvailabilityMonitor:
def __init__(self, metrics: HAMetrics):
self.metrics = metrics
self.alert_manager = AlertManager()
async def monitor_loop(self):
while True:
await self.check_availability()
await asyncio.sleep(30) # Check every 30 seconds
async def check_availability(self):
# Check each region
for region, cloud in self.get_regions():
health = await self.check_region_health(region, cloud)
self.metrics.update_health(region, cloud, health.score)
if health.score < 0.5:
await self.alert_manager.send_alert(
severity='critical',
message=f'{cloud}/{region} health degraded: {health.score}',
details=health.details
)
elif health.score < 0.8:
await self.alert_manager.send_alert(
severity='warning',
message=f'{cloud}/{region} health warning: {health.score}',
details=health.details
)
Chaos Engineering
import asyncio
import random
from typing import Callable, Any
class ChaosExperiment:
def __init__(self, name: str, blast_radius: float = 0.1):
self.name = name
self.blast_radius = blast_radius # % of traffic to affect
self.enabled = False
def should_inject_fault(self) -> bool:
return self.enabled and random.random() < self.blast_radius
async def inject_latency(self, fn: Callable, latency_ms: int = 1000) -> Any:
"""Inject artificial latency"""
if self.should_inject_fault():
await asyncio.sleep(latency_ms / 1000)
return await fn()
async def inject_failure(self, fn: Callable, error_rate: float = 1.0) -> Any:
"""Inject failures"""
if self.should_inject_fault() and random.random() < error_rate:
raise Exception(f'Chaos experiment: {self.name}')
return await fn()
async def inject_partial_failure(
self,
fn: Callable,
degradation_factor: float = 0.5
) -> Any:
"""Inject partial/degraded responses"""
result = await fn()
if self.should_inject_fault():
# Return partial result
if isinstance(result, list):
cutoff = int(len(result) * degradation_factor)
return result[:cutoff]
return result
# Usage
class ResilientService:
    def __init__(self, backend_client):
        self.backend_client = backend_client
        self.chaos = ChaosExperiment('region-failure', blast_radius=0.01)
async def call_backend(self):
return await self.chaos.inject_failure(
lambda: self.backend_client.request(),
error_rate=1.0
)
Conclusion
Building highly available, multi-cloud systems comes down to a few core disciplines:
- Eliminate SPOFs: Every component must have redundancy
- Design for degradation: Systems should degrade gracefully rather than fail outright
- Global load balancing: Route traffic optimally across regions and clouds
- Async replication: Accept eventual consistency for availability
- Circuit breaking: Isolate failures to prevent cascading failures
- Comprehensive monitoring: Deep visibility into health and performance
- Chaos engineering: Regularly test failure scenarios
The cost of 99.99% availability is high—both in infrastructure and engineering effort. It’s critical to honestly assess whether you need it, as the jump from 99.9% to 99.99% often means 10x the complexity and cost.