Achieving high availability in a single datacenter is challenging. Achieving it across multiple cloud providers while maintaining low latency and managing costs is exponentially harder. This post shares our architecture patterns and operational learnings from running mission-critical systems at 99.99% availability across AWS, GCP, and Azure.

Understanding Availability Requirements

First, let’s understand what different availability targets mean:

  • 99.9% (3 nines): 8.76 hours downtime/year
  • 99.95%: 4.38 hours downtime/year
  • 99.99% (4 nines): 52.56 minutes downtime/year
  • 99.999% (5 nines): 5.26 minutes downtime/year

Each additional nine requires exponentially more investment in architecture, automation, and operational discipline.
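
How much downtime each target allows is simple arithmetic, and it helps to translate a yearly SLO into the monthly error budget your on-call team actually works against. A quick sketch:

MINUTES_PER_YEAR = 365 * 24 * 60   # 525,600 (ignoring leap years)
MINUTES_PER_MONTH = 30 * 24 * 60   # 43,200 (30-day month)

def downtime_budget_minutes(availability: float, period_minutes: int = MINUTES_PER_YEAR) -> float:
    """Allowed downtime (in minutes) for an availability target over a period."""
    return period_minutes * (1 - availability)

for target in (0.999, 0.9995, 0.9999, 0.99999):
    print(
        f"{target * 100:.3f}%: "
        f"{downtime_budget_minutes(target):8.2f} min/year, "
        f"{downtime_budget_minutes(target, MINUTES_PER_MONTH):6.2f} min/month"
    )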

Core Principles

1. Eliminate Single Points of Failure

// Example: Multi-region service discovery
interface ServiceEndpoint {
  region: string;
  cloud: 'aws' | 'gcp' | 'azure';
  endpoint: string;
  healthStatus: 'healthy' | 'degraded' | 'unhealthy';
  latency: number;
}

// Shape of the health-check result consumed by updateHealth/evaluateHealth
interface HealthCheck {
  errorRate: number; // fraction of failed requests (0.0 to 1.0)
  latency: number;   // observed request latency in milliseconds
}

class MultiCloudServiceDiscovery {
  private endpoints: Map<string, ServiceEndpoint[]> = new Map();

  async discoverService(serviceName: string): Promise<ServiceEndpoint> {
    const candidates = this.endpoints.get(serviceName) || [];

    // Filter to healthy endpoints
    const healthy = candidates.filter(e => e.healthStatus === 'healthy');

    if (healthy.length === 0) {
      // Fallback to degraded if no healthy endpoints
      const degraded = candidates.filter(e => e.healthStatus === 'degraded');
      if (degraded.length > 0) {
        return this.selectByLatency(degraded);
      }
      throw new Error(`No available endpoints for ${serviceName}`);
    }

    // Select endpoint based on latency and load
    return this.selectByLatency(healthy);
  }

  private selectByLatency(endpoints: ServiceEndpoint[]): ServiceEndpoint {
    // Use weighted random selection favoring low latency
    const weights = endpoints.map(e => 1 / (e.latency + 1));
    const totalWeight = weights.reduce((a, b) => a + b, 0);

    let random = Math.random() * totalWeight;
    for (let i = 0; i < endpoints.length; i++) {
      random -= weights[i];
      if (random <= 0) {
        return endpoints[i];
      }
    }

    return endpoints[0];
  }

  async updateHealth(serviceName: string, endpoint: string, health: HealthCheck) {
    const endpoints = this.endpoints.get(serviceName);
    if (!endpoints) return;

    const target = endpoints.find(e => e.endpoint === endpoint);
    if (target) {
      target.healthStatus = this.evaluateHealth(health);
      target.latency = health.latency;
    }
  }

  private evaluateHealth(health: HealthCheck): 'healthy' | 'degraded' | 'unhealthy' {
    if (health.errorRate > 0.1) return 'unhealthy';
    if (health.errorRate > 0.01 || health.latency > 1000) return 'degraded';
    return 'healthy';
  }
}

2. Design for Degraded Operation

use std::time::Duration;
use tokio::time::timeout;

// Quality tiers used below: Full = primary path, Degraded = cache or fallback, Minimal = stale data
pub enum ServiceQuality {
    Full,
    Degraded,
    Minimal,
}

pub struct AdaptiveService {
    primary_backend: String,
    fallback_backend: String,
    cache: Cache,
}

impl AdaptiveService {
    pub async fn get_data(&self, key: &str) -> Result<Data, Error> {
        // Try full-quality path with timeout
        match timeout(Duration::from_millis(100), self.fetch_from_primary(key)).await {
            Ok(Ok(data)) => Ok(data),
            _ => {
                // Fallback to degraded path
                self.get_degraded_data(key).await
            }
        }
    }

    async fn fetch_from_primary(&self, key: &str) -> Result<Data, Error> {
        // Full-quality: real-time data with all enrichments
        let data = self.query_database(key).await?;
        let enriched = self.enrich_data(data).await?;
        Ok(enriched)
    }

    async fn get_degraded_data(&self, key: &str) -> Result<Data, Error> {
        // Try cache first
        if let Some(cached) = self.cache.get(key).await {
            return Ok(cached);
        }

        // Try fallback backend
        match timeout(Duration::from_millis(200), self.fetch_from_fallback(key)).await {
            Ok(Ok(data)) => Ok(data),
            _ => {
                // Minimal mode: return stale data if available
                self.get_stale_data(key).await
            }
        }
    }

    async fn get_stale_data(&self, key: &str) -> Result<Data, Error> {
        // Accept stale cache entries
        self.cache.get_stale(key).await
            .ok_or(Error::NoDataAvailable)
    }
}

3. Global Load Balancing

from typing import List, Dict
import asyncio
from dataclasses import dataclass
from enum import Enum

class CloudProvider(Enum):
    AWS = "aws"
    GCP = "gcp"
    AZURE = "azure"

@dataclass
class RegionalEndpoint:
    provider: CloudProvider
    region: str
    endpoint: str
    capacity: int
    current_load: int
    health_score: float  # 0.0 to 1.0

class GlobalLoadBalancer:
    def __init__(self):
        self.endpoints: List[RegionalEndpoint] = []
        self.health_checker = HealthChecker()

    async def route_request(self, request: Request) -> RegionalEndpoint:
        """Route request to optimal endpoint"""

        # Get client location
        client_region = self.geolocate(request.source_ip)

        # Score endpoints based on multiple factors
        scored = []
        for endpoint in self.endpoints:
            if endpoint.health_score < 0.5:
                continue  # Skip unhealthy

            score = self.calculate_score(
                endpoint,
                client_region,
                request.priority
            )
            scored.append((score, endpoint))

        if not scored:
            raise NoHealthyEndpointsError()

        # Select the highest-scoring endpoint; sort on the score alone so ties
        # never fall through to comparing RegionalEndpoint objects
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return scored[0][1]

    def calculate_score(
        self,
        endpoint: RegionalEndpoint,
        client_region: str,
        priority: str
    ) -> float:
        """Calculate composite score for endpoint selection"""

        # Distance/latency factor (0-1, higher is better)
        latency_score = self.get_latency_score(endpoint.region, client_region)

        # Capacity factor (0-1, higher is better)
        utilization = endpoint.current_load / endpoint.capacity
        capacity_score = 1.0 - utilization

        # Health factor (0-1, from health checks)
        health_score = endpoint.health_score

        # Provider diversity (prefer spreading across clouds)
        diversity_score = self.get_diversity_score(endpoint.provider)

        # Weighted combination
        if priority == "latency":
            return (
                latency_score * 0.5 +
                capacity_score * 0.2 +
                health_score * 0.25 +
                diversity_score * 0.05
            )
        elif priority == "reliability":
            return (
                health_score * 0.4 +
                capacity_score * 0.3 +
                diversity_score * 0.2 +
                latency_score * 0.1
            )
        else:  # balanced
            return (
                latency_score * 0.3 +
                capacity_score * 0.25 +
                health_score * 0.3 +
                diversity_score * 0.15
            )

    def get_latency_score(self, endpoint_region: str, client_region: str) -> float:
        """Score based on network latency"""
        latency_ms = self.estimate_latency(endpoint_region, client_region)

        # Convert latency to a score with a linear falloff:
        # 0 ms -> 1.0, 100 ms -> 0.5, 200 ms and above -> 0.0
        return max(0.0, 1.0 - (latency_ms / 200.0))

    async def health_check_loop(self):
        """Continuously monitor endpoint health"""
        while True:
            tasks = [
                self.check_endpoint_health(endpoint)
                for endpoint in self.endpoints
            ]
            await asyncio.gather(*tasks)
            await asyncio.sleep(10)  # Check every 10 seconds

    async def check_endpoint_health(self, endpoint: RegionalEndpoint):
        """Perform comprehensive health check"""
        checks = await asyncio.gather(
            self.health_checker.check_http(endpoint.endpoint),
            self.health_checker.check_latency(endpoint.endpoint),
            self.health_checker.check_error_rate(endpoint.endpoint),
            return_exceptions=True
        )

        # Calculate composite health score
        scores = [c.score if isinstance(c, HealthResult) else 0.0 for c in checks]
        endpoint.health_score = sum(scores) / len(scores)

Data Replication Strategies

Asynchronous Multi-Region Replication

package replication

import (
    "context"
    "errors"
    "log"
    "sync"
    "time"
)

// ErrNoHealthyReplicas is returned by Read when no replica can serve the request.
var ErrNoHealthyReplicas = errors.New("no healthy replicas available")

type ReplicationConfig struct {
    PrimaryRegion   string
    ReplicaRegions  []string
    ReplicationLag  time.Duration
    ConsistencyMode string // "eventual" or "strong"
}

type MultiRegionStore struct {
    primary   Database
    replicas  map[string]Database
    config    ReplicationConfig

    // Track replication lag
    lagTracker *LagTracker
}

func (s *MultiRegionStore) Write(ctx context.Context, key string, value []byte) error {
    // Write to primary
    if err := s.primary.Write(ctx, key, value); err != nil {
        return err
    }

    // Async replication to replicas
    go s.replicateAsync(key, value)

    return nil
}

func (s *MultiRegionStore) replicateAsync(key string, value []byte) {
    var wg sync.WaitGroup

    for region, replica := range s.replicas {
        wg.Add(1)
        go func(r string, db Database) {
            defer wg.Done()

            ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
            defer cancel()

            start := time.Now()
            if err := db.Write(ctx, key, value); err != nil {
                log.Printf("Replication to %s failed: %v", r, err)
                s.lagTracker.RecordFailure(r, key)
            } else {
                lag := time.Since(start)
                s.lagTracker.RecordSuccess(r, lag)
            }
        }(region, replica)
    }

    wg.Wait()
}

func (s *MultiRegionStore) Read(ctx context.Context, key string) ([]byte, error) {
    // Try primary first
    value, err := s.primary.Read(ctx, key)
    if err == nil {
        return value, nil
    }

    // Fallback to nearest healthy replica
    replica := s.selectHealthyReplica()
    if replica == nil {
        return nil, ErrNoHealthyReplicas
    }

    return replica.Read(ctx, key)
}

func (s *MultiRegionStore) selectHealthyReplica() Database {
    // Select replica with lowest lag
    bestRegion := ""
    lowestLag := time.Hour

    for region := range s.replicas {
        lag := s.lagTracker.GetAverageLag(region)
        if lag < lowestLag {
            lowestLag = lag
            bestRegion = region
        }
    }

    if bestRegion == "" {
        return nil
    }

    return s.replicas[bestRegion]
}

Conflict Resolution for Multi-Master

from typing import Any, Dict, List
from datetime import datetime
from dataclasses import dataclass

@dataclass
class Version:
    timestamp: datetime
    region: str
    value: Any
    vector_clock: Dict[str, int]

class ConflictResolver:
    def resolve(self, versions: List[Version]) -> Version:
        """Resolve conflicts using vector clocks and timestamps"""

        # Check for concurrent writes (conflicting)
        concurrent = self.find_concurrent_versions(versions)

        if not concurrent:
            # No conflict, return latest
            return max(versions, key=lambda v: v.timestamp)

        # Conflict exists, use application-specific resolution
        return self.application_resolve(concurrent)

    def find_concurrent_versions(self, versions: List[Version]) -> List[Version]:
        """Find versions that are concurrent (neither causally precedes the other)"""
        concurrent = []

        for i, v1 in enumerate(versions):
            is_concurrent = False
            for j, v2 in enumerate(versions):
                if i != j and self.is_concurrent(v1, v2):
                    is_concurrent = True
                    break
            if is_concurrent:
                concurrent.append(v1)

        return concurrent

    def is_concurrent(self, v1: Version, v2: Version) -> bool:
        """Check if two versions are concurrent using vector clocks"""
        # Compare over the union of keys; an entry missing from a clock counts as 0
        keys = set(v1.vector_clock) | set(v2.vector_clock)
        v1_precedes_v2 = all(
            v1.vector_clock.get(k, 0) <= v2.vector_clock.get(k, 0) for k in keys
        )
        v2_precedes_v1 = all(
            v2.vector_clock.get(k, 0) <= v1.vector_clock.get(k, 0) for k in keys
        )

        # Concurrent if neither precedes the other
        return not (v1_precedes_v2 or v2_precedes_v1)

    def application_resolve(self, versions: List[Version]) -> Version:
        """Application-specific conflict resolution"""

        # Strategy 1: Last-write-wins by timestamp
        # return max(versions, key=lambda v: v.timestamp)

        # Strategy 2: Merge values (for counter/set types)
        if isinstance(versions[0].value, int):
            # Counter: sum per-region contributions (assumes each version's value
            # holds only that region's increments, not a global total)
            merged_value = sum(v.value for v in versions)
            return Version(
                timestamp=max(v.timestamp for v in versions),
                region="merged",
                value=merged_value,
                vector_clock=self.merge_vector_clocks(versions)
            )

        # Strategy 3: Application-defined merge function
        # return self.custom_merge(versions)

        # Default: last-write-wins
        return max(versions, key=lambda v: v.timestamp)

    def merge_vector_clocks(self, versions: List[Version]) -> Dict[str, int]:
        """Merge multiple vector clocks"""
        merged = {}
        for version in versions:
            for region, counter in version.vector_clock.items():
                merged[region] = max(merged.get(region, 0), counter)
        return merged
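
As a quick illustration with hypothetical regions and values, two counter contributions that each advance only their own region's clock entry are concurrent, so the resolver falls through to the merge strategy rather than last-write-wins:

# Illustrative only: concurrent counter updates from two regions
v_us = Version(
    timestamp=datetime(2024, 1, 1, 12, 0, 0),
    region="us-east",
    value=3,
    vector_clock={"us-east": 2, "eu-west": 1},
)
v_eu = Version(
    timestamp=datetime(2024, 1, 1, 12, 0, 1),
    region="eu-west",
    value=4,
    vector_clock={"us-east": 1, "eu-west": 2},
)

resolver = ConflictResolver()
merged = resolver.resolve([v_us, v_eu])
# Neither clock dominates the other, so the versions are concurrent and the
# integer-merge path applies: value == 7, vector_clock == {"us-east": 2, "eu-west": 2}
print(merged.value, merged.vector_clock)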

Circuit Breaking and Failover

class CircuitBreaker {
  private state: 'closed' | 'open' | 'half-open' = 'closed';
  private failureCount: number = 0;
  private successCount: number = 0;
  private lastFailureTime: number = 0;

  constructor(
    private failureThreshold: number = 5,
    private successThreshold: number = 2,
    private timeout: number = 60000, // 60 seconds
  ) {}

  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'open') {
      if (Date.now() - this.lastFailureTime > this.timeout) {
        this.state = 'half-open';
        this.successCount = 0;
      } else {
        throw new Error('Circuit breaker is open');
      }
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  private onSuccess(): void {
    this.failureCount = 0;

    if (this.state === 'half-open') {
      this.successCount++;
      if (this.successCount >= this.successThreshold) {
        this.state = 'closed';
        this.successCount = 0;
      }
    }
  }

  private onFailure(): void {
    this.failureCount++;
    this.lastFailureTime = Date.now();

    if (this.failureCount >= this.failureThreshold) {
      this.state = 'open';
    }
  }

  getState(): string {
    return this.state;
  }
}

// Multi-cloud failover with circuit breakers
class MultiCloudClient {
  private awsBreaker = new CircuitBreaker();
  private gcpBreaker = new CircuitBreaker();
  private azureBreaker = new CircuitBreaker();

  async request(data: any): Promise<Response> {
    // Try AWS first
    try {
      return await this.awsBreaker.execute(() => this.awsClient.request(data));
    } catch (error) {
      console.log('AWS failed, trying GCP');
    }

    // Fallback to GCP
    try {
      return await this.gcpBreaker.execute(() => this.gcpClient.request(data));
    } catch (error) {
      console.log('GCP failed, trying Azure');
    }

    // Last resort: Azure
    return await this.azureBreaker.execute(() => this.azureClient.request(data));
  }
}

Monitoring and Alerting

from prometheus_client import Counter, Histogram, Gauge
import asyncio

class HAMetrics:
    def __init__(self):
        # Request metrics
        self.requests_total = Counter(
            'requests_total',
            'Total requests',
            ['region', 'cloud', 'status']
        )

        # Latency metrics
        self.request_duration = Histogram(
            'request_duration_seconds',
            'Request duration',
            ['region', 'cloud']
        )

        # Availability metrics
        self.endpoint_health = Gauge(
            'endpoint_health_score',
            'Endpoint health score',
            ['region', 'cloud']
        )

        # Replication lag
        self.replication_lag = Gauge(
            'replication_lag_seconds',
            'Replication lag',
            ['source_region', 'target_region']
        )

        # Failover events
        self.failovers_total = Counter(
            'failovers_total',
            'Total failover events',
            ['from_region', 'to_region', 'reason']
        )

    def record_request(self, region: str, cloud: str, duration: float, success: bool):
        status = 'success' if success else 'error'
        self.requests_total.labels(region=region, cloud=cloud, status=status).inc()
        self.request_duration.labels(region=region, cloud=cloud).observe(duration)

    def update_health(self, region: str, cloud: str, score: float):
        self.endpoint_health.labels(region=region, cloud=cloud).set(score)

    def record_failover(self, from_region: str, to_region: str, reason: str):
        self.failovers_total.labels(
            from_region=from_region,
            to_region=to_region,
            reason=reason
        ).inc()

class AvailabilityMonitor:
    def __init__(self, metrics: HAMetrics):
        self.metrics = metrics
        self.alert_manager = AlertManager()

    async def monitor_loop(self):
        while True:
            await self.check_availability()
            await asyncio.sleep(30)  # Check every 30 seconds

    async def check_availability(self):
        # Check each region
        for region, cloud in self.get_regions():
            health = await self.check_region_health(region, cloud)
            self.metrics.update_health(region, cloud, health.score)

            if health.score < 0.5:
                await self.alert_manager.send_alert(
                    severity='critical',
                    message=f'{cloud}/{region} health degraded: {health.score}',
                    details=health.details
                )
            elif health.score < 0.8:
                await self.alert_manager.send_alert(
                    severity='warning',
                    message=f'{cloud}/{region} health warning: {health.score}',
                    details=health.details
                )

Chaos Engineering

import asyncio
import random
from typing import Callable, Any

class ChaosExperiment:
    def __init__(self, name: str, blast_radius: float = 0.1):
        self.name = name
        self.blast_radius = blast_radius  # % of traffic to affect
        self.enabled = False

    def should_inject_fault(self) -> bool:
        return self.enabled and random.random() < self.blast_radius

    async def inject_latency(self, fn: Callable, latency_ms: int = 1000) -> Any:
        """Inject artificial latency"""
        if self.should_inject_fault():
            await asyncio.sleep(latency_ms / 1000)
        return await fn()

    async def inject_failure(self, fn: Callable, error_rate: float = 1.0) -> Any:
        """Inject failures"""
        if self.should_inject_fault() and random.random() < error_rate:
            raise Exception(f'Chaos experiment: {self.name}')
        return await fn()

    async def inject_partial_failure(
        self,
        fn: Callable,
        degradation_factor: float = 0.5
    ) -> Any:
        """Inject partial/degraded responses"""
        result = await fn()

        if self.should_inject_fault():
            # Return partial result
            if isinstance(result, list):
                cutoff = int(len(result) * degradation_factor)
                return result[:cutoff]

        return result

# Usage
class ResilientService:
    def __init__(self):
        self.chaos = ChaosExperiment('region-failure', blast_radius=0.01)

    async def call_backend(self):
        return await self.chaos.inject_failure(
            lambda: self.backend_client.request(),
            error_rate=1.0
        )

Conclusion

Building highly available multi-cloud systems requires:

  1. Eliminate SPOFs: Every component must have redundancy
  2. Design for degradation: Systems should degrade gracefully rather than fail outright
  3. Global load balancing: Route traffic optimally across regions and clouds
  4. Async replication: Accept eventual consistency for availability
  5. Circuit breaking: Isolate failures to prevent cascade
  6. Comprehensive monitoring: Deep visibility into health and performance
  7. Chaos engineering: Regularly test failure scenarios

The cost of 99.99% availability is high—both in infrastructure and engineering effort. It’s critical to honestly assess whether you need it, as the jump from 99.9% to 99.99% often means 10x the complexity and cost.