Operating 60+ microservices requires a different approach than managing a handful. We grew from 30 to 60+ services while maintaining reliability; these are the patterns and tools that made it possible.
Service Mesh: Essential at Scale
With 60+ services, point-to-point communication becomes unmanageable:
# Istio service mesh configuration
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: user-service
spec:
hosts:
- user-service
http:
- match:
- headers:
experiment:
exact: "new-algorithm"
route:
- destination:
host: user-service
subset: v2
weight: 100
- route:
- destination:
host: user-service
subset: v1
weight: 95
- destination:
host: user-service
subset: v2
weight: 5 # 5% canary traffic
---
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: user-service
spec:
host: user-service
trafficPolicy:
connectionPool:
tcp:
maxConnections: 100
http:
http1MaxPendingRequests: 50
http2MaxRequests: 100
maxRequestsPerConnection: 2
outlierDetection:
consecutiveErrors: 5
interval: 30s
baseEjectionTime: 30s
maxEjectionPercent: 50
subsets:
- name: v1
labels:
version: v1
- name: v2
labels:
version: v2
Service Registry and Discovery
Automating service discovery:
import logging

import consul  # python-consul client (assumed; any Consul client exposing the same API works)

logger = logging.getLogger(__name__)


class ServiceRegistry:
    """
    Automatic service registration and discovery via Consul
    """
    def __init__(self):
        self.consul_client = consul.Consul()
        self.service_info = {}
def register_service(self, service_name: str, host: str, port: int,
tags: list = None, health_check: dict = None):
"""
Register service with health check
"""
service_id = f"{service_name}-{host}-{port}"
self.consul_client.agent.service.register(
name=service_name,
service_id=service_id,
address=host,
port=port,
tags=tags or [],
            # Use the caller-supplied health check if provided, otherwise default to HTTP /health
            check=health_check or {
                'http': f'http://{host}:{port}/health',
                'interval': '10s',
                'timeout': '5s',
                'deregister_critical_service_after': '1m'
            }
)
logger.info(f"Registered {service_name} at {host}:{port}")
def discover_services(self, service_name: str) -> list:
"""
Find healthy instances of a service
"""
_, services = self.consul_client.health.service(
service_name,
passing=True # Only healthy instances
)
return [
{
'id': svc['Service']['ID'],
'address': svc['Service']['Address'],
'port': svc['Service']['Port'],
'tags': svc['Service']['Tags']
}
for svc in services
]
def watch_service(self, service_name: str, callback):
"""
Watch for service changes
"""
index = None
while True:
index, services = self.consul_client.health.service(
service_name,
index=index,
wait='30s'
)
# Notify callback of changes
callback(services)
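A hypothetical usage sketch (host, port, and service names are illustrative): register an instance at startup, then look up healthy instances of a dependency before calling it:
registry = ServiceRegistry()

# Register this instance on startup
registry.register_service('user-service', host='10.0.1.12', port=8080,
                          tags=['v2'])

# Find healthy instances of a dependency before calling it
instances = registry.discover_services('auth-service')
if instances:
    target = instances[0]
    print(f"Calling auth-service at {target['address']}:{target['port']}")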
Dependency Management
Tracking service dependencies:
type DependencyGraph struct {
	services map[string]*Service
	edges    map[string][]string
}

// NewDependencyGraph initializes the maps so AddDependency can be used safely
func NewDependencyGraph() *DependencyGraph {
	return &DependencyGraph{
		services: make(map[string]*Service),
		edges:    make(map[string][]string),
	}
}
type Service struct {
Name string
Version string
Owner string
Dependencies []string
Consumers []string
}
func (g *DependencyGraph) AddDependency(from, to string) {
g.edges[from] = append(g.edges[from], to)
// Update service objects
if fromSvc, ok := g.services[from]; ok {
fromSvc.Dependencies = append(fromSvc.Dependencies, to)
}
if toSvc, ok := g.services[to]; ok {
toSvc.Consumers = append(toSvc.Consumers, from)
}
}
func (g *DependencyGraph) FindTransitiveDependencies(service string) []string {
visited := make(map[string]bool)
var result []string
var dfs func(string)
dfs = func(svc string) {
if visited[svc] {
return
}
visited[svc] = true
if svc != service {
result = append(result, svc)
}
for _, dep := range g.edges[svc] {
dfs(dep)
}
}
dfs(service)
return result
}
func (g *DependencyGraph) DetectCircularDependencies() [][]string {
var cycles [][]string
visited := make(map[string]bool)
recStack := make(map[string]bool)
var detectCycle func(string, []string) bool
detectCycle = func(svc string, path []string) bool {
visited[svc] = true
recStack[svc] = true
path = append(path, svc)
for _, dep := range g.edges[svc] {
if !visited[dep] {
if detectCycle(dep, path) {
return true
}
} else if recStack[dep] {
// Found cycle
cycleStart := 0
for i, s := range path {
if s == dep {
cycleStart = i
break
}
}
cycles = append(cycles, path[cycleStart:])
return true
}
}
recStack[svc] = false
return false
}
for svc := range g.services {
if !visited[svc] {
detectCycle(svc, []string{})
}
}
return cycles
}
// ImpactAnalysis estimates the blast radius if a service goes down by walking the consumer graph transitively.
func (g *DependencyGraph) ImpactAnalysis(service string) ImpactReport {
	visited := map[string]bool{service: true}
	var transitive []string
	var walk func(string)
	walk = func(name string) {
		svc, ok := g.services[name]
		if !ok {
			return
		}
		for _, consumer := range svc.Consumers {
			if !visited[consumer] {
				visited[consumer] = true
				transitive = append(transitive, consumer)
				walk(consumer)
			}
		}
	}
	walk(service)
	return ImpactReport{
		Service:             service,
		DirectConsumers:     g.services[service].Consumers,
		TransitiveConsumers: transitive,
		TotalImpact:         len(transitive) + 1, // including the service itself
	}
}
Standardized Deployment
Deployment automation for 60+ services:
import logging
import time

logger = logging.getLogger(__name__)


class StandardizedDeployer:
    """
    Standard deployment process for all services
    """
def deploy(self, service: str, version: str, environment: str,
rollout_strategy: str = 'rolling'):
"""
Deploy service with standard process
"""
logger.info(f"Deploying {service}:{version} to {environment}")
# 1. Pre-deployment validation
self.validate_deployment(service, version, environment)
# 2. Run pre-deployment tests
self.run_pre_deployment_tests(service, version)
# 3. Update configuration
self.update_config(service, version, environment)
# 4. Execute deployment based on strategy
if rollout_strategy == 'blue-green':
self.blue_green_deploy(service, version, environment)
elif rollout_strategy == 'canary':
self.canary_deploy(service, version, environment)
else:
self.rolling_deploy(service, version, environment)
# 5. Health check
if not self.health_check(service, environment, timeout=300):
self.rollback(service, environment)
raise DeploymentFailedError("Health check failed")
# 6. Run smoke tests
self.run_smoke_tests(service, environment)
# 7. Update service registry
self.update_registry(service, version, environment)
# 8. Notify stakeholders
self.notify_deployment_complete(service, version, environment)
def canary_deploy(self, service: str, version: str, environment: str):
"""
Gradually roll out new version
"""
stages = [
{'percentage': 5, 'duration': 300}, # 5% for 5 minutes
{'percentage': 25, 'duration': 600}, # 25% for 10 minutes
{'percentage': 50, 'duration': 600}, # 50% for 10 minutes
{'percentage': 100, 'duration': 0} # 100%
]
for stage in stages:
percentage = stage['percentage']
duration = stage['duration']
logger.info(f"Canary stage: {percentage}% traffic to {version}")
# Update traffic split
self.update_traffic_split(service, environment, {
'old': 100 - percentage,
'new': percentage
})
if duration > 0:
# Monitor during canary phase
time.sleep(duration)
# Check metrics
metrics = self.get_canary_metrics(service, environment, version)
if not self.are_metrics_healthy(metrics):
logger.error("Canary metrics unhealthy, rolling back")
self.rollback(service, environment)
raise DeploymentFailedError("Canary metrics failed")
logger.info(f"Canary deployment complete: {service}:{version}")
def are_metrics_healthy(self, metrics: dict) -> bool:
"""
Check if canary metrics are acceptable
"""
# Compare new version vs old version
error_rate_increase = (
metrics['new_error_rate'] - metrics['old_error_rate']
)
latency_increase = (
metrics['new_p99_latency'] - metrics['old_p99_latency']
)
return (
error_rate_increase < 0.01 and # <1% error rate increase
latency_increase < 100 and # <100ms latency increase
metrics['new_error_rate'] < 0.05 # <5% absolute error rate
)
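The canary stages above rely on update_traffic_split, which isn't shown here. A minimal sketch of one way it could work, assuming the service's Istio VirtualService has old/new subsets and using the official kubernetes Python client (names and structure are assumptions, and it replaces the whole http routing block for simplicity):
from kubernetes import client, config


def update_traffic_split(service: str, namespace: str, split: dict):
    """Patch the Istio VirtualService to split traffic between 'old' and 'new' subsets (sketch)."""
    config.load_kube_config()  # or config.load_incluster_config() when running inside the cluster
    api = client.CustomObjectsApi()
    patch = {
        'spec': {
            'http': [{
                'route': [
                    {'destination': {'host': service, 'subset': 'old'},
                     'weight': split['old']},
                    {'destination': {'host': service, 'subset': 'new'},
                     'weight': split['new']},
                ],
            }],
        },
    }
    api.patch_namespaced_custom_object(
        group='networking.istio.io',
        version='v1beta1',
        namespace=namespace,
        plural='virtualservices',
        name=service,
        body=patch,
    )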
Cross-Service Observability
Tracking requests across 60+ services:
public class DistributedTracing {
private final Tracer tracer;
    public Response handleRequest(Request request) {
// Extract trace context from incoming request
SpanContext parentContext = tracer.extract(
Format.Builtin.HTTP_HEADERS,
new TextMapAdapter(request.getHeaders())
);
// Start span for this service
Span span = tracer.buildSpan("handle-request")
.asChildOf(parentContext)
.withTag("service", "user-service")
.withTag("user_id", request.getUserId())
.withTag("endpoint", request.getEndpoint())
.start();
try (Scope scope = tracer.activateSpan(span)) {
// Call downstream services
String userData = callUserDataService(request.getUserId());
String activityData = callActivityService(request.getUserId());
// Process
Response response = processRequest(userData, activityData);
// Tag with response info
span.setTag("status_code", response.getStatusCode());
return response;
} catch (Exception e) {
span.setTag("error", true);
span.log(Map.of(
"event", "error",
"error.object", e,
"message", e.getMessage(),
"stack", getStackTrace(e)
));
throw e;
} finally {
span.finish();
}
}
private String callUserDataService(String userId) {
// Create child span for downstream call
Span span = tracer.buildSpan("call-user-data-service")
.asChildOf(tracer.activeSpan())
.withTag("span.kind", "client")
.withTag("downstream_service", "user-data-service")
.start();
try (Scope scope = tracer.activateSpan(span)) {
// Inject trace context into downstream request
Map<String, String> headers = new HashMap<>();
tracer.inject(
span.context(),
Format.Builtin.HTTP_HEADERS,
new TextMapAdapter(headers)
);
// Make HTTP call with trace headers
return httpClient.get(
"http://user-data-service/users/" + userId,
headers
);
} finally {
span.finish();
}
}
}
API Gateway
Single entry point for 60+ backend services:
// API Gateway with routing, auth, rate limiting
// Note: verifyToken, RateLimiter, and logger are application helpers assumed to exist elsewhere
import express from 'express';
import { createProxyMiddleware } from 'http-proxy-middleware';

const app = express();
// Service routes
const services = {
'users': 'http://user-service:8080',
'products': 'http://product-service:8080',
'orders': 'http://order-service:8080',
'analytics': 'http://analytics-service:8080',
// ... 56 more services
};
// Authentication middleware
app.use(async (req, res, next) => {
const token = req.headers['authorization'];
if (!token) {
return res.status(401).json({ error: 'No authorization token' });
}
try {
const user = await verifyToken(token);
req.user = user;
next();
} catch (error) {
return res.status(401).json({ error: 'Invalid token' });
}
});
// Rate limiting
const rateLimiter = new RateLimiter({
windowMs: 60 * 1000, // 1 minute
max: 100 // 100 requests per minute per user
});
app.use(async (req, res, next) => {
const userId = req.user.id;
const allowed = await rateLimiter.checkLimit(userId);
if (!allowed) {
return res.status(429).json({
error: 'Rate limit exceeded',
retryAfter: rateLimiter.getRetryAfter(userId)
});
}
next();
});
// Dynamic routing based on path
app.use('/api/:service/*', (req, res, next) => {
const service = req.params.service;
const targetUrl = services[service];
if (!targetUrl) {
return res.status(404).json({
error: `Service not found: ${service}`
});
}
// Proxy to target service
return createProxyMiddleware({
target: targetUrl,
pathRewrite: {
[`^/api/${service}`]: ''
},
onProxyReq: (proxyReq, req) => {
// Add user context to downstream request
proxyReq.setHeader('X-User-Id', req.user.id);
proxyReq.setHeader('X-User-Roles', req.user.roles.join(','));
},
onError: (err, req, res) => {
logger.error('Proxy error', { service, error: err });
res.status(502).json({ error: 'Service unavailable' });
}
})(req, res, next);
});
Service Ownership Model
Clear ownership with 60+ services:
# service-ownership.yaml
services:
user-service:
owner:
team: identity-team
slack: "#identity-team"
oncall: identity-oncall@company.com
slo:
availability: 99.9%
latency_p99: 100ms
dependencies:
- auth-service
- database-service
docs: https://docs.company.com/user-service
product-service:
owner:
team: catalog-team
slack: "#catalog-team"
oncall: catalog-oncall@company.com
slo:
availability: 99.95%
latency_p99: 50ms
dependencies:
- inventory-service
- pricing-service
docs: https://docs.company.com/product-service
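To keep ownership data honest as the catalog grows, a CI check can fail the build when any service is missing an owner, SLO, or docs link. A minimal sketch assuming the file above and PyYAML (file name and required fields are assumptions):
import sys

import yaml  # PyYAML

REQUIRED_FIELDS = ('owner', 'slo', 'docs')


def validate_ownership(path: str = 'service-ownership.yaml') -> int:
    with open(path) as f:
        services = yaml.safe_load(f)['services']
    errors = []
    for name, spec in services.items():
        for field in REQUIRED_FIELDS:
            if field not in spec:
                errors.append(f"{name}: missing '{field}'")
        if 'owner' in spec and not spec['owner'].get('oncall'):
            errors.append(f"{name}: owner has no oncall contact")
    for err in errors:
        print(err)
    return 1 if errors else 0


if __name__ == '__main__':
    sys.exit(validate_ownership())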
Chaos Engineering
Testing resilience with many services:
import logging
import random

logger = logging.getLogger(__name__)


class ChaosExperiments:
    """
    Chaos engineering for microservices
    """
def kill_random_instance(self, service: str):
"""
Kill random instance of a service
"""
instances = self.get_service_instances(service)
target = random.choice(instances)
logger.info(f"Chaos: Killing {target}")
self.k8s_client.delete_pod(target)
# Monitor impact
self.monitor_impact(service, duration=300)
def inject_latency(self, service: str, latency_ms: int, percentage: int):
"""
Add latency to service calls
"""
# Update Istio VirtualService
self.istio_client.apply_fault_injection(
service=service,
fault={
'delay': {
'percentage': {'value': percentage},
'fixedDelay': f'{latency_ms}ms'
}
}
)
logger.info(f"Injected {latency_ms}ms latency to {percentage}% of {service} traffic")
def partition_network(self, service_a: str, service_b: str):
"""
Create network partition between services
"""
# Block traffic using network policies
self.k8s_client.apply_network_policy({
'apiVersion': 'networking.k8s.io/v1',
'kind': 'NetworkPolicy',
'metadata': {'name': f'chaos-partition-{service_a}-{service_b}'},
'spec': {
'podSelector': {'matchLabels': {'app': service_a}},
'policyTypes': ['Egress'],
'egress': [{
'to': [{
'podSelector': {
'matchExpressions': [{
'key': 'app',
'operator': 'NotIn',
'values': [service_b]
}]
}
}]
}]
}
})
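A hypothetical game-day run combining the experiments above (service names and parameters are illustrative):
chaos = ChaosExperiments()

# Verify the mesh reroutes around a lost instance
chaos.kill_random_instance('product-service')

# Check that callers' timeouts and circuit breakers absorb a slow dependency
chaos.inject_latency('pricing-service', latency_ms=500, percentage=10)

# Confirm order-service degrades gracefully when inventory-service is unreachable
chaos.partition_network('order-service', 'inventory-service')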
Key Takeaways
- Service mesh is essential: Don’t manage service-to-service communication manually
- Automate everything: Deployment, monitoring, discovery
- Track dependencies: Know what depends on what
- Distributed tracing is critical: You cannot debug 60+ services without it
- Clear ownership: Every service must have a team
- Standard patterns: Don’t let each service be a snowflake
- Chaos engineering: Test failure scenarios regularly
Operating 60+ microservices is manageable with the right tools and patterns. Focus on automation, observability, and standardization.