Platform engineering has emerged as a critical discipline for organizations building distributed systems at scale. This post presents a maturity model for platform engineering and practical guidance for evolving your internal developer platform (IDP).
The Platform Engineering Maturity Model
Level 0: Manual Operations
- Ad-hoc scripts
- Manual deployments
- Tribal knowledge
- No self-service
Level 1: Basic Automation
- CI/CD pipelines
- Infrastructure as code
- Basic monitoring
- Limited self-service
Level 2: Platform Services
- Service catalog
- Golden paths
- Observability platform
- Developer portal
Level 3: Self-Service Platform
- Full self-service capabilities
- Policy enforcement
- Cost management
- Developer experience focus
Level 4: Adaptive Platform
- AI-assisted operations
- Automated optimization
- Predictive scaling
- Continuous improvement
Building the Foundation: Infrastructure as Code
# Terraform module call for a standardized service deployment.
# Every argument below is passed to the local "./modules/service" module.
module "service" {
  source      = "./modules/service"
  name        = "user-service"
  environment = "production"

  # Compute
  instance_type = "c5.2xlarge"
  min_instances = 3
  max_instances = 20

  # Networking
  # NOTE(review): the aws_subnet_ids data source is deprecated in newer AWS
  # provider releases in favour of aws_subnets — confirm the provider version.
  vpc_id     = data.aws_vpc.main.id
  subnet_ids = data.aws_subnet_ids.private.ids

  # Observability (automatically configured)
  enable_monitoring  = true
  enable_tracing     = true
  log_retention_days = 30

  # Security (enforced by policy)
  enable_encryption = true
  enable_waf        = true

  # Cost management
  cost_center = "engineering"
  owner_team  = "platform"
}
# Policy enforcement via Sentinel
# NOTE(review): this reads as illustrative syntax — confirm it against the
# Sentinel/policy tooling actually in use before copying it.
# Blocks any resource that lacks an "owner" or a "cost_center" tag.
policy "required_tags" {
  # "hard-mandatory": presumably the check cannot be overridden — TODO confirm.
  enforcement_level = "hard-mandatory"
  rule {
    # Both tag checks must hold for the rule to pass.
    condition = all(resource.tags contains "owner") and
                all(resource.tags contains "cost_center")
    message = "Resources must have owner and cost_center tags"
  }
}
# Blocks any resource whose encryption_enabled attribute is not true.
# NOTE(review): illustrative syntax — confirm against the policy tooling in use.
policy "encryption_required" {
  enforcement_level = "hard-mandatory"
  rule {
    # Evaluated across all resources; a single unencrypted one fails the rule.
    condition = all(resource.encryption_enabled == true)
    message = "All resources must have encryption enabled"
  }
}
Service Catalog and Golden Paths
# service-catalog.yaml
# Golden-path template for a standard web service: declares the parameters a
# team supplies, the resources rendered from them, and the deploy/rollback
# workflows.
apiVersion: platform.dev/v1
kind: ServiceTemplate
metadata:
  name: web-service
  description: 'Standard web service with best practices'
spec:
  # Inputs supplied by the requesting team.
  parameters:
    - name: service_name
      type: string
      required: true
    - name: team
      type: string
      required: true
    - name: languages
      type: enum
      values:
        - go
        - rust
        - python
        - java
    - name: database
      type: enum
      values:
        - postgres
        - mysql
        - dynamodb
        - none
      default: postgres

  # Resources rendered for every service created from this template.
  resources:
    # NOTE(review): the template below references `.image`, which is not one
    # of the declared parameters — presumably injected by the build_image
    # workflow step; confirm.
    - type: compute
      template: |
        apiVersion: apps/v1
        kind: Deployment
        metadata:
          name: {{ .service_name }}
          labels:
            app: {{ .service_name }}
            team: {{ .team }}
        spec:
          replicas: 3
          selector:
            matchLabels:
              app: {{ .service_name }}
          template:
            metadata:
              labels:
                app: {{ .service_name }}
            spec:
              containers:
              - name: app
                image: {{ .image }}
                resources:
                  requests:
                    memory: "512Mi"
                    cpu: "500m"
                  limits:
                    memory: "1Gi"
                    cpu: "1000m"

    # Exposes the Deployment's pods on port 80, forwarding to container port 8080.
    - type: networking
      template: |
        apiVersion: v1
        kind: Service
        metadata:
          name: {{ .service_name }}
        spec:
          selector:
            app: {{ .service_name }}
          ports:
          - port: 80
            targetPort: 8080

    - type: observability
      automatic: true
      includes: [metrics, logs, traces, alerts]

    # Only rendered when the `database` parameter is not "none".
    - type: database
      condition: .database != "none"
      template: |
        # Provision managed database
        # Configure connection secrets
        # Set up backups

  # Named step sequences the platform runs for this service.
  workflows:
    - name: deploy
      steps:
        - validate_config
        - run_tests
        - build_image
        - deploy_canary
        - run_smoke_tests
        - promote_production
    - name: rollback
      steps:
        - identify_previous_version
        - deploy_previous_version
        - verify_health
Developer Portal and Self-Service
// Developer portal API
import express from 'express';
import { PlatformOrchestrator } from './orchestrator';
const app = express();
// Parse JSON request bodies. Without a body-parsing middleware Express leaves
// `req.body` undefined, so the POST /api/services handler could not read its
// payload.
app.use(express.json());
const orchestrator = new PlatformOrchestrator();
/**
 * POST /api/services — create a new service.
 *
 * Flow: validate the request, check the team's quota, then provision.
 * Responses:
 *   - 200 with the service summary on success
 *   - 400 `{ error, errors }` when validation fails
 *   - 403 `{ error, current, limit }` when the team's quota is exceeded
 *   - 500 `{ error }` when any orchestrator call rejects
 */
app.post('/api/services', async (req, res) => {
  try {
    // `req.body` is undefined when no body parser ran or the payload was
    // empty; fall back to an empty object so validation reports the problem
    // instead of the destructuring throwing.
    const { name, team, template, parameters } = req.body ?? {};
    const request = { name, team, template, parameters };

    // Validate request
    const validation = await orchestrator.validateServiceRequest(request);
    if (!validation.isValid) {
      // `error` is included alongside `errors` so every failure response
      // carries the same top-level key that clients read.
      res.status(400).json({ error: 'Validation failed', errors: validation.errors });
      return;
    }

    // Check resource quotas
    const quotaCheck = await orchestrator.checkQuotas(team);
    if (!quotaCheck.allowed) {
      res.status(403).json({
        error: 'Quota exceeded',
        current: quotaCheck.current,
        limit: quotaCheck.limit,
      });
      return;
    }

    // Provision service
    const service = await orchestrator.provisionService(request);

    // Return service details
    res.json({
      id: service.id,
      name: service.name,
      status: 'provisioning',
      endpoints: service.endpoints,
      dashboards: service.dashboards,
      documentation: service.documentation,
    });
  } catch (err: unknown) {
    // An async handler's rejection is not turned into a response by Express 4,
    // so it is handled here; details are logged server-side only.
    console.error('Failed to create service', err);
    res.status(500).json({ error: 'Failed to create service' });
  }
});
/**
 * GET /api/services/:id — report status, health, key metrics and cost for one
 * service.
 *
 * Responses:
 *   - 200 with the service summary
 *   - 404 `{ error }` when the orchestrator returns no service
 *   - 500 `{ error }` when the lookup rejects
 *
 * NOTE(review): how `getService` signals an unknown id is not visible here,
 * so both an empty result and a rejection are handled.
 */
app.get('/api/services/:id', async (req, res) => {
  try {
    const service = await orchestrator.getService(req.params.id);
    if (!service) {
      res.status(404).json({ error: 'Service not found' });
      return;
    }

    res.json({
      id: service.id,
      name: service.name,
      status: service.status,
      health: service.health,
      metrics: {
        requests_per_second: service.metrics.rps,
        error_rate: service.metrics.errorRate,
        latency_p99: service.metrics.p99,
      },
      cost: {
        current_month: service.cost.currentMonth,
        projected_month: service.cost.projected,
      },
    });
  } catch (err: unknown) {
    // An async handler's rejection is not turned into a response by Express 4.
    console.error('Failed to load service', err);
    res.status(500).json({ error: 'Failed to load service' });
  }
});
/**
 * Drives service provisioning: generates configuration, provisions
 * infrastructure, then wires up observability, CI/CD and documentation.
 *
 * Helper methods and the `alertManager` member used below are not defined in
 * this excerpt.
 */
class PlatformOrchestrator {
  /**
   * Provisions a service end to end.
   *
   * Steps run one after another: config generation, infrastructure,
   * observability, CI/CD, documentation.
   *
   * @param request - The service request to provision.
   * @returns The provisioned service, reported with status `'active'`.
   */
  async provisionService(request: ServiceRequest): Promise<Service> {
    const config = await this.generateConfig(request);
    const infra = await this.provisionInfrastructure(config);

    await this.setupObservability(infra);
    await this.setupCICD(infra);
    await this.generateDocumentation(infra);

    const service: Service = {
      id: infra.id,
      name: request.name,
      status: 'active',
      endpoints: infra.endpoints,
      dashboards: this.getDashboardURLs(infra),
      documentation: this.getDocumentationURL(infra),
    };
    return service;
  }

  /**
   * Configures metrics, logs and tracing, then creates the default dashboards
   * and alerts. Each step is awaited before the next one starts.
   *
   * @param infra - The infrastructure to instrument.
   */
  async setupObservability(infra: Infrastructure): Promise<void> {
    await this.configureMetrics(infra); // metrics
    await this.configureLogs(infra); // log aggregation
    await this.configureTracing(infra); // distributed tracing
    await this.createDashboards(infra); // default dashboards
    await this.createAlerts(infra); // default alerts
  }

  /**
   * Registers the three default alerts (error rate, availability, latency)
   * with the alert manager, one at a time, tagged with the service and team
   * taken from `infra`.
   *
   * @param infra - The infrastructure whose `name` and `team` label the alerts.
   */
  async createAlerts(infra: Infrastructure): Promise<void> {
    // Columns: name, condition, duration, severity.
    const rules: [string, string, string, string][] = [
      ['High Error Rate', 'error_rate > 0.01', '5m', 'warning'],
      ['Service Down', 'up == 0', '1m', 'critical'],
      ['High Latency', 'latency_p99 > 1000', '10m', 'warning'],
    ];

    for (const [name, condition, duration, severity] of rules) {
      await this.alertManager.createAlert({
        name,
        condition,
        duration,
        severity,
        service: infra.name,
        team: infra.team,
      });
    }
  }
}
Policy as Code
from typing import List, Dict
from dataclasses import dataclass
@dataclass
class PolicyViolation:
    """One failed policy check against one resource."""

    # Name of the policy that failed.
    rule: str
    # Severity label carried over from the policy.
    severity: str
    # Human-readable explanation of the failure.
    message: str
    # Name of the offending resource.
    resource: str
class PolicyEngine:
    """Holds a list of policies and checks resources against all of them."""

    def __init__(self):
        # Policies are evaluated in the order they were added.
        self.policies: List[Policy] = []

    def add_policy(self, policy: 'Policy'):
        """Register ``policy`` so later ``evaluate`` calls include it."""
        self.policies.append(policy)

    def evaluate(self, resource: Dict) -> List[PolicyViolation]:
        """Return one violation per policy that ``resource`` fails.

        The violation's ``resource`` field is the resource's ``'name'`` entry,
        or ``'unknown'`` when that key is absent. An empty list means every
        policy passed.
        """
        return [
            PolicyViolation(
                rule=failed.name,
                severity=failed.severity,
                message=failed.message,
                resource=resource.get('name', 'unknown'),
            )
            for failed in self.policies
            if not failed.evaluate(resource)
        ]
class Policy:
    """Base class for a named rule; subclasses implement ``evaluate``."""

    def __init__(self, name: str, severity: str, message: str):
        self.name, self.severity, self.message = name, severity, message

    def evaluate(self, resource: Dict) -> bool:
        """Return True when ``resource`` complies; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
# Example policies
class RequireTagsPolicy(Policy):
    """Passes only when a resource carries every tag in ``required_tags``."""

    def __init__(self, required_tags: List[str]):
        super().__init__(
            name="required_tags",
            severity="error",
            message=f"Resources must have tags: {', '.join(required_tags)}"
        )
        # Copy so later mutation of the caller's list cannot make the check
        # drift away from the message built above.
        self.required_tags = list(required_tags)

    def evaluate(self, resource: Dict) -> bool:
        """Return True when all required tags are present on ``resource``.

        A missing ``'tags'`` entry and an explicit ``None`` are both treated
        as "no tags", so the policy fails cleanly instead of raising
        ``TypeError`` on the membership test.
        """
        tags = resource.get('tags') or {}
        return all(tag in tags for tag in self.required_tags)
class EnforceEncryptionPolicy(Policy):
    """Requires ``encryption_enabled`` on ``database`` and ``storage`` resources."""

    def __init__(self):
        super().__init__(
            name="enforce_encryption",
            severity="error",
            message="All data stores must have encryption enabled"
        )

    def evaluate(self, resource: Dict) -> bool:
        """Return True when ``resource`` complies.

        Resources whose ``'type'`` is neither ``'database'`` nor ``'storage'``
        always pass. For data stores the ``'encryption_enabled'`` value is
        coerced to ``bool`` so the declared return type holds even when the
        spec contains ``None`` or another non-boolean value.
        """
        if resource.get('type') not in ['database', 'storage']:
            return True
        return bool(resource.get('encryption_enabled', False))
class CostLimitPolicy(Policy):
    """Warns when a resource's estimated monthly cost exceeds a ceiling."""

    def __init__(self, max_monthly_cost: float):
        super().__init__(
            name="cost_limit",
            severity="warning",
            message=f"Estimated monthly cost exceeds ${max_monthly_cost}",
        )
        self.max_cost = max_monthly_cost

    def evaluate(self, resource: Dict) -> bool:
        """Return True when the estimate is at or below the ceiling.

        A resource without an ``'estimated_monthly_cost'`` entry is treated as
        costing 0.
        """
        return resource.get('estimated_monthly_cost', 0) <= self.max_cost
# Usage: register the example policies, evaluate one resource spec, and print
# a line per violation.
engine = PolicyEngine()
engine.add_policy(RequireTagsPolicy(['owner', 'cost_center', 'environment']))
engine.add_policy(EnforceEncryptionPolicy())
engine.add_policy(CostLimitPolicy(10000))

# `resource_spec` is expected to be defined by the surrounding code.
violations = engine.evaluate(resource_spec)

# Nothing is printed when the list is empty.
for v in violations:
    print(f"{v.severity.upper()}: {v.message} (resource: {v.resource})")
Cost Management and Optimization
package cost
import (
"time"
)
// CostAnalyzer computes per-resource cost, utilization and savings
// recommendations, and can apply the lowest-risk ones automatically.
type CostAnalyzer struct {
	// NOTE(review): not referenced by the methods shown here; presumably the
	// data source for the cost/utilization helpers — confirm.
	metricsDB MetricsDatabase
}
// ResourceCost is the per-resource result built by AnalyzeResources: the
// resource's identity and owning team, its daily and monthly cost, its
// utilization, and the recommendations generated for it.
type ResourceCost struct {
	ResourceID, ResourceType, Team      string
	DailyCost, MonthlyCost, Utilization float64
	Recommendations                     []Recommendation
}
// Recommendation describes one suggested cost optimization: its kind
// (for example "rightsize"), a human-readable description, and the estimated
// savings.
type Recommendation struct {
	Type, Description string
	Savings           float64
}
// AnalyzeResources returns one ResourceCost per resource reported by
// getAllResources, in the same order. Each entry combines the resource's
// calculated cost, its utilization, and the recommendations derived from them.
// The result is never nil; it is empty when there are no resources.
func (ca *CostAnalyzer) AnalyzeResources() []ResourceCost {
	resources := ca.getAllResources()

	// Exactly one entry is appended per resource, so size the slice up front
	// instead of letting append regrow it.
	costs := make([]ResourceCost, 0, len(resources))

	for _, resource := range resources {
		// Calculate actual cost
		cost := ca.calculateCost(resource)

		// Calculate utilization
		utilization := ca.calculateUtilization(resource)

		// Generate recommendations
		recommendations := ca.generateRecommendations(resource, utilization)

		costs = append(costs, ResourceCost{
			ResourceID:      resource.ID,
			ResourceType:    resource.Type,
			Team:            resource.Team,
			DailyCost:       cost.Daily,
			MonthlyCost:     cost.Monthly,
			Utilization:     utilization,
			Recommendations: recommendations,
		})
	}
	return costs
}
// generateRecommendations returns the savings suggestions that apply to
// resource, in a fixed order:
//
//   - "rightsize" when utilization is below 0.3
//   - "spot_instance" for compute resources not already on spot
//   - "reserved_capacity" when hasConsistentUsage reports consistent usage
//     over the lookback window below
//
// The returned slice is never nil.
func (ca *CostAnalyzer) generateRecommendations(
	resource Resource,
	utilization float64,
) []Recommendation {
	// Lookback window for the reserved-capacity check (6 x 30 days).
	const usageWindow = 6 * 30 * 24 * time.Hour

	out := []Recommendation{}

	// suggest estimates the savings for strategy and records the recommendation.
	suggest := func(kind, description, strategy string) {
		out = append(out, Recommendation{
			Type:        kind,
			Description: description,
			Savings:     ca.estimateSavings(resource, strategy),
		})
	}

	if utilization < 0.3 {
		suggest("rightsize", "Resource is underutilized, consider downsizing", "downsize")
	}
	if resource.Type == "compute" && !resource.IsSpot {
		suggest("spot_instance", "Consider using spot instances for non-critical workloads", "spot")
	}
	if ca.hasConsistentUsage(resource, usageWindow) {
		suggest("reserved_capacity", "Consistent usage detected, reserved capacity would be cheaper", "reserved")
	}
	return out
}
// AutoOptimize analyzes all resources and acts on the low-risk subset of
// recommendations: "rightsize" suggestions for resources whose utilization is
// below 0.2. With dryRun set, nothing is applied and the matching
// optimizations are only reported (Applied is false). The returned slice is
// never nil.
func (ca *CostAnalyzer) AutoOptimize(dryRun bool) []OptimizationResult {
	results := []OptimizationResult{}
	apply := !dryRun

	for _, analyzed := range ca.AnalyzeResources() {
		for _, rec := range analyzed.Recommendations {
			// Skip everything except low-risk right-sizing.
			if rec.Type != "rightsize" || !(analyzed.Utilization < 0.2) {
				continue
			}
			if apply {
				ca.applyOptimization(analyzed.ResourceID, rec)
			}
			results = append(results, OptimizationResult{
				ResourceID: analyzed.ResourceID,
				Action:     rec.Type,
				Savings:    rec.Savings,
				Applied:    apply,
			})
		}
	}
	return results
}
Developer Experience: CLI Tool
# Platform CLI tool
import click
import requests
from rich.console import Console
from rich.table import Table
# Shared Rich console; every command prints through it.
console = Console()
# Root command group; subcommands attach to it via @cli.command().
@click.group()
def cli():
    """Internal Developer Platform CLI"""
@cli.command()
@click.option('--name', required=True, help='Service name')
@click.option('--team', required=True, help='Team name')
@click.option('--template', default='web-service', help='Service template')
def create(name: str, team: str, template: str):
    """Create a new service"""
    # Posts the request to the platform API and prints either the new
    # service's details or the reason for failure. Exits with status 1 on any
    # failure so scripts can detect it.
    console.print(f"[bold]Creating service: {name}[/bold]")

    # Call platform API. Without a timeout `requests` waits indefinitely; the
    # read timeout is generous because the request may stay open for the
    # duration of provisioning.
    try:
        response = requests.post(
            'https://platform.company.com/api/services',
            json={
                'name': name,
                'team': team,
                'template': template,
            },
            timeout=(10, 300),
        )
    except requests.RequestException as exc:
        console.print("[red]✗ Failed to create service[/red]")
        # markup=False: the exception text is not trusted Rich markup.
        console.print(f"Could not reach the platform API: {exc}", markup=False)
        raise SystemExit(1)

    if response.status_code == 200:
        service = response.json()
        console.print("[green]✓ Service created successfully![/green]")
        console.print(f"\nService ID: {service['id']}")
        console.print(f"Status: {service['status']}")
        console.print("\nNext steps:")
        console.print(f"1. View dashboard: {service['dashboards']['overview']}")
        console.print(f"2. Read docs: {service['documentation']}")
        console.print(f"3. Deploy code: platform deploy {name}")
        return

    console.print("[red]✗ Failed to create service[/red]")
    # The error body may be non-JSON (e.g. a proxy error page), and may carry
    # either an `error` string or an `errors` list.
    try:
        payload = response.json()
    except ValueError:
        payload = None
    detail = None
    if isinstance(payload, dict):
        detail = payload.get('error') or payload.get('errors')
    console.print(str(detail or f"HTTP {response.status_code}"), markup=False)
    raise SystemExit(1)
@cli.command()
@click.argument('service_name')
def status(service_name: str):
    """Get service status"""
    # Fetches one service from the platform API and renders its status,
    # health, traffic metrics and month-to-date cost as a table. Exits with
    # status 1 when the service cannot be retrieved.
    from urllib.parse import quote

    # Percent-encode the argument so characters such as "/" or "?" cannot
    # change which URL is requested.
    # NOTE(review): the value is sent in the path segment of
    # /api/services/<...> — confirm the API accepts service names there.
    url = f"https://platform.company.com/api/services/{quote(service_name, safe='')}"
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        console.print(f"[red]Failed to get status for {service_name}[/red]")
        console.print(f"Could not reach the platform API: {exc}", markup=False)
        raise SystemExit(1)

    if response.status_code == 200:
        service = response.json()
        # Display status table
        table = Table(title=f"Service: {service_name}")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")
        table.add_row("Status", service['status'])
        table.add_row("Health", service['health'])
        table.add_row("RPS", str(service['metrics']['requests_per_second']))
        table.add_row("Error Rate", f"{service['metrics']['error_rate']:.2%}")
        table.add_row("P99 Latency", f"{service['metrics']['latency_p99']}ms")
        table.add_row("Monthly Cost", f"${service['cost']['current_month']:.2f}")
        console.print(table)
        return

    if response.status_code == 404:
        console.print(f"[red]Service not found: {service_name}[/red]")
    else:
        # Any other status is a platform-side problem, not a missing service.
        console.print(
            f"[red]Failed to get status for {service_name} "
            f"(HTTP {response.status_code})[/red]"
        )
    raise SystemExit(1)
@cli.command()
def costs():
    """Show cost breakdown by team"""
    # Fetches the per-team cost list from the platform API and renders it as a
    # table. Exits with status 1 when the data cannot be retrieved or parsed.
    try:
        response = requests.get('https://platform.company.com/api/costs', timeout=30)
        # Turn 4xx/5xx responses into an exception instead of trying to render
        # an error payload as cost rows.
        response.raise_for_status()
        team_costs = response.json()
    except (requests.RequestException, ValueError) as exc:
        console.print("[red]Failed to fetch cost breakdown[/red]")
        # markup=False: the exception text is not trusted Rich markup.
        console.print(str(exc), markup=False)
        raise SystemExit(1)

    table = Table(title="Cost Breakdown")
    table.add_column("Team", style="cyan")
    table.add_column("Current Month", style="green")
    table.add_column("Projected", style="yellow")
    table.add_column("Trend", style="magenta")
    for team_cost in team_costs:
        table.add_row(
            team_cost['team'],
            f"${team_cost['current']:.2f}",
            f"${team_cost['projected']:.2f}",
            team_cost['trend']
        )
    console.print(table)
# Run the command group only when executed as a script, not when imported.
if __name__ == '__main__':
    cli()
Measuring Platform Success
from dataclasses import dataclass
from typing import List
@dataclass
class PlatformMetrics:
    """Snapshot of the figures used to judge how well the platform is doing."""

    # --- Developer productivity ---
    time_to_first_deploy: float       # hours
    deployments_per_day: float
    services_created_per_week: float

    # --- Reliability ---
    platform_uptime: float            # percentage
    mean_time_to_recovery: float      # minutes
    change_failure_rate: float        # percentage

    # --- Efficiency ---
    infrastructure_cost_per_service: float
    cost_optimization_savings: float
    resource_utilization: float       # percentage

    # --- Developer satisfaction ---
    nps_score: float
    self_service_adoption: float      # percentage
    support_ticket_volume: int
def calculate_platform_roi(metrics: "PlatformMetrics") -> float:
    """Calculate ROI of platform investment.

    Annual savings are the sum of developer time saved, infrastructure
    savings, and a reliability term; ROI is
    ``(savings - platform cost) / platform cost``.

    Args:
        metrics: Platform metrics. Reads ``deployments_per_day``,
            ``services_created_per_week``, ``cost_optimization_savings`` and
            ``change_failure_rate`` (a percentage, 0-100).

    Returns:
        ROI as a fraction (``0.25`` means 25%); negative when the platform
        costs more than it saves.
    """
    hours_saved_per_deployment = 2
    hours_saved_per_service = 8
    # Assumes deployments happen on working days only.
    working_days_per_week = 5
    weeks_per_year = 52
    hourly_cost = 150  # $150/hour avg cost

    # Developer productivity gains. The daily deployment figure is converted
    # to a weekly one first so both terms share a unit before annualizing.
    weekly_hours_saved = (
        metrics.deployments_per_day * working_days_per_week * hours_saved_per_deployment
        + metrics.services_created_per_week * hours_saved_per_service
    )
    dev_time_saved_hours = weekly_hours_saved * weeks_per_year
    dev_cost_savings = dev_time_saved_hours * hourly_cost

    # Infrastructure efficiency gains
    infra_savings = metrics.cost_optimization_savings

    # Reliability gains (reduced downtime). change_failure_rate is a
    # percentage, so convert it to a fraction before scaling.
    change_success_fraction = 1 - metrics.change_failure_rate / 100
    downtime_cost_savings = change_success_fraction * 100000  # $100K per major incident

    total_savings = dev_cost_savings + infra_savings + downtime_cost_savings

    # Platform costs (estimated)
    platform_team_cost = 5 * 200000  # 5 engineers
    infrastructure_cost = 500000
    total_cost = platform_team_cost + infrastructure_cost

    return (total_savings - total_cost) / total_cost
Conclusion
Building a mature internal developer platform comes down to six practices:
- Start with fundamentals - IaC, CI/CD, observability
- Create golden paths - Service templates and best practices
- Enable self-service - Developer portal and APIs
- Enforce policies - Security, compliance, cost management
- Optimize continuously - Cost, performance, developer experience
- Measure everything - Productivity, reliability, efficiency, satisfaction
The goal is not perfection, but continuous improvement. Start where you are, focus on developer pain points, and evolve incrementally toward a mature platform that enables teams to move fast while maintaining reliability and security.