Reliability Engineering
Building reliable systems requires architectural patterns for fault tolerance, graceful degradation, and proactive failure management. This guide covers circuit breakers, bulkheads, retry strategies, chaos engineering, health checks, disaster recovery planning, and multi-region architecture with production-ready implementation examples.
Reliability Design Patterns
Reliability patterns are reusable architectural solutions to common failure scenarios. Applying these patterns systematically is the difference between a service that fails gracefully and one that fails catastrophically.
| Pattern | Problem Solved | When to Use |
|---|---|---|
| Circuit Breaker | Cascade failures from a failing dependency | Every external service call |
| Bulkhead | Resource exhaustion spreading across components | Thread pools, connection pools, request queues |
| Retry + Backoff | Transient failures causing unnecessary errors | Idempotent operations only |
| Graceful Degradation | Total failure when non-critical features fail | Features with fallback content (recommendations, analytics) |
| Health Checks | Load balancers sending traffic to unhealthy instances | Every service behind a load balancer |
| Rate Limiting | Overload from unexpected traffic spikes | Public APIs, authentication endpoints |
| Timeouts | Resources held indefinitely by slow dependencies | Every external call (DB, HTTP, cache) |
Circuit Breaker Pattern
The circuit breaker prevents cascade failures by stopping requests to a failing dependency. When errors exceed a threshold, the circuit "opens" and requests fail fast. After a cooldown, the circuit enters "half-open" state to test if the dependency has recovered.
"""
circuit_breaker.py — Production-grade circuit breaker implementation.
States:
    CLOSED — Normal operation, requests pass through
    OPEN — Failure threshold exceeded, requests fail fast
    HALF_OPEN — Testing if dependency recovered (limited requests)
Usage:
breaker = CircuitBreaker(
failure_threshold=5,
recovery_timeout=30,
half_open_max_calls=3
)
@breaker
def call_payment_gateway(request):
return requests.post(PAYMENT_GATEWAY_URL, json=request)
"""
import time
import threading
import logging
from enum import Enum
from functools import wraps
from typing import Callable, Any, Optional
logger = logging.getLogger(__name__)


class CircuitState(Enum):
    """States a circuit breaker moves through."""

    CLOSED = "closed"        # Normal operation
    OPEN = "open"            # Failing fast
    HALF_OPEN = "half_open"  # Testing recovery


class CircuitBreakerOpen(Exception):
    """Raised when the breaker rejects a call without running it.

    This happens while the circuit is OPEN, and while it is HALF_OPEN once
    the probe-call budget has been used up.
    """


class CircuitBreaker:
    """Thread-safe circuit breaker implementation.

    Use either ``breaker.call(func, *args)`` or ``@breaker`` as a decorator.
    State is guarded by a re-entrant lock; the protected function itself runs
    outside the lock so calls can proceed concurrently.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 30.0,
        half_open_max_calls: int = 3,
        expected_exception: type = Exception,
        name: str = "default"
    ):
        """
        Args:
            failure_threshold: Number of failures before opening circuit
            recovery_timeout: Seconds to wait before trying recovery
            half_open_max_calls: Max test calls in half-open state
            expected_exception: Exception type that counts as failure
            name: Circuit breaker identifier for metrics/logging
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.half_open_max_calls = half_open_max_calls
        self.expected_exception = expected_exception
        self.name = name
        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._half_open_calls = 0
        # Wall-clock time of the last failure: reported in metrics/messages only.
        self._last_failure_time: Optional[float] = None
        # Monotonic twin used for the recovery timer, so NTP corrections or
        # manual clock changes cannot shorten or stretch the cooldown.
        self._last_failure_monotonic: Optional[float] = None
        self._lock = threading.RLock()

    @property
    def state(self) -> CircuitState:
        """Current state; lazily promotes OPEN to HALF_OPEN once the cooldown elapsed."""
        with self._lock:
            if (
                self._state == CircuitState.OPEN
                and self._last_failure_monotonic is not None
                and time.monotonic() - self._last_failure_monotonic >= self.recovery_timeout
            ):
                logger.info("[%s] Recovery timeout elapsed, entering HALF_OPEN", self.name)
                self._state = CircuitState.HALF_OPEN
                self._half_open_calls = 0
                self._success_count = 0
            return self._state

    def call(self, func: Callable, *args, **kwargs) -> Any:
        """Execute func with circuit breaker protection.

        Returns:
            Whatever ``func(*args, **kwargs)`` returns.

        Raises:
            CircuitBreakerOpen: the circuit is OPEN, or HALF_OPEN with all
                probe slots taken; ``func`` is not invoked.
            Exception: anything ``func`` raises is re-raised unchanged. Only
                ``expected_exception`` instances count as failures.
        """
        holds_probe_slot = False
        with self._lock:
            current_state = self.state
            if current_state == CircuitState.OPEN:
                raise CircuitBreakerOpen(
                    f"Circuit breaker '{self.name}' is OPEN. "
                    f"Last failure: {self._last_failure_time}"
                )
            if current_state == CircuitState.HALF_OPEN:
                if self._half_open_calls >= self.half_open_max_calls:
                    raise CircuitBreakerOpen(
                        f"Circuit breaker '{self.name}' HALF_OPEN limit reached"
                    )
                self._half_open_calls += 1
                holds_probe_slot = True

        # Execute outside the lock to allow concurrency
        try:
            result = func(*args, **kwargs)
        except self.expected_exception:
            self._on_failure()
            raise
        except BaseException:
            # Not a dependency failure and not a success. Give the probe slot
            # back, otherwise the breaker can never collect enough successes
            # to close and stays stuck rejecting calls in HALF_OPEN.
            if holds_probe_slot:
                self._release_half_open_slot()
            raise
        self._on_success()
        return result

    def _release_half_open_slot(self):
        """Return a probe slot taken by a call that yielded no verdict."""
        with self._lock:
            if self._state == CircuitState.HALF_OPEN and self._half_open_calls > 0:
                self._half_open_calls -= 1

    def _on_success(self):
        """Record a successful call; closes the circuit after enough HALF_OPEN successes."""
        with self._lock:
            if self._state == CircuitState.HALF_OPEN:
                self._success_count += 1
                # If we've had enough successes in half-open, close the circuit
                if self._success_count >= self.half_open_max_calls:
                    logger.info("[%s] Recovery confirmed, circuit CLOSED", self.name)
                    self._state = CircuitState.CLOSED
                    self._failure_count = 0
                    self._success_count = 0
            elif self._state == CircuitState.CLOSED:
                # Successes slowly pay down the failure count instead of resetting it.
                self._failure_count = max(0, self._failure_count - 1)

    def _on_failure(self):
        """Record a failed call; opens the circuit on threshold or on any HALF_OPEN failure."""
        with self._lock:
            self._failure_count += 1
            self._last_failure_time = time.time()
            self._last_failure_monotonic = time.monotonic()
            if self._state == CircuitState.HALF_OPEN:
                logger.warning("[%s] Recovery failed, circuit OPEN", self.name)
                self._state = CircuitState.OPEN
            elif self._failure_count >= self.failure_threshold:
                logger.error(
                    "[%s] Failure threshold reached (%s), circuit OPEN",
                    self.name,
                    self._failure_count,
                )
                self._state = CircuitState.OPEN

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage: @circuit_breaker_instance"""
        @wraps(func)
        def wrapper(*args, **kwargs):
            return self.call(func, *args, **kwargs)
        return wrapper

    def get_metrics(self) -> dict:
        """Export metrics for monitoring as one consistent snapshot."""
        # Hold the (re-entrant) lock so state and counters belong together.
        with self._lock:
            return {
                "circuit_breaker_name": self.name,
                "state": self.state.value,
                "failure_count": self._failure_count,
                "failure_threshold": self.failure_threshold,
                "last_failure_time": self._last_failure_time,
            }
# ───────────────────────────────────────────────────────────────
# Integration with Flask/FastAPI for HTTP client calls
# ───────────────────────────────────────────────────────────────
# Create circuit breakers for each external dependency.
# One breaker per dependency keeps their failure counts and states separate.
# Payment gateway: opens after 5 failures, probes again after 30 s.
payment_gateway_cb = CircuitBreaker(
    failure_threshold=5,
    recovery_timeout=30,
    half_open_max_calls=3,
    name="payment_gateway"
)
# Fraud service: opens sooner (3 failures) and stays open longer (60 s).
fraud_service_cb = CircuitBreaker(
    failure_threshold=3,
    recovery_timeout=60,
    half_open_max_calls=2,
    name="fraud_service"
)
@payment_gateway_cb
def charge_card(card_token: str, amount: float) -> dict:
    """Send a charge for ``card_token`` to the payment gateway.

    The request is a JSON POST with a 5 second timeout, routed through the
    ``payment_gateway_cb`` breaker. Returns the decoded JSON body of the reply;
    ``raise_for_status()`` is called first so an error response raises instead.
    """
    payload = {"card_token": card_token, "amount": amount}
    gateway_reply = requests.post(PAYMENT_GATEWAY_URL, json=payload, timeout=5)
    gateway_reply.raise_for_status()
    return gateway_reply.json()
# In your service code (the `return` below means this belongs inside a handler function):
try:
    result = charge_card("tok_visa", 99.99)
except CircuitBreakerOpen:
    # Circuit is open — fail fast and return a graceful response
    # NOTE(review): the "β" inside the log message below looks like a
    # mis-encoded dash; it is runtime text, so it is left untouched here.
    logger.warning("Payment gateway circuit open β queueing for async processing")
    queue_for_async_processing(card_token="tok_visa", amount=99.99)
    return {"status": "queued", "message": "Payment will be processed shortly"}
except requests.RequestException as e:
    # Other error (timeout, 5xx, etc.) — circuit breaker will count this
    logger.error(f"Payment gateway error: {e}")
    raise
Bulkhead Pattern (Resource Isolation)
The bulkhead pattern isolates resources so that a failure in one component cannot exhaust shared resources and affect other components. Named after ship bulkheads that prevent flooding from spreading across compartments.
"""
bulkhead.py — Thread pool-based bulkhead implementation.
Isolates different request types into separate thread pools,
preventing one slow operation from consuming all threads.
Usage:
payment_bulkhead = Bulkhead(max_concurrent=20, max_queue=100)
@payment_bulkhead
def process_payment(payment_request):
# This runs in the payment thread pool
...
"""
from concurrent.futures import ThreadPoolExecutor, Future
from functools import wraps
from queue import Queue, Full
import threading
import time
from typing import Callable, Any
class BulkheadFull(Exception):
    """Raised when the bulkhead has no capacity."""


class BulkheadTimeout(Exception):
    """Raised when the bulkhead queue wait times out.

    NOTE(review): nothing in this module raises it yet; kept for callers
    that already import it.
    """


class Bulkhead:
    """Resource isolation bulkhead using thread pools.

    At most ``max_concurrent`` calls run at once and at most ``max_queue``
    more may wait; anything beyond that is rejected immediately with
    ``BulkheadFull`` instead of piling up.
    """

    def __init__(
        self,
        max_concurrent: int = 20,
        max_queue: int = 100,
        name: str = "default"
    ):
        """
        Args:
            max_concurrent: Maximum concurrent executions
            max_queue: Maximum queued waiting executions
            name: Bulkhead identifier
        """
        self.name = name
        self.max_concurrent = max_concurrent
        self.max_queue = max_queue
        # ThreadPoolExecutor: max_concurrent workers + max_queue backlog
        self._executor = ThreadPoolExecutor(
            max_workers=max_concurrent,
            thread_name_prefix=f"bulkhead-{name}"
        )
        self._active_count = 0
        self._queue_size = 0
        self._lock = threading.Lock()
        # One permit per admitted task, whether it is running or still queued.
        self._semaphore = threading.Semaphore(max_concurrent + max_queue)

    def submit(self, func: Callable, *args, **kwargs) -> Future:
        """Submit function to bulkhead. Raises BulkheadFull if at capacity.

        Returns:
            The ``Future`` for ``func(*args, **kwargs)``.

        Raises:
            BulkheadFull: every running and queued slot is taken.
            RuntimeError: the underlying executor refused the task (for
                example after ``shutdown()``); the slot is handed back first.
        """
        if not self._semaphore.acquire(blocking=False):
            raise BulkheadFull(
                f"Bulkhead '{self.name}' at capacity: "
                f"{self.max_concurrent} active + {self.max_queue} queued"
            )
        with self._lock:
            self._queue_size += 1

        def wrapper():
            with self._lock:
                self._queue_size -= 1
                self._active_count += 1
            try:
                return func(*args, **kwargs)
            finally:
                with self._lock:
                    self._active_count -= 1
                # Released before the future completes, so a caller that has
                # just received the result can submit again immediately.
                self._semaphore.release()

        def release_if_cancelled(future: Future) -> None:
            # A future cancelled while queued never runs ``wrapper``, so the
            # permit and queue slot must be returned here instead.
            if future.cancelled():
                with self._lock:
                    self._queue_size -= 1
                self._semaphore.release()

        try:
            future = self._executor.submit(wrapper)
        except BaseException:
            # The executor never accepted the task: give the slot back so a
            # rejected submit cannot shrink capacity permanently.
            with self._lock:
                self._queue_size -= 1
            self._semaphore.release()
            raise
        future.add_done_callback(release_if_cancelled)
        return future

    def call(self, func: Callable, *args, **kwargs) -> Any:
        """Execute function synchronously with bulkhead protection.

        Blocks until the task finishes and returns its result, or re-raises
        the exception it raised.
        """
        future = self.submit(func, *args, **kwargs)
        return future.result()

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage: @bulkhead_instance"""
        @wraps(func)
        def wrapper(*args, **kwargs):
            return self.call(func, *args, **kwargs)
        return wrapper

    def get_metrics(self) -> dict:
        """Return a snapshot of configured limits and current occupancy."""
        with self._lock:
            return {
                "bulkhead_name": self.name,
                "max_concurrent": self.max_concurrent,
                "max_queue": self.max_queue,
                "active": self._active_count,
                "queued": self._queue_size,
                "available": self.max_concurrent - self._active_count,
            }

    def shutdown(self):
        """Stop accepting work and wait for running and queued tasks to finish."""
        self._executor.shutdown(wait=True)
# ───────────────────────────────────────────────────────────────
# Multiple bulkheads for different operation types
# ───────────────────────────────────────────────────────────────
# Critical path: the most workers (50) with a moderate queue, so payment
# requests rarely wait behind each other.
payment_bulkhead = Bulkhead(max_concurrent=50, max_queue=200, name="payment")
# Non-critical: few workers (10) but a deep queue (500) — background tasks can wait.
analytics_bulkhead = Bulkhead(max_concurrent=10, max_queue=500, name="analytics")
# External API: small pool and short queue to prevent overwhelming third parties
external_api_bulkhead = Bulkhead(max_concurrent=20, max_queue=50, name="external_api")
# In your service:
def handle_payment_request(request):
    """Process ``request`` inside the payment bulkhead.

    Returns whatever ``process_payment`` returns. When the bulkhead is
    saturated, ``BulkheadFull`` is translated into ``ServiceUnavailable`` so
    the client gets a 503 and can retry with backoff.
    """
    try:
        # Payment processing runs in its own isolated pool
        outcome = payment_bulkhead.call(process_payment, request)
    except BulkheadFull:
        raise ServiceUnavailable("Payment system at capacity, please retry")
    else:
        return outcome
Retry with Exponential Backoff and Jitter
Retry is appropriate only for idempotent operations and transient failures. Without jitter, synchronized retries from multiple clients can create thundering herd problems. Full jitter is recommended for most cases.
"""
retry.py — Exponential backoff with full jitter for resilient retries.
Implements the AWS-recommended exponential backoff with full jitter:
sleep = random_between(0, min(cap, base * 2^attempt))
Usage:
@retry(max_attempts=5, base_delay=0.1, max_delay=30.0)
def call_external_api(request):
...
"""
import random
import time
import logging
from functools import wraps
from typing import Callable, Any, Optional, Tuple, Type
logger = logging.getLogger(__name__)


class RetryExhausted(Exception):
    """All retry attempts have been exhausted."""


def retry(
    max_attempts: int = 3,
    base_delay: float = 0.1,   # 100ms
    max_delay: float = 60.0,   # 60 seconds
    exponential_base: float = 2.0,
    jitter: bool = True,
    retryable_exceptions: Tuple[Type[Exception], ...] = (Exception,),
    on_retry: Optional[Callable[[Exception, int, float], None]] = None,
    on_exhausted: Optional[Callable[[Exception, int], None]] = None
):
    """
    Retry decorator with exponential backoff and full jitter.

    Args:
        max_attempts: Maximum number of attempts (including first)
        base_delay: Initial delay between retries (seconds)
        max_delay: Maximum delay between retries (seconds)
        exponential_base: Multiplier for exponential growth
        jitter: If True, apply full random jitter
        retryable_exceptions: Exception types that trigger retry
        on_retry: Callback(exception, attempt, delay) called before each retry
        on_exhausted: Callback(exception, attempts) called when all retries fail

    Returns:
        A decorator. The wrapped function returns the first successful result.

    Raises:
        ValueError: at decoration time, if ``max_attempts`` is below 1 or a
            delay is negative.
        RetryExhausted: from the wrapped call, once the final attempt raised a
            retryable exception (chained to that exception via ``__cause__``).
            Non-retryable exceptions propagate immediately and unchanged.
    """
    # Reject bad configuration up front instead of failing on the first
    # outage with a misleading "unexpected exit" error or inside time.sleep().
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")
    if base_delay < 0 or max_delay < 0:
        raise ValueError("base_delay and max_delay must be non-negative")

    def decorator(func: Callable) -> Callable:
        # functools.partial and other callables may not define __name__.
        func_name = getattr(func, "__name__", repr(func))

        @wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except retryable_exceptions as e:
                    if attempt >= max_attempts:
                        if on_exhausted:
                            on_exhausted(e, attempt)
                        raise RetryExhausted(
                            f"Failed after {max_attempts} attempts: {e}"
                        ) from e
                    # Calculate backoff with full jitter
                    delay = min(max_delay, base_delay * (exponential_base ** (attempt - 1)))
                    if jitter:
                        delay = random.uniform(0, delay)
                    logger.warning(
                        "Attempt %d/%d failed for %s: %s. Retrying in %.2fs",
                        attempt, max_attempts, func_name, e, delay,
                    )
                    if on_retry:
                        on_retry(e, attempt, delay)
                    time.sleep(delay)
            # Unreachable while max_attempts >= 1; kept as a defensive guard.
            raise RetryExhausted("Unexpected exit from retry loop")
        return wrapper
    return decorator
# ───────────────────────────────────────────────────────────────
# Usage Examples
# ───────────────────────────────────────────────────────────────
# 1. Basic retry for database queries
@retry(
    max_attempts=5,
    base_delay=0.1,
    max_delay=5.0,
    retryable_exceptions=(DatabaseTransientError, ConnectionError),
)
def get_user_profile(user_id: str) -> dict:
    """Look up the ``users`` row for ``user_id``.

    The query is a read with a bound parameter, so it is safe to repeat; it is
    retried up to 5 times on ``DatabaseTransientError`` / ``ConnectionError``.
    """
    sql = "SELECT * FROM users WHERE id = %s"
    bind_values = (user_id,)
    return db.query(sql, bind_values)
# 2. Retry with metrics callback for observability
# Labelled counter of retry attempts (presumably a prometheus_client Counter — confirm).
attempt_counter = Counter('retry_attempts_total', 'Retry attempts', ['function'])


def record_retry(exception, attempt, delay):
    """``on_retry`` callback: bump the retry counter for ``call_payment_api``.

    The three arguments match the ``on_retry`` callback signature; none of
    them is used, and the label value is fixed.
    """
    payment_api_series = attempt_counter.labels(function='call_payment_api')
    payment_api_series.inc()
@retry(
    max_attempts=3,
    base_delay=0.5,
    retryable_exceptions=(requests.RequestException,),
    on_retry=record_retry
)
def _post_payment_with_retry(payload: dict, idempotency_key: str) -> dict:
    """POST ``payload`` to the payment API; every retry reuses the same key."""
    response = requests.post(
        PAYMENT_API_URL,
        json=payload,
        headers={"Idempotency-Key": idempotency_key},
        timeout=10
    )
    response.raise_for_status()
    return response.json()


def call_payment_api(payload: dict, idempotency_key: Optional[str] = None) -> dict:
    """Call the payment API with retries that cannot double-charge.

    A timed-out POST may already have been processed, so retrying it blindly
    can charge twice. One idempotency key is chosen per logical call, outside
    the retry loop, and sent with every attempt.

    Args:
        payload: JSON body for the payment API.
        idempotency_key: Key to reuse across attempts; a random UUID is
            generated when omitted. Pass your own to stay idempotent across
            process restarts.

    Returns:
        The decoded JSON response.

    Raises:
        RetryExhausted: all attempts failed with ``requests.RequestException``.
    """
    # NOTE(review): assumes the payment API honours the Idempotency-Key header — confirm.
    import uuid

    key = idempotency_key or str(uuid.uuid4())
    return _post_payment_with_retry(payload, key)
# 3. Retry with circuit breaker combined
@retry(
    max_attempts=3,
    base_delay=0.1,
    retryable_exceptions=(requests.RequestException,)
)
def _post_charge_with_retry(card_token: str, amount: float, idempotency_key: str) -> dict:
    """POST one charge to the gateway; every retry reuses the same key."""
    response = requests.post(
        PAYMENT_GATEWAY_URL,
        json={"card_token": card_token, "amount": amount},
        headers={"Idempotency-Key": idempotency_key},
        timeout=5
    )
    response.raise_for_status()
    return response.json()


@payment_gateway_cb  # Circuit breaker — fail fast if service is down
def charge_with_resilience(
    card_token: str,
    amount: float,
    idempotency_key: Optional[str] = None
) -> dict:
    """Combines circuit breaker (fail fast) with retry (handle transient).

    The breaker stays outermost, so one call that exhausts its retries counts
    as a single breaker failure. Charging a card is not idempotent on its own,
    so one idempotency key is fixed per logical charge and sent with every
    attempt.

    Args:
        card_token: Tokenised card to charge.
        amount: Amount to charge.
        idempotency_key: Key to reuse across attempts; a random UUID is
            generated when omitted.

    Returns:
        The decoded JSON response from the gateway.

    Raises:
        CircuitBreakerOpen: the gateway breaker is open; nothing was sent.
        RetryExhausted: all attempts failed with ``requests.RequestException``.
    """
    # NOTE(review): assumes the gateway honours the Idempotency-Key header — confirm.
    import uuid

    key = idempotency_key or str(uuid.uuid4())
    return _post_charge_with_retry(card_token, amount, key)
# Backoff delay calculation (with jitter) for visualization, using base_delay=0.1 and max_delay=60:
# Attempt 1: delay = random(0, 0.1) = ~50ms average
# Attempt 2: delay = random(0, 0.2) = ~100ms average
# Attempt 3: delay = random(0, 0.4) = ~200ms average
# Attempt 4: delay = random(0, 0.8) = ~400ms average
# Attempt 5: delay = random(0, 1.6) = ~800ms average
# Attempt 6-10: upper bound keeps doubling (3.2s, 6.4s, ... 51.2s)
# Attempt 11+: upper bound capped at max_delay (60s), i.e. ~30s average
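For operations that are not naturally idempotent (such as payment POSTs), send an Idempotency-Key: <UUID> header and reuse the same key on every retry — the server stores processed keys and ignores duplicates.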
Graceful Degradation Strategies
Graceful degradation ensures that when non-critical features fail, the core functionality continues working. Users get a reduced but functional experience instead of a complete failure.
# Example: E-commerce product page with graceful degradation
class ProductPageService:
    """Renders product page with graceful degradation for non-critical features.

    Only the product lookup is allowed to fail the request. Every other
    dependency is called with a short timeout and replaced by a fallback when
    it fails; the names of the features that fell back are reported in
    ``degraded_features``.
    """

    def __init__(self):
        from concurrent.futures import ThreadPoolExecutor

        self.product_service = ProductService()
        self.recommendation_service = RecommendationService()
        self.review_service = ReviewService()
        self.inventory_service = InventoryService()
        self.price_service = PriceService()
        # Long-lived pool used by _with_timeout. A per-call pool would block
        # on shutdown until the slow call finished, defeating the timeout.
        self._timeout_pool = ThreadPoolExecutor(
            max_workers=8,
            thread_name_prefix="product-page"
        )

    def get_product_page(self, product_id: str, user_id: str) -> ProductPage:
        """Assemble the page for ``product_id`` as seen by ``user_id``.

        Raises:
            Whatever ``product_service.get_product`` raises — the product
            itself is the only hard dependency.
        """
        # Collected per request (not on self) so concurrent requests do not
        # see each other's degradations.
        degraded: List[str] = []

        # CRITICAL: Product details — must succeed
        # If this fails, return 500 (core functionality broken)
        product = self.product_service.get_product(product_id)

        # NON-CRITICAL: Recommendations — degrade to popular items
        try:
            recommendations = self._with_timeout(
                lambda: self.recommendation_service.get_for_user(user_id, product_id),
                timeout_ms=200
            )
        except (TimeoutError, ServiceError):
            logger.warning("Recommendations failed, using fallback")
            recommendations = self._get_popular_fallback()
            degraded.append("recommendations")

        # NON-CRITICAL: Reviews — degrade to empty list
        try:
            reviews = self._with_timeout(
                lambda: self.review_service.get_reviews(product_id),
                timeout_ms=300
            )
        except (TimeoutError, ServiceError):
            logger.warning("Reviews failed, showing empty")
            reviews = []
            degraded.append("reviews")

        # CRITICAL: Price — must succeed for purchase
        # But can show "price unavailable" and disable buy button
        try:
            price = self._with_timeout(
                lambda: self.price_service.get_price(product_id),
                timeout_ms=100
            )
        except (TimeoutError, ServiceError):
            # Log text repaired: the original contained a mis-encoded dash.
            logger.error("Price service failed — disabling purchase")
            price = None
            degraded.append("price")

        # NON-CRITICAL: Inventory — default to "in stock"
        try:
            inventory = self._with_timeout(
                lambda: self.inventory_service.get_inventory(product_id),
                timeout_ms=100
            )
        except (TimeoutError, ServiceError):
            logger.warning("Inventory check failed, defaulting to in-stock")
            inventory = InventoryStatus.IN_STOCK
            degraded.append("inventory")

        return ProductPage(
            product=product,                  # Critical — always present
            recommendations=recommendations,  # Degraded to popular
            reviews=reviews,                  # Degraded to empty
            price=price,                      # Null = purchase disabled
            inventory=inventory,              # Default in-stock
            degraded_features=degraded
        )

    def _with_timeout(self, fetch, timeout_ms: int):
        """Run ``fetch()`` on the helper pool and wait at most ``timeout_ms``.

        Raises:
            TimeoutError: the call did not finish in time. The worker thread
                cannot be interrupted, so the call may still complete in the
                background; its result is discarded.
        """
        from concurrent.futures import TimeoutError as FutureTimeoutError

        future = self._timeout_pool.submit(fetch)
        try:
            return future.result(timeout=timeout_ms / 1000.0)
        except FutureTimeoutError:
            # Before Python 3.11 this is not the builtin TimeoutError that
            # the callers catch, so translate it explicitly.
            future.cancel()
            raise TimeoutError(f"call exceeded {timeout_ms} ms") from None

    def _get_popular_fallback(self) -> List[dict]:
        """Return globally popular products when personalized recs fail.

        Always returns the ``to_dict()`` form, on a cache hit and on a miss
        alike. This is a fallback path, so it never raises: a cache outage is
        skipped and a failed popular-products lookup yields an empty list.
        """
        # NOTE(review): a cache hit always returned dicts, so the miss path now
        # does too — confirm ProductPage accepts dicts for recommendations.
        cache_key = "popular_products"
        try:
            cached = redis.get(cache_key)
        except Exception:
            logger.warning("Popular-products cache read failed", exc_info=True)
            cached = None
        if cached:
            return json.loads(cached)

        try:
            popular = [p.to_dict() for p in self.product_service.get_popular(limit=10)]
        except Exception:
            logger.warning("Popular-products lookup failed, showing none", exc_info=True)
            return []

        try:
            redis.setex(cache_key, 3600, json.dumps(popular))
        except Exception:
            logger.warning("Popular-products cache write failed", exc_info=True)
        return popular
# Response with graceful degradation:
# {
# "product": { ... full product details ... }, ← CRITICAL: complete
# "recommendations": [ ... popular items ... ], ← degraded from personalized
# "reviews": [], ← degraded to empty
# "price": null, ← degraded, purchase disabled
# "inventory": "IN_STOCK", ← default assumption
# "degraded_features": ["recommendations", "reviews", "price", "inventory"],
# "message": "Some features are temporarily unavailable."
# }
Chaos Engineering Principles
Chaos engineering is the discipline of experimenting on a system to build confidence in its capability to withstand turbulent conditions. It is not about breaking things randomly — it is about proving that your resilience mechanisms work.
Core Principles
- Start with a steady-state hypothesis: Define what "normal" looks like before injecting failure. Your experiment validates that the system maintains this state during failure.
- Vary real-world events: Inject failures that actually happen: server crashes, network latency, dependency failures, certificate expiry.
- Run in production: Staging environments rarely match production. Start with canary/small blast radius in production, or use shadow traffic.
- Automate to run continuously: Manual chaos experiments are valuable but episodic. Automated chaos (like Netflix's Simian Army) provides ongoing validation.
- Minimize blast radius: Every experiment must have an abort criterion and a quick rollback path. Start with 1% of traffic, not 100%.
Litmus Chaos Experiment (Complete YAML)
LitmusChaos is a Cloud Native Computing Foundation (CNCF) incubating project for Kubernetes chaos engineering. Below is a complete experiment that validates a service's resilience to pod failure.
# ═══════════════════════════════════════════════════════════════
# LITMUS CHAOS EXPERIMENT: Pod Failure
# Target: payment-api deployment
# ═══════════════════════════════════════════════════════════════
---
# 1. ChaosEngine — binds a target application to the experiment to run against it
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: payment-api-pod-failure
  namespace: litmus
spec:
  # Target: Deployment pods labelled app=payment-api in the 'payment' namespace
  appinfo:
    appns: 'payment'
    applabel: 'app=payment-api'
    appkind: 'deployment'
  # Only inject chaos into applications that have opted in via annotation
  annotationCheck: 'true'
  engineState: 'active'
  # Service account for chaos operations (mandatory). It must already exist
  # with RBAC for pod-delete — replace with the account used in your cluster.
  chaosServiceAccount: pod-delete-sa
  # No auxiliary applications are monitored alongside the target
  auxiliaryAppInfo: ''
  experiments:
    - name: pod-delete
      spec:
        components:
          env:
            # Total duration of chaos injection (seconds)
            - name: TOTAL_CHAOS_DURATION
              value: '60'
            # Delete pods every 10 seconds
            - name: CHAOS_INTERVAL
              value: '10'
            # 'false' = graceful delete; set 'true' to force delete (no graceful shutdown)
            - name: FORCE
              value: 'false'
            # Percentage of matching pods affected per iteration
            - name: PODS_AFFECTED_PERC
              value: '25'
            # Container inside the target pods
            - name: TARGET_CONTAINER
              value: 'payment-api'
        # Probes — validate the steady-state hypothesis while chaos runs.
        # The schema key is 'probe' (singular).
        probe:
          - name: "payment-api-health-check"
            type: "httpProbe"
            mode: "Continuous"
            runProperties:
              probeTimeout: "5s"
              retry: 2
              interval: "5s"
              probePollingInterval: "2s"
              initialDelay: "2s"
            httpProbe/inputs:
              url: "http://payment-api.payment.svc.cluster.local:8080/health"
              insecureSkipVerify: false
              method:
                get:
                  criteria: ==
                  responseCode: "200"
---
# 2. ChaosSchedule — run this experiment automatically every week
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosSchedule
metadata:
  name: payment-api-weekly-chaos
  namespace: litmus
spec:
  schedule:
    repeat:
      properties:
        minChaosInterval:
          hour:
            # NOTE(review): confirm your Litmus version accepts an interval
            # this large; workDays.includedDays is the alternative for weekly runs.
            everyNthHour: 168   # Every 7 days
      workHours:
        includedHours: 9-17     # Business hours only
  # Template for the ChaosEngine created on each scheduled run
  engineTemplateSpec:
    engineState: 'active'
    appinfo:
      appns: 'payment'
      applabel: 'app=payment-api'
      appkind: 'deployment'
    # Mandatory: same service account as the one-off ChaosEngine
    chaosServiceAccount: pod-delete-sa
    experiments:
      - name: pod-delete
        spec:
          components:
            env:
              - name: TOTAL_CHAOS_DURATION
                value: '60'
              - name: CHAOS_INTERVAL
                value: '10'
              - name: PODS_AFFECTED_PERC
                value: '25'
---
# 3. ChaosResult — view experiment results
# kubectl get chaosresult -n litmus
#
# Expected output when resilience is working:
# NAME VERDICT PHASE
# payment-api-pod-failure Pass Completed
#
# VERDICT values:
# Pass = Steady-state maintained during chaos
# Fail = Steady-state breached (probe failed)
# Awaited = Experiment still running
# Stopped = Experiment was manually aborted
# SAFETY GUARDRAILS:
# 1. Always set workHours to business hours when starting
# 2. Start with PODS_AFFECTED_PERC: 10 (small blast radius)
# 3. Ensure abort criteria are defined:
# - Error rate > 1% for 30s → auto-abort
# - Latency p95 > 5s → auto-abort
# 4. Never run chaos experiments:
# - During deployments
# - During known-fragile periods (Black Friday)
# - Without an observer present
Health Check Endpoint Design
Kubernetes and load balancers use health checks to determine whether to send traffic to a pod. Proper health check design is critical — a misconfigured health check can cause cascading failures by marking healthy pods as unhealthy.
# health_checks.py — Comprehensive health check implementation
from fastapi import FastAPI, HTTPException, status
from pydantic import BaseModel
from typing import Dict, List, Optional
import asyncio
import time
app = FastAPI()
class HealthStatus(BaseModel):
    """Schema describing a health report.

    NOTE(review): none of the endpoints visible in this file reference this
    model (they return plain dicts / JSONResponse) — confirm whether it is
    meant to be wired in as a ``response_model``.
    """
    status: str  # "pass", "fail", or "warn"
    checks: Dict[str, dict]  # presumably per-check details keyed by check name — confirm
    version: str
    uptime_seconds: float
# ─── /health/live (Liveness Probe) ──────────────────────────────
# Tells Kubernetes: "Should this pod be killed and restarted?"
#
# Use sparingly — only return fail when the application is stuck
# and cannot recover without restart. A failing liveness probe
# causes Kubernetes to restart the container.
@app.get("/health/live", status_code=status.HTTP_200_OK)
async def liveness_probe():
    """
    Liveness probe: is the application process up and answering requests?

    Deliberately trivial: it touches no database, cache or upstream service,
    so a temporary dependency blip cannot make Kubernetes restart every pod.
    Being able to answer at all is the signal.
    """
    checked_at = time.time()
    return {"status": "alive", "timestamp": checked_at}
# ─── /health/ready (Readiness Probe) ────────────────────────────
# Tells Kubernetes: "Should this pod receive traffic?"
#
# Check all dependencies required to serve traffic. If a
# dependency is down, return non-200 so the pod is removed
# from the load balancer rotation.
class ReadinessChecker:
    """Runs dependency checks for the readiness endpoint and caches the outcome.

    ``check()`` always returns ``(results, overall_ready)``. Only failures of
    the dependencies listed in ``CRITICAL`` make the pod not-ready; timeouts
    are reported as warnings because they may be transient.
    """

    # Dependencies whose failure takes the pod out of rotation.
    CRITICAL = ('database', 'redis')

    def __init__(self):
        # NOTE(review): the _check_* coroutine methods are not defined in the
        # visible class body — they must be supplied before instantiation.
        self.checks = {
            'database': self._check_database,
            'redis': self._check_redis,
            'upstream-payment-gateway': self._check_payment_gateway,
        }
        self.last_check_time = 0
        self.cached_result = None  # (results, overall_ready) of the last run
        self.cache_ttl = 5  # Cache readiness for 5 seconds

    async def check(self) -> "Tuple[Dict[str, dict], bool]":
        """Run all checks, or reuse the last outcome while it is fresh.

        Returns:
            ``(results, overall_ready)``: ``results`` maps each check name to
            a status dict; ``overall_ready`` is False only when a critical
            dependency failed. The same shape is returned on a cache hit and
            on a miss, and ``results`` is a fresh dict on every call so
            callers may add keys without polluting the cache.
        """
        # Cache results to prevent health check storms. Monotonic time keeps
        # the TTL correct across wall-clock adjustments.
        now = time.monotonic()
        if self.cached_result is not None and (now - self.last_check_time) < self.cache_ttl:
            cached_results, cached_ready = self.cached_result
            return dict(cached_results), cached_ready

        # Run checks concurrently, each bounded by its own 3 second timeout.
        names = list(self.checks)
        outcomes = await asyncio.gather(
            *(asyncio.wait_for(self.checks[name](), timeout=3.0) for name in names),
            return_exceptions=True,
        )

        results = {}
        overall_ready = True
        for name, outcome in zip(names, outcomes):
            if isinstance(outcome, asyncio.TimeoutError):
                # Don't mark as fail — may be transient
                results[name] = {"status": "warn", "error": "timeout"}
            elif isinstance(outcome, BaseException):
                results[name] = {"status": "fail", "error": str(outcome)}
                # Critical dependencies cause not-ready
                if name in self.CRITICAL:
                    overall_ready = False
            else:
                results[name] = {"status": "pass", "response_time_ms": outcome}

        self.cached_result = (results, overall_ready)
        self.last_check_time = now
        return dict(results), overall_ready
# Single module-level checker used by the /health/ready and /health endpoints,
# so its short-lived result cache is shared between them.
readiness_checker = ReadinessChecker()
@app.get("/health/ready", status_code=status.HTTP_200_OK)
async def readiness_probe():
    """
    Readiness probe: Can this pod serve traffic?

    Checks all critical dependencies. Returns non-200 if
    any critical dependency is unreachable, causing Kubernetes
    to remove this pod from service rotation.

    Returns:
        A JSON response with status 200 ("ready") or 503 ("not_ready"),
        carrying the per-dependency check results.
    """
    # JSONResponse is not among this file's module-level imports; import it
    # here so the handler does not fail with NameError on its first request.
    from fastapi.responses import JSONResponse

    checks, is_ready = await readiness_checker.check()
    status_code = 200 if is_ready else 503
    return JSONResponse(
        status_code=status_code,
        content={
            "status": "ready" if is_ready else "not_ready",
            "checks": checks,
            "version": "2.4.0",
            "timestamp": time.time()
        }
    )
# ─── /health (Comprehensive Health) ─────────────────────────────
# Full health check for monitoring dashboards and debugging.
# Not used by Kubernetes — purely for observability.
@app.get("/health", status_code=status.HTTP_200_OK)
async def comprehensive_health():
    """
    Comprehensive health check for dashboards.

    Includes all /health/ready checks plus queue depth and 5-minute error
    rate. Always answers HTTP 200 (dashboards read the body); the top-level
    ``status`` is "unhealthy" if any check failed, "degraded" if any check
    warned, and "healthy" otherwise.
    """
    readiness_checks, _ = await readiness_checker.check()
    # Work on a copy: the checker may hand back its cached dict, and adding
    # the metrics below to it would leak them into /health/ready responses.
    checks = dict(readiness_checks)

    # Add application-specific health metrics
    checks["queue_depth"] = {
        "status": "pass" if queue_depth < 1000 else "warn",
        "value": queue_depth,
        "threshold": 1000
    }
    checks["error_rate_5m"] = {
        "status": "pass" if error_rate < 0.001 else "fail",
        "value": error_rate,
        "threshold": 0.001
    }

    # Derive the headline from the checks instead of always claiming "healthy".
    observed = {entry.get("status") for entry in checks.values()}
    if "fail" in observed:
        overall = "unhealthy"
    elif "warn" in observed:
        overall = "degraded"
    else:
        overall = "healthy"

    return {
        "status": overall,
        "checks": checks,
        "version": "2.4.0",
        "git_commit": "abc123def456",
        "uptime_seconds": time.time() - start_time,
        "build_timestamp": "2024-01-15T10:30:00Z"
    }
# ─── Kubernetes Pod Configuration ───────────────────────────────
#
# livenessProbe:
# httpGet:
# path: /health/live
# port: 8080
# initialDelaySeconds: 10 # Wait for app startup
# periodSeconds: 10 # Check every 10s
# timeoutSeconds: 2 # Must respond in 2s
# failureThreshold: 3 # Fail after 3 consecutive failures
# successThreshold: 1 # Succeed on 1 pass
#
# readinessProbe:
# httpGet:
# path: /health/ready
# port: 8080
# initialDelaySeconds: 5
# periodSeconds: 5
# timeoutSeconds: 3
# failureThreshold: 3
# successThreshold: 2 # Require 2 passes before going ready
#
# startupProbe: # Use for slow-starting applications
# httpGet:
# path: /health/live
# port: 8080
# initialDelaySeconds: 5
# periodSeconds: 5
# timeoutSeconds: 3
# failureThreshold: 30 # Allow 2.5 min for startup
A poorly designed liveness probe that checks external dependencies can cause cascading failures: DB slows down → liveness probes fail → all pods restart simultaneously → thundering herd on DB → worse slowdown → more restarts. Never check external dependencies in a liveness probe. Check only that your process is alive and responsive.
Database Reliability
Read Replicas
Offload read traffic to replica databases. Write operations go to the primary; read operations (especially analytics, reporting, and cache warming) go to replicas. Implement automatic failover for the primary.
# Database configuration with primary/replica routing
# SQLAlchemy example with read replica support
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
import random
class RoutingSession:
    """Routes reads to replicas, writes to primary.

    Reads pick a replica at random per call and fall back to the primary when
    no replicas are configured. Plain SQL strings are wrapped in ``text()``
    so ``:name`` bind parameters work on current SQLAlchemy versions.
    """

    def __init__(self, primary_url: str, replica_urls: List[str]):
        """
        Args:
            primary_url: Connection URL of the writable primary.
            replica_urls: Connection URLs of read replicas (may be empty).
        """
        self.primary_engine = create_engine(
            primary_url,
            pool_size=20,
            max_overflow=10,
            pool_pre_ping=True,   # Verify connection before use
            pool_recycle=3600,    # Recycle connections after 1 hour
            connect_args={"connect_timeout": 5}
        )
        self.replica_engines = [
            create_engine(
                url,
                pool_size=10,
                max_overflow=5,
                pool_pre_ping=True,
                pool_recycle=3600,
                connect_args={"connect_timeout": 5}
            )
            for url in replica_urls
        ]
        # One session factory per engine, built on first use and then reused,
        # instead of constructing a new sessionmaker for every query.
        self._session_factories = {}

    def _new_session(self, engine):
        """Create a session bound to ``engine`` from its cached factory."""
        factory = self._session_factories.get(engine)
        if factory is None:
            factory = sessionmaker(bind=engine)
            self._session_factories[engine] = factory
        return factory()

    @staticmethod
    def _as_statement(query):
        """Wrap plain SQL strings in ``text()``; pass other statements through."""
        if isinstance(query, str):
            from sqlalchemy import text
            return text(query)
        return query

    def get_read_session(self):
        """Get a session connected to a random read replica.

        Falls back to the primary when no replicas are configured.
        """
        if not self.replica_engines:
            return self._new_session(self.primary_engine)
        return self._new_session(random.choice(self.replica_engines))

    def get_write_session(self):
        """Get a session connected to the primary."""
        return self._new_session(self.primary_engine)

    def execute_read(self, query, **params):
        """Execute a read query against a replica and return all rows."""
        session = self.get_read_session()
        try:
            return session.execute(self._as_statement(query), params).fetchall()
        finally:
            session.close()

    def execute_write(self, query, **params):
        """Execute a write query against the primary.

        Commits on success and rolls back on any exception before re-raising.

        Returns:
            For statements that produce rows (e.g. ``INSERT ... RETURNING``),
            the list of rows, fetched before the session is closed. For all
            other statements, the result object (e.g. for ``rowcount``).
        """
        session = self.get_write_session()
        try:
            result = session.execute(self._as_statement(query), params)
            # Rows must be read while the connection is still checked out;
            # once the session is closed they are no longer retrievable.
            rows = result.fetchall() if result.returns_rows else None
            session.commit()
            return result if rows is None else rows
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()
# Usage: construct the router with one primary URL and a list of replica URLs.
# NOTE(review): `user:pass` looks like placeholder credentials - confirm real
# credentials are injected from configuration rather than hard-coded.
db = RoutingSession(
primary_url="postgresql://user:pass@db-primary.internal/payments",
replica_urls=[
"postgresql://user:pass@db-replica-1.internal/payments",
"postgresql://user:pass@db-replica-2.internal/payments",
]
)
# Reads go to replicas (a replica is picked at random on every call).
# execute_read returns the fetched rows as a list; `user_id` is not defined in
# this snippet and must be supplied by the surrounding code.
user = db.execute_read("SELECT * FROM users WHERE id = :id", id=user_id)
# Writes go to primary and are committed before execute_write returns.
# `amount` is likewise expected to be defined by the surrounding code.
payment = db.execute_write(
"INSERT INTO payments (amount, status) VALUES (:amount, 'pending') RETURNING *",
amount=amount
)
Connection Pool Configuration
| Parameter | Recommended Value | Rationale |
|---|---|---|
| pool_size | 10–20 per instance | Enough for concurrent requests, not so many that DB is overwhelmed |
| max_overflow | pool_size / 2 | Extra connections for traffic spikes, but bounded |
| pool_pre_ping | True | Verify connection validity before use; prevents stale connection errors |
| pool_recycle | 3600 seconds (1 hour) | Prevent stale connections from long-lived pools |
| connect_timeout | 5 seconds | Fail fast if DB is unreachable; prevents request pileup |
| pool_timeout | 10 seconds | Time to wait for a connection from the pool before failing |
CDN and Caching for Reliability
CDNs and caching layers improve reliability by serving content from edge locations, reducing load on origin servers, and providing content even when origin is experiencing issues.
# CloudFront / CDN configuration for static assets and API responses
# Terraform configuration
# CloudFront distribution in front of the payment API origin.
# Dynamic API responses get a short TTL; /static/* gets a long TTL; 500/503
# responses from the origin are replaced by a static error page.
resource "aws_cloudfront_distribution" "api_cache" {
  enabled = true
  comment = "Payment API cache layer"

  origin {
    # NOTE(review): CloudFront needs a publicly resolvable origin hostname;
    # confirm this ".internal" name is reachable from CloudFront.
    domain_name = "api-origin.payment.internal"
    origin_id   = "payment-api-origin"

    custom_origin_config {
      http_port              = 80
      https_port             = 443
      origin_protocol_policy = "https-only"
      origin_ssl_protocols   = ["TLSv1.2"]
    }

    # Header added to every request sent to the origin. On its own it does not
    # configure failover; stale-content handling is done by the Lambda@Edge
    # association on the default cache behavior.
    custom_header {
      name  = "X-Origin-Failover"
      value = "enabled"
    }
  }

  # Default cache behavior — short TTL for dynamic content
  default_cache_behavior {
    allowed_methods  = ["GET", "HEAD", "OPTIONS", "POST", "PUT", "DELETE"]
    cached_methods   = ["GET", "HEAD", "OPTIONS"]
    target_origin_id = "payment-api-origin"

    forwarded_values {
      query_string = true
      headers      = ["Origin", "Access-Control-Request-Headers", "Access-Control-Request-Method", "Authorization"]

      # A block with more than one argument must be written on multiple lines.
      cookies {
        forward           = "whitelist"
        whitelisted_names = ["session"]
      }
    }

    viewer_protocol_policy = "https-only"
    min_ttl                = 0
    default_ttl            = 60  # 1 minute default for API responses
    max_ttl                = 300 # 5 minutes max
    compress               = true

    # Serve stale content if origin returns error
    lambda_function_association {
      event_type = "origin-response"
      # Lambda@Edge requires a versioned (qualified) function ARN, so the
      # function must be published (publish = true).
      lambda_arn = aws_lambda_function.serve_stale.qualified_arn
    }
  }

  # Ordered cache behavior for static content — long TTL
  ordered_cache_behavior {
    path_pattern     = "/static/*"
    allowed_methods  = ["GET", "HEAD"]
    cached_methods   = ["GET", "HEAD"]
    target_origin_id = "payment-api-origin"

    forwarded_values {
      query_string = false

      cookies {
        forward = "none"
      }
    }

    # Required argument; kept identical to the default behavior.
    viewer_protocol_policy = "https-only"
    min_ttl                = 86400    # 1 day
    default_ttl            = 604800   # 1 week
    max_ttl                = 31536000 # 1 year
  }

  # Error pages — serve a static page on origin failure.
  # NOTE(review): mapping 500/503 to HTTP 200 hides failures from API clients
  # and monitoring; confirm this is intended for the API paths.
  custom_error_response {
    error_code            = 500
    response_code         = 200
    response_page_path    = "/static/error-pages/service-unavailable.html"
    error_caching_min_ttl = 10
  }

  custom_error_response {
    error_code            = 503
    response_code         = 200
    response_page_path    = "/static/error-pages/service-unavailable.html"
    error_caching_min_ttl = 10
  }

  # Required by the resource schema: no geographic restrictions.
  restrictions {
    geo_restriction {
      restriction_type = "none"
    }
  }

  # Required by the resource schema. Replace with an ACM certificate when the
  # distribution is served under a custom domain name.
  viewer_certificate {
    cloudfront_default_certificate = true
  }
}
Disaster Recovery Planning
Disaster recovery planning defines how to recover from catastrophic failures: region-wide outages, data center destruction, or widespread data corruption. DR planning is not optional for services handling financial transactions or sensitive data.
RTO and RPO Definitions
| Metric | Definition | Typical Targets |
|---|---|---|
| RTO (Recovery Time Objective) | Maximum acceptable time to restore service after a disaster | Tier 1: < 1 hour; Tier 2: < 4 hours; Tier 3: < 24 hours |
| RPO (Recovery Point Objective) | Maximum acceptable data loss measured in time | Tier 1: < 5 minutes (near-zero); Tier 2: < 1 hour; Tier 3: < 24 hours |
═══════════════════════════════════════════════════════════════════
DISASTER RECOVERY PLAN TEMPLATE
Service: Payment Processing Platform
═══════════════════════════════════════════════════════════════════
CLASSIFICATION: Tier 1 (Critical — revenue-impacting)
RTO Target: 1 hour
RPO Target: 5 minutes
Last Tested: [DATE]
Next Test: [DATE]
Owner: Pay SRE Team
─── DISASTER SCENARIOS ────────────────────────────────────────────
Scenario 1: AWS Region Failure (us-west-2)
Trigger: Region-wide AWS outage
Impact: Primary region completely unavailable
Recovery Procedure:
1. Declare disaster (IC approval required)
2. Update DNS to point to secondary region (us-east-1)
$ ./scripts/dns-failover.sh --to us-east-1
3. Promote read replica in us-east-1 to primary
$ ./scripts/db-promote.sh --region us-east-1
4. Scale up Kubernetes cluster in us-east-1
$ kubectl scale deployment payment-api --replicas=10 -n payment
5. Verify health checks pass
$ ./scripts/smoke-tests.sh --region us-east-1
6. Notify stakeholders
Expected RTO: 45 minutes
Expected RPO: 5 minutes (async replication lag)
Scenario 2: Database Corruption
Trigger: Data corruption detected in primary database
Impact: Data integrity compromised
Recovery Procedure:
1. Stop all writes to affected database
2. Identify last known good backup
$ aws s3 ls s3://db-backups/payment/ | tail -20
3. Restore from backup to new instance
$ ./scripts/db-restore.sh --backup-date 2024-01-15T10:00:00Z
4. Replay WAL (Write-Ahead Log) to RPO point
5. Redirect application to restored database
6. Verify data integrity with checksums
Expected RTO: 2 hours
Expected RPO: 5 minutes (point-in-time recovery)
Scenario 3: Ransomware / Security Incident
Trigger: Security team detects compromise
Impact: Unknown scope — assume worst case
Recovery Procedure:
1. Isolate compromised systems (security team leads)
2. Engage DR team and legal
3. Restore from known-clean backups (verify backup integrity)
4. Rebuild infrastructure from IaC (Terraform/K8s)
5. Rotate all credentials and certificates
6. Gradual traffic restoration with enhanced monitoring
Expected RTO: 4 hours
Expected RPO: 1 hour
─── BACKUP STRATEGY ───────────────────────────────────────────────
Database:
• Continuous WAL archiving to S3 (5-minute intervals)
• Full daily backups at 02:00 UTC (retention: 30 days)
• Weekly snapshot retained for 90 days
• Monthly snapshot retained for 1 year
• Backup verification: automated restore test every Sunday
Application Configuration:
• All infrastructure in Terraform (Git-backed)
• Kubernetes manifests in Git (GitOps)
• Application config in ConfigMaps (versioned)
• Recovery: `terraform apply` + `kubectl apply -k overlays/dr`
─── DR TESTING SCHEDULE ───────────────────────────────────────────
Quarterly: Tabletop walkthrough (2 hours, no actual failover)
• Review procedures with team
• Update for infrastructure changes
• Identify gaps
Semi-annually: Controlled failover test (4 hours)
• Failover to DR region during maintenance window
• Run smoke tests
• Failback to primary
• Document findings
Annually: Full DR simulation (full day)
• Simulate complete primary region loss
• Execute full recovery procedure
• Run production traffic in DR for 4 hours
• Full failback procedure
Multi-Region Active-Active Architecture
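Active-active architecture distributes traffic across multiple regions simultaneously. This provides the highest availability (read traffic survives a region failure with near-zero RTO) but is complex to implement, especially for stateful services. In the single-writer design shown below, writes still depend on one primary region, so write failover is manual and RPO is bounded by asynchronous replication lag rather than being zero.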
┌─────────────────────────────────────────────────────────────────┐
│             MULTI-REGION ACTIVE-ACTIVE ARCHITECTURE             │
│                                                                 │
│                    ┌──────────────┐                             │
│                    │   Route 53   │                             │
│                    │  (Geo-DNS)   │                             │
│                    └──────┬───────┘                             │
│                           │                                     │
│          ┌────────────────┼────────────────┐                    │
│          ▼                ▼                ▼                    │
│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐           │
│  │  us-west-2   │  │  us-east-1   │  │  eu-west-1   │           │
│  │  (Primary)   │  │   (Active)   │  │   (Active)   │           │
│  ├──────────────┤  ├──────────────┤  ├──────────────┤           │
│  │              │  │              │  │              │           │
│  │ K8s Cluster  │  │ K8s Cluster  │  │ K8s Cluster  │           │
│  │  ┌────────┐  │  │  ┌────────┐  │  │  ┌────────┐  │           │
│  │  │  API   │  │  │  │  API   │  │  │  │  API   │  │           │
│  │  │  Pods  │  │  │  │  Pods  │  │  │  │  Pods  │  │           │
│  │  └────────┘  │  │  └────────┘  │  │  └────────┘  │           │
│  │      │       │  │      │       │  │      │       │           │
│  │ ┌────┴─────┐ │  │ ┌────┴─────┐ │  │ ┌────┴─────┐ │           │
│  │ │DB Primary│ │  │ │DB Replica│ │  │ │DB Replica│ │           │
│  │ │ (writes) │ │  │ │ (reads)  │ │  │ │ (reads)  │ │           │
│  │ └────┬─────┘ │  │ └──────────┘ │  │ └──────────┘ │           │
│  │      │       │  │              │  │              │           │
│  │ ┌────┴─────┐ │  │              │  │              │           │
│  │ │  Cache   │ │  │              │  │              │           │
│  │ │ (Redis)  │ │  │              │  │              │           │
│  │ └──────────┘ │  │              │  │              │           │
│  └──────────────┘  └──────────────┘  └──────────────┘           │
│                                                                 │
│ Data Flow:                                                      │
│ • Writes → us-west-2 DB Primary → async replicate to replicas   │
│ • Reads → nearest region DB Replica                             │
│ • Caches → regional Redis (no cross-region cache sync)          │
│ • Sessions → JWT (stateless) or sticky to region                │
│                                                                 │
│ Conflict Resolution:                                            │
│ • Financial data: single-writer (us-west-2 only)                │
│ • User preferences: last-write-wins with vector clock           │
│ • Analytics: eventually consistent, no conflict resolution      │
│                                                                 │
│ Failover:                                                       │
│ • Automatic for reads (Route 53 health checks)                  │
│ • Manual for writes (DB promotion requires IC approval)         │
└─────────────────────────────────────────────────────────────────┘
Capacity Planning and Load Testing
Capacity planning ensures your infrastructure can handle expected load with headroom for spikes. Load testing validates your capacity plans and identifies breaking points before production traffic finds them.
═══════════════════════════════════════════════════════════════════
CAPACITY PLAN — Payment API
Q1 2024
═══════════════════════════════════════════════════════════════════
CURRENT STATE (January 2024):
┌──────────────────┬────────────┬────────────┬──────────────┐
│ Resource         │ Current    │ Capacity   │ Utilization  │
├──────────────────┼────────────┼────────────┼──────────────┤
│ API Pods         │ 15         │ 30 (K8s)   │ 50%          │
│ CPU per pod      │ 500m       │ 2000m      │ 25%          │
│ Memory per pod   │ 512Mi      │ 2Gi        │ 25%          │
│ DB Connections   │ 45         │ 100        │ 45%          │
│ DB CPU           │ 30%        │ 100%       │ 30%          │
│ Redis Memory     │ 2GB        │ 8GB        │ 25%          │
│ Network          │ 200 Mbps   │ 1 Gbps     │ 20%          │
└──────────────────┴────────────┴────────────┴──────────────┘
TRAFFIC PROJECTIONS:
┌──────────────────┬───────────┬───────────┬───────────┐
│ Metric           │ Jan (now) │ Mar (Q1)  │ Jun (Q2)  │
├──────────────────┼───────────┼───────────┼───────────┤
│ Peak RPS         │ 2,400     │ 3,600     │ 5,000     │
│ Avg RPS          │ 800       │ 1,200     │ 1,700     │
│ Daily requests   │ 69M       │ 104M      │ 147M      │
│ MAU              │ 12M       │ 15M       │ 20M       │
│ Growth rate      │ —         │ 50%       │ 39%       │
└──────────────────┴───────────┴───────────┴───────────┘
CAPACITY HEADROOM TARGET: 70% max utilization at peak
SCALING TRIGGER: 60% sustained utilization for 10 minutes
ACTIONS REQUIRED:
1. Pre-scale to 20 pods by Feb 1 (+$450/month)
2. Increase DB connection pool to 150 (code change + config)
3. Evaluate DB instance size upgrade (r5.2xlarge → r5.4xlarge)
4. Enable Horizontal Pod Autoscaler (min: 15, max: 40 pods)
# Kubernetes Horizontal Pod Autoscaler
# HorizontalPodAutoscaler for the payment-api Deployment.
# Keeps between 15 and 40 replicas, driven by average CPU and memory utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  namespace: payment
  name: payment-api-hpa
spec:
  # Workload whose replica count is managed.
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: payment-api
  minReplicas: 15
  maxReplicas: 40
  metrics:
    # Target 60% average CPU utilization across pods.
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 60
    # Target 70% average memory utilization across pods.
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 70
  behavior:
    # Scale up: 60 s stabilization window, at most +100% of replicas per 15 s.
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 100
          periodSeconds: 15
    # Scale down: at most -10% of replicas per 60 s.
    scaleDown:
      stabilizationWindowSeconds: 300  # 5 min cooldown before scaling down
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
Load Testing with k6
// payment-api-loadtest.js β k6 load testing script
// Run: k6 run --vus 100 --duration 10m payment-api-loadtest.js
// Or with stages for ramp-up: k6 run payment-api-loadtest.js
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend, Counter } from 'k6/metrics';
// Custom metrics
// Rate metric named 'errors'; the options thresholds and the summary read it by this name.
const errorRate = new Rate('errors');
// Trend metric named 'payment_latency'; the summary reads its percentiles by this name.
const paymentLatency = new Trend('payment_latency');
// Counter metric named 'successful_payments'.
const successCounter = new Counter('successful_payments');
// k6 run options: load profile, pass/fail thresholds and summary statistics.
export const options = {
  // Each stage ramps the VU count linearly from the previous target to
  // `target` over `duration`; no stage holds a constant load.
  stages: [
    { duration: '2m', target: 50 },    // Ramp 0 -> 50 VUs
    { duration: '5m', target: 200 },   // Ramp 50 -> 200 VUs
    { duration: '10m', target: 500 },  // Ramp 200 -> 500 VUs
    { duration: '5m', target: 1000 },  // Ramp 500 -> 1000 VUs (peak)
    { duration: '3m', target: 0 },     // Ramp down
  ],
  thresholds: {
    http_req_duration: ['p(95)<300'],  // 95% of requests under 300ms
    http_req_failed: ['rate<0.001'],   // Error rate below 0.1%
    errors: ['rate<0.001'],            // Custom 'errors' rate below 0.1%
  },
  // p(99) is not part of k6's default trend statistics; list it explicitly so
  // the end-of-test summary data contains it.
  summaryTrendStats: ['avg', 'min', 'med', 'max', 'p(90)', 'p(95)', 'p(99)'],
};
// Target host, overridable with the BASE_URL environment variable.
// NOTE(review): the fallback looks like a production hostname - confirm it is
// safe to load test by default, or require BASE_URL to be set explicitly.
const BASE_URL = __ENV.BASE_URL || 'https://api.payment.samsung.com';
// API key sent as a Bearer credential when requesting a token. There is no
// fallback: if API_KEY is unset the header value becomes "Bearer undefined".
const API_KEY = __ENV.API_KEY;
// Return `response.json(selector)`, or undefined when the body is not valid JSON
// (k6's Response.json() throws in that case, which would abort the iteration).
function jsonField(response, selector) {
  try {
    return response.json(selector);
  } catch (err) {
    return undefined;
  }
}

// One VU iteration: obtain a token, submit a payment, read the payment back,
// then pause for a random think time.
export default function () {
  // 1. Get auth token (once per VU iteration)
  const tokenResponse = http.post(`${BASE_URL}/v1/token`, null, {
    headers: { 'Authorization': `Bearer ${API_KEY}` },
  });
  const tokenOk = check(tokenResponse, {
    'token request status is 200': (r) => r.status === 200,
  });
  const token = tokenOk ? jsonField(tokenResponse, 'access_token') : undefined;
  if (!token) {
    // Without a token the payment call can only fail; record the error and
    // still honor the think time so a broken auth service is not hammered.
    errorRate.add(true);
    sleep(Math.random() * 2 + 1);
    return;
  }

  // 2. Process payment (main load test)
  const payload = JSON.stringify({
    // $10-$100, rounded to whole cents
    amount: Number((10.00 + Math.random() * 90.00).toFixed(2)),
    currency: 'USD',
    card_token: 'test_token_visa_' + Math.floor(Math.random() * 10),
    merchant_id: 'test_merchant',
    idempotency_key: `loadtest-${__VU}-${__ITER}-${Date.now()}`,
  });
  const paymentResponse = http.post(`${BASE_URL}/v1/payments`, payload, {
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${token}`,
    },
  });
  // Use k6's own request timing instead of wall-clock deltas, which also
  // include script overhead on the load generator.
  const latency = paymentResponse.timings.duration;
  paymentLatency.add(latency);
  const success = check(paymentResponse, {
    'payment status is 200': (r) => r.status === 200,
    'payment approved': (r) => jsonField(r, 'status') === 'approved',
    'payment latency < 300ms': () => latency < 300,
  });
  errorRate.add(!success);

  if (success) {
    successCounter.add(1);

    // 3. Get payment status (read operation)
    const paymentId = jsonField(paymentResponse, 'payment_id');
    const statusResponse = http.get(
      `${BASE_URL}/v1/payments/${paymentId}`,
      { headers: { 'Authorization': `Bearer ${token}` } }
    );
    check(statusResponse, {
      'status check is 200': (r) => r.status === 200,
    });
  }

  // Think time between iterations
  sleep(Math.random() * 2 + 1); // 1-3 seconds
}
// Build the end-of-test report: the raw summary as JSON plus a short text
// report on stdout.
//
// data: the summary object k6 passes in; metrics are read from
//       data.metrics[name].values.
// Returns an object mapping output targets to content. A statistic that is
// missing from the summary is printed as "n/a" instead of throwing.
export function handleSummary(data) {
  const metrics = (data && data.metrics) || {};

  // Look up one statistic of one metric; undefined when either is absent.
  const stat = (metric, name) => {
    const entry = metrics[metric];
    return entry && entry.values ? entry.values[name] : undefined;
  };
  // Format a number with fixed decimals and a unit suffix, or "n/a".
  const fmt = (value, digits, suffix) =>
    typeof value === 'number' ? `${value.toFixed(digits)}${suffix}` : 'n/a';
  // Multiply only when the statistic exists, so a missing value stays "n/a".
  const scaled = (value, factor) =>
    typeof value === 'number' ? value * factor : undefined;
  const plain = (value) => (value === undefined ? 'n/a' : value);

  const report = [
    '',
    'Payment API Load Test Summary',
    '=============================',
    `Peak VUs: ${plain(stat('vus_max', 'max'))}`,
    `Requests: ${plain(stat('http_reqs', 'count'))}`,
    `Error Rate: ${fmt(scaled(stat('errors', 'rate'), 100), 2, '%')}`,
    // p50 is the median ('med'), not the average.
    `p50 Latency: ${fmt(stat('payment_latency', 'med'), 0, 'ms')}`,
    `p95 Latency: ${fmt(stat('payment_latency', 'p(95)'), 0, 'ms')}`,
    // Only present when 'p(99)' is listed in options.summaryTrendStats.
    `p99 Latency: ${fmt(stat('payment_latency', 'p(99)'), 0, 'ms')}`,
    // http_reqs rate is per second; convert to per minute.
    `Throughput: ${fmt(scaled(stat('http_reqs', 'rate'), 60), 0, ' req/min')}`,
    '',
  ];

  return {
    'loadtest-summary.json': JSON.stringify(data),
    stdout: report.join('\n'),
  };
}
Related Topics
- Monitoring & Observability β Instrumentation for reliability measurement
- Incident Management β Responding to reliability failures
- SLO / SLI / SLA β Defining and measuring reliability targets
- SRE Principles β Error budgets and toil elimination
Last updated: June 2026 | Author: SRE Team