41 pages · 8 sections
Ctrl K
GitHub Portfolio

Reliability Engineering

Building reliable systems requires architectural patterns for fault tolerance, graceful degradation, and proactive failure management. This guide covers circuit breakers, bulkheads, retry strategies, chaos engineering, health checks, disaster recovery planning, and multi-region architecture with production-ready implementation examples.

Reliability Design Patterns

Reliability patterns are reusable architectural solutions to common failure scenarios. Applying these patterns systematically is the difference between a service that fails gracefully and one that fails catastrophically.

Pattern Problem Solved When to Use
Circuit Breaker Cascade failures from a failing dependency Every external service call
Bulkhead Resource exhaustion spreading across components Thread pools, connection pools, request queues
Retry + Backoff Transient failures causing unnecessary errors Idempotent operations only
Graceful Degradation Total failure when non-critical features fail Features with fallback content (recommendations, analytics)
Health Checks Load balancers sending traffic to unhealthy instances Every service behind a load balancer
Rate Limiting Overload from unexpected traffic spikes Public APIs, authentication endpoints
Timeouts Resources held indefinitely by slow dependencies Every external call (DB, HTTP, cache)

Circuit Breaker Pattern

The circuit breaker prevents cascade failures by stopping requests to a failing dependency. When errors exceed a threshold, the circuit "opens" and requests fail fast. After a cooldown, the circuit enters "half-open" state to test if the dependency has recovered.

"""
circuit_breaker.py — Production-grade circuit breaker implementation.

States:
  CLOSED    — Normal operation, requests pass through
  OPEN      — Failure threshold exceeded, requests fail fast
  HALF_OPEN — Testing if dependency recovered (limited requests)

Usage:
    breaker = CircuitBreaker(
        failure_threshold=5,
        recovery_timeout=30,
        half_open_max_calls=3
    )
    
    @breaker
    def call_payment_gateway(request):
        return requests.post(PAYMENT_GATEWAY_URL, json=request)
"""

import time
import threading
import logging
from enum import Enum
from functools import wraps
from typing import Callable, Any, Optional

logger = logging.getLogger(__name__)

class CircuitState(Enum):
    """Lifecycle states of a circuit breaker."""

    # Requests flow through to the dependency as usual.
    CLOSED = "closed"
    # Requests are rejected immediately instead of reaching the dependency.
    OPEN = "open"
    # A limited number of trial requests probe whether the dependency recovered.
    HALF_OPEN = "half_open"

class CircuitBreakerOpen(Exception):
    """Signals that a call was rejected because the breaker is not admitting requests."""

class CircuitBreaker:
    """Thread-safe circuit breaker usable as a decorator or through ``call``.

    State machine:
      * CLOSED: calls pass through. Each counted failure increments a counter
        and each success decrements it (never below zero); reaching
        ``failure_threshold`` opens the circuit.
      * OPEN: calls raise ``CircuitBreakerOpen`` until ``recovery_timeout``
        seconds have elapsed since the last recorded failure.
      * HALF_OPEN: at most ``half_open_max_calls`` trial calls are admitted.
        That many successes close the circuit; a single counted failure
        re-opens it.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 30.0,
        half_open_max_calls: int = 3,
        expected_exception: type = Exception,
        name: str = "default"
    ):
        """
        Args:
            failure_threshold: Number of failures before opening circuit
            recovery_timeout: Seconds to wait before trying recovery
            half_open_max_calls: Max test calls in half-open state
            expected_exception: Exception type that counts as failure
            name: Circuit breaker identifier for metrics/logging
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.half_open_max_calls = half_open_max_calls
        self.expected_exception = expected_exception
        self.name = name

        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._half_open_calls = 0
        self._last_failure_time: Optional[float] = None
        # Re-entrant because ``call`` and ``get_metrics`` read ``state`` while
        # already holding the lock.
        self._lock = threading.RLock()

    @property
    def state(self) -> CircuitState:
        """Current state; moves OPEN -> HALF_OPEN once the recovery timeout elapsed."""
        with self._lock:
            if self._state == CircuitState.OPEN:
                # Check if recovery timeout has elapsed
                if self._last_failure_time and \
                   (time.time() - self._last_failure_time) >= self.recovery_timeout:
                    logger.info(
                        "[%s] Recovery timeout elapsed, entering HALF_OPEN", self.name
                    )
                    self._state = CircuitState.HALF_OPEN
                    self._half_open_calls = 0
                    self._success_count = 0
            return self._state

    def call(self, func: Callable, *args, **kwargs) -> Any:
        """Execute ``func(*args, **kwargs)`` with circuit breaker protection.

        Returns:
            Whatever ``func`` returns.

        Raises:
            CircuitBreakerOpen: if the circuit is OPEN, or HALF_OPEN with all
                trial slots already taken.
            Exception: anything ``func`` raises is re-raised unchanged. Only
                exceptions matching ``expected_exception`` count as failures.
        """
        with self._lock:
            current_state = self.state

            if current_state == CircuitState.OPEN:
                raise CircuitBreakerOpen(
                    f"Circuit breaker '{self.name}' is OPEN. "
                    f"Last failure: {self._last_failure_time}"
                )

            is_trial = current_state == CircuitState.HALF_OPEN
            if is_trial:
                if self._half_open_calls >= self.half_open_max_calls:
                    raise CircuitBreakerOpen(
                        f"Circuit breaker '{self.name}' HALF_OPEN limit reached"
                    )
                self._half_open_calls += 1

        # Execute outside the lock to allow concurrency
        try:
            result = func(*args, **kwargs)
        except self.expected_exception:
            self._on_failure()
            raise
        except BaseException:
            # Not a counted failure and not a success. Give the trial slot back,
            # otherwise the breaker could sit in HALF_OPEN forever with every
            # slot used and too few recorded successes to close.
            if is_trial:
                self._release_half_open_slot()
            raise
        self._on_success()
        return result

    def _release_half_open_slot(self):
        """Return a trial slot taken by a call that produced no countable outcome."""
        with self._lock:
            if self._state == CircuitState.HALF_OPEN and self._half_open_calls > 0:
                self._half_open_calls -= 1

    def _on_success(self):
        """Record a success: may close a HALF_OPEN circuit or decay the failure count."""
        with self._lock:
            if self._state == CircuitState.HALF_OPEN:
                self._success_count += 1
                # If we've had enough successes in half-open, close the circuit
                if self._success_count >= self.half_open_max_calls:
                    logger.info("[%s] Recovery confirmed, circuit CLOSED", self.name)
                    self._state = CircuitState.CLOSED
                    self._failure_count = 0
                    self._success_count = 0
            elif self._state == CircuitState.CLOSED:
                # Successes slowly forgive earlier failures.
                self._failure_count = max(0, self._failure_count - 1)

    def _on_failure(self):
        """Record a failure: re-opens a HALF_OPEN circuit or opens at the threshold."""
        with self._lock:
            self._failure_count += 1
            self._last_failure_time = time.time()

            if self._state == CircuitState.HALF_OPEN:
                logger.warning("[%s] Recovery failed, circuit OPEN", self.name)
                self._state = CircuitState.OPEN
            elif self._failure_count >= self.failure_threshold:
                logger.error(
                    "[%s] Failure threshold reached (%s), circuit OPEN",
                    self.name,
                    self._failure_count,
                )
                self._state = CircuitState.OPEN

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage: @circuit_breaker_instance"""
        @wraps(func)
        def wrapper(*args, **kwargs):
            return self.call(func, *args, **kwargs)
        return wrapper

    def get_metrics(self) -> dict:
        """Export a consistent snapshot of the breaker's counters for monitoring."""
        # Hold the lock so state, count and timestamp come from the same moment.
        with self._lock:
            return {
                "circuit_breaker_name": self.name,
                "state": self.state.value,
                "failure_count": self._failure_count,
                "failure_threshold": self.failure_threshold,
                "last_failure_time": self._last_failure_time,
            }


# ═══════════════════════════════════════════════════════════════
# Integration with Flask/FastAPI for HTTP client calls
# ═══════════════════════════════════════════════════════════════

# One breaker per downstream dependency, each with its own thresholds.
payment_gateway_cb = CircuitBreaker(
    name="payment_gateway",
    failure_threshold=5,
    recovery_timeout=30,
    half_open_max_calls=3,
)

fraud_service_cb = CircuitBreaker(
    name="fraud_service",
    failure_threshold=3,
    recovery_timeout=60,
    half_open_max_calls=2,
)

@payment_gateway_cb
def charge_card(card_token: str, amount: float) -> dict:
    """Submit a charge to the payment gateway and return the decoded JSON reply.

    The call runs through ``payment_gateway_cb``; HTTP error statuses are
    turned into exceptions by ``raise_for_status``.
    """
    payload = {"card_token": card_token, "amount": amount}
    gateway_reply = requests.post(PAYMENT_GATEWAY_URL, json=payload, timeout=5)
    gateway_reply.raise_for_status()
    return gateway_reply.json()

# In your service code:
def charge_or_queue(card_token: str, amount: float) -> dict:
    """Charge the card, or queue the payment when the gateway breaker is open.

    Example:
        result = charge_or_queue("tok_visa", 99.99)

    Returns:
        The gateway response from ``charge_card``, or a ``{"status": "queued", ...}``
        placeholder when the call was rejected by the circuit breaker.

    Raises:
        requests.RequestException: gateway errors are logged and re-raised.
    """
    try:
        return charge_card(card_token, amount)
    except CircuitBreakerOpen:
        # Circuit is open — fail fast and return a graceful response
        logger.warning("Payment gateway circuit open — queueing for async processing")
        queue_for_async_processing(card_token=card_token, amount=amount)
        return {"status": "queued", "message": "Payment will be processed shortly"}
    except requests.RequestException as e:
        # Other error (timeout, 5xx, etc.) — circuit breaker will count this
        logger.error(f"Payment gateway error: {e}")
        raise

Bulkhead Pattern (Resource Isolation)

The bulkhead pattern isolates resources so that a failure in one component cannot exhaust shared resources and affect other components. Named after ship bulkheads that prevent flooding from spreading across compartments.

"""
bulkhead.py — Thread pool-based bulkhead implementation.

Isolates different request types into separate thread pools,
preventing one slow operation from consuming all threads.

Usage:
    payment_bulkhead = Bulkhead(max_concurrent=20, max_queue=100)
    
    @payment_bulkhead
    def process_payment(payment_request):
        # This runs in the payment thread pool
        ...
"""

from concurrent.futures import ThreadPoolExecutor, Future
from functools import wraps
from queue import Queue, Full
import threading
import time
from typing import Callable, Any

class BulkheadFull(Exception):
    """Signals that a bulkhead rejected work because it has no spare capacity."""

class BulkheadTimeout(Exception):
    """Signals that waiting in the bulkhead queue timed out."""
    # NOTE(review): nothing in the visible code raises this — confirm it is used.

class Bulkhead:
    """Resource isolation bulkhead backed by a dedicated thread pool.

    A semaphore with ``max_concurrent + max_queue`` permits bounds the number
    of submissions that are running or waiting. One permit is held per
    submission and is always returned, whether the work ran, was cancelled
    before running, or could not be scheduled at all.
    """

    def __init__(
        self,
        max_concurrent: int = 20,
        max_queue: int = 100,
        name: str = "default"
    ):
        """
        Args:
            max_concurrent: Maximum concurrent executions
            max_queue: Maximum queued waiting executions
            name: Bulkhead identifier
        """
        self.name = name
        self.max_concurrent = max_concurrent
        self.max_queue = max_queue

        # ThreadPoolExecutor: max_concurrent workers + max_queue backlog
        self._executor = ThreadPoolExecutor(
            max_workers=max_concurrent,
            thread_name_prefix=f"bulkhead-{name}"
        )
        self._active_count = 0
        # Kept for compatibility; not updated by this class.
        self._queue_size = 0
        self._lock = threading.Lock()
        self._semaphore = threading.Semaphore(max_concurrent + max_queue)

    def submit(self, func: Callable, *args, **kwargs) -> Future:
        """Schedule ``func(*args, **kwargs)`` on the bulkhead's pool.

        Returns:
            The ``Future`` for the scheduled call.

        Raises:
            BulkheadFull: if every running and queued slot is taken.
            RuntimeError: propagated from the executor, e.g. after ``shutdown``.
        """
        if not self._semaphore.acquire(blocking=False):
            raise BulkheadFull(
                f"Bulkhead '{self.name}' at capacity: "
                f"{self.max_concurrent} active + {self.max_queue} queued"
            )

        def run_in_pool():
            with self._lock:
                self._active_count += 1
            try:
                return func(*args, **kwargs)
            finally:
                with self._lock:
                    self._active_count -= 1
                self._semaphore.release()

        try:
            future = self._executor.submit(run_in_pool)
        except BaseException:
            # Scheduling failed, so run_in_pool will never release the permit.
            self._semaphore.release()
            raise

        def release_if_cancelled(done: Future) -> None:
            # A future cancelled while still queued never executes run_in_pool,
            # so its permit has to be returned here instead.
            if done.cancelled():
                self._semaphore.release()

        future.add_done_callback(release_if_cancelled)
        return future

    def call(self, func: Callable, *args, **kwargs) -> Any:
        """Execute function synchronously with bulkhead protection.

        Blocks until the pooled call finishes, then returns its result or
        re-raises its exception.
        """
        future = self.submit(func, *args, **kwargs)
        return future.result()

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage: @bulkhead_instance"""
        @wraps(func)
        def wrapper(*args, **kwargs):
            return self.call(func, *args, **kwargs)
        return wrapper

    def get_metrics(self) -> dict:
        """Return a snapshot of configured limits and currently running calls."""
        with self._lock:
            return {
                "bulkhead_name": self.name,
                "max_concurrent": self.max_concurrent,
                "max_queue": self.max_queue,
                "active": self._active_count,
                "available": self.max_concurrent - self._active_count,
            }

    def shutdown(self):
        """Stop accepting work and wait for already-submitted work to finish."""
        self._executor.shutdown(wait=True)


# ═══════════════════════════════════════════════════════════════
# Multiple bulkheads for different operation types
# ═══════════════════════════════════════════════════════════════

# Critical path: the most workers, with a moderate backlog allowance
payment_bulkhead = Bulkhead(
    name="payment",
    max_concurrent=50,
    max_queue=200,
)

# Non-critical: few workers but a deep queue, since background tasks can wait
analytics_bulkhead = Bulkhead(
    name="analytics",
    max_concurrent=10,
    max_queue=500,
)

# External API: capped concurrency and a short queue to avoid overwhelming third parties
external_api_bulkhead = Bulkhead(
    name="external_api",
    max_concurrent=20,
    max_queue=50,
)

# In your service:
def handle_payment_request(request):
    """Run ``process_payment`` for *request* inside the payment bulkhead.

    Raises:
        ServiceUnavailable: when the payment bulkhead rejects the request
            because it is at capacity.
    """
    try:
        # Payment work is confined to its own pool.
        outcome = payment_bulkhead.call(process_payment, request)
    except BulkheadFull:
        # Surfaces as a 503; the client is expected to retry with backoff.
        raise ServiceUnavailable("Payment system at capacity, please retry")
    return outcome

Retry with Exponential Backoff and Jitter

Retry is appropriate only for idempotent operations and transient failures. Without jitter, synchronized retries from multiple clients can create thundering herd problems. Full jitter is recommended for most cases.

"""
retry.py — Exponential backoff with full jitter for resilient retries.

Implements the AWS-recommended exponential backoff with full jitter:
    sleep = random_between(0, min(cap, base * 2^attempt))

Usage:
    @retry(max_attempts=5, base_delay=0.1, max_delay=30.0)
    def call_external_api(request):
        ...
"""

import random
import time
import logging
from functools import wraps
from typing import Callable, Any, Optional, Tuple, Type

logger = logging.getLogger(__name__)

class RetryExhausted(Exception):
    """Raised once every permitted attempt has failed."""

def retry(
    max_attempts: int = 3,
    base_delay: float = 0.1,      # 100ms
    max_delay: float = 60.0,       # 60 seconds
    exponential_base: float = 2.0,
    jitter: bool = True,
    retryable_exceptions: Tuple[Type[Exception], ...] = (Exception,),
    on_retry: Optional[Callable[[Exception, int, float], None]] = None,
    on_exhausted: Optional[Callable[[Exception, int], None]] = None
):
    """
    Retry decorator with exponential backoff and full jitter.

    Args:
        max_attempts: Maximum number of attempts (including first); must be >= 1
        base_delay: Initial delay between retries (seconds); must be >= 0
        max_delay: Maximum delay between retries (seconds); must be >= 0
        exponential_base: Multiplier for exponential growth
        jitter: If True, apply full random jitter
        retryable_exceptions: Exception types that trigger retry
        on_retry: Callback(exception, attempt, delay) called before each retry
        on_exhausted: Callback(exception, attempts) called when all retries fail

    Returns:
        A decorator. The decorated function returns the first successful
        result, lets non-retryable exceptions propagate unchanged, and raises
        ``RetryExhausted`` (chained to the last error) when every attempt fails.

    Raises:
        ValueError: at decoration time, if ``max_attempts`` is below 1 or a
            delay is negative.
    """
    # Reject configurations that would otherwise never call the function
    # (max_attempts <= 0) or crash inside time.sleep (negative delay).
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")
    if base_delay < 0 or max_delay < 0:
        raise ValueError("base_delay and max_delay must be non-negative")

    def decorator(func: Callable) -> Callable:
        # functools.partial and some callables have no __name__.
        func_name = getattr(func, "__name__", repr(func))

        @wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except retryable_exceptions as e:
                    if attempt >= max_attempts:
                        if on_exhausted:
                            on_exhausted(e, attempt)
                        raise RetryExhausted(
                            f"Failed after {max_attempts} attempts: {e}"
                        ) from e

                    # Calculate backoff with full jitter
                    try:
                        backoff = base_delay * (exponential_base ** (attempt - 1))
                    except OverflowError:
                        # Float power overflowed; the cap applies regardless.
                        backoff = max_delay
                    delay = min(max_delay, backoff)
                    if jitter:
                        delay = random.uniform(0, delay)

                    logger.warning(
                        "Attempt %d/%d failed for %s: %s. Retrying in %.2fs",
                        attempt, max_attempts, func_name, e, delay,
                    )

                    if on_retry:
                        on_retry(e, attempt, delay)

                    time.sleep(delay)

            # Should not reach here
            raise RetryExhausted("Unexpected exit from retry loop")

        return wrapper
    return decorator


# ═══════════════════════════════════════════════════════════════
# Usage Examples
# ═══════════════════════════════════════════════════════════════

# 1. Basic retry for database queries
@retry(
    retryable_exceptions=(DatabaseTransientError, ConnectionError),
    max_attempts=5,
    base_delay=0.1,
    max_delay=5.0,
)
def get_user_profile(user_id: str) -> dict:
    """Query the ``users`` table for *user_id*, retrying the errors listed in the decorator."""
    sql = "SELECT * FROM users WHERE id = %s"
    params = (user_id,)
    return db.query(sql, params)

# 2. Retry with metrics callback for observability
# NOTE(review): ``Counter`` is presumably a metrics-client counter (name, help text, label names) — confirm the import.
attempt_counter = Counter(
    'retry_attempts_total',
    'Retry attempts',
    ['function'],
)


def record_retry(exception, attempt, delay):
    """``on_retry`` hook: bump the retry counter under the ``call_payment_api`` label."""
    labelled = attempt_counter.labels(function='call_payment_api')
    labelled.inc()

@retry(
    retryable_exceptions=(requests.RequestException,),
    on_retry=record_retry,
    max_attempts=3,
    base_delay=0.5,
)
def call_payment_api(payload: dict) -> dict:
    """POST *payload* to the payment API and return the decoded JSON body.

    Request errors, including error statuses raised by ``raise_for_status``,
    are retried by the decorator, and each retry is reported to ``record_retry``.
    """
    api_reply = requests.post(PAYMENT_API_URL, json=payload, timeout=10)
    api_reply.raise_for_status()
    return api_reply.json()

# 3. Retry with circuit breaker combined
@payment_gateway_cb  # Outermost: the breaker wraps the whole retried call and fails fast when open
@retry(
    retryable_exceptions=(requests.RequestException,),
    max_attempts=3,
    base_delay=0.1,
)
def charge_with_resilience(card_token: str, amount: float) -> dict:
    """Charge a card with retry for transient errors, guarded by the gateway breaker.

    Because the breaker is the outer decorator, it observes one outcome per
    fully retried call rather than one per HTTP attempt.
    """
    payload = {"card_token": card_token, "amount": amount}
    gateway_reply = requests.post(PAYMENT_GATEWAY_URL, json=payload, timeout=5)
    gateway_reply.raise_for_status()
    return gateway_reply.json()

# Backoff delay calculation (with jitter) for visualization:
# Attempt 1: delay = random(0, 0.1)   = ~50ms average
# Attempt 2: delay = random(0, 0.2)   = ~100ms average
# Attempt 3: delay = random(0, 0.4)   = ~200ms average
# Attempt 4: delay = random(0, 0.8)   = ~400ms average
# Attempt 5: delay = random(0, 1.6)   = ~800ms average
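# Attempt 6-10: upper bound keeps doubling (3.2s, 6.4s, ... 51.2s)
# Attempt 11+: upper bound capped at max_delay (60s), since 0.1 * 2^10 = 102.4s > 60s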
Retry Safety Rule: Never retry non-idempotent operations (POST that creates resources, financial transactions) without a deduplication mechanism. A retried payment request could result in double-charging. Use idempotency keys: Idempotency-Key: <UUID> — the server stores processed keys and ignores duplicates.

Graceful Degradation Strategies

Graceful degradation ensures that when non-critical features fail, the core functionality continues working. Users get a reduced but functional experience instead of a complete failure.

# Example: E-commerce product page with graceful degradation

class ProductPageService:
    """Renders product page with graceful degradation for non-critical features."""
    # NOTE(review): ``_with_timeout`` and ``_get_degraded_features`` are called
    # below but not defined in this class as shown — confirm where they live.

    def __init__(self):
        self.product_service = ProductService()
        self.recommendation_service = RecommendationService()
        self.review_service = ReviewService()
        self.inventory_service = InventoryService()
        self.price_service = PriceService()

    def get_product_page(self, product_id: str, user_id: str) -> ProductPage:
        """Assemble the product page, substituting fallbacks for failed features.

        Only the product lookup is allowed to fail the request. Every other
        feature is replaced by a fallback when it times out or raises
        ``ServiceError``.
        """
        # CRITICAL: Product details — must succeed
        # If this fails, the exception propagates (core functionality broken)
        product = self.product_service.get_product(product_id)

        # NON-CRITICAL: Recommendations — degrade to popular items
        try:
            recommendations = self._with_timeout(
                lambda: self.recommendation_service.get_for_user(user_id, product_id),
                timeout_ms=200
            )
        except (TimeoutError, ServiceError):
            logger.warning("Recommendations failed, using fallback")
            recommendations = self._safe_popular_fallback()

        # NON-CRITICAL: Reviews — degrade to empty list
        try:
            reviews = self._with_timeout(
                lambda: self.review_service.get_reviews(product_id),
                timeout_ms=300
            )
        except (TimeoutError, ServiceError):
            logger.warning("Reviews failed, showing empty")
            reviews = []

        # CRITICAL: Price — must succeed for purchase
        # But can show "price unavailable" and disable buy button
        try:
            price = self._with_timeout(
                lambda: self.price_service.get_price(product_id),
                timeout_ms=100
            )
        except (TimeoutError, ServiceError):
            logger.error("Price service failed — disabling purchase")
            price = None

        # NON-CRITICAL: Inventory — default to "in stock"
        try:
            inventory = self._with_timeout(
                lambda: self.inventory_service.get_inventory(product_id),
                timeout_ms=100
            )
        except (TimeoutError, ServiceError):
            logger.warning("Inventory check failed, defaulting to in-stock")
            inventory = InventoryStatus.IN_STOCK

        return ProductPage(
            product=product,                    # Critical — always present
            recommendations=recommendations,     # Degraded to popular
            reviews=reviews,                     # Degraded to empty
            price=price,                         # None = purchase disabled
            inventory=inventory,                 # Default in-stock
            degraded_features=self._get_degraded_features()
        )

    def _safe_popular_fallback(self) -> list:
        """Return the popular-items fallback, or an empty list if that fails too.

        The fallback touches both the cache and the product service. Without
        this guard, an error there would escape the recommendation handler and
        fail the whole page for a non-critical feature.
        """
        try:
            return self._get_popular_fallback()
        except Exception:
            logger.exception("Popular-products fallback failed, showing no recommendations")
            return []

    def _get_popular_fallback(self) -> List[Product]:
        """Return globally popular products when personalized recs fail."""
        # NOTE(review): a cache hit returns whatever json.loads yields (the
        # stored to_dict() payloads), while a miss returns the objects from
        # get_popular — callers see two different shapes. Confirm which one
        # ProductPage expects.
        cache_key = "popular_products"
        cached = redis.get(cache_key)
        if cached:
            return json.loads(cached)
        popular = self.product_service.get_popular(limit=10)
        # Cache for one hour.
        redis.setex(cache_key, 3600, json.dumps([p.to_dict() for p in popular]))
        return popular

# Response with graceful degradation:
# {
#   "product": { ... full product details ... },  ← CRITICAL: complete
#   "recommendations": [ ... popular items ... ],  ← degraded from personalized
#   "reviews": [],                                   ← degraded to empty
#   "price": null,                                   ← degraded, purchase disabled
#   "inventory": "IN_STOCK",                         ← default assumption
#   "degraded_features": ["recommendations", "price"],
#   "message": "Some features are temporarily unavailable."
# }

Chaos Engineering Principles

Chaos engineering is the discipline of experimenting on a system to build confidence in its capability to withstand turbulent conditions. It is not about breaking things randomly — it is about proving that your resilience mechanisms work.

Core Principles

  1. Start with a steady-state hypothesis: Define what "normal" looks like before injecting failure. Your experiment validates that the system maintains this state during failure.
  2. Vary real-world events: Inject failures that actually happen: server crashes, network latency, dependency failures, certificate expiry.
  3. Run in production: Staging environments rarely match production. Start with canary/small blast radius in production, or use shadow traffic.
  4. Automate to run continuously: Manual chaos experiments are valuable but episodic. Automated chaos (like Netflix's Simian Army) provides ongoing validation.
  5. Minimize blast radius: Every experiment must have an abort criterion and a quick rollback path. Start with 1% of traffic, not 100%.

Litmus Chaos Experiment (Complete YAML)

LitmusChaos is a Cloud Native Computing Foundation (CNCF) incubating project for Kubernetes chaos engineering. Below is a complete experiment that validates a service's resilience to pod failure.

═══════════════════════════════════════════════════════════════════
            LITMUS CHAOS EXPERIMENT: Pod Failure
            Target: payment-api deployment
═══════════════════════════════════════════════════════════════════

---
# 1. ChaosEngine — binds the target application to the experiments to run against it
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: payment-api-pod-failure
  namespace: litmus
spec:
  # Target: the payment-api deployment in the 'payment' namespace
  appinfo:
    appns: 'payment'
    applabel: 'app=payment-api'
    appkind: 'deployment'
  
  # Only inject chaos into applications annotated with litmuschaos.io/chaos: "true"
  annotationCheck: 'true'
  engineState: 'active'
  
  # Additional applications to health-check during chaos (none here).
  # NOTE(review): no chaosServiceAccount is set in this manifest — confirm how
  # the experiment obtains its RBAC permissions.
  auxiliaryAppInfo: ''
  
  # Experiments to run against the target application
  experiments:
    - name: pod-delete
      spec:
        components:
          env:
            # Total duration of chaos injection (seconds)
            - name: TOTAL_CHAOS_DURATION
              value: '60'
            
            # Seconds between successive pod deletions
            - name: CHAOS_INTERVAL
              value: '10'
            
            # 'false' = graceful deletion; set to 'true' to force delete
            - name: FORCE
              value: 'false'
            
            # Percentage of matching pods to delete
            - name: PODS_AFFECTED_PERC
              value: '25'
            
            # Container name inside the target pod
            # NOTE(review): confirm the pod-delete experiment honours this tunable
            - name: TARGET_CONTAINER
              value: 'payment-api'
        
        # Probes — validate the steady-state hypothesis.
        # The ChaosEngine schema names this field `probe` (singular).
        probe:
          - name: "payment-api-health-check"
            type: "httpProbe"
            mode: "Continuous"
            runProperties:
              probeTimeout: "5s"
              retry: 2
              interval: "5s"
              probePollingInterval: "2s"
              initialDelay: "2s"
            httpProbe/inputs:
              url: "http://payment-api.payment.svc.cluster.local:8080/health"
              insecureSkipVerify: false
              method:
                get:
                  criteria: ==
                  responseCode: "200"

---
# 2. ChaosSchedule — run this experiment automatically every week
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosSchedule
metadata:
  name: payment-api-weekly-chaos
  namespace: litmus
spec:
  schedule:
    repeat:
      properties:
        # Minimum gap between two consecutive runs
        minChaosInterval:
          hour:
            everyNthHour: 168  # Every 7 days
      workHours:
        includedHours: 9-17  # Business hours only
  # Engine definition used for each scheduled run: same target and
  # pod-delete settings (60s duration, 10s interval, 25% of pods)
  engineTemplateSpec:
    appinfo:
      appns: 'payment'
      applabel: 'app=payment-api'
      appkind: 'deployment'
    experiments:
      - name: pod-delete
        spec:
          components:
            env:
              - name: TOTAL_CHAOS_DURATION
                value: '60'
              - name: CHAOS_INTERVAL
                value: '10'
              - name: PODS_AFFECTED_PERC
                value: '25'

---
# 3. ChaosResult β€” view experiment results
# kubectl get chaosresult -n litmus
# 
# Expected output when resilience is working:
# NAME                          VERDICT   PHASE
# payment-api-pod-failure       Pass      Completed
#
# VERDICT values:
#   Pass    = Steady-state maintained during chaos
#   Fail    = Steady-state breached (probe failed)
#   Awaited = Experiment still running
#   Stopped = Experiment was manually aborted

# SAFETY GUARDRAILS:
# 1. Always set workHours to business hours when starting
# 2. Start with PODS_AFFECTED_PERC: 10 (small blast radius)
# 3. Ensure abort criteria are defined:
#    - Error rate > 1% for 30s → auto-abort
#    - Latency p95 > 5s → auto-abort
# 4. Never run chaos experiments:
#    - During deployments
#    - During known-fragile periods (Black Friday)
#    - Without an observer present

Health Check Endpoint Design

Kubernetes and load balancers use health checks to determine whether to send traffic to a pod. Proper health check design is critical — a misconfigured health check can cause cascading failures by marking healthy pods as unhealthy.

# health_checks.py — Comprehensive health check implementation

from fastapi import FastAPI, HTTPException, status
from pydantic import BaseModel
from typing import Dict, List, Optional
import asyncio
import time

app = FastAPI()


# Response schema for a health evaluation.
class HealthStatus(BaseModel):
    # Overall verdict: "pass", "fail", or "warn"
    status: str
    # Per-check details, keyed by check name
    checks: Dict[str, dict]
    version: str
    uptime_seconds: float

# ─── /live (Liveness Probe) ─────────────────────────────────────
# Tells Kubernetes: "Should this pod be killed and restarted?"
# 
# Use sparingly — only return fail when the application is stuck
# and cannot recover without restart. A failing liveness probe
# causes Kubernetes to restart the container.

@app.get("/health/live", status_code=status.HTTP_200_OK)
async def liveness_probe():
    """
    Liveness probe: report that the process is up and able to answer.

    Deliberately touches no external dependency, so a dependency outage
    cannot make this probe fail and trigger container restarts.
    """
    payload = {"status": "alive"}
    payload["timestamp"] = time.time()
    return payload

# ─── /ready (Readiness Probe) ────────────────────────────────────
# Tells Kubernetes: "Should this pod receive traffic?"
# 
# Check all dependencies required to serve traffic. If a
# dependency is down, return non-200 so the pod is removed
# from the load balancer rotation.

class ReadinessChecker:
    """Runs the dependency checks behind the readiness endpoint, with a short cache.

    ``check`` always returns ``(results, overall_ready)``, whether or not the
    result comes from the cache. Each checker is a zero-argument coroutine
    function; its return value is reported as ``response_time_ms``.
    """
    # NOTE(review): the ``_check_*`` coroutines registered in ``__init__`` are
    # not defined in this class as shown — confirm where they are provided.

    # Checks whose failure marks the pod not-ready.
    critical_checks = ('database', 'redis')
    # Per-check time limit in seconds.
    check_timeout = 3.0

    def __init__(self):
        self.checks = {
            'database': self._check_database,
            'redis': self._check_redis,
            'upstream-payment-gateway': self._check_payment_gateway,
        }
        self.last_check_time = 0
        self.cached_result = None
        self.cached_ready = True
        self.cache_ttl = 5  # Cache readiness for 5 seconds

    async def _run_check(self, name: str, checker) -> tuple:
        """Run one check and return ``(name, result_entry, keeps_pod_ready)``."""
        try:
            elapsed = await asyncio.wait_for(checker(), timeout=self.check_timeout)
        except asyncio.TimeoutError:
            # Don't mark as fail — may be transient
            return name, {"status": "warn", "error": "timeout"}, True
        except Exception as e:
            # Only critical dependencies cause not-ready
            entry = {"status": "fail", "error": str(e)}
            return name, entry, name not in self.critical_checks
        return name, {"status": "pass", "response_time_ms": elapsed}, True

    async def check(self) -> tuple:
        """Return ``(results, overall_ready)`` for all registered checks.

        ``results`` maps each check name to its status entry, and
        ``overall_ready`` is False only when a critical check raised. The
        returned dict is a copy, so callers may add keys without affecting
        the cache.
        """
        # Cache results to prevent health check storms
        now = time.time()
        if self.cached_result is not None and (now - self.last_check_time) < self.cache_ttl:
            return dict(self.cached_result), self.cached_ready

        # Run checks concurrently
        outcomes = await asyncio.gather(
            *(self._run_check(name, checker) for name, checker in self.checks.items())
        )
        results = {name: entry for name, entry, _ in outcomes}
        overall_ready = all(keeps_ready for _, _, keeps_ready in outcomes)

        self.cached_result = results
        self.cached_ready = overall_ready
        self.last_check_time = now

        return dict(results), overall_ready

readiness_checker = ReadinessChecker()

@app.get("/health/ready", status_code=status.HTTP_200_OK)
async def readiness_probe():
    """
    Readiness probe: Can this pod serve traffic?

    Runs the shared readiness checker and responds with 200 when it reports
    ready, or 503 otherwise. The body carries the per-check results.
    """
    # Imported here because the module-level imports do not include JSONResponse.
    from fastapi.responses import JSONResponse

    checks, is_ready = await readiness_checker.check()

    status_code = 200 if is_ready else 503

    return JSONResponse(
        status_code=status_code,
        content={
            "status": "ready" if is_ready else "not_ready",
            "checks": checks,
            "version": "2.4.0",
            "timestamp": time.time()
        }
    )

# ─── /health (Comprehensive Health) ──────────────────────────────
# Full health check for monitoring dashboards and debugging.
# Not used by Kubernetes — purely for observability.

@app.get("/health", status_code=status.HTTP_200_OK)
async def comprehensive_health():
    """
    Comprehensive health check for dashboards.

    Includes all /ready checks plus queue depth and 5-minute error rate.
    Always answers HTTP 200; the overall condition is reported in the
    body's "status" field:

      "unhealthy" - at least one check has status "fail"
      "degraded"  - no failures, but at least one check has status "warn"
      "healthy"   - every check passed
    """
    readiness_checks, _ = await readiness_checker.check()

    # check() returns the very dict it keeps as its cached result, so
    # adding keys in place would leak the dashboard-only metrics into
    # later /health/ready responses. Work on a shallow copy instead.
    checks = dict(readiness_checks)

    # Add application-specific health metrics
    checks["queue_depth"] = {
        "status": "pass" if queue_depth < 1000 else "warn",
        "value": queue_depth,
        "threshold": 1000,
    }
    checks["error_rate_5m"] = {
        "status": "pass" if error_rate < 0.001 else "fail",
        "value": error_rate,
        "threshold": 0.001,
    }

    # Derive the headline status from the individual results rather than
    # always claiming "healthy".
    statuses = {entry.get("status") for entry in checks.values()}
    if "fail" in statuses:
        overall = "unhealthy"
    elif "warn" in statuses:
        overall = "degraded"
    else:
        overall = "healthy"

    return {
        "status": overall,
        "checks": checks,
        "version": "2.4.0",
        "git_commit": "abc123def456",
        "uptime_seconds": time.time() - start_time,
        "build_timestamp": "2024-01-15T10:30:00Z",
    }


# ─── Kubernetes Pod Configuration ────────────────────────────────
# 
# livenessProbe:
#   httpGet:
#     path: /health/live
#     port: 8080
#   initialDelaySeconds: 10     # Wait for app startup
#   periodSeconds: 10           # Check every 10s
#   timeoutSeconds: 2           # Must respond in 2s
#   failureThreshold: 3         # Fail after 3 consecutive failures
#   successThreshold: 1         # Succeed on 1 pass
#
# readinessProbe:
#   httpGet:
#     path: /health/ready
#     port: 8080
#   initialDelaySeconds: 5
#   periodSeconds: 5
#   timeoutSeconds: 3
#   failureThreshold: 3
#   successThreshold: 2         # Require 2 passes before going ready
#
# startupProbe:                 # Use for slow-starting applications
#   httpGet:
#     path: /health/live
#     port: 8080
#   initialDelaySeconds: 5
#   periodSeconds: 5
#   timeoutSeconds: 3
#   failureThreshold: 30        # Allow 2.5 min for startup
Critical: Liveness Probe Safety
A poorly designed liveness probe that checks external dependencies can cause cascading failures: DB slows down → liveness probes fail → all pods restart simultaneously → thundering herd on DB → worse slowdown → more restarts. Never check external dependencies in a liveness probe. Check only that your process is alive and responsive.

Database Reliability

Read Replicas

Offload read traffic to replica databases. Write operations go to the primary; read operations (especially analytics, reporting, and cache warming) go to replicas. Implement automatic failover for the primary.

# Database configuration with primary/replica routing
# SQLAlchemy example with read replica support

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
import random

class RoutingSession:
    """Routes reads to replicas, writes to primary.

    One engine (with its own connection pool) is created for the primary
    and one per replica URL. Read helpers pick a replica at random for
    each call; when no replicas are configured they fall back to the
    primary instead of failing. Plain SQL strings passed to
    ``execute_read`` / ``execute_write`` are wrapped in ``text()`` so
    they are accepted by SQLAlchemy 1.4+/2.0; ``:name`` placeholders are
    bound from the keyword arguments.
    """

    def __init__(self, primary_url: str, replica_urls: List[str]):
        """Create the primary engine and one engine per replica URL."""
        self.primary_engine = create_engine(
            primary_url,
            pool_size=20,
            max_overflow=10,
            pool_pre_ping=True,        # Verify connection before use
            pool_recycle=3600,         # Recycle connections after 1 hour
            connect_args={"connect_timeout": 5}
        )

        self.replica_engines = [
            create_engine(
                url,
                pool_size=10,
                max_overflow=5,
                pool_pre_ping=True,
                pool_recycle=3600,
                connect_args={"connect_timeout": 5}
            )
            for url in replica_urls
        ]

    @staticmethod
    def _as_statement(query):
        """Wrap a plain SQL string in text(); pass other statements through."""
        if isinstance(query, str):
            # Local import keeps this block self-contained.
            from sqlalchemy import text
            return text(query)
        return query

    def get_read_session(self):
        """Get a session on a random read replica (primary if none exist)."""
        # random.choice raises IndexError on an empty list, so an empty
        # replica list is served by the primary rather than crashing.
        if self.replica_engines:
            engine = random.choice(self.replica_engines)
        else:
            engine = self.primary_engine
        Session = sessionmaker(bind=engine)
        return Session()

    def get_write_session(self):
        """Get a session connected to the primary."""
        Session = sessionmaker(bind=self.primary_engine)
        return Session()

    def execute_read(self, query, **params):
        """Execute a read query against a replica and return all rows.

        The session is always closed, whether or not the query succeeds.
        """
        session = self.get_read_session()
        try:
            return session.execute(self._as_statement(query), params).fetchall()
        finally:
            session.close()

    def execute_write(self, query, **params):
        """Execute a write query against the primary and commit it.

        Rolls back and re-raises on any error; the session is always
        closed. Returns the result object produced by ``session.execute``.
        """
        session = self.get_write_session()
        try:
            result = session.execute(self._as_statement(query), params)
            session.commit()
            # NOTE(review): the result is handed back after commit and
            # session close; rows from a RETURNING clause may no longer be
            # fetchable at that point. Confirm against the driver in use.
            return result
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()


# Usage:
replica_urls = [
    "postgresql://user:pass@db-replica-1.internal/payments",
    "postgresql://user:pass@db-replica-2.internal/payments",
]
db = RoutingSession(
    primary_url="postgresql://user:pass@db-primary.internal/payments",
    replica_urls=replica_urls,
)

# Replica-routed read: a replica is picked at random for each call
select_user_sql = "SELECT * FROM users WHERE id = :id"
user = db.execute_read(select_user_sql, id=user_id)

# Primary-routed write
insert_payment_sql = "INSERT INTO payments (amount, status) VALUES (:amount, 'pending') RETURNING *"
payment = db.execute_write(insert_payment_sql, amount=amount)

Connection Pool Configuration

Parameter Recommended Value Rationale
pool_size 10–20 per instance Enough for concurrent requests, not so many that DB is overwhelmed
max_overflow pool_size / 2 Extra connections for traffic spikes, but bounded
pool_pre_ping True Verify connection validity before use; prevents stale connection errors
pool_recycle 3600 seconds (1 hour) Prevent stale connections from long-lived pools
connect_timeout 5 seconds Fail fast if DB is unreachable; prevents request pileup
pool_timeout 10 seconds Time to wait for a connection from the pool before failing

CDN and Caching for Reliability

CDNs and caching layers improve reliability by serving content from edge locations, reducing load on origin servers, and providing content even when origin is experiencing issues.

# CloudFront / CDN configuration for static assets and API responses
# Terraform configuration

# CloudFront distribution in front of the payment API origin.
# Dynamic API responses get a short TTL; /static/* gets a long TTL;
# 500/503 origin errors are replaced by a static error page.
resource "aws_cloudfront_distribution" "api_cache" {
  enabled = true
  comment = "Payment API cache layer"

  origin {
    domain_name = "api-origin.payment.internal"
    origin_id   = "payment-api-origin"

    custom_origin_config {
      http_port              = 80
      https_port             = 443
      origin_protocol_policy = "https-only"
      origin_ssl_protocols   = ["TLSv1.2"]
    }

    # Marker header added to every request sent to the origin.
    # NOTE: a custom header does not by itself configure CloudFront
    # origin failover; that requires an origin_group with a second origin.
    custom_header {
      name  = "X-Origin-Failover"
      value = "enabled"
    }
  }

  # Default cache behavior — short TTL for dynamic content
  default_cache_behavior {
    # CloudFront accepts only three method sets; the full set includes PATCH.
    allowed_methods  = ["GET", "HEAD", "OPTIONS", "POST", "PUT", "PATCH", "DELETE"]
    cached_methods   = ["GET", "HEAD", "OPTIONS"]
    target_origin_id = "payment-api-origin"

    forwarded_values {
      query_string = true
      headers      = ["Origin", "Access-Control-Request-Headers", "Access-Control-Request-Method", "Authorization"]

      # A single-line block may hold only one argument, so this block
      # is written out over several lines.
      cookies {
        forward           = "whitelist"
        whitelisted_names = ["session"]
      }
    }

    viewer_protocol_policy = "https-only"
    min_ttl                = 0
    default_ttl            = 60       # 1 minute default for API responses
    max_ttl                = 300      # 5 minutes max
    compress               = true

    # Serve stale content if origin returns error.
    # Lambda@Edge associations need a versioned ARN, hence qualified_arn
    # (the function must be published with a version).
    lambda_function_association {
      event_type = "origin-response"
      lambda_arn = aws_lambda_function.serve_stale.qualified_arn
    }
  }

  # Ordered cache behavior for static content — long TTL
  ordered_cache_behavior {
    path_pattern     = "/static/*"
    allowed_methods  = ["GET", "HEAD"]
    cached_methods   = ["GET", "HEAD"]
    target_origin_id = "payment-api-origin"

    forwarded_values {
      query_string = false

      cookies {
        forward = "none"
      }
    }

    # Required on every cache behavior; matches the default behavior.
    viewer_protocol_policy = "https-only"

    min_ttl     = 86400    # 1 day
    default_ttl = 604800   # 1 week
    max_ttl     = 31536000 # 1 year
  }

  # Error pages — serve a cached static page on origin failure.
  # NOTE(review): rewriting 5xx to 200 hides failures from API clients
  # and monitoring; confirm this is intended for API paths.
  custom_error_response {
    error_code            = 500
    response_code         = 200
    response_page_path    = "/static/error-pages/service-unavailable.html"
    error_caching_min_ttl = 10
  }

  custom_error_response {
    error_code            = 503
    response_code         = 200
    response_page_path    = "/static/error-pages/service-unavailable.html"
    error_caching_min_ttl = 10
  }

  # Required block: no geographic restrictions.
  restrictions {
    geo_restriction {
      restriction_type = "none"
    }
  }

  # Required block: use the default *.cloudfront.net certificate.
  # Replace with an ACM certificate when serving a custom domain.
  viewer_certificate {
    cloudfront_default_certificate = true
  }
}

Disaster Recovery Planning

Disaster recovery planning defines how to recover from catastrophic failures: region-wide outages, data center destruction, or widespread data corruption. DR planning is not optional for services handling financial transactions or sensitive data.

RTO and RPO Definitions

Metric Definition Typical Targets
RTO
Recovery Time Objective
Maximum acceptable time to restore service after a disaster Tier 1: < 1 hour
Tier 2: < 4 hours
Tier 3: < 24 hours
RPO
Recovery Point Objective
Maximum acceptable data loss measured in time Tier 1: < 5 minutes (near-zero)
Tier 2: < 1 hour
Tier 3: < 24 hours
═══════════════════════════════════════════════════════════════════
            DISASTER RECOVERY PLAN TEMPLATE
            Service: Payment Processing Platform
═══════════════════════════════════════════════════════════════════

CLASSIFICATION: Tier 1 (Critical — revenue-impacting)
RTO Target: 1 hour
RPO Target: 5 minutes
Last Tested: [DATE]
Next Test: [DATE]
Owner: Pay SRE Team

─── DISASTER SCENARIOS ───────────────────────────────────────────

Scenario 1: AWS Region Failure (us-west-2)
  Trigger: Region-wide AWS outage
  Impact: Primary region completely unavailable
  
  Recovery Procedure:
    1. Declare disaster (IC approval required)
    2. Update DNS to point to secondary region (us-east-1)
       $ ./scripts/dns-failover.sh --to us-east-1
    3. Promote read replica in us-east-1 to primary
       $ ./scripts/db-promote.sh --region us-east-1
    4. Scale up Kubernetes cluster in us-east-1
       $ kubectl scale deployment payment-api --replicas=10 -n payment
    5. Verify health checks pass
       $ ./scripts/smoke-tests.sh --region us-east-1
    6. Notify stakeholders
  
  Expected RTO: 45 minutes
  Expected RPO: 5 minutes (async replication lag)

Scenario 2: Database Corruption
  Trigger: Data corruption detected in primary database
  Impact: Data integrity compromised
  
  Recovery Procedure:
    1. Stop all writes to affected database
    2. Identify last known good backup
       $ aws s3 ls s3://db-backups/payment/ | tail -20
    3. Restore from backup to new instance
       $ ./scripts/db-restore.sh --backup-date 2024-01-15T10:00:00Z
    4. Replay WAL (Write-Ahead Log) to RPO point
    5. Redirect application to restored database
    6. Verify data integrity with checksums
  
  Expected RTO: 2 hours
  Expected RPO: 5 minutes (point-in-time recovery)

Scenario 3: Ransomware / Security Incident
  Trigger: Security team detects compromise
  Impact: Unknown scope — assume worst case
  
  Recovery Procedure:
    1. Isolate compromised systems (security team leads)
    2. Engage DR team and legal
    3. Restore from known-clean backups (verify backup integrity)
    4. Rebuild infrastructure from IaC (Terraform/K8s)
    5. Rotate all credentials and certificates
    6. Gradual traffic restoration with enhanced monitoring
  
  Expected RTO: 4 hours
  Expected RPO: 1 hour

─── BACKUP STRATEGY ──────────────────────────────────────────────

Database:
  • Continuous WAL archiving to S3 (5-minute intervals)
  • Full daily backups at 02:00 UTC (retention: 30 days)
  • Weekly snapshot retained for 90 days
  • Monthly snapshot retained for 1 year
  • Backup verification: automated restore test every Sunday

Application Configuration:
  • All infrastructure in Terraform (Git-backed)
  • Kubernetes manifests in Git (GitOps)
  • Application config in ConfigMaps (versioned)
  • Recovery: `terraform apply` + `kubectl apply -k overlays/dr`

─── DR TESTING SCHEDULE ──────────────────────────────────────────

Quarterly: Tabletop walkthrough (2 hours, no actual failover)
  • Review procedures with team
  • Update for infrastructure changes
  • Identify gaps

Semi-annually: Controlled failover test (4 hours)
  • Failover to DR region during maintenance window
  • Run smoke tests
  • Failback to primary
  • Document findings

Annually: Full DR simulation (full day)
  • Simulate complete primary region loss
  • Execute full recovery procedure
  • Run production traffic in DR for 4 hours
  • Full failback procedure

Multi-Region Active-Active Architecture

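Active-active architecture distributes traffic across multiple regions simultaneously. This provides the highest availability (read traffic survives a region failure with near-zero RTO; in the single-writer design shown below, writes still depend on asynchronous replication and a manual primary promotion, so their RTO/RPO are not zero) but is complex to implement, especially for stateful services.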

┌─────────────────────────────────────────────────────────────────┐
│              MULTI-REGION ACTIVE-ACTIVE ARCHITECTURE            │
│                                                                 │
│                        ┌──────────────┐                         │
│                        │   Route 53   │                         │
│                        │  (Geo-DNS)   │                         │
│                        └──────┬───────┘                         │
│                               │                                 │
│              ┌────────────────┼────────────────┐                │
│              ▼                ▼                ▼                │
│    ┌──────────────┐  ┌──────────────┐  ┌──────────────┐         │
│    │  us-west-2   │  │  us-east-1   │  │  eu-west-1   │         │
│    │  (Primary)   │  │  (Active)    │  │  (Active)    │         │
│    ├──────────────┤  ├──────────────┤  ├──────────────┤         │
│    │              │  │              │  │              │         │
│    │  K8s Cluster │  │  K8s Cluster │  │  K8s Cluster │         │
│    │  ┌────────┐  │  │  ┌────────┐  │  │  ┌────────┐  │         │
│    │  │ API    │  │  │  │ API    │  │  │  │ API    │  │         │
│    │  │ Pods   │  │  │  │ Pods   │  │  │  │ Pods   │  │         │
│    │  └────────┘  │  │  └────────┘  │  │  └────────┘  │         │
│    │       │      │  │       │      │  │       │      │         │
│    │ ┌─────┴────┐ │  │ ┌─────┴────┐ │  │ ┌─────┴────┐ │         │
│    │ │DB Primary│ │  │ │DB Replica│ │  │ │DB Replica│ │         │
│    │ │ (writes) │ │  │ │ (reads)  │ │  │ │ (reads)  │ │         │
│    │ └─────┬────┘ │  │ └──────────┘ │  │ └──────────┘ │         │
│    │       │      │  │              │  │              │         │
│    │  ┌────┴───┐  │  │              │  │              │         │
│    │  │ Cache  │  │  │              │  │              │         │
│    │  │(Redis) │  │  │              │  │              │         │
│    │  └────────┘  │  │              │  │              │         │
│    └──────────────┘  └──────────────┘  └──────────────┘         │
│                                                                 │
│  Data Flow:                                                     │
│  • Writes → us-west-2 DB Primary → async replicate to replicas  │
│  • Reads  → nearest region DB Replica                           │
│  • Caches → regional Redis (no cross-region cache sync)         │
│  • Sessions → JWT (stateless) or sticky to region               │
│                                                                 │
│  Conflict Resolution:                                           │
│  • Financial data: single-writer (us-west-2 only)               │
│  • User preferences: last-write-wins with vector clock          │
│  • Analytics: eventually consistent, no conflict resolution     │
│                                                                 │
│  Failover:                                                      │
│  • Automatic for reads (Route 53 health checks)                 │
│  • Manual for writes (DB promotion requires IC approval)        │
└─────────────────────────────────────────────────────────────────┘

Capacity Planning and Load Testing

Capacity planning ensures your infrastructure can handle expected load with headroom for spikes. Load testing validates your capacity plans and identifies breaking points before production traffic finds them.

═══════════════════════════════════════════════════════════════════
            CAPACITY PLAN — Payment API
            Q1 2024
═══════════════════════════════════════════════════════════════════

CURRENT STATE (January 2024):
┌──────────────────┬────────────┬────────────┬──────────────┐
│ Resource         │ Current    │ Capacity   │ Utilization  │
├──────────────────┼────────────┼────────────┼──────────────┤
│ API Pods         │ 15         │ 30 (K8s)   │ 50%          │
│ CPU per pod      │ 500m       │ 2000m      │ 25%          │
│ Memory per pod   │ 512Mi      │ 2Gi        │ 25%          │
│ DB Connections   │ 45         │ 100        │ 45%          │
│ DB CPU           │ 30%        │ 100%       │ 30%          │
│ Redis Memory     │ 2GB        │ 8GB        │ 25%          │
│ Network          │ 200 Mbps   │ 1 Gbps     │ 20%          │
└──────────────────┴────────────┴────────────┴──────────────┘

TRAFFIC PROJECTIONS:
┌──────────────────┬───────────┬───────────┬───────────┐
│ Metric           │ Jan (now) │ Mar (Q1)  │ Jun (Q2)  │
├──────────────────┼───────────┼───────────┼───────────┤
│ Peak RPS         │ 2,400     │ 3,600     │ 5,000     │
│ Avg RPS          │ 800       │ 1,200     │ 1,700     │
│ Daily requests   │ 69M       │ 104M      │ 147M      │
│ MAU              │ 12M       │ 15M       │ 20M       │
│ Growth rate      │ —         │ 50%       │ 39%       │
└──────────────────┴───────────┴───────────┴───────────┘

CAPACITY HEADROOM TARGET: 70% max utilization at peak
SCALING TRIGGER: 60% sustained utilization for 10 minutes

ACTIONS REQUIRED:
1. Pre-scale to 20 pods by Feb 1 (+$450/month)
2. Increase DB connection pool to 150 (code change + config)
3. Evaluate DB instance size upgrade (r5.2xlarge → r5.4xlarge)
4. Enable Horizontal Pod Autoscaler (min: 15, max: 40 pods)

# Kubernetes Horizontal Pod Autoscaler
# Scales the payment-api Deployment between 15 and 40 replicas based on
# average CPU and memory utilization of its pods.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: payment-api-hpa
  namespace: payment
spec:
  # Workload whose replica count this autoscaler manages
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: payment-api
  # Lower and upper bounds on the replica count
  minReplicas: 15
  maxReplicas: 40
  metrics:
    # Target 60% average CPU utilization across pods
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 60
    # Target 70% average memory utilization across pods
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 70
  behavior:
    scaleUp:
      # 60-second stabilization window before acting on a scale-up
      stabilizationWindowSeconds: 60
      policies:
        # Add at most 100% of current replicas per 15-second period
        - type: Percent
          value: 100
          periodSeconds: 15
    scaleDown:
      stabilizationWindowSeconds: 300  # 5 min cooldown before scaling down
      policies:
        # Remove at most 10% of current replicas per 60-second period
        - type: Percent
          value: 10
          periodSeconds: 60

Load Testing with k6

// payment-api-loadtest.js β€” k6 load testing script
// Run: k6 run --vus 100 --duration 10m payment-api-loadtest.js
// Or with stages for ramp-up: k6 run payment-api-loadtest.js

import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend, Counter } from 'k6/metrics';

// Custom metrics
const errorRate = new Rate('errors');                     // share of iterations with a failed check
const paymentLatency = new Trend('payment_latency');      // payment request latency in ms
const successCounter = new Counter('successful_payments');

// Load profile and pass/fail thresholds.
// Each stage ramps linearly from the previous target to its own target
// over the stated duration; no stage holds a constant VU count.
export const options = {
  stages: [
    { duration: '2m', target: 50 },    // Ramp up to 50 VUs
    { duration: '5m', target: 200 },   // Ramp from 50 to 200 VUs
    { duration: '10m', target: 500 },  // Ramp from 200 to 500 VUs
    { duration: '5m', target: 1000 },  // Ramp from 500 to the 1000-VU peak
    { duration: '3m', target: 0 },     // Ramp down
  ],
  thresholds: {
    http_req_duration: ['p(95)<300'],   // 95% of requests under 300ms
    http_req_failed: ['rate<0.001'],    // Error rate below 0.1%
    errors: ['rate<0.001'],
  },
  // handleSummary reads p(99), which is not part of the default trend
  // statistics, so request it explicitly alongside the defaults.
  summaryTrendStats: ['avg', 'min', 'med', 'max', 'p(90)', 'p(95)', 'p(99)'],
};

// NOTE(review): the fallback host looks like a production endpoint;
// confirm that running without BASE_URL set is safe.
const BASE_URL = __ENV.BASE_URL || 'https://api.payment.samsung.com';
const API_KEY = __ENV.API_KEY;

// One VU iteration: fetch a token, submit a payment, read it back,
// then pause for a randomized think time.
export default function () {
  // 1. Get auth token (requested on every iteration)
  const tokenResponse = http.post(`${BASE_URL}/v1/token`, null, {
    headers: { 'Authorization': `Bearer ${API_KEY}` },
  });

  const tokenOk = check(tokenResponse, {
    'token request status is 200': (r) => r.status === 200,
  });

  // Without a token every later request would be sent as
  // "Bearer undefined"; record the failure and end this iteration.
  // json() is also only safe to call on a successful JSON response.
  if (!tokenOk) {
    errorRate.add(true);
    sleep(Math.random() * 2 + 1);
    return;
  }

  const token = tokenResponse.json('access_token');

  // 2. Process payment (main load test)
  const payload = JSON.stringify({
    // $10-$100, rounded to whole cents
    amount: Math.round((10.00 + Math.random() * 90.00) * 100) / 100,
    currency: 'USD',
    card_token: 'test_token_visa_' + Math.floor(Math.random() * 10),
    merchant_id: 'test_merchant',
    idempotency_key: `loadtest-${__VU}-${__ITER}-${Date.now()}`,
  });

  const paymentResponse = http.post(`${BASE_URL}/v1/payments`, payload, {
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${token}`,
    },
  });

  // Use k6's own request timing rather than wall-clock Date.now(), which
  // has millisecond granularity and includes script overhead.
  const latency = paymentResponse.timings.duration;
  paymentLatency.add(latency);

  const success = check(paymentResponse, {
    'payment status is 200': (r) => r.status === 200,
    // Guard on status first: json() throws on a non-JSON error body.
    'payment approved': (r) => r.status === 200 && r.json('status') === 'approved',
    'payment latency < 300ms': () => latency < 300,
  });

  errorRate.add(!success);

  if (success) {
    successCounter.add(1);

    // 3. Get payment status (read operation)
    const paymentId = paymentResponse.json('payment_id');
    const statusResponse = http.get(
      `${BASE_URL}/v1/payments/${paymentId}`,
      { headers: { 'Authorization': `Bearer ${token}` } }
    );

    check(statusResponse, {
      'status check is 200': (r) => r.status === 200,
    });
  }

  // Think time between iterations
  sleep(Math.random() * 2 + 1);  // 1-3 seconds
}

// Build the end-of-test outputs: the raw summary as JSON on disk and a
// short human-readable report on stdout. Any statistic that is absent
// from the summary data is printed as "n/a" instead of throwing.
export function handleSummary(data) {
  const metrics = (data && data.metrics) || {};

  // Look up one statistic of one metric; undefined when missing.
  const stat = (metricName, key) => {
    const metric = metrics[metricName];
    return metric && metric.values ? metric.values[key] : undefined;
  };
  const isNum = (value) => typeof value === 'number' && !Number.isNaN(value);

  const raw = (value) => (isNum(value) ? `${value}` : 'n/a');
  const millis = (value) => (isNum(value) ? `${value.toFixed(0)}ms` : 'n/a');
  const percent = (value) => (isNum(value) ? `${(value * 100).toFixed(2)}%` : 'n/a');
  const perMinute = (value) => (isNum(value) ? `${(value * 60).toFixed(0)} req/min` : 'n/a');

  return {
    'loadtest-summary.json': JSON.stringify(data),
    // p50 is the median ('med'), not the average. p(99) is only present
    // when summaryTrendStats includes it.
    stdout: `
      Payment API Load Test Summary
      =============================
      Peak VUs: ${raw(stat('vus_max', 'max'))}
      Requests: ${raw(stat('http_reqs', 'count'))}
      Error Rate: ${percent(stat('errors', 'rate'))}
      p50 Latency: ${millis(stat('payment_latency', 'med'))}
      p95 Latency: ${millis(stat('payment_latency', 'p(95)'))}
      p99 Latency: ${millis(stat('payment_latency', 'p(99)'))}
      Throughput: ${perMinute(stat('http_reqs', 'rate'))}
    `,
  };
}

Last updated: June 2026 | Author: SRE Team