Defining SLOs, SLIs, and Error Budgets (Google SRE Style)
8 min read · 5.6k views
Introduction
Building systems that scale requires more than just knowing the technology—it demands understanding business requirements and engineering constraints. Here, we explore defining SLOs, SLIs, and error budgets (Google SRE style), providing actionable patterns and design principles.
System Requirements
Functional Requirements
Capacity Estimation
| Metric | Value | Calculation |
|---|---|---|
| Daily Active Users | 10M | Given |
| Requests per second | ~5,800 | 10M × 50 / 86,400 ≈ 5,787 |
| Storage per day | 500 GB | 10M * 50 KB |
| Bandwidth | ~46 Mbps | 500 GB × 8 bits / 86,400 s ≈ 46.3 Mbps |
| Cache memory | 100 GB | 20% of hot data |
High-Level Architecture
Detailed Component Design
API Gateway Pattern
typescript
// API Gateway implementation
import express from 'express';
import { createProxyMiddleware } from 'http-proxy-middleware';
import rateLimit from 'express-rate-limit';
const app = express();
// Rate limiting
const limiter = rateLimit({
windowMs: 15 * 60 * 1000, // 15 minutes
max: 100, // limit each IP to 100 requests per windowMs
message: 'Too many requests from this IP'
});
app.use('/api/', limiter);
// Authentication middleware
app.use('/api/', async (req, res, next) => {
const token = req.headers.authorization?.split(' ')[1];
if (!token) {
return res.status(401).json({ error: 'No token provided' });
}
try {
const user = await verifyToken(token);
req.user = user;
next();
} catch (error) {
return res.status(401).json({ error: 'Invalid token' });
}
});
// Service proxies
app.use('/api/users', createProxyMiddleware({
target: 'http://user-service:3001',
changeOrigin: true
}));
app.use('/api/orders', createProxyMiddleware({
target: 'http://order-service:3002',
changeOrigin: true
}));
app.listen(3000);
Data Flow
Database Design
Schema Implementation
sql
-- Users table
CREATE TABLE users (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
email VARCHAR(255) UNIQUE NOT NULL,
password_hash VARCHAR(255) NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_users_email ON users(email);
-- Orders table with partitioning
CREATE TABLE orders (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL REFERENCES users(id),
total_amount DECIMAL(10, 2) NOT NULL,
status VARCHAR(50) NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
) PARTITION BY RANGE (created_at);
CREATE TABLE orders_2024_q1 PARTITION OF orders
FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');
CREATE TABLE orders_2024_q2 PARTITION OF orders
FOR VALUES FROM ('2024-04-01') TO ('2024-07-01');
-- Order items
CREATE TABLE order_items (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
order_id UUID NOT NULL REFERENCES orders(id) ON DELETE CASCADE,
product_id UUID NOT NULL REFERENCES products(id),
quantity INTEGER NOT NULL CHECK (quantity > 0),
price DECIMAL(10, 2) NOT NULL,
UNIQUE(order_id, product_id)
);
CREATE INDEX idx_order_items_order_id ON order_items(order_id);
CREATE INDEX idx_order_items_product_id ON order_items(product_id);
Caching Strategy
Cache Implementation
python
import redis
import json
from functools import wraps
from typing import Optional, Any
class CacheManager:
def __init__(self, host='localhost', port=6379):
self.redis_client = redis.Redis(
host=host,
port=port,
decode_responses=True
)
self.default_ttl = 3600 # 1 hour
def get(self, key: str) -> Optional[Any]:
"""Get value from cache"""
value = self.redis_client.get(key)
if value:
return json.loads(value)
return None
def set(self, key: str, value: Any, ttl: int = None) -> bool:
"""Set value in cache with TTL"""
ttl = ttl or self.default_ttl
return self.redis_client.setex(
key,
ttl,
json.dumps(value)
)
def delete(self, key: str) -> bool:
"""Delete key from cache"""
return self.redis_client.delete(key) > 0
def cache_aside(self, ttl: int = None):
"""Decorator for cache-aside pattern"""
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
# Generate cache key from function name and args
cache_key = f"{func.__name__}:{str(args)}:{str(kwargs)}"
# Try to get from cache
cached_value = self.get(cache_key)
if cached_value is not None:
return cached_value
# Cache miss - call function
result = func(*args, **kwargs)
# Store in cache
self.set(cache_key, result, ttl)
return result
return wrapper
return decorator
# Usage example
cache = CacheManager()
@cache.cache_aside(ttl=600)
def get_user_profile(user_id: str):
# This will be cached for 10 minutes
return database.query(f"SELECT * FROM users WHERE id = '{user_id}'")Scaling Strategy
Monitoring & Observability
Failure Modes & Recovery
Circuit Breaker Implementation
go
package circuitbreaker
import (
"errors"
"sync"
"time"
)
type State int
const (
StateClosed State = iota
StateOpen
StateHalfOpen
)
type CircuitBreaker struct {
maxFailures int
resetTimeout time.Duration
mu sync.RWMutex
state State
failures int
lastFailTime time.Time
}
func NewCircuitBreaker(maxFailures int, resetTimeout time.Duration) *CircuitBreaker {
return &CircuitBreaker{
maxFailures: maxFailures,
resetTimeout: resetTimeout,
state: StateClosed,
}
}
func (cb *CircuitBreaker) Call(fn func() error) error {
cb.mu.RLock()
state := cb.state
cb.mu.RUnlock()
if state == StateOpen {
if time.Since(cb.lastFailTime) > cb.resetTimeout {
cb.mu.Lock()
cb.state = StateHalfOpen
cb.mu.Unlock()
} else {
return errors.New("circuit breaker is open")
}
}
err := fn()
cb.mu.Lock()
defer cb.mu.Unlock()
if err != nil {
cb.failures++
cb.lastFailTime = time.Now()
if cb.failures >= cb.maxFailures {
cb.state = StateOpen
}
return err
}
if cb.state == StateHalfOpen {
cb.state = StateClosed
}
cb.failures = 0
return nil
}
Performance Benchmarks
Deployment Strategy
Conclusion
Building scalable systems requires careful consideration of trade-offs, continuous monitoring, and iterative improvements. The patterns discussed provide a foundation for designing robust architectures.
Key Takeaways
- Design for failure from the start
- Implement observability at every layer
- Use caching strategically
- Scale horizontally when possible
- Monitor and optimize continuously
References
- Designing Data-Intensive Applications by Martin Kleppmann
- System Design Interview by Alex Xu
- AWS Architecture Blog - https://aws.amazon.com/blogs/architecture/