stemedb/docs/operations/deployment/envoy/stemedb.yaml
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

435 lines
18 KiB
YAML

# Envoy Proxy Configuration for StemeDB
#
# This configuration provides:
# - Load balancing across 3-node cluster (round-robin)
# - Health checks (HTTP /v1/health every 5s)
# - Circuit breakers (max 1000 connections to the cluster as a whole)
# - Rate limiting (local token bucket: 100 req/sec per Envoy instance, shared by all clients)
# - Retry policies (3 retries on 5xx errors)
# - TLS termination
# - Access logging
# - Metrics (Prometheus format)
#
# Usage:
# envoy -c stemedb.yaml
#
# Or with Docker:
# docker run -d -p 8443:8443 -p 9901:9901 -v $(pwd)/stemedb.yaml:/etc/envoy/envoy.yaml envoyproxy/envoy:v1.28-latest
# Envoy admin interface: stats, /config_dump, /clusters.
# The admin interface is unauthenticated and also serves mutating endpoints
# (for example shutdown and runtime changes), so it is bound to loopback only.
# - Running in Docker: published ports cannot reach a loopback bind inside the
#   container; set the address to 0.0.0.0 there and publish the port on the
#   host's loopback only (-p 127.0.0.1:9901:9901).
# - Remote Prometheus scraping: expose /stats/prometheus through a dedicated,
#   access-controlled listener instead of opening this port to the network.
admin:
  address:
    socket_address:
      address: 127.0.0.1
      port_value: 9901  # Admin interface (metrics, config dump)
# Static listener and cluster definitions.
# One TLS listener on 8443 terminates HTTPS and routes every request to the
# three-node stemedb_cluster, applying per-route rate limits, RBAC and retries.
static_resources:
  listeners:
    # ┌───────────────────────────────────────────────────────┐
    # │ HTTPS Listener (Port 8443) │
    # └───────────────────────────────────────────────────────┘
    - name: stemedb_https_listener
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 8443
      filter_chains:
        - filters:
            # HTTP Connection Manager
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                stat_prefix: stemedb_https
                codec_type: AUTO
                # This is the edge proxy: trust the TCP peer address rather than
                # a client-supplied X-Forwarded-For header, and append the real
                # client address to X-Forwarded-For for the upstream.
                use_remote_address: true
                # Routing
                route_config:
                  name: stemedb_route
                  virtual_hosts:
                    - name: stemedb_backend
                      domains: ["*"]
                      routes:
                        # Health check endpoint (public, no rate limit)
                        - match:
                            path: "/v1/health"
                          route:
                            cluster: stemedb_cluster
                            timeout: 5s
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: health_check
                              # Envoy rejects a per-route config without a token
                              # bucket; the bucket is never consulted because
                              # filter_enabled is 0%.
                              token_bucket:
                                max_tokens: 200
                                tokens_per_fill: 100
                                fill_interval: 1s
                              filter_enabled:
                                default_value:
                                  numerator: 0  # Disable rate limiting
                                  denominator: HUNDRED
                        # Write endpoints (stricter rate limit: 10 req/sec)
                        # A per-route config replaces the listener-level one, and
                        # filter_enabled / filter_enforced default to 0%, so both
                        # must be set here for the limit to apply.
                        - match:
                            prefix: "/v1/assert"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx"
                              num_retries: 0  # Don't retry writes (not idempotent)
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: write_endpoints
                              token_bucket:
                                max_tokens: 20
                                tokens_per_fill: 10
                                fill_interval: 1s
                              filter_enabled:
                                runtime_key: local_rate_limit_enabled
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                runtime_key: local_rate_limit_enforced
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                        - match:
                            prefix: "/v1/retract"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx"
                              num_retries: 0  # Don't retry writes (not idempotent)
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: write_endpoints
                              token_bucket:
                                max_tokens: 20
                                tokens_per_fill: 10
                                fill_interval: 1s
                              filter_enabled:
                                runtime_key: local_rate_limit_enabled
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                runtime_key: local_rate_limit_enforced
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                        # Admin endpoints (restricted to private networks)
                        # Route-level RBAC overrides must be wrapped in
                        # RBACPerRoute. direct_remote_ip matches the TCP peer
                        # address, which cannot be spoofed via X-Forwarded-For.
                        - match:
                            prefix: "/v1/admin/"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                          typed_per_filter_config:
                            envoy.filters.http.rbac:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBACPerRoute
                              rbac:
                                rules:
                                  action: ALLOW
                                  policies:
                                    "internal-network":
                                      permissions:
                                        - any: true
                                      principals:
                                        - direct_remote_ip:
                                            address_prefix: "10.0.0.0"
                                            prefix_len: 8
                                        - direct_remote_ip:
                                            address_prefix: "172.16.0.0"
                                            prefix_len: 12
                                        - direct_remote_ip:
                                            address_prefix: "192.168.0.0"
                                            prefix_len: 16
                        # Metrics endpoint (Prometheus only)
                        - match:
                            path: "/metrics"
                          route:
                            cluster: stemedb_cluster
                            timeout: 10s
                          typed_per_filter_config:
                            envoy.filters.http.rbac:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBACPerRoute
                              rbac:
                                rules:
                                  action: ALLOW
                                  policies:
                                    "prometheus-server":
                                      permissions:
                                        - any: true
                                      principals:
                                        - direct_remote_ip:
                                            address_prefix: "10.0.1.100"
                                            prefix_len: 32
                        # Query endpoints (standard rate limit: 100 req/sec)
                        - match:
                            prefix: "/v1/query"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx,reset,connect-failure"
                              num_retries: 3
                              per_try_timeout: 10s
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: query_endpoints
                              token_bucket:
                                max_tokens: 200
                                tokens_per_fill: 100
                                fill_interval: 1s
                              filter_enabled:
                                runtime_key: local_rate_limit_enabled
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                runtime_key: local_rate_limit_enforced
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                        # All other endpoints (default)
                        # Uses the listener-level rate limit defined under
                        # http_filters below.
                        - match:
                            prefix: "/"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx,reset,connect-failure"
                              num_retries: 3
                              per_try_timeout: 10s
                # HTTP filters
                http_filters:
                  # Rate limiting filter (default bucket for routes without a
                  # per-route override: burst of 200, refilled at 100 req/sec)
                  - name: envoy.filters.http.local_ratelimit
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                      stat_prefix: http_local_rate_limiter
                      token_bucket:
                        max_tokens: 200
                        tokens_per_fill: 100
                        fill_interval: 1s
                      filter_enabled:
                        runtime_key: local_rate_limit_enabled
                        default_value:
                          numerator: 100
                          denominator: HUNDRED
                      filter_enforced:
                        runtime_key: local_rate_limit_enforced
                        default_value:
                          numerator: 100
                          denominator: HUNDRED
                      response_headers_to_add:
                        - append: false
                          header:
                            key: x-rate-limit-exceeded
                            value: "true"
                  # RBAC filter: allow-all by default; the admin and metrics
                  # routes replace this policy with their own restrictions.
                  - name: envoy.filters.http.rbac
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBAC
                      rules:
                        action: ALLOW
                        policies:
                          "allow-all":
                            permissions:
                              - any: true
                            principals:
                              - any: true
                  # Router filter (must be last)
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
                # Access logging
                access_log:
                  - name: envoy.access_loggers.file
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
                      path: /dev/stdout
                      format: "[%START_TIME%] \"%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%\" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% \"%REQ(X-FORWARDED-FOR)%\" \"%REQ(USER-AGENT)%\" \"%REQ(X-REQUEST-ID)%\" \"%REQ(:AUTHORITY)%\" \"%UPSTREAM_HOST%\"\n"
          # TLS configuration (applies to the whole filter chain)
          transport_socket:
            name: envoy.transport_sockets.tls
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.DownstreamTlsContext
              common_tls_context:
                tls_certificates:
                  - certificate_chain:
                      filename: /etc/letsencrypt/live/stemedb.example.com/fullchain.pem
                    private_key:
                      filename: /etc/letsencrypt/live/stemedb.example.com/privkey.pem
                tls_params:
                  tls_minimum_protocol_version: TLSv1_3
                  tls_maximum_protocol_version: TLSv1_3
  # ┌───────────────────────────────────────────────────────────┐
  # │ Clusters (Upstream Servers) │
  # └───────────────────────────────────────────────────────────┘
  clusters:
    - name: stemedb_cluster
      type: STRICT_DNS
      connect_timeout: 5s
      lb_policy: ROUND_ROBIN
      # Load balancing
      load_assignment:
        cluster_name: stemedb_cluster
        endpoints:
          - lb_endpoints:
              # Node 1
              - endpoint:
                  address:
                    socket_address:
                      address: 10.0.1.51
                      port_value: 18180
                  health_check_config:
                    port_value: 18180
              # Node 2
              - endpoint:
                  address:
                    socket_address:
                      address: 10.0.1.52
                      port_value: 18180
                  health_check_config:
                    port_value: 18180
              # Node 3
              - endpoint:
                  address:
                    socket_address:
                      address: 10.0.1.53
                      port_value: 18180
                  health_check_config:
                    port_value: 18180
      # Health checks (3 failures x 5s interval = ~15s to mark a node unhealthy)
      health_checks:
        - timeout: 3s
          interval: 5s
          unhealthy_threshold: 3
          healthy_threshold: 2
          http_health_check:
            path: "/v1/health"
            expected_statuses:
              # Range end is exclusive: 200-300 accepts every 2xx status.
              - start: 200
                end: 300
      # Circuit breakers (limits apply to the cluster as a whole)
      circuit_breakers:
        thresholds:
          - priority: DEFAULT
            max_connections: 1000
            max_pending_requests: 1000
            max_requests: 1000
            max_retries: 3
      # Outlier detection (automatic node removal)
      outlier_detection:
        consecutive_5xx: 5
        interval: 10s
        base_ejection_time: 30s
        max_ejection_percent: 50
        enforcing_consecutive_5xx: 100
      # Load balancer panic threshold
      common_lb_config:
        healthy_panic_threshold:
          value: 50.0  # Allow 50% unhealthy before panic
      # HTTP/2 settings
      # NOTE(review): this makes Envoy speak HTTP/2 to the nodes, while the
      # health check above has no codec_client_type set - confirm the StemeDB
      # API accepts both cleartext HTTP/2 and HTTP/1.1 on port 18180.
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http2_protocol_options:
              max_concurrent_streams: 100
# ┌───────────────────────────────────────────────────────────┐
# │ Usage Instructions │
# └───────────────────────────────────────────────────────────┘
#
# 1. Install Envoy:
# wget https://github.com/envoyproxy/envoy/releases/download/v1.28.0/envoy-1.28.0-linux-x86_64
# chmod +x envoy-1.28.0-linux-x86_64
# sudo mv envoy-1.28.0-linux-x86_64 /usr/local/bin/envoy
#
# 2. Update configuration:
# - Replace stemedb.example.com with your domain
# - Update node IPs (10.0.1.51-53)
# - Update Prometheus IP (10.0.1.100)
# - Update TLS certificate paths
#
# 3. Validate config:
# envoy --mode validate -c stemedb.yaml
#
# 4. Start Envoy:
# envoy -c stemedb.yaml
#
# 5. Test endpoints:
# curl -k https://localhost:8443/v1/health
#
# 6. View admin interface:
# curl http://localhost:9901/stats/prometheus # Metrics
# curl http://localhost:9901/config_dump # Config
# curl http://localhost:9901/clusters # Cluster status
#
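# 7. Test rate limiting (/v1/health is exempt, so use a rate-limited route and send requests in parallel):
# seq 1 400 | xargs -P 50 -I{} curl -sk -o /dev/null -w '%{http_code}\n' https://localhost:8443/v1/query | sort | uniq -c
# # Should see 429s once the token bucket (burst of 200, refilled at 100 req/sec) is exhausted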
#
# 8. Test health check:
# # Stop node 2
# ssh node2 "sudo systemctl stop stemedb-api"
# # Wait 15s for health check to fail
# curl -s http://localhost:9901/clusters | grep '10.0.1.52:18180::health_flags'
# # Should show: stemedb_cluster::10.0.1.52:18180::health_flags::/failed_active_hc
# ┌───────────────────────────────────────────────────────────┐
# │ Systemd Service (Optional) │
# └───────────────────────────────────────────────────────────┘
#
# Save as /etc/systemd/system/envoy.service:
#
# [Unit]
# Description=Envoy Proxy
# After=network.target
#
# [Service]
# Type=simple
# User=envoy
# Group=envoy
# ExecStart=/usr/local/bin/envoy -c /etc/envoy/stemedb.yaml
# Restart=on-failure
# RestartSec=5s
#
# [Install]
# WantedBy=multi-user.target
#
# Then:
# sudo systemctl daemon-reload
# sudo systemctl enable envoy
# sudo systemctl start envoy
# ┌───────────────────────────────────────────────────────────┐
# │ Monitoring & Troubleshooting │
# └───────────────────────────────────────────────────────────┘
#
# View stats:
# curl http://localhost:9901/stats
#
# View Prometheus metrics:
# curl http://localhost:9901/stats/prometheus
#
# Check cluster health:
# curl http://localhost:9901/clusters
#
# Dump config:
# curl http://localhost:9901/config_dump
#
# View access logs:
# docker logs -f envoy-container
#
# Test outlier detection (passive ejection; distinct from the circuit breaker limits):
# # Simulate 5 consecutive 500 errors from node2
# # Node2 should be ejected for 30s
# ┌───────────────────────────────────────────────────────────┐
# │ Production Hardening Checklist │
# └───────────────────────────────────────────────────────────┘
#
# - [ ] Configure external authorization (OAuth2, JWT)
# - [ ] Set up centralized logging (ELK, Splunk)
# - [ ] Enable Envoy access logs to file (not just stdout)
# - [ ] Configure metrics scraping (Prometheus)
# - [ ] Set up distributed tracing (Jaeger, Zipkin)
# - [ ] Test certificate renewal process
# - [ ] Document rate limit thresholds
# - [ ] Test circuit breaker behavior
# - [ ] Set up alerting on outlier detection
# - [ ] Configure WAF (Web Application Firewall)