This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments: ## API Layer - Add rate limiting middleware with configurable limits per endpoint - Enhance error handling with detailed context and proper HTTP status codes - Add security hardening tests for input validation and boundary conditions - Create store_helpers module for defensive storage access patterns ## Storage & WAL - Optimize group commit batching for higher throughput - Add defensive error handling in hybrid backend with proper fallbacks - Enhance WAL journal durability guarantees with fsync validation - Improve index store query performance with better caching ## Operations & Deployment - Add comprehensive operations documentation (deployment, monitoring, DR) - Create systemd units for backup, WAL archival, and verification - Add monitoring configs (Prometheus alerts, metrics exporters) - Implement backup/restore scripts with verification and S3 archival - Add DR drill automation and runbook procedures - Create load balancer configs (nginx, envoy) with health checks ## Documentation - Update CLAUDE.md with operations and troubleshooting guides - Expand roadmap with production readiness milestones - Add pilot success criteria and deployment reference architecture - Document TLS setup, monitoring integration, and incident response ## Configuration - Add .env.example with all required environment variables - Document resource sizing for different deployment scales - Add configuration examples for various deployment topologies This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
435 lines
18 KiB
YAML
435 lines
18 KiB
YAML
# Envoy Proxy Configuration for StemeDB
|
|
#
|
|
# This configuration provides:
|
|
# - Load balancing across 3-node cluster (round-robin)
|
|
# - Health checks (HTTP /v1/health every 5s)
|
|
# - Circuit breakers (max 1000 connections to the upstream cluster)
|
|
# - Rate limiting (100 req/sec per Envoy instance via local rate limit; not tracked per client IP)
|
|
# - Retry policies (3 retries on 5xx errors for reads; writes are never retried)
|
|
# - TLS termination
|
|
# - Access logging
|
|
# - Metrics (Prometheus format)
|
|
#
|
|
# Usage:
|
|
# envoy -c stemedb.yaml
|
|
#
|
|
# Or with Docker:
|
|
# docker run -d -p 8443:8443 -p 9901:9901 -v $(pwd)/stemedb.yaml:/etc/envoy/envoy.yaml envoyproxy/envoy:v1.28-latest
|
|
|
|
# Admin interface (metrics, config dump, cluster status).
#
# The admin API is unauthenticated and allows destructive operations, so it
# listens on loopback only. To scrape metrics remotely, put an authenticated
# proxy or a firewall rule in front of it rather than binding to 0.0.0.0.
#
# Docker note: inside a container, bind to 0.0.0.0 and restrict on the host
# side instead, e.g. `-p 127.0.0.1:9901:9901`.
admin:
  address:
    socket_address:
      address: 127.0.0.1
      port_value: 9901
|
|
|
|
# Static listener and cluster definitions for the StemeDB edge proxy.
#
# Request flow: TLS listener (8443) -> HTTP connection manager -> per-route
# rate limit / RBAC overrides -> router -> stemedb_cluster (3 nodes).
static_resources:
  listeners:
    # ┌───────────────────────────────────────────────────────┐
    # │ HTTPS Listener (Port 8443)                            │
    # └───────────────────────────────────────────────────────┘

    - name: stemedb_https_listener
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 8443

      filter_chains:
        - filters:
            # HTTP Connection Manager
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                stat_prefix: stemedb_https
                codec_type: AUTO

                # Edge-proxy setting: use the real peer address as the client
                # address instead of trusting a client-supplied
                # X-Forwarded-For header. Envoy also appends the peer address
                # to X-Forwarded-For for the upstream and the access log.
                use_remote_address: true

                # Routing (first matching route wins, so order matters)
                route_config:
                  name: stemedb_route
                  virtual_hosts:
                    - name: stemedb_backend
                      domains: ["*"]

                      routes:
                        # Health check endpoint (public, no rate limit)
                        - match:
                            path: "/v1/health"
                          route:
                            cluster: stemedb_cluster
                            timeout: 5s
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: health_check
                              # Envoy requires a token bucket in per-route
                              # configs even when the filter is disabled; it
                              # is never consulted because filter_enabled is 0%.
                              token_bucket:
                                max_tokens: 200
                                tokens_per_fill: 100
                                fill_interval: 1s
                              filter_enabled:
                                default_value:
                                  numerator: 0  # Disable rate limiting
                                  denominator: HUNDRED

                        # Write endpoints (stricter rate limit: 10 req/sec,
                        # burst 20). Each route has its own bucket.
                        - match:
                            prefix: "/v1/assert"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx"
                              num_retries: 0  # Don't retry writes (not idempotent)
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: write_endpoints
                              token_bucket:
                                max_tokens: 20
                                tokens_per_fill: 10
                                fill_interval: 1s
                              # A per-route config replaces the listener-level
                              # one, and both fractions default to 0%, so they
                              # must be set here for the limit to apply.
                              filter_enabled:
                                runtime_key: local_rate_limit_enabled
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                runtime_key: local_rate_limit_enforced
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED

                        - match:
                            prefix: "/v1/retract"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx"
                              num_retries: 0  # Don't retry writes (not idempotent)
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: write_endpoints
                              token_bucket:
                                max_tokens: 20
                                tokens_per_fill: 10
                                fill_interval: 1s
                              # Required for the override to take effect
                              # (both default to 0%).
                              filter_enabled:
                                runtime_key: local_rate_limit_enabled
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                runtime_key: local_rate_limit_enforced
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED

                        # Admin endpoints (restricted to private networks).
                        # Per-route RBAC overrides use RBACPerRoute, which
                        # wraps the policy in `rbac:`. direct_remote_ip matches
                        # the physical peer address, which cannot be spoofed
                        # via X-Forwarded-For.
                        - match:
                            prefix: "/v1/admin/"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                          typed_per_filter_config:
                            envoy.filters.http.rbac:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBACPerRoute
                              rbac:
                                rules:
                                  action: ALLOW
                                  policies:
                                    "internal-network":
                                      permissions:
                                        - any: true
                                      principals:
                                        - direct_remote_ip:
                                            address_prefix: "10.0.0.0"
                                            prefix_len: 8
                                        - direct_remote_ip:
                                            address_prefix: "172.16.0.0"
                                            prefix_len: 12
                                        - direct_remote_ip:
                                            address_prefix: "192.168.0.0"
                                            prefix_len: 16

                        # Metrics endpoint (Prometheus server only)
                        - match:
                            path: "/metrics"
                          route:
                            cluster: stemedb_cluster
                            timeout: 10s
                          typed_per_filter_config:
                            envoy.filters.http.rbac:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBACPerRoute
                              rbac:
                                rules:
                                  action: ALLOW
                                  policies:
                                    "prometheus-server":
                                      permissions:
                                        - any: true
                                      principals:
                                        - direct_remote_ip:
                                            address_prefix: "10.0.1.100"
                                            prefix_len: 32

                        # Query endpoints (standard rate limit: 100 req/sec,
                        # burst 200)
                        - match:
                            prefix: "/v1/query"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx,reset,connect-failure"
                              num_retries: 3
                              per_try_timeout: 10s
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: query_endpoints
                              token_bucket:
                                max_tokens: 200
                                tokens_per_fill: 100
                                fill_interval: 1s
                              # Required for the override to take effect
                              # (both default to 0%).
                              filter_enabled:
                                runtime_key: local_rate_limit_enabled
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                runtime_key: local_rate_limit_enforced
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED

                        # All other endpoints (default): governed by the
                        # listener-level rate limit and RBAC filters below.
                        - match:
                            prefix: "/"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx,reset,connect-failure"
                              num_retries: 3
                              per_try_timeout: 10s

                # HTTP filters (evaluated in order; router must be last)
                http_filters:
                  # Rate limiting filter: default bucket for routes without a
                  # per-route override. The bucket is shared by all clients of
                  # this Envoy process; it is not tracked per client IP.
                  - name: envoy.filters.http.local_ratelimit
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                      stat_prefix: http_local_rate_limiter
                      token_bucket:
                        max_tokens: 200
                        tokens_per_fill: 100
                        fill_interval: 1s
                      filter_enabled:
                        runtime_key: local_rate_limit_enabled
                        default_value:
                          numerator: 100
                          denominator: HUNDRED
                      filter_enforced:
                        runtime_key: local_rate_limit_enforced
                        default_value:
                          numerator: 100
                          denominator: HUNDRED
                      response_headers_to_add:
                        # append_action replaces the deprecated `append: false`.
                        - append_action: OVERWRITE_IF_EXISTS_OR_ADD
                          header:
                            key: x-rate-limit-exceeded
                            value: "true"

                  # RBAC filter: allow-all by default; the admin and metrics
                  # routes tighten it with per-route overrides.
                  - name: envoy.filters.http.rbac
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBAC
                      rules:
                        action: ALLOW
                        policies:
                          "allow-all":
                            permissions:
                              - any: true
                            principals:
                              - any: true

                  # Router filter (must be last)
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

                # Access logging (log_format replaces the deprecated `format`
                # field; the format string itself is unchanged)
                access_log:
                  - name: envoy.access_loggers.file
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
                      path: /dev/stdout
                      log_format:
                        text_format_source:
                          inline_string: "[%START_TIME%] \"%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%\" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% \"%REQ(X-FORWARDED-FOR)%\" \"%REQ(USER-AGENT)%\" \"%REQ(X-REQUEST-ID)%\" \"%REQ(:AUTHORITY)%\" \"%UPSTREAM_HOST%\"\n"

          # TLS configuration (TLS 1.3 only)
          transport_socket:
            name: envoy.transport_sockets.tls
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.DownstreamTlsContext
              common_tls_context:
                tls_certificates:
                  - certificate_chain:
                      filename: /etc/letsencrypt/live/stemedb.example.com/fullchain.pem
                    private_key:
                      filename: /etc/letsencrypt/live/stemedb.example.com/privkey.pem
                tls_params:
                  tls_minimum_protocol_version: TLSv1_3
                  tls_maximum_protocol_version: TLSv1_3

  # ┌───────────────────────────────────────────────────────────┐
  # │ Clusters (Upstream Servers)                               │
  # └───────────────────────────────────────────────────────────┘

  clusters:
    - name: stemedb_cluster
      type: STRICT_DNS
      connect_timeout: 5s
      lb_policy: ROUND_ROBIN

      # Load balancing
      load_assignment:
        cluster_name: stemedb_cluster
        endpoints:
          - lb_endpoints:
              # Node 1
              - endpoint:
                  address:
                    socket_address:
                      address: 10.0.1.51
                      port_value: 18180
                  health_check_config:
                    port_value: 18180

              # Node 2
              - endpoint:
                  address:
                    socket_address:
                      address: 10.0.1.52
                      port_value: 18180
                  health_check_config:
                    port_value: 18180

              # Node 3
              - endpoint:
                  address:
                    socket_address:
                      address: 10.0.1.53
                      port_value: 18180
                  health_check_config:
                    port_value: 18180

      # Active health checks: a node is marked unhealthy after 3 consecutive
      # failures (about 15s at a 5s interval) and healthy again after 2 passes.
      health_checks:
        - timeout: 3s
          interval: 5s
          unhealthy_threshold: 3
          healthy_threshold: 2
          http_health_check:
            path: "/v1/health"
            expected_statuses:
              - start: 200
                end: 299

      # Circuit breakers (limits apply to the cluster as a whole)
      circuit_breakers:
        thresholds:
          - priority: DEFAULT
            max_connections: 1000
            max_pending_requests: 1000
            max_requests: 1000
            max_retries: 3

      # Outlier detection (passive health checking): eject a node for 30s
      # after 5 consecutive 5xx responses, never ejecting more than 50%.
      outlier_detection:
        consecutive_5xx: 5
        interval: 10s
        base_ejection_time: 30s
        max_ejection_percent: 50
        enforcing_consecutive_5xx: 100

      # Panic threshold: when fewer than 50% of nodes are healthy, Envoy
      # ignores health status and balances across all nodes.
      common_lb_config:
        healthy_panic_threshold:
          value: 50.0

      # HTTP/2 settings
      # NOTE(review): explicit_http_config with http2_protocol_options makes
      # Envoy speak HTTP/2 (cleartext, prior knowledge) to the nodes, including
      # for health checks. Confirm that the StemeDB API on port 18180 accepts
      # h2c; otherwise switch to http_protocol_options.
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http2_protocol_options:
              max_concurrent_streams: 100
|
|
|
|
# ┌───────────────────────────────────────────────────────────┐
|
|
# │ Usage Instructions │
|
|
# └───────────────────────────────────────────────────────────┘
|
|
#
|
|
# 1. Install Envoy:
|
|
# wget https://github.com/envoyproxy/envoy/releases/download/v1.28.0/envoy-1.28.0-linux-x86_64
|
|
# chmod +x envoy-1.28.0-linux-x86_64
|
|
# sudo mv envoy-1.28.0-linux-x86_64 /usr/local/bin/envoy
|
|
#
|
|
# 2. Update configuration:
|
|
# - Replace stemedb.example.com with your domain
|
|
# - Update node IPs (10.0.1.51-53)
|
|
# - Update Prometheus IP (10.0.1.100)
|
|
# - Update TLS certificate paths
|
|
#
|
|
# 3. Validate config:
|
|
# envoy --mode validate -c stemedb.yaml
|
|
#
|
|
# 4. Start Envoy:
|
|
# envoy -c stemedb.yaml
|
|
#
|
|
# 5. Test endpoints:
|
|
# curl -k https://localhost:8443/v1/health
|
|
#
|
|
# 6. View admin interface:
|
|
# curl http://localhost:9901/stats/prometheus # Metrics
|
|
# curl http://localhost:9901/config_dump # Config
|
|
# curl http://localhost:9901/clusters # Cluster status
|
|
#
|
|
# 7. Test rate limiting:
|
|
# seq 300 | xargs -P 50 -I{} curl -sk -o /dev/null -w '%{http_code}\n' https://localhost:8443/v1/query
|
|
# # Expect 429s once the 200-token burst is used up (/v1/health is exempt from rate limiting)
|
|
#
|
|
# 8. Test health check:
|
|
# # Stop node 2
|
|
# ssh node2 "sudo systemctl stop stemedb-api"
|
|
# # Wait 15s for health check to fail
|
|
# curl http://localhost:9901/clusters | grep 10.0.1.52
|
|
# # Should show: health_flags: /failed_active_hc
|
|
|
|
# ┌───────────────────────────────────────────────────────────┐
|
|
# │ Systemd Service (Optional) │
|
|
# └───────────────────────────────────────────────────────────┘
|
|
#
|
|
# Save as /etc/systemd/system/envoy.service:
|
|
#
|
|
# [Unit]
|
|
# Description=Envoy Proxy
|
|
# After=network.target
|
|
#
|
|
# [Service]
|
|
# Type=simple
|
|
# User=envoy
|
|
# Group=envoy
|
|
# ExecStart=/usr/local/bin/envoy -c /etc/envoy/stemedb.yaml
|
|
# Restart=on-failure
|
|
# RestartSec=5s
|
|
#
|
|
# [Install]
|
|
# WantedBy=multi-user.target
|
|
#
|
|
# Then:
|
|
# sudo systemctl daemon-reload
|
|
# sudo systemctl enable envoy
|
|
# sudo systemctl start envoy
|
|
|
|
# ┌───────────────────────────────────────────────────────────┐
|
|
# │ Monitoring & Troubleshooting │
|
|
# └───────────────────────────────────────────────────────────┘
|
|
#
|
|
# View stats:
|
|
# curl http://localhost:9901/stats
|
|
#
|
|
# View Prometheus metrics:
|
|
# curl http://localhost:9901/stats/prometheus
|
|
#
|
|
# Check cluster health:
|
|
# curl http://localhost:9901/clusters
|
|
#
|
|
# Dump config:
|
|
# curl http://localhost:9901/config_dump
|
|
#
|
|
# View access logs:
|
|
# docker logs -f envoy-container
|
|
#
|
|
# Test outlier detection:
|
|
# # Simulate 5 consecutive 500 errors from node2
|
|
# # Node2 should be ejected for 30s
|
|
|
|
# ┌───────────────────────────────────────────────────────────┐
|
|
# │ Production Hardening Checklist │
|
|
# └───────────────────────────────────────────────────────────┘
|
|
#
|
|
# - [ ] Configure external authorization (OAuth2, JWT)
|
|
# - [ ] Set up centralized logging (ELK, Splunk)
|
|
# - [ ] Enable Envoy access logs to file (not just stdout)
|
|
# - [ ] Configure metrics scraping (Prometheus)
|
|
# - [ ] Set up distributed tracing (Jaeger, Zipkin)
|
|
# - [ ] Test certificate renewal process
|
|
# - [ ] Document rate limit thresholds
|
|
# - [ ] Test circuit breaker behavior
|
|
# - [ ] Set up alerting on outlier detection
|
|
# - [ ] Configure WAF (Web Application Firewall)
|