# Envoy Proxy Configuration for StemeDB
#
# This configuration provides:
#   - Load balancing across a 3-node cluster (round-robin)
#   - Health checks (HTTP /v1/health every 5s)
#   - Circuit breakers (max 1000 connections to the cluster)
#   - Rate limiting (local token buckets, shared by all clients of this
#     Envoy instance -- NOT per client IP):
#       * default and /v1/query: 100 req/sec, burst 200
#       * /v1/assert, /v1/retract: 10 req/sec, burst 20
#       * /v1/health: not rate limited
#   - Retry policies (up to 3 retries on 5xx/reset/connect-failure;
#     write endpoints are never retried)
#   - TLS termination (TLS 1.3 only)
#   - Access logging
#   - Metrics (Prometheus format, via the admin interface)
#
# Usage:
#   envoy -c stemedb.yaml
#
# Or with Docker (the certificate directory must be mounted as well; mount
# all of /etc/letsencrypt because live/ contains symlinks into archive/):
#   docker run -d -p 8443:8443 -p 127.0.0.1:9901:9901 \
#     -v $(pwd)/stemedb.yaml:/etc/envoy/envoy.yaml \
#     -v /etc/letsencrypt:/etc/letsencrypt:ro \
#     envoyproxy/envoy:v1.28-latest

# Admin interface (metrics, config dump).
# WARNING: the admin interface is unauthenticated and can modify or shut down
# the proxy. It listens on all interfaces here so that Docker port publishing
# works; when running Envoy directly on a host, change the address to
# 127.0.0.1 or firewall port 9901.
admin:
  address:
    socket_address:
      address: 0.0.0.0
      port_value: 9901

static_resources:
  listeners:
    # ┌───────────────────────────────────────────────────────┐
    # │ HTTPS Listener (Port 8443)                            │
    # └───────────────────────────────────────────────────────┘
    - name: stemedb_https_listener
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 8443
      filter_chains:
        - filters:
            # HTTP Connection Manager
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                stat_prefix: stemedb_https
                codec_type: AUTO
                # Edge deployment: use the real peer address as the client
                # address instead of trusting a client-supplied
                # X-Forwarded-For header.
                use_remote_address: true

                # Routing. Routes are evaluated in order; the first match
                # wins, so the catch-all "/" route must stay last.
                #
                # NOTE: a per-route filter config REPLACES the listener-level
                # config of that filter for the route. Every local_ratelimit
                # override therefore carries its own token_bucket,
                # filter_enabled and filter_enforced (the latter two default
                # to 0%, i.e. "off", when omitted).
                route_config:
                  name: stemedb_route
                  virtual_hosts:
                    - name: stemedb_backend
                      domains: ["*"]
                      routes:
                        # Health check endpoint (public, no rate limit)
                        - match:
                            path: "/v1/health"
                          route:
                            cluster: stemedb_cluster
                            timeout: 5s
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: health_check
                              # A token bucket is mandatory in per-route
                              # configs even though it is never consulted
                              # here (filter_enabled is 0%).
                              token_bucket:
                                max_tokens: 200
                                tokens_per_fill: 100
                                fill_interval: 1s
                              filter_enabled:
                                default_value:
                                  numerator: 0  # Disable rate limiting
                                  denominator: HUNDRED

                        # Write endpoints (stricter rate limit: 10 req/sec,
                        # burst of 20)
                        - match:
                            prefix: "/v1/assert"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx"
                              num_retries: 0  # Don't retry writes (not idempotent)
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: write_endpoints
                              token_bucket:
                                max_tokens: 20
                                tokens_per_fill: 10
                                fill_interval: 1s
                              filter_enabled:
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED

                        - match:
                            prefix: "/v1/retract"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx"
                              num_retries: 0  # Don't retry writes (not idempotent)
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: write_endpoints
                              token_bucket:
                                max_tokens: 20
                                tokens_per_fill: 10
                                fill_interval: 1s
                              filter_enabled:
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED

                        # Admin endpoints (restricted to private networks).
                        # Per-route RBAC overrides use RBACPerRoute, and match
                        # on direct_remote_ip (the TCP peer address) so the
                        # check cannot be bypassed with a forged
                        # X-Forwarded-For header.
                        - match:
                            prefix: "/v1/admin/"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                          typed_per_filter_config:
                            envoy.filters.http.rbac:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBACPerRoute
                              rbac:
                                rules:
                                  action: ALLOW
                                  policies:
                                    "internal-network":
                                      permissions:
                                        - any: true
                                      principals:
                                        - direct_remote_ip:
                                            address_prefix: "10.0.0.0"
                                            prefix_len: 8
                                        - direct_remote_ip:
                                            address_prefix: "172.16.0.0"
                                            prefix_len: 12
                                        - direct_remote_ip:
                                            address_prefix: "192.168.0.0"
                                            prefix_len: 16

                        # Metrics endpoint (Prometheus server only)
                        - match:
                            path: "/metrics"
                          route:
                            cluster: stemedb_cluster
                            timeout: 10s
                          typed_per_filter_config:
                            envoy.filters.http.rbac:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBACPerRoute
                              rbac:
                                rules:
                                  action: ALLOW
                                  policies:
                                    "prometheus-server":
                                      permissions:
                                        - any: true
                                      principals:
                                        - direct_remote_ip:
                                            address_prefix: "10.0.1.100"
                                            prefix_len: 32

                        # Query endpoints (standard rate limit: 100 req/sec,
                        # burst of 200)
                        - match:
                            prefix: "/v1/query"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx,reset,connect-failure"
                              num_retries: 3
                              per_try_timeout: 10s
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: query_endpoints
                              token_bucket:
                                max_tokens: 200
                                tokens_per_fill: 100
                                fill_interval: 1s
                              filter_enabled:
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED

                        # All other endpoints (default). Uses the
                        # listener-level rate limit configured below.
                        # NOTE(review): this route retries every HTTP method
                        # on 5xx; confirm no non-idempotent endpoint falls
                        # through to it.
                        - match:
                            prefix: "/"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx,reset,connect-failure"
                              num_retries: 3
                              per_try_timeout: 10s

                # HTTP filters (executed in the order listed)
                http_filters:
                  # Rate limiting filter (listener-wide default: 100 req/sec,
                  # burst of 200)
                  - name: envoy.filters.http.local_ratelimit
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                      stat_prefix: http_local_rate_limiter
                      token_bucket:
                        max_tokens: 200
                        tokens_per_fill: 100
                        fill_interval: 1s
                      filter_enabled:
                        runtime_key: local_rate_limit_enabled
                        default_value:
                          numerator: 100
                          denominator: HUNDRED
                      filter_enforced:
                        runtime_key: local_rate_limit_enforced
                        default_value:
                          numerator: 100
                          denominator: HUNDRED
                      response_headers_to_add:
                        - append: false
                          header:
                            key: x-rate-limit-exceeded
                            value: "true"

                  # RBAC filter. Allows everything by default; the admin and
                  # metrics routes tighten it via per-route overrides.
                  - name: envoy.filters.http.rbac
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBAC
                      rules:
                        action: ALLOW
                        policies:
                          "allow-all":
                            permissions:
                              - any: true
                            principals:
                              - any: true

                  # Router filter (must be last)
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

                # Access logging (log_format replaces the deprecated
                # FileAccessLog.format field; the format string is unchanged)
                access_log:
                  - name: envoy.access_loggers.file
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
                      path: /dev/stdout
                      log_format:
                        text_format_source:
                          inline_string: "[%START_TIME%] \"%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%\" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% \"%REQ(X-FORWARDED-FOR)%\" \"%REQ(USER-AGENT)%\" \"%REQ(X-REQUEST-ID)%\" \"%REQ(:AUTHORITY)%\" \"%UPSTREAM_HOST%\"\n"

          # TLS configuration (TLS 1.3 only)
          transport_socket:
            name: envoy.transport_sockets.tls
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.DownstreamTlsContext
              common_tls_context:
                tls_certificates:
                  - certificate_chain:
                      filename: /etc/letsencrypt/live/stemedb.example.com/fullchain.pem
                    private_key:
                      filename: /etc/letsencrypt/live/stemedb.example.com/privkey.pem
                tls_params:
                  tls_minimum_protocol_version: TLSv1_3
                  tls_maximum_protocol_version: TLSv1_3

  # ┌───────────────────────────────────────────────────────────┐
  # │ Clusters (Upstream Servers)                               │
  # └───────────────────────────────────────────────────────────┘
  clusters:
    - name: stemedb_cluster
      type: STRICT_DNS
      connect_timeout: 5s
      lb_policy: ROUND_ROBIN  # Load balancing
      load_assignment:
        cluster_name: stemedb_cluster
        endpoints:
          - lb_endpoints:
              # Node 1
              - endpoint:
                  address:
                    socket_address:
                      address: 10.0.1.51
                      port_value: 18180
                  health_check_config:
                    port_value: 18180
              # Node 2
              - endpoint:
                  address:
                    socket_address:
                      address: 10.0.1.52
                      port_value: 18180
                  health_check_config:
                    port_value: 18180
              # Node 3
              - endpoint:
                  address:
                    socket_address:
                      address: 10.0.1.53
                      port_value: 18180
                  health_check_config:
                    port_value: 18180

      # Health checks: a node is marked unhealthy after 3 consecutive
      # failures (~15s) and healthy again after 2 consecutive successes.
      # NOTE(review): HTTP health checks use HTTP/1.1 unless codec_client_type
      # is set, while regular traffic below uses HTTP/2 -- confirm the
      # upstream accepts both on port 18180.
      health_checks:
        - timeout: 3s
          interval: 5s
          unhealthy_threshold: 3
          healthy_threshold: 2
          http_health_check:
            path: "/v1/health"
            expected_statuses:
              - start: 200
                end: 299

      # Circuit breakers (limits apply to the cluster as a whole)
      circuit_breakers:
        thresholds:
          - priority: DEFAULT
            max_connections: 1000
            max_pending_requests: 1000
            max_requests: 1000
            max_retries: 3

      # Outlier detection (automatic node removal): eject a node for 30s
      # after 5 consecutive 5xx responses, never ejecting more than 50% of
      # the cluster.
      outlier_detection:
        consecutive_5xx: 5
        interval: 10s
        base_ejection_time: 30s
        max_ejection_percent: 50
        enforcing_consecutive_5xx: 100

      # Panic threshold: when fewer than 50% of nodes are healthy, Envoy
      # balances across ALL nodes, healthy or not.
      common_lb_config:
        healthy_panic_threshold:
          value: 50.0

      # HTTP/2 settings (upstream connections use HTTP/2)
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http2_protocol_options:
              max_concurrent_streams: 100

# ┌───────────────────────────────────────────────────────────┐
# │ Usage Instructions                                        │
# └───────────────────────────────────────────────────────────┘
#
# 1. Install Envoy:
#      wget https://github.com/envoyproxy/envoy/releases/download/v1.28.0/envoy-1.28.0-linux-x86_64
#      chmod +x envoy-1.28.0-linux-x86_64
#      sudo mv envoy-1.28.0-linux-x86_64 /usr/local/bin/envoy
#
# 2. Update configuration:
#      - Replace stemedb.example.com with your domain
#      - Update node IPs (10.0.1.51-53)
#      - Update Prometheus IP (10.0.1.100)
#      - Update TLS certificate paths
#
# 3. Validate config:
#      envoy --mode validate -c stemedb.yaml
#
# 4. Start Envoy:
#      envoy -c stemedb.yaml
#
# 5. Test endpoints:
#      curl -k https://localhost:8443/v1/health
#
# 6. View admin interface:
#      curl http://localhost:9901/stats/prometheus  # Metrics
#      curl http://localhost:9901/config_dump       # Config
#      curl http://localhost:9901/clusters          # Cluster status
#
# 7. Test rate limiting (/v1/health is exempt, so use a rate-limited route;
#    requests must arrive faster than the 100/sec refill to drain the
#    200-token burst, hence the parallelism):
#      seq 1 400 | xargs -P 50 -I{} \
#        curl -sk -o /dev/null -w '%{http_code}\n' https://localhost:8443/v1/query \
#        | sort | uniq -c
#      # Should show a mix of upstream responses and 429s
#
# 8. Test health check:
#      # Stop node 2
#      ssh node2 "sudo systemctl stop stemedb-api"
#      # Wait 15s for the health check to fail (3 failures x 5s interval)
#      curl http://localhost:9901/clusters | grep 10.0.1.52
#      # Should show: health_flags::/failed_active_hc

# ┌───────────────────────────────────────────────────────────┐
# │ Systemd Service (Optional)                                │
# └───────────────────────────────────────────────────────────┘
#
# Save as /etc/systemd/system/envoy.service:
#
#   [Unit]
#   Description=Envoy Proxy
#   After=network.target
#
#   [Service]
#   Type=simple
#   User=envoy
#   Group=envoy
#   ExecStart=/usr/local/bin/envoy -c /etc/envoy/stemedb.yaml
#   Restart=on-failure
#   RestartSec=5s
#
#   [Install]
#   WantedBy=multi-user.target
#
# Then:
#   sudo systemctl daemon-reload
#   sudo systemctl enable envoy
#   sudo systemctl start envoy
#
# Note: the envoy user must be able to read the TLS certificate and private
# key referenced in the listener configuration.

# ┌───────────────────────────────────────────────────────────┐
# │ Monitoring & Troubleshooting                              │
# └───────────────────────────────────────────────────────────┘
#
# View stats:
#   curl http://localhost:9901/stats
#
# View Prometheus metrics:
#   curl http://localhost:9901/stats/prometheus
#
# Check cluster health:
#   curl http://localhost:9901/clusters
#
# Dump config:
#   curl http://localhost:9901/config_dump
#
# View access logs:
#   docker logs -f envoy-container
#
# Test outlier detection:
#   # Simulate 5 consecutive 5xx responses from node 2 (10.0.1.52)
#   # Node 2 should be ejected for 30s

# ┌───────────────────────────────────────────────────────────┐
# │ Production Hardening Checklist                            │
# └───────────────────────────────────────────────────────────┘
#
# - [ ] Restrict the admin interface (port 9901) to localhost or a
#       management network
# - [ ] Configure external authorization (OAuth2, JWT)
# - [ ] Set up centralized logging (ELK, Splunk)
# - [ ] Enable Envoy access logs to file (not just stdout)
# - [ ] Configure metrics scraping (Prometheus)
# - [ ] Set up distributed tracing (Jaeger, Zipkin)
# - [ ] Test certificate renewal process
# - [ ] Document rate limit thresholds
# - [ ] Test circuit breaker behavior
# - [ ] Set up alerting on outlier detection
# - [ ] Configure WAF (Web Application Firewall)