# stemedb/docs/operations/deployment/envoy/stemedb.yaml

# Envoy Proxy Configuration for StemeDB
#
# This configuration provides:
# - Load balancing across 3-node cluster (round-robin)
# - Health checks (HTTP /v1/health every 5s)
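# - Circuit breakers (max 1000 connections to the cluster, shared by all nodes)
# - Rate limiting (token buckets local to this Envoy instance, not per client IP:
#   100 req/sec for queries, 10 req/sec for writes)
# - Retry policies (3 retries on 5xx errors; writes are never retried)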
# - TLS termination
# - Access logging
# - Metrics (Prometheus format)
#
# Usage:
#   envoy -c stemedb.yaml
#
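# Or with Docker (the TLS certificates referenced by this file must be mounted too):
#   docker run -d -p 8443:8443 -p 9901:9901 \
#     -v $(pwd)/stemedb.yaml:/etc/envoy/envoy.yaml \
#     -v /etc/letsencrypt:/etc/letsencrypt:ro \
#     envoyproxy/envoy:v1.28-latest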

# Envoy administration listener (stats, Prometheus metrics, config dump,
# cluster status) on port 9901.
# NOTE(review): "0.0.0.0" binds the admin API on every interface. Envoy's
# admin API is unauthenticated, so confirm that port 9901 is firewalled or
# published only to trusted hosts before using this in production.
admin:
  address:
    socket_address:
      port_value: 9901
      address: "0.0.0.0"

static_resources:
  listeners:
    # ┌───────────────────────────────────────────────────────┐
    # │  HTTPS Listener (Port 8443)                           │
    # └───────────────────────────────────────────────────────┘

    - name: stemedb_https_listener
      # Accept client connections on every interface, port 8443. TLS is
      # handled by this filter chain's transport_socket.
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 8443

      filter_chains:
        - filters:
            # HTTP Connection Manager
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                stat_prefix: stemedb_https
                codec_type: AUTO

                # Edge-proxy hardening.
                # use_remote_address: take the client address from the TCP
                # connection instead of trusting a client-supplied
                # X-Forwarded-For header, so routes that restrict access by
                # client IP cannot be bypassed with a forged header.
                # NOTE(review): if a trusted load balancer sits in front of
                # Envoy, configure xff_num_trusted_hops instead.
                use_remote_address: true
                # Normalize "/../", "/./" and duplicate slashes before route
                # matching, so path-based rules (e.g. "/v1/admin/") see the
                # same path the backend will resolve.
                normalize_path: true
                merge_slashes: true

                # Routing: a single virtual host that matches every Host
                # header; routes are evaluated in order, first match wins.
                route_config:
                  name: stemedb_route
                  virtual_hosts:
                    - name: stemedb_backend
                      domains: ["*"]

                      routes:
                        # Health check endpoint (public, no rate limit)
                        - match:
                            path: "/v1/health"
                          route:
                            cluster: stemedb_cluster
                            timeout: 5s
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: health_check
                              # Envoy rejects a per-route LocalRateLimit that
                              # has no token bucket. This bucket is never
                              # consulted because filter_enabled is 0% below.
                              token_bucket:
                                max_tokens: 200
                                tokens_per_fill: 100
                                fill_interval: 1s
                              filter_enabled:
                                default_value:
                                  numerator: 0  # Disable rate limiting
                                  denominator: HUNDRED

                        # Write endpoints (stricter rate limit: 10 req/sec,
                        # burst of 20)
                        - match:
                            prefix: "/v1/assert"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx"
                              num_retries: 0  # Don't retry writes (not idempotent)
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: write_endpoints
                              token_bucket:
                                max_tokens: 20
                                tokens_per_fill: 10
                                fill_interval: 1s
                              # A per-route config replaces the listener-level
                              # one entirely, and both fractions default to 0%
                              # when omitted, which would leave this bucket
                              # unused. The runtime keys match the
                              # listener-level filter so one switch controls
                              # every route.
                              filter_enabled:
                                runtime_key: local_rate_limit_enabled
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                runtime_key: local_rate_limit_enforced
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED

                        # Retract endpoint: a write, limited to 10 req/sec
                        # (burst of 20) and never retried.
                        - match:
                            prefix: "/v1/retract"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx"
                              num_retries: 0
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: write_endpoints
                              token_bucket:
                                max_tokens: 20
                                tokens_per_fill: 10
                                fill_interval: 1s
                              # A per-route config replaces the listener-level
                              # one entirely, and both fractions default to 0%
                              # when omitted, which would leave this bucket
                              # unused. The runtime keys match the
                              # listener-level filter so one switch controls
                              # every route.
                              filter_enabled:
                                runtime_key: local_rate_limit_enabled
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                runtime_key: local_rate_limit_enforced
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED

                        # Admin endpoints (restricted to private RFC 1918
                        # networks)
                        - match:
                            prefix: "/v1/admin/"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                          typed_per_filter_config:
                            envoy.filters.http.rbac:
                              # Per-route overrides of the RBAC filter must use
                              # the RBACPerRoute message, which wraps the
                              # policy in an "rbac" field. The plain RBAC
                              # message is only valid in http_filters.
                              "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBACPerRoute
                              rbac:
                                rules:
                                  action: ALLOW
                                  policies:
                                    "internal-network":
                                      permissions:
                                        - any: true
                                      # Principals are OR-ed. direct_remote_ip
                                      # matches the physical TCP peer, so a
                                      # client-supplied X-Forwarded-For header
                                      # cannot satisfy the policy.
                                      principals:
                                        - direct_remote_ip:
                                            address_prefix: "10.0.0.0"
                                            prefix_len: 8
                                        - direct_remote_ip:
                                            address_prefix: "172.16.0.0"
                                            prefix_len: 12
                                        - direct_remote_ip:
                                            address_prefix: "192.168.0.0"
                                            prefix_len: 16

                        # Metrics endpoint (Prometheus server only)
                        - match:
                            path: "/metrics"
                          route:
                            cluster: stemedb_cluster
                            timeout: 10s
                          typed_per_filter_config:
                            envoy.filters.http.rbac:
                              # Per-route overrides of the RBAC filter must use
                              # the RBACPerRoute message, which wraps the
                              # policy in an "rbac" field. The plain RBAC
                              # message is only valid in http_filters.
                              "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBACPerRoute
                              rbac:
                                rules:
                                  action: ALLOW
                                  policies:
                                    "prometheus-server":
                                      permissions:
                                        - any: true
                                      # direct_remote_ip matches the physical
                                      # TCP peer, so a client-supplied
                                      # X-Forwarded-For header cannot
                                      # impersonate the Prometheus host.
                                      principals:
                                        - direct_remote_ip:
                                            address_prefix: "10.0.1.100"
                                            prefix_len: 32

                        # Query endpoints (standard rate limit: 100 req/sec,
                        # burst of 200)
                        - match:
                            prefix: "/v1/query"
                          route:
                            cluster: stemedb_cluster
                            timeout: 30s
                            retry_policy:
                              retry_on: "5xx,reset,connect-failure"
                              num_retries: 3
                              per_try_timeout: 10s
                          typed_per_filter_config:
                            envoy.filters.http.local_ratelimit:
                              "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                              stat_prefix: query_endpoints
                              token_bucket:
                                max_tokens: 200
                                tokens_per_fill: 100
                                fill_interval: 1s
                              # A per-route config replaces the listener-level
                              # one entirely, and both fractions default to 0%
                              # when omitted, which would leave this bucket
                              # unused. The runtime keys match the
                              # listener-level filter so one switch controls
                              # every route.
                              filter_enabled:
                                runtime_key: local_rate_limit_enabled
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED
                              filter_enforced:
                                runtime_key: local_rate_limit_enforced
                                default_value:
                                  numerator: 100
                                  denominator: HUNDRED

                        # Catch-all route for every path not matched above.
                        # It carries no typed_per_filter_config, so the
                        # listener-level filter settings apply unchanged.
                        - match:
                            prefix: "/"
                          route:
                            # Up to 3 retries of 10s each, bounded by the
                            # overall 30s request timeout.
                            retry_policy:
                              num_retries: 3
                              per_try_timeout: 10s
                              retry_on: "5xx,reset,connect-failure"
                            timeout: 30s
                            cluster: stemedb_cluster

                # HTTP filters, executed in the order listed.
                http_filters:
                  # Rate limiting filter. This listener-level token bucket
                  # (100 req/sec, burst of 200) applies to every route that
                  # does not supply its own per-route LocalRateLimit config.
                  - name: envoy.filters.http.local_ratelimit
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
                      stat_prefix: http_local_rate_limiter
                      token_bucket:
                        max_tokens: 200
                        tokens_per_fill: 100
                        fill_interval: 1s
                      # Evaluate the limit for 100% of requests ...
                      filter_enabled:
                        runtime_key: local_rate_limit_enabled
                        default_value:
                          numerator: 100
                          denominator: HUNDRED
                      # ... and reject 100% of those that exceed it.
                      filter_enforced:
                        runtime_key: local_rate_limit_enforced
                        default_value:
                          numerator: 100
                          denominator: HUNDRED
                      # Header added to rate-limited responses.
                      # append_action replaces the deprecated "append: false"
                      # with the same overwrite-or-add behavior.
                      response_headers_to_add:
                        - append_action: OVERWRITE_IF_EXISTS_OR_ADD
                          header:
                            key: x-rate-limit-exceeded
                            value: "true"

                  # RBAC filter. The listener-level policy allows everything;
                  # restrictions are applied through per-route overrides
                  # (admin and metrics routes).
                  - name: envoy.filters.http.rbac
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBAC
                      rules:
                        action: ALLOW
                        policies:
                          "allow-all":
                            permissions:
                              - any: true
                            principals:
                              - any: true

                  # Router filter (must be last)
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

                # Access logging: one line per request, written to stdout.
                access_log:
                  - name: envoy.access_loggers.file
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
                      path: /dev/stdout
                      # log_format.text_format_source replaces the deprecated
                      # top-level "format" field; the format string itself is
                      # unchanged.
                      log_format:
                        text_format_source:
                          inline_string: "[%START_TIME%] \"%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%\" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% \"%REQ(X-FORWARDED-FOR)%\" \"%REQ(USER-AGENT)%\" \"%REQ(X-REQUEST-ID)%\" \"%REQ(:AUTHORITY)%\" \"%UPSTREAM_HOST%\"\n"

          # Downstream TLS termination for this filter chain.
          transport_socket:
            name: envoy.transport_sockets.tls
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.DownstreamTlsContext
              common_tls_context:
                # Minimum and maximum are both TLS 1.3, so no older protocol
                # version is negotiated.
                # NOTE(review): no alpn_protocols are configured here; confirm
                # whether HTTP/2 clients are expected to negotiate "h2".
                tls_params:
                  tls_maximum_protocol_version: TLSv1_3
                  tls_minimum_protocol_version: TLSv1_3
                # Server certificate chain and private key, read from disk.
                tls_certificates:
                  - private_key:
                      filename: /etc/letsencrypt/live/stemedb.example.com/privkey.pem
                    certificate_chain:
                      filename: /etc/letsencrypt/live/stemedb.example.com/fullchain.pem

  # ┌───────────────────────────────────────────────────────────┐
  # │  Clusters (Upstream Servers)                              │
  # └───────────────────────────────────────────────────────────┘

  clusters:
    - name: stemedb_cluster
      connect_timeout: 5s
      lb_policy: ROUND_ROBIN
      type: STRICT_DNS

      # Upstream membership: three fixed nodes, each serving traffic and
      # health checks on port 18180.
      load_assignment:
        cluster_name: stemedb_cluster
        endpoints:
          - lb_endpoints:
              # Node 1
              - endpoint:
                  health_check_config:
                    port_value: 18180
                  address:
                    socket_address:
                      port_value: 18180
                      address: "10.0.1.51"

              # Node 2
              - endpoint:
                  health_check_config:
                    port_value: 18180
                  address:
                    socket_address:
                      port_value: 18180
                      address: "10.0.1.52"

              # Node 3
              - endpoint:
                  health_check_config:
                    port_value: 18180
                  address:
                    socket_address:
                      port_value: 18180
                      address: "10.0.1.53"

      # Active health checking: GET /v1/health every 5s with a 3s timeout.
      # A node is marked unhealthy after 3 consecutive failures and healthy
      # again after 2 consecutive successes.
      health_checks:
        - http_health_check:
            path: "/v1/health"
            # NOTE(review): Envoy treats the range end as exclusive, so this
            # accepts 200-298; use "end: 300" if 299 should also pass.
            expected_statuses:
              - start: 200
                end: 299
          interval: 5s
          timeout: 3s
          healthy_threshold: 2
          unhealthy_threshold: 3

      # Circuit breakers: limits on concurrent connections, pending requests,
      # active requests and active retries for default-priority traffic.
      circuit_breakers:
        thresholds:
          - priority: DEFAULT
            max_retries: 3
            max_requests: 1000
            max_pending_requests: 1000
            max_connections: 1000

      # Outlier detection (passive ejection): a node that returns 5
      # consecutive 5xx responses is ejected for a base time of 30s; at most
      # 50% of the nodes may be ejected at once.
      outlier_detection:
        interval: 10s
        consecutive_5xx: 5
        enforcing_consecutive_5xx: 100
        base_ejection_time: 30s
        max_ejection_percent: 50

      # Panic threshold: when fewer than 50% of the nodes are healthy, Envoy
      # load-balances across all nodes regardless of health status.
      common_lb_config:
        healthy_panic_threshold:
          value: 50.0

      # Upstream protocol: explicit HTTP/2 with at most 100 concurrent
      # streams per connection.
      # NOTE(review): this makes Envoy speak HTTP/2 to the nodes; confirm the
      # StemeDB API accepts HTTP/2 on port 18180.
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http2_protocol_options:
              max_concurrent_streams: 100

# ┌───────────────────────────────────────────────────────────┐
# │  Usage Instructions                                       │
# └───────────────────────────────────────────────────────────┘
#
# 1. Install Envoy:
#    wget https://github.com/envoyproxy/envoy/releases/download/v1.28.0/envoy-1.28.0-linux-x86_64
#    chmod +x envoy-1.28.0-linux-x86_64
#    sudo mv envoy-1.28.0-linux-x86_64 /usr/local/bin/envoy
#
# 2. Update configuration:
#    - Replace stemedb.example.com with your domain
#    - Update node IPs (10.0.1.51-53)
#    - Update Prometheus IP (10.0.1.100)
#    - Update TLS certificate paths
#
# 3. Validate config:
#    envoy --mode validate -c stemedb.yaml
#
# 4. Start Envoy:
#    envoy -c stemedb.yaml
#
# 5. Test endpoints:
#    curl -k https://localhost:8443/v1/health
#
# 6. View admin interface:
#    curl http://localhost:9901/stats/prometheus  # Metrics
#    curl http://localhost:9901/config_dump      # Config
#    curl http://localhost:9901/clusters         # Cluster status
#
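# 7. Test rate limiting (use a rate-limited route; /v1/health is exempt):
#    seq 1 400 | xargs -P 50 -I{} curl -sk -o /dev/null -w '%{http_code}\n' \
#      https://localhost:8443/v1/query | sort | uniq -c
#    # Should see 429 responses once the 200-token burst is used up
#    # (the bucket refills at 100 tokens/sec, so requests must arrive faster than that)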
#
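# 8. Test health check:
#    # Stop node 2
#    ssh node2 "sudo systemctl stop stemedb-api"
#    # Wait 15s (3 failed checks at a 5s interval) for the node to be marked unhealthy
#    # /clusters lists hosts by IP:port, so filter on node 2's address
#    curl -s http://localhost:9901/clusters | grep 10.0.1.52 | grep health_flags
#    # Should show: stemedb_cluster::10.0.1.52:18180::health_flags::/failed_active_hc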

# ┌───────────────────────────────────────────────────────────┐
# │  Systemd Service (Optional)                               │
# └───────────────────────────────────────────────────────────┘
#
# Save as /etc/systemd/system/envoy.service:
#
# [Unit]
# Description=Envoy Proxy
# After=network.target
#
# [Service]
# Type=simple
# User=envoy
# Group=envoy
# ExecStart=/usr/local/bin/envoy -c /etc/envoy/stemedb.yaml
# Restart=on-failure
# RestartSec=5s
#
# [Install]
# WantedBy=multi-user.target
#
# Then:
#   sudo systemctl daemon-reload
#   sudo systemctl enable envoy
#   sudo systemctl start envoy

# ┌───────────────────────────────────────────────────────────┐
# │  Monitoring & Troubleshooting                             │
# └───────────────────────────────────────────────────────────┘
#
# View stats:
#   curl http://localhost:9901/stats
#
# View Prometheus metrics:
#   curl http://localhost:9901/stats/prometheus
#
# Check cluster health:
#   curl http://localhost:9901/clusters
#
# Dump config:
#   curl http://localhost:9901/config_dump
#
# View access logs:
#   docker logs -f envoy-container
#
# Test outlier detection:
#   # Simulate 5 consecutive 5xx responses from node2
#   # Node2 should be ejected for 30s (base_ejection_time)

# ┌───────────────────────────────────────────────────────────┐
# │  Production Hardening Checklist                           │
# └───────────────────────────────────────────────────────────┘
#
# - [ ] Configure external authorization (OAuth2, JWT)
# - [ ] Set up centralized logging (ELK, Splunk)
# - [ ] Enable Envoy access logs to file (not just stdout)
# - [ ] Configure metrics scraping (Prometheus)
# - [ ] Set up distributed tracing (Jaeger, Zipkin)
# - [ ] Test certificate renewal process
# - [ ] Document rate limit thresholds
# - [ ] Test circuit breaker behavior
# - [ ] Set up alerting on outlier detection
# - [ ] Configure WAF (Web Application Firewall)