stemedb/docs/operations/deployment/nginx/stemedb.conf
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

390 lines
17 KiB
Plaintext

# Nginx Reverse Proxy Configuration for StemeDB
#
# This configuration provides:
# - TLS 1.3 termination with Let's Encrypt
# - HTTP → HTTPS redirect
# - Request size limits (2MB)
# - Rate limiting (100 req/sec per IP)
# - Security headers (HSTS, X-Frame-Options)
# - Health-checked upstream (single-node or cluster)
# - Admin endpoint restrictions (VPN-only)
# - Metrics endpoint restrictions (internal-only)
#
# Installation:
# sudo cp stemedb.conf /etc/nginx/sites-available/
# sudo ln -s /etc/nginx/sites-available/stemedb.conf /etc/nginx/sites-enabled/
# sudo nginx -t
# sudo systemctl reload nginx
# ┌───────────────────────────────────────────────────────────┐
# │ Rate Limiting Zones │
# └───────────────────────────────────────────────────────────┘
# These directives only DECLARE shared-memory zones (10 MB each), keyed by the
# client address in binary form. Nothing is limited until a limit_req /
# limit_conn directive references a zone by name.
# Zone for general API requests (sustained rate: 100 req/sec per IP)
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=100r/s;
# Zone for write-heavy endpoints (sustained rate: 10 req/sec per IP)
limit_req_zone $binary_remote_addr zone=write_limit:10m rate=10r/s;
# Zone for tracking concurrent connections per IP; the actual ceiling
# (10 per IP) is set by the limit_conn directive in the HTTPS server block
limit_conn_zone $binary_remote_addr zone=conn_limit:10m;
# ┌───────────────────────────────────────────────────────────┐
# │ Upstream Configuration │
# └───────────────────────────────────────────────────────────┘
# Single-node configuration
upstream stemedb_backend {
    # Use the IPv4 loopback literal instead of "localhost". nginx resolves a
    # hostname once at startup and registers every returned address as its own
    # peer, so "localhost" can become two peers (127.0.0.1 and ::1). If the
    # backend listens on only one of them, requests routed to the other fail,
    # and the write endpoints cannot recover because they disable
    # proxy_next_upstream.
    # NOTE(review): assumes StemeDB listens on IPv4 loopback - use [::1]:18180
    # instead if it binds IPv6 only.
    server 127.0.0.1:18180;

    # Active health check (requires the third-party nginx_upstream_check_module)
    # check interval=5000 rise=2 fall=3 timeout=3000;

    # Pool of idle keepalive connections to the backend kept per worker
    keepalive 32;
}
# Three-node cluster configuration (comment out single-node above).
# The upstream keeps the name "stemedb_backend" because every proxy_pass in
# this file references that name; a differently named upstream would not be used.
# upstream stemedb_backend {
# # Round-robin (default)
# server 10.0.1.51:18180 weight=1 max_fails=3 fail_timeout=30s;
# server 10.0.1.52:18180 weight=1 max_fails=3 fail_timeout=30s;
# server 10.0.1.53:18180 weight=1 max_fails=3 fail_timeout=30s;
#
# # Connection keepalive
# keepalive 32;
# }
# ┌───────────────────────────────────────────────────────────┐
# │ HTTP → HTTPS Redirect │
# └───────────────────────────────────────────────────────────┘
# Plain-HTTP listener: answers ACME challenges and redirects everything else.
server {
    server_name stemedb.example.com;
    listen 80;
    listen [::]:80;

    # Any request that is not an ACME challenge is permanently redirected to
    # the same URI on HTTPS.
    location / {
        return 301 https://$server_name$request_uri;
    }

    # Let's Encrypt HTTP-01 challenge files are served from the certbot webroot.
    location /.well-known/acme-challenge/ {
        root /var/www/certbot;
    }
}
# ┌───────────────────────────────────────────────────────────┐
# │ HTTPS Server (Main Configuration) │
# └───────────────────────────────────────────────────────────┘
server {
listen 443 ssl http2;
listen [::]:443 ssl http2;
server_name stemedb.example.com;
# ─────────────────────────────────────────────────────────
# TLS Configuration
# ─────────────────────────────────────────────────────────
# Let's Encrypt certificates (managed by certbot)
ssl_certificate /etc/letsencrypt/live/stemedb.example.com/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/stemedb.example.com/privkey.pem;
# TLS 1.3 only (most secure)
ssl_protocols TLSv1.3;
# Cipher selection (TLS 1.3)
# ssl_ciphers is deliberately NOT set: it configures the cipher list used for
# TLS 1.2 and earlier, and a list made up solely of TLS 1.3 suite names
# (TLS_AES_...) matches nothing there, which can abort startup with
# "SSL_CTX_set_cipher_list(...) failed ... no cipher match".
# OpenSSL enables TLS_AES_256_GCM_SHA384, TLS_CHACHA20_POLY1305_SHA256 and
# TLS_AES_128_GCM_SHA256 for TLS 1.3 by default. To pin them explicitly
# (nginx 1.19.4+ with OpenSSL 1.1.1+), uncomment:
# ssl_conf_command Ciphersuites TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_AES_128_GCM_SHA256;
ssl_prefer_server_ciphers on;
# SSL session cache (shared between workers); tickets disabled
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 10m;
ssl_session_tickets off;
# OCSP Stapling
# NOTE(review): stapling only works while the certificate carries an OCSP
# responder URL; otherwise nginx logs a warning and ignores it. Let's Encrypt
# has announced the end of its OCSP service - confirm this is still wanted.
ssl_stapling on;
ssl_stapling_verify on;
ssl_trusted_certificate /etc/letsencrypt/live/stemedb.example.com/chain.pem;
# Resolver used to look up the OCSP responder host name
resolver 8.8.8.8 8.8.4.4 valid=300s;
resolver_timeout 5s;
# ─────────────────────────────────────────────────────────
# Security Headers
# ─────────────────────────────────────────────────────────
# All headers use "always" so they are also attached to error responses.
# NOTE: add_header directives declared here are inherited by a location only
# if that location declares no add_header of its own.
# HSTS (1 year, include subdomains, opted in to browser preload lists)
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
# Prevent clickjacking (framing allowed from the same origin only)
add_header X-Frame-Options "SAMEORIGIN" always;
# Disable content type sniffing
add_header X-Content-Type-Options "nosniff" always;
# XSS protection (legacy header, honored only by older browsers)
add_header X-XSS-Protection "1; mode=block" always;
# Referrer policy
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
# CSP (Content Security Policy): same-origin resources only, with inline
# scripts/styles and data: images permitted
add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data:; font-src 'self'; connect-src 'self';" always;
# ─────────────────────────────────────────────────────────
# Logging
# ─────────────────────────────────────────────────────────
# Access log in nginx's predefined "combined" format
access_log /var/log/nginx/stemedb-access.log combined;
# Error log at "warn" level and above
error_log /var/log/nginx/stemedb-error.log warn;
# ─────────────────────────────────────────────────────────
# Global Limits
# ─────────────────────────────────────────────────────────
# Max request body size (2MB for assertions)
client_max_body_size 2M;
# Default upstream timeouts (individual locations may override them)
proxy_connect_timeout 10s;
proxy_send_timeout 30s;
proxy_read_timeout 30s;
# Connection limits: at most 10 concurrent connections per client IP.
# With HTTP/2 each concurrent request counts as a separate connection.
limit_conn conn_limit 10;
# Reject rate-limited and connection-limited requests with
# 429 Too Many Requests. nginx defaults to 503 for both, which would bypass
# the custom 429 error page and be indistinguishable from a backend outage.
limit_req_status 429;
limit_conn_status 429;
# ─────────────────────────────────────────────────────────
# Health Check Endpoint (Public)
# ─────────────────────────────────────────────────────────
location = /v1/health {
    proxy_pass http://stemedb_backend;
    # HTTP/1.1 with an empty Connection header so upstream keepalive is used
    proxy_http_version 1.1;
    proxy_set_header Connection "";

    # No rate limiting on health checks: request limits exist only in the
    # locations that declare limit_req, and none is declared at server level,
    # so there is nothing to switch off here. (limit_req accepts only
    # zone=/burst=/nodelay/delay= parameters; "limit_req off;" is rejected by
    # "nginx -t".) The server-level limit_conn still applies.

    # Fast timeout for health checks
    proxy_connect_timeout 3s;
    proxy_send_timeout 5s;
    proxy_read_timeout 5s;
}
# ─────────────────────────────────────────────────────────
# Write Endpoints (Stricter Rate Limits)
# ─────────────────────────────────────────────────────────
# Regex location: matches exactly /v1/assert and /v1/retract.
location ~ ^/v1/(assert|retract)$ {
    # Talk HTTP/1.1 to the backend with an empty Connection header so the
    # upstream keepalive pool can be reused.
    proxy_http_version 1.1;
    proxy_set_header Connection "";

    # Forward the original host and client details to the backend.
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    proxy_pass http://stemedb_backend;

    # Writes are not idempotent, so a failed request is never replayed
    # against another upstream server.
    proxy_next_upstream off;

    # Write rate limit: 10 req/sec per IP, bursts of up to 20 served without delay.
    limit_req zone=write_limit burst=20 nodelay;
}
# ─────────────────────────────────────────────────────────
# Query Endpoints (Standard Rate Limits)
# ─────────────────────────────────────────────────────────
# Prefix location: matches every URI that begins with /v1/query.
location /v1/query {
    # Talk HTTP/1.1 to the backend with an empty Connection header so the
    # upstream keepalive pool can be reused.
    proxy_http_version 1.1;
    proxy_set_header Connection "";

    # Forward the original host and client details to the backend.
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    proxy_pass http://stemedb_backend;

    # A failed query may be passed to the next upstream server on connection
    # errors, timeouts, or 502/503 responses: at most 2 tries within 10s.
    proxy_next_upstream error timeout http_502 http_503;
    proxy_next_upstream_tries 2;
    proxy_next_upstream_timeout 10s;

    # API rate limit: 100 req/sec per IP, bursts of up to 200 served without delay.
    limit_req zone=api_limit burst=200 nodelay;
}
# ─────────────────────────────────────────────────────────
# Admin Endpoints (Restricted to Internal Network)
# ─────────────────────────────────────────────────────────
# NOTE: this is a prefix match on "/v1/admin/". A request for exactly
# "/v1/admin" (no trailing slash) does not match it and is handled by
# "location /" instead.
location /v1/admin/ {
    # ⚠️ CRITICAL: Admin endpoints have NO authentication.
    # The allow/deny list below is the only access control. Rules are
    # evaluated in order and the first match wins.

    # Private (RFC 1918) address ranges
    allow 10.0.0.0/8;
    allow 172.16.0.0/12;
    allow 192.168.0.0/16;
    # Alternatively, allow only a specific VPN subnet
    # allow 10.8.0.0/24;
    # Everyone else is rejected
    deny all;

    # Talk HTTP/1.1 to the backend with an empty Connection header so the
    # upstream keepalive pool can be reused.
    proxy_http_version 1.1;
    proxy_set_header Connection "";

    # Forward the original host and client details to the backend.
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    proxy_pass http://stemedb_backend;
}
# ─────────────────────────────────────────────────────────
# Metrics Endpoint (Restricted to Prometheus)
# ─────────────────────────────────────────────────────────
location /metrics {
    # Only allow from Prometheus server
    allow 10.0.1.100; # Replace with your Prometheus IP
    # Deny all others
    deny all;

    proxy_pass http://stemedb_backend;
    # HTTP/1.1 with an empty Connection header so upstream keepalive is used
    proxy_http_version 1.1;
    proxy_set_header Connection "";

    # No rate limiting on metrics: request limits exist only in the locations
    # that declare limit_req, and none is declared at server level, so there
    # is nothing to switch off here. (limit_req accepts only
    # zone=/burst=/nodelay/delay= parameters; "limit_req off;" is rejected by
    # "nginx -t".)
}
# ─────────────────────────────────────────────────────────
# Dashboard (Public with Rate Limiting)
# ─────────────────────────────────────────────────────────
# Catch-all location: handles every request not matched by a more specific
# location in this server block.
location / {
    # Apply API rate limit (100 req/sec per IP, burst 200)
    limit_req zone=api_limit burst=200 nodelay;

    proxy_pass http://stemedb_backend;
    proxy_http_version 1.1;

    # Forward the original host and client details to the backend.
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    # WebSocket support: pass the client's Upgrade header through and mark the
    # hop as upgradable. Connection is set exactly once here; also declaring
    # it with an empty value in the same location would be contradictory.
    # NOTE(review): this sends "Connection: upgrade" on every request. For
    # mixed HTTP/WebSocket traffic, nginx's WebSocket proxying guide uses a
    # "map $http_upgrade $connection_upgrade" in the http context instead.
    proxy_set_header Upgrade $http_upgrade;
    proxy_set_header Connection "upgrade";
}
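# ─────────────────────────────────────────────────────────
# Static Files (Optional - for custom dashboard assets)
# ─────────────────────────────────────────────────────────
# NOTE: a location that declares its own add_header no longer inherits the
# server-level security headers; repeat them here if this block is enabled.
# location /static/ {
# alias /var/www/stemedb/static/;
# expires 1y;
# add_header Cache-Control "public, immutable";
# }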
# ─────────────────────────────────────────────────────────
# Error Pages
# ─────────────────────────────────────────────────────────
# Each error status is internally redirected to a static page under
# /usr/share/nginx/html. The page locations are "internal", so clients cannot
# request them directly.

    # Upstream failures (bad gateway / unavailable / gateway timeout)
    error_page 502 503 504 /50x.html;
    # Custom 429 (rate limit) page
    error_page 429 /429.html;
    # Custom 403 (forbidden) page
    error_page 403 /403.html;

    location = /50x.html {
        internal;
        root /usr/share/nginx/html;
    }

    location = /429.html {
        internal;
        root /usr/share/nginx/html;
    }

    location = /403.html {
        internal;
        root /usr/share/nginx/html;
    }
}
# ┌───────────────────────────────────────────────────────────┐
# │ Usage Instructions │
# └───────────────────────────────────────────────────────────┘
#
# 1. Install certbot:
# sudo apt install certbot python3-certbot-nginx
#
# 2. Obtain certificate:
# sudo certbot --nginx -d stemedb.example.com
#
# 3. Copy config:
# sudo cp stemedb.conf /etc/nginx/sites-available/
#
# 4. Update variables:
# - Replace stemedb.example.com with your domain
# - Update internal network ranges (10.0.0.0/8)
# - Update Prometheus IP (10.0.1.100)
#
# 5. Enable site:
# sudo ln -s /etc/nginx/sites-available/stemedb.conf /etc/nginx/sites-enabled/
#
# 6. Test config:
# sudo nginx -t
#
# 7. Reload nginx:
# sudo systemctl reload nginx
#
# 8. Test endpoints:
# curl https://stemedb.example.com/v1/health
#
# 9. Set up auto-renewal:
# sudo crontab -e
# # Add: 0 3 * * * certbot renew --quiet && systemctl reload nginx
# ┌───────────────────────────────────────────────────────────┐
# │ Monitoring & Troubleshooting │
# └───────────────────────────────────────────────────────────┘
#
# View access logs:
# sudo tail -f /var/log/nginx/stemedb-access.log
#
# View error logs:
# sudo tail -f /var/log/nginx/stemedb-error.log
#
# Check rate limit status:
# sudo grep "limiting requests" /var/log/nginx/stemedb-error.log
#
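# Test rate limiting (/v1/health is not rate limited, so target a limited
# endpoint, and send requests in parallel - a sequential curl loop rarely
# exceeds 100 req/sec or the burst of 200):
# seq 1 500 | xargs -P 50 -I{} curl -s -o /dev/null -w "%{http_code}\n" https://stemedb.example.com/v1/query | sort | uniq -c
# # Rejected requests carry the status set by limit_req_status (503 unless overridden)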
#
# Check TLS configuration:
# openssl s_client -connect stemedb.example.com:443 -tls1_3
#
# Test security headers:
# curl -I https://stemedb.example.com/v1/health
# ┌───────────────────────────────────────────────────────────┐
# │ Production Hardening Checklist │
# └───────────────────────────────────────────────────────────┘
#
# - [ ] Enable ModSecurity WAF (optional)
# - [ ] Set up fail2ban for DDoS protection
# - [ ] Configure log rotation (logrotate)
# - [ ] Set up centralized logging (ELK, Splunk)
# - [ ] Enable nginx status page (/nginx_status) for monitoring
# - [ ] Configure backup upstream servers
# - [ ] Set up nginx Prometheus exporter
# - [ ] Test certificate renewal process
# - [ ] Document rate limit thresholds
# - [ ] Create custom error pages (50x.html, 429.html, 403.html) in /usr/share/nginx/html