This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments: ## API Layer - Add rate limiting middleware with configurable limits per endpoint - Enhance error handling with detailed context and proper HTTP status codes - Add security hardening tests for input validation and boundary conditions - Create store_helpers module for defensive storage access patterns ## Storage & WAL - Optimize group commit batching for higher throughput - Add defensive error handling in hybrid backend with proper fallbacks - Enhance WAL journal durability guarantees with fsync validation - Improve index store query performance with better caching ## Operations & Deployment - Add comprehensive operations documentation (deployment, monitoring, DR) - Create systemd units for backup, WAL archival, and verification - Add monitoring configs (Prometheus alerts, metrics exporters) - Implement backup/restore scripts with verification and S3 archival - Add DR drill automation and runbook procedures - Create load balancer configs (nginx, envoy) with health checks ## Documentation - Update CLAUDE.md with operations and troubleshooting guides - Expand roadmap with production readiness milestones - Add pilot success criteria and deployment reference architecture - Document TLS setup, monitoring integration, and incident response ## Configuration - Add .env.example with all required environment variables - Document resource sizing for different deployment scales - Add configuration examples for various deployment topologies This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
290 lines
8.9 KiB
YAML
290 lines
8.9 KiB
YAML
# Docker Compose: StemeDB Pilot with Monitoring
|
|
#
|
|
# This configuration deploys:
|
|
# - StemeDB API (single-node)
|
|
# - Prometheus (metrics collection)
|
|
# - Grafana (visualization + pre-configured dashboard)
|
|
# - Backup container (daily automated backups)
|
|
#
|
|
# Usage:
|
|
# docker-compose -f pilot-with-monitoring.yml up -d
|
|
#
|
|
# Access:
|
|
# - StemeDB API: http://localhost:18180
|
|
# - StemeDB Dashboard: http://localhost:18188
|
|
# - Grafana: http://localhost:3000 (admin/admin)
|
|
# - Prometheus: http://localhost:9090
|
|
|
|
version: "3.8"  # Compose file format version declared for this stack
|
|
|
|
services:
  # ─────────────────────────────────────────────────────
  #  StemeDB API Server
  # ─────────────────────────────────────────────────────
  # Single-node StemeDB instance. Publishes the API/metrics port and the
  # dashboard port, and persists WAL and DB data in named volumes.
  stemedb:
    image: stemedb/stemedb-api:latest  # Replace with your registry
    container_name: stemedb-api
    restart: unless-stopped

    ports:
      - "18180:18180"  # API + Metrics
      - "18188:18188"  # Dashboard

    environment:
      STEMEDB_BIND_ADDR: "0.0.0.0:18180"
      STEMEDB_WAL_DIR: "/data/wal"
      STEMEDB_DB_DIR: "/data/db"
      STEMEDB_METER_ENABLED: "true"
      RUST_LOG: "info,stemedb=debug"

      # Optional: Cluster mode (disabled for single-node pilot)
      # STEMEDB_CLUSTER_ENABLED: "false"

    volumes:
      - stemedb-wal:/data/wal
      - stemedb-db:/data/db
      # Optional custom config.
      # NOTE(review): Docker typically creates a *directory* at a missing
      # bind-mount source; make sure ./config.toml exists as a file, or
      # remove this mount when no custom config is used.
      - ./config.toml:/etc/stemedb/config.toml:ro

    healthcheck:
      # NOTE(review): assumes `curl` is available inside the image - confirm.
      test: ["CMD", "curl", "-f", "http://localhost:18180/v1/health"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s

    networks:
      - stemedb-network

    # Resource limits (adjust based on load)
    # NOTE(review): whether `deploy.resources` is enforced depends on the
    # Compose implementation/mode in use - verify the limits are applied.
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '1.0'
          memory: 2G

  # ─────────────────────────────────────────────────────
  #  Prometheus (Metrics Collection)
  # ─────────────────────────────────────────────────────
  # Reads its configuration from ./prometheus.yml and stores the TSDB in the
  # prometheus-data volume.
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped

    ports:
      - "9090:9090"

    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'  # Retain 30 days of metrics

    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus

    networks:
      - stemedb-network

    depends_on:
      - stemedb

  # ─────────────────────────────────────────────────────
  #  Grafana (Visualization)
  # ─────────────────────────────────────────────────────
  # Provisioning files and dashboards are bind-mounted read-only from
  # ./grafana; Grafana state lives in the grafana-data volume.
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped

    ports:
      - "3000:3000"

    environment:
      GF_SECURITY_ADMIN_USER: admin
      # Defaults to "admin" for the pilot. Override without editing this file
      # by exporting GRAFANA_ADMIN_PASSWORD (or setting it in .env).
      # CHANGE IN PRODUCTION.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"
      GF_USERS_ALLOW_SIGN_UP: "false"
      # NOTE(review): recent Grafana releases ship a built-in pie chart
      # panel; confirm this plugin is still required.
      GF_INSTALL_PLUGINS: "grafana-piechart-panel"

    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro

    networks:
      - stemedb-network

    depends_on:
      - prometheus

  # ─────────────────────────────────────────────────────
  #  Backup Container (Daily Automated Backups)
  # ─────────────────────────────────────────────────────
  # Copies the WAL and DB volumes (mounted read-only) into a timestamped
  # directory under ./backups once every 24 hours and keeps the 7 newest.
  # NOTE(review): the copy is taken while StemeDB is running; confirm that a
  # file-level copy of a live WAL/DB is restorable for your use case.
  backup:
    image: alpine:latest
    container_name: stemedb-backup
    restart: unless-stopped

    # List form + literal block scalar: the script reaches `sh` exactly as
    # written (newlines preserved, no nested-quote escaping).
    # Every shell `$` is written as `$$` so Compose variable interpolation
    # leaves it alone and passes a single `$` to the shell.
    command:
      - sh
      - "-c"
      - |
        apk add --no-cache rsync || exit 1
        while true; do
          echo "[$$(date)] Starting backup..."
          BACKUP_DIR=/backups/stemedb-backup-$$(date +%Y%m%d-%H%M%S)
          mkdir -p "$$BACKUP_DIR"
          rsync -av --delete /data/wal/ "$$BACKUP_DIR/wal/"
          rsync -av --delete /data/db/ "$$BACKUP_DIR/db/"
          printf '{"timestamp": "%s", "version": "0.1.0"}\n' "$$(date -Iseconds)" > "$$BACKUP_DIR/metadata.json"
          echo "[$$(date)] Backup complete: $$BACKUP_DIR"

          # Cleanup old backups (keep last 7)
          ls -dt /backups/stemedb-backup-* | tail -n +8 | xargs rm -rf

          # Sleep 24 hours before the next run. The schedule is relative to
          # container start / previous run, not pinned to a wall-clock time.
          sleep 86400
        done

    volumes:
      - stemedb-wal:/data/wal:ro
      - stemedb-db:/data/db:ro
      - ./backups:/backups

    networks:
      - stemedb-network

    depends_on:
      - stemedb
|
|
|
|
# ┌───────────────────────────────────────────────────────────┐
|
|
# │ Volumes (Persistent Storage) │
|
|
# └───────────────────────────────────────────────────────────┘
|
|
|
|
volumes:
  # Named volumes, all using the `local` driver.
  stemedb-wal: { driver: local }      # mounted at /data/wal
  stemedb-db: { driver: local }       # mounted at /data/db
  prometheus-data: { driver: local }  # mounted at /prometheus
  grafana-data: { driver: local }     # mounted at /var/lib/grafana
|
|
|
|
# ┌───────────────────────────────────────────────────────────┐
|
|
# │ Networks │
|
|
# └───────────────────────────────────────────────────────────┘
|
|
|
|
networks:
  # Bridge network shared by every service in this stack.
  stemedb-network: { driver: bridge }
|
|
|
|
---
|
|
# prometheus.yml (save as ./prometheus.yml)
|
|
|
|
global:
  # Scrape targets and evaluate rules every 15 seconds.
  scrape_interval: 15s
  evaluation_interval: 15s

# NOTE(review): this document is meant to be saved as its own file
# (./prometheus.yml). Some Compose versions reject or mis-merge
# multi-document files, so extract it before running the stack - confirm
# against the Compose version in use.
scrape_configs:
  # StemeDB metrics, scraped from host `stemedb` on port 18180.
  - job_name: "stemedb"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "stemedb:18180"

  # Prometheus scraping itself.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "prometheus:9090"
|
|
|
|
---
|
|
# Grafana Provisioning: Datasource (save as ./grafana/provisioning/datasources/prometheus.yml)
|
|
|
|
apiVersion: 1

datasources:
  # Default, non-editable Prometheus datasource reached through the Grafana
  # backend (proxy access) at host `prometheus`, port 9090.
  - name: "Prometheus"
    type: "prometheus"
    url: "http://prometheus:9090"
    access: "proxy"
    isDefault: true
    editable: false
|
|
|
|
---
|
|
# Grafana Provisioning: Dashboard (save as ./grafana/provisioning/dashboards/stemedb.yml)
|
|
|
|
apiVersion: 1

providers:
  # File-based provider: dashboards found under the path below are placed
  # in the "StemeDB" folder.
  - name: "StemeDB"
    type: "file"
    folder: "StemeDB"
    options:
      path: "/var/lib/grafana/dashboards"
|
|
|
|
---
|
|
# Grafana Dashboard JSON (save as ./grafana/dashboards/stemedb-overview.json)
|
|
#
|
|
# No dashboard JSON is embedded here; only a summary of its panels is listed below. For the full dashboard, see:
|
|
# https://github.com/yourorg/stemedb/blob/main/grafana/dashboards/stemedb-overview.json
|
|
#
|
|
# Panels:
|
|
# 1. Query Latency (p50, p95, p99)
|
|
# 2. Ingest Rate (assertions/sec)
|
|
# 3. Disk Usage (WAL + DB)
|
|
# 4. Error Rate (4xx, 5xx)
|
|
# 5. Quarantine Queue Size
|
|
# 6. Circuit Breaker States
|
|
|
|
---
|
|
# Usage Instructions:
|
|
#
|
|
# 1. Create directory structure:
|
|
# mkdir -p ./grafana/provisioning/datasources
|
|
# mkdir -p ./grafana/provisioning/dashboards
|
|
# mkdir -p ./grafana/dashboards
|
|
# mkdir -p ./backups
|
|
#
|
|
# 2. Save prometheus.yml in current directory
|
|
#
|
|
# 3. Save the Grafana provisioning files as
#    ./grafana/provisioning/datasources/prometheus.yml and
#    ./grafana/provisioning/dashboards/stemedb.yml
|
|
#
|
|
# 4. Start stack:
|
|
# docker-compose -f pilot-with-monitoring.yml up -d
|
|
#
|
|
# 5. Verify health:
|
|
# curl http://localhost:18180/v1/health
|
|
# open http://localhost:3000 # Grafana (admin/admin)
|
|
#
|
|
# 6. View metrics:
|
|
# open http://localhost:9090 # Prometheus
|
|
#
|
|
# 7. Check backups:
|
|
# ls -lh ./backups/
|
|
#
|
|
# 8. Stop stack:
|
|
# docker-compose -f pilot-with-monitoring.yml down
|
|
#
|
|
# 9. Clean volumes (⚠️ DELETES ALL DATA):
|
|
# docker-compose -f pilot-with-monitoring.yml down -v
|
|
|
|
---
|
|
# Production Hardening Checklist:
|
|
#
|
|
# - [ ] Change Grafana admin password
|
|
# - [ ] Add TLS reverse proxy (see nginx config)
|
|
# - [ ] Set resource limits based on load testing
|
|
# - [ ] Configure external backup storage (S3, NFS)
|
|
# - [ ] Set up alerting (Prometheus Alertmanager)
|
|
# - [ ] Enable log aggregation (ELK, Loki)
|
|
# - [ ] Restrict network access (firewall rules)
|
|
# - [ ] Use secrets management (Docker secrets, Vault)
|
|
# - [ ] Enable monitoring for backup container
|
|
# - [ ] Test restore procedure monthly
|