stemedb/docs/operations/deployment/docker-compose/pilot-with-monitoring.yml
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

290 lines
8.9 KiB
YAML

# Docker Compose: StemeDB Pilot with Monitoring
#
# This configuration deploys:
# - StemeDB API (single-node)
# - Prometheus (metrics collection)
# - Grafana (visualization + pre-configured dashboard)
# - Backup container (daily automated backups)
#
# Usage:
# docker-compose -f pilot-with-monitoring.yml up -d
#
# Access:
# - StemeDB API: http://localhost:18180
# - StemeDB Dashboard: http://localhost:18188
# - Grafana: http://localhost:3000 (admin/admin)
# - Prometheus: http://localhost:9090
version: '3.8'

services:
  # ---------------------------------------------------------------
  # StemeDB API Server
  # ---------------------------------------------------------------
  stemedb:
    image: stemedb/stemedb-api:latest  # Replace with your registry
    container_name: stemedb-api
    restart: unless-stopped
    ports:
      - "18180:18180"  # API + Metrics
      - "18188:18188"  # Dashboard
    environment:
      STEMEDB_BIND_ADDR: "0.0.0.0:18180"
      STEMEDB_WAL_DIR: "/data/wal"
      STEMEDB_DB_DIR: "/data/db"
      STEMEDB_METER_ENABLED: "true"
      RUST_LOG: "info,stemedb=debug"
      # Optional: Cluster mode (disabled for single-node pilot)
      # STEMEDB_CLUSTER_ENABLED: "false"
    volumes:
      - stemedb-wal:/data/wal
      - stemedb-db:/data/db
      # Optional custom config. The host file must already exist: with this
      # short bind-mount syntax Docker creates a missing host path as a
      # directory. Remove this line if no custom config is used.
      - ./config.toml:/etc/stemedb/config.toml:ro
    healthcheck:
      # NOTE(review): assumes `curl` is installed in the image - confirm.
      test: ["CMD", "curl", "-f", "http://localhost:18180/v1/health"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s
    networks:
      - stemedb-network
    # Resource limits (adjust based on load).
    # NOTE(review): legacy docker-compose v1 only honours `deploy` limits
    # when run with --compatibility - confirm for the Compose version in use.
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '1.0'
          memory: 2G

  # ---------------------------------------------------------------
  # Prometheus (Metrics Collection)
  # ---------------------------------------------------------------
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'  # Retain 30 days of metrics
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    networks:
      - stemedb-network
    depends_on:
      - stemedb

  # ---------------------------------------------------------------
  # Grafana (Visualization)
  # ---------------------------------------------------------------
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    environment:
      GF_SECURITY_ADMIN_USER: admin
      # CHANGE IN PRODUCTION: set GRAFANA_ADMIN_PASSWORD in the environment
      # (or in a .env file next to this file). Falls back to "admin" when
      # the variable is unset or empty.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_INSTALL_PLUGINS: "grafana-piechart-panel"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    networks:
      - stemedb-network
    depends_on:
      - prometheus

  # ---------------------------------------------------------------
  # Backup Container (Daily Automated Backups)
  # ---------------------------------------------------------------
  backup:
    image: alpine:latest
    container_name: stemedb-backup
    restart: unless-stopped
    # Exec-form command: the script reaches `sh -c` as one literal block, so
    # it does not depend on Compose's word-splitting or on YAML line folding.
    # Every shell `$` is written `$$` because Compose treats a single `$` as
    # the start of a variable interpolation.
    command:
      - sh
      - -c
      - |
        apk add --no-cache rsync &&
        while true; do
          # Double quotes so the shell expands $(date) and $BACKUP_DIR.
          echo "[$$(date)] Starting backup..."
          BACKUP_DIR="/backups/stemedb-backup-$$(date +%Y%m%d-%H%M%S)"
          mkdir -p "$$BACKUP_DIR"
          rsync -av --delete /data/wal/ "$$BACKUP_DIR/wal/"
          rsync -av --delete /data/db/ "$$BACKUP_DIR/db/"
          printf '{"timestamp": "%s", "version": "0.1.0"}\n' "$$(date -Iseconds)" > "$$BACKUP_DIR/metadata.json"
          echo "[$$(date)] Backup complete: $$BACKUP_DIR"
          # Cleanup old backups (keep last 7)
          ls -dt /backups/stemedb-backup-* | tail -n +8 | xargs rm -rf
          # Sleep 24 hours until the next run. The interval is counted from
          # container start; it is not pinned to a wall-clock time.
          sleep 86400
        done
    volumes:
      - stemedb-wal:/data/wal:ro
      - stemedb-db:/data/db:ro
      - ./backups:/backups
    networks:
      - stemedb-network
    depends_on:
      - stemedb

# -----------------------------------------------------------------
# Volumes (Persistent Storage)
# -----------------------------------------------------------------
volumes:
  stemedb-wal:
    driver: local
  stemedb-db:
    driver: local
  prometheus-data:
    driver: local
  grafana-data:
    driver: local

# -----------------------------------------------------------------
# Networks
# -----------------------------------------------------------------
networks:
  stemedb-network:
    driver: bridge
---
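# prometheus.yml (save as ./prometheus.yml)
# NOTE: this document and the ones after it belong in separate files.
# Remove them from this file before passing it to docker-compose.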
# Defaults applied to every scrape job below.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # StemeDB API: metrics are scraped from the API port.
  - job_name: "stemedb"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "stemedb:18180"

  # Prometheus scraping itself.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "prometheus:9090"
---
# Grafana Provisioning: Datasource (save as ./grafana/provisioning/datasources/prometheus.yml)
apiVersion: 1

# Single Prometheus datasource, reached through the Grafana backend (proxy)
# at the Compose service name; marked default and locked against UI edits.
datasources:
  - name: "Prometheus"
    type: "prometheus"
    access: "proxy"
    url: "http://prometheus:9090"
    isDefault: true
    editable: false
---
# Grafana Provisioning: Dashboard (save as ./grafana/provisioning/dashboards/stemedb.yml)
apiVersion: 1

# File-based dashboard provider: loads dashboards from the directory that the
# grafana service mounts from ./grafana/dashboards.
providers:
  - name: "StemeDB"
    folder: "StemeDB"
    type: "file"
    options:
      path: "/var/lib/grafana/dashboards"
---
# Grafana Dashboard JSON (save as ./grafana/dashboards/stemedb-overview.json)
#
# Only the panel outline is listed here; the dashboard JSON itself is not
# embedded in this file. For the full dashboard, see:
# https://github.com/yourorg/stemedb/blob/main/grafana/dashboards/stemedb-overview.json
#
# Panels:
# 1. Query Latency (p50, p95, p99)
# 2. Ingest Rate (assertions/sec)
# 3. Disk Usage (WAL + DB)
# 4. Error Rate (4xx, 5xx)
# 5. Quarantine Queue Size
# 6. Circuit Breaker States
---
# Usage Instructions:
#
# 1. Create directory structure:
# mkdir -p ./grafana/provisioning/datasources
# mkdir -p ./grafana/provisioning/dashboards
# mkdir -p ./grafana/dashboards
# mkdir -p ./backups
#
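# 2. Save prometheus.yml in the current directory
#
# 3. Save the Grafana provisioning files under ./grafana/provisioning/
#    (datasources/prometheus.yml and dashboards/stemedb.yml) and the
#    dashboard JSON under ./grafana/dashboards/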
#
# 4. Start stack:
# docker-compose -f pilot-with-monitoring.yml up -d
#
# 5. Verify health:
# curl http://localhost:18180/v1/health
# open http://localhost:3000 # Grafana (admin/admin)
#
# 6. View metrics:
# open http://localhost:9090 # Prometheus
#
# 7. Check backups:
# ls -lh ./backups/
#
# 8. Stop stack:
# docker-compose -f pilot-with-monitoring.yml down
#
# 9. Clean volumes (⚠️ DELETES ALL DATA):
# docker-compose -f pilot-with-monitoring.yml down -v
---
# Production Hardening Checklist:
#
# - [ ] Change Grafana admin password
# - [ ] Add TLS reverse proxy (see nginx config)
# - [ ] Set resource limits based on load testing
# - [ ] Configure external backup storage (S3, NFS)
# - [ ] Set up alerting (Prometheus Alertmanager)
# - [ ] Enable log aggregation (ELK, Loki)
# - [ ] Restrict network access (firewall rules)
# - [ ] Use secrets management (Docker secrets, Vault)
# - [ ] Enable monitoring for backup container
# - [ ] Test restore procedure monthly