# Docker Compose: StemeDB Pilot with Monitoring
#
# This configuration deploys:
#   - StemeDB API (single-node)
#   - Prometheus (metrics collection)
#   - Grafana (visualization + pre-configured dashboard)
#   - Backup container (daily automated backups)
#
# Usage:
#   docker-compose -f pilot-with-monitoring.yml up -d
#
# Access:
#   - StemeDB API:       http://localhost:18180
#   - StemeDB Dashboard: http://localhost:18188
#   - Grafana:           http://localhost:3000 (admin/admin)
#   - Prometheus:        http://localhost:9090
#
# Companion files (prometheus.yml, Grafana provisioning) are included at the
# bottom of this file as commented-out snippets. They are kept as comments so
# that this file stays a single YAML document that Compose can load; copy each
# snippet into the path named in its heading and remove the leading "# ".

version: '3.8'

services:
  # ┌─────────────────────────────────────────────────────┐
  # │ StemeDB API Server                                  │
  # └─────────────────────────────────────────────────────┘
  stemedb:
    image: stemedb/stemedb-api:latest  # Replace with your registry
    container_name: stemedb-api
    restart: unless-stopped
    ports:
      - "18180:18180"  # API + Metrics
      - "18188:18188"  # Dashboard
    environment:
      STEMEDB_BIND_ADDR: "0.0.0.0:18180"
      STEMEDB_WAL_DIR: "/data/wal"
      STEMEDB_DB_DIR: "/data/db"
      STEMEDB_METER_ENABLED: "true"
      RUST_LOG: "info,stemedb=debug"
      # Optional: Cluster mode (disabled for single-node pilot)
      # STEMEDB_CLUSTER_ENABLED: "false"
    volumes:
      - stemedb-wal:/data/wal
      - stemedb-db:/data/db
      # Optional custom config.
      # NOTE: the host file ./config.toml must exist before `up`; if it is
      # missing, Docker creates a *directory* at that path and mounts it
      # instead. Remove this line if you do not use a custom config.
      - ./config.toml:/etc/stemedb/config.toml:ro
    healthcheck:
      # NOTE(review): requires `curl` inside the StemeDB image - confirm.
      test: ["CMD", "curl", "-f", "http://localhost:18180/v1/health"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s
    networks:
      - stemedb-network
    # Resource limits (adjust based on load)
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '1.0'
          memory: 2G

  # ┌─────────────────────────────────────────────────────┐
  # │ Prometheus (Metrics Collection)                     │
  # └─────────────────────────────────────────────────────┘
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'  # Retain 30 days of metrics
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    networks:
      - stemedb-network
    depends_on:
      - stemedb

  # ┌─────────────────────────────────────────────────────┐
  # │ Grafana (Visualization)                             │
  # └─────────────────────────────────────────────────────┘
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    environment:
      GF_SECURITY_ADMIN_USER: "admin"
      # CHANGE IN PRODUCTION: set GRAFANA_ADMIN_PASSWORD in the shell or in a
      # .env file next to this compose file; falls back to "admin" if unset.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_INSTALL_PLUGINS: "grafana-piechart-panel"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    networks:
      - stemedb-network
    depends_on:
      - prometheus

  # ┌─────────────────────────────────────────────────────┐
  # │ Backup Container (Daily Automated Backups)          │
  # └─────────────────────────────────────────────────────┘
  backup:
    image: alpine:latest
    container_name: stemedb-backup
    restart: unless-stopped
    # The script is passed to `sh -c` in exec (list) form so no second layer
    # of quoting is needed. Every "$" intended for the shell is written "$$"
    # because Compose applies its own variable interpolation to this file.
    command:
      - sh
      - "-c"
      - |
        apk add --no-cache rsync || exit 1
        while true; do
          echo "[$$(date)] Starting backup..."
          BACKUP_DIR="/backups/stemedb-backup-$$(date +%Y%m%d-%H%M%S)"
          mkdir -p "$$BACKUP_DIR"
          rsync -av --delete /data/wal/ "$$BACKUP_DIR/wal/"
          rsync -av --delete /data/db/ "$$BACKUP_DIR/db/"
          printf '{"timestamp": "%s", "version": "0.1.0"}\n' \
            "$$(date -Iseconds)" > "$$BACKUP_DIR/metadata.json"
          echo "[$$(date)] Backup complete: $$BACKUP_DIR"

          # Cleanup old backups (keep last 7)
          ls -dt /backups/stemedb-backup-* | tail -n +8 | xargs rm -rf

          # Sleep 24 hours before the next run. This is interval-based: the
          # first backup runs at container start, not at a fixed time of day.
          sleep 86400
        done
    volumes:
      - stemedb-wal:/data/wal:ro
      - stemedb-db:/data/db:ro
      - ./backups:/backups
    networks:
      - stemedb-network
    depends_on:
      - stemedb

# ┌───────────────────────────────────────────────────────────┐
# │ Volumes (Persistent Storage)                              │
# └───────────────────────────────────────────────────────────┘
volumes:
  stemedb-wal:
    driver: local
  stemedb-db:
    driver: local
  prometheus-data:
    driver: local
  grafana-data:
    driver: local

# ┌───────────────────────────────────────────────────────────┐
# │ Networks                                                  │
# └───────────────────────────────────────────────────────────┘
networks:
  stemedb-network:
    driver: bridge

# =============================================================================
# Companion file: prometheus.yml (save as ./prometheus.yml)
# =============================================================================
# global:
#   scrape_interval: 15s
#   evaluation_interval: 15s
#
# scrape_configs:
#   - job_name: 'stemedb'
#     static_configs:
#       - targets: ['stemedb:18180']
#     metrics_path: '/metrics'
#
#   - job_name: 'prometheus'
#     static_configs:
#       - targets: ['prometheus:9090']

# =============================================================================
# Companion file: Grafana Provisioning - Datasource
# (save as ./grafana/provisioning/datasources/prometheus.yml)
# =============================================================================
# apiVersion: 1
#
# datasources:
#   - name: Prometheus
#     type: prometheus
#     access: proxy
#     url: http://prometheus:9090
#     isDefault: true
#     editable: false

# =============================================================================
# Companion file: Grafana Provisioning - Dashboard
# (save as ./grafana/provisioning/dashboards/stemedb.yml)
# =============================================================================
# apiVersion: 1
#
# providers:
#   - name: 'StemeDB'
#     folder: 'StemeDB'
#     type: file
#     options:
#       path: /var/lib/grafana/dashboards

# =============================================================================
# Grafana Dashboard JSON (save as ./grafana/dashboards/stemedb-overview.json)
# =============================================================================
#
# This is a simplified dashboard. For the full dashboard, see:
# https://github.com/yourorg/stemedb/blob/main/grafana/dashboards/stemedb-overview.json
#
# Panels:
#   1. Query Latency (p50, p95, p99)
#   2. Ingest Rate (assertions/sec)
#   3. Disk Usage (WAL + DB)
#   4. Error Rate (4xx, 5xx)
#   5. Quarantine Queue Size
#   6. Circuit Breaker States

# =============================================================================
# Usage Instructions:
# =============================================================================
#
# 1. Create directory structure:
#      mkdir -p ./grafana/provisioning/datasources
#      mkdir -p ./grafana/provisioning/dashboards
#      mkdir -p ./grafana/dashboards
#      mkdir -p ./backups
#
# 2. Save prometheus.yml in current directory
#
# 3. Save Grafana provisioning files in ./grafana/provisioning/
#
# 4. Start stack:
#      docker-compose -f pilot-with-monitoring.yml up -d
#
# 5. Verify health:
#      curl http://localhost:18180/v1/health
#      open http://localhost:3000  # Grafana (admin/admin)
#
# 6. View metrics:
#      open http://localhost:9090  # Prometheus
#
# 7. Check backups:
#      ls -lh ./backups/
#
# 8. Stop stack:
#      docker-compose -f pilot-with-monitoring.yml down
#
# 9. Clean volumes (⚠️ DELETES ALL DATA):
#      docker-compose -f pilot-with-monitoring.yml down -v

# =============================================================================
# Production Hardening Checklist:
# =============================================================================
#
# - [ ] Change Grafana admin password
# - [ ] Pin image tags instead of using :latest
# - [ ] Add TLS reverse proxy (see nginx config)
# - [ ] Set resource limits based on load testing
# - [ ] Configure external backup storage (S3, NFS)
# - [ ] Set up alerting (Prometheus Alertmanager)
# - [ ] Enable log aggregation (ELK, Loki)
# - [ ] Restrict network access (firewall rules)
# - [ ] Use secrets management (Docker secrets, Vault)
# - [ ] Enable monitoring for backup container
# - [ ] Test restore procedure monthly