# stemedb/docs/operations/deployment/docker-compose/pilot-with-monitoring.yml

# Docker Compose: StemeDB Pilot with Monitoring
#
# This configuration deploys:
# - StemeDB API (single-node)
# - Prometheus (metrics collection)
# - Grafana (visualization + pre-configured dashboard)
# - Backup container (daily automated backups)
#
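# Usage:
#   docker-compose -f pilot-with-monitoring.yml up -d
#
# Note: only the first YAML document in this file (everything before the
# first `---`) is the Compose file. The documents after it are companion
# files: save each one to the path named in its header comment and remove
# it from the Compose file before running the command above.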
#
# Access:
#   - StemeDB API: http://localhost:18180
#   - StemeDB Dashboard: http://localhost:18188
#   - Grafana: http://localhost:3000 (admin/admin)
#   - Prometheus: http://localhost:9090

version: '3.8'

services:
  # ┌─────────────────────────────────────────────────────┐
  # │  StemeDB API Server                                 │
  # └─────────────────────────────────────────────────────┘

  stemedb:
    # Single-node StemeDB API server. Its state lives in the two named
    # volumes mounted under /data.
    container_name: stemedb-api
    image: stemedb/stemedb-api:latest  # Replace with your registry
    restart: unless-stopped

    networks:
      - stemedb-network

    # host:container port mappings
    ports:
      - '18180:18180'  # API + Metrics
      - '18188:18188'  # Dashboard

    volumes:
      - 'stemedb-wal:/data/wal'
      - 'stemedb-db:/data/db'
      # Optional custom config, mounted read-only.
      # NOTE(review): ./config.toml presumably has to exist on the host
      # before `up` (a missing bind-mount source is typically created as a
      # directory) - confirm, or comment this line out when unused.
      - './config.toml:/etc/stemedb/config.toml:ro'

    environment:
      RUST_LOG: 'info,stemedb=debug'
      STEMEDB_BIND_ADDR: '0.0.0.0:18180'
      STEMEDB_DB_DIR: '/data/db'
      STEMEDB_METER_ENABLED: 'true'
      STEMEDB_WAL_DIR: '/data/wal'

      # Optional: Cluster mode (disabled for single-node pilot)
      # STEMEDB_CLUSTER_ENABLED: "false"

    # Probes the API health endpoint from inside the container.
    # NOTE(review): assumes `curl` is available in the image - confirm.
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-f'
        - 'http://localhost:18180/v1/health'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3

    # Resource limits (adjust based on load)
    deploy:
      resources:
        reservations:
          cpus: '1.0'
          memory: 2G
        limits:
          cpus: '2.0'
          memory: 4G

  # ┌─────────────────────────────────────────────────────┐
  # │  Prometheus (Metrics Collection)                    │
  # └─────────────────────────────────────────────────────┘

  prometheus:
    # Scrapes metrics according to the bind-mounted ./prometheus.yml and
    # stores them in the prometheus-data volume.
    container_name: prometheus
    image: prom/prometheus:latest
    restart: unless-stopped

    # Started after the stemedb service.
    depends_on:
      - stemedb

    networks:
      - stemedb-network

    ports:
      - '9090:9090'

    volumes:
      - './prometheus.yml:/etc/prometheus/prometheus.yml:ro'
      - 'prometheus-data:/prometheus'

    # Setting `command` replaces the image's default arguments, so the config
    # file and storage path are passed explicitly.
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.console.libraries=/etc/prometheus/console_libraries"
      - "--web.console.templates=/etc/prometheus/consoles"
      - "--storage.tsdb.retention.time=30d"  # Retain 30 days of metrics

  # ┌─────────────────────────────────────────────────────┐
  # │  Grafana (Visualization)                            │
  # └─────────────────────────────────────────────────────┘

  grafana:
    # Grafana UI. Datasources and dashboard providers are provisioned from
    # the read-only bind mounts below; Grafana's own state is kept in the
    # grafana-data volume.
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped

    ports:
      - "3000:3000"

    environment:
      # Admin credentials are taken from GRAFANA_ADMIN_USER and
      # GRAFANA_ADMIN_PASSWORD (shell environment or a `.env` file in the
      # project directory) and fall back to admin/admin when unset, so the
      # pilot still starts with no extra setup.
      # NOTE(review): Grafana appears to apply the admin password only when
      # it first initialises its database - confirm before relying on this
      # to rotate an existing password.
      GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"  # CHANGE IN PRODUCTION
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_INSTALL_PLUGINS: "grafana-piechart-panel"

    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro

    networks:
      - stemedb-network

    # Started after the prometheus service, which is its default datasource.
    depends_on:
      - prometheus

  # ┌─────────────────────────────────────────────────────┐
  # │  Backup Container (Daily Automated Backups)         │
  # └─────────────────────────────────────────────────────┘

  backup:
    # Periodic snapshot of the StemeDB WAL and DB volumes into ./backups.
    # Both data volumes are mounted read-only; each run writes a new
    # timestamped directory and prunes all but the 7 newest.
    # NOTE(review): files are copied while stemedb may still be writing -
    # confirm that such a copy is restorable.
    image: alpine:latest
    container_name: stemedb-backup
    restart: unless-stopped

    # Exec-form command: the script is a single list element, so Compose
    # does no shell-word splitting and a literal block scalar keeps every
    # newline. Each `$` meant for the container shell is written `$$`,
    # because Compose applies its own variable interpolation to a single `$`.
    command:
      - sh
      - "-c"
      - |
        apk add --no-cache rsync &&
        while true; do
          # Double quotes so that the date and the path are expanded.
          echo "[$$(date)] Starting backup..."
          BACKUP_DIR="/backups/stemedb-backup-$$(date +%Y%m%d-%H%M%S)"
          mkdir -p "$$BACKUP_DIR"
          rsync -av --delete /data/wal/ "$$BACKUP_DIR/wal/"
          rsync -av --delete /data/db/ "$$BACKUP_DIR/db/"
          printf '{"timestamp": "%s", "version": "0.1.0"}\n' "$$(date -Iseconds)" > "$$BACKUP_DIR/metadata.json"
          echo "[$$(date)] Backup complete: $$BACKUP_DIR"

          # Cleanup old backups (keep last 7)
          ls -dt /backups/stemedb-backup-* | tail -n +8 | xargs rm -rf

          # Sleep 24 hours before the next run. The interval is measured
          # from the end of this run; it is not pinned to a clock time.
          sleep 86400
        done

    volumes:
      - stemedb-wal:/data/wal:ro
      - stemedb-db:/data/db:ro
      - ./backups:/backups

    networks:
      - stemedb-network

    depends_on:
      - stemedb

# ┌───────────────────────────────────────────────────────────┐
# │  Volumes (Persistent Storage)                             │
# └───────────────────────────────────────────────────────────┘

volumes:
  # Grafana state, mounted at /var/lib/grafana in the grafana service.
  grafana-data:
    driver: local
  # Prometheus storage, mounted at /prometheus (--storage.tsdb.path).
  prometheus-data:
    driver: local
  # StemeDB database directory (STEMEDB_DB_DIR), mounted at /data/db;
  # also mounted read-only into the backup service.
  stemedb-db:
    driver: local
  # StemeDB WAL directory (STEMEDB_WAL_DIR), mounted at /data/wal;
  # also mounted read-only into the backup service.
  stemedb-wal:
    driver: local

# ┌───────────────────────────────────────────────────────────┐
# │  Networks                                                 │
# └───────────────────────────────────────────────────────────┘

networks:
  # Single bridge network joined by all four services; the other configs in
  # this file address services by name on it (e.g. stemedb:18180,
  # prometheus:9090).
  stemedb-network:
    driver: bridge

---
# prometheus.yml (save as ./prometheus.yml)

# Defaults applied to every scrape job below.
global:
  evaluation_interval: 15s
  scrape_interval: 15s

scrape_configs:
  # StemeDB API: metrics are served on the API port of the stemedb service.
  - job_name: stemedb
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'stemedb:18180'

  # Prometheus scraping its own endpoint.
  - job_name: prometheus
    static_configs:
      - targets:
          - 'prometheus:9090'

---
# Grafana Provisioning: Datasource (save as ./grafana/provisioning/datasources/prometheus.yml)

apiVersion: 1

# One read-only default datasource pointing at the prometheus service.
datasources:
  - name: 'Prometheus'
    type: 'prometheus'
    url: 'http://prometheus:9090'
    access: 'proxy'
    isDefault: true
    editable: false

---
# Grafana Provisioning: Dashboard (save as ./grafana/provisioning/dashboards/stemedb.yml)

apiVersion: 1

# File-based provider. `path` is the container directory that the Compose
# file bind-mounts from ./grafana/dashboards.
providers:
  - name: StemeDB
    type: file
    folder: StemeDB
    options:
      path: /var/lib/grafana/dashboards

---
# Grafana Dashboard JSON (save as ./grafana/dashboards/stemedb-overview.json)
#
# This is a simplified dashboard. For full dashboard, see:
# https://github.com/yourorg/stemedb/blob/main/grafana/dashboards/stemedb-overview.json
#
# Panels:
# 1. Query Latency (p50, p95, p99)
# 2. Ingest Rate (assertions/sec)
# 3. Disk Usage (WAL + DB)
# 4. Error Rate (4xx, 5xx)
# 5. Quarantine Queue Size
# 6. Circuit Breaker States

---
# Usage Instructions:
#
# 1. Create directory structure:
#    mkdir -p ./grafana/provisioning/datasources
#    mkdir -p ./grafana/provisioning/dashboards
#    mkdir -p ./grafana/dashboards
#    mkdir -p ./backups
#
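# 2. Save the prometheus.yml document above as ./prometheus.yml
#    (next to the Compose file)
#
# 3. Save the Grafana files:
#    - datasource provisioning -> ./grafana/provisioning/datasources/prometheus.yml
#    - dashboard provisioning  -> ./grafana/provisioning/dashboards/stemedb.yml
#    - dashboard JSON          -> ./grafana/dashboards/stemedb-overview.json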
#
# 4. Start stack:
#    docker-compose -f pilot-with-monitoring.yml up -d
#
# 5. Verify health:
#    curl http://localhost:18180/v1/health
#    open http://localhost:3000  # Grafana (admin/admin)
#
# 6. View metrics:
#    open http://localhost:9090  # Prometheus
#
# 7. Check backups:
#    ls -lh ./backups/
#
# 8. Stop stack:
#    docker-compose -f pilot-with-monitoring.yml down
#
# 9. Clean volumes (⚠️ DELETES ALL DATA):
#    docker-compose -f pilot-with-monitoring.yml down -v

---
# Production Hardening Checklist:
#
# - [ ] Change Grafana admin password
# - [ ] Add TLS reverse proxy (see nginx config)
# - [ ] Set resource limits based on load testing
# - [ ] Configure external backup storage (S3, NFS)
# - [ ] Set up alerting (Prometheus Alertmanager)
# - [ ] Enable log aggregation (ELK, Loki)
# - [ ] Restrict network access (firewall rules)
# - [ ] Use secrets management (Docker secrets, Vault)
# - [ ] Enable monitoring for backup container
# - [ ] Test restore procedure monthly