This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments: ## API Layer - Add rate limiting middleware with configurable limits per endpoint - Enhance error handling with detailed context and proper HTTP status codes - Add security hardening tests for input validation and boundary conditions - Create store_helpers module for defensive storage access patterns ## Storage & WAL - Optimize group commit batching for higher throughput - Add defensive error handling in hybrid backend with proper fallbacks - Enhance WAL journal durability guarantees with fsync validation - Improve index store query performance with better caching ## Operations & Deployment - Add comprehensive operations documentation (deployment, monitoring, DR) - Create systemd units for backup, WAL archival, and verification - Add monitoring configs (Prometheus alerts, metrics exporters) - Implement backup/restore scripts with verification and S3 archival - Add DR drill automation and runbook procedures - Create load balancer configs (nginx, envoy) with health checks ## Documentation - Update CLAUDE.md with operations and troubleshooting guides - Expand roadmap with production readiness milestones - Add pilot success criteria and deployment reference architecture - Document TLS setup, monitoring integration, and incident response ## Configuration - Add .env.example with all required environment variables - Document resource sizing for different deployment scales - Add configuration examples for various deployment topologies This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
120 lines
5.3 KiB
YAML
120 lines
5.3 KiB
YAML
groups:
  # Rule group holding the informational alerts for StemeDB.
  - name: stemedb_info
    # How often the rules in this group are evaluated.
    interval: 5m
    rules:
# ---- CircuitBreakerOpen ----
      # Fires once stemedb_circuit_breakers_open has stayed above zero for
      # 10 minutes. Labelled severity "info", component "protection".
      # NOTE(review): the description reads an `agent_id` label; confirm the
      # metric actually exports that label.
      - alert: CircuitBreakerOpen
        expr: 'stemedb_circuit_breakers_open > 0'
        for: 10m
        labels:
          severity: info
          component: protection
        annotations:
          summary: 'Circuit breaker tripped for agent'
          description: >-
            Circuit breaker for {{ $labels.agent_id }} is open on
            {{ $labels.instance }}.
          runbook: 'https://docs.stemedb.com/operations/runbooks/circuit-breaker.md'
          impact: 'Requests from this agent are being rejected. No impact on other agents.'
          action: 'Monitor agent behavior. Circuit will auto-reset if agent recovers.'
# ---- QuarantineBacklogGrowing ----
      # Fires when stemedb_quarantine_entries_total has grown faster than
      # 10 entries per minute (averaged over a 10-minute window) for 30 minutes.
      #
      # rate() returns a per-SECOND value. The summary and description both
      # speak in entries per minute, so the rate is multiplied by 60; this makes
      # the threshold and the reported {{ $value }} use the advertised unit.
      # Without the scaling the alert would only fire above 600 entries/min.
      # NOTE(review): a *_total counter measures inflow, not queue depth;
      # confirm that inflow is the intended proxy for "backlog".
      - alert: QuarantineBacklogGrowing
        expr: rate(stemedb_quarantine_entries_total[10m]) * 60 > 10
        for: 30m
        labels:
          severity: info
          component: quarantine
        annotations:
          summary: 'Quarantine backlog growing (>10/min)'
          description: >-
            Quarantine entries increasing at {{ $value }}/min on
            {{ $labels.instance }}.
          runbook: 'https://docs.stemedb.com/operations/runbooks/quarantine-backlog.md'
          impact: 'Manual review queue growing. May delay assertion approval.'
          action: 'Review quarantine entries via GET /v1/admin/quarantine'
# ---- NewNodeJoined ----
      # Fires when stemedb_cluster_nodes_alive is higher now than it was
      # 5 minutes ago, i.e. the alive-node count went UP.
      #
      # The previous form, changes(...[5m]) > 0, fired on any change of the
      # gauge, including a node leaving, which contradicts the alert name and
      # its "New node joined" summary. Comparing against the value 5 minutes
      # earlier restricts the alert to increases. There is no `for:` clause, so
      # the alert fires on the first evaluation where the expression is true.
      - alert: NewNodeJoined
        expr: stemedb_cluster_nodes_alive > (stemedb_cluster_nodes_alive offset 5m)
        labels:
          severity: info
          component: cluster
        annotations:
          summary: 'New node joined cluster'
          description: >-
            Node count changed on {{ $labels.instance }}. New node may have
            joined.
          runbook: 'https://docs.stemedb.com/operations/runbooks/node-join.md'
          impact: 'None. Informational alert for cluster topology changes.'
          action: 'Verify expected scaling operation. Monitor replication to new node.'
# ---- HighMemoryUsage ----
      # Fires when the StemeDB process RSS has exceeded 70% of the machine's
      # total memory for 30 minutes.
      #
      # A plain `a > b` between two vectors only pairs series whose label sets
      # are identical. The left side is selected with job="stemedb"; if
      # node_memory_MemTotal_bytes is scraped from a different target (different
      # `job` and/or a different port in `instance`), nothing pairs up and the
      # alert can never fire. Both sides are therefore joined on a derived
      # `host` label (the `instance` value with any ":port" suffix removed).
      #   - group_left keeps the left-hand labels (instance, job, ...) so the
      #     description template still resolves {{ $labels.instance }}.
      #   - max by (host) guarantees a single right-hand series per host.
      # NOTE(review): assumes `instance` has the form host or host:port on both
      # targets; IPv6 literals or relabelled instances need a different join key.
      - alert: HighMemoryUsage
        expr: |-
          label_replace(
            process_resident_memory_bytes{job="stemedb"},
            "host", "$1", "instance", "([^:]+)(?::\\d+)?"
          )
          > on (host) group_left ()
          (
            0.70 * max by (host) (
              label_replace(
                node_memory_MemTotal_bytes,
                "host", "$1", "instance", "([^:]+)(?::\\d+)?"
              )
            )
          )
        for: 30m
        labels:
          severity: info
          component: process
        annotations:
          summary: 'Memory usage >70%'
          description: >-
            Memory usage is {{ $value | humanize1024 }}B (70% of system memory)
            on {{ $labels.instance }}.
          runbook: 'https://docs.stemedb.com/operations/runbooks/memory-usage.md'
          impact: 'None yet, but approaching critical threshold.'
          action: 'Monitor memory trend. Plan capacity increase if usage continues rising.'
# ---- APIKeyRotationDue ----
      # Fires when the current time minus stemedb_api_key_created_timestamp
      # has exceeded 90 * 24 * 60 * 60 seconds (90 days) for one day.
      # The alert value is that age in seconds, which the description renders
      # with humanizeDuration.
      - alert: APIKeyRotationDue
        expr: '(time() - stemedb_api_key_created_timestamp) > (90 * 24 * 60 * 60)'
        for: 1d
        labels:
          severity: info
          component: security
        annotations:
          summary: 'API key older than 90 days'
          description: >-
            API key {{ $labels.key_id }} was created
            {{ $value | humanizeDuration }} ago on {{ $labels.instance }}.
          runbook: 'https://docs.stemedb.com/operations/runbooks/api-key-rotation.md'
          impact: 'None. Reminder to follow key rotation policy.'
          action: 'Rotate API key via POST /v1/admin/api_keys/rotate'
# ---- GoldStandardCountLow ----
      # Fires when stemedb_gold_standard_count has been below 3 for one hour.
      # Labelled severity "info", component "trust".
      - alert: GoldStandardCountLow
        expr: 'stemedb_gold_standard_count < 3'
        for: 1h
        labels:
          severity: info
          component: trust
        annotations:
          summary: 'Gold standard count <3'
          description: >-
            Only {{ $value }} gold standards configured on
            {{ $labels.instance }}.
          runbook: 'https://docs.stemedb.com/operations/runbooks/gold-standards.md'
          impact: 'Trust calibration may be less accurate with fewer gold standards.'
          action: 'Consider adding more gold standard entries for better trust ranking.'
# ---- CertificateExpiringIn30Days ----
      # Fires when stemedb_tls_certificate_expiry_seconds minus the current
      # time has been below 30 * 24 * 60 * 60 seconds (30 days) for one day.
      # NOTE(review): subtracting time() only makes sense if the metric is an
      # absolute Unix expiry timestamp; confirm it is not "seconds remaining".
      - alert: CertificateExpiringIn30Days
        expr: '(stemedb_tls_certificate_expiry_seconds - time()) < (30 * 24 * 60 * 60)'
        for: 1d
        labels:
          severity: info
          component: tls
        annotations:
          summary: 'TLS certificate expires in <30 days'
          description: >-
            Certificate for {{ $labels.instance }} expires in
            {{ $value | humanizeDuration }}.
          runbook: 'https://docs.stemedb.com/operations/runbooks/certificate-renewal.md'
          impact: 'None yet. Advance notice for renewal.'
          action: 'Schedule certificate renewal before expiry.'
# ---- WALSegmentCountHigh ----
      # Fires when stemedb_wal_segments_count has been above 100 for one hour.
      # Labelled severity "info", component "wal".
      - alert: WALSegmentCountHigh
        expr: 'stemedb_wal_segments_count > 100'
        for: 1h
        labels:
          severity: info
          component: wal
        annotations:
          summary: 'WAL has >100 segments'
          description: >-
            WAL segment count is {{ $value }} on {{ $labels.instance }}.
          runbook: 'https://docs.stemedb.com/operations/runbooks/wal-cleanup.md'
          impact: 'None. May indicate cleanup not running or high write volume.'
          action: 'Verify cleanup cron job is running. Adjust retention if needed.'
# ---- LowQueryThroughput ----
      # Fires when the combined request rate of all /v1/query* paths on an
      # instance has been below 0.1 requests/second for one hour.
      #
      # The selector path=~"/v1/query.*" can match several series (one per
      # distinct path, plus any other labels the counter carries). Without
      # aggregation each series is compared to 0.1 on its own, so a single
      # rarely-hit series would fire the alert even when total query traffic is
      # healthy. Summing per instance makes the expression match the
      # "query rate ... on <instance>" wording of the description.
      # NOTE(review): if no matching series exist at all, the expression
      # returns nothing and the alert stays silent; add an absent() rule if a
      # complete lack of traffic must be detected.
      - alert: LowQueryThroughput
        expr: |-
          sum by (instance) (
            rate(stemedb_http_requests_total{path=~"/v1/query.*"}[5m])
          ) < 0.1
        for: 1h
        labels:
          severity: info
          component: api
        annotations:
          summary: 'Query throughput <0.1/sec for 1 hour'
          description: 'Query rate is {{ $value }}/sec on {{ $labels.instance }}.'
          runbook: 'https://docs.stemedb.com/operations/runbooks/low-traffic.md'
          impact: 'None. May indicate low usage or upstream issue.'
          action: 'Verify expected traffic patterns. Check client connectivity.'