This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments: ## API Layer - Add rate limiting middleware with configurable limits per endpoint - Enhance error handling with detailed context and proper HTTP status codes - Add security hardening tests for input validation and boundary conditions - Create store_helpers module for defensive storage access patterns ## Storage & WAL - Optimize group commit batching for higher throughput - Add defensive error handling in hybrid backend with proper fallbacks - Enhance WAL journal durability guarantees with fsync validation - Improve index store query performance with better caching ## Operations & Deployment - Add comprehensive operations documentation (deployment, monitoring, DR) - Create systemd units for backup, WAL archival, and verification - Add monitoring configs (Prometheus alerts, metrics exporters) - Implement backup/restore scripts with verification and S3 archival - Add DR drill automation and runbook procedures - Create load balancer configs (nginx, envoy) with health checks ## Documentation - Update CLAUDE.md with operations and troubleshooting guides - Expand roadmap with production readiness milestones - Add pilot success criteria and deployment reference architecture - Document TLS setup, monitoring integration, and incident response ## Configuration - Add .env.example with all required environment variables - Document resource sizing for different deployment scales - Add configuration examples for various deployment topologies This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
107 lines
5.0 KiB
YAML
107 lines
5.0 KiB
YAML
# Prometheus rule group holding the critical StemeDB alerts.
groups:
  - name: stemedb_critical
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
|
|
# Fires when the `up` series for job "stemedb" has been 0 for one minute.
- alert: StemeDBAPIDown
  expr: 'up{job="stemedb"} == 0'
  for: 1m
  labels:
    severity: critical
    component: api
  annotations:
    summary: 'StemeDB API is down'
    description: >-
      The StemeDB API at {{ $labels.instance }} has been unreachable
      for 1 minute.
    runbook: 'https://docs.stemedb.com/operations/runbooks/server-wont-start.md'
    dashboard: 'https://grafana.example.com/d/sli-dashboard'
|
|
|
|
# Fires when WAL disk usage has stayed above 90% of capacity for 5 minutes.
- alert: WALDiskNearlyFull
  # Ratio of used bytes to capacity; $value in the annotations is this ratio.
  expr: (stemedb_wal_disk_usage_bytes / stemedb_wal_disk_capacity_bytes) > 0.90
  for: 5m
  labels:
    severity: critical
    component: wal
  annotations:
    summary: "WAL disk usage >90%"
    # No time-to-full estimate is given: the expression measures only the
    # current usage ratio, not the rate at which the disk is filling.
    description: "WAL disk usage is at {{ $value | humanizePercentage }} on {{ $labels.instance }}."
    runbook: "https://docs.stemedb.com/operations/runbooks/disk-full.md"
    impact: "Write operations will fail when disk reaches 100%. Service will become read-only."
    action: "Increase disk size immediately or run cleanup to free space."
|
|
|
|
# Fires when stemedb_sync_lag_seconds has exceeded 300 for 5 minutes.
- alert: ReplicationLagCritical
  expr: 'stemedb_sync_lag_seconds > 300'
  for: 5m
  labels:
    severity: critical
    component: sync
  annotations:
    summary: 'Replication lag >5 minutes'
    description: >-
      Peer {{ $labels.peer_id }} is lagging by
      {{ $value | humanizeDuration }} on {{ $labels.instance }}.
    runbook: 'https://docs.stemedb.com/operations/runbooks/high-replication-lag.md'
    impact: 'Data inconsistency across cluster. Queries may return stale data.'
    action: >-
      Check network connectivity, peer health, and disk I/O on
      lagging node.
|
|
|
|
# Fires when the 5-minute rate of storage-layer errors has stayed above
# 1 per second for 2 minutes.
- alert: HighStorageErrorRate
  expr: 'rate(stemedb_errors_total{layer="storage"}[5m]) > 1.0'
  for: 2m
  labels:
    severity: critical
    component: storage
  annotations:
    summary: 'High storage error rate (>1/sec)'
    description: >-
      Storage layer is experiencing {{ $value | humanize }} errors/sec
      on {{ $labels.instance }}.
    runbook: 'https://docs.stemedb.com/operations/runbooks/storage-errors.md'
    impact: 'Write and read operations failing. Data durability at risk.'
    action: >-
      Check disk health, filesystem errors, and storage backend logs
      immediately.
|
|
|
|
# Fires when the 5-minute rate of WAL write errors labelled
# error="fsync_failed" has been non-zero for 1 minute.
- alert: WALFsyncFailure
  expr: 'rate(stemedb_wal_write_errors_total{error="fsync_failed"}[5m]) > 0'
  for: 1m
  labels:
    severity: critical
    component: wal
  annotations:
    summary: 'WAL fsync failures detected'
    description: >-
      WAL fsync is failing on {{ $labels.instance }}. This indicates
      disk I/O errors.
    runbook: 'https://docs.stemedb.com/operations/runbooks/wal-fsync-failure.md'
    impact: 'Data durability compromised. Recent writes may be lost on crash.'
    action: >-
      Check disk health with `iostat -x 1` and dmesg for I/O errors.
      Consider failing over to healthy node.
|
|
|
|
# Fires when the number of live nodes has not been a strict majority of the
# cluster for 2 minutes.
- alert: ClusterSplitBrain
  # `<=` rather than `<`: with an even node count, exactly half of the nodes
  # alive (e.g. 2 of 4) is not a majority, so quorum is already lost.
  expr: stemedb_cluster_nodes_alive <= (stemedb_cluster_total_nodes / 2)
  for: 2m
  labels:
    severity: critical
    component: cluster
  annotations:
    summary: "Cluster has lost quorum"
    # $value is the left-hand side (live node count). The total is not a label
    # on the result, so it is looked up with a template query instead of
    # $labels.total_nodes, which would render as an empty string.
    # NOTE(review): assumes stemedb_cluster_total_nodes carries the same
    # `instance` label as stemedb_cluster_nodes_alive - confirm.
    description: >-
      Only {{ $value }} of
      {{ with printf "stemedb_cluster_total_nodes{instance='%s'}" $labels.instance | query }}{{ . | first | value }}{{ else }}an unknown number of{{ end }}
      nodes are alive. Cluster has lost quorum.
    runbook: "https://docs.stemedb.com/operations/runbooks/split-brain.md"
    impact: "Write operations may be rejected. Risk of split-brain scenario."
    action: "Investigate network partition. Do NOT restart nodes until partition is resolved."
|
|
|
|
# Fires when the StemeDB process resident memory has stayed above 90% of the
# host's total memory for 5 minutes.
- alert: MemoryExhaustion
  # The left side is pinned to job="stemedb"; node_memory_MemTotal_bytes is a
  # host-level metric that normally arrives under a different job. Default
  # one-to-one matching compares every label, so the two sides would never
  # pair up and the alert could not fire. Match on `instance` only, and use
  # group_left so the result keeps the process series' labels for templating.
  # NOTE(review): assumes both targets are relabelled to share the same
  # `instance` value (e.g. hostname without port) - confirm scrape config.
  expr: >-
    process_resident_memory_bytes{job="stemedb"}
    > on (instance) group_left
    (0.90 * node_memory_MemTotal_bytes)
  for: 5m
  labels:
    severity: critical
    component: process
  annotations:
    summary: "StemeDB using >90% of system memory"
    # $value is the left-hand side: resident memory in bytes.
    description: "Memory usage is {{ $value | humanize1024 }}B on {{ $labels.instance }}. OOM killer may terminate process."
    runbook: "https://docs.stemedb.com/operations/runbooks/memory-exhaustion.md"
    impact: "Process may be killed by OS, causing downtime."
    action: "Increase memory or reduce load. Check for memory leaks in logs."
|
|
|
|
# Fires when stemedb_tls_certificate_expiry_seconds minus the current time
# has been below 7 days (in seconds) for 1 hour.
- alert: CertificateExpiringSoon
  expr: '(stemedb_tls_certificate_expiry_seconds - time()) < (7 * 24 * 60 * 60)'
  for: 1h
  labels:
    severity: critical
    component: tls
  annotations:
    summary: 'TLS certificate expires in <7 days'
    description: >-
      Certificate for {{ $labels.instance }} expires in
      {{ $value | humanizeDuration }}.
    runbook: 'https://docs.stemedb.com/operations/runbooks/certificate-renewal.md'
    impact: 'API will become inaccessible when certificate expires.'
    action: >-
      Renew certificate immediately. Update cert-manager or manual
      cert files.
|