---
# Critical alerting rules for StemeDB.
#
# The group is evaluated every 30s. Every rule carries `severity: critical`
# and a `component` label; annotations give a summary, a description and a
# runbook link, and most rules also state the impact and the action to take.
groups:
  - name: stemedb_critical
    interval: 30s
    rules:
      # The `up` series for job "stemedb" has been 0 for 1 minute.
      - alert: StemeDBAPIDown
        expr: up{job="stemedb"} == 0
        for: 1m
        labels:
          severity: critical
          component: api
        annotations:
          summary: "StemeDB API is down"
          description: "The StemeDB API at {{ $labels.instance }} has been unreachable for 1 minute."
          runbook: "https://docs.stemedb.com/operations/runbooks/server-wont-start.md"
          dashboard: "https://grafana.example.com/d/sli-dashboard"

      # WAL disk usage / capacity has stayed above 0.90 for 5 minutes.
      # NOTE(review): the description's "<30 minutes" estimate is not computed
      # by this expression (it is a plain threshold) - confirm or reword.
      - alert: WALDiskNearlyFull
        expr: (stemedb_wal_disk_usage_bytes / stemedb_wal_disk_capacity_bytes) > 0.90
        for: 5m
        labels:
          severity: critical
          component: wal
        annotations:
          summary: "WAL disk usage >90%"
          description: >-
            WAL disk usage is at {{ $value | humanizePercentage }} on
            {{ $labels.instance }}. Disk will fill in <30 minutes at current rate.
          runbook: "https://docs.stemedb.com/operations/runbooks/disk-full.md"
          impact: "Write operations will fail when disk reaches 100%. Service will become read-only."
          action: "Increase disk size immediately or run cleanup to free space."

      # Sync lag has stayed above 300 seconds for 5 minutes.
      # The description reads a `peer_id` label; presumably it is present on
      # stemedb_sync_lag_seconds - verify against the exporter.
      - alert: ReplicationLagCritical
        expr: stemedb_sync_lag_seconds > 300
        for: 5m
        labels:
          severity: critical
          component: sync
        annotations:
          summary: "Replication lag >5 minutes"
          description: >-
            Peer {{ $labels.peer_id }} is lagging by {{ $value | humanizeDuration }}
            on {{ $labels.instance }}.
          runbook: "https://docs.stemedb.com/operations/runbooks/high-replication-lag.md"
          impact: "Data inconsistency across cluster. Queries may return stale data."
          action: "Check network connectivity, peer health, and disk I/O on lagging node."

      # Per-series 5m rate of storage-layer errors has exceeded 1/sec
      # for 2 minutes.
      - alert: HighStorageErrorRate
        expr: rate(stemedb_errors_total{layer="storage"}[5m]) > 1.0
        for: 2m
        labels:
          severity: critical
          component: storage
        annotations:
          summary: "High storage error rate (>1/sec)"
          description: "Storage layer is experiencing {{ $value | humanize }} errors/sec on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/storage-errors.md"
          impact: "Write and read operations failing. Data durability at risk."
          action: "Check disk health, filesystem errors, and storage backend logs immediately."

      # Any non-zero 5m rate of WAL write errors labelled
      # error="fsync_failed", sustained for 1 minute.
      - alert: WALFsyncFailure
        expr: rate(stemedb_wal_write_errors_total{error="fsync_failed"}[5m]) > 0
        for: 1m
        labels:
          severity: critical
          component: wal
        annotations:
          summary: "WAL fsync failures detected"
          description: "WAL fsync is failing on {{ $labels.instance }}. This indicates disk I/O errors."
          runbook: "https://docs.stemedb.com/operations/runbooks/wal-fsync-failure.md"
          impact: "Data durability compromised. Recent writes may be lost on crash."
          action: >-
            Check disk health with `iostat -x 1` and dmesg for I/O errors.
            Consider failing over to healthy node.

      # Live nodes are no longer a strict majority of the cluster.
      # `<=` (not `<`) so that an exact half of an even-sized cluster, e.g.
      # 2 of 4 alive, is reported: half the nodes is not a majority.
      # The total shown in the description is looked up with a template query
      # because it is a separate metric, not a label on the alerting series.
      - alert: ClusterSplitBrain
        expr: stemedb_cluster_nodes_alive <= (stemedb_cluster_total_nodes / 2)
        for: 2m
        labels:
          severity: critical
          component: cluster
        annotations:
          summary: "Cluster has lost quorum"
          description: >-
            Only {{ $value }} of
            {{ with printf "stemedb_cluster_total_nodes{instance='%s'}" $labels.instance
            | query }}{{ . | first | value }}{{ else }}unknown{{ end }}
            nodes are alive. Cluster has lost quorum.
          runbook: "https://docs.stemedb.com/operations/runbooks/split-brain.md"
          impact: "Write operations may be rejected. Risk of split-brain scenario."
          action: "Investigate network partition. Do NOT restart nodes until partition is resolved."

      # Resident memory of the stemedb process has exceeded 90% of
      # node_memory_MemTotal_bytes for 5 minutes.
      # The two metrics are joined on `instance` only: with default matching
      # every label would have to be identical, which a series selected by
      # job="stemedb" and a series from another job can never satisfy.
      # `group_left ()` keeps the left-hand (process) labels on the alert.
      # NOTE(review): assumes both series carry the same `instance` value
      # (e.g. host without port, via relabeling) - TODO confirm.
      - alert: MemoryExhaustion
        expr: >-
          process_resident_memory_bytes{job="stemedb"}
          > on (instance) group_left ()
          (0.90 * node_memory_MemTotal_bytes)
        for: 5m
        labels:
          severity: critical
          component: process
        annotations:
          summary: "StemeDB using >90% of system memory"
          description: >-
            Memory usage is {{ $value | humanize1024 }}B on {{ $labels.instance }}.
            OOM killer may terminate process.
          runbook: "https://docs.stemedb.com/operations/runbooks/memory-exhaustion.md"
          impact: "Process may be killed by OS, causing downtime."
          action: "Increase memory or reduce load. Check for memory leaks in logs."

      # Expiry metric minus time() has been below 7 days' worth of seconds
      # for 1 hour; $value is therefore the remaining time in seconds.
      - alert: CertificateExpiringSoon
        expr: (stemedb_tls_certificate_expiry_seconds - time()) < (7 * 24 * 60 * 60)
        for: 1h
        labels:
          severity: critical
          component: tls
        annotations:
          summary: "TLS certificate expires in <7 days"
          description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/certificate-renewal.md"
          impact: "API will become inaccessible when certificate expires."
          action: "Renew certificate immediately. Update cert-manager or manual cert files."