---
# Prometheus alerting rules: warning-severity alerts for stemedb.
# Every rule carries `severity: warning` plus a `component` label, and the
# same five annotations: summary, description, runbook, impact, action.
groups:
  - name: stemedb_warning
    # Rules in this group are evaluated once per minute.
    interval: 1m
    rules:
      # p99 WAL fsync latency (5m rate window) above 100ms for 5 minutes.
      - alert: WALFsyncSlow
        expr: histogram_quantile(0.99, rate(stemedb_wal_fsync_latency_seconds_bucket[5m])) > 0.100
        for: 5m
        labels:
          severity: warning
          component: wal
        annotations:
          summary: "WAL fsync p99 latency >100ms"
          description: "WAL fsync p99 latency is {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/slow-fsync.md"
          impact: "Write operations slowing down. May impact ingestion throughput."
          action: "Check disk I/O with `iostat -x 1`. Consider adding more IOPS or using faster storage."

      # Share of failed requests above 1% for 5 minutes.
      # The expression is a ratio (errors/s divided by requests/s) so that the
      # 0.01 threshold matches the ">1%" summary and `humanizePercentage`
      # renders a real percentage. Both sides are aggregated to
      # (job, instance) so the division matches one-to-one and
      # `$labels.instance` stays available to the description.
      # The denominator is the `_count` series of the request-duration
      # histogram that HighAPILatency already queries.
      # NOTE(review): assumes stemedb_errors_total counts failed API
      # requests - confirm against the exporter before rollout.
      - alert: HighAPIErrorRate
        expr: >-
          sum by (job, instance) (rate(stemedb_errors_total[5m]))
          /
          sum by (job, instance) (rate(stemedb_http_request_duration_seconds_count[5m]))
          > 0.01
        for: 5m
        labels:
          severity: warning
          component: api
        annotations:
          summary: "API error rate >1%"
          description: "API error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/high-error-rate.md"
          impact: "Client requests failing. User experience degraded."
          action: "Check logs for error details. Verify input validation and external dependencies."

      # p95 index lookup latency (5m rate window) above 50ms for 10 minutes.
      # The description reads the `index` label, so the expression must not
      # aggregate that label away.
      - alert: IndexLookupSlow
        expr: histogram_quantile(0.95, rate(stemedb_index_lookup_duration_seconds_bucket[5m])) > 0.050
        for: 10m
        labels:
          severity: warning
          component: storage
        annotations:
          summary: "Index lookup p95 latency >50ms"
          description: "Index {{ $labels.index }} p95 latency is {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/slow-index-lookup.md"
          impact: "Query performance degraded. API response times increasing."
          action: "Check if indexes need compaction. Verify storage backend health."

      # WAL disk usage above 70% of capacity for 10 minutes.
      # NOTE(review): this is a static threshold; the "will fill in next few
      # hours" impact text is not derived from the expression.
      - alert: WALDiskUsageHigh
        expr: (stemedb_wal_disk_usage_bytes / stemedb_wal_disk_capacity_bytes) > 0.70
        for: 10m
        labels:
          severity: warning
          component: wal
        annotations:
          summary: "WAL disk usage >70%"
          description: "WAL disk usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/disk-full.md"
          impact: "Disk will fill in next few hours at current rate."
          action: "Run cleanup to remove old WAL segments or increase disk size."

      # Sync lag above 60 seconds for 10 minutes; the description reads the
      # `peer_id` label.
      - alert: ReplicationLagWarning
        expr: stemedb_sync_lag_seconds > 60
        for: 10m
        labels:
          severity: warning
          component: sync
        annotations:
          summary: "Replication lag >1 minute"
          description: "Peer {{ $labels.peer_id }} is lagging by {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/high-replication-lag.md"
          impact: "Data freshness degraded. Queries may return slightly stale data."
          action: "Monitor for escalation. Check network latency and peer load."

      # p99 HTTP request latency (5m rate window) above 500ms for 5 minutes.
      - alert: HighAPILatency
        expr: histogram_quantile(0.99, rate(stemedb_http_request_duration_seconds_bucket[5m])) > 0.500
        for: 5m
        labels:
          severity: warning
          component: api
        annotations:
          summary: "API p99 latency >500ms"
          description: "API p99 latency is {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/high-latency.md"
          impact: "User experience degraded. SLO at risk (target: p99 <500ms)."
          action: "Check slow query logs. Investigate storage and index performance."

      # Pending compaction size above 10 * 1024^3 bytes for 1 hour.
      # `humanize1024` emits a binary-prefixed number (e.g. "10.5Gi"); the
      # trailing "B" in the description completes the unit.
      - alert: StorageCompactionPending
        expr: stemedb_storage_compaction_pending_size_bytes > (10 * 1024 * 1024 * 1024)
        for: 1h
        labels:
          severity: warning
          component: storage
        annotations:
          summary: "Compaction backlog >10GB"
          description: "Storage compaction backlog is {{ $value | humanize1024 }}B on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/compaction-backlog.md"
          impact: "Read amplification increasing. Query performance degrading."
          action: "Trigger manual compaction or reduce write load temporarily."

      # Half-open circuit breaker gauge above zero for 15 minutes; the
      # description reads the `agent_id` label.
      - alert: CircuitBreakerHalfOpen
        expr: stemedb_circuit_breakers_half_open > 0
        for: 15m
        labels:
          severity: warning
          component: protection
        annotations:
          summary: "Circuit breaker stuck in half-open state"
          description: "Circuit breaker for {{ $labels.agent_id }} has been half-open for 15 minutes on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/circuit-breaker.md"
          impact: "Agent requests partially failing. Service degraded for this agent."
          action: "Investigate agent health. Reset circuit if agent recovered."

      # Seconds since the last recorded trust-rank decay above 24h, sustained
      # for 1 hour. `$value` is that age in seconds.
      - alert: TrustRankDecayOverdue
        expr: (time() - stemedb_trust_rank_last_decay_timestamp) > (24 * 60 * 60)
        for: 1h
        labels:
          severity: warning
          component: trust
        annotations:
          summary: "Trust rank decay not run in >24 hours"
          description: "Trust ranks have not been decayed in {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/trust-rank-decay.md"
          impact: "Trust scores becoming stale. May affect query ranking."
          action: "Trigger manual decay via POST /v1/admin/decay_trust_ranks"