---
# Prometheus alerting rules for tidalDB.
#
# One rule group, evaluated every 30s. Each rule compares a single tidaldb_*
# metric (or a rate/quantile derived from it) against a fixed threshold.
# Annotation text is rendered by Prometheus' template engine, so `{{ $value }}`
# is the sample value of the alert expression at evaluation time.
groups:
  - name: tidaldb
    interval: 30s
    rules:
      # Health flag reported by the database itself.
      # NOTE: `== 0` returns no samples when the series is missing (e.g. the
      # scrape target is gone), so this rule cannot fire in that case. Pair it
      # with an `up == 0` / `absent()` alert if that scenario must page.
      - alert: TidalDBDown
        expr: tidaldb_health_ok == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "tidalDB is unhealthy"
          description: "tidaldb_health_ok is 0 — database is unhealthy or shut down."

      # Last checkpoint is older than 300s for at least 2m.
      - alert: TidalDBCheckpointStale
        expr: tidaldb_checkpoint_age_seconds > 300
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Signal checkpoint not running"
          description: "{{ $value }}s since last checkpoint (threshold: 300s). Signal durability at risk."

      # Any checkpoint failure within the last 5m. There is no `for:` clause,
      # so the alert fires on the first evaluation that sees an increase.
      - alert: TidalDBCheckpointFailures
        expr: increase(tidaldb_checkpoint_failures_total[5m]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Signal checkpoint failures detected"
          description: "Checkpoint failures in last 5m. Check disk space and storage errors."

      # Threshold is 10^9 bytes (decimal GB). `humanize1024` renders with
      # 1024-based prefixes, so a value at the threshold shows as ~953.7Mi.
      - alert: TidalDBWALDiskPressure
        expr: tidaldb_wal_lag_bytes > 1000000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "WAL disk usage exceeds 1GB"
          description: "{{ $value | humanize1024 }}B of WAL uncompacted. Compaction may be stuck."

      # 4,000,000 hot entries, i.e. 80% of the 5M budget named in the text.
      - alert: TidalDBSignalBacklog
        expr: tidaldb_signal_hot_entries > 4000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Signal ledger over 80% of capacity"
          description: "{{ $value }} hot entries (threshold: 4M / 80% of 5M budget)."

      # Any non-zero degradation level sustained for 2m.
      - alert: TidalDBDegradedRanking
        expr: tidaldb_degradation_level > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Ranking quality degraded"
          description: "Degradation level {{ $value }} active. Scale up or reduce load."

      # More than 100 active sessions AND growing by more than 10 sessions/s.
      # - `deriv()` is used instead of `rate()`: the session count is a gauge,
      #   and `rate()` would misread every decrease as a counter reset.
      # - The session count is the LEFT operand of `and` because the result of
      #   `and` carries the left-hand value, which is what the description
      #   prints as "{{ $value }} active sessions".
      - alert: TidalDBSessionLeak
        expr: >-
          tidaldb_active_sessions > 100
          and deriv(tidaldb_active_sessions[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Active session count growing rapidly"
          description: "{{ $value }} active sessions and growing. Agents may not be closing sessions."

      # More than 100 rate-limited requests per second, averaged over 5m.
      - alert: TidalDBHighRateLimiting
        expr: rate(tidaldb_rate_limited_total[5m]) > 100
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "Sustained rate limiting"
          description: "{{ $value }}/s rate-limited writes. Review agent rate limit config."

      # More than 30 Tantivy segments for 10m.
      - alert: TidalDBTantivySegmentBloat
        expr: tidaldb_tantivy_segment_count > 30
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Tantivy segment count elevated"
          description: "{{ $value }} segments (threshold: 30). Text syncer may be stalled."

      # p95 retrieve latency above 500ms. The histogram is recorded in
      # microseconds; dividing by 1e6 yields seconds, which is the unit
      # `humanizeDuration` expects in the description template.
      - alert: TidalDBSlowRetrieve
        expr: >-
          histogram_quantile(0.95, rate(tidaldb_retrieve_latency_us_bucket[5m]))
          / 1e6 > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Retrieve p95 latency exceeds 500ms"
          description: "p95 retrieve latency is {{ $value | humanizeDuration }}. Check signal ledger load and degradation level."

      # p95 search latency above 1s. Same microseconds-to-seconds conversion
      # as TidalDBSlowRetrieve so `humanizeDuration` renders correctly.
      - alert: TidalDBSlowSearch
        expr: >-
          histogram_quantile(0.95, rate(tidaldb_search_latency_us_bucket[5m]))
          / 1e6 > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Search p95 latency exceeds 1s"
          description: "p95 search latency is {{ $value | humanizeDuration }}. Check Tantivy segment count and ANN index health."