---
# Prometheus alerting rules: informational (severity: info) alerts for stemedb.
# Every rule carries summary / description / runbook / impact / action
# annotations.
groups:
  - name: stemedb_info
    # Rules in this group are evaluated every 5 minutes.
    interval: 5m
    rules:
      # One or more circuit breakers reported open, sustained for 10 minutes.
      - alert: CircuitBreakerOpen
        expr: stemedb_circuit_breakers_open > 0
        for: 10m
        labels:
          severity: info
          component: protection
        annotations:
          summary: "Circuit breaker tripped for agent"
          description: "Circuit breaker for {{ $labels.agent_id }} is open on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/circuit-breaker.md"
          impact: "Requests from this agent are being rejected. No impact on other agents."
          action: "Monitor agent behavior. Circuit will auto-reset if agent recovers."

      # Quarantine entries added faster than 10 per minute, sustained for
      # 30 minutes. rate() yields a per-second average, so it is scaled by 60
      # to make both the threshold and {{ $value }} per-minute, as the
      # summary and description state.
      - alert: QuarantineBacklogGrowing
        expr: rate(stemedb_quarantine_entries_total[10m]) * 60 > 10
        for: 30m
        labels:
          severity: info
          component: quarantine
        annotations:
          summary: "Quarantine backlog growing (>10/min)"
          description: "Quarantine entries increasing at {{ $value }}/min on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/quarantine-backlog.md"
          impact: "Manual review queue growing. May delay assertion approval."
          action: "Review quarantine entries via GET /v1/admin/quarantine"

      # The alive-node gauge changed value at least once in the last 5 minutes.
      # No `for:` clause, so it fires on the first evaluation that sees a change.
      # NOTE(review): changes() counts any value change, so this also fires
      # when the node count drops - confirm that is acceptable for an alert
      # named NewNodeJoined.
      - alert: NewNodeJoined
        expr: changes(stemedb_cluster_nodes_alive[5m]) > 0
        labels:
          severity: info
          component: cluster
        annotations:
          summary: "New node joined cluster"
          description: "Node count changed on {{ $labels.instance }}. New node may have joined."
          runbook: "https://docs.stemedb.com/operations/runbooks/node-join.md"
          impact: "None. Informational alert for cluster topology changes."
          action: "Verify expected scaling operation. Monitor replication to new node."

      # Resident memory of the stemedb job above 70% of total system memory,
      # sustained for 30 minutes.
      # NOTE(review): the comparison uses default one-to-one label matching.
      # If node_memory_MemTotal_bytes is scraped with a different job/instance
      # label set than the process metric, no series pair matches and this
      # alert can never fire - confirm, and add on(...)/ignoring(...) if needed.
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes{job="stemedb"} > (0.70 * node_memory_MemTotal_bytes)
        for: 30m
        labels:
          severity: info
          component: process
        annotations:
          summary: "Memory usage >70%"
          description: "Memory usage is {{ $value | humanize1024 }}B (70% of system memory) on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/memory-usage.md"
          impact: "None yet, but approaching critical threshold."
          action: "Monitor memory trend. Plan capacity increase if usage continues rising."

      # Key age (now minus creation timestamp) above 90 days, expressed in
      # seconds, sustained for 1 day.
      - alert: APIKeyRotationDue
        expr: (time() - stemedb_api_key_created_timestamp) > (90 * 24 * 60 * 60)
        for: 1d
        labels:
          severity: info
          component: security
        annotations:
          summary: "API key older than 90 days"
          description: "API key {{ $labels.key_id }} was created {{ $value | humanizeDuration }} ago on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/api-key-rotation.md"
          impact: "None. Reminder to follow key rotation policy."
          action: "Rotate API key via POST /v1/admin/api_keys/rotate"

      # Fewer than 3 gold standards reported, sustained for 1 hour.
      - alert: GoldStandardCountLow
        expr: stemedb_gold_standard_count < 3
        for: 1h
        labels:
          severity: info
          component: trust
        annotations:
          summary: "Gold standard count <3"
          description: "Only {{ $value }} gold standards configured on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/gold-standards.md"
          impact: "Trust calibration may be less accurate with fewer gold standards."
          action: "Consider adding more gold standard entries for better trust ranking."

      # Expiry metric minus the current time is below 30 days (in seconds),
      # sustained for 1 day. The subtraction of time() treats the metric as a
      # Unix timestamp.
      - alert: CertificateExpiringIn30Days
        expr: (stemedb_tls_certificate_expiry_seconds - time()) < (30 * 24 * 60 * 60)
        for: 1d
        labels:
          severity: info
          component: tls
        annotations:
          summary: "TLS certificate expires in <30 days"
          description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/certificate-renewal.md"
          impact: "None yet. Advance notice for renewal."
          action: "Schedule certificate renewal before expiry."

      # More than 100 WAL segments reported, sustained for 1 hour.
      - alert: WALSegmentCountHigh
        expr: stemedb_wal_segments_count > 100
        for: 1h
        labels:
          severity: info
          component: wal
        annotations:
          summary: "WAL has >100 segments"
          description: "WAL segment count is {{ $value }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/wal-cleanup.md"
          impact: "None. May indicate cleanup not running or high write volume."
          action: "Verify cleanup cron job is running. Adjust retention if needed."

      # Per-second request rate on paths matching /v1/query.* below 0.1,
      # sustained for 1 hour.
      # NOTE(review): the rate is evaluated per series, so every label
      # combination of stemedb_http_requests_total is compared on its own -
      # confirm whether an aggregate such as sum by (instance) is intended.
      - alert: LowQueryThroughput
        expr: rate(stemedb_http_requests_total{path=~"/v1/query.*"}[5m]) < 0.1
        for: 1h
        labels:
          severity: info
          component: api
        annotations:
          summary: "Query throughput <0.1/sec for 1 hour"
          description: "Query rate is {{ $value }}/sec on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/low-traffic.md"
          impact: "None. May indicate low usage or upstream issue."
          action: "Verify expected traffic patterns. Check client connectivity."