stemedb/docs/operations/monitoring/prometheus/alerts/warning.yml
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

121 lines
5.6 KiB
YAML

# Prometheus alerting rules for StemeDB, warning tier.
# The rules listed under this group all attach `severity: warning` plus a
# `component` label, and each carries summary/description/runbook/impact/action
# annotations.
groups:
- name: stemedb_warning
# Evaluation interval for every rule in this group.
interval: 1m
rules:
# Fires once the 99th-percentile WAL fsync latency, derived from a 5-minute
# rate over the fsync latency histogram buckets, has stayed above 0.100 for
# 5 minutes.
- alert: WALFsyncSlow
  expr: >-
    histogram_quantile(0.99,
    rate(stemedb_wal_fsync_latency_seconds_bucket[5m]))
    > 0.100
  for: 5m
  labels:
    component: wal
    severity: warning
  annotations:
    summary: 'WAL fsync p99 latency >100ms'
    description: 'WAL fsync p99 latency is {{ $value | humanizeDuration }} on {{ $labels.instance }}.'
    runbook: 'https://docs.stemedb.com/operations/runbooks/slow-fsync.md'
    impact: 'Write operations slowing down. May impact ingestion throughput.'
    action: 'Check disk I/O with `iostat -x 1`. Consider adding more IOPS or using faster storage.'
# Warns when the share of failing API requests stays above 1% for 5 minutes.
#
# Numerator: per-instance error rate. Denominator: per-instance request rate,
# read from the `_count` series of the HTTP request duration histogram (the
# histogram whose `_bucket` series the HighAPILatency rule queries). The result
# is a 0-1 ratio, which is what the ">1%" summary and the `humanizePercentage`
# filter in the description require. A bare `rate(stemedb_errors_total[5m])`
# is errors per second: it does not scale with traffic and renders a
# meaningless percentage.
#
# With no traffic the division yields no sample or NaN, neither of which passes
# the `> 0.01` filter, so idle instances do not alert.
#
# NOTE(review): assumes stemedb_errors_total counts failed HTTP requests and
# that both metrics expose an `instance` label - confirm against the exporter.
- alert: HighAPIErrorRate
  expr: >-
    sum by (instance) (rate(stemedb_errors_total[5m]))
    /
    sum by (instance) (rate(stemedb_http_request_duration_seconds_count[5m]))
    > 0.01
  for: 5m
  labels:
    severity: warning
    component: api
  annotations:
    summary: "API error rate >1%"
    description: "API error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}."
    runbook: "https://docs.stemedb.com/operations/runbooks/high-error-rate.md"
    impact: "Client requests failing. User experience degraded."
    action: "Check logs for error details. Verify input validation and external dependencies."
# Fires once the 95th-percentile index lookup duration, derived from a 5-minute
# rate over the lookup duration histogram buckets, has stayed above 0.050 for
# 10 minutes. The description reads the `index` label from the firing series.
- alert: IndexLookupSlow
  expr: >-
    histogram_quantile(0.95,
    rate(stemedb_index_lookup_duration_seconds_bucket[5m]))
    > 0.050
  for: 10m
  labels:
    component: storage
    severity: warning
  annotations:
    summary: 'Index lookup p95 latency >50ms'
    description: 'Index {{ $labels.index }} p95 latency is {{ $value | humanizeDuration }} on {{ $labels.instance }}.'
    runbook: 'https://docs.stemedb.com/operations/runbooks/slow-index-lookup.md'
    impact: 'Query performance degraded. API response times increasing.'
    action: 'Check if indexes need compaction. Verify storage backend health.'
# Fires once the ratio of WAL disk usage bytes to WAL disk capacity bytes has
# stayed above 0.70 for 10 minutes. The ratio is rendered as a percentage in
# the description.
- alert: WALDiskUsageHigh
  expr: >-
    (stemedb_wal_disk_usage_bytes / stemedb_wal_disk_capacity_bytes)
    > 0.70
  for: 10m
  labels:
    component: wal
    severity: warning
  annotations:
    summary: 'WAL disk usage >70%'
    description: 'WAL disk usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}.'
    runbook: 'https://docs.stemedb.com/operations/runbooks/disk-full.md'
    impact: 'Disk will fill in next few hours at current rate.'
    action: 'Run cleanup to remove old WAL segments or increase disk size.'
# Fires once stemedb_sync_lag_seconds has stayed above 60 for 10 minutes.
# The description reads the `peer_id` label from the firing series.
- alert: ReplicationLagWarning
  expr: 'stemedb_sync_lag_seconds > 60'
  for: 10m
  labels:
    component: sync
    severity: warning
  annotations:
    summary: 'Replication lag >1 minute'
    description: 'Peer {{ $labels.peer_id }} is lagging by {{ $value | humanizeDuration }} on {{ $labels.instance }}.'
    runbook: 'https://docs.stemedb.com/operations/runbooks/high-replication-lag.md'
    impact: 'Data freshness degraded. Queries may return slightly stale data.'
    action: 'Monitor for escalation. Check network latency and peer load.'
# Fires once the 99th-percentile HTTP request duration, derived from a 5-minute
# rate over the request duration histogram buckets, has stayed above 0.500 for
# 5 minutes.
- alert: HighAPILatency
  expr: >-
    histogram_quantile(0.99,
    rate(stemedb_http_request_duration_seconds_bucket[5m]))
    > 0.500
  for: 5m
  labels:
    component: api
    severity: warning
  annotations:
    summary: 'API p99 latency >500ms'
    description: 'API p99 latency is {{ $value | humanizeDuration }} on {{ $labels.instance }}.'
    runbook: 'https://docs.stemedb.com/operations/runbooks/high-latency.md'
    impact: 'User experience degraded. SLO at risk (target: p99 <500ms).'
    action: 'Check slow query logs. Investigate storage and index performance.'
# Fires once the pending compaction size has stayed above
# 10 * 1024 * 1024 * 1024 bytes for 1 hour. The description renders the value
# with binary (1024-based) prefixes.
- alert: StorageCompactionPending
  expr: >-
    stemedb_storage_compaction_pending_size_bytes
    > (10 * 1024 * 1024 * 1024)
  for: 1h
  labels:
    component: storage
    severity: warning
  annotations:
    summary: 'Compaction backlog >10GB'
    description: 'Storage compaction backlog is {{ $value | humanize1024 }}B on {{ $labels.instance }}.'
    runbook: 'https://docs.stemedb.com/operations/runbooks/compaction-backlog.md'
    impact: 'Read amplification increasing. Query performance degrading.'
    action: 'Trigger manual compaction or reduce write load temporarily.'
# Fires once stemedb_circuit_breakers_half_open has stayed above 0 for
# 15 minutes.
# NOTE(review): the description reads an `agent_id` label, so the metric is
# presumably exported per agent - verify against the exporter.
- alert: CircuitBreakerHalfOpen
  expr: 'stemedb_circuit_breakers_half_open > 0'
  for: 15m
  labels:
    component: protection
    severity: warning
  annotations:
    summary: 'Circuit breaker stuck in half-open state'
    description: 'Circuit breaker for {{ $labels.agent_id }} has been half-open for 15 minutes on {{ $labels.instance }}.'
    runbook: 'https://docs.stemedb.com/operations/runbooks/circuit-breaker.md'
    impact: 'Agent requests partially failing. Service degraded for this agent.'
    action: 'Investigate agent health. Reset circuit if agent recovered.'
# Fires once the gap between the evaluation time and the last trust rank decay
# timestamp has stayed above 24 * 60 * 60 seconds for 1 hour. The alert value
# is that gap, rendered as a duration in the description.
- alert: TrustRankDecayOverdue
  expr: >-
    (time() - stemedb_trust_rank_last_decay_timestamp)
    > (24 * 60 * 60)
  for: 1h
  labels:
    component: trust
    severity: warning
  annotations:
    summary: 'Trust rank decay not run in >24 hours'
    description: 'Trust ranks have not been decayed in {{ $value | humanizeDuration }} on {{ $labels.instance }}.'
    runbook: 'https://docs.stemedb.com/operations/runbooks/trust-rank-decay.md'
    impact: 'Trust scores becoming stale. May affect query ranking.'
    action: 'Trigger manual decay via POST /v1/admin/decay_trust_ranks'