stemedb/docs/operations/monitoring/prometheus/alerts/critical.yml
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

107 lines
5.0 KiB
YAML

# Critical-severity Prometheus alerting rules for StemeDB.
#
# One rule group, evaluated every 30s. Every rule sets `severity: critical`
# and a `component` label naming the affected subsystem, and carries
# `summary` / `description` / `runbook` annotations (most also add `impact`
# and `action`) for whoever receives the alert.
groups:
  - name: stemedb_critical
    interval: 30s
    rules:
      # The scrape target for job "stemedb" has reported up == 0 for 1 minute.
      - alert: StemeDBAPIDown
        expr: up{job="stemedb"} == 0
        for: 1m
        labels:
          severity: critical
          component: api
        annotations:
          summary: "StemeDB API is down"
          description: "The StemeDB API at {{ $labels.instance }} has been unreachable for 1 minute."
          runbook: "https://docs.stemedb.com/operations/runbooks/server-wont-start.md"
          dashboard: "https://grafana.example.com/d/sli-dashboard"

      # WAL disk usage has stayed above 90% of capacity for 5 minutes.
      # This is a static threshold: the expression does not estimate
      # time-to-full, so the description makes no such claim.
      - alert: WALDiskNearlyFull
        expr: (stemedb_wal_disk_usage_bytes / stemedb_wal_disk_capacity_bytes) > 0.90
        for: 5m
        labels:
          severity: critical
          component: wal
        annotations:
          summary: "WAL disk usage >90%"
          description: "WAL disk usage is at {{ $value | humanizePercentage }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/disk-full.md"
          impact: "Write operations will fail when disk reaches 100%. Service will become read-only."
          action: "Increase disk size immediately or run cleanup to free space."

      # Reported sync lag has exceeded 300 seconds for 5 minutes.
      - alert: ReplicationLagCritical
        expr: stemedb_sync_lag_seconds > 300
        for: 5m
        labels:
          severity: critical
          component: sync
        annotations:
          summary: "Replication lag >5 minutes"
          description: "Peer {{ $labels.peer_id }} is lagging by {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/high-replication-lag.md"
          impact: "Data inconsistency across cluster. Queries may return stale data."
          action: "Check network connectivity, peer health, and disk I/O on lagging node."

      # Per-series storage-layer error rate (5m window) above 1/sec for
      # 2 minutes.
      - alert: HighStorageErrorRate
        expr: rate(stemedb_errors_total{layer="storage"}[5m]) > 1.0
        for: 2m
        labels:
          severity: critical
          component: storage
        annotations:
          summary: "High storage error rate (>1/sec)"
          description: "Storage layer is experiencing {{ $value | humanize }} errors/sec on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/storage-errors.md"
          impact: "Write and read operations failing. Data durability at risk."
          action: "Check disk health, filesystem errors, and storage backend logs immediately."

      # Any increase of the fsync_failed WAL error counter within the 5m
      # rate window, sustained for 1 minute.
      - alert: WALFsyncFailure
        expr: rate(stemedb_wal_write_errors_total{error="fsync_failed"}[5m]) > 0
        for: 1m
        labels:
          severity: critical
          component: wal
        annotations:
          summary: "WAL fsync failures detected"
          description: "WAL fsync is failing on {{ $labels.instance }}. This indicates disk I/O errors."
          runbook: "https://docs.stemedb.com/operations/runbooks/wal-fsync-failure.md"
          impact: "Data durability compromised. Recent writes may be lost on crash."
          action: "Check disk health with `iostat -x 1` and dmesg for I/O errors. Consider failing over to healthy node."

      # Live nodes are no longer a strict majority of the cluster.
      # `<=` (not `<`) so that exactly half alive on an even-sized cluster
      # (e.g. 2 of 4) is also reported as lost quorum.
      # $value is the left-hand side, i.e. the number of live nodes.
      - alert: ClusterSplitBrain
        expr: stemedb_cluster_nodes_alive <= (stemedb_cluster_total_nodes / 2)
        for: 2m
        labels:
          severity: critical
          component: cluster
        annotations:
          summary: "Cluster has lost quorum"
          description: "Only {{ $value }} nodes are alive as seen from {{ $labels.instance }}, which is not a majority of the cluster. Cluster has lost quorum."
          runbook: "https://docs.stemedb.com/operations/runbooks/split-brain.md"
          impact: "Write operations may be rejected. Risk of split-brain scenario."
          action: "Investigate network partition. Do NOT restart nodes until partition is resolved."

      # StemeDB resident memory above 90% of the host's total memory for
      # 5 minutes. The two metrics are matched on `instance` only, because
      # the left side is pinned to job="stemedb" while
      # node_memory_MemTotal_bytes is a node-level metric; group_left keeps
      # the process series' labels and value for the annotations.
      # NOTE(review): this assumes both scrape jobs expose the same
      # `instance` value for a host (e.g. via relabel_configs) - confirm
      # against the scrape configuration.
      - alert: MemoryExhaustion
        expr: |-
          process_resident_memory_bytes{job="stemedb"}
            > on (instance) group_left ()
          (0.90 * node_memory_MemTotal_bytes)
        for: 5m
        labels:
          severity: critical
          component: process
        annotations:
          summary: "StemeDB using >90% of system memory"
          description: "Memory usage is {{ $value | humanize1024 }}B on {{ $labels.instance }}. OOM killer may terminate process."
          runbook: "https://docs.stemedb.com/operations/runbooks/memory-exhaustion.md"
          impact: "Process may be killed by OS, causing downtime."
          action: "Increase memory or reduce load. Check for memory leaks in logs."

      # Less than 7 days (7 * 24 * 60 * 60 seconds) remain until the
      # certificate expiry, sustained for 1 hour.
      # NOTE(review): assumes stemedb_tls_certificate_expiry_seconds is a
      # Unix timestamp, since it is compared against time() - confirm with
      # the exporter.
      - alert: CertificateExpiringSoon
        expr: (stemedb_tls_certificate_expiry_seconds - time()) < (7 * 24 * 60 * 60)
        for: 1h
        labels:
          severity: critical
          component: tls
        annotations:
          summary: "TLS certificate expires in <7 days"
          description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/certificate-renewal.md"
          impact: "API will become inaccessible when certificate expires."
          action: "Renew certificate immediately. Update cert-manager or manual cert files."