This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments: ## API Layer - Add rate limiting middleware with configurable limits per endpoint - Enhance error handling with detailed context and proper HTTP status codes - Add security hardening tests for input validation and boundary conditions - Create store_helpers module for defensive storage access patterns ## Storage & WAL - Optimize group commit batching for higher throughput - Add defensive error handling in hybrid backend with proper fallbacks - Enhance WAL journal durability guarantees with fsync validation - Improve index store query performance with better caching ## Operations & Deployment - Add comprehensive operations documentation (deployment, monitoring, DR) - Create systemd units for backup, WAL archival, and verification - Add monitoring configs (Prometheus alerts, metrics exporters) - Implement backup/restore scripts with verification and S3 archival - Add DR drill automation and runbook procedures - Create load balancer configs (nginx, envoy) with health checks ## Documentation - Update CLAUDE.md with operations and troubleshooting guides - Expand roadmap with production readiness milestones - Add pilot success criteria and deployment reference architecture - Document TLS setup, monitoring integration, and incident response ## Configuration - Add .env.example with all required environment variables - Document resource sizing for different deployment scales - Add configuration examples for various deployment topologies This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
121 lines
5.6 KiB
YAML
121 lines
5.6 KiB
YAML
# Warning-severity alerting rules for StemeDB (Prometheus rule-file format).
#
# The group is evaluated every minute. Every rule carries `severity: warning`
# plus a `component` label naming the subsystem it covers, and the same five
# annotations: summary, description, runbook, impact, action.
groups:
  - name: stemedb_warning
    interval: 1m
    rules:
      # p99 WAL fsync latency (5m rate window) above 100ms, sustained for 5m.
      - alert: WALFsyncSlow
        expr: histogram_quantile(0.99, rate(stemedb_wal_fsync_latency_seconds_bucket[5m])) > 0.100
        for: 5m
        labels:
          severity: warning
          component: wal
        annotations:
          summary: "WAL fsync p99 latency >100ms"
          description: "WAL fsync p99 latency is {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/slow-fsync.md"
          impact: "Write operations slowing down. May impact ingestion throughput."
          action: "Check disk I/O with `iostat -x 1`. Consider adding more IOPS or using faster storage."

      # Share of requests that fail, per instance, above 1% for 5m.
      # The error rate is divided by the request rate so that the 0.01
      # threshold and the `humanizePercentage` rendering in the description
      # both operate on a ratio rather than on raw errors per second.
      # The denominator is the `_count` series of the HTTP duration histogram
      # used by HighAPILatency. Both sides are aggregated to (job, instance)
      # so their label sets match for the division.
      # NOTE(review): assumes stemedb_errors_total counts failed HTTP
      # requests - confirm against the exporter before rollout.
      - alert: HighAPIErrorRate
        expr: |-
          sum by (job, instance) (rate(stemedb_errors_total[5m]))
            /
          sum by (job, instance) (rate(stemedb_http_request_duration_seconds_count[5m]))
            > 0.01
        for: 5m
        labels:
          severity: warning
          component: api
        annotations:
          summary: "API error rate >1%"
          description: "API error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/high-error-rate.md"
          impact: "Client requests failing. User experience degraded."
          action: "Check logs for error details. Verify input validation and external dependencies."

      # p95 index lookup latency (5m rate window) above 50ms, sustained for 10m.
      # The description reads the `index` label, so series stay unaggregated.
      - alert: IndexLookupSlow
        expr: histogram_quantile(0.95, rate(stemedb_index_lookup_duration_seconds_bucket[5m])) > 0.050
        for: 10m
        labels:
          severity: warning
          component: storage
        annotations:
          summary: "Index lookup p95 latency >50ms"
          description: "Index {{ $labels.index }} p95 latency is {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/slow-index-lookup.md"
          impact: "Query performance degraded. API response times increasing."
          action: "Check if indexes need compaction. Verify storage backend health."

      # WAL disk usage as a fraction of capacity above 70%, sustained for 10m.
      # NOTE(review): this is a static threshold; the fill-time estimate in
      # `impact` is not computed by the expression.
      - alert: WALDiskUsageHigh
        expr: (stemedb_wal_disk_usage_bytes / stemedb_wal_disk_capacity_bytes) > 0.70
        for: 10m
        labels:
          severity: warning
          component: wal
        annotations:
          summary: "WAL disk usage >70%"
          description: "WAL disk usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/disk-full.md"
          impact: "Disk will fill in next few hours at current rate."
          action: "Run cleanup to remove old WAL segments or increase disk size."

      # Sync lag above 60 seconds, sustained for 10m.
      # The description reads the `peer_id` label of the lag series.
      - alert: ReplicationLagWarning
        expr: stemedb_sync_lag_seconds > 60
        for: 10m
        labels:
          severity: warning
          component: sync
        annotations:
          summary: "Replication lag >1 minute"
          description: "Peer {{ $labels.peer_id }} is lagging by {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/high-replication-lag.md"
          impact: "Data freshness degraded. Queries may return slightly stale data."
          action: "Monitor for escalation. Check network latency and peer load."

      # p99 HTTP request latency (5m rate window) above 500ms, sustained for 5m.
      - alert: HighAPILatency
        expr: histogram_quantile(0.99, rate(stemedb_http_request_duration_seconds_bucket[5m])) > 0.500
        for: 5m
        labels:
          severity: warning
          component: api
        annotations:
          summary: "API p99 latency >500ms"
          description: "API p99 latency is {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/high-latency.md"
          impact: "User experience degraded. SLO at risk (target: p99 <500ms)."
          action: "Check slow query logs. Investigate storage and index performance."

      # Pending compaction above 10 * 1024^3 bytes (10 GiB), sustained for 1h.
      # NOTE(review): the summary says "10GB" while the threshold is 10 GiB.
      - alert: StorageCompactionPending
        expr: stemedb_storage_compaction_pending_size_bytes > (10 * 1024 * 1024 * 1024)
        for: 1h
        labels:
          severity: warning
          component: storage
        annotations:
          summary: "Compaction backlog >10GB"
          description: "Storage compaction backlog is {{ $value | humanize1024 }}B on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/compaction-backlog.md"
          impact: "Read amplification increasing. Query performance degrading."
          action: "Trigger manual compaction or reduce write load temporarily."

      # Half-open circuit breaker gauge above zero, sustained for 15m.
      # The description reads the `agent_id` label of the gauge series.
      - alert: CircuitBreakerHalfOpen
        expr: stemedb_circuit_breakers_half_open > 0
        for: 15m
        labels:
          severity: warning
          component: protection
        annotations:
          summary: "Circuit breaker stuck in half-open state"
          description: "Circuit breaker for {{ $labels.agent_id }} has been half-open for 15 minutes on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/circuit-breaker.md"
          impact: "Agent requests partially failing. Service degraded for this agent."
          action: "Investigate agent health. Reset circuit if agent recovered."

      # More than 24h (24 * 60 * 60 seconds) since the last recorded trust-rank
      # decay timestamp, sustained for a further 1h.
      - alert: TrustRankDecayOverdue
        expr: (time() - stemedb_trust_rank_last_decay_timestamp) > (24 * 60 * 60)
        for: 1h
        labels:
          severity: warning
          component: trust
        annotations:
          summary: "Trust rank decay not run in >24 hours"
          description: "Trust ranks have not been decayed in {{ $value | humanizeDuration }} on {{ $labels.instance }}."
          runbook: "https://docs.stemedb.com/operations/runbooks/trust-rank-decay.md"
          impact: "Trust scores becoming stale. May affect query ranking."
          action: "Trigger manual decay via POST /v1/admin/decay_trust_ranks"