stemedb/docs/operations/monitoring/grafana/storage-health.json
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

159 lines
4.4 KiB
JSON

{
"dashboard": {
"title": "StemeDB - Storage Health",
"tags": ["stemedb", "storage", "wal"],
"timezone": "browser",
"panels": [
{
  "id": 1,
  "type": "graph",
  "title": "WAL Fsync Latency (p50, p95, p99)",
  "gridPos": {
    "x": 0,
    "y": 0,
    "w": 12,
    "h": 8
  },
  "targets": [
    {
      "legendFormat": "p50",
      "expr": "histogram_quantile(0.50, rate(stemedb_wal_fsync_latency_seconds_bucket[5m]))"
    },
    {
      "legendFormat": "p95",
      "expr": "histogram_quantile(0.95, rate(stemedb_wal_fsync_latency_seconds_bucket[5m]))"
    },
    {
      "legendFormat": "p99",
      "expr": "histogram_quantile(0.99, rate(stemedb_wal_fsync_latency_seconds_bucket[5m]))"
    }
  ],
  "yaxes": [
    {
      "label": "Latency",
      "format": "s"
    },
    {
      "format": "short"
    }
  ]
},
{
  "id": 2,
  "title": "WAL Disk Usage",
  "description": "stemedb_wal_disk_usage_bytes divided by 1024^3, i.e. shown in GiB. Thresholds are percentages of the 0-100 gauge range.",
  "type": "gauge",
  "targets": [
    {
      "expr": "stemedb_wal_disk_usage_bytes / (1024*1024*1024)",
      "legendFormat": "Disk Usage (GiB)"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "gbytes",
      "min": 0,
      "max": 100,
      "thresholds": {
        "mode": "percentage",
        "steps": [
          {"value": 0, "color": "green"},
          {"value": 70, "color": "yellow"},
          {"value": 90, "color": "red"}
        ]
      }
    }
  },
  "gridPos": {"h": 8, "w": 6, "x": 12, "y": 0}
},
{
  "id": 3,
  "title": "WAL Write Rate",
  "description": "Left axis: WAL writes per second. Right axis: WAL bytes written per second, divided by 1024*1024 (MiB/s).",
  "type": "graph",
  "targets": [
    {
      "expr": "rate(stemedb_wal_writes_total[5m])",
      "legendFormat": "Writes/sec"
    },
    {
      "expr": "rate(stemedb_wal_bytes_written_total[5m]) / (1024*1024)",
      "legendFormat": "MiB/sec"
    }
  ],
  "seriesOverrides": [
    {"alias": "MiB/sec", "yaxis": 2}
  ],
  "yaxes": [
    {"format": "ops", "label": "Rate"},
    {"format": "MiBs", "label": "Throughput"}
  ],
  "gridPos": {"h": 8, "w": 6, "x": 18, "y": 0}
},
{
  "id": 4,
  "title": "WAL Error Rate",
  "description": "Per-second rate of stemedb_wal_write_errors_total over 5m. The attached alert evaluates the 5m average of query A against a 0.01/s threshold.",
  "type": "graph",
  "targets": [
    {
      "refId": "A",
      "expr": "rate(stemedb_wal_write_errors_total[5m])",
      "legendFormat": "{{error}}"
    }
  ],
  "yaxes": [
    {"format": "ops", "label": "Errors/sec"},
    {"format": "short"}
  ],
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
  "alert": {
    "name": "High WAL Error Rate",
    "conditions": [
      {
        "type": "query",
        "evaluator": {"params": [0.01], "type": "gt"},
        "operator": {"type": "and"},
        "query": {"params": ["A", "5m", "now"]},
        "reducer": {"params": [], "type": "avg"}
      }
    ],
    "frequency": "1m",
    "for": "5m",
    "handler": 1,
    "noDataState": "no_data",
    "executionErrorState": "alerting",
    "notifications": []
  }
},
{
  "id": 5,
  "type": "graph",
  "title": "Storage Operation Latency (by operation)",
  "gridPos": {
    "x": 12,
    "y": 8,
    "w": 12,
    "h": 8
  },
  "targets": [
    {
      "legendFormat": "{{operation}} ({{backend}})",
      "expr": "histogram_quantile(0.99, rate(stemedb_storage_operation_duration_seconds_bucket[5m]))"
    }
  ],
  "yaxes": [
    {
      "label": "Latency (p99)",
      "format": "s"
    },
    {
      "format": "short"
    }
  ]
},
{
  "id": 6,
  "type": "graph",
  "title": "Index Lookup Latency",
  "gridPos": {
    "x": 0,
    "y": 16,
    "w": 12,
    "h": 8
  },
  "targets": [
    {
      "legendFormat": "{{index}} (p95)",
      "expr": "histogram_quantile(0.95, rate(stemedb_index_lookup_duration_seconds_bucket[5m]))"
    }
  ],
  "yaxes": [
    {
      "label": "Latency",
      "format": "s"
    },
    {
      "format": "short"
    }
  ]
},
{
  "id": 7,
  "type": "graph",
  "title": "Storage Operations/sec",
  "gridPos": {
    "x": 12,
    "y": 16,
    "w": 12,
    "h": 8
  },
  "targets": [
    {
      "legendFormat": "{{operation}} ({{backend}})",
      "expr": "rate(stemedb_storage_operations_total[5m])"
    }
  ],
  "yaxes": [
    {
      "label": "Operations/sec",
      "format": "ops"
    },
    {
      "format": "short"
    }
  ]
}
],
"refresh": "30s",
"schemaVersion": 30,
"version": 1
}
}