This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
161 lines
4.5 KiB
JSON
161 lines
4.5 KiB
JSON
{
|
|
"dashboard": {
|
|
"title": "StemeDB - SLI & Availability",
|
|
"tags": ["stemedb", "sli", "availability"],
|
|
"timezone": "browser",
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"title": "Request Rate (by endpoint)",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "rate(stemedb_http_requests_total[5m])",
|
|
"legendFormat": "{{method}} {{path}}"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"format": "reqps", "label": "Requests/sec"},
|
|
{"format": "short"}
|
|
],
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
|
|
},
|
|
{
|
|
"id": 2,
|
|
"title": "Request Latency p99 (by endpoint)",
|
|
"type": "heatmap",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.99, rate(stemedb_http_request_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "{{method}} {{path}}"
|
|
}
|
|
],
|
|
"yaxis": {"format": "s"},
|
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
|
|
},
|
|
{
|
|
"id": 3,
|
|
"title": "Error Rate (by type)",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "rate(stemedb_errors_total[5m])",
|
|
"legendFormat": "{{type}} ({{layer}})"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"format": "ops", "label": "Errors/sec"},
|
|
{"format": "short"}
|
|
],
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
|
"alert": {
|
|
"conditions": [
|
|
{
|
|
"evaluator": {"params": [0.01], "type": "gt"},
|
|
"operator": {"type": "and"},
|
|
"query": {"params": ["A", "5m", "now"]},
|
|
"reducer": {"type": "avg"}
|
|
}
|
|
],
|
|
"name": "High Error Rate"
|
|
}
|
|
},
|
|
{
|
|
"id": 4,
|
|
"title": "Availability (Success Rate)",
|
|
"type": "gauge",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(stemedb_http_request_duration_seconds_count{status=~\"2..\"}[5m])) / sum(rate(stemedb_http_request_duration_seconds_count[5m]))",
|
|
"legendFormat": "Availability %"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percentunit",
|
|
"min": 0,
|
|
"max": 1,
|
|
"thresholds": {
|
|
"mode": "percentage",
|
|
"steps": [
|
|
{"value": 0, "color": "red"},
|
|
{"value": 0.95, "color": "yellow"},
|
|
{"value": 0.99, "color": "green"}
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 8}
|
|
},
|
|
{
|
|
"id": 5,
|
|
"title": "Request Status Distribution",
|
|
"type": "piechart",
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (status) (rate(stemedb_http_request_duration_seconds_count[5m]))",
|
|
"legendFormat": "{{status}}"
|
|
}
|
|
],
|
|
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 8}
|
|
},
|
|
{
|
|
"id": 6,
|
|
"title": "Latency Distribution (all endpoints)",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.50, rate(stemedb_http_request_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "p50"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(stemedb_http_request_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "p95"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.99, rate(stemedb_http_request_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "p99"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"format": "s", "label": "Latency"},
|
|
{"format": "short"}
|
|
],
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
|
|
},
|
|
{
|
|
"id": 7,
|
|
"title": "Circuit Breaker Status",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "stemedb_circuit_breakers_open",
|
|
"legendFormat": "Open"
|
|
},
|
|
{
|
|
"expr": "stemedb_circuit_breakers_half_open",
|
|
"legendFormat": "Half-Open"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "short",
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"value": 0, "color": "green"},
|
|
{"value": 1, "color": "yellow"},
|
|
{"value": 3, "color": "red"}
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
|
|
}
|
|
],
|
|
"refresh": "15s",
|
|
"schemaVersion": 30,
|
|
"version": 1
|
|
}
|
|
}
|