tidaldb/docs/ops/grafana-dashboard.json
jordan a0a33f4d9a feat: harden tidal-server for production (Weeks 1–3)
Week 1 — deployment prerequisites:
- Add TIDAL_API_KEY Bearer auth middleware (constant-time comparison)
- Handle SIGTERM alongside ctrl-c for graceful shutdown
- Remove test-utils feature from production tidal-server binary
- Fix standalone Dockerfile; add cluster Dockerfile and docker-compose
- Extract MultiRegionState into state.rs with per-region TidalDb map

Week 2 — operational middleware and observability:
- Add body limit (2MB), request timeout (30s), concurrency limit (100)
- Add SetRequestIdLayer + PropagateRequestIdLayer (x-request-id header)
- Add TraceLayer with structured spans including request ID
- Activate Prometheus /metrics endpoint via --metrics flag
- Add monitoring.md, recovery.md, prometheus-alerts.yaml, grafana-dashboard.json

Week 3 — query latency histograms and middleware integration tests:
- Add QUERY_LATENCY_BOUNDS (100µs–10s) histogram to tidal library
- Instrument retrieve() and search() with tidaldb_retrieve/search_latency_us
- Fix: search() latency now recorded on error paths (was skipped via ?)
- Lib+bin split in tidal-server enabling integration tests
- Add 8 middleware integration tests (auth, body limit, request ID)
- Add 2 Prometheus alert rules and 2 Grafana latency panels

Post-review fixes:
- Fix SIGTERM handler compilation on non-Unix targets (#[cfg(unix)] guard)
- Exempt /health from TimeoutLayer + ConcurrencyLimitLayer (prevents false liveness failures under load)
- Case-insensitive Bearer scheme matching per RFC 7235 §2.1
2026-02-27 20:32:39 -07:00

524 lines
14 KiB
JSON

{
"uid": "tidaldb-overview",
"title": "tidalDB Overview",
"description": "Operational dashboard covering all 22 tidalDB metrics, including the retrieve and search latency histograms.",
"schemaVersion": 38,
"version": 2,
"refresh": "30s",
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"tags": ["tidaldb"],
"panels": [
{
"id": 1,
"type": "row",
"title": "Health Overview",
"gridPos": { "x": 0, "y": 0, "w": 24, "h": 1 },
"collapsed": false
},
{
"id": 2,
"type": "stat",
"title": "Health",
"gridPos": { "x": 0, "y": 1, "w": 4, "h": 4 },
"targets": [
{
"expr": "tidaldb_health_ok",
"legendFormat": "health_ok"
}
],
"fieldConfig": {
"defaults": {
"mappings": [
{ "type": "value", "options": { "0": { "text": "UNHEALTHY", "color": "red" } } },
{ "type": "value", "options": { "1": { "text": "OK", "color": "green" } } }
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": 0 },
{ "color": "green", "value": 1 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "colorMode": "background" }
},
{
"id": 3,
"type": "stat",
"title": "Uptime",
"gridPos": { "x": 4, "y": 1, "w": 4, "h": 4 },
"targets": [
{
"expr": "tidaldb_uptime_seconds",
"legendFormat": "uptime_seconds"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "colorMode": "value" }
},
{
"id": 4,
"type": "stat",
"title": "Degradation Level",
"gridPos": { "x": 8, "y": 1, "w": 4, "h": 4 },
"targets": [
{
"expr": "tidaldb_degradation_level",
"legendFormat": "degradation_level"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": 0 },
{ "color": "red", "value": 1 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "colorMode": "background" }
},
{
"id": 5,
"type": "stat",
"title": "Version",
"gridPos": { "x": 12, "y": 1, "w": 4, "h": 4 },
"targets": [
{
"expr": "tidaldb_info",
"legendFormat": "{{version}}"
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "fixed", "fixedColor": "text" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "colorMode": "none", "textMode": "name" }
},
{
"id": 10,
"type": "row",
"title": "Signal Throughput",
"gridPos": { "x": 0, "y": 5, "w": 24, "h": 1 },
"collapsed": false
},
{
"id": 11,
"type": "timeseries",
"title": "Signal Write Rate (per second)",
"gridPos": { "x": 0, "y": 6, "w": 8, "h": 6 },
"targets": [
{
"expr": "rate(tidaldb_signal_writes_total[5m])",
"legendFormat": "writes/s"
}
],
"fieldConfig": {
"defaults": {
"unit": "wps",
"color": { "mode": "palette-classic" }
}
}
},
{
"id": 12,
"type": "timeseries",
"title": "Signal Write Latency (µs)",
"gridPos": { "x": 8, "y": 6, "w": 8, "h": 6 },
"targets": [
{
"expr": "tidaldb_signal_write_latency_us",
"legendFormat": "latency_us"
}
],
"fieldConfig": {
"defaults": {
"unit": "µs",
"color": { "mode": "palette-classic" }
}
}
},
{
"id": 13,
"type": "gauge",
"title": "Signal Hot Entries",
"gridPos": { "x": 16, "y": 6, "w": 8, "h": 6 },
"targets": [
{
"expr": "tidaldb_signal_hot_entries",
"legendFormat": "hot_entries"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 5000000,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": 0 },
{ "color": "yellow", "value": 4000000 },
{ "color": "red", "value": 5000000 }
]
},
"color": { "mode": "thresholds" }
}
}
},
{
"id": 14,
"type": "timeseries",
"title": "Retrieve Latency Percentiles (µs)",
"gridPos": { "x": 0, "y": 12, "w": 12, "h": 6 },
"targets": [
{
"expr": "histogram_quantile(0.50, rate(tidaldb_retrieve_latency_us_bucket[$__rate_interval]))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.95, rate(tidaldb_retrieve_latency_us_bucket[$__rate_interval]))",
"legendFormat": "p95"
},
{
"expr": "histogram_quantile(0.99, rate(tidaldb_retrieve_latency_us_bucket[$__rate_interval]))",
"legendFormat": "p99"
}
],
"fieldConfig": {
"defaults": {
"unit": "µs",
"color": { "mode": "palette-classic" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": 0 },
{ "color": "yellow", "value": 500000 }
]
}
}
}
},
{
"id": 15,
"type": "timeseries",
"title": "Search Latency Percentiles (µs)",
"gridPos": { "x": 12, "y": 12, "w": 12, "h": 6 },
"targets": [
{
"expr": "histogram_quantile(0.50, rate(tidaldb_search_latency_us_bucket[$__rate_interval]))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.95, rate(tidaldb_search_latency_us_bucket[$__rate_interval]))",
"legendFormat": "p95"
},
{
"expr": "histogram_quantile(0.99, rate(tidaldb_search_latency_us_bucket[$__rate_interval]))",
"legendFormat": "p99"
}
],
"fieldConfig": {
"defaults": {
"unit": "µs",
"color": { "mode": "palette-classic" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": 0 },
{ "color": "yellow", "value": 1000000 }
]
}
}
}
},
{
"id": 20,
"type": "row",
"title": "Durability",
"gridPos": { "x": 0, "y": 19, "w": 24, "h": 1 },
"collapsed": false
},
{
"id": 21,
"type": "timeseries",
"title": "Checkpoint Age (seconds)",
"gridPos": { "x": 0, "y": 20, "w": 6, "h": 6 },
"targets": [
{
"expr": "tidaldb_checkpoint_age_seconds",
"legendFormat": "checkpoint_age_seconds"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": 0 },
{ "color": "red", "value": 300 }
]
},
"color": { "mode": "thresholds" }
}
}
},
{
"id": 22,
"type": "stat",
"title": "Checkpoint Failures",
"gridPos": { "x": 6, "y": 20, "w": 6, "h": 6 },
"targets": [
{
"expr": "tidaldb_checkpoint_failures_total",
"legendFormat": "checkpoint_failures_total"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": 0 },
{ "color": "red", "value": 1 }
]
},
"color": { "mode": "thresholds" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "background" }
},
{
"id": 23,
"type": "timeseries",
"title": "WAL Lag (bytes)",
"gridPos": { "x": 12, "y": 20, "w": 6, "h": 6 },
"targets": [
{
"expr": "tidaldb_wal_lag_bytes",
"legendFormat": "wal_lag_bytes"
}
],
"fieldConfig": {
"defaults": {
"unit": "bytes",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": 0 },
{ "color": "yellow", "value": 500000000 },
{ "color": "red", "value": 1000000000 }
]
},
"color": { "mode": "palette-classic" }
}
}
},
{
"id": 24,
"type": "timeseries",
"title": "WAL Compacted Segments (rate)",
"gridPos": { "x": 18, "y": 20, "w": 6, "h": 6 },
"targets": [
{
"expr": "rate(tidaldb_wal_compacted_segments_total[5m])",
"legendFormat": "compacted/s"
}
],
"fieldConfig": {
"defaults": {
"unit": "cps",
"color": { "mode": "palette-classic" }
}
}
},
{
"id": 30,
"type": "row",
"title": "Index Health",
"gridPos": { "x": 0, "y": 26, "w": 24, "h": 1 },
"collapsed": false
},
{
"id": 31,
"type": "stat",
"title": "Tantivy Indexed Docs",
"gridPos": { "x": 0, "y": 27, "w": 4, "h": 4 },
"targets": [
{ "expr": "tidaldb_tantivy_indexed_docs", "legendFormat": "indexed_docs" }
],
"fieldConfig": {
"defaults": {
"unit": "short",
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
},
{
"id": 32,
"type": "gauge",
"title": "Tantivy Segment Count",
"gridPos": { "x": 4, "y": 27, "w": 4, "h": 4 },
"targets": [
{ "expr": "tidaldb_tantivy_segment_count", "legendFormat": "segment_count" }
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 50,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": 0 },
{ "color": "yellow", "value": 20 },
{ "color": "red", "value": 30 }
]
},
"color": { "mode": "thresholds" }
}
}
},
{
"id": 33,
"type": "stat",
"title": "uSearch Vector Count",
"gridPos": { "x": 8, "y": 27, "w": 4, "h": 4 },
"targets": [
{ "expr": "tidaldb_usearch_vector_count", "legendFormat": "vector_count" }
],
"fieldConfig": {
"defaults": {
"unit": "short",
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
},
{
"id": 34,
"type": "stat",
"title": "uSearch Index Size",
"gridPos": { "x": 12, "y": 27, "w": 4, "h": 4 },
"targets": [
{ "expr": "tidaldb_usearch_index_size_bytes", "legendFormat": "index_size_bytes" }
],
"fieldConfig": {
"defaults": {
"unit": "bytes",
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
},
{
"id": 35,
"type": "stat",
"title": "Bitmap Index Cardinality",
"gridPos": { "x": 16, "y": 27, "w": 4, "h": 4 },
"targets": [
{ "expr": "tidaldb_bitmap_index_cardinality", "legendFormat": "bitmap_cardinality" }
],
"fieldConfig": {
"defaults": {
"unit": "short",
"color": { "mode": "fixed", "fixedColor": "blue" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
},
{
"id": 40,
"type": "row",
"title": "Sessions",
"gridPos": { "x": 0, "y": 31, "w": 24, "h": 1 },
"collapsed": false
},
{
"id": 41,
"type": "timeseries",
"title": "Active Sessions",
"gridPos": { "x": 0, "y": 32, "w": 6, "h": 6 },
"targets": [
{ "expr": "tidaldb_active_sessions", "legendFormat": "active_sessions" }
],
"fieldConfig": {
"defaults": {
"unit": "short",
"color": { "mode": "palette-classic" }
}
}
},
{
"id": 42,
"type": "timeseries",
"title": "Session Close Rate (per second)",
"gridPos": { "x": 6, "y": 32, "w": 6, "h": 6 },
"targets": [
{ "expr": "rate(tidaldb_closed_sessions_total[5m])", "legendFormat": "closes/s" }
],
"fieldConfig": {
"defaults": {
"unit": "cps",
"color": { "mode": "palette-classic" }
}
}
},
{
"id": 43,
"type": "stat",
"title": "Auto-Closed Sessions",
"gridPos": { "x": 12, "y": 32, "w": 4, "h": 6 },
"targets": [
{ "expr": "tidaldb_session_auto_closed_total", "legendFormat": "auto_closed_total" }
],
"fieldConfig": {
"defaults": {
"unit": "short",
"color": { "mode": "fixed", "fixedColor": "yellow" }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
},
{
"id": 44,
"type": "timeseries",
"title": "Rate Limited (per second)",
"gridPos": { "x": 16, "y": 32, "w": 8, "h": 6 },
"targets": [
{ "expr": "rate(tidaldb_rate_limited_total[5m])", "legendFormat": "rate_limited/s" }
],
"fieldConfig": {
"defaults": {
"unit": "reqps",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": 0 },
{ "color": "red", "value": 100 }
]
},
"color": { "mode": "palette-classic" }
}
}
}
]
}