# Week 1 — deployment prerequisites:
# - Add TIDAL_API_KEY Bearer auth middleware (constant-time comparison)
# - Handle SIGTERM alongside ctrl-c for graceful shutdown
# - Remove test-utils feature from production tidal-server binary
# - Fix standalone Dockerfile; add cluster Dockerfile and docker-compose
# - Extract MultiRegionState into state.rs with per-region TidalDb map
#
# Week 2 — operational middleware and observability:
# - Add body limit (2MB), request timeout (30s), concurrency limit (100)
# - Add SetRequestIdLayer + PropagateRequestIdLayer (x-request-id header)
# - Add TraceLayer with structured spans including request ID
# - Activate Prometheus /metrics endpoint via --metrics flag
# - Add monitoring.md, recovery.md, prometheus-alerts.yaml, grafana-dashboard.json
#
# Week 3 — query latency histograms and middleware integration tests:
# - Add QUERY_LATENCY_BOUNDS (100µs–10s) histogram to tidal library
# - Instrument retrieve() and search() with tidaldb_retrieve/search_latency_us
# - Fix: search() latency now recorded on error paths (was skipped via ?)
# - Lib+bin split in tidal-server enabling integration tests
# - Add 8 middleware integration tests (auth, body limit, request ID)
# - Add 2 Prometheus alert rules and 2 Grafana latency panels
#
# Post-review fixes:
# - Fix SIGTERM handler compilation on non-Unix targets (#[cfg(unix)] guard)
# - Exempt /health from TimeoutLayer + ConcurrencyLimitLayer
#   (prevents false liveness failures under load)
# - Case-insensitive Bearer scheme matching per RFC 7235 §2.1
#
# 91 lines
# 3.5 KiB
# YAML
# 91 lines
# 3.5 KiB
# YAML
# Prometheus alerting rules for tidalDB.
#
# A single rule group, evaluated every 30s. Each rule below carries a
# `severity` label (critical / warning / info) and a summary + description
# annotation; `{{ $value }}` in a description is the value of the series
# returned by that rule's `expr`.
groups:
  - name: tidaldb
    interval: 30s
    rules:

      # Health gauge reports 0, or the series is missing entirely.
      # `== 0` alone cannot match once the process is gone (the series goes
      # stale and the comparison returns nothing), so `absent()` covers the
      # "shut down" case named in the description.
      # NOTE(review): absent() only fires when *no* tidaldb_health_ok series
      # exists at all; with several instances, pair this with an `up == 0`
      # alert scoped to the scrape job — confirm against the scrape config.
      - alert: TidalDBDown
        expr: tidaldb_health_ok == 0 or absent(tidaldb_health_ok)
        for: 1m
        labels: { severity: critical }
        annotations:
          summary: "tidalDB is unhealthy"
          description: "tidaldb_health_ok is 0 or absent — database is unhealthy or shut down."

      # Last checkpoint is older than 300s, sustained for 2m.
      - alert: TidalDBCheckpointStale
        expr: tidaldb_checkpoint_age_seconds > 300
        for: 2m
        labels: { severity: warning }
        annotations:
          summary: "Signal checkpoint not running"
          description: "{{ $value }}s since last checkpoint (threshold: 300s). Signal durability at risk."

      # Any checkpoint failure counted in the last 5m. No `for:` clause, so
      # this fires on the first evaluation where the increase is non-zero.
      - alert: TidalDBCheckpointFailures
        expr: increase(tidaldb_checkpoint_failures_total[5m]) > 0
        labels: { severity: warning }
        annotations:
          summary: "Signal checkpoint failures detected"
          description: "Checkpoint failures in last 5m. Check disk space and storage errors."

      # WAL lag above 1e9 bytes for 5m. The description renders the value
      # with humanize1024, i.e. in binary (Ki/Mi/Gi) units.
      - alert: TidalDBWALDiskPressure
        expr: tidaldb_wal_lag_bytes > 1000000000
        for: 5m
        labels: { severity: warning }
        annotations:
          summary: "WAL disk usage exceeds 1GB"
          description: "{{ $value | humanize1024 }}B of WAL uncompacted. Compaction may be stuck."

      # Hot signal entries above 4,000,000 (80% of the 5M budget stated in
      # the description) for 5m.
      - alert: TidalDBSignalBacklog
        expr: tidaldb_signal_hot_entries > 4000000
        for: 5m
        labels: { severity: warning }
        annotations:
          summary: "Signal ledger over 80% of capacity"
          description: "{{ $value }} hot entries (threshold: 4M / 80% of 5M budget)."

      # Any non-zero degradation level held for 2m.
      - alert: TidalDBDegradedRanking
        expr: tidaldb_degradation_level > 0
        for: 2m
        labels: { severity: warning }
        annotations:
          summary: "Ranking quality degraded"
          description: "Degradation level {{ $value }} active. Scale up or reduce load."

      # More than 100 active sessions AND the count is climbing faster than
      # 10 sessions/second over the last 5m.
      # - deriv() is used instead of rate(): rate() is only defined for
      #   counters and treats every decrease as a counter reset, which gives
      #   meaningless results on a value that goes up and down.
      #   (assumes tidaldb_active_sessions is a gauge — TODO confirm in the
      #   exporter.)
      # - The session count is the LEFT operand of `and`, so `{{ $value }}`
      #   in the description is the number of active sessions, not the slope.
      - alert: TidalDBSessionLeak
        expr: tidaldb_active_sessions > 100 and deriv(tidaldb_active_sessions[5m]) > 10
        for: 5m
        labels: { severity: warning }
        annotations:
          summary: "Active session count growing rapidly"
          description: "{{ $value }} active sessions and growing. Agents may not be closing sessions."

      # More than 100 rate-limited writes per second, sustained for 5m.
      - alert: TidalDBHighRateLimiting
        expr: rate(tidaldb_rate_limited_total[5m]) > 100
        for: 5m
        labels: { severity: info }
        annotations:
          summary: "Sustained rate limiting"
          description: "{{ $value }}/s rate-limited writes. Review agent rate limit config."

      # Tantivy segment count above 30 for 10m.
      - alert: TidalDBTantivySegmentBloat
        expr: tidaldb_tantivy_segment_count > 30
        for: 10m
        labels: { severity: warning }
        annotations:
          summary: "Tantivy segment count elevated"
          description: "{{ $value }} segments (threshold: 30). Text syncer may be stalled."

      # p95 retrieve latency above 500ms for 5m.
      # The histogram is recorded in microseconds (`_us`), but the
      # humanizeDuration template function interprets its input as seconds.
      # Dividing by 1e6 in the expression makes `$value` seconds, so the
      # description renders e.g. "730ms" instead of a multi-day duration.
      # The threshold 0.5 (s) is the same as the previous 500000 (µs).
      - alert: TidalDBSlowRetrieve
        expr: histogram_quantile(0.95, rate(tidaldb_retrieve_latency_us_bucket[5m])) / 1e6 > 0.5
        for: 5m
        labels: { severity: warning }
        annotations:
          summary: "Retrieve p95 latency exceeds 500ms"
          description: "p95 retrieve latency is {{ $value | humanizeDuration }}. Check signal ledger load and degradation level."

      # p95 search latency above 1s for 5m.
      # Same microsecond-to-second conversion as TidalDBSlowRetrieve so that
      # humanizeDuration receives seconds; threshold 1 (s) == 1000000 (µs).
      - alert: TidalDBSlowSearch
        expr: histogram_quantile(0.95, rate(tidaldb_search_latency_us_bucket[5m])) / 1e6 > 1
        for: 5m
        labels: { severity: warning }
        annotations:
          summary: "Search p95 latency exceeds 1s"
          description: "p95 search latency is {{ $value | humanizeDuration }}. Check Tantivy segment count and ANN index health."