feat: harden tidal-server for production (Weeks 1–3)
Week 1 — deployment prerequisites: - Add TIDAL_API_KEY Bearer auth middleware (constant-time comparison) - Handle SIGTERM alongside ctrl-c for graceful shutdown - Remove test-utils feature from production tidal-server binary - Fix standalone Dockerfile; add cluster Dockerfile and docker-compose - Extract MultiRegionState into state.rs with per-region TidalDb map Week 2 — operational middleware and observability: - Add body limit (2MB), request timeout (30s), concurrency limit (100) - Add SetRequestIdLayer + PropagateRequestIdLayer (x-request-id header) - Add TraceLayer with structured spans including request ID - Activate Prometheus /metrics endpoint via --metrics flag - Add monitoring.md, recovery.md, prometheus-alerts.yaml, grafana-dashboard.json Week 3 — query latency histograms and middleware integration tests: - Add QUERY_LATENCY_BOUNDS (100µs–10s) histogram to tidal library - Instrument retrieve() and search() with tidaldb_retrieve/search_latency_us - Fix: search() latency now recorded on error paths (was skipped via ?) - Lib+bin split in tidal-server enabling integration tests - Add 8 middleware integration tests (auth, body limit, request ID) - Add 2 Prometheus alert rules and 2 Grafana latency panels Post-review fixes: - Fix SIGTERM handler compilation on non-Unix targets (#[cfg(unix)] guard) - Exempt /health from TimeoutLayer + ConcurrencyLimitLayer (prevents false liveness failures under load) - Case-insensitive Bearer scheme matching per RFC 7235 §2.1
This commit is contained in:
parent
928bcf98d6
commit
a0a33f4d9a
7
Cargo.lock
generated
7
Cargo.lock
generated
@ -3324,9 +3324,12 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"serde_yaml",
|
"serde_yaml",
|
||||||
|
"subtle",
|
||||||
"thiserror 2.0.18",
|
"thiserror 2.0.18",
|
||||||
"tidaldb",
|
"tidaldb",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"tower",
|
||||||
|
"tower-http 0.6.8",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
]
|
]
|
||||||
@ -3517,6 +3520,7 @@ dependencies = [
|
|||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"sync_wrapper",
|
"sync_wrapper",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"tokio-util",
|
||||||
"tower-layer",
|
"tower-layer",
|
||||||
"tower-service",
|
"tower-service",
|
||||||
"tracing",
|
"tracing",
|
||||||
@ -3560,9 +3564,12 @@ dependencies = [
|
|||||||
"http-body",
|
"http-body",
|
||||||
"iri-string",
|
"iri-string",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
|
"tokio",
|
||||||
"tower",
|
"tower",
|
||||||
"tower-layer",
|
"tower-layer",
|
||||||
"tower-service",
|
"tower-service",
|
||||||
|
"tracing",
|
||||||
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|||||||
@ -1,3 +1,9 @@
|
|||||||
|
# LEGACY: This file was originally a simulated multi-region cluster image.
|
||||||
|
# The cluster mode has been removed from tidal-server. This Dockerfile now
|
||||||
|
# builds an identical standalone image and is preserved only to avoid breaking
|
||||||
|
# existing CI references.
|
||||||
|
#
|
||||||
|
# For new deployments use docker/standalone/Dockerfile instead.
|
||||||
FROM rust:1.91 as builder
|
FROM rust:1.91 as builder
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
@ -19,12 +25,16 @@ RUN cargo build -p tidal-server --release
|
|||||||
FROM debian:bookworm-slim
|
FROM debian:bookworm-slim
|
||||||
WORKDIR /srv
|
WORKDIR /srv
|
||||||
RUN useradd --system --home /srv tidal && \
|
RUN useradd --system --home /srv tidal && \
|
||||||
apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
|
apt-get update && apt-get install -y ca-certificates curl && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
COPY --from=builder /app/target/release/tidal-server /usr/local/bin/tidal-server
|
COPY --from=builder /app/target/release/tidal-server /usr/local/bin/tidal-server
|
||||||
COPY tidal-server/config /etc/tidal-server
|
COPY tidal-server/config /etc/tidal-server
|
||||||
|
|
||||||
USER tidal
|
USER tidal
|
||||||
EXPOSE 9500
|
EXPOSE 9400
|
||||||
|
|
||||||
ENTRYPOINT ["tidal-server", "cluster", "--listen", "0.0.0.0:9500", "--schema", "/etc/tidal-server/default-schema.yaml", "--topology", "/etc/tidal-server/default-cluster.yaml"]
|
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
|
||||||
|
CMD curl -f -H "Authorization: Bearer ${TIDAL_API_KEY:-}" http://localhost:9400/health || exit 1
|
||||||
|
|
||||||
|
ENTRYPOINT ["tidal-server", "standalone", "--listen", "0.0.0.0:9400"]
|
||||||
|
|||||||
31
docker/docker-compose.yml
Normal file
31
docker/docker-compose.yml
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
services:
|
||||||
|
tidaldb:
|
||||||
|
build:
|
||||||
|
context: ..
|
||||||
|
dockerfile: docker/standalone/Dockerfile
|
||||||
|
ports:
|
||||||
|
- "9400:9400"
|
||||||
|
- "9091:9091"
|
||||||
|
environment:
|
||||||
|
- TIDAL_API_KEY=${TIDAL_API_KEY}
|
||||||
|
- TIDAL_SERVER_LOG=info
|
||||||
|
volumes:
|
||||||
|
- tidaldb-data:/var/lib/tidaldb
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "-H", "Authorization: Bearer ${TIDAL_API_KEY}", "http://localhost:9400/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
prometheus:
|
||||||
|
image: prom/prometheus:v2.53.0
|
||||||
|
ports:
|
||||||
|
- "9090:9090"
|
||||||
|
volumes:
|
||||||
|
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||||
|
depends_on:
|
||||||
|
- tidaldb
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
tidaldb-data:
|
||||||
8
docker/prometheus.yml
Normal file
8
docker/prometheus.yml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
global:
|
||||||
|
scrape_interval: 15s
|
||||||
|
evaluation_interval: 15s
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: tidaldb
|
||||||
|
static_configs:
|
||||||
|
- targets: ['tidaldb:9091']
|
||||||
35
docker/standalone/Dockerfile
Normal file
35
docker/standalone/Dockerfile
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
FROM rust:1.91 AS builder
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy workspace manifests first for layer caching.
|
||||||
|
COPY Cargo.toml Cargo.lock ./
|
||||||
|
COPY tidal/Cargo.toml tidal/Cargo.toml
|
||||||
|
COPY tidalctl/Cargo.toml tidalctl/Cargo.toml
|
||||||
|
COPY tidal-server/Cargo.toml tidal-server/Cargo.toml
|
||||||
|
COPY applications/forage/engine/Cargo.toml applications/forage/engine/Cargo.toml
|
||||||
|
COPY applications/forage/server/Cargo.toml applications/forage/server/Cargo.toml
|
||||||
|
COPY applications/forage/embedder/Cargo.toml applications/forage/embedder/Cargo.toml
|
||||||
|
COPY applications/iknowyou/engine/Cargo.toml applications/iknowyou/engine/Cargo.toml
|
||||||
|
|
||||||
|
# Copy full workspace and build.
|
||||||
|
COPY . .
|
||||||
|
RUN cargo build -p tidal-server --release
|
||||||
|
|
||||||
|
FROM debian:bookworm-slim
|
||||||
|
WORKDIR /srv
|
||||||
|
RUN useradd --system --home /srv tidal && \
|
||||||
|
apt-get update && apt-get install -y ca-certificates curl && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
COPY --from=builder /app/target/release/tidal-server /usr/local/bin/tidal-server
|
||||||
|
COPY tidal-server/config /etc/tidal-server
|
||||||
|
|
||||||
|
USER tidal
|
||||||
|
EXPOSE 9400 9091
|
||||||
|
|
||||||
|
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
|
||||||
|
CMD curl -f -H "Authorization: Bearer ${TIDAL_API_KEY:-}" http://localhost:9400/health || exit 1
|
||||||
|
|
||||||
|
ENTRYPOINT ["tidal-server", "standalone", \
|
||||||
|
"--listen", "0.0.0.0:9400", \
|
||||||
|
"--metrics", "0.0.0.0:9091"]
|
||||||
523
docs/ops/grafana-dashboard.json
Normal file
523
docs/ops/grafana-dashboard.json
Normal file
@ -0,0 +1,523 @@
|
|||||||
|
{
|
||||||
|
"uid": "tidaldb-overview",
|
||||||
|
"title": "tidalDB Overview",
|
||||||
|
"description": "Operational dashboard covering all 20 tidalDB metrics including retrieve and search latency histograms.",
|
||||||
|
"schemaVersion": 38,
|
||||||
|
"version": 2,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"timepicker": {},
|
||||||
|
"tags": ["tidaldb"],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"type": "row",
|
||||||
|
"title": "Health Overview",
|
||||||
|
"gridPos": { "x": 0, "y": 0, "w": 24, "h": 1 },
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Health",
|
||||||
|
"gridPos": { "x": 0, "y": 1, "w": 4, "h": 4 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "tidaldb_health_ok",
|
||||||
|
"legendFormat": "health_ok"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"mappings": [
|
||||||
|
{ "type": "value", "options": { "0": { "text": "UNHEALTHY", "color": "red" } } },
|
||||||
|
{ "type": "value", "options": { "1": { "text": "OK", "color": "green" } } }
|
||||||
|
],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": 0 },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": { "mode": "thresholds" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "colorMode": "background" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Uptime",
|
||||||
|
"gridPos": { "x": 4, "y": 1, "w": 4, "h": 4 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "tidaldb_uptime_seconds",
|
||||||
|
"legendFormat": "uptime_seconds"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"color": { "mode": "fixed", "fixedColor": "blue" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "colorMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Degradation Level",
|
||||||
|
"gridPos": { "x": 8, "y": 1, "w": 4, "h": 4 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "tidaldb_degradation_level",
|
||||||
|
"legendFormat": "degradation_level"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": 0 },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": { "mode": "thresholds" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "colorMode": "background" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Version",
|
||||||
|
"gridPos": { "x": 12, "y": 1, "w": 4, "h": 4 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "tidaldb_info",
|
||||||
|
"legendFormat": "{{version}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "fixed", "fixedColor": "text" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "colorMode": "none", "textMode": "name" }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "row",
|
||||||
|
"title": "Signal Throughput",
|
||||||
|
"gridPos": { "x": 0, "y": 5, "w": 24, "h": 1 },
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Signal Write Rate (per second)",
|
||||||
|
"gridPos": { "x": 0, "y": 6, "w": 8, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(tidaldb_signal_writes_total[5m])",
|
||||||
|
"legendFormat": "writes/s"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"color": { "mode": "palette-classic" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Signal Write Latency (µs)",
|
||||||
|
"gridPos": { "x": 8, "y": 6, "w": 8, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "tidaldb_signal_write_latency_us",
|
||||||
|
"legendFormat": "latency_us"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "µs",
|
||||||
|
"color": { "mode": "palette-classic" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"type": "gauge",
|
||||||
|
"title": "Signal Hot Entries",
|
||||||
|
"gridPos": { "x": 16, "y": 6, "w": 8, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "tidaldb_signal_hot_entries",
|
||||||
|
"legendFormat": "hot_entries"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"min": 0,
|
||||||
|
"max": 5000000,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": 0 },
|
||||||
|
{ "color": "yellow", "value": 4000000 },
|
||||||
|
{ "color": "red", "value": 5000000 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": { "mode": "thresholds" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Retrieve Latency Percentiles (µs)",
|
||||||
|
"gridPos": { "x": 0, "y": 12, "w": 12, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.50, rate(tidaldb_retrieve_latency_us_bucket[$__rate_interval]))",
|
||||||
|
"legendFormat": "p50"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.95, rate(tidaldb_retrieve_latency_us_bucket[$__rate_interval]))",
|
||||||
|
"legendFormat": "p95"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.99, rate(tidaldb_retrieve_latency_us_bucket[$__rate_interval]))",
|
||||||
|
"legendFormat": "p99"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "µs",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": 0 },
|
||||||
|
{ "color": "yellow", "value": 500000 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Search Latency Percentiles (µs)",
|
||||||
|
"gridPos": { "x": 12, "y": 12, "w": 12, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.50, rate(tidaldb_search_latency_us_bucket[$__rate_interval]))",
|
||||||
|
"legendFormat": "p50"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.95, rate(tidaldb_search_latency_us_bucket[$__rate_interval]))",
|
||||||
|
"legendFormat": "p95"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.99, rate(tidaldb_search_latency_us_bucket[$__rate_interval]))",
|
||||||
|
"legendFormat": "p99"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "µs",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": 0 },
|
||||||
|
{ "color": "yellow", "value": 1000000 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"type": "row",
|
||||||
|
"title": "Durability",
|
||||||
|
"gridPos": { "x": 0, "y": 19, "w": 24, "h": 1 },
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 21,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Checkpoint Age (seconds)",
|
||||||
|
"gridPos": { "x": 0, "y": 20, "w": 6, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "tidaldb_checkpoint_age_seconds",
|
||||||
|
"legendFormat": "checkpoint_age_seconds"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": 0 },
|
||||||
|
{ "color": "red", "value": 300 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": { "mode": "thresholds" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 22,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Checkpoint Failures",
|
||||||
|
"gridPos": { "x": 6, "y": 20, "w": 6, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "tidaldb_checkpoint_failures_total",
|
||||||
|
"legendFormat": "checkpoint_failures_total"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": 0 },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": { "mode": "thresholds" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "background" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 23,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "WAL Lag (bytes)",
|
||||||
|
"gridPos": { "x": 12, "y": 20, "w": 6, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "tidaldb_wal_lag_bytes",
|
||||||
|
"legendFormat": "wal_lag_bytes"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": 0 },
|
||||||
|
{ "color": "yellow", "value": 500000000 },
|
||||||
|
{ "color": "red", "value": 1000000000 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": { "mode": "palette-classic" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 24,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "WAL Compacted Segments (rate)",
|
||||||
|
"gridPos": { "x": 18, "y": 20, "w": 6, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(tidaldb_wal_compacted_segments_total[5m])",
|
||||||
|
"legendFormat": "compacted/s"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "cps",
|
||||||
|
"color": { "mode": "palette-classic" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 30,
|
||||||
|
"type": "row",
|
||||||
|
"title": "Index Health",
|
||||||
|
"gridPos": { "x": 0, "y": 26, "w": 24, "h": 1 },
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 31,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Tantivy Indexed Docs",
|
||||||
|
"gridPos": { "x": 0, "y": 27, "w": 4, "h": 4 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "tidaldb_tantivy_indexed_docs", "legendFormat": "indexed_docs" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"color": { "mode": "fixed", "fixedColor": "blue" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 32,
|
||||||
|
"type": "gauge",
|
||||||
|
"title": "Tantivy Segment Count",
|
||||||
|
"gridPos": { "x": 4, "y": 27, "w": 4, "h": 4 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "tidaldb_tantivy_segment_count", "legendFormat": "segment_count" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"min": 0,
|
||||||
|
"max": 50,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": 0 },
|
||||||
|
{ "color": "yellow", "value": 20 },
|
||||||
|
{ "color": "red", "value": 30 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": { "mode": "thresholds" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 33,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "uSearch Vector Count",
|
||||||
|
"gridPos": { "x": 8, "y": 27, "w": 4, "h": 4 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "tidaldb_usearch_vector_count", "legendFormat": "vector_count" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"color": { "mode": "fixed", "fixedColor": "blue" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 34,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "uSearch Index Size",
|
||||||
|
"gridPos": { "x": 12, "y": 27, "w": 4, "h": 4 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "tidaldb_usearch_index_size_bytes", "legendFormat": "index_size_bytes" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes",
|
||||||
|
"color": { "mode": "fixed", "fixedColor": "blue" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 35,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Bitmap Index Cardinality",
|
||||||
|
"gridPos": { "x": 16, "y": 27, "w": 4, "h": 4 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "tidaldb_bitmap_index_cardinality", "legendFormat": "bitmap_cardinality" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"color": { "mode": "fixed", "fixedColor": "blue" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 40,
|
||||||
|
"type": "row",
|
||||||
|
"title": "Sessions",
|
||||||
|
"gridPos": { "x": 0, "y": 31, "w": 24, "h": 1 },
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 41,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Active Sessions",
|
||||||
|
"gridPos": { "x": 0, "y": 32, "w": 6, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "tidaldb_active_sessions", "legendFormat": "active_sessions" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"color": { "mode": "palette-classic" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 42,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Session Close Rate (per second)",
|
||||||
|
"gridPos": { "x": 6, "y": 32, "w": 6, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "rate(tidaldb_closed_sessions_total[5m])", "legendFormat": "closes/s" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"color": { "mode": "palette-classic" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 43,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Auto-Closed Sessions",
|
||||||
|
"gridPos": { "x": 12, "y": 32, "w": 4, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "tidaldb_session_auto_closed_total", "legendFormat": "auto_closed_total" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"color": { "mode": "fixed", "fixedColor": "yellow" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 44,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Rate Limited (per second)",
|
||||||
|
"gridPos": { "x": 16, "y": 32, "w": 8, "h": 6 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "rate(tidaldb_rate_limited_total[5m])", "legendFormat": "rate_limited/s" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": 0 },
|
||||||
|
{ "color": "red", "value": 100 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": { "mode": "palette-classic" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
90
docs/ops/prometheus-alerts.yaml
Normal file
90
docs/ops/prometheus-alerts.yaml
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
groups:
|
||||||
|
- name: tidaldb
|
||||||
|
interval: 30s
|
||||||
|
rules:
|
||||||
|
- alert: TidalDBDown
|
||||||
|
expr: tidaldb_health_ok == 0
|
||||||
|
for: 1m
|
||||||
|
labels: { severity: critical }
|
||||||
|
annotations:
|
||||||
|
summary: "tidalDB is unhealthy"
|
||||||
|
description: "tidaldb_health_ok is 0 — database is unhealthy or shut down."
|
||||||
|
|
||||||
|
- alert: TidalDBCheckpointStale
|
||||||
|
expr: tidaldb_checkpoint_age_seconds > 300
|
||||||
|
for: 2m
|
||||||
|
labels: { severity: warning }
|
||||||
|
annotations:
|
||||||
|
summary: "Signal checkpoint not running"
|
||||||
|
description: "{{ $value }}s since last checkpoint (threshold: 300s). Signal durability at risk."
|
||||||
|
|
||||||
|
- alert: TidalDBCheckpointFailures
|
||||||
|
expr: increase(tidaldb_checkpoint_failures_total[5m]) > 0
|
||||||
|
labels: { severity: warning }
|
||||||
|
annotations:
|
||||||
|
summary: "Signal checkpoint failures detected"
|
||||||
|
description: "Checkpoint failures in last 5m. Check disk space and storage errors."
|
||||||
|
|
||||||
|
- alert: TidalDBWALDiskPressure
|
||||||
|
expr: tidaldb_wal_lag_bytes > 1000000000
|
||||||
|
for: 5m
|
||||||
|
labels: { severity: warning }
|
||||||
|
annotations:
|
||||||
|
summary: "WAL disk usage exceeds 1GB"
|
||||||
|
description: "{{ $value | humanize1024 }}B of WAL uncompacted. Compaction may be stuck."
|
||||||
|
|
||||||
|
- alert: TidalDBSignalBacklog
|
||||||
|
expr: tidaldb_signal_hot_entries > 4000000
|
||||||
|
for: 5m
|
||||||
|
labels: { severity: warning }
|
||||||
|
annotations:
|
||||||
|
summary: "Signal ledger over 80% of capacity"
|
||||||
|
description: "{{ $value }} hot entries (threshold: 4M / 80% of 5M budget)."
|
||||||
|
|
||||||
|
- alert: TidalDBDegradedRanking
|
||||||
|
expr: tidaldb_degradation_level > 0
|
||||||
|
for: 2m
|
||||||
|
labels: { severity: warning }
|
||||||
|
annotations:
|
||||||
|
summary: "Ranking quality degraded"
|
||||||
|
description: "Degradation level {{ $value }} active. Scale up or reduce load."
|
||||||
|
|
||||||
|
- alert: TidalDBSessionLeak
|
||||||
|
expr: rate(tidaldb_active_sessions[5m]) > 10 and tidaldb_active_sessions > 100
|
||||||
|
for: 5m
|
||||||
|
labels: { severity: warning }
|
||||||
|
annotations:
|
||||||
|
summary: "Active session count growing rapidly"
|
||||||
|
description: "{{ $value }} active sessions and growing. Agents may not be closing sessions."
|
||||||
|
|
||||||
|
- alert: TidalDBHighRateLimiting
|
||||||
|
expr: rate(tidaldb_rate_limited_total[5m]) > 100
|
||||||
|
for: 5m
|
||||||
|
labels: { severity: info }
|
||||||
|
annotations:
|
||||||
|
summary: "Sustained rate limiting"
|
||||||
|
description: "{{ $value }}/s rate-limited writes. Review agent rate limit config."
|
||||||
|
|
||||||
|
- alert: TidalDBTantivySegmentBloat
|
||||||
|
expr: tidaldb_tantivy_segment_count > 30
|
||||||
|
for: 10m
|
||||||
|
labels: { severity: warning }
|
||||||
|
annotations:
|
||||||
|
summary: "Tantivy segment count elevated"
|
||||||
|
description: "{{ $value }} segments (threshold: 30). Text syncer may be stalled."
|
||||||
|
|
||||||
|
- alert: TidalDBSlowRetrieve
|
||||||
|
expr: histogram_quantile(0.95, rate(tidaldb_retrieve_latency_us_bucket[5m])) > 500000
|
||||||
|
for: 5m
|
||||||
|
labels: { severity: warning }
|
||||||
|
annotations:
|
||||||
|
summary: "Retrieve p95 latency exceeds 500ms"
|
||||||
|
description: "p95 retrieve latency is {{ $value | humanizeDuration }}. Check signal ledger load and degradation level."
|
||||||
|
|
||||||
|
- alert: TidalDBSlowSearch
|
||||||
|
expr: histogram_quantile(0.95, rate(tidaldb_search_latency_us_bucket[5m])) > 1000000
|
||||||
|
for: 5m
|
||||||
|
labels: { severity: warning }
|
||||||
|
annotations:
|
||||||
|
summary: "Search p95 latency exceeds 1s"
|
||||||
|
description: "p95 search latency is {{ $value | humanizeDuration }}. Check Tantivy segment count and ANN index health."
|
||||||
@ -5,9 +5,16 @@ edition.workspace = true
|
|||||||
rust-version.workspace = true
|
rust-version.workspace = true
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|
||||||
|
[lib]
|
||||||
|
name = "tidal_server"
|
||||||
|
path = "src/lib.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
axum = "0.8"
|
axum = "0.8"
|
||||||
clap = { version = "4.5", features = ["derive"] }
|
clap = { version = "4.5", features = ["derive"] }
|
||||||
|
subtle = "2"
|
||||||
|
tower = { version = "0.5", features = ["limit"] }
|
||||||
|
tower-http = { version = "0.6", features = ["timeout", "trace", "request-id"] }
|
||||||
serde = { version = "1", features = ["derive"] }
|
serde = { version = "1", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
serde_yaml = "0.9"
|
serde_yaml = "0.9"
|
||||||
@ -15,4 +22,4 @@ thiserror = "2"
|
|||||||
tokio = { version = "1", features = ["macros", "rt-multi-thread", "signal"] }
|
tokio = { version = "1", features = ["macros", "rt-multi-thread", "signal"] }
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
tidaldb = { path = "../tidal", features = ["test-utils"] }
|
tidaldb = { path = "../tidal" }
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
use std::collections::HashSet;
|
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
@ -9,13 +8,6 @@ use tidaldb::schema::{DecaySpec, EntityKind, Schema, SchemaBuilder, TextFieldTyp
|
|||||||
use crate::error::{Result, ServerError};
|
use crate::error::{Result, ServerError};
|
||||||
|
|
||||||
const DEFAULT_SCHEMA_YAML: &str = include_str!("../config/default-schema.yaml");
|
const DEFAULT_SCHEMA_YAML: &str = include_str!("../config/default-schema.yaml");
|
||||||
const DEFAULT_CLUSTER_YAML: &str = include_str!("../config/default-cluster.yaml");
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct ClusterLayout {
|
|
||||||
pub regions: Vec<String>,
|
|
||||||
pub leader: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn load_schema(path: Option<&Path>) -> Result<Schema> {
|
pub fn load_schema(path: Option<&Path>) -> Result<Schema> {
|
||||||
let raw = read_config(path, DEFAULT_SCHEMA_YAML)?;
|
let raw = read_config(path, DEFAULT_SCHEMA_YAML)?;
|
||||||
@ -64,40 +56,6 @@ pub fn load_schema(path: Option<&Path>) -> Result<Schema> {
|
|||||||
builder.build().map_err(ServerError::SchemaBuild)
|
builder.build().map_err(ServerError::SchemaBuild)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn load_cluster_layout(path: Option<&Path>) -> Result<ClusterLayout> {
|
|
||||||
let raw = read_config(path, DEFAULT_CLUSTER_YAML)?;
|
|
||||||
let spec: ClusterSpec = serde_yaml::from_str(&raw)
|
|
||||||
.map_err(|e| ServerError::ClusterConfig(format!("parse cluster yaml: {e}")))?;
|
|
||||||
|
|
||||||
if spec.regions.is_empty() {
|
|
||||||
return Err(ServerError::ClusterConfig(
|
|
||||||
"cluster config must include at least one region".into(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut seen = HashSet::new();
|
|
||||||
for region in &spec.regions {
|
|
||||||
if !seen.insert(region.name.to_lowercase()) {
|
|
||||||
return Err(ServerError::ClusterConfig(format!(
|
|
||||||
"duplicate region name: {}",
|
|
||||||
region.name
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !seen.contains(&spec.leader.to_lowercase()) {
|
|
||||||
return Err(ServerError::ClusterConfig(format!(
|
|
||||||
"leader '{}' not found in region list",
|
|
||||||
spec.leader
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(ClusterLayout {
|
|
||||||
regions: spec.regions.into_iter().map(|r| r.name).collect(),
|
|
||||||
leader: spec.leader,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn read_config(path: Option<&Path>, fallback: &str) -> Result<String> {
|
fn read_config(path: Option<&Path>, fallback: &str) -> Result<String> {
|
||||||
match path {
|
match path {
|
||||||
Some(p) => fs::read_to_string(p).map_err(|e| ServerError::io(p, e)),
|
Some(p) => fs::read_to_string(p).map_err(|e| ServerError::io(p, e)),
|
||||||
@ -180,17 +138,6 @@ struct EmbeddingSpec {
|
|||||||
dimensions: usize,
|
dimensions: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
struct ClusterSpec {
|
|
||||||
regions: Vec<RegionSpec>,
|
|
||||||
leader: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
struct RegionSpec {
|
|
||||||
name: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_entity_kind(input: &str) -> Result<EntityKind> {
|
fn parse_entity_kind(input: &str) -> Result<EntityKind> {
|
||||||
match input.trim().to_lowercase().as_str() {
|
match input.trim().to_lowercase().as_str() {
|
||||||
"item" | "items" => Ok(EntityKind::Item),
|
"item" | "items" => Ok(EntityKind::Item),
|
||||||
|
|||||||
@ -15,8 +15,6 @@ pub enum ServerError {
|
|||||||
SchemaConfig(String),
|
SchemaConfig(String),
|
||||||
#[error("schema build failed: {0}")]
|
#[error("schema build failed: {0}")]
|
||||||
SchemaBuild(#[from] tidaldb::schema::SchemaError),
|
SchemaBuild(#[from] tidaldb::schema::SchemaError),
|
||||||
#[error("invalid cluster config: {0}")]
|
|
||||||
ClusterConfig(String),
|
|
||||||
#[error("tidalDB error: {0}")]
|
#[error("tidalDB error: {0}")]
|
||||||
Tidal(#[from] tidaldb::TidalError),
|
Tidal(#[from] tidaldb::TidalError),
|
||||||
#[error("http server error: {0}")]
|
#[error("http server error: {0}")]
|
||||||
|
|||||||
8
tidal-server/src/lib.rs
Normal file
8
tidal-server/src/lib.rs
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
/// Public modules exposed for integration testing.
|
||||||
|
///
|
||||||
|
/// The binary (`main.rs`) imports from this lib crate rather than declaring
|
||||||
|
/// the modules directly, so integration tests in `tests/` can access them.
|
||||||
|
pub mod config;
|
||||||
|
pub mod error;
|
||||||
|
pub mod router;
|
||||||
|
pub mod state;
|
||||||
@ -1,20 +1,14 @@
|
|||||||
mod config;
|
|
||||||
mod error;
|
|
||||||
mod router;
|
|
||||||
mod state;
|
|
||||||
|
|
||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use clap::{Args, Parser, Subcommand};
|
use clap::{Args, Parser, Subcommand};
|
||||||
|
use tidal_server::config::load_schema;
|
||||||
|
use tidal_server::error::{Result, ServerError};
|
||||||
|
use tidal_server::router::build_router;
|
||||||
|
use tidal_server::state::ServerState;
|
||||||
use tidaldb::TidalDb;
|
use tidaldb::TidalDb;
|
||||||
|
|
||||||
use crate::config::{load_cluster_layout, load_schema};
|
|
||||||
use crate::error::{Result, ServerError};
|
|
||||||
use crate::router::build_router;
|
|
||||||
use crate::state::ServerState;
|
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
#[command(version, about = "HTTP wrapper for tidalDB")]
|
#[command(version, about = "HTTP wrapper for tidalDB")]
|
||||||
struct Cli {
|
struct Cli {
|
||||||
@ -26,8 +20,6 @@ struct Cli {
|
|||||||
enum Command {
|
enum Command {
|
||||||
#[command(about = "Run a single-node server wrapping one tidalDB instance")]
|
#[command(about = "Run a single-node server wrapping one tidalDB instance")]
|
||||||
Standalone(StandaloneArgs),
|
Standalone(StandaloneArgs),
|
||||||
#[command(about = "Run the simulated multi-region cluster server")]
|
|
||||||
Cluster(ClusterArgs),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Args)]
|
#[derive(Args)]
|
||||||
@ -38,16 +30,11 @@ struct StandaloneArgs {
|
|||||||
schema: Option<PathBuf>,
|
schema: Option<PathBuf>,
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
data_dir: Option<PathBuf>,
|
data_dir: Option<PathBuf>,
|
||||||
}
|
#[arg(
|
||||||
|
long,
|
||||||
#[derive(Args)]
|
help = "Bind address for Prometheus /metrics endpoint (e.g. 127.0.0.1:9091)"
|
||||||
struct ClusterArgs {
|
)]
|
||||||
#[arg(long, default_value = "0.0.0.0:9500")]
|
metrics: Option<String>,
|
||||||
listen: String,
|
|
||||||
#[arg(long)]
|
|
||||||
schema: Option<PathBuf>,
|
|
||||||
#[arg(long)]
|
|
||||||
topology: Option<PathBuf>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
@ -64,7 +51,6 @@ async fn run() -> Result<()> {
|
|||||||
|
|
||||||
match cli.mode {
|
match cli.mode {
|
||||||
Command::Standalone(args) => run_standalone(args).await,
|
Command::Standalone(args) => run_standalone(args).await,
|
||||||
Command::Cluster(args) => run_cluster(args).await,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,20 +70,39 @@ async fn run_standalone(args: StandaloneArgs) -> Result<()> {
|
|||||||
} else {
|
} else {
|
||||||
builder = builder.ephemeral();
|
builder = builder.ephemeral();
|
||||||
}
|
}
|
||||||
|
if let Some(ref addr) = args.metrics {
|
||||||
|
builder = builder.enable_metrics(addr);
|
||||||
|
}
|
||||||
|
|
||||||
let db = builder.open()?;
|
let db = builder.open()?;
|
||||||
let state = ServerState::standalone(db);
|
if let Some(addr) = db.metrics_addr() {
|
||||||
serve(state, &args.listen).await
|
tracing::info!("metrics endpoint listening on http://{addr}/metrics");
|
||||||
|
}
|
||||||
|
let state = ServerState::new(db);
|
||||||
|
|
||||||
|
let api_key = read_api_key();
|
||||||
|
serve(state, &args.listen, api_key).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn run_cluster(args: ClusterArgs) -> Result<()> {
|
/// Read the API key from the environment.
|
||||||
let schema = load_schema(args.schema.as_deref())?;
|
///
|
||||||
let layout = load_cluster_layout(args.topology.as_deref())?;
|
/// If `TIDAL_API_KEY` is not set, all requests are accepted without
|
||||||
let state = ServerState::cluster(schema, layout)?;
|
/// authentication. This is appropriate for local development but should
|
||||||
serve(state, &args.listen).await
|
/// never be used in production. A startup warning is emitted.
|
||||||
|
fn read_api_key() -> Option<Arc<str>> {
|
||||||
|
match std::env::var("TIDAL_API_KEY") {
|
||||||
|
Ok(key) if !key.is_empty() => Some(Arc::from(key.as_str())),
|
||||||
|
_ => {
|
||||||
|
tracing::warn!(
|
||||||
|
"TIDAL_API_KEY is not set — all endpoints are unauthenticated. \
|
||||||
|
Set this variable before exposing the server to any network."
|
||||||
|
);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn serve(state: ServerState, addr: &str) -> Result<()> {
|
async fn serve(state: ServerState, addr: &str, api_key: Option<Arc<str>>) -> Result<()> {
|
||||||
let socket: SocketAddr = addr
|
let socket: SocketAddr = addr
|
||||||
.parse()
|
.parse()
|
||||||
.map_err(|e| ServerError::BadRequest(format!("invalid addr: {e}")))?;
|
.map_err(|e| ServerError::BadRequest(format!("invalid addr: {e}")))?;
|
||||||
@ -106,15 +111,40 @@ async fn serve(state: ServerState, addr: &str) -> Result<()> {
|
|||||||
let actual = listener.local_addr()?;
|
let actual = listener.local_addr()?;
|
||||||
tracing::info!("listening on http://{actual}");
|
tracing::info!("listening on http://{actual}");
|
||||||
|
|
||||||
axum::serve(listener, build_router(Arc::new(state)))
|
axum::serve(listener, build_router(Arc::new(state), api_key))
|
||||||
.with_graceful_shutdown(shutdown_signal())
|
.with_graceful_shutdown(shutdown_signal())
|
||||||
.await?;
|
.await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn shutdown_signal() {
|
async fn shutdown_signal() {
|
||||||
if let Err(err) = tokio::signal::ctrl_c().await {
|
// SIGTERM is Unix-only; on other platforms we fall back to ctrl-c alone.
|
||||||
tracing::warn!("ctrl-c handler error: {err}");
|
#[cfg(unix)]
|
||||||
|
let sigterm = async {
|
||||||
|
use tokio::signal::unix::{SignalKind, signal};
|
||||||
|
match signal(SignalKind::terminate()) {
|
||||||
|
Ok(mut stream) => {
|
||||||
|
stream.recv().await;
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
tracing::warn!("SIGTERM handler registration failed: {err}");
|
||||||
|
// If we cannot register SIGTERM, park this branch so the
|
||||||
|
// select falls back to ctrl_c exclusively.
|
||||||
|
std::future::pending::<()>().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#[cfg(not(unix))]
|
||||||
|
let sigterm = std::future::pending::<()>();
|
||||||
|
|
||||||
|
tokio::select! {
|
||||||
|
result = tokio::signal::ctrl_c() => {
|
||||||
|
if let Err(err) = result {
|
||||||
|
tracing::warn!("ctrl-c handler error: {err}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = sigterm => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
tracing::info!("shutdown signal received");
|
tracing::info!("shutdown signal received");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,38 +1,178 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use axum::Json;
|
use axum::Json;
|
||||||
use axum::Router;
|
use axum::Router;
|
||||||
use axum::extract::{Query, State};
|
use axum::extract::{Query, Request, State};
|
||||||
use axum::http::StatusCode;
|
use axum::http::StatusCode;
|
||||||
|
use axum::http::header::AUTHORIZATION;
|
||||||
|
use axum::middleware::{self, Next};
|
||||||
use axum::response::{IntoResponse, Response};
|
use axum::response::{IntoResponse, Response};
|
||||||
use axum::routing::{get, post};
|
use axum::routing::{get, post};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use subtle::ConstantTimeEq;
|
||||||
use tidaldb::query::retrieve::Retrieve;
|
use tidaldb::query::retrieve::Retrieve;
|
||||||
use tidaldb::query::search::Search;
|
use tidaldb::query::search::Search;
|
||||||
use tidaldb::schema::EntityId;
|
use tidaldb::schema::EntityId;
|
||||||
|
use tower::ServiceBuilder;
|
||||||
|
use tower::limit::ConcurrencyLimitLayer;
|
||||||
|
use tower_http::request_id::{
|
||||||
|
MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer,
|
||||||
|
};
|
||||||
|
use tower_http::timeout::TimeoutLayer;
|
||||||
|
use tower_http::trace::TraceLayer;
|
||||||
|
|
||||||
use crate::error::{Result, ServerError};
|
use crate::error::{Result, ServerError};
|
||||||
use crate::state::{ClusterHealth, ServerState};
|
use crate::state::ServerState;
|
||||||
|
|
||||||
pub fn build_router(state: Arc<ServerState>) -> Router {
|
/// Maximum request body size. Requests exceeding this are rejected with 413
|
||||||
let mut app = Router::new()
|
/// before any deserialization occurs.
|
||||||
|
const BODY_LIMIT_BYTES: usize = 2 * 1024 * 1024;
|
||||||
|
|
||||||
|
/// Maximum wall-clock time a single request may occupy. Exceeded requests
|
||||||
|
/// receive 408 Request Timeout.
|
||||||
|
const REQUEST_TIMEOUT_SECS: u64 = 30;
|
||||||
|
|
||||||
|
/// Maximum number of requests processed concurrently. Additional requests
|
||||||
|
/// are queued by the concurrency layer until a slot opens.
|
||||||
|
const MAX_CONCURRENCY: usize = 100;
|
||||||
|
|
||||||
|
/// Sequential request ID generator — assigns monotonically increasing IDs.
|
||||||
|
#[derive(Clone, Default)]
|
||||||
|
struct SequentialRequestId(Arc<AtomicU64>);
|
||||||
|
|
||||||
|
impl MakeRequestId for SequentialRequestId {
|
||||||
|
fn make_request_id<B>(&mut self, _: &Request<B>) -> Option<RequestId> {
|
||||||
|
// Relaxed ordering is sufficient: we only need unique IDs, not
|
||||||
|
// cross-thread happens-before ordering.
|
||||||
|
let id = self.0.fetch_add(1, Ordering::Relaxed);
|
||||||
|
axum::http::HeaderValue::from_str(&id.to_string())
|
||||||
|
.ok()
|
||||||
|
.map(RequestId::new)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the application router.
|
||||||
|
///
|
||||||
|
/// Routes are split into two groups:
|
||||||
|
/// - **Public** (`/health`) — never requires auth; safe for liveness/readiness probes.
|
||||||
|
/// - **Protected** — require `Authorization: Bearer <key>` when `api_key` is `Some`.
|
||||||
|
///
|
||||||
|
/// Global middleware stack (applied to all routes, outermost → innermost):
|
||||||
|
/// 1. `SetRequestIdLayer` — assigns sequential `x-request-id` before the span is created
|
||||||
|
/// 2. `PropagateRequestIdLayer` — echoes `x-request-id` into the response
|
||||||
|
/// 3. `TraceLayer` — creates a structured span; reads the ID set in step 1
|
||||||
|
///
|
||||||
|
/// Protected-only middleware (not applied to `/health`):
|
||||||
|
/// 4. `TimeoutLayer` — returns 408 for requests exceeding [`REQUEST_TIMEOUT_SECS`]
|
||||||
|
/// 5. `ConcurrencyLimitLayer` — queues beyond [`MAX_CONCURRENCY`] in-flight requests
|
||||||
|
///
|
||||||
|
/// Keeping `/health` outside the timeout/concurrency layers means health probes
|
||||||
|
/// are never queued or timed out under saturation, preventing false liveness failures.
|
||||||
|
pub fn build_router(state: Arc<ServerState>, api_key: Option<Arc<str>>) -> Router {
|
||||||
|
// Public routes — exempt from auth so health probes always work.
|
||||||
|
let public = Router::new()
|
||||||
|
.route("/health", get(health))
|
||||||
|
.with_state(Arc::clone(&state));
|
||||||
|
|
||||||
|
// Protected routes — gated by Bearer token when a key is configured.
|
||||||
|
let protected = Router::new()
|
||||||
.route("/items", post(create_item))
|
.route("/items", post(create_item))
|
||||||
.route("/embeddings", post(write_embedding))
|
.route("/embeddings", post(write_embedding))
|
||||||
.route("/signals", post(write_signal))
|
.route("/signals", post(write_signal))
|
||||||
.route("/feed", get(feed))
|
.route("/feed", get(feed))
|
||||||
.route("/search", get(search))
|
.route("/search", get(search))
|
||||||
.route("/health", get(health));
|
.layer(axum::extract::DefaultBodyLimit::max(BODY_LIMIT_BYTES))
|
||||||
|
.with_state(state);
|
||||||
|
|
||||||
if state.is_cluster() {
|
let protected = match api_key {
|
||||||
app = app
|
Some(key) => protected.layer(middleware::from_fn(move |req: Request, next: Next| {
|
||||||
.route("/cluster/status", get(cluster_status))
|
let key = key.clone();
|
||||||
.route("/cluster/promote", post(promote_leader))
|
async move { bearer_auth(req, next, &key).await }
|
||||||
.route("/cluster/partition", post(partition_region))
|
})),
|
||||||
.route("/cluster/heal", post(heal_region));
|
None => protected,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Timeout and concurrency apply only to protected routes so health probes
|
||||||
|
// are never queued or dropped during overload.
|
||||||
|
let protected = protected.layer(
|
||||||
|
ServiceBuilder::new()
|
||||||
|
.layer(TimeoutLayer::with_status_code(
|
||||||
|
StatusCode::REQUEST_TIMEOUT,
|
||||||
|
Duration::from_secs(REQUEST_TIMEOUT_SECS),
|
||||||
|
))
|
||||||
|
.layer(ConcurrencyLimitLayer::new(MAX_CONCURRENCY)),
|
||||||
|
);
|
||||||
|
|
||||||
|
// SetRequestId must be outermost so the ID is in headers when TraceLayer
|
||||||
|
// creates its span. In ServiceBuilder the first .layer() is outermost.
|
||||||
|
public.merge(protected).layer(
|
||||||
|
ServiceBuilder::new()
|
||||||
|
.layer(SetRequestIdLayer::x_request_id(
|
||||||
|
SequentialRequestId::default(),
|
||||||
|
))
|
||||||
|
.layer(PropagateRequestIdLayer::x_request_id())
|
||||||
|
.layer(
|
||||||
|
TraceLayer::new_for_http().make_span_with(|req: &Request<_>| {
|
||||||
|
// x-request-id is guaranteed present: SetRequestIdLayer runs first.
|
||||||
|
let id = req
|
||||||
|
.headers()
|
||||||
|
.get("x-request-id")
|
||||||
|
.and_then(|v| v.to_str().ok())
|
||||||
|
.unwrap_or("-");
|
||||||
|
tracing::info_span!(
|
||||||
|
"request",
|
||||||
|
method = %req.method(),
|
||||||
|
uri = %req.uri(),
|
||||||
|
request_id = %id,
|
||||||
|
)
|
||||||
|
}),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Validate an `Authorization: Bearer <token>` header.
|
||||||
|
///
|
||||||
|
/// Returns `401 Unauthorized` with a `WWW-Authenticate: Bearer` header if the
|
||||||
|
/// token is absent or does not match the expected key. The comparison uses
|
||||||
|
/// constant-time equality to prevent timing-based token reconstruction.
|
||||||
|
async fn bearer_auth(request: Request, next: Next, expected_key: &str) -> Response {
|
||||||
|
// RFC 7235 §2.1: auth-scheme tokens are case-insensitive.
|
||||||
|
let token = request
|
||||||
|
.headers()
|
||||||
|
.get(AUTHORIZATION)
|
||||||
|
.and_then(|v| v.to_str().ok())
|
||||||
|
.and_then(|s| {
|
||||||
|
// "Bearer " is 7 bytes; check case-insensitively then slice the token.
|
||||||
|
if s.len() > 7 && s[..7].eq_ignore_ascii_case("bearer ") {
|
||||||
|
Some(&s[7..])
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let authorized = match token {
|
||||||
|
Some(t) => {
|
||||||
|
let a = t.as_bytes();
|
||||||
|
let b = expected_key.as_bytes();
|
||||||
|
// Length mismatch does not leak token content; short-circuit is safe.
|
||||||
|
a.len() == b.len() && bool::from(a.ct_eq(b))
|
||||||
|
}
|
||||||
|
None => false,
|
||||||
|
};
|
||||||
|
|
||||||
|
if authorized {
|
||||||
|
next.run(request).await
|
||||||
|
} else {
|
||||||
|
(
|
||||||
|
StatusCode::UNAUTHORIZED,
|
||||||
|
[("www-authenticate", "Bearer")],
|
||||||
|
Json(serde_json::json!({"error": "missing or invalid api key"})),
|
||||||
|
)
|
||||||
|
.into_response()
|
||||||
}
|
}
|
||||||
|
|
||||||
app.with_state(state)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
#[derive(Deserialize)]
|
||||||
@ -216,9 +356,7 @@ async fn search(
|
|||||||
State(state): State<Arc<ServerState>>,
|
State(state): State<Arc<ServerState>>,
|
||||||
Query(query): Query<SearchQueryParams>,
|
Query(query): Query<SearchQueryParams>,
|
||||||
) -> Result<Json<SearchResponse>, AppError> {
|
) -> Result<Json<SearchResponse>, AppError> {
|
||||||
let mut builder = Search::builder()
|
let mut builder = Search::builder().query(&query.query).limit(query.limit);
|
||||||
.query(&query.query)
|
|
||||||
.limit(query.limit);
|
|
||||||
if let Some(user_id) = query.user_id {
|
if let Some(user_id) = query.user_id {
|
||||||
builder = builder.for_user(user_id);
|
builder = builder.for_user(user_id);
|
||||||
}
|
}
|
||||||
@ -262,56 +400,13 @@ async fn health(
|
|||||||
) -> Result<Json<HealthResponse>, AppError> {
|
) -> Result<Json<HealthResponse>, AppError> {
|
||||||
let region = query.get("region").map(|s| s.as_str());
|
let region = query.get("region").map(|s| s.as_str());
|
||||||
let items = state.item_count(region).map_err(AppError)?;
|
let items = state.item_count(region).map_err(AppError)?;
|
||||||
let mode = if state.is_cluster() {
|
|
||||||
"cluster"
|
|
||||||
} else {
|
|
||||||
"standalone"
|
|
||||||
};
|
|
||||||
Ok(Json(HealthResponse {
|
Ok(Json(HealthResponse {
|
||||||
status: "ok",
|
status: "ok",
|
||||||
mode,
|
mode: "standalone",
|
||||||
items,
|
items,
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn cluster_status(
|
|
||||||
State(state): State<Arc<ServerState>>,
|
|
||||||
) -> Result<Json<ClusterHealth>, AppError> {
|
|
||||||
state
|
|
||||||
.cluster_status()
|
|
||||||
.map(Json)
|
|
||||||
.ok_or_else(|| AppError(ServerError::BadRequest("not in cluster mode".into())))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct RegionCommand {
|
|
||||||
region: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn promote_leader(
|
|
||||||
State(state): State<Arc<ServerState>>,
|
|
||||||
Json(cmd): Json<RegionCommand>,
|
|
||||||
) -> Result<StatusCode, AppError> {
|
|
||||||
state.promote_leader(&cmd.region).map_err(AppError)?;
|
|
||||||
Ok(StatusCode::NO_CONTENT)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn partition_region(
|
|
||||||
State(state): State<Arc<ServerState>>,
|
|
||||||
Json(cmd): Json<RegionCommand>,
|
|
||||||
) -> Result<StatusCode, AppError> {
|
|
||||||
state.partition_region(&cmd.region).map_err(AppError)?;
|
|
||||||
Ok(StatusCode::NO_CONTENT)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn heal_region(
|
|
||||||
State(state): State<Arc<ServerState>>,
|
|
||||||
Json(cmd): Json<RegionCommand>,
|
|
||||||
) -> Result<StatusCode, AppError> {
|
|
||||||
state.heal_region(&cmd.region).map_err(AppError)?;
|
|
||||||
Ok(StatusCode::NO_CONTENT)
|
|
||||||
}
|
|
||||||
|
|
||||||
struct TidalErrorWrapper(tidaldb::TidalError);
|
struct TidalErrorWrapper(tidaldb::TidalError);
|
||||||
|
|
||||||
impl From<TidalErrorWrapper> for AppError {
|
impl From<TidalErrorWrapper> for AppError {
|
||||||
@ -334,9 +429,7 @@ impl IntoResponse for AppError {
|
|||||||
|
|
||||||
fn status_from_error(err: &ServerError) -> StatusCode {
|
fn status_from_error(err: &ServerError) -> StatusCode {
|
||||||
match err {
|
match err {
|
||||||
ServerError::BadRequest(_)
|
ServerError::BadRequest(_) | ServerError::SchemaConfig(_) => StatusCode::BAD_REQUEST,
|
||||||
| ServerError::SchemaConfig(_)
|
|
||||||
| ServerError::ClusterConfig(_) => StatusCode::BAD_REQUEST,
|
|
||||||
ServerError::Tidal(tidal_err) => match tidal_err {
|
ServerError::Tidal(tidal_err) => match tidal_err {
|
||||||
tidaldb::TidalError::NotFound { .. } => StatusCode::NOT_FOUND,
|
tidaldb::TidalError::NotFound { .. } => StatusCode::NOT_FOUND,
|
||||||
tidaldb::TidalError::Schema(_) | tidaldb::TidalError::InvalidInput(_) => {
|
tidaldb::TidalError::Schema(_) | tidaldb::TidalError::InvalidInput(_) => {
|
||||||
|
|||||||
@ -1,111 +1,35 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use tidaldb::TidalDb;
|
||||||
use tidaldb::query::retrieve::Retrieve;
|
use tidaldb::query::retrieve::Retrieve;
|
||||||
use tidaldb::query::search::Search;
|
use tidaldb::query::search::Search;
|
||||||
use tidaldb::schema::{EntityId, Schema};
|
use tidaldb::schema::EntityId;
|
||||||
use tidaldb::replication::RegionId;
|
|
||||||
use tidaldb::testing::cluster::SimulatedCluster;
|
|
||||||
use tidaldb::TidalDb;
|
|
||||||
|
|
||||||
use crate::config::ClusterLayout;
|
|
||||||
use crate::error::{Result, ServerError};
|
use crate::error::{Result, ServerError};
|
||||||
|
|
||||||
|
/// Return an error if a region name was provided.
|
||||||
|
///
|
||||||
|
/// Region routing is a cluster-mode concept. In standalone mode any non-None
|
||||||
|
/// region param is a client mistake; return a clear error rather than silently
|
||||||
|
/// ignoring it.
|
||||||
|
fn ensure_standalone(region_name: Option<&str>) -> Result<()> {
|
||||||
|
if region_name.is_some() {
|
||||||
|
return Err(ServerError::BadRequest(
|
||||||
|
"region routing requires cluster mode; this server runs standalone".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct ServerState {
|
pub struct ServerState {
|
||||||
mode: Mode,
|
db: Arc<TidalDb>,
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
enum Mode {
|
|
||||||
Standalone(Arc<TidalDb>),
|
|
||||||
Cluster(ClusterState),
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
struct ClusterState {
|
|
||||||
cluster: Arc<SimulatedCluster>,
|
|
||||||
regions: RegionDirectory,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
struct RegionDirectory {
|
|
||||||
#[allow(dead_code)]
|
|
||||||
default_region: RegionId,
|
|
||||||
name_to_id: HashMap<String, RegionId>,
|
|
||||||
id_to_name: HashMap<u16, String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, serde::Serialize)]
|
|
||||||
pub struct ClusterHealth {
|
|
||||||
pub leader: String,
|
|
||||||
pub relay_log_len: u64,
|
|
||||||
pub regions: Vec<RegionStatus>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, serde::Serialize)]
|
|
||||||
pub struct RegionStatus {
|
|
||||||
pub name: String,
|
|
||||||
pub applied_events: u64,
|
|
||||||
pub lag_events: i64,
|
|
||||||
pub partitioned: bool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ServerState {
|
impl ServerState {
|
||||||
pub fn standalone(db: TidalDb) -> Self {
|
pub fn new(db: TidalDb) -> Self {
|
||||||
Self {
|
Self { db: Arc::new(db) }
|
||||||
mode: Mode::Standalone(Arc::new(db)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn cluster(schema: Schema, layout: ClusterLayout) -> Result<Self> {
|
|
||||||
use tidaldb::testing::cluster::ClusterConfig;
|
|
||||||
|
|
||||||
let mut regions = Vec::new();
|
|
||||||
for (idx, name) in layout.regions.iter().enumerate() {
|
|
||||||
regions.push((RegionId(idx as u16), name.clone()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let leader_id = regions
|
|
||||||
.iter()
|
|
||||||
.find(|(_, name)| name.eq_ignore_ascii_case(&layout.leader))
|
|
||||||
.map(|(id, _)| *id)
|
|
||||||
.ok_or_else(|| {
|
|
||||||
ServerError::ClusterConfig(format!(
|
|
||||||
"leader '{}' not found in region list",
|
|
||||||
layout.leader
|
|
||||||
))
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let config = ClusterConfig {
|
|
||||||
regions: regions.iter().map(|(id, _)| *id).collect(),
|
|
||||||
leader_region: leader_id,
|
|
||||||
schema,
|
|
||||||
};
|
|
||||||
let cluster = Arc::new(SimulatedCluster::build(config));
|
|
||||||
let mut name_to_id = HashMap::new();
|
|
||||||
let mut id_to_name = HashMap::new();
|
|
||||||
for (id, name) in ®ions {
|
|
||||||
name_to_id.insert(name.to_lowercase(), *id);
|
|
||||||
id_to_name.insert(id.0, name.clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
let directory = RegionDirectory {
|
|
||||||
default_region: leader_id,
|
|
||||||
name_to_id,
|
|
||||||
id_to_name,
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(Self {
|
|
||||||
mode: Mode::Cluster(ClusterState {
|
|
||||||
cluster,
|
|
||||||
regions: directory,
|
|
||||||
}),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn is_cluster(&self) -> bool {
|
|
||||||
matches!(self.mode, Mode::Cluster(_))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn write_item(
|
pub fn write_item(
|
||||||
@ -113,24 +37,15 @@ impl ServerState {
|
|||||||
entity_id: EntityId,
|
entity_id: EntityId,
|
||||||
metadata: &HashMap<String, String>,
|
metadata: &HashMap<String, String>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
match &self.mode {
|
self.db
|
||||||
Mode::Standalone(db) => db
|
.write_item_with_metadata(entity_id, metadata)
|
||||||
.write_item_with_metadata(entity_id, metadata)
|
.map_err(ServerError::from)
|
||||||
.map_err(ServerError::from),
|
|
||||||
Mode::Cluster(cluster) => cluster
|
|
||||||
.cluster
|
|
||||||
.write_item_with_metadata(entity_id, metadata)
|
|
||||||
.map_err(ServerError::from),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn write_embedding(&self, entity_id: EntityId, embedding: &[f32]) -> Result<()> {
|
pub fn write_embedding(&self, entity_id: EntityId, embedding: &[f32]) -> Result<()> {
|
||||||
match &self.mode {
|
self.db
|
||||||
Mode::Standalone(db) => db
|
.write_item_embedding(entity_id, embedding)
|
||||||
.write_item_embedding(entity_id, embedding)
|
.map_err(ServerError::from)
|
||||||
.map_err(ServerError::from),
|
|
||||||
Mode::Cluster(cluster) => cluster.cluster.write_item_embedding(entity_id, embedding).map_err(ServerError::from),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn signal(
|
pub fn signal(
|
||||||
@ -141,37 +56,26 @@ impl ServerState {
|
|||||||
user_id: Option<u64>,
|
user_id: Option<u64>,
|
||||||
creator_id: Option<u64>,
|
creator_id: Option<u64>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
match &self.mode {
|
if user_id.is_some() || creator_id.is_some() {
|
||||||
Mode::Standalone(db) => {
|
self.db
|
||||||
if user_id.is_some() || creator_id.is_some() {
|
.signal_with_context(
|
||||||
db.signal_with_context(
|
signal_name,
|
||||||
signal_name,
|
entity_id,
|
||||||
entity_id,
|
weight,
|
||||||
weight,
|
tidaldb::schema::Timestamp::now(),
|
||||||
tidaldb::schema::Timestamp::now(),
|
user_id,
|
||||||
user_id,
|
creator_id,
|
||||||
creator_id,
|
)
|
||||||
)
|
.map_err(ServerError::from)
|
||||||
.map_err(ServerError::from)
|
} else {
|
||||||
} else {
|
self.db
|
||||||
db.signal(
|
.signal(
|
||||||
signal_name,
|
signal_name,
|
||||||
entity_id,
|
entity_id,
|
||||||
weight,
|
weight,
|
||||||
tidaldb::schema::Timestamp::now(),
|
tidaldb::schema::Timestamp::now(),
|
||||||
)
|
)
|
||||||
.map_err(ServerError::from)
|
.map_err(ServerError::from)
|
||||||
}
|
|
||||||
}
|
|
||||||
Mode::Cluster(cluster) => {
|
|
||||||
if user_id.is_some() || creator_id.is_some() {
|
|
||||||
return Err(ServerError::BadRequest(
|
|
||||||
"cluster mode currently supports only global signals (no user_id/creator_id)".into(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
cluster.cluster.write_signal(signal_name, entity_id, weight).map_err(ServerError::from)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -180,34 +84,19 @@ impl ServerState {
|
|||||||
region_name: Option<&str>,
|
region_name: Option<&str>,
|
||||||
query: &Retrieve,
|
query: &Retrieve,
|
||||||
) -> Result<tidaldb::query::retrieve::Results> {
|
) -> Result<tidaldb::query::retrieve::Results> {
|
||||||
match &self.mode {
|
ensure_standalone(region_name)?;
|
||||||
Mode::Standalone(db) => db.retrieve(query).map_err(ServerError::from),
|
self.db.retrieve(query).map_err(ServerError::from)
|
||||||
Mode::Cluster(cluster) => {
|
|
||||||
let region = cluster.resolve_region(region_name)?;
|
|
||||||
cluster.cluster.retrieve(region, query).map_err(ServerError::from)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Reload the Tantivy text index reader so the next search sees recently
|
/// Reload the Tantivy text index reader so the next search sees recently
|
||||||
/// committed documents.
|
/// committed documents.
|
||||||
///
|
///
|
||||||
/// On-disk indexes auto-reload via `OnCommitWithDelay`; ephemeral indexes
|
/// On-disk indexes auto-reload via `OnCommitWithDelay`; ephemeral indexes
|
||||||
/// (the default for standalone and cluster modes) use `ReloadPolicy::Manual`
|
/// (the default for standalone mode) use `ReloadPolicy::Manual` and require
|
||||||
/// and require an explicit reload before each search.
|
/// an explicit reload before each search.
|
||||||
pub fn reload_text_index(&self, region_name: Option<&str>) -> Result<()> {
|
pub fn reload_text_index(&self, region_name: Option<&str>) -> Result<()> {
|
||||||
match &self.mode {
|
ensure_standalone(region_name)?;
|
||||||
Mode::Standalone(db) => db.reload_text_index().map_err(ServerError::from),
|
self.db.reload_text_index().map_err(ServerError::from)
|
||||||
Mode::Cluster(cluster) => {
|
|
||||||
let region = cluster.resolve_region(region_name)?;
|
|
||||||
cluster
|
|
||||||
.cluster
|
|
||||||
.node(region)
|
|
||||||
.db
|
|
||||||
.reload_text_index()
|
|
||||||
.map_err(ServerError::from)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn search(
|
pub fn search(
|
||||||
@ -215,129 +104,12 @@ impl ServerState {
|
|||||||
region_name: Option<&str>,
|
region_name: Option<&str>,
|
||||||
query: &Search,
|
query: &Search,
|
||||||
) -> Result<tidaldb::query::search::SearchResults> {
|
) -> Result<tidaldb::query::search::SearchResults> {
|
||||||
match &self.mode {
|
ensure_standalone(region_name)?;
|
||||||
Mode::Standalone(db) => db.search(query).map_err(ServerError::from),
|
self.db.search(query).map_err(ServerError::from)
|
||||||
Mode::Cluster(cluster) => {
|
|
||||||
let region = cluster.resolve_region(region_name)?;
|
|
||||||
cluster.cluster.search(region, query).map_err(ServerError::from)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn item_count(&self, region_name: Option<&str>) -> Result<u64> {
|
pub fn item_count(&self, region_name: Option<&str>) -> Result<u64> {
|
||||||
match &self.mode {
|
ensure_standalone(region_name)?;
|
||||||
Mode::Standalone(db) => Ok(db.item_count()),
|
Ok(self.db.item_count())
|
||||||
Mode::Cluster(cluster) => {
|
|
||||||
let region = cluster.resolve_region(region_name)?;
|
|
||||||
Ok(cluster.cluster.item_count(region))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn cluster_status(&self) -> Option<ClusterHealth> {
|
|
||||||
match &self.mode {
|
|
||||||
Mode::Cluster(cluster) => {
|
|
||||||
let leader_id = cluster.cluster.leader_region();
|
|
||||||
let leader_name = cluster
|
|
||||||
.regions
|
|
||||||
.id_to_name
|
|
||||||
.get(&leader_id.0)
|
|
||||||
.cloned()
|
|
||||||
.unwrap_or_else(|| format!("region-{}", leader_id.0));
|
|
||||||
let leader_seqno = cluster.cluster.leader_seqno();
|
|
||||||
let statuses = cluster
|
|
||||||
.regions
|
|
||||||
.id_to_name
|
|
||||||
.iter()
|
|
||||||
.map(|(id, name)| {
|
|
||||||
let rid = RegionId(*id);
|
|
||||||
let applied = cluster.cluster.applied_count(rid);
|
|
||||||
// The leader writes directly (no receiver thread), so its
|
|
||||||
// applied_seqno via replication is always 0. Report 0 lag
|
|
||||||
// for the leader since it is the authoritative source.
|
|
||||||
let lag = if rid == leader_id {
|
|
||||||
0
|
|
||||||
} else {
|
|
||||||
(leader_seqno as i64 - applied as i64).max(0)
|
|
||||||
};
|
|
||||||
RegionStatus {
|
|
||||||
name: name.clone(),
|
|
||||||
applied_events: applied,
|
|
||||||
lag_events: lag,
|
|
||||||
partitioned: cluster.cluster.is_partitioned(rid),
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
Some(ClusterHealth {
|
|
||||||
leader: leader_name,
|
|
||||||
relay_log_len: cluster.cluster.relay_log_len(),
|
|
||||||
regions: statuses,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn promote_leader(&self, region_name: &str) -> Result<()> {
|
|
||||||
match &self.mode {
|
|
||||||
Mode::Cluster(cluster) => {
|
|
||||||
let region = cluster.regions.lookup(region_name).ok_or_else(|| {
|
|
||||||
ServerError::BadRequest(format!("unknown region '{region_name}'"))
|
|
||||||
})?;
|
|
||||||
cluster.cluster.promote_leader(region);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
_ => Err(ServerError::BadRequest(
|
|
||||||
"leader promotion only supported in cluster mode".into(),
|
|
||||||
)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn partition_region(&self, region_name: &str) -> Result<()> {
|
|
||||||
match &self.mode {
|
|
||||||
Mode::Cluster(cluster) => {
|
|
||||||
let region = cluster.regions.lookup(region_name).ok_or_else(|| {
|
|
||||||
ServerError::BadRequest(format!("unknown region '{region_name}'"))
|
|
||||||
})?;
|
|
||||||
cluster.cluster.partition_region(region);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
_ => Err(ServerError::BadRequest(
|
|
||||||
"partitions only supported in cluster mode".into(),
|
|
||||||
)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn heal_region(&self, region_name: &str) -> Result<()> {
|
|
||||||
match &self.mode {
|
|
||||||
Mode::Cluster(cluster) => {
|
|
||||||
let region = cluster.regions.lookup(region_name).ok_or_else(|| {
|
|
||||||
ServerError::BadRequest(format!("unknown region '{region_name}'"))
|
|
||||||
})?;
|
|
||||||
cluster.cluster.heal_region(region);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
_ => Err(ServerError::BadRequest(
|
|
||||||
"partitions only supported in cluster mode".into(),
|
|
||||||
)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ClusterState {
|
|
||||||
fn resolve_region(&self, name: Option<&str>) -> Result<RegionId> {
|
|
||||||
if let Some(name) = name {
|
|
||||||
self.regions
|
|
||||||
.lookup(name)
|
|
||||||
.ok_or_else(|| ServerError::BadRequest(format!("unknown region '{name}'")))
|
|
||||||
} else {
|
|
||||||
Ok(self.cluster.leader_region())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl RegionDirectory {
|
|
||||||
fn lookup(&self, name: &str) -> Option<RegionId> {
|
|
||||||
self.name_to_id.get(&name.trim().to_lowercase()).copied()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
209
tidal-server/tests/middleware.rs
Normal file
209
tidal-server/tests/middleware.rs
Normal file
@ -0,0 +1,209 @@
|
|||||||
|
//! Integration tests for the tidal-server middleware stack.
|
||||||
|
//!
|
||||||
|
//! Each test spins up an in-process router using `tower::ServiceExt::oneshot`
|
||||||
|
//! — no TCP port binding required — and verifies middleware behaviors:
|
||||||
|
//! body limits, authentication, and request-ID propagation.
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use axum::body::Body;
|
||||||
|
use axum::http::{Method, Request, StatusCode};
|
||||||
|
use tidal_server::{router::build_router, state::ServerState};
|
||||||
|
use tidaldb::TidalDb;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
|
||||||
|
// ── Test helpers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn make_state() -> Arc<ServerState> {
|
||||||
|
// Use the default schema so ranking profiles are available — same setup as
|
||||||
|
// `run_standalone` in main.rs.
|
||||||
|
let schema = tidal_server::config::load_schema(None).unwrap();
|
||||||
|
let db = TidalDb::builder()
|
||||||
|
.ephemeral()
|
||||||
|
.with_schema(schema)
|
||||||
|
.open()
|
||||||
|
.unwrap();
|
||||||
|
Arc::new(ServerState::new(db))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_app(api_key: Option<&str>) -> axum::Router {
|
||||||
|
let key = api_key.map(|k| Arc::from(k));
|
||||||
|
build_router(make_state(), key)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Auth tests ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// /health is public — no Bearer token required.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn health_no_auth_required() {
|
||||||
|
let app = make_app(Some("secret"));
|
||||||
|
let response = app
|
||||||
|
.oneshot(
|
||||||
|
Request::builder()
|
||||||
|
.method(Method::GET)
|
||||||
|
.uri("/health")
|
||||||
|
.body(Body::empty())
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(response.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Protected routes (e.g. /feed) require a Bearer token when a key is set.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn protected_route_requires_auth() {
|
||||||
|
let app = make_app(Some("secret"));
|
||||||
|
let response = app
|
||||||
|
.oneshot(
|
||||||
|
Request::builder()
|
||||||
|
.method(Method::GET)
|
||||||
|
.uri("/feed")
|
||||||
|
.body(Body::empty())
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A valid Bearer token grants access to protected routes.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn protected_route_accepts_valid_key() {
|
||||||
|
let app = make_app(Some("secret"));
|
||||||
|
let response = app
|
||||||
|
.oneshot(
|
||||||
|
Request::builder()
|
||||||
|
.method(Method::GET)
|
||||||
|
.uri("/feed")
|
||||||
|
.header("Authorization", "Bearer secret")
|
||||||
|
.body(Body::empty())
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(response.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// RFC 7235: auth scheme is case-insensitive — `bearer` should work like `Bearer`.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn lowercase_bearer_accepted() {
|
||||||
|
let app = make_app(Some("secret"));
|
||||||
|
let response = app
|
||||||
|
.oneshot(
|
||||||
|
Request::builder()
|
||||||
|
.method(Method::GET)
|
||||||
|
.uri("/feed")
|
||||||
|
.header("Authorization", "bearer secret")
|
||||||
|
.body(Body::empty())
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(response.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An incorrect Bearer token is rejected with 401.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn wrong_key_rejected() {
|
||||||
|
let app = make_app(Some("secret"));
|
||||||
|
let response = app
|
||||||
|
.oneshot(
|
||||||
|
Request::builder()
|
||||||
|
.method(Method::GET)
|
||||||
|
.uri("/feed")
|
||||||
|
.header("Authorization", "Bearer wrong")
|
||||||
|
.body(Body::empty())
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// When no API key is configured, all routes are accessible without auth.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn no_auth_mode_allows_all_routes() {
|
||||||
|
let app = make_app(None);
|
||||||
|
let response = app
|
||||||
|
.oneshot(
|
||||||
|
Request::builder()
|
||||||
|
.method(Method::GET)
|
||||||
|
.uri("/feed")
|
||||||
|
.body(Body::empty())
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(response.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Middleware behavior tests ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Bodies exceeding 2MB on protected routes are rejected with 413.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn body_too_large_returns_413() {
|
||||||
|
let app = make_app(Some("secret"));
|
||||||
|
// 3 MB exceeds the 2 MB BODY_LIMIT_BYTES constant in router.rs.
|
||||||
|
let big_body = vec![0u8; 3 * 1024 * 1024];
|
||||||
|
let response = app
|
||||||
|
.oneshot(
|
||||||
|
Request::builder()
|
||||||
|
.method(Method::POST)
|
||||||
|
.uri("/embeddings")
|
||||||
|
.header("Authorization", "Bearer secret")
|
||||||
|
.header("Content-Type", "application/json")
|
||||||
|
.body(Body::from(big_body))
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// x-request-id is present in every response and increments per request.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn x_request_id_increments() {
|
||||||
|
// Both calls share the same App (and therefore the same AtomicU64 counter).
|
||||||
|
let app = make_app(None);
|
||||||
|
let r1 = app
|
||||||
|
.clone()
|
||||||
|
.oneshot(
|
||||||
|
Request::builder()
|
||||||
|
.method(Method::GET)
|
||||||
|
.uri("/health")
|
||||||
|
.body(Body::empty())
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let r2 = app
|
||||||
|
.oneshot(
|
||||||
|
Request::builder()
|
||||||
|
.method(Method::GET)
|
||||||
|
.uri("/health")
|
||||||
|
.body(Body::empty())
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let id1: u64 = r1
|
||||||
|
.headers()
|
||||||
|
.get("x-request-id")
|
||||||
|
.expect("x-request-id missing from first response")
|
||||||
|
.to_str()
|
||||||
|
.unwrap()
|
||||||
|
.parse()
|
||||||
|
.unwrap();
|
||||||
|
let id2: u64 = r2
|
||||||
|
.headers()
|
||||||
|
.get("x-request-id")
|
||||||
|
.expect("x-request-id missing from second response")
|
||||||
|
.to_str()
|
||||||
|
.unwrap()
|
||||||
|
.parse()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert!(id2 > id1, "x-request-id should increase: {id1} then {id2}");
|
||||||
|
}
|
||||||
@ -189,6 +189,10 @@ impl super::TidalDb {
|
|||||||
})?;
|
})?;
|
||||||
|
|
||||||
// 4. Checkpoint signal ledger synchronously.
|
// 4. Checkpoint signal ledger synchronously.
|
||||||
|
//
|
||||||
|
// This is fatal: if the checkpoint fails, WAL replay on restore would
|
||||||
|
// double-count signals (replay re-applies events that were never
|
||||||
|
// persisted to the checkpoint keyspace).
|
||||||
let seq = self.last_wal_seq.load(Ordering::Acquire);
|
let seq = self.last_wal_seq.load(Ordering::Acquire);
|
||||||
if let (Some(ledger), Some(storage)) = (&self.ledger, &self.storage) {
|
if let (Some(ledger), Some(storage)) = (&self.ledger, &self.storage) {
|
||||||
let meta = crate::signals::checkpoint::CheckpointMeta {
|
let meta = crate::signals::checkpoint::CheckpointMeta {
|
||||||
@ -196,18 +200,22 @@ impl super::TidalDb {
|
|||||||
wal_sequence: seq,
|
wal_sequence: seq,
|
||||||
payload_hash: [0u8; 32],
|
payload_hash: [0u8; 32],
|
||||||
};
|
};
|
||||||
if let Err(e) = ledger.checkpoint(storage.items_engine(), meta) {
|
ledger
|
||||||
tracing::warn!(error = %e, "backup: signal checkpoint failed (non-fatal)");
|
.checkpoint(storage.items_engine(), meta)
|
||||||
}
|
.map_err(|e| {
|
||||||
|
TidalError::Durability(DurabilityError {
|
||||||
|
message: format!("backup: signal checkpoint failed: {e}"),
|
||||||
|
})
|
||||||
|
})?;
|
||||||
|
|
||||||
// Checkpoint cohort signal state if any entries exist.
|
// Cohort and co-engagement checkpoints are secondary state; log
|
||||||
|
// and continue rather than aborting the backup if they fail.
|
||||||
if self.cohort_ledger.entry_count() > 0
|
if self.cohort_ledger.entry_count() > 0
|
||||||
&& let Err(e) = self.cohort_ledger.checkpoint(storage.items_engine(), meta)
|
&& let Err(e) = self.cohort_ledger.checkpoint(storage.items_engine(), meta)
|
||||||
{
|
{
|
||||||
tracing::warn!(error = %e, "backup: cohort checkpoint failed (non-fatal)");
|
tracing::warn!(error = %e, "backup: cohort checkpoint failed (non-fatal)");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Checkpoint co-engagement edges if any exist.
|
|
||||||
if self.co_engagement.edge_count() > 0
|
if self.co_engagement.edge_count() > 0
|
||||||
&& let Err(e) = self.co_engagement.checkpoint(storage.items_engine())
|
&& let Err(e) = self.co_engagement.checkpoint(storage.items_engine())
|
||||||
{
|
{
|
||||||
@ -215,7 +223,8 @@ impl super::TidalDb {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 5. Flush text indexes.
|
// 5. Flush text indexes (non-fatal: search indexes are rebuilt on
|
||||||
|
// demand; their absence does not affect data integrity).
|
||||||
if let Err(e) = self.flush_text_index() {
|
if let Err(e) = self.flush_text_index() {
|
||||||
tracing::warn!(error = %e, "backup: text index flush failed (non-fatal)");
|
tracing::warn!(error = %e, "backup: text index flush failed (non-fatal)");
|
||||||
}
|
}
|
||||||
@ -224,10 +233,16 @@ impl super::TidalDb {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 6. Flush fjall storage (all keyspaces).
|
// 6. Flush fjall storage (all keyspaces).
|
||||||
if let Some(storage) = &self.storage
|
//
|
||||||
&& let Err(e) = storage.flush()
|
// This is fatal: if the memtable cannot be flushed, the backup
|
||||||
{
|
// directory will be missing recently written data. Callers can
|
||||||
tracing::warn!(error = %e, "backup: storage flush failed (non-fatal)");
|
// retry once disk pressure subsides.
|
||||||
|
if let Some(storage) = &self.storage {
|
||||||
|
storage.flush().map_err(|e| {
|
||||||
|
TidalError::Durability(DurabilityError {
|
||||||
|
message: format!("backup: storage flush failed: {e}"),
|
||||||
|
})
|
||||||
|
})?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 7. Copy data directory to destination.
|
// 7. Copy data directory to destination.
|
||||||
|
|||||||
@ -10,6 +10,15 @@ use std::sync::atomic::{AtomicU64, Ordering};
|
|||||||
#[cfg(feature = "metrics")]
|
#[cfg(feature = "metrics")]
|
||||||
pub const WRITE_LATENCY_BOUNDS: &[u64] = &[1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000, 10000];
|
pub const WRITE_LATENCY_BOUNDS: &[u64] = &[1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000, 10000];
|
||||||
|
|
||||||
|
/// Bucket boundaries for query latency (retrieve + search), in microseconds.
|
||||||
|
///
|
||||||
|
/// Range: 100µs – 10s. Retrieve and search are 10x–100x slower than signal
|
||||||
|
/// writes so the buckets are wider and cover cold-start and degraded conditions.
|
||||||
|
#[cfg(feature = "metrics")]
|
||||||
|
pub const QUERY_LATENCY_BOUNDS: &[u64] = &[
|
||||||
|
100, 500, 1_000, 5_000, 10_000, 50_000, 100_000, 500_000, 1_000_000, 5_000_000, 10_000_000,
|
||||||
|
];
|
||||||
|
|
||||||
/// A Prometheus-compatible cumulative histogram.
|
/// A Prometheus-compatible cumulative histogram.
|
||||||
///
|
///
|
||||||
/// Each bucket count represents the number of observations less than or equal
|
/// Each bucket count represents the number of observations less than or equal
|
||||||
|
|||||||
@ -11,7 +11,9 @@
|
|||||||
|
|
||||||
pub(crate) mod histogram;
|
pub(crate) mod histogram;
|
||||||
#[cfg(feature = "metrics")]
|
#[cfg(feature = "metrics")]
|
||||||
pub(crate) use histogram::{LatencyHistogram, WRITE_LATENCY_BOUNDS, write_metric_line};
|
pub(crate) use histogram::{
|
||||||
|
LatencyHistogram, QUERY_LATENCY_BOUNDS, WRITE_LATENCY_BOUNDS, write_metric_line,
|
||||||
|
};
|
||||||
|
|
||||||
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
@ -64,6 +66,14 @@ pub struct MetricsState {
|
|||||||
#[cfg(feature = "metrics")]
|
#[cfg(feature = "metrics")]
|
||||||
pub(crate) signal_write_latency: LatencyHistogram,
|
pub(crate) signal_write_latency: LatencyHistogram,
|
||||||
|
|
||||||
|
/// Retrieve query end-to-end latency histogram (microseconds).
|
||||||
|
#[cfg(feature = "metrics")]
|
||||||
|
pub(crate) retrieve_latency: LatencyHistogram,
|
||||||
|
|
||||||
|
/// Search query end-to-end latency histogram (microseconds).
|
||||||
|
#[cfg(feature = "metrics")]
|
||||||
|
pub(crate) search_latency: LatencyHistogram,
|
||||||
|
|
||||||
// ── Session + cohort + degradation metrics (m7p4, task-04) ─────────
|
// ── Session + cohort + degradation metrics (m7p4, task-04) ─────────
|
||||||
/// Number of currently active sessions.
|
/// Number of currently active sessions.
|
||||||
#[cfg(feature = "metrics")]
|
#[cfg(feature = "metrics")]
|
||||||
@ -129,6 +139,10 @@ impl MetricsState {
|
|||||||
#[cfg(feature = "metrics")]
|
#[cfg(feature = "metrics")]
|
||||||
signal_write_latency: LatencyHistogram::new(WRITE_LATENCY_BOUNDS),
|
signal_write_latency: LatencyHistogram::new(WRITE_LATENCY_BOUNDS),
|
||||||
#[cfg(feature = "metrics")]
|
#[cfg(feature = "metrics")]
|
||||||
|
retrieve_latency: LatencyHistogram::new(QUERY_LATENCY_BOUNDS),
|
||||||
|
#[cfg(feature = "metrics")]
|
||||||
|
search_latency: LatencyHistogram::new(QUERY_LATENCY_BOUNDS),
|
||||||
|
#[cfg(feature = "metrics")]
|
||||||
active_sessions: AtomicU64::new(0),
|
active_sessions: AtomicU64::new(0),
|
||||||
#[cfg(feature = "metrics")]
|
#[cfg(feature = "metrics")]
|
||||||
closed_sessions_total: AtomicU64::new(0),
|
closed_sessions_total: AtomicU64::new(0),
|
||||||
@ -262,6 +276,16 @@ impl MetricsState {
|
|||||||
"Signal write latency in microseconds",
|
"Signal write latency in microseconds",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
out.push_str(&self.retrieve_latency.render_prometheus(
|
||||||
|
"tidaldb_retrieve_latency_us",
|
||||||
|
"Retrieve query end-to-end latency in microseconds",
|
||||||
|
));
|
||||||
|
|
||||||
|
out.push_str(&self.search_latency.render_prometheus(
|
||||||
|
"tidaldb_search_latency_us",
|
||||||
|
"Search query end-to-end latency in microseconds",
|
||||||
|
));
|
||||||
|
|
||||||
// Index health metrics.
|
// Index health metrics.
|
||||||
write_metric_line(
|
write_metric_line(
|
||||||
&mut out,
|
&mut out,
|
||||||
|
|||||||
@ -38,6 +38,10 @@ impl TidalDb {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Record query latency from this point forward.
|
||||||
|
#[cfg(feature = "metrics")]
|
||||||
|
let query_start = std::time::Instant::now();
|
||||||
|
|
||||||
// Enter the load detector. The guard decrements the in-flight counter
|
// Enter the load detector. The guard decrements the in-flight counter
|
||||||
// on drop (method return, success or error).
|
// on drop (method return, success or error).
|
||||||
let (degradation_level, _inflight_guard) = self.load_detector.enter();
|
let (degradation_level, _inflight_guard) = self.load_detector.enter();
|
||||||
@ -118,7 +122,13 @@ impl TidalDb {
|
|||||||
base_executor
|
base_executor
|
||||||
};
|
};
|
||||||
|
|
||||||
executor.execute(query).map_err(TidalError::from)
|
let result = executor.execute(query).map_err(TidalError::from);
|
||||||
|
#[cfg(feature = "metrics")]
|
||||||
|
{
|
||||||
|
let elapsed_us = query_start.elapsed().as_micros() as u64;
|
||||||
|
self.metrics.retrieve_latency.observe(elapsed_us);
|
||||||
|
}
|
||||||
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Execute a SEARCH query -- text and/or vector retrieval with RRF fusion,
|
/// Execute a SEARCH query -- text and/or vector retrieval with RRF fusion,
|
||||||
@ -141,6 +151,10 @@ impl TidalDb {
|
|||||||
#[allow(clippy::too_many_lines)]
|
#[allow(clippy::too_many_lines)]
|
||||||
#[tracing::instrument(skip_all, fields(query = ?query.query_text, kind = ?query.entity_kind))]
|
#[tracing::instrument(skip_all, fields(query = ?query.query_text, kind = ?query.entity_kind))]
|
||||||
pub fn search(&self, query: &Search) -> crate::Result<SearchResults> {
|
pub fn search(&self, query: &Search) -> crate::Result<SearchResults> {
|
||||||
|
// Record query latency from this point forward.
|
||||||
|
#[cfg(feature = "metrics")]
|
||||||
|
let query_start = std::time::Instant::now();
|
||||||
|
|
||||||
// Guard against oversized filter expressions.
|
// Guard against oversized filter expressions.
|
||||||
const MAX_FILTER_NODES: usize = 256;
|
const MAX_FILTER_NODES: usize = 256;
|
||||||
if let Some(ref filter) = query.combined_filter() {
|
if let Some(ref filter) = query.combined_filter() {
|
||||||
@ -260,7 +274,15 @@ impl TidalDb {
|
|||||||
base_executor
|
base_executor
|
||||||
};
|
};
|
||||||
|
|
||||||
let result = executor.execute(query).map_err(TidalError::from)?;
|
let result = executor.execute(query).map_err(TidalError::from);
|
||||||
|
|
||||||
|
#[cfg(feature = "metrics")]
|
||||||
|
{
|
||||||
|
let elapsed_us = query_start.elapsed().as_micros() as u64;
|
||||||
|
self.metrics.search_latency.observe(elapsed_us);
|
||||||
|
}
|
||||||
|
|
||||||
|
let result = result?;
|
||||||
|
|
||||||
// M6p5: record query text for trending autocomplete suggestions.
|
// M6p5: record query text for trending autocomplete suggestions.
|
||||||
if let Some(ref text) = query.query_text {
|
if let Some(ref text) = query.query_text {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user