//! Runtime metrics for tidalDB.
//!
//! [`MetricsState`] is an `Arc`-shared bag of atomics that `TidalDb` updates
//! on every operation. The metrics HTTP server (when the `metrics` feature
//! is enabled) reads from this shared state to serve Prometheus text format.
//!
//! Most fields are compiled only when the `metrics` feature is enabled; the
//! uptime, health flag, checkpoint-failure counter and control-plane handle
//! are always present.
//!
//! Adding a new counter in future milestones is:
//! 1. Add an `AtomicU64` field to `MetricsState` (and initialise it in
//!    `MetricsState::new`, with the same `#[cfg]` gate as the field)
//! 2. Increment it in the relevant `TidalDb` method
//! 3. Add one line to `MetricsState::render_prometheus`

pub(crate) mod histogram;

// Histogram helpers are only needed by the feature-gated fields and the
// feature-gated section of `render_prometheus`.
#[cfg(feature = "metrics")]
pub(crate) use histogram::{
    LatencyHistogram, QUERY_LATENCY_BOUNDS, WRITE_LATENCY_BOUNDS, write_metric_line,
};

use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::time::Instant;

// ── MetricsState ────────────────────────────────────────────────────────────

/// Shared runtime metrics for a `TidalDb` instance.
///
/// The counters and gauges visible in this file are atomics, so they can be
/// read and updated through a shared `&self`.
///
/// NOTE(review): no `Clone` impl is visible in this file; sharing is
/// presumably done by cloning an `Arc<MetricsState>` (the module docs call it
/// "`Arc`-shared") — confirm against the owner of this type.
///
/// # Examples
///
/// ```
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let db = tidaldb::TidalDb::builder().ephemeral().open()?;
/// let metrics = db.metrics();
/// assert!(metrics.uptime_seconds() >= 0.0);
/// assert!((metrics.health_ok_value() - 1.0).abs() < f64::EPSILON);
/// # Ok(())
/// # }
/// ```
pub struct MetricsState {
    /// Time the database was opened (captured in [`MetricsState::new`]).
    pub(crate) opened_at: Instant,
    /// Whether the database is currently healthy. Starts as `true`.
    pub(crate) health_ok: AtomicBool,

    // ── Signal system + WAL metrics (M7p4 Task 02) ──────────────────────
    /// Bytes of WAL segments not yet compacted.
    #[cfg(feature = "metrics")]
    pub(crate) wal_lag_bytes: AtomicU64,
    /// Total WAL segments compacted since database open.
    #[cfg(feature = "metrics")]
    pub(crate) wal_compacted_segments_total: AtomicU64,
    /// Unix timestamp (nanoseconds) of the last successful checkpoint.
    ///
    /// `0` is treated by `render_prometheus` as "no checkpoint recorded yet".
    #[cfg(feature = "metrics")]
    pub(crate) last_checkpoint_ns: AtomicU64,
    /// Number of entries in the signal ledger hot tier.
    #[cfg(feature = "metrics")]
    pub(crate) signal_hot_entries: AtomicU64,
    /// Total signal writes since database open.
    #[cfg(feature = "metrics")]
    pub(crate) signal_writes_total: AtomicU64,
    /// Signal write latency histogram (microseconds).
    #[cfg(feature = "metrics")]
    pub(crate) signal_write_latency: LatencyHistogram,
    /// Retrieve query end-to-end latency histogram (microseconds).
    #[cfg(feature = "metrics")]
    pub(crate) retrieve_latency: LatencyHistogram,
    /// Search query end-to-end latency histogram (microseconds).
    #[cfg(feature = "metrics")]
    pub(crate) search_latency: LatencyHistogram,

    // ── Session + cohort + degradation metrics (m7p4, task-04) ─────────
    /// Number of currently active sessions.
    #[cfg(feature = "metrics")]
    pub(crate) active_sessions: AtomicU64,
    /// Total sessions closed since open (cumulative).
    #[cfg(feature = "metrics")]
    pub(crate) closed_sessions_total: AtomicU64,
    /// Total sessions auto-closed due to timeout since open (cumulative).
    #[cfg(feature = "metrics")]
    pub(crate) session_auto_closed_total: AtomicU64,
    /// Total requests rate-limited since open (cumulative).
    #[cfg(feature = "metrics")]
    pub(crate) rate_limited_total: AtomicU64,
    /// Current degradation level (0=Full, 1=ReducedCandidates, 2=CoarseAggregates, 3=NoDiversity).
    #[cfg(feature = "metrics")]
    pub(crate) degradation_level: AtomicU64,

    // ── Index health metrics (m7p4, task-03) ──────────────────────────────
    /// Number of Tantivy segments for the items text index.
    #[cfg(feature = "metrics")]
    pub(crate) tantivy_segment_count: AtomicU64,
    /// Number of documents indexed in the items text index.
    #[cfg(feature = "metrics")]
    pub(crate) tantivy_indexed_docs: AtomicU64,
    /// Total byte size of the `USearch` index files (f16 estimate).
    #[cfg(feature = "metrics")]
    pub(crate) usearch_index_size_bytes: AtomicU64,
    /// Number of vectors stored in the `USearch` index.
    #[cfg(feature = "metrics")]
    pub(crate) usearch_vector_count: AtomicU64,
    /// Total cardinality across all bitmap index entries (category + format + creator + tag).
    #[cfg(feature = "metrics")]
    pub(crate) bitmap_index_cardinality: AtomicU64,

    /// Total number of failed periodic signal checkpoints.
    ///
    /// Unlike most counters this one is not feature-gated: it is always
    /// present and always rendered by `render_prometheus`.
    pub(crate) checkpoint_failures_total: AtomicU64,

    // ── Replication metrics (m8p2) ──────────────────────────────────────
    /// Current replication lag in WAL segments (follower only; 0 on leader).
    #[cfg(feature = "metrics")]
    pub(crate) replication_lag_seqno: AtomicU64,

    // ── M8p5 control plane ───────────────────────────────────────────────
    /// Shared control plane for cluster health queries.
    ///
    /// `None` until a control plane is wired in; `new` always starts with
    /// `None`.
    // NOTE(review): the type argument of `Option` is missing on the next line
    // (it looks like it was lost when this file was extracted) — restore it
    // from the upstream source before building.
    pub(crate) control_plane: Option>,
}

impl MetricsState {
    /// Create a fresh metrics state.
    ///
    /// Records the current instant as the open time, marks the database
    /// healthy, zeroes every counter/gauge, builds the latency histograms from
    /// their bucket-bound constants, and leaves the control plane unset.
    pub(crate) fn new() -> Self {
        Self {
            opened_at: Instant::now(),
            health_ok: AtomicBool::new(true),
            // Each feature-gated field below carries the same `#[cfg]` as its
            // declaration so the initialiser compiles with or without `metrics`.
            #[cfg(feature = "metrics")]
            wal_lag_bytes: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            wal_compacted_segments_total: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            last_checkpoint_ns: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            signal_hot_entries: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            signal_writes_total: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            signal_write_latency: LatencyHistogram::new(WRITE_LATENCY_BOUNDS),
            #[cfg(feature = "metrics")]
            retrieve_latency: LatencyHistogram::new(QUERY_LATENCY_BOUNDS),
            #[cfg(feature = "metrics")]
            search_latency: LatencyHistogram::new(QUERY_LATENCY_BOUNDS),
            #[cfg(feature = "metrics")]
            active_sessions: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            closed_sessions_total: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            session_auto_closed_total: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            rate_limited_total: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            degradation_level: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            tantivy_segment_count: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            tantivy_indexed_docs: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            usearch_index_size_bytes: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            usearch_vector_count: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            bitmap_index_cardinality: AtomicU64::new(0),
            checkpoint_failures_total: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            replication_lag_seqno: AtomicU64::new(0),
            // No control plane at construction time; presumably assigned by
            // the code that builds the database — not visible in this file.
            control_plane: None,
        }
    }

    /// Return the current cluster health snapshot, if a control plane is wired.
    ///
    /// Returns `None` when `control_plane` is unset; otherwise returns whatever
    /// the control plane's `health()` call produces.
    // NOTE(review): the return type's generic argument is missing on the
    // signature below (same extraction damage as the `control_plane` field) —
    // restore it from the upstream source.
    #[must_use]
    pub fn cluster_health(&self) -> Option {
        self.control_plane.as_ref().map(|cp| cp.health())
    }

    /// Uptime in fractional seconds since the database was opened.
    ///
    /// Measured as the time elapsed on the `Instant` captured in `new`.
    #[must_use]
    pub fn uptime_seconds(&self) -> f64 {
        self.opened_at.elapsed().as_secs_f64()
    }

    /// Whether the database reports healthy (1.0) or degraded (0.0).
    ///
    /// Numeric form of the `health_ok` flag, suitable for a gauge sample.
    #[must_use]
    pub fn health_ok_value(&self) -> f64 {
        if self.health_ok.load(Ordering::Relaxed) {
            1.0
        } else {
            0.0
        }
    }

    /// Render Prometheus text exposition format for all metrics.
    ///
    /// Format of each metric written directly by this method:
    ///
    /// ```text
    /// # HELP <name> <description>
    /// # TYPE <name> <gauge|counter>
    /// <name>[{<labels>}] <value>
    /// ```
    ///
    /// Output order:
    /// 1. `tidaldb_uptime_seconds` (three decimals), `tidaldb_health_ok` and
    ///    `tidaldb_info`, each with a hard-coded `partition_id="0"` label.
    /// 2. Only with the `metrics` feature: WAL, checkpoint age, signal
    ///    counters, the three latency histograms, index health, session,
    ///    rate-limiting, degradation and replication-lag metrics. These are
    ///    formatted by `write_metric_line` and
    ///    `LatencyHistogram::render_prometheus`, which are defined in the
    ///    `histogram` module and not shown here.
    /// 3. `tidaldb_checkpoint_failures_total` (always, without labels).
    ///
    /// All atomics are read with `Ordering::Relaxed`, one at a time.
    #[must_use]
    #[allow(clippy::too_many_lines)]
    pub fn render_prometheus(&self) -> String {
        let uptime = self.uptime_seconds();
        let health = self.health_ok_value();
        let version = env!("CARGO_PKG_VERSION");
        let build_hash = crate::BUILD_HASH;

        // NOTE(review): `out` is written to unconditionally at the end of this
        // function, so this allow may no longer be required — confirm.
        #[allow(unused_mut)]
        let mut out = format!(
            "# HELP tidaldb_uptime_seconds Seconds since database opened.\n\
             # TYPE tidaldb_uptime_seconds gauge\n\
             tidaldb_uptime_seconds{{partition_id=\"0\"}} {uptime:.3}\n\n\
             # HELP tidaldb_health_ok Whether the database is healthy. 1 = ok, 0 = degraded.\n\
             # TYPE tidaldb_health_ok gauge\n\
             tidaldb_health_ok{{partition_id=\"0\"}} {health}\n\n\
             # HELP tidaldb_info Build and version information.\n\
             # TYPE tidaldb_info gauge\n\
             tidaldb_info{{version=\"{version}\",build_hash=\"{build_hash}\",partition_id=\"0\"}} 1\n"
        );

        // Prometheus uses f64 natively; precision loss on u64->f64 is
        // intentional and acceptable for monitoring counters/gauges.
        #[cfg(feature = "metrics")]
        #[allow(clippy::cast_precision_loss)]
        {
            // Wall-clock "now" in nanoseconds since the Unix epoch. A clock set
            // before the epoch yields a zero duration via `unwrap_or_default`.
            // The `as u64` cast truncates the `u128` nanosecond count.
            let now_ns = std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap_or_default()
                .as_nanos() as u64;

            // `write_metric_line` arguments appear to be
            // (buffer, metric name, help text, metric type, value) — see the
            // `histogram` module for the exact output it produces.
            write_metric_line(
                &mut out,
                "tidaldb_wal_lag_bytes",
                "Bytes of WAL segments not yet compacted",
                "gauge",
                self.wal_lag_bytes.load(Ordering::Relaxed) as f64,
            );
            write_metric_line(
                &mut out,
                "tidaldb_wal_compacted_segments_total",
                "Total WAL segments compacted since open",
                "counter",
                self.wal_compacted_segments_total.load(Ordering::Relaxed) as f64,
            );

            // Checkpoint age is reported as 0 both when no checkpoint has been
            // recorded yet (stored timestamp is 0) and when the stored
            // timestamp is not earlier than `now_ns` (avoids u64 underflow).
            let last_cp_ns = self.last_checkpoint_ns.load(Ordering::Relaxed);
            let checkpoint_age = if last_cp_ns > 0 && now_ns > last_cp_ns {
                now_ns - last_cp_ns
            } else {
                0
            };
            write_metric_line(
                &mut out,
                "tidaldb_checkpoint_age_seconds",
                "Seconds since the last successful checkpoint",
                "gauge",
                // Nanoseconds -> seconds.
                checkpoint_age as f64 / 1_000_000_000.0,
            );
            write_metric_line(
                &mut out,
                "tidaldb_signal_hot_entries",
                "Number of entries in the signal ledger hot tier",
                "gauge",
                self.signal_hot_entries.load(Ordering::Relaxed) as f64,
            );
            write_metric_line(
                &mut out,
                "tidaldb_signal_writes_total",
                "Total signal writes since database open",
                "counter",
                self.signal_writes_total.load(Ordering::Relaxed) as f64,
            );

            // Latency histograms render themselves; their text is appended as-is.
            out.push_str(&self.signal_write_latency.render_prometheus(
                "tidaldb_signal_write_latency_us",
                "Signal write latency in microseconds",
            ));
            out.push_str(&self.retrieve_latency.render_prometheus(
                "tidaldb_retrieve_latency_us",
                "Retrieve query end-to-end latency in microseconds",
            ));
            out.push_str(&self.search_latency.render_prometheus(
                "tidaldb_search_latency_us",
                "Search query end-to-end latency in microseconds",
            ));

            // Index health metrics.
            write_metric_line(
                &mut out,
                "tidaldb_tantivy_segment_count",
                "Number of Tantivy index segments",
                "gauge",
                self.tantivy_segment_count.load(Ordering::Relaxed) as f64,
            );
            write_metric_line(
                &mut out,
                "tidaldb_tantivy_indexed_docs",
                "Number of documents indexed in Tantivy",
                "gauge",
                self.tantivy_indexed_docs.load(Ordering::Relaxed) as f64,
            );
            write_metric_line(
                &mut out,
                "tidaldb_usearch_index_size_bytes",
                "Estimated byte size of USearch vector indexes",
                "gauge",
                self.usearch_index_size_bytes.load(Ordering::Relaxed) as f64,
            );
            write_metric_line(
                &mut out,
                "tidaldb_usearch_vector_count",
                "Number of vectors stored in USearch indexes",
                "gauge",
                self.usearch_vector_count.load(Ordering::Relaxed) as f64,
            );
            write_metric_line(
                &mut out,
                "tidaldb_bitmap_index_cardinality",
                "Total entity IDs across all bitmap indexes",
                "gauge",
                self.bitmap_index_cardinality.load(Ordering::Relaxed) as f64,
            );

            // Session lifecycle metrics.
            write_metric_line(
                &mut out,
                "tidaldb_active_sessions",
                "Number of currently active agent sessions",
                "gauge",
                self.active_sessions.load(Ordering::Relaxed) as f64,
            );
            write_metric_line(
                &mut out,
                "tidaldb_closed_sessions_total",
                "Total agent sessions closed since open",
                "counter",
                self.closed_sessions_total.load(Ordering::Relaxed) as f64,
            );
            write_metric_line(
                &mut out,
                "tidaldb_session_auto_closed_total",
                "Total agent sessions auto-closed due to timeout",
                "counter",
                self.session_auto_closed_total.load(Ordering::Relaxed) as f64,
            );

            // Rate limiting.
            write_metric_line(
                &mut out,
                "tidaldb_rate_limited_total",
                "Total requests rate-limited due to overload",
                "counter",
                self.rate_limited_total.load(Ordering::Relaxed) as f64,
            );

            // Degradation.
            write_metric_line(
                &mut out,
                "tidaldb_degradation_level",
                "Current degradation level (0=full, 1=reduced, 2=coarse, 3=no_diversity)",
                "gauge",
                self.degradation_level.load(Ordering::Relaxed) as f64,
            );

            // Replication lag.
            write_metric_line(
                &mut out,
                "tidaldb_replication_lag_seqno",
                "Replication lag in WAL segments behind the leader",
                "gauge",
                self.replication_lag_seqno.load(Ordering::Relaxed) as f64,
            );
        }

        // Checkpoint failure counter (unconditional -- not feature-gated).
        {
            // Scoped import so `write!` can target a `String`.
            use std::fmt::Write;
            let failures = self.checkpoint_failures_total.load(Ordering::Relaxed);
            // Formatting into a `String` does not fail, so the result is
            // deliberately discarded. The leading "\n" separates this metric
            // from whatever was emitted before it.
            let _ = write!(
                out,
                "\n# HELP tidaldb_checkpoint_failures_total Total number of failed periodic signal checkpoints\n\
                 # TYPE tidaldb_checkpoint_failures_total counter\n\
                 tidaldb_checkpoint_failures_total {failures}\n"
            );
        }

        out
    }

    /// Render JSON for /healthz.
    ///
    /// Produces a single-line JSON object with the keys `status` (`"ok"` or
    /// `"degraded"`, from the `health_ok` flag), `uptime_seconds` (three
    /// decimals), `version` (`CARGO_PKG_VERSION` at compile time) and
    /// `build_hash` (`crate::BUILD_HASH`).
    ///
    /// NOTE(review): `version` and `build_hash` are interpolated without JSON
    /// escaping; this assumes neither contains `"` or `\` — confirm.
    #[must_use]
    pub fn render_healthz(&self) -> String {
        let uptime = self.uptime_seconds();
        let status = if self.health_ok.load(Ordering::Relaxed) {
            "ok"
        } else {
            "degraded"
        };
        let version = env!("CARGO_PKG_VERSION");
        let build_hash = crate::BUILD_HASH;
        format!(
            r#"{{"status":"{status}","uptime_seconds":{uptime:.3},"version":"{version}","build_hash":"{build_hash}"}}"#
        )
    }
}

#[cfg(test)]
mod tests;