tidaldb/tidal/tests/m7_recovery_sla.rs
2026-02-23 22:41:16 -07:00

155 lines
4.6 KiB
Rust

//! Recovery SLA integration tests.
//!
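//! Asserts hard latency bounds on cold-start recovery time.
//! The 1K- and 10K-item tests run on every `cargo test` invocation; the
//! expensive 1M-item variant is marked `#[ignore]` and must be run explicitly
//! with `-- --ignored`.
//!
//! Scope: every test reopens a database that was previously shut down with a
//! clean `db.close()`, so the WAL backlog is ~0 and the timings cover
//! checkpoint restore + index rebuild rather than replay of a large WAL.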
//!
//! See `benches/recovery.rs` for the matching Criterion microbenchmark.
#![allow(clippy::unwrap_used)]
use std::time::{Duration, Instant};
use tidaldb::TidalDb;
use tidaldb::schema::{DecaySpec, EntityId, EntityKind, SchemaBuilder, Timestamp, Window};
/// Builds the schema shared by every recovery test in this file.
///
/// The schema declares a single `"view"` signal on `EntityKind::Item` with
/// exponential decay (7-day half-life), the `Window::AllTime` window only,
/// and `velocity(false)`.
///
/// # Panics
///
/// Panics with `"valid schema"` if `SchemaBuilder::build` returns an error.
fn bench_schema() -> tidaldb::schema::Schema {
    // 7 days expressed in seconds.
    let one_week = Duration::from_secs(7 * 24 * 3600);
    let view_decay = DecaySpec::Exponential {
        half_life: one_week,
    };

    let mut schema_builder = SchemaBuilder::new();
    // The value returned by `add()` is not needed; the signal is registered
    // on `schema_builder` itself.
    let _ = schema_builder
        .signal("view", EntityKind::Item, view_decay)
        .windows(&[Window::AllTime])
        .velocity(false)
        .add();

    schema_builder.build().expect("valid schema")
}
/// Populates a fresh database under `dir` with `entity_count` entities and
/// closes it again.
///
/// Entities are numbered `1..=entity_count`; each one receives exactly one
/// `"view"` signal of weight `1.0`, with timestamps spaced 1 ms apart starting
/// from a fixed base. For datasets of at least 100,000 entities a progress
/// line is printed to stderr every 100,000 writes.
///
/// # Panics
///
/// Panics if opening the database, writing a signal, or closing fails.
fn generate_test_data(dir: &std::path::Path, entity_count: u64) {
    /// Timestamp (in nanoseconds) that the per-entity offsets are added to.
    const BASE_NS: u64 = 1_000_000_000_000;
    /// Spacing between consecutive entity timestamps: 1 ms in nanoseconds.
    const STEP_NS: u64 = 1_000_000;
    /// Progress is reported once per this many entities.
    const PROGRESS_INTERVAL: u64 = 100_000;

    let data_schema = bench_schema();
    let writer = TidalDb::builder()
        .with_data_dir(dir)
        .with_schema(data_schema)
        .open()
        .unwrap();

    // Small datasets stay silent; only large runs report progress.
    let report_progress = entity_count >= PROGRESS_INTERVAL;

    for i in 1..=entity_count {
        let written_at = Timestamp::from_nanos(BASE_NS + i * STEP_NS);
        writer
            .signal("view", EntityId::new(i), 1.0, written_at)
            .unwrap();

        if report_progress && i % PROGRESS_INTERVAL == 0 {
            eprintln!(" setup: {i}/{entity_count} entities written");
        }
    }

    // Close explicitly so the tests reopen a cleanly shut-down database.
    writer.close().unwrap();
}
/// Assert that recovery from 1000-item checkpoint completes in under 2 seconds.
///
/// Only the `open()` call is timed, matching `recovery_under_30_seconds`: the
/// follow-up read verifies that the recovered state is correct and is not
/// part of the SLA window.
#[test]
fn small_scale_recovery_smoke_test() {
    let dir = tempfile::tempdir().unwrap();
    generate_test_data(dir.path(), 1000);

    // Build the schema before starting the clock so only recovery is measured.
    let schema = bench_schema();
    let start = Instant::now();
    let db = TidalDb::builder()
        .with_data_dir(dir.path())
        .with_schema(schema)
        .open()
        .unwrap();
    // Stop the clock as soon as the database is open; the verification read
    // below must not count against the recovery budget.
    let elapsed = start.elapsed();

    // Spot-check a mid-range entity: `generate_test_data` writes exactly one
    // signal per entity.
    let count = db
        .read_windowed_count(EntityId::new(500), "view", Window::AllTime)
        .unwrap();
    assert_eq!(count, 1, "entity 500 should have exactly 1 signal");

    assert!(
        elapsed < Duration::from_secs(2),
        "1000-entity recovery took {elapsed:?}, expected < 2s"
    );
    db.close().unwrap();
}
/// Assert that recovery from 10K-item checkpoint completes in under 5 seconds.
///
/// This is the CI-safe version of the 1M-item SLA test
/// (`recovery_under_30_seconds`). As in that test, only the `open()` call is
/// timed; the follow-up read verifies the recovered state and is not part of
/// the SLA window.
/// Run the full benchmark with: cargo bench --bench recovery
#[test]
fn recovery_under_5_seconds_10k_items() {
    let dir = tempfile::tempdir().unwrap();
    generate_test_data(dir.path(), 10_000);

    // Build the schema before starting the clock so only recovery is measured.
    let schema = bench_schema();
    let start = Instant::now();
    let db = TidalDb::builder()
        .with_data_dir(dir.path())
        .with_schema(schema)
        .open()
        .unwrap();
    // Stop the clock as soon as the database is open; the verification read
    // below must not count against the recovery budget.
    let elapsed = start.elapsed();

    // Spot-check a mid-range entity: `generate_test_data` writes exactly one
    // signal per entity.
    let count = db
        .read_windowed_count(EntityId::new(5000), "view", Window::AllTime)
        .unwrap();
    assert_eq!(count, 1, "entity 5000 should have exactly 1 signal");

    assert!(
        elapsed < Duration::from_secs(5),
        "10K-entity recovery took {elapsed:?}, expected < 5s"
    );
    db.close().unwrap();
}
/// Verifies the 1M-item recovery SLA: reopening the database must take less
/// than 30 seconds.
///
/// This is the spec-minimum SLA test from task-05. Generating the 1M-item
/// dataset takes ~5-10 minutes (one-time setup), so the test is marked
/// `#[ignore]` to keep it out of per-commit CI runs. Run it locally before
/// major changes to the checkpoint format, WAL replay logic, or entity
/// rebuild:
///
/// ```bash
/// cargo test --manifest-path tidal/Cargo.toml --test m7_recovery_sla -- --ignored
/// ```
///
/// NOTE: The dataset is written by `generate_test_data`, which finishes with
/// a clean `db.close()`, so the WAL backlog is ~0 and this measures
/// checkpoint restore + index rebuild only.
#[test]
#[ignore = "expensive: generates 1M items (~5min setup), run with --ignored"]
fn recovery_under_30_seconds() {
    // 30-second SLA from task-05 spec.
    const SLA: Duration = Duration::from_secs(30);

    let tmp = tempfile::tempdir().unwrap();
    generate_test_data(tmp.path(), 1_000_000);
    eprintln!(" setup: checkpoint written, starting recovery timing");

    // The schema is built outside the timed section on purpose.
    let recovery_schema = bench_schema();

    // Timed section: nothing but the `open()` call chain.
    let (recovered, elapsed) = {
        let clock = Instant::now();
        let handle = TidalDb::builder()
            .with_data_dir(tmp.path())
            .with_schema(recovery_schema)
            .open()
            .unwrap();
        (handle, clock.elapsed())
    };
    eprintln!("Recovery time (1M items): {elapsed:?}");

    // Correctness spot-check on a mid-range entity before judging the timing.
    let views = recovered
        .read_windowed_count(EntityId::new(500_000), "view", Window::AllTime)
        .unwrap();
    assert_eq!(views, 1, "entity 500000 should have exactly 1 signal");

    assert!(
        elapsed < SLA,
        "1M-entity recovery took {elapsed:?}, expected < 30s"
    );
    recovered.close().unwrap();
}