tidaldb/tidal/benches/scale.rs
2026-02-23 22:41:16 -07:00

303 lines
8.9 KiB
Rust

#![allow(
clippy::unwrap_used,
clippy::cast_precision_loss,
clippy::cast_possible_truncation
)]
//! Criterion benchmarks for production-representative load: 1M items.
//!
//! Validates the m7p3 performance acceptance criteria:
//! - RETRIEVE p99 < 50ms
//! - SEARCH p99 < 100ms
//! - Signal write p99 < 100µs
//!
//! A single shared `TidalDb` is built once via `LazyLock` to amortize the
//! 1M-item setup cost across all benchmark runs.
//!
//! Dataset layout:
//! - 1M items, 10K creators (100 items/creator)
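//! - 20 categories, assigned round-robin by item index
//! - `DIM` = 128 is declared for embedding vectors, but `build_scale_db` does not
//!   write any vectors as it stands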
//! - 10% view coverage, 5% like coverage
use std::collections::HashMap;
use std::sync::LazyLock;
use std::time::Duration;
use criterion::{BatchSize, Criterion, SamplingMode, black_box, criterion_group, criterion_main};
use tidaldb::TidalDb;
use tidaldb::query::retrieve::Retrieve;
use tidaldb::query::search::Search;
use tidaldb::schema::{
DecaySpec, EntityId, EntityKind, SchemaBuilder, TextFieldType, Timestamp, Window,
};
use tidaldb::storage::indexes::filter::FilterExpr;
/// Number of items written into the shared benchmark database.
const N_ITEMS: u64 = 1_000_000;
/// Creator count implied by the dataset layout (`N_ITEMS / ITEMS_PER_CREATOR`).
/// NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
const N_CREATORS: u64 = 10_000;
/// How many consecutive item indices map to the same creator id.
const ITEMS_PER_CREATOR: u64 = 100;
/// Modulus used to pick a category per item; must not exceed `CATEGORIES.len()`,
/// otherwise the category lookup would index out of bounds.
const N_CATEGORIES: u64 = 20;
/// Vector dimensionality mentioned in the module docs.
/// NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
const DIM: usize = 128;
/// Category labels handed out round-robin by item index.
///
/// The dataset builder indexes this table with `i % N_CATEGORIES`, so the
/// number of entries here has to stay at least as large as that modulus.
static CATEGORIES: &[&str] = &[
    "technology",
    "sports",
    "music",
    "gaming",
    "cooking",
    "travel",
    "science",
    "arts",
    "news",
    "fashion",
    "finance",
    "health",
    "education",
    "entertainment",
    "politics",
    "nature",
    "history",
    "automotive",
    "pets",
    "fitness",
];
/// Builds the schema shared by every benchmark in this file.
///
/// Registers two item-level signals with exponential decay, both configured
/// with `velocity(false)`:
/// - `view`: 7-day half-life, 24-hour and 7-day windows
/// - `like`: 30-day half-life, 24-hour window
///
/// and two text fields: `title` (`Text`) and `category` (`Keyword`).
///
/// # Panics
///
/// Panics if the schema builder rejects the definition.
fn scale_schema() -> tidaldb::schema::Schema {
    const SECS_PER_DAY: u64 = 24 * 3600;

    let view_decay = DecaySpec::Exponential {
        half_life: Duration::from_secs(7 * SECS_PER_DAY),
    };
    let like_decay = DecaySpec::Exponential {
        half_life: Duration::from_secs(30 * SECS_PER_DAY),
    };

    let mut schema = SchemaBuilder::new();

    // Signals are registered in a fixed order: `view` first, then `like`.
    let _ = schema
        .signal("view", EntityKind::Item, view_decay)
        .windows(&[Window::TwentyFourHours, Window::SevenDays])
        .velocity(false)
        .add();
    let _ = schema
        .signal("like", EntityKind::Item, like_decay)
        .windows(&[Window::TwentyFourHours])
        .velocity(false)
        .add();

    schema.text_field("title", TextFieldType::Text);
    schema.text_field("category", TextFieldType::Keyword);

    let built = schema.build();
    built.unwrap()
}
/// Populates the shared ephemeral database used by all benchmarks in this file.
///
/// For every index `i` in `0..N_ITEMS` this writes one item (id `i + 1`) with
/// `title`, `category` and `creator` metadata, then records:
/// - a `view` signal when `i % 10 == 0` (10% of items), and
/// - a `like` signal when `i % 20 == 0` (5% of items).
///
/// Signal timestamps step backwards from a fixed base instant, keyed on
/// `i % 86400`. After the writes it sleeps for 3 s and reloads the text index.
///
/// Runs as the initializer of `SCALE_DB`.
///
/// # Panics
///
/// Panics if opening the database, any write, or the text-index reload fails.
fn build_scale_db() -> TidalDb {
    eprintln!("[scale bench] Building 1M-item database (this takes ~30s)...");

    let db = TidalDb::builder()
        .ephemeral()
        .with_schema(scale_schema())
        .open()
        .unwrap();

    // Fixed reference instant in nanoseconds; every signal timestamp is derived from it.
    let base_ns = 1_708_000_000_000_000_000u64;

    for i in 0..N_ITEMS {
        let item_id = EntityId::new(i + 1);
        // Each run of ITEMS_PER_CREATOR consecutive indices shares a creator: 0..=9999.
        let creator_id = i / ITEMS_PER_CREATOR;
        let category = CATEGORIES[(i % N_CATEGORIES) as usize];

        let meta: HashMap<String, String> = HashMap::from([
            (
                "title".to_string(),
                format!("Content {i} by creator {creator_id} about {category}"),
            ),
            ("category".to_string(), category.to_string()),
            ("creator".to_string(), creator_id.to_string()),
        ]);
        db.write_item_with_metadata(item_id, &meta).unwrap();

        // Per-item step used to spread signal timestamps behind `base_ns`.
        let step = i % 86400;

        // Every 10th item gets a view, stepped back 1 s per unit of `step`.
        if i % 10 == 0 {
            let viewed_at = Timestamp::from_nanos(base_ns - step * 1_000_000_000);
            db.signal("view", item_id, 1.0, viewed_at).unwrap();
        }

        // Every 20th item gets a like, stepped back 2 s per unit of `step`.
        if i % 20 == 0 {
            let liked_at = Timestamp::from_nanos(base_ns - step * 2_000_000_000);
            db.signal("like", item_id, 1.0, liked_at).unwrap();
        }
    }

    // Give the background text syncer time to commit before reloading the index.
    // At roughly one commit per 1000 items, 1M items means on the order of 1000 commits.
    // NOTE(review): this is a fixed, best-effort wait — confirm 3 s is enough on slow hosts.
    eprintln!("[scale bench] Waiting for text index to commit...");
    std::thread::sleep(Duration::from_millis(3000));
    db.reload_text_index().unwrap();

    eprintln!("[scale bench] Database ready: {N_ITEMS} items.");
    db
}
/// Shared 1M-item DB: built once, reused by all benchmarks.
///
/// `LazyLock` runs `build_scale_db` on first access. Every benchmark function
/// borrows this static before registering its timed closure, so the one-time
/// build happens outside the measured section.
///
/// Note that `bench_signal_write` appends `view` signals to this same
/// instance; it is listed last in the `scale_benches` group.
static SCALE_DB: LazyLock<TidalDb> = LazyLock::new(build_scale_db);
// ── RETRIEVE benchmarks ───────────────────────────────────────────────────────
/// RETRIEVE: "for_you" profile — signal-scored ranking over full universe.
fn bench_retrieve_for_you(c: &mut Criterion) {
let db: &TidalDb = &SCALE_DB;
let query = Retrieve::builder()
.profile("for_you")
.limit(20)
.build()
.unwrap();
let mut group = c.benchmark_group("retrieve_1m");
group.sample_size(10);
group.measurement_time(Duration::from_secs(30));
group.sampling_mode(SamplingMode::Flat);
group.bench_function("for_you", |b| {
b.iter(|| db.retrieve(black_box(&query)).unwrap());
});
group.finish();
}
/// RETRIEVE: "trending" profile — top-viewed items.
fn bench_retrieve_trending(c: &mut Criterion) {
let db: &TidalDb = &SCALE_DB;
let query = Retrieve::builder()
.profile("trending")
.limit(20)
.build()
.unwrap();
let mut group = c.benchmark_group("retrieve_1m");
group.sample_size(10);
group.measurement_time(Duration::from_secs(30));
group.sampling_mode(SamplingMode::Flat);
group.bench_function("trending", |b| {
b.iter(|| db.retrieve(black_box(&query)).unwrap());
});
group.finish();
}
/// RETRIEVE: "new" profile — recency-filtered by category.
fn bench_retrieve_new_filtered(c: &mut Criterion) {
let db: &TidalDb = &SCALE_DB;
let query = Retrieve::builder()
.profile("new")
.limit(20)
.filter(FilterExpr::CategoryEq("technology".into()))
.build()
.unwrap();
let mut group = c.benchmark_group("retrieve_1m");
group.sample_size(10);
group.measurement_time(Duration::from_secs(30));
group.sampling_mode(SamplingMode::Flat);
group.bench_function("new_filtered", |b| {
b.iter(|| db.retrieve(black_box(&query)).unwrap());
});
group.finish();
}
// ── SEARCH benchmarks ─────────────────────────────────────────────────────────
/// SEARCH: text-only query over 1M items.
fn bench_search_text_only(c: &mut Criterion) {
let db: &TidalDb = &SCALE_DB;
let query = Search::builder()
.query("technology content creator")
.limit(20)
.build()
.unwrap();
let mut group = c.benchmark_group("search_1m");
group.sample_size(10);
group.measurement_time(Duration::from_secs(30));
group.sampling_mode(SamplingMode::Flat);
group.bench_function("text_only", |b| {
b.iter(|| db.search(black_box(&query)).unwrap());
});
group.finish();
}
/// SEARCH: text query with category filter.
fn bench_search_text_filtered(c: &mut Criterion) {
let db: &TidalDb = &SCALE_DB;
let query = Search::builder()
.query("gaming sports fitness")
.limit(20)
.filter(FilterExpr::CategoryEq("sports".into()))
.build()
.unwrap();
let mut group = c.benchmark_group("search_1m");
group.sample_size(10);
group.measurement_time(Duration::from_secs(30));
group.sampling_mode(SamplingMode::Flat);
group.bench_function("text_filtered", |b| {
b.iter(|| db.search(black_box(&query)).unwrap());
});
group.finish();
}
// ── Signal write benchmark ────────────────────────────────────────────────────
/// Signal write: amortized cost over rotating entity IDs.
///
/// Measures the hot path for incremental signal ingestion at 1M-item scale.
/// Rotates through 1000 entity IDs to represent a realistic write workload.
fn bench_signal_write(c: &mut Criterion) {
let db: &TidalDb = &SCALE_DB;
let mut group = c.benchmark_group("signal_write_1m");
group.sample_size(10);
group.measurement_time(Duration::from_secs(30));
group.sampling_mode(SamplingMode::Flat);
let mut counter = 0u64;
let ts = Timestamp::now();
group.bench_function("write_rotating_1k_entities", |b| {
b.iter_batched(
|| {
// Rotate through 1000 entity IDs in the 1M range.
let id = EntityId::new((counter % 1000) + 1);
counter += 1;
id
},
|id| db.signal("view", black_box(id), 1.0, ts).unwrap(),
BatchSize::SmallInput,
);
});
group.finish();
}
// Registers every benchmark in this file under the `scale_benches` group.
// `bench_signal_write` is the only target that writes to the shared DB and is
// listed last.
criterion_group!(
scale_benches,
bench_retrieve_for_you,
bench_retrieve_trending,
bench_retrieve_new_filtered,
bench_search_text_only,
bench_search_text_filtered,
bench_signal_write,
);
// Generates the benchmark binary's entry point for `scale_benches`.
criterion_main!(scale_benches);