tidaldb/tidal/benches/vector.rs
jordan 39ada28c6e feat: complete Milestones 2–4 — RETRIEVE query, vector index, ranking profiles, diversity, entity system, sessions
M2: RETRIEVE query pipeline with 5-stage execution (candidate → filter → score → diversify → limit),
    usearch HNSW vector index, bitmap/range/universe filters, ranking profiles with signal scoring,
    MMR diversity enforcement, and m2_uat integration tests.

M3: Entity system with typed metadata, relationship graph (follows/blocks/interactions),
    creator entities, session tracking, and m3_uat integration tests.

M4: Advanced ranking with builtin functions (freshness, trending, controversy, wilson),
    ranking executor with explain mode, query executor integration, benchmarks for
    query/ranking/vector/filters/diversity, and m4_uat integration tests.

Includes: 9 new blog posts, marketing site updates, updated roadmap, and updated vision doc.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 16:24:48 -07:00

273 lines
8.7 KiB
Rust

#![allow(clippy::unwrap_used)]
//! Criterion benchmarks for the vector index subsystem.
//!
//! Measures ANN search latency across the adaptive query planner's strategy
//! spectrum: unfiltered, in-graph filtered (20%), widened filtered (5%),
//! and pre-filter brute-force (0.5%). Also benchmarks recall@100, single
//! insert, and single delete.
//!
//! All setup (index construction, vector insertion) is done OUTSIDE the
//! `b.iter()` closure. Only the search/insert/delete call is measured.
use criterion::{Criterion, black_box, criterion_group, criterion_main};
use rand::Rng;
use tidaldb::storage::vector::{
AdaptiveQueryPlanner, BruteForceIndex, DistanceMetric, QuantizationLevel, VectorId,
VectorIndex, VectorIndexConfig,
};
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/// Generate a random unit vector of the given dimensionality.
///
/// Each component is sampled with `rng.random::<f32>()` and shifted by `-0.5`;
/// the resulting vector is then divided by its L2 norm.
///
/// Edge cases:
/// - If the sampled vector's norm is below `f32::EPSILON`, normalising would
///   divide by (almost) zero, so a unit vector along the first axis is
///   returned instead.
/// - If `dim == 0` the result is an empty vector. (The fallback used to index
///   element 0 unconditionally, which panicked for zero dimensions.)
fn random_unit_vector(dim: usize, rng: &mut impl Rng) -> Vec<f32> {
    let v: Vec<f32> = (0..dim)
        .map(|_| {
            let x: f32 = rng.random();
            x - 0.5
        })
        .collect();
    let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm < f32::EPSILON {
        // Degenerate case: return a unit vector along the first axis, or an
        // empty vector when there is no first axis to point along.
        let mut fallback = vec![0.0_f32; dim];
        if let Some(first) = fallback.first_mut() {
            *first = 1.0;
        }
        return fallback;
    }
    v.iter().map(|x| x / norm).collect()
}
/// Construct a [`BruteForceIndex`] pre-populated with `n` random unit vectors.
///
/// The index is configured for `dim`-dimensional `f32` vectors under the L2
/// metric, with connectivity 16 and both `ef_construction` and `ef_search`
/// set to 200. Vectors receive the sequential IDs `0..n`.
///
/// Panics if any insert fails (acceptable in benchmark setup code).
fn build_brute_index(n: u64, dim: usize) -> BruteForceIndex {
    let index = BruteForceIndex::new(VectorIndexConfig {
        dimensions: dim,
        metric: DistanceMetric::L2,
        quantization: QuantizationLevel::F32,
        connectivity: 16,
        ef_construction: 200,
        ef_search: 200,
    });

    let mut rng = rand::rng();
    (0..n).for_each(|id| {
        let sample = random_unit_vector(dim, &mut rng);
        index.insert(id, &sample).unwrap();
    });

    index
}
// ---------------------------------------------------------------------------
// Benchmarks
// ---------------------------------------------------------------------------
/// Benchmark: baseline top-100 search with no filter applied.
///
/// Setup (outside the timed closure): a 10,000-vector, 128-dimensional index
/// from `build_brute_index`, a default-configured planner, and one random
/// unit-length query.
///
/// Timed: a single `planner.execute` call with no filter, the value `1.0` in
/// the position the filtered benchmarks use for their selectivity, and no
/// secondary index.
fn bench_ann_search_unfiltered(c: &mut Criterion) {
    let dim = 128;
    let corpus_size = 10_000_u64;
    let index = build_brute_index(corpus_size, dim);
    let query = random_unit_vector(dim, &mut rand::rng());
    let planner = AdaptiveQueryPlanner::with_defaults();

    c.bench_function("ann_search_unfiltered_10k", |bencher| {
        bencher.iter(|| {
            let outcome = planner.execute(
                black_box(&index),
                black_box(&query),
                black_box(100),
                None,
                1.0,
                None,
            );
            // Returning the results lets Criterion keep them observable.
            outcome.unwrap()
        });
    });
}
/// Benchmark: top-100 search where the filter admits 20% of the corpus.
///
/// Setup (outside the timed closure): a 10,000-vector, 128-dimensional index,
/// a default-configured planner, one random query, and a predicate accepting
/// IDs below 2000 (2,000 of the 10,000 sequential IDs).
///
/// Timed: a single `planner.execute` call with that predicate and `0.20`
/// passed alongside it. Per the module docs this case is meant to exercise
/// the planner's in-graph filtering strategy.
fn bench_ann_search_filtered_20pct(c: &mut Criterion) {
    let dim = 128;
    let corpus_size = 10_000_u64;
    let index = build_brute_index(corpus_size, dim);
    let query = random_unit_vector(dim, &mut rand::rng());
    let planner = AdaptiveQueryPlanner::with_defaults();

    // IDs 0..=1999 pass: 2,000 / 10,000 = 20%.
    let accept = |id: VectorId| id < 2000;

    c.bench_function("ann_search_filtered_20pct_10k", |bencher| {
        bencher.iter(|| {
            let outcome = planner.execute(
                black_box(&index),
                black_box(&query),
                black_box(100),
                Some(black_box(&accept)),
                0.20,
                None,
            );
            outcome.unwrap()
        });
    });
}
/// Benchmark: top-100 search where the filter admits 5% of the corpus.
///
/// Setup (outside the timed closure): a 10,000-vector, 128-dimensional index,
/// a default-configured planner, one random query, and a predicate accepting
/// IDs below 500 (500 of the 10,000 sequential IDs).
///
/// Timed: a single `planner.execute` call with that predicate and `0.05`
/// passed alongside it. Per the module docs this case is meant to exercise
/// the planner's widened-search strategy.
fn bench_ann_search_filtered_5pct(c: &mut Criterion) {
    let dim = 128;
    let corpus_size = 10_000_u64;
    let index = build_brute_index(corpus_size, dim);
    let query = random_unit_vector(dim, &mut rand::rng());
    let planner = AdaptiveQueryPlanner::with_defaults();

    // IDs 0..=499 pass: 500 / 10,000 = 5%.
    let accept = |id: VectorId| id < 500;

    c.bench_function("ann_search_filtered_5pct_10k", |bencher| {
        bencher.iter(|| {
            let outcome = planner.execute(
                black_box(&index),
                black_box(&query),
                black_box(100),
                Some(black_box(&accept)),
                0.05,
                None,
            );
            outcome.unwrap()
        });
    });
}
/// Benchmark: top-100 search where the filter admits only 0.5% of the corpus.
///
/// Setup (outside the timed closure): two 10,000-vector, 128-dimensional
/// indexes, a default-configured planner, one random query, and a predicate
/// accepting IDs below 50 (50 of the 10,000 sequential IDs).
///
/// Timed: a single `planner.execute` call with that predicate, `0.005`
/// passed alongside it, and the second index supplied as a `&dyn VectorIndex`.
/// Per the module docs this case is meant to exercise the pre-filter
/// brute-force strategy.
///
/// Note: the two indexes are built by separate `build_brute_index` calls, so
/// they hold *different* random vectors under the same IDs. That is fine for
/// latency measurement but the returned neighbours are not meaningful.
fn bench_ann_search_brute_force(c: &mut Criterion) {
    let dim = 128;
    let corpus_size = 10_000_u64;
    let index = build_brute_index(corpus_size, dim);
    let fallback = build_brute_index(corpus_size, dim);
    let query = random_unit_vector(dim, &mut rand::rng());
    let planner = AdaptiveQueryPlanner::with_defaults();

    // IDs 0..=49 pass: 50 / 10,000 = 0.5%.
    let accept = |id: VectorId| id < 50;

    c.bench_function("ann_search_brute_force_10k", |bencher| {
        bencher.iter(|| {
            let outcome = planner.execute(
                black_box(&index),
                black_box(&query),
                black_box(100),
                Some(black_box(&accept)),
                0.005,
                Some(black_box(&fallback as &dyn VectorIndex)),
            );
            outcome.unwrap()
        });
    });
}
/// Benchmark: recall@100 measurement.
///
/// Builds a 10K-vector index with `build_brute_index`, computes the top-100
/// result IDs for one random query up front, and then times the loop of
/// "search again + compare against those IDs". Because the ground truth comes
/// from the very same index, this establishes the baseline cost of the
/// measurement itself rather than a meaningful recall figure.
///
/// The ground-truth IDs are kept in a `HashSet` built once outside the timed
/// closure, and hits are counted directly from the search results. This keeps
/// the comparison linear in `k` and avoids allocating a throwaway ID vector on
/// every iteration, so the timing is dominated by the search.
fn bench_ann_recall_at_100(c: &mut Criterion) {
    let dim = 128;
    let n = 10_000_u64;
    let k = 100;
    let index = build_brute_index(n, dim);
    let mut rng = rand::rng();
    let query = random_unit_vector(dim, &mut rng);

    // Pre-compute ground truth as a set for constant-time membership checks.
    let gt_ids: std::collections::HashSet<VectorId> = index
        .search(&query, k, 200)
        .unwrap()
        .iter()
        .map(|r| r.id)
        .collect();

    c.bench_function("ann_recall_at_100_10k", |b| {
        b.iter(|| {
            let results = index
                .search(black_box(&query), black_box(k), black_box(200))
                .unwrap();
            // Compute recall: fraction of ground truth IDs found in results.
            let hits = results.iter().filter(|r| gt_ids.contains(&r.id)).count();
            #[allow(clippy::cast_precision_loss)]
            let recall = hits as f64 / gt_ids.len() as f64;
            black_box(recall)
        });
    });
}
/// Benchmark: inserting one vector into an index pre-filled with 10K vectors.
///
/// Setup (outside the timed closure): the pre-filled index and a single random
/// 128-dimensional vector that is reused for every insert.
///
/// Timed: one `index.insert` call per iteration. Each iteration uses a fresh
/// ID, starting just past the pre-filled range, so no insert targets an ID
/// that already exists. Nothing is removed between iterations, so the index
/// grows by one entry per iteration over the course of the run.
fn bench_ann_insert_single(c: &mut Criterion) {
    let dim = 128;
    let prefill = 10_000_u64;
    let index = build_brute_index(prefill, dim);
    let payload = random_unit_vector(dim, &mut rand::rng());

    // First unused ID: the pre-filled vectors occupy 0..prefill.
    let mut fresh_id = prefill;

    c.bench_function("ann_insert_single_10k", |bencher| {
        bencher.iter(|| {
            index
                .insert(black_box(fresh_id), black_box(&payload))
                .unwrap();
            fresh_id += 1;
        });
    });
}
/// Benchmark: single vector delete from a pre-filled 10K index.
/// After each delete, re-inserts the vector so the bench remains iterable.
fn bench_ann_delete_single(c: &mut Criterion) {
let dim = 128;
let n = 10_000_u64;
let index = build_brute_index(n, dim);
let mut rng = rand::rng();
let vec = random_unit_vector(dim, &mut rng);
// Target a fixed ID for delete/reinsert cycle.
let target_id = 0_u64;
c.bench_function("ann_delete_single_10k", |b| {
b.iter(|| {
index.delete(black_box(target_id)).unwrap();
// Re-insert so the next iteration can delete it again.
index.insert(black_box(target_id), black_box(&vec)).unwrap();
});
});
}
// ---------------------------------------------------------------------------
// Criterion group + main
// ---------------------------------------------------------------------------
// Register every benchmark function in this file under the `benches` group.
// A function that is not listed here is never run.
criterion_group!(
benches,
bench_ann_search_unfiltered,
bench_ann_search_filtered_20pct,
bench_ann_search_filtered_5pct,
bench_ann_search_brute_force,
bench_ann_recall_at_100,
bench_ann_insert_single,
bench_ann_delete_single,
);
// Generate the benchmark binary's entry point for the `benches` group.
criterion_main!(benches);