M2: RETRIEVE query pipeline with 5-stage execution (candidate → filter → score → diversify → limit),
usearch HNSW vector index, bitmap/range/universe filters, ranking profiles with signal scoring,
MMR diversity enforcement, and m2_uat integration tests.
M3: Entity system with typed metadata, relationship graph (follows/blocks/interactions),
creator entities, session tracking, and m3_uat integration tests.
M4: Advanced ranking with builtin functions (freshness, trending, controversy, wilson),
ranking executor with explain mode, query executor integration, benchmarks for
query/ranking/vector/filters/diversity, and m4_uat integration tests.
Includes: 9 new blog posts, marketing site updates, updated roadmap, and updated vision doc.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
273 lines
8.7 KiB
Rust
273 lines
8.7 KiB
Rust
#![allow(clippy::unwrap_used)]
|
|
|
|
//! Criterion benchmarks for the vector index subsystem.
|
|
//!
|
|
//! Measures ANN search latency across the adaptive query planner's strategy
|
|
//! spectrum: unfiltered, in-graph filtered (20%), widened filtered (5%),
|
|
//! and pre-filter brute-force (0.5%). Also benchmarks recall@100, single
|
|
//! insert, and single delete.
|
|
//!
|
|
//! All setup (index construction, vector insertion) is done OUTSIDE the
|
|
//! `b.iter()` closure. Only the search/insert/delete call is measured.
|
|
|
|
use criterion::{Criterion, black_box, criterion_group, criterion_main};
|
|
use rand::Rng;
|
|
use tidaldb::storage::vector::{
|
|
AdaptiveQueryPlanner, BruteForceIndex, DistanceMetric, QuantizationLevel, VectorId,
|
|
VectorIndex, VectorIndexConfig,
|
|
};
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Generate a random unit vector of the given dimensionality.
|
|
fn random_unit_vector(dim: usize, rng: &mut impl Rng) -> Vec<f32> {
|
|
let v: Vec<f32> = (0..dim)
|
|
.map(|_| {
|
|
let x: f32 = rng.random();
|
|
x - 0.5
|
|
})
|
|
.collect();
|
|
let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
|
|
if norm < f32::EPSILON {
|
|
// Degenerate case: return a unit vector along the first axis.
|
|
let mut fallback = vec![0.0_f32; dim];
|
|
fallback[0] = 1.0;
|
|
return fallback;
|
|
}
|
|
v.iter().map(|x| x / norm).collect()
|
|
}
|
|
|
|
/// Build a brute-force index with `n` random unit vectors of dimension `dim`.
|
|
fn build_brute_index(n: u64, dim: usize) -> BruteForceIndex {
|
|
let config = VectorIndexConfig {
|
|
dimensions: dim,
|
|
metric: DistanceMetric::L2,
|
|
quantization: QuantizationLevel::F32,
|
|
connectivity: 16,
|
|
ef_construction: 200,
|
|
ef_search: 200,
|
|
};
|
|
let index = BruteForceIndex::new(config);
|
|
let mut rng = rand::rng();
|
|
for id in 0..n {
|
|
let vec = random_unit_vector(dim, &mut rng);
|
|
index.insert(id, &vec).unwrap();
|
|
}
|
|
index
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Benchmarks
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Benchmark: unfiltered ANN search over 10K vectors, dim=128, k=100.
|
|
/// Measures baseline search latency without any filter overhead.
|
|
fn bench_ann_search_unfiltered(c: &mut Criterion) {
|
|
let dim = 128;
|
|
let n = 10_000_u64;
|
|
let index = build_brute_index(n, dim);
|
|
let planner = AdaptiveQueryPlanner::with_defaults();
|
|
|
|
let mut rng = rand::rng();
|
|
let query = random_unit_vector(dim, &mut rng);
|
|
|
|
c.bench_function("ann_search_unfiltered_10k", |b| {
|
|
b.iter(|| {
|
|
planner
|
|
.execute(
|
|
black_box(&index),
|
|
black_box(&query),
|
|
black_box(100),
|
|
None,
|
|
1.0,
|
|
None,
|
|
)
|
|
.unwrap()
|
|
});
|
|
});
|
|
}
|
|
|
|
/// Benchmark: filtered ANN search with 20% selectivity (in-graph filter).
|
|
/// 10K vectors, dim=128, k=100.
|
|
fn bench_ann_search_filtered_20pct(c: &mut Criterion) {
|
|
let dim = 128;
|
|
let n = 10_000_u64;
|
|
let index = build_brute_index(n, dim);
|
|
let planner = AdaptiveQueryPlanner::with_defaults();
|
|
|
|
let mut rng = rand::rng();
|
|
let query = random_unit_vector(dim, &mut rng);
|
|
|
|
// ~20% selectivity: IDs 0..1999 pass (20% of 10K).
|
|
let filter = |id: VectorId| id < 2000;
|
|
|
|
c.bench_function("ann_search_filtered_20pct_10k", |b| {
|
|
b.iter(|| {
|
|
planner
|
|
.execute(
|
|
black_box(&index),
|
|
black_box(&query),
|
|
black_box(100),
|
|
Some(black_box(&filter)),
|
|
0.20,
|
|
None,
|
|
)
|
|
.unwrap()
|
|
});
|
|
});
|
|
}
|
|
|
|
/// Benchmark: filtered ANN search with 5% selectivity (widened filter, ef=400).
|
|
/// 10K vectors, dim=128, k=100.
|
|
fn bench_ann_search_filtered_5pct(c: &mut Criterion) {
|
|
let dim = 128;
|
|
let n = 10_000_u64;
|
|
let index = build_brute_index(n, dim);
|
|
let planner = AdaptiveQueryPlanner::with_defaults();
|
|
|
|
let mut rng = rand::rng();
|
|
let query = random_unit_vector(dim, &mut rng);
|
|
|
|
// ~5% selectivity: IDs 0..499 pass (5% of 10K).
|
|
let filter = |id: VectorId| id < 500;
|
|
|
|
c.bench_function("ann_search_filtered_5pct_10k", |b| {
|
|
b.iter(|| {
|
|
planner
|
|
.execute(
|
|
black_box(&index),
|
|
black_box(&query),
|
|
black_box(100),
|
|
Some(black_box(&filter)),
|
|
0.05,
|
|
None,
|
|
)
|
|
.unwrap()
|
|
});
|
|
});
|
|
}
|
|
|
|
/// Benchmark: pre-filter brute-force search with 0.5% selectivity.
|
|
/// 10K vectors, dim=128, k=100. Uses a separate brute-force index.
|
|
fn bench_ann_search_brute_force(c: &mut Criterion) {
|
|
let dim = 128;
|
|
let n = 10_000_u64;
|
|
let index = build_brute_index(n, dim);
|
|
let brute = build_brute_index(n, dim);
|
|
let planner = AdaptiveQueryPlanner::with_defaults();
|
|
|
|
let mut rng = rand::rng();
|
|
let query = random_unit_vector(dim, &mut rng);
|
|
|
|
// ~0.5% selectivity: IDs 0..49 pass (0.5% of 10K).
|
|
let filter = |id: VectorId| id < 50;
|
|
|
|
c.bench_function("ann_search_brute_force_10k", |b| {
|
|
b.iter(|| {
|
|
planner
|
|
.execute(
|
|
black_box(&index),
|
|
black_box(&query),
|
|
black_box(100),
|
|
Some(black_box(&filter)),
|
|
0.005,
|
|
Some(black_box(&brute as &dyn VectorIndex)),
|
|
)
|
|
.unwrap()
|
|
});
|
|
});
|
|
}
|
|
|
|
/// Benchmark: recall@100 measurement.
|
|
/// Builds a 10K brute-force index, runs search, and compares against
|
|
/// ground truth (which for brute-force is exact). This benchmarks the
|
|
/// search + comparison loop to establish a baseline measurement cost.
|
|
fn bench_ann_recall_at_100(c: &mut Criterion) {
|
|
let dim = 128;
|
|
let n = 10_000_u64;
|
|
let k = 100;
|
|
let index = build_brute_index(n, dim);
|
|
|
|
let mut rng = rand::rng();
|
|
let query = random_unit_vector(dim, &mut rng);
|
|
|
|
// Pre-compute ground truth.
|
|
let ground_truth = index.search(&query, k, 200).unwrap();
|
|
let gt_ids: Vec<VectorId> = ground_truth.iter().map(|r| r.id).collect();
|
|
|
|
c.bench_function("ann_recall_at_100_10k", |b| {
|
|
b.iter(|| {
|
|
let results = index
|
|
.search(black_box(&query), black_box(k), black_box(200))
|
|
.unwrap();
|
|
let result_ids: Vec<VectorId> = results.iter().map(|r| r.id).collect();
|
|
|
|
// Compute recall: fraction of ground truth IDs found in results.
|
|
let hits = result_ids.iter().filter(|id| gt_ids.contains(id)).count();
|
|
#[allow(clippy::cast_precision_loss)]
|
|
let recall = hits as f64 / gt_ids.len() as f64;
|
|
black_box(recall)
|
|
});
|
|
});
|
|
}
|
|
|
|
/// Benchmark: single vector insert into a pre-filled 10K index.
|
|
fn bench_ann_insert_single(c: &mut Criterion) {
|
|
let dim = 128;
|
|
let n = 10_000_u64;
|
|
let index = build_brute_index(n, dim);
|
|
|
|
let mut rng = rand::rng();
|
|
let vec = random_unit_vector(dim, &mut rng);
|
|
|
|
// Use an ID outside the pre-filled range to avoid replacement overhead.
|
|
let mut next_id = n;
|
|
|
|
c.bench_function("ann_insert_single_10k", |b| {
|
|
b.iter(|| {
|
|
index.insert(black_box(next_id), black_box(&vec)).unwrap();
|
|
next_id += 1;
|
|
});
|
|
});
|
|
}
|
|
|
|
/// Benchmark: single vector delete from a pre-filled 10K index.
|
|
/// After each delete, re-inserts the vector so the bench remains iterable.
|
|
fn bench_ann_delete_single(c: &mut Criterion) {
|
|
let dim = 128;
|
|
let n = 10_000_u64;
|
|
let index = build_brute_index(n, dim);
|
|
|
|
let mut rng = rand::rng();
|
|
let vec = random_unit_vector(dim, &mut rng);
|
|
|
|
// Target a fixed ID for delete/reinsert cycle.
|
|
let target_id = 0_u64;
|
|
|
|
c.bench_function("ann_delete_single_10k", |b| {
|
|
b.iter(|| {
|
|
index.delete(black_box(target_id)).unwrap();
|
|
// Re-insert so the next iteration can delete it again.
|
|
index.insert(black_box(target_id), black_box(&vec)).unwrap();
|
|
});
|
|
});
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Criterion group + main
|
|
// ---------------------------------------------------------------------------
|
|
|
|
criterion_group!(
|
|
benches,
|
|
bench_ann_search_unfiltered,
|
|
bench_ann_search_filtered_20pct,
|
|
bench_ann_search_filtered_5pct,
|
|
bench_ann_search_brute_force,
|
|
bench_ann_recall_at_100,
|
|
bench_ann_insert_single,
|
|
bench_ann_delete_single,
|
|
);
|
|
criterion_main!(benches);
|