tidaldb/tidal/benches/text_index.rs
jordan 192c473f55 feat: complete Milestone 5 — full-text search, RRF fusion, and creator search
- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs)
- M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates)
- M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking)
- M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators)
- Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.)
- Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.)
- Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers
- Add benches: fusion, search, session, text_index
- Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index)
- Update blog posts, roadmap, content strategy, and M5 planning docs
- Add tmp/ and .claude/worktrees/ to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 23:53:16 -07:00

136 lines
4.1 KiB
Rust

#![allow(clippy::unwrap_used)]
//! Criterion benchmarks for the BM25 text index pipeline.
//!
//! Measures BM25 query latency at various corpus sizes to validate the
//! < 10ms target at 10K documents specified in the m5p1 phase acceptance criteria.
use std::collections::HashMap;
use criterion::{Criterion, black_box, criterion_group, criterion_main};
use tidaldb::schema::{EntityId, TextFieldDef, TextFieldType};
use tidaldb::text::{AllScoresCollector, TextIndex};
/// Builds a [`TextIndex`] pre-populated with `n` synthetic documents.
///
/// The index is created through `TextIndex::ephemeral` with three fields:
/// `title` and `description` (both `TextFieldType::Text`) and `category`
/// (`TextFieldType::Keyword`). Documents are indexed under entity ids `0..n`,
/// committed in a single batch, and the reader is reloaded before the index is
/// returned (presumably so the committed documents are visible to searchers —
/// confirm against `TextIndex::reload_reader`).
///
/// # Panics
///
/// Panics if index creation, writer acquisition, indexing, commit, or the
/// reader reload fails; acceptable here because this is benchmark setup.
fn make_index(n: u64) -> TextIndex {
    let schema = vec![
        TextFieldDef {
            key: "title".into(),
            field_type: TextFieldType::Text,
        },
        TextFieldDef {
            key: "description".into(),
            field_type: TextFieldType::Text,
        },
        TextFieldDef {
            key: "category".into(),
            field_type: TextFieldType::Keyword,
        },
    ];
    let index = TextIndex::ephemeral(&schema).unwrap();

    // Keep the writer guard in its own scope so it is released before the
    // reader is reloaded below.
    {
        let mut writer = index.writer_guard().unwrap();
        for doc_id in 0..n {
            // Even ids are tagged "programming", odd ids "systems", so the
            // keyword field carries two distinct values across the corpus.
            let category = match doc_id % 2 {
                0 => "programming",
                _ => "systems",
            };
            let meta = HashMap::from([
                // The id is embedded in the title so titles differ per
                // document and BM25 IDF scoring stays meaningful.
                (
                    "title".into(),
                    format!("Rust tutorial {doc_id} async concurrency"),
                ),
                (
                    "description".into(),
                    String::from(
                        "Learn Rust programming with practical examples and real projects.",
                    ),
                ),
                ("category".into(), category.to_owned()),
            ]);
            writer.index_item(EntityId::new(doc_id), &meta).unwrap();
        }
        // NOTE(review): `commit` receives `n`; its exact meaning (sequence
        // number vs. document count) is not visible from this file.
        writer.commit(n).unwrap();
    }

    index.reload_reader().unwrap();
    index
}
/// Benchmarks a bare-term BM25 query (`Rust async`) against a 1K-document index.
///
/// The index, searcher, parser, and collector are built once outside the timed
/// closure; each iteration measures query parsing plus the search itself.
fn bench_bm25_1k(c: &mut Criterion) {
    let index = make_index(1_000);
    let index_searcher = index.searcher();
    let query_parser = index.query_parser();
    let all_scores = AllScoresCollector {
        entity_id_field: index.fields().entity_id,
    };
    let query_text = "Rust async";

    c.bench_function("bm25_query_1k_docs", |bencher| {
        bencher.iter(|| {
            let query = query_parser.parse(black_box(query_text)).unwrap();
            // Route the hits through `black_box` so the search is not optimized away.
            black_box(index_searcher.search(query.as_ref(), &all_scores).unwrap())
        });
    });
}
/// Benchmarks a bare-term BM25 query (`Rust async`) against a 10K-document index.
///
/// This is the case the < 10ms latency target applies to. The target is not
/// asserted here; it has to be checked against Criterion's reported timings.
/// Each iteration measures query parsing plus the search itself.
fn bench_bm25_10k(c: &mut Criterion) {
    let index = make_index(10_000);
    let index_searcher = index.searcher();
    let query_parser = index.query_parser();
    let all_scores = AllScoresCollector {
        entity_id_field: index.fields().entity_id,
    };
    let query_text = "Rust async";

    c.bench_function("bm25_query_10k_docs", |bencher| {
        bencher.iter(|| {
            let query = query_parser.parse(black_box(query_text)).unwrap();
            // Route the hits through `black_box` so the search is not optimized away.
            black_box(index_searcher.search(query.as_ref(), &all_scores).unwrap())
        });
    });
}
/// Benchmarks a quoted exact-phrase BM25 query (`"Rust programming"`) against a
/// 10K-document index.
///
/// Setup happens once outside the timed closure; each iteration measures query
/// parsing plus the search itself.
fn bench_bm25_phrase_10k(c: &mut Criterion) {
    let index = make_index(10_000);
    let index_searcher = index.searcher();
    let query_parser = index.query_parser();
    let all_scores = AllScoresCollector {
        entity_id_field: index.fields().entity_id,
    };
    // The embedded double quotes are part of the query text passed to the parser.
    let query_text = "\"Rust programming\"";

    c.bench_function("bm25_phrase_10k_docs", |bencher| {
        bencher.iter(|| {
            let query = query_parser.parse(black_box(query_text)).unwrap();
            // Route the hits through `black_box` so the search is not optimized away.
            black_box(index_searcher.search(query.as_ref(), &all_scores).unwrap())
        });
    });
}
/// Benchmarks a field-scoped query on the keyword field (`category:programming`)
/// against a 10K-document index.
///
/// Setup happens once outside the timed closure; each iteration measures query
/// parsing plus the search itself.
fn bench_bm25_keyword_10k(c: &mut Criterion) {
    let index = make_index(10_000);
    let index_searcher = index.searcher();
    let query_parser = index.query_parser();
    let all_scores = AllScoresCollector {
        entity_id_field: index.fields().entity_id,
    };
    let query_text = "category:programming";

    c.bench_function("bm25_keyword_10k_docs", |bencher| {
        bencher.iter(|| {
            let query = query_parser.parse(black_box(query_text)).unwrap();
            // Route the hits through `black_box` so the search is not optimized away.
            black_box(index_searcher.search(query.as_ref(), &all_scores).unwrap())
        });
    });
}
// Collect every BM25 benchmark function in this file into one Criterion group
// named `bm25_benches`.
criterion_group!(
bm25_benches,
bench_bm25_1k,
bench_bm25_10k,
bench_bm25_phrase_10k,
bench_bm25_keyword_10k
);
// Generate the benchmark entry point that runs the `bm25_benches` group.
criterion_main!(bm25_benches);