- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs)
- M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates)
- M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking)
- M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators)
- Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.)
- Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.)
- Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers
- Add benches: fusion, search, session, text_index
- Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index)
- Update blog posts, roadmap, content strategy, and M5 planning docs
- Add tmp/ and .claude/worktrees/ to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
136 lines
4.1 KiB
Rust
136 lines
4.1 KiB
Rust
#![allow(clippy::unwrap_used)]
|
|
//! Criterion benchmarks for the BM25 text index pipeline.
//!
//! Measures BM25 query latency at 1K and 10K documents to validate the
//! < 10ms target at 10K documents specified in the m5p1 phase acceptance criteria.
|
|
|
|
use std::collections::HashMap;
|
|
|
|
use criterion::{Criterion, black_box, criterion_group, criterion_main};
|
|
use tidaldb::schema::{EntityId, TextFieldDef, TextFieldType};
|
|
use tidaldb::text::{AllScoresCollector, TextIndex};
|
|
|
|
fn make_index(n: u64) -> TextIndex {
|
|
let fields = vec![
|
|
TextFieldDef {
|
|
key: "title".into(),
|
|
field_type: TextFieldType::Text,
|
|
},
|
|
TextFieldDef {
|
|
key: "description".into(),
|
|
field_type: TextFieldType::Text,
|
|
},
|
|
TextFieldDef {
|
|
key: "category".into(),
|
|
field_type: TextFieldType::Keyword,
|
|
},
|
|
];
|
|
|
|
let idx = TextIndex::ephemeral(&fields).unwrap();
|
|
let mut w = idx.writer_guard().unwrap();
|
|
|
|
for i in 0..n {
|
|
let mut meta = HashMap::new();
|
|
// Vary titles so BM25 IDF scoring is meaningful.
|
|
meta.insert(
|
|
"title".into(),
|
|
format!("Rust tutorial {i} async concurrency"),
|
|
);
|
|
meta.insert(
|
|
"description".into(),
|
|
"Learn Rust programming with practical examples and real projects.".into(),
|
|
);
|
|
// Alternate categories to test keyword field throughput.
|
|
let cat = if i % 2 == 0 { "programming" } else { "systems" };
|
|
meta.insert("category".into(), cat.into());
|
|
w.index_item(EntityId::new(i), &meta).unwrap();
|
|
}
|
|
|
|
w.commit(n).unwrap();
|
|
drop(w);
|
|
|
|
idx.reload_reader().unwrap();
|
|
idx
|
|
}
|
|
|
|
/// BM25 bare-term query at 1K docs.
|
|
fn bench_bm25_1k(c: &mut Criterion) {
|
|
let idx = make_index(1_000);
|
|
let searcher = idx.searcher();
|
|
let parser = idx.query_parser();
|
|
let collector = AllScoresCollector {
|
|
entity_id_field: idx.fields().entity_id,
|
|
};
|
|
|
|
c.bench_function("bm25_query_1k_docs", |b| {
|
|
b.iter(|| {
|
|
let q = parser.parse(black_box("Rust async")).unwrap();
|
|
let results = searcher.search(q.as_ref(), &collector).unwrap();
|
|
black_box(results)
|
|
});
|
|
});
|
|
}
|
|
|
|
/// BM25 bare-term query at 10K docs — must complete in < 10ms.
|
|
fn bench_bm25_10k(c: &mut Criterion) {
|
|
let idx = make_index(10_000);
|
|
let searcher = idx.searcher();
|
|
let parser = idx.query_parser();
|
|
let collector = AllScoresCollector {
|
|
entity_id_field: idx.fields().entity_id,
|
|
};
|
|
|
|
c.bench_function("bm25_query_10k_docs", |b| {
|
|
b.iter(|| {
|
|
let q = parser.parse(black_box("Rust async")).unwrap();
|
|
let results = searcher.search(q.as_ref(), &collector).unwrap();
|
|
black_box(results)
|
|
});
|
|
});
|
|
}
|
|
|
|
/// BM25 exact-phrase query at 10K docs.
|
|
fn bench_bm25_phrase_10k(c: &mut Criterion) {
|
|
let idx = make_index(10_000);
|
|
let searcher = idx.searcher();
|
|
let parser = idx.query_parser();
|
|
let collector = AllScoresCollector {
|
|
entity_id_field: idx.fields().entity_id,
|
|
};
|
|
|
|
c.bench_function("bm25_phrase_10k_docs", |b| {
|
|
b.iter(|| {
|
|
let q = parser.parse(black_box("\"Rust programming\"")).unwrap();
|
|
let results = searcher.search(q.as_ref(), &collector).unwrap();
|
|
black_box(results)
|
|
});
|
|
});
|
|
}
|
|
|
|
/// BM25 keyword field-scoped query at 10K docs.
|
|
fn bench_bm25_keyword_10k(c: &mut Criterion) {
|
|
let idx = make_index(10_000);
|
|
let searcher = idx.searcher();
|
|
let parser = idx.query_parser();
|
|
let collector = AllScoresCollector {
|
|
entity_id_field: idx.fields().entity_id,
|
|
};
|
|
|
|
c.bench_function("bm25_keyword_10k_docs", |b| {
|
|
b.iter(|| {
|
|
let q = parser.parse(black_box("category:programming")).unwrap();
|
|
let results = searcher.search(q.as_ref(), &collector).unwrap();
|
|
black_box(results)
|
|
});
|
|
});
|
|
}
|
|
|
|
criterion_group!(
|
|
bm25_benches,
|
|
bench_bm25_1k,
|
|
bench_bm25_10k,
|
|
bench_bm25_phrase_10k,
|
|
bench_bm25_keyword_10k
|
|
);
|
|
criterion_main!(bm25_benches);
|