- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs) - M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates) - M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking) - M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators) - Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.) - Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.) - Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers - Add benches: fusion, search, session, text_index - Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index) - Update blog posts, roadmap, content strategy, and M5 planning docs - Add tmp/ and .claude/worktrees/ to .gitignore Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
178 lines
5.8 KiB
Rust
178 lines
5.8 KiB
Rust
#![allow(clippy::unwrap_used)]
|
|
//! m5p1 Text Index end-to-end integration test.
|
|
//!
|
|
//! Validates the full BM25 pipeline: schema declaration → index → write →
|
|
//! commit → query parse → search → score. Uses an ephemeral in-RAM index so
|
|
//! no disk I/O is required.
|
|
|
|
use std::collections::HashMap;
|
|
|
|
use tidaldb::schema::{EntityId, TextFieldDef, TextFieldType};
|
|
use tidaldb::text::{AllScoresCollector, TextIndex};
|
|
|
|
fn make_fields() -> Vec<TextFieldDef> {
|
|
vec![
|
|
TextFieldDef {
|
|
key: "title".into(),
|
|
field_type: TextFieldType::Text,
|
|
},
|
|
TextFieldDef {
|
|
key: "description".into(),
|
|
field_type: TextFieldType::Text,
|
|
},
|
|
TextFieldDef {
|
|
key: "category".into(),
|
|
field_type: TextFieldType::Keyword,
|
|
},
|
|
]
|
|
}
|
|
|
|
/// Exercises the m5p1 text index pipeline from start to finish:
/// index → write → commit → search → score
///
/// Indexes 100 documents into an ephemeral index, then runs five query
/// shapes against it: bare terms, exact phrase, field-scoped keyword,
/// exclusion, and a single timed query.
#[test]
fn text_index_end_to_end() {
    let schema_fields = make_fields();
    let index = TextIndex::ephemeral(&schema_fields).unwrap();

    // Write 100 items; the writer guard is released at the end of this scope.
    {
        let mut writer = index.writer_guard().unwrap();
        for id in 0..100u64 {
            let doc = HashMap::from([
                ("title".into(), format!("Rust tutorial {id}")),
                ("description".into(), "Learn Rust programming".into()),
                ("category".into(), "programming".into()),
            ]);
            writer.index_item(EntityId::new(id), &doc).unwrap();
        }
        writer.commit(100).unwrap();
    }

    index.reload_reader().unwrap();
    let searcher = index.searcher();
    let parser = index.query_parser();

    let collector = AllScoresCollector {
        entity_id_field: index.fields().entity_id,
    };

    // Parse a query string and collect every scored hit for it.
    let run_query = |text: &str| {
        let query = parser.parse(text).unwrap();
        searcher.search(query.as_ref(), &collector).unwrap()
    };

    // Test 1: bare terms (AND conjunction) — "Rust tutorial" matches all 100.
    let bare_hits = run_query("Rust tutorial");
    assert!(!bare_hits.is_empty(), "bare terms should return results");

    // Test 2: exact phrase — "Rust programming" is in every description.
    let phrase_hits = run_query("\"Rust programming\"");
    assert!(
        !phrase_hits.is_empty(),
        "exact phrase should match description"
    );

    // Test 3: field-scoped keyword — category:programming matches all 100.
    let keyword_hits = run_query("category:programming");
    assert_eq!(
        keyword_hits.len(),
        100,
        "keyword field-scoped query should match all 100"
    );

    // Test 4: exclusion — "foobarxyz" appears nowhere in the corpus, so the
    // MUST_NOT clause excludes no documents and "Rust" still matches.
    let exclusion_hits = run_query("Rust -foobarxyz");
    assert!(
        !exclusion_hits.is_empty(),
        "exclusion of absent term should still return matching documents"
    );

    // Test 5: BM25 latency < 10ms at 100 docs (trivial at this scale).
    // The timed region covers both query parsing and the search itself.
    let started = std::time::Instant::now();
    let _ = run_query("Rust");
    assert!(
        started.elapsed() < std::time::Duration::from_millis(10),
        "BM25 query should complete in < 10ms at 100 docs"
    );
}
|
|
|
|
/// Boolean OR returns at least as many results as AND for the same terms.
///
/// Corpus: three titles ("jazz piano", "rock guitar", "jazz violin").
/// The test compares hit counts for "jazz piano" and "jazz OR piano".
#[test]
fn boolean_or_returns_superset_of_and() {
    let schema_fields = vec![TextFieldDef {
        key: "title".into(),
        field_type: TextFieldType::Text,
    }];
    let index = TextIndex::ephemeral(&schema_fields).unwrap();

    // Index the corpus; the writer guard is released at the end of this scope.
    {
        let mut writer = index.writer_guard().unwrap();
        let corpus = [
            (1u64, "jazz piano"),
            (2u64, "rock guitar"),
            (3u64, "jazz violin"),
        ];
        for (id, title) in corpus {
            let doc = HashMap::from([("title".into(), title.into())]);
            writer.index_item(EntityId::new(id), &doc).unwrap();
        }
        writer.commit(3).unwrap();
    }

    index.reload_reader().unwrap();
    let searcher = index.searcher();
    let parser = index.query_parser();
    let collector = AllScoresCollector {
        entity_id_field: index.fields().entity_id,
    };

    // Parse a query string and report how many hits it produced.
    let count_hits = |text: &str| {
        let query = parser.parse(text).unwrap();
        searcher.search(query.as_ref(), &collector).unwrap().len()
    };

    // AND: "jazz piano" requires both terms — only entity 1.
    let and_count = count_hits("jazz piano");

    // OR: "jazz OR piano" — entities 1 and 3.
    let or_count = count_hits("jazz OR piano");

    assert!(
        or_count >= and_count,
        "OR should return at least as many results as AND"
    );
    assert_eq!(
        and_count, 1,
        "AND requires both 'jazz' and 'piano' — only entity 1"
    );
    assert_eq!(or_count, 2, "OR jazz or piano — entities 1 and 3");
}
|
|
|
|
/// Deleting an item removes it from search results after next commit.
///
/// Indexes a single document, commits, deletes it, commits again, and then
/// checks that a query for one of its terms returns nothing.
#[test]
fn delete_removes_from_results() {
    let schema_fields = vec![TextFieldDef {
        key: "title".into(),
        field_type: TextFieldType::Text,
    }];
    let index = TextIndex::ephemeral(&schema_fields).unwrap();

    let doc = HashMap::from([("title".into(), "jazz piano".into())]);
    let target = 1;

    // Two commits: the first makes the item visible, the second applies the
    // delete. The writer guard is released at the end of this scope.
    {
        let mut writer = index.writer_guard().unwrap();
        writer.index_item(EntityId::new(target), &doc).unwrap();
        writer.commit(1).unwrap();

        // Delete and commit.
        writer.delete_item(EntityId::new(target));
        writer.commit(2).unwrap();
    }

    index.reload_reader().unwrap();
    let searcher = index.searcher();
    let parser = index.query_parser();
    let collector = AllScoresCollector {
        entity_id_field: index.fields().entity_id,
    };

    let query = parser.parse("jazz").unwrap();
    let hits = searcher.search(query.as_ref(), &collector).unwrap();
    assert!(
        hits.is_empty(),
        "deleted item should not appear in results"
    );
}
|