tidaldb/tidal/tests/text_index.rs
jordan 192c473f55 feat: complete Milestone 5 — full-text search, RRF fusion, and creator search
- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs)
- M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates)
- M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking)
- M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators)
- Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.)
- Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.)
- Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers
- Add benches: fusion, search, session, text_index
- Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index)
- Update blog posts, roadmap, content strategy, and M5 planning docs
- Add tmp/ and .claude/worktrees/ to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 23:53:16 -07:00

178 lines
5.8 KiB
Rust

#![allow(clippy::unwrap_used)]
//! m5p1 Text Index end-to-end integration test.
//!
//! Validates the full BM25 pipeline: schema declaration → index → write →
//! commit → query parse → search → score. Uses an ephemeral in-RAM index so
//! no disk I/O is required.
use std::collections::HashMap;
use tidaldb::schema::{EntityId, TextFieldDef, TextFieldType};
use tidaldb::text::{AllScoresCollector, TextIndex};
fn make_fields() -> Vec<TextFieldDef> {
vec![
TextFieldDef {
key: "title".into(),
field_type: TextFieldType::Text,
},
TextFieldDef {
key: "description".into(),
field_type: TextFieldType::Text,
},
TextFieldDef {
key: "category".into(),
field_type: TextFieldType::Keyword,
},
]
}
/// Exercises the complete m5p1 text index pipeline in a single pass:
/// index → write → commit → search → score
#[test]
fn text_index_end_to_end() {
    let schema = make_fields();
    let idx = TextIndex::ephemeral(&schema).unwrap();

    // Populate the index with 100 items. The writer guard is released when
    // this scope ends, i.e. before the reader is reloaded below.
    {
        let mut writer = idx.writer_guard().unwrap();
        for n in 0..100u64 {
            let meta = HashMap::from([
                ("title".into(), format!("Rust tutorial {n}")),
                ("description".into(), "Learn Rust programming".into()),
                ("category".into(), "programming".into()),
            ]);
            writer.index_item(EntityId::new(n), &meta).unwrap();
        }
        writer.commit(100).unwrap();
    }
    idx.reload_reader().unwrap();

    let searcher = idx.searcher();
    let parser = idx.query_parser();
    let collector = AllScoresCollector {
        entity_id_field: idx.fields().entity_id,
    };

    // Parses `text` and runs it against the searcher, panicking on any error.
    let run = |text: &str| {
        let query = parser.parse(text).unwrap();
        searcher.search(query.as_ref(), &collector).unwrap()
    };

    // Test 1: bare terms (AND conjunction) — every title contains "Rust tutorial".
    let bare_hits = run("Rust tutorial");
    assert!(!bare_hits.is_empty(), "bare terms should return results");

    // Test 2: exact phrase — "Rust programming" appears in every description.
    let phrase_hits = run("\"Rust programming\"");
    assert!(
        !phrase_hits.is_empty(),
        "exact phrase should match description"
    );

    // Test 3: field-scoped keyword — category:programming matches all 100.
    let keyword_hits = run("category:programming");
    assert_eq!(
        keyword_hits.len(),
        100,
        "keyword field-scoped query should match all 100"
    );

    // Test 4: exclusion — "foobarxyz" appears nowhere in the corpus, so the
    // MUST_NOT clause excludes no documents and "Rust" still matches.
    let exclusion_hits = run("Rust -foobarxyz");
    assert!(
        !exclusion_hits.is_empty(),
        "exclusion of absent term should still return matching documents"
    );

    // Test 5: BM25 latency < 10ms at 100 docs (trivial at this scale).
    // The timed region covers both query parsing and the search itself.
    let started = std::time::Instant::now();
    let _ = run("Rust");
    let elapsed = started.elapsed();
    assert!(
        elapsed < std::time::Duration::from_millis(10),
        "BM25 query should complete in < 10ms at 100 docs"
    );
}
/// An OR query over two terms yields at least as many hits as the AND query
/// over the same terms (checked by result counts).
#[test]
fn boolean_or_returns_superset_of_and() {
    let schema = vec![TextFieldDef {
        key: "title".into(),
        field_type: TextFieldType::Text,
    }];
    let idx = TextIndex::ephemeral(&schema).unwrap();

    // Index three titles; the writer guard is released at the end of the
    // scope, before the reader is reloaded.
    {
        let mut writer = idx.writer_guard().unwrap();
        let corpus = [
            (1u64, "jazz piano"),
            (2u64, "rock guitar"),
            (3u64, "jazz violin"),
        ];
        for (id, title) in corpus {
            let doc = HashMap::from([("title".into(), title.into())]);
            writer.index_item(EntityId::new(id), &doc).unwrap();
        }
        writer.commit(3).unwrap();
    }
    idx.reload_reader().unwrap();

    let searcher = idx.searcher();
    let parser = idx.query_parser();
    let collector = AllScoresCollector {
        entity_id_field: idx.fields().entity_id,
    };

    // AND: "jazz piano" requires both terms — only entity 1.
    let and_query = parser.parse("jazz piano").unwrap();
    let and_count = searcher
        .search(and_query.as_ref(), &collector)
        .unwrap()
        .len();

    // OR: "jazz OR piano" — entities 1 and 3.
    let or_query = parser.parse("jazz OR piano").unwrap();
    let or_count = searcher
        .search(or_query.as_ref(), &collector)
        .unwrap()
        .len();

    assert!(
        or_count >= and_count,
        "OR should return at least as many results as AND"
    );
    assert_eq!(
        and_count, 1,
        "AND requires both 'jazz' and 'piano' — only entity 1"
    );
    assert_eq!(or_count, 2, "OR jazz or piano — entities 1 and 3");
}
/// Once an item has been deleted and the deletion committed, it no longer
/// shows up in search results.
#[test]
fn delete_removes_from_results() {
    let schema = vec![TextFieldDef {
        key: "title".into(),
        field_type: TextFieldType::Text,
    }];
    let idx = TextIndex::ephemeral(&schema).unwrap();

    // Index one item and commit it, then delete it and commit again. The
    // writer guard is released at the end of the scope, before the reload.
    {
        let mut writer = idx.writer_guard().unwrap();
        let doc = HashMap::from([("title".into(), "jazz piano".into())]);
        writer.index_item(EntityId::new(1), &doc).unwrap();
        writer.commit(1).unwrap();

        // Delete and commit.
        writer.delete_item(EntityId::new(1));
        writer.commit(2).unwrap();
    }
    idx.reload_reader().unwrap();

    let searcher = idx.searcher();
    let parser = idx.query_parser();
    let collector = AllScoresCollector {
        entity_id_field: idx.fields().entity_id,
    };

    // The only document containing "jazz" was deleted, so nothing may match.
    let query = parser.parse("jazz").unwrap();
    let hits = searcher.search(query.as_ref(), &collector).unwrap();
    assert!(
        hits.is_empty(),
        "deleted item should not appear in results"
    );
}