tidaldb/tidal/tests/m5_search.rs
jordan 192c473f55 feat: complete Milestone 5 — full-text search, RRF fusion, and creator search
- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs)
- M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates)
- M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking)
- M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators)
- Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.)
- Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.)
- Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers
- Add benches: fusion, search, session, text_index
- Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index)
- Update blog posts, roadmap, content strategy, and M5 planning docs
- Add tmp/ and .claude/worktrees/ to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 23:53:16 -07:00

372 lines
12 KiB
Rust

#![allow(clippy::unwrap_used)]
//! m5p3 SEARCH Query end-to-end integration test (UAT).
//!
//! Validates the full SEARCH pipeline: schema declaration → item writes →
//! text index flush → BM25 retrieval → profile scoring → result assembly.
//! Also validates `search_click` as a positive engagement signal.
//!
//! # UAT Scenario
//!
//! ```text
//! Given: A database with 1000 indexed items (title, description)
//! When: db.search(Search { query: "Rust tutorial" })
//! Then: Returns non-empty SearchResults with BM25 scores
//! And: Items matching the query appear before non-matching items
//! ```
use std::collections::HashMap;
use std::time::Duration;
use tidaldb::TidalDb;
use tidaldb::query::search::Search;
use tidaldb::schema::{
DecaySpec, EntityId, EntityKind, SchemaBuilder, TextFieldType, Timestamp, Window,
};
// ── Schema and fixture helpers ───────────────────────────────────────────────
/// Builds the schema shared by every test in this file.
///
/// Registers three item-level signals — `view`, `like` and `search_click` —
/// each with exponential decay, a single 24-hour window and velocity
/// disabled, plus three text fields: `title` and `description` as `Text`
/// and `category` as `Keyword`.
///
/// # Panics
///
/// Panics if `SchemaBuilder::build` returns an error.
fn search_schema() -> tidaldb::schema::Schema {
    const SECS_PER_DAY: u64 = 24 * 3600;

    // (signal name, half-life in days); signals are registered in this order.
    let signals = [("view", 7_u64), ("like", 30), ("search_click", 3)];

    let mut schema = SchemaBuilder::new();
    for (name, half_life_days) in signals {
        let _ = schema
            .signal(
                name,
                EntityKind::Item,
                DecaySpec::Exponential {
                    half_life: Duration::from_secs(half_life_days * SECS_PER_DAY),
                },
            )
            .windows(&[Window::TwentyFourHours])
            .velocity(false)
            .add();
    }

    // (field name, field type) for the text index.
    let text_fields = [
        ("title", TextFieldType::Text),
        ("description", TextFieldType::Text),
        ("category", TextFieldType::Keyword),
    ];
    for (field, kind) in text_fields {
        schema.text_field(field, kind);
    }

    schema.build().unwrap()
}
/// Opens an ephemeral `TidalDb` using [`search_schema`], writes `n` items
/// with text metadata, and makes them visible to the text index reader.
///
/// Corpus layout:
///
/// * IDs `0..500`: title `"Rust tutorial {id}"`, category `programming`
///   (the matching corpus).
/// * IDs `500..n`: title `"Python machine learning {id}"`, category
///   `data-science` (the non-matching corpus).
/// * IDs `0..100` additionally receive one `view` signal of `1.0`, so that
///   profile scoring is non-trivial.
///
/// The background text syncer commits in batches of 1000 documents, which is
/// why `n` must be at least 1000. After the writes, the function sleeps for
/// 500ms to let the syncer drain its channel, then calls
/// `reload_text_index()` so the reader sees the committed documents.
///
/// # Panics
///
/// Panics if `n < 1000`, or if opening the database, writing an item,
/// recording a signal, or reloading the text index fails.
fn make_db(n: u64) -> TidalDb {
    // IDs below this bound form the "Rust tutorial" corpus.
    const MATCHING_IDS: u64 = 500;
    // IDs below this bound also receive a `view` signal.
    const VIEWED_IDS: u64 = 100;

    assert!(n >= 1000, "n must be ≥ 1000 to trigger a batch commit");

    let db = TidalDb::builder()
        .ephemeral()
        .with_schema(search_schema())
        .open()
        .unwrap();
    let ts = Timestamp::now();

    for id in 0..n {
        let (title, description, category) = if id < MATCHING_IDS {
            (
                format!("Rust tutorial {id}"),
                "Learn Rust systems programming.",
                "programming",
            )
        } else {
            (
                format!("Python machine learning {id}"),
                "Machine learning with Python.",
                "data-science",
            )
        };
        let meta = HashMap::from([
            ("title".to_string(), title),
            ("description".to_string(), description.to_string()),
            ("category".to_string(), category.to_string()),
        ]);

        db.write_item_with_metadata(EntityId::new(id), &meta)
            .unwrap();

        if id < VIEWED_IDS {
            db.signal("view", EntityId::new(id), 1.0, ts).unwrap();
        }
    }

    // Give the background syncer time to drain its channel and commit
    // (it commits every 1000 items, so 1K items means one commit), then
    // refresh the reader so searches observe the committed documents.
    std::thread::sleep(Duration::from_millis(500));
    db.reload_text_index().unwrap();

    db
}
// ── Step 1: SearchBuilder ────────────────────────────────────────────────────
/// A builder with neither query text nor a query vector must refuse to build.
#[test]
fn step1_search_builder_requires_query() {
    assert!(
        Search::builder().build().is_err(),
        "build() without query_text or query_vector must fail"
    );
}
/// A text-only builder with no other options yields the documented defaults:
/// limit 20, the `search` profile, no filters and no user.
#[test]
fn step1_search_builder_defaults() {
    let search = Search::builder()
        .query("jazz")
        .build()
        .unwrap();

    assert_eq!(search.limit, 20);
    assert_eq!(search.profile.name, "search");
    assert!(search.filters.is_empty());
    assert!(search.for_user.is_none());
}
/// A query built from a vector alone is valid and carries no query text.
#[test]
fn step1_search_builder_vector_only() {
    let embedding = vec![0.1_f32; 4];
    let search = Search::builder()
        .vector(embedding)
        .limit(10)
        .build()
        .unwrap();

    assert!(search.query_text.is_none());
    assert!(search.query_vector.is_some());
    assert_eq!(search.limit, 10);
}
// ── Step 2: Text search returns results ──────────────────────────────────────
/// Searching the 1K-item fixture for "Rust tutorial" returns a non-empty
/// result set that respects the requested limit.
#[test]
fn step2_text_search_returns_results() {
    let db = make_db(1000);
    let search = Search::builder()
        .query("Rust tutorial")
        .limit(20)
        .build()
        .unwrap();

    let hits = db.search(&search).unwrap();

    assert!(
        !hits.is_empty(),
        "search for 'Rust tutorial' should return results"
    );
    assert!(hits.len() <= 20, "search results must not exceed limit");
}
// ── Step 3: BM25 scores are present in results ───────────────────────────────
/// Every hit of a text-only query carries a BM25 score and no semantic score.
#[test]
fn step3_bm25_scores_populated() {
    let db = make_db(1000);
    let search = Search::builder()
        .query("Rust systems")
        .limit(10)
        .build()
        .unwrap();

    let hits = db.search(&search).unwrap();
    assert!(
        !hits.is_empty(),
        "expected at least one result for 'Rust systems'"
    );

    // Text-only query: BM25 must be set, the vector-derived score must not.
    hits.items.iter().for_each(|hit| {
        assert!(
            hit.bm25_score.is_some(),
            "bm25_score should be populated for text-only search"
        );
        assert!(
            hit.semantic_score.is_none(),
            "no vector → no semantic_score"
        );
    });
}
// ── Step 4: Ranks are 1-based and sequential ─────────────────────────────────
/// Result ranks start at 1 and increase by one with each position.
#[test]
fn step4_ranks_are_sequential() {
    let db = make_db(1000);
    let search = Search::builder()
        .query("Rust")
        .limit(10)
        .build()
        .unwrap();

    let hits = db.search(&search).unwrap();
    assert!(!hits.is_empty(), "expected results");

    hits.items.iter().enumerate().for_each(|(i, hit)| {
        assert_eq!(
            hit.rank,
            i + 1,
            "rank should be 1-based and sequential at position {i}"
        );
    });
}
// ── Step 5: query_text that matches nothing returns empty results ─────────────
/// A query term that appears in no document yields an empty result set
/// rather than an error.
#[test]
fn step5_no_matching_query_returns_empty() {
    let db = make_db(1000);
    let nonsense = Search::builder()
        .query("xyzzy123foobarquux")
        .limit(20)
        .build()
        .unwrap();

    let hits = db.search(&nonsense).unwrap();

    assert!(
        hits.is_empty(),
        "non-matching query should return empty results"
    );
}
// ── Step 6: search_click is a positive engagement signal ─────────────────────
/// `search_click` is declared in the schema, so recording it succeeds and the
/// entity afterwards reports a positive decay score for that signal.
#[test]
fn step6_search_click_signal_recorded() {
    let db = TidalDb::builder()
        .ephemeral()
        .with_schema(search_schema())
        .open()
        .unwrap();
    let entity = EntityId::new(1);
    let ts = Timestamp::now();

    // search_click should succeed as a registered signal type.
    db.signal("search_click", entity, 1.0, ts).unwrap();

    // The signal should be readable as a decay score. A single pattern match
    // checks both "present" and "positive" without a separate unwrap.
    let score = db.read_decay_score(entity, "search_click", 0).unwrap();
    assert!(
        matches!(score, Some(s) if s > 0.0),
        "search_click should produce a positive decay score"
    );
}
// ── Step 7: search_click updates preference vector (positive engagement) ──────
/// Recording `search_click` through `signal_with_context` with a user id
/// succeeds and leaves a readable decay score on the item.
///
/// NOTE(review): despite the test name, the preference vector itself is not
/// inspected here; the only assertion is that the decay score is present.
#[test]
fn step7_search_click_updates_preference_vector() {
    let db = TidalDb::builder()
        .ephemeral()
        .with_schema(search_schema())
        .open()
        .unwrap();
    let user_id = 99_u64;
    let entity = EntityId::new(42);
    let ts = Timestamp::now();

    // The item carries a creator so there is preference state to update.
    let meta = HashMap::from([
        ("title".to_string(), "Rust embedded".to_string()),
        ("creator_id".to_string(), "1".to_string()),
    ]);
    db.write_item_with_metadata(entity, &meta).unwrap();

    // Supplying a user is what routes the signal through the
    // preference-update path.
    db.signal_with_context("search_click", entity, 1.0, ts, Some(user_id), None)
        .unwrap();

    let recorded = db.read_decay_score(entity, "search_click", 0).unwrap();
    assert!(recorded.is_some(), "search_click signal should be recorded");
}
// ── Step 8: Latency target < 50ms at 1K items ────────────────────────────────
/// A single text search against the 1K-item fixture finishes in under 50ms.
///
/// Only the `db.search` call is timed; fixture construction and query
/// building happen before the clock starts.
#[test]
fn step8_search_latency_under_50ms() {
    let db = make_db(1000);
    let search = Search::builder()
        .query("Rust tutorial")
        .limit(20)
        .build()
        .unwrap();

    let timer = std::time::Instant::now();
    // Bound to a name so the results are dropped after the clock is read.
    let _hits = db.search(&search).unwrap();
    let elapsed_ms = timer.elapsed().as_millis();

    assert!(
        elapsed_ms < 50,
        "search at 1K items should complete in < 50ms, got {}ms",
        elapsed_ms
    );
}
// ── Step 9: search with for_user doesn't panic ───────────────────────────────
/// A search scoped to a user with recorded `view` signals runs without
/// error and returns results.
#[test]
fn step9_personalized_search_executes() {
    let db = make_db(1000);
    let user_id = 7_u64;
    let ts = Timestamp::now();

    // Record ten views for the user so personalization has data to work with.
    for viewed in 0u64..10 {
        db.signal_with_context("view", EntityId::new(viewed), 1.0, ts, Some(user_id), None)
            .unwrap();
    }

    let search = Search::builder()
        .query("Rust")
        .for_user(user_id)
        .limit(20)
        .build()
        .unwrap();
    let hits = db.search(&search).unwrap();

    assert!(
        !hits.is_empty(),
        "personalized search should return results"
    );
}
// ── Step 10: search builtin profile is registered ────────────────────────────
/// The default `search` profile is available on a freshly opened database.
///
/// No items are written, so the result set may be empty; the test only
/// requires that the call does not fail (for example with a missing-profile
/// error).
#[test]
fn step10_search_profile_registered() {
    let db = TidalDb::builder()
        .ephemeral()
        .with_schema(search_schema())
        .open()
        .unwrap();

    // No explicit profile is set, so the builder default ("search") is used.
    let search = Search::builder()
        .query("anything")
        .limit(1)
        .build()
        .unwrap();

    let outcome = db.search(&search);
    assert!(
        outcome.is_ok(),
        "search with default 'search' profile must not fail: {:?}",
        outcome.err()
    );
}