- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs) - M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates) - M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking) - M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators) - Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.) - Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.) - Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers - Add benches: fusion, search, session, text_index - Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index) - Update blog posts, roadmap, content strategy, and M5 planning docs - Add tmp/ and .claude/worktrees/ to .gitignore Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
372 lines
12 KiB
Rust
372 lines
12 KiB
Rust
#![allow(clippy::unwrap_used)]
|
|
//! m5p3 SEARCH Query end-to-end integration test (UAT).
//!
//! Validates the full SEARCH pipeline: schema declaration → item writes →
//! text index flush → BM25 retrieval → profile scoring → result assembly.
//! Also validates `search_click` as a positive engagement signal.
//!
//! # UAT Scenario
//!
//! ```text
//! Given: A database with 1000 indexed items (title, description, category)
//! When:  db.search(&Search::builder().query("Rust tutorial").build()?)
//! Then:  Returns non-empty SearchResults with BM25 scores
//! And:   Items matching the query appear before non-matching items
//! ```
|
|
|
|
use std::collections::HashMap;
|
|
use std::time::Duration;
|
|
|
|
use tidaldb::TidalDb;
|
|
use tidaldb::query::search::Search;
|
|
use tidaldb::schema::{
|
|
DecaySpec, EntityId, EntityKind, SchemaBuilder, TextFieldType, Timestamp, Window,
|
|
};
|
|
|
|
// ── Schema and fixture helpers ───────────────────────────────────────────────
|
|
|
|
fn search_schema() -> tidaldb::schema::Schema {
|
|
let mut builder = SchemaBuilder::new();
|
|
let _ = builder
|
|
.signal(
|
|
"view",
|
|
EntityKind::Item,
|
|
DecaySpec::Exponential {
|
|
half_life: Duration::from_secs(7 * 24 * 3600),
|
|
},
|
|
)
|
|
.windows(&[Window::TwentyFourHours])
|
|
.velocity(false)
|
|
.add();
|
|
let _ = builder
|
|
.signal(
|
|
"like",
|
|
EntityKind::Item,
|
|
DecaySpec::Exponential {
|
|
half_life: Duration::from_secs(30 * 24 * 3600),
|
|
},
|
|
)
|
|
.windows(&[Window::TwentyFourHours])
|
|
.velocity(false)
|
|
.add();
|
|
let _ = builder
|
|
.signal(
|
|
"search_click",
|
|
EntityKind::Item,
|
|
DecaySpec::Exponential {
|
|
half_life: Duration::from_secs(3 * 24 * 3600),
|
|
},
|
|
)
|
|
.windows(&[Window::TwentyFourHours])
|
|
.velocity(false)
|
|
.add();
|
|
builder.text_field("title", TextFieldType::Text);
|
|
builder.text_field("description", TextFieldType::Text);
|
|
builder.text_field("category", TextFieldType::Keyword);
|
|
builder.build().unwrap()
|
|
}
|
|
|
|
/// Build a TidalDb with `n` indexed items and wait for the text syncer to
|
|
/// commit all documents.
|
|
///
|
|
/// Items with IDs 0..500 get title "Rust tutorial {i}" (matching corpus).
|
|
/// Items with IDs 500..n get title "Python machine learning {i}" (non-matching).
|
|
///
|
|
/// The text syncer commits every 1000 documents. Writing ≥ 1000 items
|
|
/// guarantees at least one batch commit. A 500ms sleep gives the syncer time
|
|
/// to drain the channel; `reload_text_index()` makes the reader see the
|
|
/// committed documents.
|
|
fn make_db(n: u64) -> TidalDb {
|
|
assert!(n >= 1000, "n must be ≥ 1000 to trigger a batch commit");
|
|
|
|
let db = TidalDb::builder()
|
|
.ephemeral()
|
|
.with_schema(search_schema())
|
|
.open()
|
|
.unwrap();
|
|
|
|
let ts = Timestamp::now();
|
|
for i in 0..n {
|
|
let mut meta = HashMap::new();
|
|
if i < 500 {
|
|
meta.insert("title".to_string(), format!("Rust tutorial {i}"));
|
|
meta.insert(
|
|
"description".to_string(),
|
|
"Learn Rust systems programming.".to_string(),
|
|
);
|
|
meta.insert("category".to_string(), "programming".to_string());
|
|
} else {
|
|
meta.insert("title".to_string(), format!("Python machine learning {i}"));
|
|
meta.insert(
|
|
"description".to_string(),
|
|
"Machine learning with Python.".to_string(),
|
|
);
|
|
meta.insert("category".to_string(), "data-science".to_string());
|
|
}
|
|
db.write_item_with_metadata(EntityId::new(i), &meta)
|
|
.unwrap();
|
|
|
|
// Add view signals to items 0..100 to make profile scoring non-trivial.
|
|
if i < 100 {
|
|
db.signal("view", EntityId::new(i), 1.0, ts).unwrap();
|
|
}
|
|
}
|
|
|
|
// Wait for the background text syncer to drain the channel and commit
|
|
// all documents (syncer commits every 1000 items; 1K items = 1 commit).
|
|
std::thread::sleep(Duration::from_millis(500));
|
|
db.reload_text_index().unwrap();
|
|
|
|
db
|
|
}
|
|
|
|
// ── Step 1: SearchBuilder ────────────────────────────────────────────────────
|
|
|
|
/// A builder with neither query text nor a query vector must refuse to build.
#[test]
fn step1_search_builder_requires_query() {
    assert!(
        Search::builder().build().is_err(),
        "build() without query_text or query_vector must fail"
    );
}
|
|
|
|
/// A text-only builder fills in the documented defaults: limit 20, the
/// `"search"` profile, no filters, and no user.
#[test]
fn step1_search_builder_defaults() {
    let built = Search::builder().query("jazz").build().unwrap();

    assert_eq!(built.limit, 20);
    assert_eq!(built.profile.name, "search");
    assert!(built.filters.is_empty());
    assert!(built.for_user.is_none());
}
|
|
|
|
/// A query vector alone is enough to build a search; the text stays unset
/// and an explicit limit is honoured.
#[test]
fn step1_search_builder_vector_only() {
    let built = Search::builder()
        .vector(vec![0.1_f32; 4])
        .limit(10)
        .build()
        .unwrap();

    assert!(built.query_text.is_none());
    assert!(built.query_vector.is_some());
    assert_eq!(built.limit, 10);
}
|
|
|
|
// ── Step 2: Text search returns results ──────────────────────────────────────
|
|
|
|
/// Searching the 1K-item fixture for "Rust tutorial" yields a non-empty
/// result set that respects the requested limit.
#[test]
fn step2_text_search_returns_results() {
    let db = make_db(1000);
    let search = Search::builder()
        .query("Rust tutorial")
        .limit(20)
        .build()
        .unwrap();

    let hits = db.search(&search).unwrap();

    assert!(
        !hits.is_empty(),
        "search for 'Rust tutorial' should return results"
    );
    assert!(hits.len() <= 20, "search results must not exceed limit");
}
|
|
|
|
// ── Step 3: BM25 scores are present in results ───────────────────────────────
|
|
|
|
/// Every hit of a text-only query carries a BM25 score and no semantic score.
#[test]
fn step3_bm25_scores_populated() {
    let db = make_db(1000);
    let search = Search::builder()
        .query("Rust systems")
        .limit(10)
        .build()
        .unwrap();

    let hits = db.search(&search).unwrap();
    assert!(
        !hits.is_empty(),
        "expected at least one result for 'Rust systems'"
    );

    // Text-only query: BM25 must be present, the vector score must be absent.
    for hit in &hits.items {
        assert!(
            hit.bm25_score.is_some(),
            "bm25_score should be populated for text-only search"
        );
        assert!(
            hit.semantic_score.is_none(),
            "no vector → no semantic_score"
        );
    }
}
|
|
|
|
// ── Step 4: Ranks are 1-based and sequential ─────────────────────────────────
|
|
|
|
/// Result ranks start at 1 and increase by one per position.
#[test]
fn step4_ranks_are_sequential() {
    let db = make_db(1000);
    let search = Search::builder().query("Rust").limit(10).build().unwrap();

    let hits = db.search(&search).unwrap();
    assert!(!hits.is_empty(), "expected results");

    for (i, hit) in hits.items.iter().enumerate() {
        // Positions are 0-based, ranks are 1-based.
        let expected_rank = i + 1;
        assert_eq!(
            hit.rank, expected_rank,
            "rank should be 1-based and sequential at position {i}"
        );
    }
}
|
|
|
|
// ── Step 5: query_text that matches nothing returns empty results ─────────────
|
|
|
|
/// A query term that appears in no indexed document produces an empty
/// result set rather than an error.
#[test]
fn step5_no_matching_query_returns_empty() {
    let db = make_db(1000);
    let search = Search::builder()
        .query("xyzzy123foobarquux")
        .limit(20)
        .build()
        .unwrap();

    let hits = db.search(&search).unwrap();

    assert!(
        hits.is_empty(),
        "non-matching query should return empty results"
    );
}
|
|
|
|
// ── Step 6: search_click is a positive engagement signal ─────────────────────
|
|
|
|
/// `search_click` is accepted as a registered signal and is afterwards
/// readable as a strictly positive decay score.
#[test]
fn step6_search_click_signal_recorded() {
    let db = TidalDb::builder()
        .ephemeral()
        .with_schema(search_schema())
        .open()
        .unwrap();

    let target = EntityId::new(1);
    let now = Timestamp::now();

    // Recording must succeed because the schema registers `search_click`.
    db.signal("search_click", target, 1.0, now).unwrap();

    // Reading it back must yield a present, positive decay score.
    let decayed = db.read_decay_score(target, "search_click", 0).unwrap();
    assert!(
        matches!(decayed, Some(value) if value > 0.0),
        "search_click should produce a positive decay score"
    );
}
|
|
|
|
// ── Step 7: search_click updates preference vector (positive engagement) ──────
|
|
|
|
/// A `search_click` sent with a user context is accepted and recorded for
/// the clicked item.
///
/// NOTE(review): despite the test name, only the recorded signal is
/// asserted here; the user's preference vector is not inspected.
#[test]
fn step7_search_click_updates_preference_vector() {
    let db = TidalDb::builder()
        .ephemeral()
        .with_schema(search_schema())
        .open()
        .unwrap();

    let user_id = 99_u64;
    let clicked = EntityId::new(42);
    let now = Timestamp::now();

    // The item carries a creator so there is preference state to update.
    let meta: HashMap<String, String> = HashMap::from([
        ("title".to_string(), "Rust embedded".to_string()),
        ("creator_id".to_string(), "1".to_string()),
    ]);
    db.write_item_with_metadata(clicked, &meta).unwrap();

    // Supplying a user is what triggers the preference vector update.
    db.signal_with_context("search_click", clicked, 1.0, now, Some(user_id), None)
        .unwrap();

    // The click itself must be visible as a decay score.
    let decayed = db.read_decay_score(clicked, "search_click", 0).unwrap();
    assert!(decayed.is_some(), "search_click signal should be recorded");
}
|
|
|
|
// ── Step 8: Latency target < 50ms at 1K items ────────────────────────────────
|
|
|
|
/// A single text search over the 1K-item fixture finishes in under 50ms.
///
/// This is a wall-clock measurement of one un-warmed call, so it is
/// sensitive to machine load.
#[test]
fn step8_search_latency_under_50ms() {
    let db = make_db(1000);
    let search = Search::builder()
        .query("Rust tutorial")
        .limit(20)
        .build()
        .unwrap();

    // Time only the search call; fixture setup is excluded.
    let started = std::time::Instant::now();
    let _hits = db.search(&search).unwrap();
    let elapsed_ms = started.elapsed().as_millis();

    assert!(
        elapsed_ms < 50,
        "search at 1K items should complete in < 50ms, got {}ms",
        elapsed_ms
    );
}
|
|
|
|
// ── Step 9: search with for_user doesn't panic ───────────────────────────────
|
|
|
|
/// A search scoped to a user with prior `view` history executes and
/// returns results.
#[test]
fn step9_personalized_search_executes() {
    let db = make_db(1000);
    let user_id = 7_u64;
    let now = Timestamp::now();

    // Seed the user with views on items 0..10 so personalization has data.
    for raw_id in 0u64..10 {
        let viewed = EntityId::new(raw_id);
        db.signal_with_context("view", viewed, 1.0, now, Some(user_id), None)
            .unwrap();
    }

    let search = Search::builder()
        .query("Rust")
        .for_user(user_id)
        .limit(20)
        .build()
        .unwrap();

    let hits = db.search(&search).unwrap();

    assert!(
        !hits.is_empty(),
        "personalized search should return results"
    );
}
|
|
|
|
// ── Step 10: search builtin profile is registered ────────────────────────────
|
|
|
|
/// The default `"search"` profile is available on a freshly opened database.
///
/// The query runs against an empty database, so an empty result set is
/// acceptable; the call must simply not fail with a missing-profile error.
#[test]
fn step10_search_profile_registered() {
    let db = TidalDb::builder()
        .ephemeral()
        .with_schema(search_schema())
        .open()
        .unwrap();

    // No explicit profile, so the builder falls back to the default "search".
    let search = Search::builder()
        .query("anything")
        .limit(1)
        .build()
        .unwrap();

    // Nothing was written or flushed, so only success matters, not the hits.
    let outcome = db.search(&search);
    assert!(
        outcome.is_ok(),
        "search with default 'search' profile must not fail: {:?}",
        outcome.err()
    );
}
|