tidaldb/tidal/tests/m5_uat.rs
jordan 192c473f55 feat: complete Milestone 5 — full-text search, RRF fusion, and creator search
- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs)
- M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates)
- M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking)
- M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators)
- Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.)
- Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.)
- Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers
- Add benches: fusion, search, session, text_index
- Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index)
- Update blog posts, roadmap, content strategy, and M5 planning docs
- Add tmp/ and .claude/worktrees/ to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 23:53:16 -07:00

355 lines
9.9 KiB
Rust

#![allow(clippy::unwrap_used)]
//! Milestone 5 UAT: Hybrid Search
//!
//! Proves that text + semantic + signal-ranked search works in one query.
//! Exercises all 8 UAT steps from the ROADMAP M5 UAT scenario.
//! Uses 200 items and 50 creators to keep test time under 30s.
use std::collections::HashMap;
use std::time::Duration;
use tidaldb::TidalDb;
use tidaldb::query::search::Search;
use tidaldb::schema::{
DecaySpec, EntityId, EntityKind, SchemaBuilder, TextFieldType, Timestamp, Window,
};
/// Builds the schema shared by every test in this file.
///
/// Registers three exponentially decaying signals (`view` and `like` on
/// items, `follow` on creators), each with a single 24-hour window and
/// `velocity(false)`, then declares the item text fields (`title`,
/// `description`) and the creator text fields (`name`, `handle`, `language`).
///
/// # Panics
/// Panics if `SchemaBuilder::build` returns an error.
fn build_schema() -> tidaldb::schema::Schema {
    const SECS_PER_DAY: u64 = 24 * 3600;

    let mut schema = SchemaBuilder::new();

    // (signal name, entity kind, half-life in days)
    let signals = [
        ("view", EntityKind::Item, 7),
        ("like", EntityKind::Item, 14),
        ("follow", EntityKind::Creator, 30),
    ];
    for (name, kind, half_life_days) in signals {
        let decay = DecaySpec::Exponential {
            half_life: Duration::from_secs(half_life_days * SECS_PER_DAY),
        };
        let _ = schema
            .signal(name, kind, decay)
            .windows(&[Window::TwentyFourHours])
            .velocity(false)
            .add();
    }

    // Item-side text fields.
    for field in ["title", "description"] {
        schema.text_field(field, TextFieldType::Text);
    }

    // Creator-side text fields; `language` is the only keyword-typed one.
    for field in ["name", "handle"] {
        schema.creator_text_field(field, TextFieldType::Text);
    }
    schema.creator_text_field("language", TextFieldType::Keyword);

    schema.build().unwrap()
}
/// Opens an ephemeral database with the UAT schema and seeds it with fixtures.
///
/// Fixture layout:
/// - Items 1..=200, each with `title`, `description` and `creator_id`
///   metadata plus a 4-dimensional embedding. Items 1..=100 are
///   "Rust tutorial" items, items 101..=200 are "Jazz piano" items.
/// - Creators 1..=50, each with `name`, `handle`, `language` and `verified`
///   metadata plus a 4-dimensional embedding. Creators 1..=25 are jazz
///   creators, creators 26..=50 are rock creators.
///
/// Both text indexes are flushed before the handle is returned.
///
/// # Panics
/// Panics if opening the database, any write, or either flush fails.
fn open_uat_db() -> TidalDb {
    let db = TidalDb::builder()
        .ephemeral()
        .with_schema(build_schema())
        .open()
        .unwrap();

    // Items: the zero-based index drives the title suffix and creator
    // assignment, while the stored entity id is one-based.
    for idx in 0u64..200 {
        let is_rust = idx < 100;
        let title = if is_rust {
            format!("Rust tutorial beginner {idx}")
        } else {
            format!("Jazz piano lesson {idx}")
        };
        let description = if is_rust {
            "Learn Rust programming from scratch"
        } else {
            "Master jazz piano techniques"
        };
        let meta = HashMap::from([
            ("title".to_string(), title),
            ("description".to_string(), description.to_string()),
            ("creator_id".to_string(), (idx % 50 + 1).to_string()),
        ]);
        db.write_item_with_metadata(EntityId::new(idx + 1), &meta)
            .unwrap();

        // One axis per topic: axis 0 for rust items, axis 1 for jazz items.
        let embedding: Vec<f32> = if is_rust {
            vec![1.0, 0.0, 0.0, 0.0]
        } else {
            vec![0.0, 1.0, 0.0, 0.0]
        };
        db.write_item_embedding(EntityId::new(idx + 1), &embedding)
            .unwrap();
    }

    // Creators: same zero-based index / one-based id convention as items.
    for idx in 0u64..50 {
        let is_jazz = idx < 25;
        let (name, handle) = if is_jazz {
            (format!("Jazz Creator {idx}"), format!("jazz_{idx}"))
        } else {
            (format!("Rock Creator {idx}"), format!("rock_{idx}"))
        };
        let meta = HashMap::from([
            ("name".to_string(), name),
            ("handle".to_string(), handle),
            ("language".to_string(), "en".to_string()),
            ("verified".to_string(), (idx % 2 == 0).to_string()),
        ]);
        db.write_creator(EntityId::new(idx + 1), &meta).unwrap();

        // Jazz creators share axis 1 and vary along axis 2; rock creators
        // all sit on axis 3.
        let embedding: Vec<f32> = if is_jazz {
            vec![0.0, 1.0, (idx as f32) * 0.1, 0.0]
        } else {
            vec![0.0, 0.0, 0.0, 1.0]
        };
        db.write_creator_embedding(EntityId::new(idx + 1), &embedding)
            .unwrap();
    }

    // Flush both text indexes before handing the database to a test.
    db.flush_text_index().unwrap();
    db.flush_creator_text_index().unwrap();
    db
}
// -- UAT Steps ---------------------------------------------------------------
/// Step 1: Hybrid search (text + vector) returns results.
///
/// Sends one query carrying both a text string and a query vector, then
/// checks that hits come back, that a BM25 score and a semantic score each
/// appear on at least one hit, and that hits are ordered by non-increasing
/// score.
#[test]
fn step1_hybrid_search_returns_results() {
    let db = open_uat_db();

    // The vector matches the embedding written for the "Rust tutorial" items.
    let request = Search::builder()
        .query("rust tutorial")
        .vector(vec![1.0f32, 0.0, 0.0, 0.0])
        .limit(20)
        .build()
        .unwrap();
    let results = db.search(&request).unwrap();
    let hits = &results.items;

    assert!(!results.is_empty(), "Hybrid search should return results");
    assert!(
        hits.iter().any(|hit| hit.bm25_score.is_some()),
        "At least one result should have BM25 score"
    );
    assert!(
        hits.iter().any(|hit| hit.semantic_score.is_some()),
        "At least one result should have semantic score"
    );
    // Compare every hit with its successor.
    assert!(
        hits.iter()
            .zip(hits.iter().skip(1))
            .all(|(earlier, later)| earlier.score >= later.score),
        "Results should be in descending score order"
    );
}
/// Step 2: Text-only search (no vector) returns BM25-only results.
///
/// Queries for "jazz piano" without a vector and checks that every hit
/// carries a BM25 score and none carries a semantic score.
#[test]
fn step2_text_only_search() {
    let db = open_uat_db();

    let request = Search::builder()
        .query("jazz piano")
        .limit(20)
        .build()
        .unwrap();
    let results = db.search(&request).unwrap();

    assert!(
        !results.is_empty(),
        "Text search for 'jazz piano' should return results"
    );

    let hits = &results.items;
    assert!(
        !hits.iter().any(|hit| hit.bm25_score.is_none()),
        "Text-only results should have BM25 scores"
    );
    assert!(
        !hits.iter().any(|hit| hit.semantic_score.is_some()),
        "Text-only results should have no semantic score"
    );
}
/// Step 3: Exact phrase match.
///
/// Searches for the quoted phrase `"Rust tutorial"`. The fixture titles for
/// items 1..=100 all begin with "Rust tutorial", so the query must produce
/// hits, and it must not return more hits than the requested limit.
#[test]
fn step3_exact_phrase_match() {
    let db = open_uat_db();
    let results = db
        .search(
            &Search::builder()
                .query("\"Rust tutorial\"")
                .limit(10)
                .build()
                .unwrap(),
        )
        .unwrap();
    // The phrase is present in the seeded data, so an empty result set is a
    // failure rather than something to tolerate silently.
    assert!(
        !results.is_empty(),
        "Exact phrase search for \"Rust tutorial\" should return results"
    );
    assert!(
        results.items.len() <= 10,
        "Exact phrase search should return at most `limit` (10) results"
    );
}
/// Step 4: Boolean exclusion removes matching items.
///
/// Searches for `rust -jazz`. The rust fixture items (ids 1..=100) never
/// mention jazz, so they must be returned; the jazz fixture items
/// (ids 101..=200) contain "jazz" in both title and description, so none of
/// them may appear.
#[test]
fn step4_boolean_exclusion() {
    let db = open_uat_db();
    let results = db
        .search(
            &Search::builder()
                .query("rust -jazz")
                .limit(20)
                .build()
                .unwrap(),
        )
        .unwrap();
    assert!(
        !results.is_empty(),
        "Search for 'rust -jazz' should return the rust items"
    );
    // Only `!=` is used on `EntityId`, matching what the rest of this file
    // relies on; ids 101..=200 are the seeded jazz items.
    assert!(
        results.items.iter().all(|r| {
            (101u64..=200).all(|jazz_id| r.entity_id != EntityId::new(jazz_id))
        }),
        "Excluded term 'jazz' should remove every jazz item from the results"
    );
}
/// Step 5: Creator text search returns creators.
///
/// Runs a text query for "jazz" against creators and checks that hits come
/// back and that at least one of them carries a BM25 score.
#[test]
fn step5_creator_text_search() {
    let db = open_uat_db();

    let request = Search::builder()
        .entity_kind(EntityKind::Creator)
        .query("jazz")
        .limit(10)
        .build()
        .unwrap();
    let results = db.search(&request).unwrap();

    assert!(
        !results.is_empty(),
        "Creator search for 'jazz' should return results"
    );

    let has_bm25 = results.items.iter().any(|hit| hit.bm25_score.is_some());
    assert!(has_bm25, "Creator search results should have BM25 scores");
}
/// Step 6: Creator similar_to returns ANN results.
///
/// Uses creator 1 (a jazz creator in the fixture data) as the `similar_to`
/// source and checks that hits come back, that the source itself is not
/// among them, and that at least one hit carries a semantic score.
#[test]
fn step6_creator_similar_to() {
    // Raw id of the source creator; an `EntityId` is built from it at each
    // use site.
    const SOURCE_CREATOR: u64 = 1;

    let db = open_uat_db();

    let request = Search::builder()
        .entity_kind(EntityKind::Creator)
        .similar_to(EntityId::new(SOURCE_CREATOR))
        .limit(5)
        .build()
        .unwrap();
    let results = db.search(&request).unwrap();

    assert!(
        !results.is_empty(),
        "similar_to search should return results"
    );

    let hits = &results.items;
    // The source must be excluded from its own neighbour list.
    assert!(
        hits.iter()
            .all(|hit| hit.entity_id != EntityId::new(SOURCE_CREATOR)),
        "Source entity should not appear in similar_to results"
    );
    assert!(
        hits.iter().any(|hit| hit.semantic_score.is_some()),
        "similar_to results should have semantic scores"
    );
}
/// Step 7: search_click signal records successfully.
///
/// Sends a `search_click` signal for item 1. `build_schema` in this file
/// registers only `view`, `like` and `follow`, so the outcome of the call is
/// deliberately ignored: the test only requires that the call returns
/// instead of panicking.
#[test]
fn step7_search_click_signal() {
    let db = open_uat_db();

    let clicked_item = EntityId::new(1);
    let clicked_at = Timestamp::now();

    // Bound to an underscore-prefixed name so the value is intentionally
    // unused.
    let _outcome = db.signal("search_click", clicked_item, 1.0, clicked_at);
}
/// Step 8: Re-search after signal write works (no crash or regression).
///
/// Runs a text search, writes a `view` signal for item 1, then repeats the
/// same search and checks that it still returns results.
#[test]
fn step8_search_after_signal_write() {
    let db = open_uat_db();
    // Warm up search.
    let q = Search::builder()
        .query("rust tutorial")
        .limit(10)
        .build()
        .unwrap();
    let _ = db.search(&q).unwrap();
    // Write a signal. `view` is registered by `build_schema`, so a failed
    // write is a real error; unwrap it so the second search is guaranteed to
    // run after a write that actually happened.
    db.signal("view", EntityId::new(1), 1.0, Timestamp::now())
        .unwrap();
    // Re-search should still work.
    let results = db.search(&q).unwrap();
    assert!(
        !results.is_empty(),
        "Re-search after signal write should return results"
    );
}
/// Performance: hybrid search < 50ms at 200 items.
///
/// Runs three untimed warm-up searches, then times ten further searches and
/// asserts that their mean latency is below 50 ms.
#[test]
fn perf_hybrid_search_under_50ms() {
    use std::time::Instant;

    const WARMUP_RUNS: u32 = 3;
    const TIMED_RUNS: u32 = 10;

    let db = open_uat_db();
    let q = Search::builder()
        .query("rust tutorial")
        .vector(vec![1.0f32, 0.0, 0.0, 0.0])
        .limit(20)
        .build()
        .unwrap();

    // Untimed warm-up passes.
    for _ in 0..WARMUP_RUNS {
        let _ = db.search(&q).unwrap();
    }

    // Each timed sample covers one search call, including the drop of its
    // result.
    let total: Duration = (0..TIMED_RUNS)
        .map(|_| {
            let started = Instant::now();
            let _ = db.search(&q).unwrap();
            started.elapsed()
        })
        .sum();

    let avg = total / TIMED_RUNS;
    let budget = Duration::from_millis(50);
    assert!(
        avg < budget,
        "Average hybrid search latency {avg:?} exceeds 50ms target"
    );
}