- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs) - M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates) - M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking) - M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators) - Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.) - Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.) - Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers - Add benches: fusion, search, session, text_index - Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index) - Update blog posts, roadmap, content strategy, and M5 planning docs - Add tmp/ and .claude/worktrees/ to .gitignore Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
355 lines
9.9 KiB
Rust
355 lines
9.9 KiB
Rust
#![allow(clippy::unwrap_used)]
|
|
//! Milestone 5 UAT: Hybrid Search
|
|
//!
|
|
//! Proves that text + semantic + signal-ranked search works in one query.
|
|
//! Exercises all 8 UAT steps from the ROADMAP M5 UAT scenario.
|
|
//! Uses 200 items and 50 creators to keep test time under 30s.
|
|
|
|
use std::collections::HashMap;
|
|
use std::time::Duration;
|
|
|
|
use tidaldb::TidalDb;
|
|
use tidaldb::query::search::Search;
|
|
use tidaldb::schema::{
|
|
DecaySpec, EntityId, EntityKind, SchemaBuilder, TextFieldType, Timestamp, Window,
|
|
};
|
|
|
|
fn build_schema() -> tidaldb::schema::Schema {
|
|
let mut builder = SchemaBuilder::new();
|
|
let _ = builder
|
|
.signal(
|
|
"view",
|
|
EntityKind::Item,
|
|
DecaySpec::Exponential {
|
|
half_life: Duration::from_secs(7 * 24 * 3600),
|
|
},
|
|
)
|
|
.windows(&[Window::TwentyFourHours])
|
|
.velocity(false)
|
|
.add();
|
|
let _ = builder
|
|
.signal(
|
|
"like",
|
|
EntityKind::Item,
|
|
DecaySpec::Exponential {
|
|
half_life: Duration::from_secs(14 * 24 * 3600),
|
|
},
|
|
)
|
|
.windows(&[Window::TwentyFourHours])
|
|
.velocity(false)
|
|
.add();
|
|
let _ = builder
|
|
.signal(
|
|
"follow",
|
|
EntityKind::Creator,
|
|
DecaySpec::Exponential {
|
|
half_life: Duration::from_secs(30 * 24 * 3600),
|
|
},
|
|
)
|
|
.windows(&[Window::TwentyFourHours])
|
|
.velocity(false)
|
|
.add();
|
|
builder.text_field("title", TextFieldType::Text);
|
|
builder.text_field("description", TextFieldType::Text);
|
|
builder.creator_text_field("name", TextFieldType::Text);
|
|
builder.creator_text_field("handle", TextFieldType::Text);
|
|
builder.creator_text_field("language", TextFieldType::Keyword);
|
|
builder.build().unwrap()
|
|
}
|
|
|
|
fn open_uat_db() -> TidalDb {
|
|
let db = TidalDb::builder()
|
|
.ephemeral()
|
|
.with_schema(build_schema())
|
|
.open()
|
|
.unwrap();
|
|
|
|
// Write 200 items: first 100 are "rust tutorial" items, last 100 are "jazz piano" items.
|
|
for i in 0u64..200 {
|
|
let mut meta = HashMap::new();
|
|
let (title, description) = if i < 100 {
|
|
(
|
|
format!("Rust tutorial beginner {i}"),
|
|
"Learn Rust programming from scratch".to_string(),
|
|
)
|
|
} else {
|
|
(
|
|
format!("Jazz piano lesson {i}"),
|
|
"Master jazz piano techniques".to_string(),
|
|
)
|
|
};
|
|
meta.insert("title".to_string(), title);
|
|
meta.insert("description".to_string(), description);
|
|
meta.insert("creator_id".to_string(), (i % 50 + 1).to_string());
|
|
db.write_item_with_metadata(EntityId::new(i + 1), &meta)
|
|
.unwrap();
|
|
|
|
// Write a simple 4-dim embedding per item.
|
|
let emb: Vec<f32> = if i < 100 {
|
|
vec![1.0, 0.0, 0.0, 0.0] // "rust" quadrant
|
|
} else {
|
|
vec![0.0, 1.0, 0.0, 0.0] // "jazz" quadrant
|
|
};
|
|
db.write_item_embedding(EntityId::new(i + 1), &emb).unwrap();
|
|
}
|
|
|
|
// Write 50 creators: first 25 are jazz creators, last 25 are rock creators.
|
|
for c in 0u64..50 {
|
|
let mut meta = HashMap::new();
|
|
let (name, handle) = if c < 25 {
|
|
(format!("Jazz Creator {c}"), format!("jazz_{c}"))
|
|
} else {
|
|
(format!("Rock Creator {c}"), format!("rock_{c}"))
|
|
};
|
|
meta.insert("name".to_string(), name);
|
|
meta.insert("handle".to_string(), handle);
|
|
meta.insert("language".to_string(), "en".to_string());
|
|
meta.insert("verified".to_string(), (c % 2 == 0).to_string());
|
|
db.write_creator(EntityId::new(c + 1), &meta).unwrap();
|
|
|
|
// Write a 4-dim creator embedding.
|
|
let emb: Vec<f32> = if c < 25 {
|
|
vec![0.0, 1.0, (c as f32) * 0.1, 0.0]
|
|
} else {
|
|
vec![0.0, 0.0, 0.0, 1.0]
|
|
};
|
|
db.write_creator_embedding(EntityId::new(c + 1), &emb)
|
|
.unwrap();
|
|
}
|
|
|
|
// Synchronous flush: drain pending writes and reload readers.
|
|
db.flush_text_index().unwrap();
|
|
db.flush_creator_text_index().unwrap();
|
|
|
|
db
|
|
}
|
|
|
|
// -- UAT Steps ---------------------------------------------------------------
|
|
|
|
/// Step 1: a query carrying both text and a vector returns ranked results.
///
/// Checks that the result set is non-empty, that at least one hit carries a
/// BM25 score and at least one carries a semantic score, and that hits are
/// ordered by non-increasing `score`.
#[test]
fn step1_hybrid_search_returns_results() {
    let db = open_uat_db();

    let query_vec = vec![1.0f32, 0.0, 0.0, 0.0]; // "rust" direction
    let request = Search::builder()
        .query("rust tutorial")
        .vector(query_vec)
        .limit(20)
        .build()
        .unwrap();
    let results = db.search(&request).unwrap();

    assert!(!results.is_empty(), "Hybrid search should return results");

    let hits = &results.items;
    assert!(
        hits.iter().any(|hit| hit.bm25_score.is_some()),
        "At least one result should have BM25 score"
    );
    assert!(
        hits.iter().any(|hit| hit.semantic_score.is_some()),
        "At least one result should have semantic score"
    );
    // Every hit must score at least as high as the one that follows it.
    assert!(
        hits.iter()
            .zip(hits.iter().skip(1))
            .all(|(current, next)| current.score >= next.score),
        "Results should be in descending score order"
    );
}
|
|
|
|
/// Step 2: a query with text but no vector yields BM25-only results.
///
/// Every hit must carry a BM25 score and none may carry a semantic score.
#[test]
fn step2_text_only_search() {
    let db = open_uat_db();

    let request = Search::builder()
        .query("jazz piano")
        .limit(20)
        .build()
        .unwrap();
    let results = db.search(&request).unwrap();

    assert!(
        !results.is_empty(),
        "Text search for 'jazz piano' should return results"
    );

    let hits = &results.items;
    assert!(
        hits.iter().all(|hit| hit.bm25_score.is_some()),
        "Text-only results should have BM25 scores"
    );
    assert!(
        hits.iter().all(|hit| hit.semantic_score.is_none()),
        "Text-only results should have no semantic score"
    );
}
|
|
|
|
/// Step 3: a quoted (exact phrase) query is accepted by the search API.
///
/// This is a smoke test: it only requires that building and running the
/// query succeeds. The phrase does occur in the fixture data, but the
/// contents of the result set are intentionally not asserted.
#[test]
fn step3_exact_phrase_match() {
    let db = open_uat_db();

    let request = Search::builder()
        .query("\"Rust tutorial\"")
        .limit(10)
        .build()
        .unwrap();

    // Only success of the call matters; the hits themselves are unused.
    let _results = db.search(&request).unwrap();
}
|
|
|
|
/// Step 4: Boolean exclusion removes matching items.
///
/// Runs `rust -jazz` against the fixture and verifies both halves of the
/// expectation: the query still returns hits, and none of those hits is one
/// of the jazz items the `-jazz` clause is meant to exclude.
#[test]
fn step4_boolean_exclusion() {
    let db = open_uat_db();

    let results = db
        .search(
            &Search::builder()
                .query("rust -jazz")
                .limit(20)
                .build()
                .unwrap(),
        )
        .unwrap();

    // The rust half of the fixture matches "rust", so hits must exist.
    assert!(
        !results.is_empty(),
        "Search for 'rust -jazz' should still return rust items"
    );

    // `open_uat_db` writes the jazz items under ids 101..=200 (loop index
    // 100..200, id = index + 1); none of them may survive the exclusion.
    assert!(
        results
            .items
            .iter()
            .all(|r| (101u64..=200).all(|jazz_id| r.entity_id != EntityId::new(jazz_id))),
        "Excluded 'jazz' items should not appear in 'rust -jazz' results"
    );
}
|
|
|
|
/// Step 5: a text query scoped to creators returns creator hits.
///
/// Requires a non-empty result set in which at least one hit carries a BM25
/// score.
#[test]
fn step5_creator_text_search() {
    let db = open_uat_db();

    let request = Search::builder()
        .entity_kind(EntityKind::Creator)
        .query("jazz")
        .limit(10)
        .build()
        .unwrap();
    let results = db.search(&request).unwrap();

    assert!(
        !results.is_empty(),
        "Creator search for 'jazz' should return results"
    );

    let any_text_scored = results.items.iter().any(|hit| hit.bm25_score.is_some());
    assert!(
        any_text_scored,
        "Creator search results should have BM25 scores"
    );
}
|
|
|
|
/// Step 6: `similar_to` on a creator returns vector-based neighbours.
///
/// Uses creator 1 (a jazz creator in the fixture) as the source. The result
/// set must be non-empty, must not contain the source creator itself, and at
/// least one hit must carry a semantic score.
#[test]
fn step6_creator_similar_to() {
    let db = open_uat_db();

    // Creator 1 is a jazz creator. similar_to should return other jazz creators.
    let request = Search::builder()
        .entity_kind(EntityKind::Creator)
        .similar_to(EntityId::new(1))
        .limit(5)
        .build()
        .unwrap();
    let results = db.search(&request).unwrap();

    assert!(
        !results.is_empty(),
        "similar_to search should return results"
    );

    // The creator we searched from must be filtered out of its own neighbours.
    let source = EntityId::new(1);
    assert!(
        results.items.iter().all(|hit| hit.entity_id != source),
        "Source entity should not appear in similar_to results"
    );

    let any_semantic = results
        .items
        .iter()
        .any(|hit| hit.semantic_score.is_some());
    assert!(
        any_semantic,
        "similar_to results should have semantic scores"
    );
}
|
|
|
|
/// Step 7: writing a `search_click` signal does not panic.
///
/// The schema built by this file does not register `search_click`, so the
/// call's outcome is deliberately ignored: the test only requires that the
/// call returns instead of panicking.
#[test]
fn step7_search_click_signal() {
    let db = open_uat_db();

    // Record a search click on item 1; success or failure is equally acceptable.
    let _outcome = db.signal("search_click", EntityId::new(1), 1.0, Timestamp::now());
}
|
|
|
|
/// Step 8: searching keeps working after a signal write.
///
/// Runs the same text query before and after writing a `view` signal for
/// item 1 and requires the second run to return results.
#[test]
fn step8_search_after_signal_write() {
    let db = open_uat_db();

    let request = Search::builder()
        .query("rust tutorial")
        .limit(10)
        .build()
        .unwrap();

    // First run warms the search path; its hits are not inspected.
    drop(db.search(&request).unwrap());

    // Write a signal between the two searches; its outcome is not checked.
    let _ = db.signal("view", EntityId::new(1), 1.0, Timestamp::now());

    // The identical query must still succeed and find something.
    let after_write = db.search(&request).unwrap();
    assert!(
        !after_write.is_empty(),
        "Re-search after signal write should return results"
    );
}
|
|
|
|
/// Performance: average hybrid search latency stays below 50ms on the
/// 200-item fixture.
///
/// Runs three untimed warm-up searches, then times ten searches
/// individually and asserts on their mean wall-clock duration.
#[test]
fn perf_hybrid_search_under_50ms() {
    let db = open_uat_db();

    let request = Search::builder()
        .query("rust tutorial")
        .vector(vec![1.0f32, 0.0, 0.0, 0.0])
        .limit(20)
        .build()
        .unwrap();

    // Warm up: three untimed runs before measuring.
    (0..3).for_each(|_| {
        let _ = db.search(&request).unwrap();
    });

    // Time each measured run on its own and add the durations up.
    let total: Duration = (0..10)
        .map(|_| {
            let started = std::time::Instant::now();
            let _ = db.search(&request).unwrap();
            started.elapsed()
        })
        .sum();

    let avg = total / 10;
    assert!(
        avg < Duration::from_millis(50),
        "Average hybrid search latency {avg:?} exceeds 50ms target"
    );
}
|