- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs) - M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates) - M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking) - M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators) - Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.) - Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.) - Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers - Add benches: fusion, search, session, text_index - Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index) - Update blog posts, roadmap, content strategy, and M5 planning docs - Add tmp/ and .claude/worktrees/ to .gitignore Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
267 lines
9.4 KiB
Markdown
267 lines
9.4 KiB
Markdown
# Task 05: Boolean Query Parsing
|
|
|
|
## Delivers
|
|
|
|
`TextQueryParser` — a wrapper over Tantivy's `QueryParser` with custom syntax extensions. Handles: AND/OR/NOT operators, exact phrase (`"..."`), field-scoped (`title:jazz`, `tag:tutorial`), exclusion (`-beginner`), wildcard prefix (`pian*`), hashtag (`#jazz`).
|
|
|
|
## Complexity: M
|
|
|
|
## Dependencies
|
|
|
|
- Task 01 complete: `TextIndex`, `TantivyFields` with field names
|
|
- Task 02 complete: documents indexed
|
|
|
|
## Technical Design
|
|
|
|
Tantivy's built-in `QueryParser` already handles most of the required syntax. tidalDB's `TextQueryParser` wraps it and adds:
|
|
1. Pre-processing of `#jazz` → `jazz` (hashtag syntax → bare term)
|
|
2. Pre-processing of `creator:handle` → field-scoped query on creator field
|
|
3. Validation and error messages appropriate for tidalDB's API
|
|
|
|
From `docs/research/tantivy.md`:
|
|
> QueryParser handles: bare terms, exact phrase, boolean AND/OR/NOT, field-scoped, exclusion (-term), wildcard prefix (pian*)
|
|
|
|
```rust
|
|
// tidal/src/text/query.rs
|
|
|
|
use tantivy::query::Query;
|
|
use tantivy::schema::Field;
|
|
use crate::schema::TextFieldDef;
|
|
use crate::TidalError;
|
|
|
|
/// Parser for text search queries.
|
|
///
|
|
/// Wraps Tantivy's QueryParser with tidalDB-specific syntax extensions:
|
|
/// - `#jazz` → bare term `jazz` (hashtag pre-processing)
|
|
/// - `creator:handle` → field-scoped query if `creator` field is declared
|
|
/// - All other Tantivy query syntax passes through unchanged
|
|
pub struct TextQueryParser {
|
|
inner: tantivy::query::QueryParser,
|
|
default_fields: Vec<Field>,
|
|
}
|
|
|
|
impl TextQueryParser {
|
|
/// Create a parser that searches across all `Text`-type declared fields by default.
|
|
///
|
|
/// `Keyword` fields require explicit field scoping (`field:value`).
|
|
pub fn new(
|
|
index: &tantivy::Index,
|
|
text_fields: &[(String, Field, crate::schema::TextFieldType)],
|
|
) -> Self {
|
|
use crate::schema::TextFieldType;
|
|
|
|
// Default search fields are Text-type only (tokenized)
|
|
let default_fields: Vec<Field> = text_fields
|
|
.iter()
|
|
.filter(|(_, _, ft)| *ft == TextFieldType::Text)
|
|
.map(|(_, f, _)| *f)
|
|
.collect();
|
|
|
|
let inner = tantivy::query::QueryParser::for_index(index, default_fields.clone());
|
|
Self { inner, default_fields }
|
|
}
|
|
|
|
/// Parse a query string into a Tantivy `Query`.
|
|
///
|
|
/// Applies tidalDB pre-processing before passing to Tantivy's parser.
|
|
///
|
|
/// # Errors
|
|
/// Returns `TidalError::Query` if the query string is syntactically invalid.
|
|
pub fn parse(&self, query_str: &str) -> crate::Result<Box<dyn Query>> {
|
|
let preprocessed = preprocess_query(query_str);
|
|
self.inner
|
|
.parse_query(&preprocessed)
|
|
.map_err(|e| TidalError::Query(crate::query::retrieve::QueryError::ParseError(
|
|
format!("text query parse error: {e}")
|
|
)))
|
|
}
|
|
}
|
|
|
|
/// Pre-process a tidalDB query string before passing to Tantivy's QueryParser.
///
/// Transformations:
/// - `#jazz` → `jazz` (hashtag syntax)
/// - Other syntax passes through to Tantivy's parser
///
/// A `#` is treated as a hashtag marker only when it starts a token (it is the
/// first character, or the preceding character is not alphanumeric) and is
/// immediately followed by an alphanumeric character. A `#` in the middle of a
/// word (`c#2`), at the end of a word (`jazz#`), or standing alone (`# jazz`)
/// is kept, so the surrounding term is never silently rewritten.
fn preprocess_query(query: &str) -> String {
    let mut result = String::with_capacity(query.len());
    let mut chars = query.chars().peekable();
    // Previous input character; `None` at the start of the query.
    let mut prev: Option<char> = None;

    while let Some(ch) = chars.next() {
        let at_token_start = !matches!(prev, Some(p) if p.is_alphanumeric());
        let before_word = matches!(chars.peek(), Some(next) if next.is_alphanumeric());

        // Drop only a `#` that is a genuine hashtag prefix; keep everything else.
        if !(ch == '#' && at_token_start && before_word) {
            result.push(ch);
        }
        prev = Some(ch);
    }

    result
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashtag prefixes are dropped; input without hashtags is untouched.
    #[test]
    fn preprocess_removes_hashtag() {
        let cases = [
            ("#jazz", "jazz"),
            ("#jazz #piano", "jazz piano"),
            ("jazz #piano", "jazz piano"),
            ("no-hashtag", "no-hashtag"),
        ];
        for (input, expected) in cases {
            assert_eq!(preprocess_query(input), expected);
        }
    }

    #[test]
    fn parse_bare_terms() {
        // "jazz piano" → both terms joined by the parser's default operator.
        // Tantivy's stock default is OR; the acceptance criteria for this task
        // expect AND (conjunction by default).
    }

    #[test]
    fn parse_exact_phrase() {
        // "\"jazz piano\"" → PhraseQuery
    }

    #[test]
    fn parse_boolean_and() {
        // "jazz AND piano" → BooleanQuery with must clauses
    }

    #[test]
    fn parse_boolean_not() {
        // "jazz -beginner" or "jazz NOT beginner" → excludes beginner
    }

    #[test]
    fn parse_field_scoped() {
        // "title:jazz" → scopes query to title field
    }

    #[test]
    fn parse_wildcard_prefix() {
        // "pian*" → PrefixQuery matching piano, pianist, etc.
    }

    #[test]
    fn parse_hashtag() {
        // "#jazz" → same result as "jazz"
    }
}
|
|
```
|
|
|
|
### Integration with TextIndex
|
|
|
|
Add `TextIndex::query_parser()` method:
|
|
|
|
```rust
|
|
impl TextIndex {
|
|
pub fn query_parser(&self) -> TextQueryParser {
|
|
TextQueryParser::new(&self.index, &self.fields.text_fields)
|
|
}
|
|
}
|
|
```
|
|
|
|
### Wildcard prefix note
|
|
|
|
Tantivy's `QueryParser` supports wildcard prefix queries (`pian*`) when the field uses `Indexing::positions()`. By default `TEXT` fields include positions — so prefix queries work out of the box.
|
|
|
|
However, Tantivy disables regex and leading-wildcard queries (`*jazz`) by default for performance. tidalDB only needs trailing wildcards (`pian*`), which Tantivy handles via `PrefixQuery`.
|
|
|
|
Enabling fuzzy queries is deferred to M6. For M5, exact, phrase, boolean, field-scoped, and prefix queries are sufficient.
|
|
|
|
### Boolean operator note
|
|
|
|
Tantivy's `QueryParser` combines bare terms with `OR` by default. Most users expect `AND` for multi-word queries like "rust tutorial"; to make `AND` the default operator, call:
|
|
|
|
```rust
|
|
inner.set_conjunction_by_default();
|
|
```
|
|
|
|
This makes `"rust tutorial"` behave as `rust AND tutorial` rather than `rust OR tutorial`, which produces more precise results. tidalDB should enable conjunction by default.
|
|
|
|
## Acceptance Criteria
|
|
|
|
- [ ] `TextQueryParser` struct with `new(index, text_fields)` and `parse(query_str)` methods
|
|
- [ ] Default search fields are `Text`-type only (not `Keyword`)
|
|
- [ ] `#jazz` pre-processed to `jazz` before parsing
|
|
- [ ] Bare terms: `rust tutorial` → conjunction of `rust` AND `tutorial` (default conjunction mode)
|
|
- [ ] Exact phrase: `"exact phrase"` → `PhraseQuery` matching contiguous sequence
|
|
- [ ] Boolean AND: `jazz AND piano` → `BooleanQuery` with two must clauses
|
|
- [ ] Boolean OR: `jazz OR rock` → `BooleanQuery` with should clauses
|
|
- [ ] Boolean NOT / exclusion: `jazz -beginner` → excludes items with "beginner"
|
|
- [ ] Field-scoped: `title:jazz` → queries only the `title` field
|
|
- [ ] Wildcard prefix: `pian*` → matches "piano", "pianist", etc.
|
|
- [ ] Hashtag: `#jazz` → same results as bare `jazz`
|
|
- [ ] Invalid query string returns `TidalError::Query` with descriptive message
|
|
- [ ] `TextIndex::query_parser()` returns a `TextQueryParser` configured for the index
|
|
- [ ] Unit tests: all syntax types above with assertions on query type returned
|
|
- [ ] `cargo check`, `cargo fmt`, `cargo clippy -D warnings` all pass
|
|
|
|
## Full Integration Test (BM25 search end-to-end)
|
|
|
|
After tasks 01-05 complete, add an integration test in `tidal/tests/text_index.rs`:
|
|
|
|
```rust
|
|
/// Validates the full m5p1 text index pipeline:
/// index → write → commit → search → score
///
/// Indexes 100 items that all share the title pattern "Rust tutorial {i}",
/// the description "Learn Rust programming" and the category "programming",
/// then checks bare-term, phrase, field-scoped and exclusion queries.
#[test]
fn text_index_end_to_end() {
    let fields = vec![
        TextFieldDef { key: "title".into(), field_type: TextFieldType::Text },
        TextFieldDef { key: "description".into(), field_type: TextFieldType::Text },
        TextFieldDef { key: "category".into(), field_type: TextFieldType::Keyword },
    ];

    let idx = TextIndex::ephemeral(&fields).unwrap();

    // Write 100 items
    let mut w = idx.writer_guard().unwrap();
    for i in 0..100u64 {
        let mut meta = HashMap::new();
        meta.insert("title".into(), format!("Rust tutorial {i}"));
        meta.insert("description".into(), "Learn Rust programming".into());
        meta.insert("category".into(), "programming".into());
        w.index_item(EntityId::new(i), &meta).unwrap();
    }
    w.commit(100).unwrap();
    drop(w);

    idx.reader.reload().unwrap();
    let searcher = idx.reader.searcher();
    let parser = idx.query_parser();

    // Test 1: bare terms
    let q = parser.parse("Rust tutorial").unwrap();
    let collector = AllScoresCollector { entity_id_field: idx.fields().entity_id };
    let results = searcher.search(q.as_ref(), &collector).unwrap();
    assert!(!results.is_empty());

    // Test 2: exact phrase
    let q = parser.parse("\"Rust programming\"").unwrap();
    let results = searcher.search(q.as_ref(), &collector).unwrap();
    assert!(!results.is_empty()); // matches description

    // Test 3: field-scoped keyword
    let q = parser.parse("category:programming").unwrap();
    let results = searcher.search(q.as_ref(), &collector).unwrap();
    assert_eq!(results.len(), 100);

    // Test 4: exclusion
    // `-tutorial` removes every document that contains "tutorial" in any
    // default field. All 100 titles are "Rust tutorial {i}", so no document
    // may survive — a match on "Rust" in the description does not rescue it.
    let q = parser.parse("Rust -tutorial").unwrap();
    let results = searcher.search(q.as_ref(), &collector).unwrap();
    assert!(results.is_empty());

    // Test 5: BM25 latency < 10ms at 100 docs (trivial at this scale)
    let start = std::time::Instant::now();
    let q = parser.parse("Rust").unwrap();
    let _ = searcher.search(q.as_ref(), &collector).unwrap();
    assert!(start.elapsed().as_millis() < 10);
}
|
|
```
|