tidaldb/tidal/tests/text_index.rs

#![allow(clippy::unwrap_used)]
//! m5p1 Text Index end-to-end integration test.
//!
//! Validates the full BM25 pipeline: schema declaration → index → write →
//! commit → query parse → search → score. Uses an ephemeral in-RAM index so
//! no disk I/O is required.

use std::collections::HashMap;

use tidaldb::schema::{EntityId, TextFieldDef, TextFieldType};
use tidaldb::text::{AllScoresCollector, TextIndex};

fn make_fields() -> Vec<TextFieldDef> {
    vec![
        TextFieldDef {
            key: "title".into(),
            field_type: TextFieldType::Text,
        },
        TextFieldDef {
            key: "description".into(),
            field_type: TextFieldType::Text,
        },
        TextFieldDef {
            key: "category".into(),
            field_type: TextFieldType::Keyword,
        },
    ]
}

/// Validates the full m5p1 text index pipeline:
/// index → write → commit → search → score
///
/// Indexes 100 documents into an ephemeral index. Every document has a title
/// of the form `Rust tutorial {i}`, the description `Learn Rust programming`
/// and the category `programming`, so each of the first four queries below is
/// expected to return the entire corpus. The counts are pinned exactly (rather
/// than merely "non-empty") so that a regression dropping documents is caught.
#[test]
fn text_index_end_to_end() {
    let fields = make_fields();
    let idx = TextIndex::ephemeral(&fields).unwrap();

    // Write 100 items (entity ids 0 through 99).
    let mut w = idx.writer_guard().unwrap();
    for i in 0..100u64 {
        let mut meta = HashMap::new();
        meta.insert("title".into(), format!("Rust tutorial {i}"));
        meta.insert("description".into(), "Learn Rust programming".into());
        meta.insert("category".into(), "programming".into());
        w.index_item(EntityId::new(i), &meta).unwrap();
    }
    w.commit(100).unwrap();
    // Release the writer before refreshing the reader.
    drop(w);

    idx.reload_reader().unwrap();
    let searcher = idx.searcher();
    let parser = idx.query_parser();

    let collector = AllScoresCollector {
        entity_id_field: idx.fields().entity_id,
    };

    // Parses `query`, runs it, and returns how many results were collected.
    let count_hits = |query: &str| {
        let q = parser.parse(query).unwrap();
        let results = searcher.search(q.as_ref(), &collector).unwrap();
        results.len()
    };

    // Test 1: bare terms (AND conjunction) — "Rust tutorial" matches all 100.
    assert_eq!(
        count_hits("Rust tutorial"),
        100,
        "bare terms should match all 100 documents"
    );

    // Test 2: exact phrase — "Rust programming" is in every description.
    assert_eq!(
        count_hits("\"Rust programming\""),
        100,
        "exact phrase should match the description of all 100 documents"
    );

    // Test 3: field-scoped keyword — category:programming matches all 100.
    assert_eq!(
        count_hits("category:programming"),
        100,
        "keyword field-scoped query should match all 100"
    );

    // Test 4: exclusion — "Rust -foobarxyz" should match (exclusion term not in corpus).
    // MUST_NOT excludes at the document level; "foobarxyz" appears nowhere, so nothing excluded.
    assert_eq!(
        count_hits("Rust -foobarxyz"),
        100,
        "exclusion of absent term should still return all matching documents"
    );

    // Test 5: BM25 latency < 10ms at 100 docs (trivial at this scale).
    // The measured window is wall-clock time and covers both parsing and searching.
    let start = std::time::Instant::now();
    let q = parser.parse("Rust").unwrap();
    let _ = searcher.search(q.as_ref(), &collector).unwrap();
    let elapsed = start.elapsed();
    assert!(
        elapsed.as_millis() < 10,
        "BM25 query should complete in < 10ms at 100 docs (took {:?})",
        elapsed
    );
}

/// An explicit `OR` query must return at least as many results as the
/// implicit-AND query over the same terms, with exact counts pinned for a
/// three-document corpus.
#[test]
fn boolean_or_returns_superset_of_and() {
    let schema = vec![TextFieldDef {
        key: "title".into(),
        field_type: TextFieldType::Text,
    }];
    let index = TextIndex::ephemeral(&schema).unwrap();

    // Populate the index; the writer guard is released at the end of this scope.
    {
        let mut writer = index.writer_guard().unwrap();
        let corpus = [
            (1u64, "jazz piano"),
            (2u64, "rock guitar"),
            (3u64, "jazz violin"),
        ];
        for (id, title) in corpus {
            let doc = HashMap::from([("title".into(), title.into())]);
            writer.index_item(EntityId::new(id), &doc).unwrap();
        }
        writer.commit(3).unwrap();
    }

    index.reload_reader().unwrap();
    let searcher = index.searcher();
    let parser = index.query_parser();
    let collector = AllScoresCollector {
        entity_id_field: index.fields().entity_id,
    };

    // Implicit AND: both "jazz" and "piano" are required — only entity 1 qualifies.
    let conjunction = parser.parse("jazz piano").unwrap();
    let and_hits = searcher.search(conjunction.as_ref(), &collector).unwrap();

    // Explicit OR: either term suffices — entities 1 and 3 qualify.
    let disjunction = parser.parse("jazz OR piano").unwrap();
    let or_hits = searcher.search(disjunction.as_ref(), &collector).unwrap();

    let and_count = and_hits.len();
    let or_count = or_hits.len();

    assert!(
        or_count >= and_count,
        "OR should return at least as many results as AND"
    );
    assert_eq!(
        and_count, 1,
        "AND requires both 'jazz' and 'piano' — only entity 1"
    );
    assert_eq!(or_count, 2, "OR jazz or piano — entities 1 and 3");
}

/// Deleting an item removes it from search results after next commit.
///
/// The item is first confirmed to be searchable, so the final "no results"
/// assertion cannot pass vacuously when indexing itself is broken.
#[test]
fn delete_removes_from_results() {
    let fields = vec![TextFieldDef {
        key: "title".into(),
        field_type: TextFieldType::Text,
    }];
    let idx = TextIndex::ephemeral(&fields).unwrap();

    // Reloads the reader, then counts the results for the query "jazz" using a
    // freshly obtained searcher so the count reflects the latest commit.
    let jazz_hits = || {
        idx.reload_reader().unwrap();
        let searcher = idx.searcher();
        let parser = idx.query_parser();
        let collector = AllScoresCollector {
            entity_id_field: idx.fields().entity_id,
        };
        let q = parser.parse("jazz").unwrap();
        let results = searcher.search(q.as_ref(), &collector).unwrap();
        results.len()
    };

    // Index a single item and commit. The writer guard is released at the end
    // of the scope, before the reader is reloaded.
    {
        let mut w = idx.writer_guard().unwrap();
        let mut m = HashMap::new();
        m.insert("title".into(), "jazz piano".into());
        w.index_item(EntityId::new(1), &m).unwrap();
        w.commit(1).unwrap();
    }

    // Precondition: the item must be visible before it is deleted.
    assert_eq!(
        jazz_hits(),
        1,
        "item should be searchable before it is deleted"
    );

    // Delete and commit.
    // NOTE(review): assumes the writer guard can be re-acquired after being
    // dropped — confirm against `TextIndex::writer_guard`.
    {
        let mut w = idx.writer_guard().unwrap();
        w.delete_item(EntityId::new(1));
        w.commit(2).unwrap();
    }

    assert_eq!(
        jazz_hits(),
        0,
        "deleted item should not appear in results"
    );
}