//! Tantivy merge policy integration tests. //! //! These tests are marked `#[ignore]` because they require large-scale ingestion //! (1M items) and are slow by design — they are intended for manual verification //! of merge policy behaviour, not CI regression detection. //! //! Run them with: //! ```text //! cargo test --manifest-path tidal/Cargo.toml --test tantivy_merge -- --ignored //! ``` #![allow(clippy::unwrap_used, clippy::cast_precision_loss)] use std::collections::HashMap; use std::time::{Duration, Instant}; use tidaldb::TidalDb; use tidaldb::schema::{DecaySpec, EntityId, EntityKind, SchemaBuilder, TextFieldType}; fn make_db_with_text() -> TidalDb { let mut builder = SchemaBuilder::new(); let _ = builder .signal( "view", EntityKind::Item, DecaySpec::Exponential { half_life: Duration::from_secs(7 * 24 * 3600), }, ) .add(); builder.text_field("title", TextFieldType::Text); builder.text_field("category", TextFieldType::Keyword); let schema = builder.build().unwrap(); TidalDb::builder() .ephemeral() .with_schema(schema) .open() .unwrap() } /// Verify that Tantivy segment count stays below 20 after 1M-item ingestion /// followed by 10 steady-state write rounds. /// /// **Manual verification test** — not run in CI. /// /// Acceptance criterion: `segment_count() < 20` after all rounds. #[test] #[ignore = "manual verification: takes ~3 minutes to ingest 1M items"] fn tantivy_segment_evolution() { let db = make_db_with_text(); eprintln!("[tantivy_merge] Ingesting 1M items..."); let t0 = Instant::now(); for i in 0u64..1_000_000 { let mut meta = HashMap::new(); meta.insert( "title".to_string(), format!("Content item {i} about category {}", i % 20), ); meta.insert("category".to_string(), format!("cat{}", i % 20)); db.write_item_with_metadata(EntityId::new(i + 1), &meta) .unwrap(); } eprintln!( "[tantivy_merge] Ingestion done in {:.1}s", t0.elapsed().as_secs_f32() ); // Allow text syncer to commit all 1M items. std::thread::sleep(Duration::from_millis(5000)); db.reload_text_index().unwrap(); let count_after_ingest = db.text_segment_count(); eprintln!("[tantivy_merge] Segments after 1M ingest: {count_after_ingest}"); // 10 steady-state rounds of 5K items each. for round in 0..10 { for i in 0u64..5_000 { let item_id = 1_000_000 + round * 5_000 + i; let mut meta = HashMap::new(); meta.insert( "title".to_string(), format!("Steady state round {round} item {i}"), ); meta.insert("category".to_string(), "steady".to_string()); db.write_item_with_metadata(EntityId::new(item_id + 1), &meta) .unwrap(); } db.flush_text_index().unwrap(); let count = db.text_segment_count(); eprintln!("[tantivy_merge] Round {round}: segment_count = {count}"); } let final_count = db.text_segment_count(); eprintln!("[tantivy_merge] Final segment count: {final_count}"); assert!( final_count < 20, "segment_count={final_count} exceeds target of 20 at steady state" ); } /// Verify concurrent read latency stays below 100ms p99 while a writer adds items. /// /// **Manual verification test** — not run in CI. /// /// Acceptance criterion: p99 search latency < 100ms while concurrent writes proceed. #[test] #[ignore = "manual verification: takes ~30 seconds"] fn tantivy_concurrent_read_write_latency() { use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; let mut builder = SchemaBuilder::new(); builder.text_field("title", TextFieldType::Text); let schema = builder.build().unwrap(); let db = Arc::new( TidalDb::builder() .ephemeral() .with_schema(schema) .open() .unwrap(), ); // Seed with 10K items. for i in 0u64..10_000 { let mut meta = HashMap::new(); meta.insert( "title".to_string(), format!("Seed item {i} rust programming"), ); db.write_item_with_metadata(EntityId::new(i + 1), &meta) .unwrap(); } std::thread::sleep(Duration::from_millis(3000)); db.reload_text_index().unwrap(); // Writer thread: add 5K items over 10 seconds. let writer_db = Arc::clone(&db); let stop = Arc::new(AtomicBool::new(false)); let stop_writer = Arc::clone(&stop); let writer = std::thread::spawn(move || { for i in 0u64..5_000 { if stop_writer.load(Ordering::Relaxed) { break; } let mut meta = HashMap::new(); meta.insert( "title".to_string(), format!("Concurrent write item {i} async concurrency"), ); writer_db .write_item_with_metadata(EntityId::new(10_001 + i), &meta) .unwrap(); std::thread::sleep(Duration::from_millis(2)); } }); // Reader thread: search continuously, collect latencies. let reader_db = Arc::clone(&db); let mut latencies_ms = Vec::with_capacity(500); for _ in 0..500 { let t = Instant::now(); let _ = reader_db .search( &tidaldb::query::search::Search::builder() .query("rust programming async") .limit(10) .build() .unwrap(), ) .unwrap(); latencies_ms.push(t.elapsed().as_secs_f64() * 1000.0); std::thread::sleep(Duration::from_millis(20)); } stop.store(true, Ordering::Relaxed); writer.join().unwrap(); latencies_ms.sort_by(|a, b| a.partial_cmp(b).unwrap()); let p99_idx = (latencies_ms.len() as f64 * 0.99) as usize; let p99 = latencies_ms[p99_idx.min(latencies_ms.len() - 1)]; let p50 = latencies_ms[latencies_ms.len() / 2]; eprintln!( "[tantivy_merge] Concurrent read latency: p50={p50:.2}ms p99={p99:.2}ms (n={})", latencies_ms.len() ); assert!( p99 < 100.0, "concurrent read p99={p99:.2}ms exceeds 100ms target" ); }