tidaldb/tidal/tests/tantivy_merge.rs
2026-02-23 22:41:16 -07:00

197 lines
6.3 KiB
Rust

//! Tantivy merge policy integration tests.
//!
//! These tests are marked `#[ignore]` because they require large-scale ingestion
//! (1M items) and are slow by design — they are intended for manual verification
//! of merge policy behaviour, not CI regression detection.
//!
//! Run them with:
//! ```text
//! cargo test --manifest-path tidal/Cargo.toml --test tantivy_merge -- --ignored
//! ```
#![allow(clippy::unwrap_used, clippy::cast_precision_loss)]
use std::collections::HashMap;
use std::time::{Duration, Instant};
use tidaldb::TidalDb;
use tidaldb::schema::{DecaySpec, EntityId, EntityKind, SchemaBuilder, TextFieldType};
/// Builds the database used by the segment-evolution test.
///
/// The schema registers a `view` signal on items with an exponential decay
/// whose half-life is 7 days, a `title` field of type `Text`, and a
/// `category` field of type `Keyword`. The database is opened through the
/// builder's `ephemeral()` mode.
///
/// # Panics
///
/// Panics if building the schema or opening the database fails.
fn make_db_with_text() -> TidalDb {
    // 7 days, expressed in seconds.
    let half_life = Duration::from_secs(7 * 24 * 3600);
    let view_decay = DecaySpec::Exponential { half_life };

    let mut schema_builder = SchemaBuilder::new();
    // The value returned by `add()` is explicitly discarded.
    let _ = schema_builder
        .signal("view", EntityKind::Item, view_decay)
        .add();
    schema_builder.text_field("title", TextFieldType::Text);
    schema_builder.text_field("category", TextFieldType::Keyword);
    let schema = schema_builder.build().unwrap();

    let opened = TidalDb::builder().ephemeral().with_schema(schema).open();
    opened.unwrap()
}
/// Checks that the Tantivy segment count ends up below 20 after a 1M-item
/// bulk ingestion followed by 10 steady-state write rounds of 5K items each.
///
/// **Manual verification test** — not run in CI.
///
/// Acceptance criterion: `text_segment_count() < 20` once every round has
/// been written and flushed.
///
/// # Panics
///
/// Panics if any write, reload, or flush fails, or if the final segment
/// count is 20 or more.
#[test]
#[ignore = "manual verification: takes ~3 minutes to ingest 1M items"]
fn tantivy_segment_evolution() {
    const BULK_ITEMS: u64 = 1_000_000;
    const CATEGORY_BUCKETS: u64 = 20;
    const STEADY_ROUNDS: u64 = 10;
    const ITEMS_PER_ROUND: u64 = 5_000;

    let db = make_db_with_text();

    // Phase 1: bulk ingestion. Entity ids are 1-based; titles and categories
    // cycle through 20 buckets.
    eprintln!("[tantivy_merge] Ingesting 1M items...");
    let ingest_started = Instant::now();
    for i in 0..BULK_ITEMS {
        let bucket = i % CATEGORY_BUCKETS;
        let meta = HashMap::from([
            (
                "title".to_string(),
                format!("Content item {i} about category {bucket}"),
            ),
            ("category".to_string(), format!("cat{bucket}")),
        ]);
        db.write_item_with_metadata(EntityId::new(i + 1), &meta)
            .unwrap();
    }
    let ingest_secs = ingest_started.elapsed().as_secs_f32();
    eprintln!("[tantivy_merge] Ingestion done in {ingest_secs:.1}s");

    // Give the text syncer time to commit the bulk load, then reload the
    // index before sampling the segment count.
    std::thread::sleep(Duration::from_secs(5));
    db.reload_text_index().unwrap();
    let count_after_ingest = db.text_segment_count();
    eprintln!("[tantivy_merge] Segments after 1M ingest: {count_after_ingest}");

    // Phase 2: steady-state rounds. Ids continue directly after the bulk
    // range, and the text index is flushed at the end of each round.
    for round in 0..STEADY_ROUNDS {
        let round_base = BULK_ITEMS + round * ITEMS_PER_ROUND;
        for i in 0..ITEMS_PER_ROUND {
            let meta = HashMap::from([
                (
                    "title".to_string(),
                    format!("Steady state round {round} item {i}"),
                ),
                ("category".to_string(), "steady".to_string()),
            ]);
            db.write_item_with_metadata(EntityId::new(round_base + i + 1), &meta)
                .unwrap();
        }
        db.flush_text_index().unwrap();
        let count = db.text_segment_count();
        eprintln!("[tantivy_merge] Round {round}: segment_count = {count}");
    }

    let final_count = db.text_segment_count();
    eprintln!("[tantivy_merge] Final segment count: {final_count}");
    assert!(
        final_count < 20,
        "segment_count={final_count} exceeds target of 20 at steady state"
    );
}
/// Verify concurrent read latency stays below 100ms p99 while a writer adds items.
///
/// The test seeds 10K items, then issues 500 searches from the calling thread
/// (pausing 20ms between searches) while a spawned writer thread inserts up to
/// 5K additional items (pausing 2ms between writes). Only the `search` call
/// itself is timed; the query is built once, outside the timed region.
///
/// **Manual verification test** — not run in CI.
///
/// Acceptance criterion: p99 search latency < 100ms while concurrent writes proceed.
///
/// # Panics
///
/// Panics if any database operation fails, if the writer thread panics, or if
/// the measured p99 latency is 100ms or more.
#[test]
#[ignore = "manual verification: takes ~30 seconds"]
fn tantivy_concurrent_read_write_latency() {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicBool, Ordering};

    let mut builder = SchemaBuilder::new();
    builder.text_field("title", TextFieldType::Text);
    let schema = builder.build().unwrap();
    let db = Arc::new(
        TidalDb::builder()
            .ephemeral()
            .with_schema(schema)
            .open()
            .unwrap(),
    );

    // Seed with 10K items (entity ids 1..=10_000).
    for i in 0u64..10_000 {
        let mut meta = HashMap::new();
        meta.insert(
            "title".to_string(),
            format!("Seed item {i} rust programming"),
        );
        db.write_item_with_metadata(EntityId::new(i + 1), &meta)
            .unwrap();
    }
    // Wait before reloading so the seed data can reach the text index.
    std::thread::sleep(Duration::from_millis(3000));
    db.reload_text_index().unwrap();

    // Build the query once: it is loop-invariant, and constructing it inside
    // the timed region would count builder overhead as search latency.
    let query = tidaldb::query::search::Search::builder()
        .query("rust programming async")
        .limit(10)
        .build()
        .unwrap();

    // Writer thread: add up to 5K items, one every ~2ms (about 10 seconds),
    // stopping early once the reader loop signals completion.
    let writer_db = Arc::clone(&db);
    let stop = Arc::new(AtomicBool::new(false));
    let stop_writer = Arc::clone(&stop);
    let writer = std::thread::spawn(move || {
        for i in 0u64..5_000 {
            if stop_writer.load(Ordering::Relaxed) {
                break;
            }
            let mut meta = HashMap::new();
            meta.insert(
                "title".to_string(),
                format!("Concurrent write item {i} async concurrency"),
            );
            // Ids continue directly after the seeded range.
            writer_db
                .write_item_with_metadata(EntityId::new(10_001 + i), &meta)
                .unwrap();
            std::thread::sleep(Duration::from_millis(2));
        }
    });

    // Reader loop (runs on the calling thread): search repeatedly and record
    // the latency of each search in milliseconds.
    let mut latencies_ms = Vec::with_capacity(500);
    for _ in 0..500 {
        let started = Instant::now();
        let _ = db.search(&query).unwrap();
        latencies_ms.push(started.elapsed().as_secs_f64() * 1000.0);
        std::thread::sleep(Duration::from_millis(20));
    }

    stop.store(true, Ordering::Relaxed);
    writer.join().unwrap();

    // `total_cmp` is a total order on f64, so sorting has no panic path.
    latencies_ms.sort_by(f64::total_cmp);
    // Clamp the index so it can never run past the last sample.
    let p99_idx = (latencies_ms.len() as f64 * 0.99) as usize;
    let p99 = latencies_ms[p99_idx.min(latencies_ms.len() - 1)];
    let p50 = latencies_ms[latencies_ms.len() / 2];
    eprintln!(
        "[tantivy_merge] Concurrent read latency: p50={p50:.2}ms p99={p99:.2}ms (n={})",
        latencies_ms.len()
    );
    assert!(
        p99 < 100.0,
        "concurrent read p99={p99:.2}ms exceeds 100ms target"
    );
}