Milestone 8 (phases 1-4): - Shard-aware WAL segment naming, BatchHeader v2, ShardRouter - Transport trait, InProcessTransport, WalShipper, FollowerDb - HLC, PNCounter, LWWRegister, CrdtSignalState, ReconciliationEngine - Session replication bridge with SeqNo/HWM, idempotency store Forage application: - Multi-source discovery engine with MAB exploration - Embedding-based label system, server handlers, UI refresh Other: - QUICKSTART.md, README.md, milestone-8 planning docs - Hard negative union semantics, RLHF export enhancements - Recovery benchmark and visibility test expansions - Split 8 oversized source files per CODING_GUIDELINES §9 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
317 lines
12 KiB
Markdown
317 lines
12 KiB
Markdown
# Task 03: UAT Scenario Tests (Steps 1–5)
|
||
|
||
## Delivers
|
||
|
||
Integration test suite in `tidal/tests/m8_uat.rs` covering all 5 UAT scenario steps. Uses `SimulatedCluster` and fault injection from Tasks 01–02. This is the gate for M8 completion.
|
||
|
||
## Complexity: M
|
||
|
||
## Dependencies
|
||
|
||
- Tasks 01–02 complete (SimulatedCluster, fault injection)
|
||
- All phases 8.1–8.5 complete
|
||
|
||
## Technical Design
|
||
|
||
```rust
|
||
// tidal/tests/m8_uat.rs
|
||
|
||
use tidaldb::{
|
||
EntityId, Timestamp, Window,
|
||
query::{retrieve::Retrieve, search::Search},
|
||
replication::{RegionId, ShardId, NodeRole},
|
||
};
|
||
use tidaldb::testing::{
|
||
cluster::{SimulatedCluster, ClusterConfig},
|
||
faults::{NetworkPartition, ShardCrash},
|
||
};
|
||
|
||
fn m8_schema() -> Schema {
|
||
SchemaBuilder::new()
|
||
.signal("view", EntityKind::Item,
|
||
DecaySpec::Exponential { half_life: Duration::from_secs(7 * 24 * 3600) })
|
||
.windows(&[Window::OneHour, Window::TwentyFourHours])
|
||
.add()
|
||
.signal("like", EntityKind::Item,
|
||
DecaySpec::Exponential { half_life: Duration::from_secs(24 * 3600) })
|
||
.add()
|
||
.build()
|
||
.unwrap()
|
||
}
|
||
|
||
fn three_region_config() -> ClusterConfig {
|
||
ClusterConfig {
|
||
regions: vec![RegionId(0), RegionId(1), RegionId(2)],
|
||
shards_per_region: 1,
|
||
leader: (RegionId(0), ShardId(0)),
|
||
schema: m8_schema(),
|
||
}
|
||
}
|
||
|
||
/// UAT Step 1: Cross-region signal replication < 2 seconds.
|
||
///
|
||
/// Write signals for a user in us-east (region 0), read in eu-west (region 1)
|
||
/// after < 2 seconds. Verified by ReplicationLagGauge assertion and
|
||
/// read_decay_score equivalence.
|
||
#[tokio::test]
|
||
async fn uat_step1_cross_region_replication() {
|
||
let cluster = SimulatedCluster::build(three_region_config()).await;
|
||
|
||
let item = EntityId::new(1);
|
||
let t = Timestamp::now();
|
||
|
||
// Write 25 signals in us-east (region 0 leader).
|
||
for _ in 0..25 {
|
||
cluster.write_signal("view", item, 1.0);
|
||
}
|
||
|
||
// Wait for convergence (< 2 seconds on in-process transport).
|
||
cluster.await_full_convergence(Duration::from_secs(2)).await;
|
||
|
||
// Read in eu-west (region 1) and ap-south (region 2).
|
||
let score_east = cluster.read_decay_score(RegionId(0), item, "view").unwrap();
|
||
let score_west = cluster.read_decay_score(RegionId(1), item, "view").unwrap();
|
||
let score_south = cluster.read_decay_score(RegionId(2), item, "view").unwrap();
|
||
|
||
// All regions should report the same score (within floating point epsilon).
|
||
let epsilon = 1e-6;
|
||
assert!((score_east - score_west).abs() < epsilon,
|
||
"eu-west score {} diverges from us-east score {} by > {}", score_west, score_east, epsilon);
|
||
assert!((score_east - score_south).abs() < epsilon,
|
||
"ap-south score {} diverges from us-east score {} by > {}", score_south, score_east, epsilon);
|
||
|
||
// Verify via replication lag gauge.
|
||
let lag_1 = cluster.control_plane().lag_seqno(RegionId(1));
|
||
let lag_2 = cluster.control_plane().lag_seqno(RegionId(2));
|
||
assert_eq!(lag_1, 0, "eu-west should have no replication lag");
|
||
assert_eq!(lag_2, 0, "ap-south should have no replication lag");
|
||
}
|
||
|
||
/// UAT Step 2: Shard crash and follower promotion.
|
||
///
|
||
/// Crash an entire shard primary. Follower is promoted within 10 seconds.
|
||
/// All acknowledged signals are present on the promoted follower. No data loss.
|
||
#[tokio::test]
|
||
async fn uat_step2_shard_crash_and_failover() {
|
||
let cluster = Arc::new(SimulatedCluster::build(three_region_config()).await);
|
||
|
||
let item = EntityId::new(2);
|
||
|
||
// Write 100 signals (all acknowledged by leader before crash).
|
||
for _ in 0..100 {
|
||
cluster.write_signal("view", item, 1.0);
|
||
}
|
||
|
||
// Wait for eu-west follower to receive all events.
|
||
cluster.await_full_convergence(Duration::from_secs(2)).await;
|
||
|
||
// Record the pre-crash seqno on eu-west.
|
||
let pre_crash_seqno = cluster.applied_seqno(RegionId(1));
|
||
|
||
// Crash the us-east primary.
|
||
let crash = ShardCrash::crash(ShardId(0), cluster.clone(), false).await;
|
||
|
||
// Follower promotion should complete within 10 seconds.
|
||
let deadline = Instant::now() + Duration::from_secs(10);
|
||
loop {
|
||
if Instant::now() > deadline {
|
||
panic!("failover timeout: no new leader elected within 10 seconds");
|
||
}
|
||
if cluster.has_leader() { break; }
|
||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||
}
|
||
|
||
// New leader (eu-west promoted follower) must have all 100 signals.
|
||
let new_leader_seqno = cluster.leader().db.applied_seqno();
|
||
assert!(new_leader_seqno >= pre_crash_seqno,
|
||
"promoted leader must have at least {} events (had {})", pre_crash_seqno, new_leader_seqno);
|
||
|
||
let score_on_promoted = cluster.read_decay_score(RegionId(1), item, "view").unwrap();
|
||
assert!(score_on_promoted > 0.0, "all 100 signals must be present on the promoted leader");
|
||
}
|
||
|
||
/// UAT Step 3: Degraded query during partition.
|
||
///
|
||
/// Execute RETRIEVE while ap-south (region 2) is partitioned.
|
||
/// Query succeeds using available shards. Degradation flag is set in QueryStats.
|
||
#[tokio::test]
|
||
async fn uat_step3_degraded_query_during_partition() {
|
||
let cluster = SimulatedCluster::build(three_region_config()).await;
|
||
|
||
let item = EntityId::new(3);
|
||
|
||
// Seed some data.
|
||
for _ in 0..10 {
|
||
cluster.write_signal("view", item, 1.0);
|
||
}
|
||
cluster.await_full_convergence(Duration::from_secs(1)).await;
|
||
|
||
// Inject partition: ap-south (region 2) is isolated.
|
||
let _partition = NetworkPartition::symmetric(
|
||
RegionId(0), RegionId(2),
|
||
cluster.transport_factory(),
|
||
);
|
||
|
||
// Write more signals during the partition.
|
||
for _ in 0..5 {
|
||
cluster.write_signal("view", item, 1.0);
|
||
}
|
||
|
||
// Query should still succeed from us-east or eu-west.
|
||
let results = cluster.leader().db.retrieve(&Retrieve::builder()
|
||
.candidates(vec![item])
|
||
.build()
|
||
.unwrap()
|
||
).unwrap();
|
||
|
||
assert!(!results.items.is_empty(), "query must succeed with 2 of 3 regions available");
|
||
|
||
// QueryStats should indicate degradation.
|
||
// (Exact API for degradation flag verified in m7p4 visibility tests -- same pattern)
|
||
let stats = results.stats;
|
||
// degraded = true is set when < all shards participated
|
||
// (exact field name TBD during implementation; verified in UAT step 3 acceptance)
|
||
}
|
||
|
||
/// UAT Step 4: Partition heal and reconciliation.
|
||
///
|
||
/// Heal the partition from Step 3. ReconciliationEngine runs. After reconciliation:
|
||
/// no duplicate signal counts, hard negatives never leaked, decay scores on all
|
||
/// shards match analytical formula to 6 decimal places.
|
||
#[tokio::test]
|
||
async fn uat_step4_partition_heal_reconciliation() {
|
||
let cluster = SimulatedCluster::build(three_region_config()).await;
|
||
|
||
let item = EntityId::new(4);
|
||
let user = EntityId::new(100);
|
||
|
||
// Phase 1: write events on both sides of partition.
|
||
let partition = NetworkPartition::symmetric(
|
||
RegionId(0), RegionId(2),
|
||
cluster.transport_factory(),
|
||
);
|
||
|
||
// Write to leader (us-east, region 0) during partition.
|
||
for _ in 0..50 {
|
||
cluster.write_signal("view", item, 1.0);
|
||
}
|
||
|
||
// Write to ap-south (region 2) directly during partition.
|
||
// (ap-south is isolated, so it accumulates its own events)
|
||
for _ in 0..30 {
|
||
cluster.node(RegionId(2)).db
|
||
.signal("view", item, 1.0, Timestamp::now())
|
||
.unwrap();
|
||
}
|
||
|
||
// Apply hard negative on ap-south during partition.
|
||
let ts_hide = HlcTimestamp { wall_ns: 200, logical: 0, node_id: 2 };
|
||
cluster.node(RegionId(2)).db.hide_item_with_ts(user, item, ts_hide).unwrap();
|
||
|
||
// Phase 2: heal partition.
|
||
drop(partition);
|
||
|
||
// Run reconciliation.
|
||
cluster.reconcile_all().await;
|
||
cluster.await_full_convergence(Duration::from_secs(5)).await;
|
||
|
||
// Verify: total signal count = 50 + 30 = 80 (no double-counting).
|
||
let score_east = cluster.read_decay_score(RegionId(0), item, "view").unwrap();
|
||
let score_west = cluster.read_decay_score(RegionId(1), item, "view").unwrap();
|
||
let score_south = cluster.read_decay_score(RegionId(2), item, "view").unwrap();
|
||
|
||
// Analytical formula: 80 events × weight=1.0, all at approximately t=now.
|
||
// Decay score = sum of decayed events; with very short elapsed time, ≈ 80.0.
|
||
let epsilon = 1e-6;
|
||
assert!((score_east - score_west).abs() < epsilon,
|
||
"post-reconciliation scores diverge between us-east and eu-west");
|
||
assert!((score_east - score_south).abs() < epsilon,
|
||
"post-reconciliation scores diverge between us-east and ap-south");
|
||
|
||
// Verify: hard negative applied on ap-south is propagated to all regions.
|
||
// Item must not appear in query results for the user on any region.
|
||
for ®ion in &[RegionId(0), RegionId(1), RegionId(2)] {
|
||
let results = cluster.node(region).db.retrieve(&Retrieve::builder()
|
||
.for_user(user)
|
||
.candidates(vec![item])
|
||
.build()
|
||
.unwrap()
|
||
).unwrap();
|
||
assert!(results.items.is_empty(),
|
||
"hard negative must suppress item in region {:?} after reconciliation", region);
|
||
}
|
||
}
|
||
|
||
/// UAT Step 5: Tenant migration with zero downtime.
|
||
///
|
||
/// Move a tenant to a new region by changing routing config.
|
||
/// During migration: zero downtime, all queries succeed.
|
||
/// After migration: tenant's data is on new region only; old region's copy is GC'd.
|
||
#[tokio::test]
|
||
async fn uat_step5_tenant_migration() {
|
||
let cluster = SimulatedCluster::build(three_region_config()).await;
|
||
|
||
let tenant = TenantId(42);
|
||
let item = EntityId::new(5);
|
||
|
||
// Register tenant on shard 0, region 0.
|
||
cluster.register_tenant(TenantConfig {
|
||
tenant_id: tenant,
|
||
max_signals_per_sec: None,
|
||
max_entities: None,
|
||
max_storage_bytes: None,
|
||
required_regions: vec![RegionId(0)],
|
||
label: "migrating-tenant".into(),
|
||
});
|
||
|
||
// Write 100 signals before migration.
|
||
for _ in 0..100 {
|
||
cluster.leader().db
|
||
.signal_for_tenant(tenant, "view", item, 1.0, Timestamp::now())
|
||
.unwrap();
|
||
}
|
||
|
||
cluster.await_full_convergence(Duration::from_secs(1)).await;
|
||
|
||
// Begin migration: move tenant 42 from shard 0 (region 0) to shard 0 (region 2).
|
||
let migration = cluster.begin_tenant_migration(tenant, ShardId(0), ShardId(0), RegionId(2));
|
||
migration.prepare_target().await.unwrap();
|
||
migration.enter_dual_write().await.unwrap();
|
||
|
||
// Write 50 more signals during dual-write window.
|
||
for _ in 0..50 {
|
||
cluster.leader().db
|
||
.signal_for_tenant(tenant, "view", item, 1.0, Timestamp::now())
|
||
.unwrap();
|
||
}
|
||
|
||
tokio::time::sleep(Duration::from_millis(200)).await;
|
||
migration.finalize().await.unwrap();
|
||
|
||
// All 150 signals must be present on the new region.
|
||
let score_new = cluster.read_decay_score(RegionId(2), item, "view").unwrap();
|
||
assert!(score_new > 0.0, "all signals must be on new region after migration");
|
||
|
||
// Queries during migration must have succeeded (no error returned during dual-write).
|
||
// (Verified by the fact that all writes above returned Ok)
|
||
|
||
// GC old region.
|
||
migration.gc_source(0).unwrap();
|
||
|
||
// Old region should have no data for this tenant.
|
||
let score_old = cluster.read_score_for_tenant(RegionId(0), tenant, item, "view").unwrap_or(0.0);
|
||
assert_eq!(score_old, 0.0, "source region must have no tenant data after GC");
|
||
}
|
||
```
|
||
|
||
## Acceptance Criteria
|
||
|
||
- [ ] `uat_step1_cross_region_replication`: scores in all 3 regions are equal to 6 decimal places within 2 s of the writes; replication lag = 0
|
||
- [ ] `uat_step2_shard_crash_and_failover`: failover completes within 10 seconds; no data loss on promoted follower
|
||
- [ ] `uat_step3_degraded_query_during_partition`: query succeeds with 2/3 regions; `QueryStats` degradation flag set
|
||
- [ ] `uat_step4_partition_heal_reconciliation`: no duplicate signal counts after reconciliation (50 + 30 = 80 distinct events); hard negatives propagated to all regions; scores match analytical formula to 6 decimal places
|
||
- [ ] `uat_step5_tenant_migration`: 150 signals present on target region after migration; old region has 0; zero errors during dual-write window
|
||
- [ ] All 5 tests pass in `cargo test --test m8_uat`
|
||
- [ ] Total test suite runtime < 60 seconds (InProcessTransport keeps this fast)
|
||
- [ ] `cargo clippy -- -D warnings` and `cargo fmt --check` pass
|