# Task 03: UAT Scenario Tests (Steps 1–5)

## Delivers

Integration test suite in `tidal/tests/m8_uat.rs` covering all 5 UAT scenario steps.
Uses `SimulatedCluster` and fault injection from Tasks 01–02. This is the gate for
M8 completion.

## Complexity: M

## Dependencies

- Tasks 01–02 complete (SimulatedCluster, fault injection)
- All phases 8.1–8.5 complete

## Technical Design

```rust
// tidal/tests/m8_uat.rs

use std::sync::Arc;
use std::time::{Duration, Instant};

use tidaldb::{
    EntityId, Timestamp, Window,
    query::{retrieve::Retrieve, search::Search},
    replication::{RegionId, ShardId, NodeRole},
};
use tidaldb::testing::{
    cluster::{SimulatedCluster, ClusterConfig},
    faults::{NetworkPartition, ShardCrash},
};
// NOTE(review): `Schema`, `SchemaBuilder`, `EntityKind`, `DecaySpec`, `HlcTimestamp`,
// `TenantId` and `TenantConfig` are used below but not imported here — add the
// matching `use tidaldb::...` paths during implementation (module paths TBD).
// `Search` and `NodeRole` are currently unused; drop them if they stay unused,
// otherwise the `-D warnings` gate will fail.

/// Schema shared by every UAT step: a windowed `view` signal with a 7-day
/// half-life and a `like` signal with a 24-hour half-life, both on items.
fn m8_schema() -> Schema {
    SchemaBuilder::new()
        .signal("view", EntityKind::Item, DecaySpec::Exponential {
            half_life: Duration::from_secs(7 * 24 * 3600),
        })
        .windows(&[Window::OneHour, Window::TwentyFourHours])
        .add()
        .signal("like", EntityKind::Item, DecaySpec::Exponential {
            half_life: Duration::from_secs(24 * 3600),
        })
        .add()
        .build()
        .unwrap()
}

/// Three regions (0 = us-east, 1 = eu-west, 2 = ap-south), one shard each,
/// with the us-east shard as the initial leader.
fn three_region_config() -> ClusterConfig {
    ClusterConfig {
        regions: vec![RegionId(0), RegionId(1), RegionId(2)],
        shards_per_region: 1,
        leader: (RegionId(0), ShardId(0)),
        schema: m8_schema(),
    }
}

/// UAT Step 1: Cross-region signal replication < 2 seconds.
///
/// Write signals for a user in us-east (region 0), read in eu-west (region 1)
/// after < 2 seconds. Verified by ReplicationLagGauge assertion and
/// read_decay_score equivalence.
#[tokio::test]
async fn uat_step1_cross_region_replication() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    let item = EntityId::new(1);

    // Write 25 signals in us-east (region 0 leader).
    for _ in 0..25 {
        cluster.write_signal("view", item, 1.0);
    }

    // Wait for convergence (< 2 seconds on in-process transport).
    cluster.await_full_convergence(Duration::from_secs(2)).await;

    // Read in eu-west (region 1) and ap-south (region 2).
    let score_east = cluster.read_decay_score(RegionId(0), item, "view").unwrap();
    let score_west = cluster.read_decay_score(RegionId(1), item, "view").unwrap();
    let score_south = cluster.read_decay_score(RegionId(2), item, "view").unwrap();

    // All regions should report the same score (within floating point epsilon).
    let epsilon = 1e-6;
    assert!((score_east - score_west).abs() < epsilon,
        "eu-west score {} diverges from us-east score {} by > {}",
        score_west, score_east, epsilon);
    assert!((score_east - score_south).abs() < epsilon,
        "ap-south score {} diverges from us-east score {} by > {}",
        score_south, score_east, epsilon);

    // Verify via replication lag gauge.
    let lag_1 = cluster.control_plane().lag_seqno(RegionId(1));
    let lag_2 = cluster.control_plane().lag_seqno(RegionId(2));
    assert_eq!(lag_1, 0, "eu-west should have no replication lag");
    assert_eq!(lag_2, 0, "ap-south should have no replication lag");
}

/// UAT Step 2: Shard crash and follower promotion.
///
/// Crash an entire shard primary. Follower is promoted within 10 seconds.
/// All acknowledged signals are present on the promoted follower. No data loss.
#[tokio::test]
async fn uat_step2_shard_crash_and_failover() {
    let cluster = Arc::new(SimulatedCluster::build(three_region_config()).await);
    let item = EntityId::new(2);

    // Write 100 signals (all acknowledged by leader before crash).
    for _ in 0..100 {
        cluster.write_signal("view", item, 1.0);
    }

    // Wait for eu-west follower to receive all events.
    cluster.await_full_convergence(Duration::from_secs(2)).await;

    // Record the pre-crash seqno on eu-west.
    let pre_crash_seqno = cluster.applied_seqno(RegionId(1));

    // Crash the us-east primary. The handle is bound (not discarded with `_`)
    // so it lives until the end of the test, exactly as the original binding did.
    let _crash = ShardCrash::crash(ShardId(0), cluster.clone(), false).await;

    // Follower promotion should complete within 10 seconds.
    let deadline = Instant::now() + Duration::from_secs(10);
    loop {
        if Instant::now() > deadline {
            panic!("failover timeout: no new leader elected within 10 seconds");
        }
        if cluster.has_leader() {
            break;
        }
        tokio::time::sleep(Duration::from_millis(100)).await;
    }

    // New leader (eu-west promoted follower) must have all 100 signals.
    let new_leader_seqno = cluster.leader().db.applied_seqno();
    assert!(new_leader_seqno >= pre_crash_seqno,
        "promoted leader must have at least {} events (had {})",
        pre_crash_seqno, new_leader_seqno);

    let score_on_promoted = cluster.read_decay_score(RegionId(1), item, "view").unwrap();
    assert!(score_on_promoted > 0.0,
        "all 100 signals must be present on the promoted leader");
}

/// UAT Step 3: Degraded query during partition.
///
/// Execute RETRIEVE while ap-south (region 2) is partitioned.
/// Query succeeds using available shards. Degradation flag is set in QueryStats.
#[tokio::test]
async fn uat_step3_degraded_query_during_partition() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    let item = EntityId::new(3);

    // Seed some data.
    for _ in 0..10 {
        cluster.write_signal("view", item, 1.0);
    }
    cluster.await_full_convergence(Duration::from_secs(1)).await;

    // Inject partition: ap-south (region 2) is isolated.
    let _partition = NetworkPartition::symmetric(
        RegionId(0), RegionId(2), cluster.transport_factory(),
    );

    // Write more signals during the partition.
    for _ in 0..5 {
        cluster.write_signal("view", item, 1.0);
    }

    // Query should still succeed from us-east or eu-west.
    let results = cluster.leader().db.retrieve(&Retrieve::builder()
        .candidates(vec![item])
        .build()
        .unwrap()
    ).unwrap();

    assert!(!results.items.is_empty(), "query must succeed with 2 of 3 regions available");

    // QueryStats should indicate degradation.
    // (Exact API for degradation flag verified in m7p4 visibility tests -- same pattern)
    // TODO: assert the degradation flag once its field name is fixed; until then
    // the binding is underscore-prefixed so it does not trip `-D warnings`.
    let _stats = results.stats;
    // degraded = true is set when < all shards participated
    // (exact field name TBD during implementation; verified in UAT step 3 acceptance)
}

/// UAT Step 4: Partition heal and reconciliation.
///
/// Heal the partition from Step 3. ReconciliationEngine runs. After reconciliation:
/// no duplicate signal counts, hard negatives never leaked, decay scores on all
/// shards match analytical formula to 6 decimal places.
#[tokio::test]
async fn uat_step4_partition_heal_reconciliation() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    let item = EntityId::new(4);
    let user = EntityId::new(100);

    // Phase 1: write events on both sides of partition.
    let partition = NetworkPartition::symmetric(
        RegionId(0), RegionId(2), cluster.transport_factory(),
    );

    // Write to leader (us-east, region 0) during partition.
    for _ in 0..50 {
        cluster.write_signal("view", item, 1.0);
    }

    // Write to ap-south (region 2) directly during partition.
    // (ap-south is isolated, so it accumulates its own events)
    for _ in 0..30 {
        cluster.node(RegionId(2)).db
            .signal("view", item, 1.0, Timestamp::now())
            .unwrap();
    }

    // Apply hard negative on ap-south during partition.
    let ts_hide = HlcTimestamp { wall_ns: 200, logical: 0, node_id: 2 };
    cluster.node(RegionId(2)).db.hide_item_with_ts(user, item, ts_hide).unwrap();

    // Phase 2: heal partition.
    drop(partition);

    // Run reconciliation.
    cluster.reconcile_all().await;
    cluster.await_full_convergence(Duration::from_secs(5)).await;

    // Verify: total signal count = 50 + 30 = 80 (no double-counting).
    let score_east = cluster.read_decay_score(RegionId(0), item, "view").unwrap();
    let score_west = cluster.read_decay_score(RegionId(1), item, "view").unwrap();
    let score_south = cluster.read_decay_score(RegionId(2), item, "view").unwrap();

    // Analytical formula: 80 events × weight=1.0, all at approximately t=now.
    // Decay score = sum of decayed events; with very short elapsed time, ≈ 80.0.
    let epsilon = 1e-6;
    assert!((score_east - score_west).abs() < epsilon,
        "post-reconciliation scores diverge between us-east and eu-west");
    assert!((score_east - score_south).abs() < epsilon,
        "post-reconciliation scores diverge between us-east and ap-south");

    // Cross-region equality alone cannot detect double-counting: every region
    // could agree on an inflated total. Pin the absolute value as well. With a
    // 7-day half-life the decay over the few seconds this test runs is well
    // under 1e-2, while one duplicated event would shift the score by ~1.0.
    let count_tolerance = 1e-2;
    assert!((score_east - 80.0).abs() < count_tolerance,
        "expected ~80.0 (50 + 30 distinct events) after reconciliation, got {}",
        score_east);

    // Verify: hard negative applied on ap-south is propagated to all regions.
    // Item must not appear in query results for the user on any region.
    for &region in &[RegionId(0), RegionId(1), RegionId(2)] {
        let results = cluster.node(region).db.retrieve(&Retrieve::builder()
            .for_user(user)
            .candidates(vec![item])
            .build()
            .unwrap()
        ).unwrap();
        assert!(results.items.is_empty(),
            "hard negative must suppress item in region {:?} after reconciliation", region);
    }
}

/// UAT Step 5: Tenant migration with zero downtime.
///
/// Move a tenant to a new region by changing routing config.
/// During migration: zero downtime, all queries succeed.
/// After migration: tenant's data is on new region only; old region's copy is GC'd.
#[tokio::test]
async fn uat_step5_tenant_migration() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    let tenant = TenantId(42);
    let item = EntityId::new(5);

    // Register tenant on shard 0, region 0.
    cluster.register_tenant(TenantConfig {
        tenant_id: tenant,
        max_signals_per_sec: None,
        max_entities: None,
        max_storage_bytes: None,
        required_regions: vec![RegionId(0)],
        label: "migrating-tenant".into(),
    });

    // Write 100 signals before migration.
    for _ in 0..100 {
        cluster.leader().db
            .signal_for_tenant(tenant, "view", item, 1.0, Timestamp::now())
            .unwrap();
    }
    cluster.await_full_convergence(Duration::from_secs(1)).await;

    // Begin migration: move tenant 42 from shard 0 (region 0) to shard 0 (region 2).
    let migration = cluster.begin_tenant_migration(tenant, ShardId(0), ShardId(0), RegionId(2));
    migration.prepare_target().await.unwrap();
    migration.enter_dual_write().await.unwrap();

    // Write 50 more signals during dual-write window.
    for _ in 0..50 {
        cluster.leader().db
            .signal_for_tenant(tenant, "view", item, 1.0, Timestamp::now())
            .unwrap();
    }

    tokio::time::sleep(Duration::from_millis(200)).await;
    migration.finalize().await.unwrap();

    // All 150 signals must be present on the new region.
    let score_new = cluster.read_decay_score(RegionId(2), item, "view").unwrap();
    assert!(score_new > 0.0, "all signals must be on new region after migration");

    // Queries during migration must have succeeded (no error returned during dual-write).
    // (Verified by the fact that all writes above returned Ok)

    // GC old region.
    migration.gc_source(0).unwrap();

    // Old region should have no data for this tenant.
    let score_old = cluster.read_score_for_tenant(RegionId(0), tenant, item, "view").unwrap_or(0.0);
    assert_eq!(score_old, 0.0, "source region must have no tenant data after GC");
}
```

## Acceptance Criteria

- [ ] `uat_step1_cross_region_replication`: scores in all 3 regions equal within 6 decimal places after < 2s; replication lag = 0
- [ ] `uat_step2_shard_crash_and_failover`: failover completes within 10 seconds; no data loss on promoted follower
- [ ] `uat_step3_degraded_query_during_partition`: query succeeds with 2/3 regions; `QueryStats` degradation flag set
- [ ] `uat_step4_partition_heal_reconciliation`: no duplicate signal counts after reconciliation (50 + 30 = 80 distinct events); hard negatives propagated to all regions; scores match analytical formula to 6 decimal places
- [ ] `uat_step5_tenant_migration`: 150 signals present on target region after migration; old region has 0; zero errors during dual-write window
- [ ] All 5 tests pass in `cargo test --test m8_uat`
- [ ] Total test suite runtime < 60 seconds (InProcessTransport keeps this fast)
- [ ] `cargo clippy -- -D warnings` and `cargo fmt` pass