//! Partition tolerance tests for distributed consistency. //! //! These tests verify that StemeDB clusters: //! - Continue accepting writes during network partitions //! - Converge correctly after partition heals //! - Handle node failures and recovery //! - Survive cascading failures //! //! # Test Strategy //! //! We use the `TestCluster` harness with `NetworkController` to simulate //! partitions without requiring real network operations. #![allow(clippy::unwrap_used, clippy::expect_used)] use std::time::Duration; use stemedb_chaos::{ TestCluster, TestClusterAccessExt, TestClusterConvergenceExt, TestClusterCreationExt, TestClusterLifecycleExt, TestClusterSyncExt, }; /// Test: 5-node cluster with 2 nodes killed converges after recovery. /// /// Verifies: /// - Writes succeed while nodes 0, 1, 2 are alive /// - Nodes 3, 4 can recover by syncing from survivors /// - All 5 nodes converge after recovery #[tokio::test] async fn test_5_node_kill_2_convergence() { let mut cluster = TestCluster::spawn(5).await.expect("spawn cluster"); // Write baseline assertions to all nodes cluster.get_node_mut(0).write_assertion("baseline:1", "pred", 1000).await.expect("write"); cluster.sync_all().await.expect("sync baseline"); cluster.assert_converged(); // Kill nodes 3 and 4 cluster.kill_node(3); cluster.kill_node(4); // Continue writes to nodes 0, 1, 2 cluster.get_node_mut(0).write_assertion("after_kill:1", "pred", 2000).await.expect("write"); cluster.get_node_mut(1).write_assertion("after_kill:2", "pred", 3000).await.expect("write"); cluster.get_node_mut(2).write_assertion("after_kill:3", "pred", 4000).await.expect("write"); // Sync among surviving nodes cluster.sync_all().await.expect("sync survivors"); // Verify surviving nodes converged let alive = cluster.alive_node_indices(); assert_eq!(alive.len(), 3); cluster.assert_converged(); // "Restart" nodes 3 and 4 (fresh state, needs sync) cluster.restart_node(3).expect("restart 3"); cluster.restart_node(4).expect("restart 4"); // 
Verify they start empty assert_eq!(cluster.get_node(3).assertion_count(), 0); assert_eq!(cluster.get_node(4).assertion_count(), 0); // Sync all nodes cluster.sync_all().await.expect("sync all"); // Verify all 5 nodes converged with all assertions cluster.assert_converged(); // All nodes should have 4 assertions (1 baseline + 3 after kill) for i in 0..5 { assert_eq!(cluster.get_node(i).assertion_count(), 4, "Node {i} should have 4 assertions"); } } /// Test: Network partition between two groups converges after heal. /// /// Verifies: /// - Writes succeed on both sides of partition /// - Groups cannot sync during partition /// - Both groups' writes survive after heal #[tokio::test] async fn test_partition_between_groups_convergence() { let mut cluster = TestCluster::spawn(5).await.expect("spawn cluster"); // Create partition: [0,1,2] and [3,4] cluster.network().partition(&[0, 1, 2], &[3, 4]); // Write assertion A to group 1 cluster.get_node_mut(0).write_assertion("group1:data", "value_a", 1000).await.expect("write A"); // Write assertion B to group 2 cluster.get_node_mut(3).write_assertion("group2:data", "value_b", 2000).await.expect("write B"); // Sync within groups only (partition blocks cross-group) cluster.sync_all().await.expect("sync"); // Verify isolation: group 1 has A but not B assert_eq!(cluster.get_node(0).assertion_count(), 1); assert_eq!(cluster.get_node(1).assertion_count(), 1); assert_eq!(cluster.get_node(2).assertion_count(), 1); assert_eq!(cluster.get_node(3).assertion_count(), 1); assert_eq!(cluster.get_node(4).assertion_count(), 1); // Groups should NOT be converged (different data) assert!(!cluster.is_converged(), "Should not be converged during partition"); // Heal partition cluster.network().heal(); // Sync all nodes cluster.sync_all().await.expect("sync after heal"); // Verify all nodes have both A and B cluster.assert_converged(); for i in 0..5 { assert_eq!( cluster.get_node(i).assertion_count(), 2, "Node {i} should have 2 assertions after 
heal" ); } } /// Test: Message reordering doesn't affect convergence. /// /// Verifies CRDT commutativity - same assertions in different order /// produce the same final state. #[tokio::test] async fn test_message_reordering_convergence() { let mut cluster = TestCluster::spawn(3).await.expect("spawn cluster"); // Write 100 unique assertions rapidly for i in 0..100 { let node_idx = i % 3; let subject = format!("rapid:{i}"); cluster .get_node_mut(node_idx) .write_assertion(&subject, "pred", 1000 + i as u64) .await .expect("write"); } // Sync in different patterns to simulate reordering // First: 0 -> 1 -> 2 cluster.sync_pair(0, 1).await.expect("sync 0->1"); cluster.sync_pair(1, 2).await.expect("sync 1->2"); // Then: 2 -> 0 -> 1 cluster.sync_pair(2, 0).await.expect("sync 2->0"); cluster.sync_pair(0, 1).await.expect("sync 0->1"); // Finally: full mesh cluster.sync_all().await.expect("sync all"); // All nodes should have all 100 assertions cluster.assert_converged(); for i in 0..3 { assert_eq!( cluster.get_node(i).assertion_count(), 100, "Node {i} should have 100 assertions" ); } } /// Test: Message duplication is idempotent. /// /// Verifies that syncing the same assertions multiple times /// doesn't create duplicates (content-addressed). #[tokio::test] async fn test_message_duplication_idempotent() { let mut cluster = TestCluster::spawn(3).await.expect("spawn cluster"); // Write 50 assertions for i in 0..50 { cluster .get_node_mut(0) .write_assertion(&format!("dup:{i}"), "pred", 1000 + i as u64) .await .expect("write"); } // Sync multiple times (simulating duplication) for _ in 0..5 { cluster.sync_pair(0, 1).await.expect("sync"); cluster.sync_pair(0, 2).await.expect("sync"); } // Verify no duplicates - should have exactly 50 assertions cluster.assert_converged(); for i in 0..3 { assert_eq!( cluster.get_node(i).assertion_count(), 50, "Node {i} should have exactly 50 assertions (no duplicates)" ); } } /// Test: Cascading failure recovery. 
///
/// Kills nodes 0, 1 and 2 one after another (writing and syncing in between),
/// checks that the two remaining nodes hold everything, then restarts the
/// killed nodes one at a time and expects the full cluster to converge.
#[tokio::test]
async fn test_cascading_failure_recovery() {
    let mut cluster = TestCluster::spawn(5).await.expect("spawn cluster");

    // Baseline write on node 0, synced to the cluster.
    cluster.get_node_mut(0).write_assertion("baseline", "pred", 1000).await.expect("write");
    cluster.sync_all().await.expect("sync baseline");

    // Each step kills one node, writes on the next one, then syncs:
    // kill 0 / write on 1, then kill 1 / write on 2.
    for (victim, writer, subject, n) in [(0, 1, "after_0", 2000), (1, 2, "after_1", 3000)] {
        cluster.kill_node(victim);
        cluster.get_node_mut(writer).write_assertion(subject, "pred", n).await.expect("write");
        cluster.sync_all().await.expect("sync");
    }

    // Third failure: only nodes 3 and 4 are left.
    cluster.kill_node(2);

    // The two remaining nodes must agree and hold all 3 assertions.
    cluster.assert_converged();
    assert_eq!(cluster.alive_node_indices().len(), 2);
    for survivor in [3, 4] {
        assert_eq!(cluster.get_node(survivor).assertion_count(), 3);
    }

    // Restart the killed nodes one at a time, syncing after each restart.
    for (node, label) in [(0, "restart 0"), (1, "restart 1"), (2, "restart 2")] {
        cluster.restart_node(node).expect(label);
        cluster.sync_all().await.expect("sync");
    }

    // All five nodes must be converged on the same 3 assertions.
    cluster.assert_converged();
    for i in 0..5 {
        let count = cluster.get_node(i).assertion_count();
        assert_eq!(count, 3, "Node {i} should have 3 assertions");
    }
}

/// Test: SWIM suspicion doesn't cause false positive under slow responses.
///
/// Verifies that a slow node is marked Suspect but recovers to Alive
/// when responses resume.
#[tokio::test] async fn test_swim_suspicion_not_false_positive() { // Create cluster to test SWIM behavior let cluster = TestCluster::spawn_with_config(5, 4, 2).await.expect("spawn"); // Get node IDs for SWIM operations let node1_id = cluster.get_node(1).node_id(); let node1_info = stemedb_cluster::membership::NodeInfo::new( node1_id, std::net::SocketAddr::from(([127, 0, 0, 1], 9091)), std::net::SocketAddr::from(([127, 0, 0, 1], 8081)), ); // Register node 1 with node 0's membership cluster.get_node(0).membership().alive_node(node1_id, node1_info.clone()); // Slow down node 1's responses (add latency close to timeout) cluster.network().set_latency_bidirectional(0, 1, Duration::from_millis(100)); // Node 0 suspects node 1 (simulating failed probe) cluster.get_node(0).membership().suspect_node(node1_id); // Verify node 1 is marked Suspect let state = cluster.get_node(0).get_peer_state(node1_id); assert_eq!(state, Some(stemedb_cluster::membership::NodeState::Suspect)); // Speed up node 1's responses (remove latency) cluster.network().clear_latency(0, 1); cluster.network().clear_latency(1, 0); // Node 0 receives successful response, marks node 1 Alive again cluster.get_node(0).membership().alive_node(node1_id, node1_info); // Verify node 1 recovered to Alive let state = cluster.get_node(0).get_peer_state(node1_id); assert_eq!(state, Some(stemedb_cluster::membership::NodeState::Alive)); } /// Test: Writes during asymmetric partition. /// /// Tests where only one direction of communication is blocked. 
#[tokio::test] async fn test_asymmetric_partition() { let mut cluster = TestCluster::spawn(3).await.expect("spawn cluster"); // Asymmetric partition: 0 can send to 1, but 1 cannot send to 0 cluster.network().partition_one_way(1, 0); // Node 0 writes cluster.get_node_mut(0).write_assertion("from_0", "pred", 1000).await.expect("write"); // Node 1 writes cluster.get_node_mut(1).write_assertion("from_1", "pred", 2000).await.expect("write"); // Sync: 0->1 should work, 1->0 should be blocked let merged_1 = cluster.sync_pair(0, 1).await.expect("sync 0->1"); let merged_0 = cluster.sync_pair(1, 0).await.expect("sync 1->0"); assert_eq!(merged_1, 1, "Node 1 should receive from 0"); assert_eq!(merged_0, 0, "Node 0 should not receive from 1 (blocked)"); // Heal and verify convergence cluster.network().heal(); cluster.sync_all().await.expect("sync all"); cluster.assert_converged(); } /// Test: Write availability during partition. /// /// Verifies that nodes can accept writes even when partitioned from others. #[tokio::test] async fn test_write_availability_during_partition() { let mut cluster = TestCluster::spawn(4).await.expect("spawn cluster"); // Severe partition: each node isolated cluster.network().partition(&[0], &[1, 2, 3]); cluster.network().partition(&[1], &[2, 3]); cluster.network().partition(&[2], &[3]); // All nodes can still write locally for i in 0..4 { cluster .get_node_mut(i) .write_assertion(&format!("isolated:{i}"), "pred", 1000 + i as u64) .await .expect("write should succeed"); } // Each node has exactly 1 assertion for i in 0..4 { assert_eq!(cluster.get_node(i).assertion_count(), 1); } // Heal all partitions cluster.network().heal(); cluster.sync_all().await.expect("sync"); // All nodes should have all 4 assertions cluster.assert_converged(); for i in 0..4 { assert_eq!( cluster.get_node(i).assertion_count(), 4, "Node {i} should have 4 assertions after heal" ); } }