From 476d8e19e43607f2a7657691a9e809cf97764b5b Mon Sep 17 00:00:00 2001 From: jordan Date: Sat, 7 Mar 2026 20:13:27 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20prevent=20phantom=20members=20=E2=80=94?= =?UTF-8?q?=20advertise=20pod=20IP=20instead=20of=200.0.0.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: SyncStorageBridge returned bind addresses (0.0.0.0) in PingResponse, causing peers to overwrite each other's addresses with 0.0.0.0 (the unspecified bind address, which routes a connecting peer back to itself over loopback). Subsequent probes connected to self, registering self as a member (node_count: 4 instead of 3). Two fixes: - alive_node() now skips self (node_id == local_id guard) - Advertise POD_IP (k8s downward API) in PingResponse and in the local NodeInfo instead of bind addr; falls back to the bind IP when POD_IP is unset or unparseable --- crates/stemedb-api/src/main.rs | 20 ++++++++++++++++--- crates/stemedb-cluster/src/membership/swim.rs | 5 +++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/crates/stemedb-api/src/main.rs b/crates/stemedb-api/src/main.rs index 5428639..9cd72fd 100644 --- a/crates/stemedb-api/src/main.rs +++ b/crates/stemedb-api/src/main.rs @@ -209,8 +209,22 @@ async fn main() -> Result<(), Box> { .parse() .unwrap_or_else(|_| SocketAddr::from(([0, 0, 0, 0], 18180))); + // Advertise addresses: use POD_IP (k8s downward API) instead of bind address + // (0.0.0.0) which would cause peers to connect to themselves via loopback. 
+ let advertise_ip = std::env::var("POD_IP") + .ok() + .and_then(|ip| ip.parse::().ok()) + .unwrap_or_else(|| rpc_addr.ip()); // Fallback: use bind IP (fine for local dev) + let advertise_rpc = SocketAddr::new(advertise_ip, rpc_addr.port()); + let advertise_api = SocketAddr::new(advertise_ip, api_addr.port()); + info!( + %advertise_rpc, + %advertise_api, + "Advertising cluster addresses" + ); + // --- Membership --- - let local_info = NodeInfo::new(node_id, rpc_addr, api_addr); + let local_info = NodeInfo::new(node_id, advertise_rpc, advertise_api); let membership = Arc::new(SwimMembership::new(local_info, SwimConfig::default())); // Resolve seeds via DNS (for k8s headless service names) @@ -277,8 +291,8 @@ async fn main() -> Result<(), Box> { Arc::clone(&store), Arc::clone(&merkle_manager), *node_id.as_bytes(), - rpc_addr.to_string(), - api_addr.to_string(), + advertise_rpc.to_string(), + advertise_api.to_string(), )); let grpc_service = SyncServiceServer::new(SyncServiceHandler::new(bridge)); diff --git a/crates/stemedb-cluster/src/membership/swim.rs b/crates/stemedb-cluster/src/membership/swim.rs index 2bd569e..a0dd8e3 100644 --- a/crates/stemedb-cluster/src/membership/swim.rs +++ b/crates/stemedb-cluster/src/membership/swim.rs @@ -410,6 +410,11 @@ impl SwimMembership { /// Marks a node as alive (responded to probe or refuted suspicion). #[instrument(skip(self))] pub fn alive_node(&self, node_id: NodeId, info: NodeInfo) { + // Never add ourselves to the members map — self is tracked separately + if node_id == self.local_id() { + return; + } + let lamport = self.tick(); // IMPORTANT: same deadlock hazard — drop RefMut from get_mut before update_node_gauges.