diff --git a/crates/stemedb-api/src/main.rs b/crates/stemedb-api/src/main.rs index 5428639..9cd72fd 100644 --- a/crates/stemedb-api/src/main.rs +++ b/crates/stemedb-api/src/main.rs @@ -209,8 +209,22 @@ async fn main() -> Result<(), Box> { .parse() .unwrap_or_else(|_| SocketAddr::from(([0, 0, 0, 0], 18180))); + // Advertise addresses: use POD_IP (k8s downward API) instead of bind address + // (0.0.0.0) which would cause peers to connect to themselves via loopback. + let advertise_ip = std::env::var("POD_IP") + .ok() + .and_then(|ip| ip.parse::().ok()) + .unwrap_or_else(|| rpc_addr.ip()); // Fallback: use bind IP (fine for local dev) + let advertise_rpc = SocketAddr::new(advertise_ip, rpc_addr.port()); + let advertise_api = SocketAddr::new(advertise_ip, api_addr.port()); + info!( + %advertise_rpc, + %advertise_api, + "Advertising cluster addresses" + ); + // --- Membership --- - let local_info = NodeInfo::new(node_id, rpc_addr, api_addr); + let local_info = NodeInfo::new(node_id, advertise_rpc, advertise_api); let membership = Arc::new(SwimMembership::new(local_info, SwimConfig::default())); // Resolve seeds via DNS (for k8s headless service names) @@ -277,8 +291,8 @@ async fn main() -> Result<(), Box> { Arc::clone(&store), Arc::clone(&merkle_manager), *node_id.as_bytes(), - rpc_addr.to_string(), - api_addr.to_string(), + advertise_rpc.to_string(), + advertise_api.to_string(), )); let grpc_service = SyncServiceServer::new(SyncServiceHandler::new(bridge)); diff --git a/crates/stemedb-cluster/src/membership/swim.rs b/crates/stemedb-cluster/src/membership/swim.rs index 2bd569e..a0dd8e3 100644 --- a/crates/stemedb-cluster/src/membership/swim.rs +++ b/crates/stemedb-cluster/src/membership/swim.rs @@ -410,6 +410,11 @@ impl SwimMembership { /// Marks a node as alive (responded to probe or refuted suspicion). #[instrument(skip(self))] pub fn alive_node(&self, node_id: NodeId, info: NodeInfo) { + // Never add ourselves to the members map — self is tracked separately + if node_id == self.local_id() { + return; + } + let lamport = self.tick(); // IMPORTANT: same deadlock hazard — drop RefMut from get_mut before update_node_gauges.