fix: prevent phantom members — advertise pod IP instead of 0.0.0.0
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

Root cause: SyncStorageBridge returned its bind addresses (0.0.0.0) in
PingResponse, so peers overwrote each other's recorded addresses with
0.0.0.0. Dialing 0.0.0.0 reaches the local host via loopback, so
subsequent probes connected back to the probing node itself, which then
registered itself as a member (node_count: 4 instead of 3).

Two fixes:
- alive_node() now ignores updates for the local node (node_id == local_id guard)
- Advertise POD_IP (k8s downward API) instead of the bind address, both in
  the local NodeInfo given to SwimMembership and in PingResponse
This commit is contained in:
jordan 2026-03-07 20:13:27 -07:00
parent 4360a17dd3
commit 476d8e19e4
2 changed files with 22 additions and 3 deletions

View File

@ -209,8 +209,22 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.parse() .parse()
.unwrap_or_else(|_| SocketAddr::from(([0, 0, 0, 0], 18180))); .unwrap_or_else(|_| SocketAddr::from(([0, 0, 0, 0], 18180)));
// Advertise addresses: use POD_IP (k8s downward API) instead of bind address
// (0.0.0.0) which would cause peers to connect to themselves via loopback.
let advertise_ip = std::env::var("POD_IP")
.ok()
.and_then(|ip| ip.parse::<std::net::IpAddr>().ok())
.unwrap_or_else(|| rpc_addr.ip()); // Fallback: use bind IP (fine for local dev)
let advertise_rpc = SocketAddr::new(advertise_ip, rpc_addr.port());
let advertise_api = SocketAddr::new(advertise_ip, api_addr.port());
info!(
%advertise_rpc,
%advertise_api,
"Advertising cluster addresses"
);
// --- Membership --- // --- Membership ---
let local_info = NodeInfo::new(node_id, rpc_addr, api_addr); let local_info = NodeInfo::new(node_id, advertise_rpc, advertise_api);
let membership = Arc::new(SwimMembership::new(local_info, SwimConfig::default())); let membership = Arc::new(SwimMembership::new(local_info, SwimConfig::default()));
// Resolve seeds via DNS (for k8s headless service names) // Resolve seeds via DNS (for k8s headless service names)
@ -277,8 +291,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
Arc::clone(&store), Arc::clone(&store),
Arc::clone(&merkle_manager), Arc::clone(&merkle_manager),
*node_id.as_bytes(), *node_id.as_bytes(),
rpc_addr.to_string(), advertise_rpc.to_string(),
api_addr.to_string(), advertise_api.to_string(),
)); ));
let grpc_service = SyncServiceServer::new(SyncServiceHandler::new(bridge)); let grpc_service = SyncServiceServer::new(SyncServiceHandler::new(bridge));

View File

@ -410,6 +410,11 @@ impl SwimMembership {
/// Marks a node as alive (responded to probe or refuted suspicion). /// Marks a node as alive (responded to probe or refuted suspicion).
#[instrument(skip(self))] #[instrument(skip(self))]
pub fn alive_node(&self, node_id: NodeId, info: NodeInfo) { pub fn alive_node(&self, node_id: NodeId, info: NodeInfo) {
// Never add ourselves to the members map — self is tracked separately
if node_id == self.local_id() {
return;
}
let lamport = self.tick(); let lamport = self.tick();
// IMPORTANT: same deadlock hazard — drop RefMut from get_mut before update_node_gauges. // IMPORTANT: same deadlock hazard — drop RefMut from get_mut before update_node_gauges.