fix: prevent phantom members — advertise pod IP instead of 0.0.0.0
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
Root cause: SyncStorageBridge returned its bind addresses (0.0.0.0) in PingResponse, so peers overwrote each other's recorded addresses with 0.0.0.0, which resolves to the local host (loopback) on connect. Subsequent probes therefore connected to the probing node itself, which then registered itself as a member (node_count: 4 instead of 3).

Two fixes:
- alive_node() now skips self (node_id == local_id guard)
- Advertise POD_IP (k8s downward API) in PingResponse instead of the bind address
This commit is contained in:
parent
4360a17dd3
commit
476d8e19e4
@ -209,8 +209,22 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
.parse()
|
||||
.unwrap_or_else(|_| SocketAddr::from(([0, 0, 0, 0], 18180)));
|
||||
|
||||
// Advertise addresses: use POD_IP (k8s downward API) instead of bind address
|
||||
// (0.0.0.0) which would cause peers to connect to themselves via loopback.
|
||||
let advertise_ip = std::env::var("POD_IP")
|
||||
.ok()
|
||||
.and_then(|ip| ip.parse::<std::net::IpAddr>().ok())
|
||||
.unwrap_or_else(|| rpc_addr.ip()); // Fallback: use bind IP (fine for local dev)
|
||||
let advertise_rpc = SocketAddr::new(advertise_ip, rpc_addr.port());
|
||||
let advertise_api = SocketAddr::new(advertise_ip, api_addr.port());
|
||||
info!(
|
||||
%advertise_rpc,
|
||||
%advertise_api,
|
||||
"Advertising cluster addresses"
|
||||
);
|
||||
|
||||
// --- Membership ---
|
||||
let local_info = NodeInfo::new(node_id, rpc_addr, api_addr);
|
||||
let local_info = NodeInfo::new(node_id, advertise_rpc, advertise_api);
|
||||
let membership = Arc::new(SwimMembership::new(local_info, SwimConfig::default()));
|
||||
|
||||
// Resolve seeds via DNS (for k8s headless service names)
|
||||
@ -277,8 +291,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&merkle_manager),
|
||||
*node_id.as_bytes(),
|
||||
rpc_addr.to_string(),
|
||||
api_addr.to_string(),
|
||||
advertise_rpc.to_string(),
|
||||
advertise_api.to_string(),
|
||||
));
|
||||
|
||||
let grpc_service = SyncServiceServer::new(SyncServiceHandler::new(bridge));
|
||||
|
||||
@ -410,6 +410,11 @@ impl SwimMembership {
|
||||
/// Marks a node as alive (responded to probe or refuted suspicion).
|
||||
#[instrument(skip(self))]
|
||||
pub fn alive_node(&self, node_id: NodeId, info: NodeInfo) {
|
||||
// Never add ourselves to the members map — self is tracked separately
|
||||
if node_id == self.local_id() {
|
||||
return;
|
||||
}
|
||||
|
||||
let lamport = self.tick();
|
||||
|
||||
// IMPORTANT: same deadlock hazard — drop RefMut from get_mut before update_node_gauges.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user