diff --git a/CLAUDE.md b/CLAUDE.md index 889e54e..a30e0e3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -34,6 +34,8 @@ A probabilistic knowledge graph database that stores Claims, not Facts. Append-o | **Consumer Health UAT** | [uat/consumer-health/README.md](./uat/consumer-health/README.md) | | **Verify production readiness** | [uat/production-readiness/README.md](./uat/production-readiness/README.md) | | **Deploy to production** | [docs/operations/README.md](./docs/operations/README.md) | +| **Manage cluster nodes** | [docs/operations/node-lifecycle.md](./docs/operations/node-lifecycle.md) | +| **Install admin CLI** | [docs/operations/deployment/install-admin-cli.md](./docs/operations/deployment/install-admin-cli.md) | | **Troubleshoot incidents** | [docs/operations/runbooks/](./docs/operations/runbooks/) | | **Size your deployment** | [docs/operations/reference-architecture/resource-sizing.md](./docs/operations/reference-architecture/resource-sizing.md) | | **Validate pilot success** | [docs/operations/pilot-success-criteria.md](./docs/operations/pilot-success-criteria.md) | diff --git a/Cargo.toml b/Cargo.toml index 1c687b3..b69b218 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ members = [ "crates/stemedb-cluster", "crates/stemedb-chaos", "crates/stemedb-ontology", + "crates/stemedb-admin", "applications/aphoria", ] resolver = "2" diff --git a/crates/stemedb-admin/Cargo.toml b/crates/stemedb-admin/Cargo.toml new file mode 100644 index 0000000..70ff05b --- /dev/null +++ b/crates/stemedb-admin/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "stemedb-admin" +version = "0.1.0" +edition = "2021" +authors = ["StemeDB Team"] +description = "StemeDB Cluster Administration Tool" +license = "MIT OR Apache-2.0" + +[dependencies] +clap = { version = "4.4", features = ["derive", "env"] } +reqwest = { version = "0.11", features = ["json"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +tokio = { version = "1.35", features = ["macros", 
"rt-multi-thread"] } +anyhow = "1.0" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +comfy-table = "7.1" +colored = "2.1" +chrono = { version = "0.4", features = ["serde"] } + +[lib] +name = "stemedb_admin" +path = "src/lib.rs" + +[[bin]] +name = "stemedb-admin" +path = "src/main.rs" diff --git a/crates/stemedb-admin/src/cli.rs b/crates/stemedb-admin/src/cli.rs new file mode 100644 index 0000000..2015a67 --- /dev/null +++ b/crates/stemedb-admin/src/cli.rs @@ -0,0 +1,101 @@ +use clap::{Parser, Subcommand}; + +use crate::output::OutputFormat; + +#[derive(Parser)] +#[command(name = "stemedb-admin")] +#[command(version)] +#[command(about = "StemeDB Cluster Administration Tool", long_about = None)] +pub struct Cli { + /// Gateway address + #[arg(long, env = "STEMEDB_GATEWAY_ADDR", default_value = "http://localhost:18181")] + pub gateway: String, + + /// Output format (table, json) + #[arg(short, long, default_value = "table")] + pub format: OutputFormat, + + /// Enable verbose logging + #[arg(short, long)] + pub verbose: bool, + + #[command(subcommand)] + pub command: Commands, +} + +#[derive(Subcommand)] +pub enum Commands { + /// Cluster operations + Cluster { + #[command(subcommand)] + cmd: ClusterCommands, + }, + /// Node operations + Node { + #[command(subcommand)] + cmd: NodeCommands, + }, + /// Shard operations + Shard { + #[command(subcommand)] + cmd: ShardCommands, + }, + /// Debug operations + Debug { + #[command(subcommand)] + cmd: DebugCommands, + }, +} + +#[derive(Subcommand)] +pub enum ClusterCommands { + /// Show cluster status overview + Status, + /// Quick health check (exit code 0 if healthy, 1 if unhealthy) + Health, +} + +#[derive(Subcommand)] +pub enum NodeCommands { + /// List all nodes in the cluster + List, + /// Show detailed information about a specific node + Info { + /// Node ID (short hex format) + node_id: String, + }, + /// Show shards assigned to a specific node + Shards { + /// Node ID (short hex 
format) + node_id: String, + /// Show only leader shards + #[arg(long)] + leader: bool, + }, +} + +#[derive(Subcommand)] +pub enum ShardCommands { + /// List all shards + List, + /// Show detailed information about a specific shard + Info { + /// Shard ID + shard_id: u32, + }, + /// Show replica nodes for a specific shard + Replicas { + /// Shard ID + shard_id: u32, + }, +} + +#[derive(Subcommand)] +pub enum DebugCommands { + /// Export complete cluster state for debugging + Export { + /// Output file path + #[arg(short, long, default_value = "cluster-state.json")] + output: String, + }, +} diff --git a/crates/stemedb-admin/src/client.rs b/crates/stemedb-admin/src/client.rs new file mode 100644 index 0000000..34a6300 --- /dev/null +++ b/crates/stemedb-admin/src/client.rs @@ -0,0 +1,160 @@ +use anyhow::{Context, Result}; +use tracing::{debug, instrument}; + +use crate::types::{ + ClusterStatusResponse, HealthResponse, RangeInfoDto, RangesWrapper, ShardInfoResponse, +}; + +/// HTTP client for StemeDB Gateway API +pub struct AdminClient { + base_url: String, + client: reqwest::Client, +} + +impl AdminClient { + /// Create a new admin client pointing to the gateway + pub fn new(base_url: String) -> Self { + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(30)) + .build() + .expect("Failed to build HTTP client"); + + Self { base_url, client } + } + + /// Check gateway health + #[instrument(skip(self))] + pub async fn health(&self) -> Result { + let url = format!("{}/v1/health", self.base_url); + debug!("Fetching health from: {}", url); + + let response = self + .client + .get(&url) + .send() + .await + .context(format!("Failed to connect to gateway at {}", self.base_url))?; + + if !response.status().is_success() { + anyhow::bail!( + "Gateway returned error status: {} - {}", + response.status(), + response.text().await.unwrap_or_default() + ); + } + + response.json().await.context("Failed to parse health response") + } + + /// Get cluster 
status overview + #[instrument(skip(self))] + pub async fn cluster_status(&self) -> Result { + let url = format!("{}/v1/cluster/status", self.base_url); + debug!("Fetching cluster status from: {}", url); + + let response = self + .client + .get(&url) + .send() + .await + .context(format!("Failed to connect to gateway at {}", self.base_url))?; + + if !response.status().is_success() { + anyhow::bail!( + "Gateway returned error status: {} - {}", + response.status(), + response.text().await.unwrap_or_default() + ); + } + + response.json().await.context("Failed to parse cluster status response") + } + + /// Get detailed information about a specific shard + #[instrument(skip(self))] + pub async fn shard_info(&self, shard_id: u32) -> Result { + let url = format!("{}/v1/shards/{}", self.base_url, shard_id); + debug!("Fetching shard info from: {}", url); + + let response = self + .client + .get(&url) + .send() + .await + .context(format!("Failed to connect to gateway at {}", self.base_url))?; + + if !response.status().is_success() { + if response.status() == reqwest::StatusCode::NOT_FOUND { + anyhow::bail!("Shard not found: {}", shard_id); + } + anyhow::bail!( + "Gateway returned error status: {} - {}", + response.status(), + response.text().await.unwrap_or_default() + ); + } + + // Gateway returns different format than /admin/ranges, so convert it + let shard_response: ShardInfoResponse = response + .json() + .await + .context("Failed to parse shard info response")?; + + Ok(shard_response.into()) + } + + /// Get information about all shards + #[instrument(skip(self))] + pub async fn all_ranges(&self) -> Result> { + let url = format!("{}/v1/admin/ranges", self.base_url); + debug!("Fetching all ranges from: {}", url); + + let response = self + .client + .get(&url) + .send() + .await + .context(format!("Failed to connect to gateway at {}", self.base_url))?; + + if !response.status().is_success() { + anyhow::bail!( + "Gateway returned error status: {} - {}", + 
response.status(), + response.text().await.unwrap_or_default() + ); + } + + // Gateway returns {"ranges": [...]} so we need to unwrap it + let wrapper: RangesWrapper = response + .json() + .await + .context("Failed to parse ranges response")?; + + Ok(wrapper.ranges) + } + + /// Trigger anti-entropy sync (Phase 2 feature - not yet exposed in CLI) + #[allow(dead_code)] + #[instrument(skip(self))] + pub async fn force_sync(&self) -> Result<()> { + let url = format!("{}/v1/admin/sync", self.base_url); + debug!("Triggering sync at: {}", url); + + let response = self + .client + .post(&url) + .send() + .await + .context(format!("Failed to connect to gateway at {}", self.base_url))?; + + if !response.status().is_success() { + anyhow::bail!( + "Gateway returned error status: {} - {}", + response.status(), + response.text().await.unwrap_or_default() + ); + } + + Ok(()) + } +} diff --git a/crates/stemedb-admin/src/commands/cluster.rs b/crates/stemedb-admin/src/commands/cluster.rs new file mode 100644 index 0000000..f4cc288 --- /dev/null +++ b/crates/stemedb-admin/src/commands/cluster.rs @@ -0,0 +1,52 @@ +use anyhow::Result; +use tracing::info; + +use crate::client::AdminClient; +use crate::output::{format_cluster_summary, format_json, OutputFormat}; + +/// Show cluster status overview +pub async fn cluster_status(client: &AdminClient, format: OutputFormat) -> Result<()> { + info!("Fetching cluster status"); + let status = client.cluster_status().await?; + + match format { + OutputFormat::Table => { + println!("{}", format_cluster_summary(&status)); + } + OutputFormat::Json => { + println!("{}", format_json(&status)?); + } + } + + Ok(()) +} + +/// Quick health check (exit code 0 if healthy, 1 if unhealthy) +pub async fn cluster_health(client: &AdminClient, format: OutputFormat) -> Result<()> { + info!("Checking cluster health"); + let health = client.health().await?; + + match format { + OutputFormat::Table => { + if health.healthy { + println!("✓ Cluster is healthy"); + 
println!(" Reachable nodes: {}", health.reachable_nodes); + println!(" Joined: {}", health.joined); + } else { + println!("✗ Cluster is unhealthy"); + println!(" Reachable nodes: {}", health.reachable_nodes); + println!(" Joined: {}", health.joined); + } + } + OutputFormat::Json => { + println!("{}", format_json(&health)?); + } + } + + // Set exit code based on health + if !health.healthy { + std::process::exit(1); + } + + Ok(()) +} diff --git a/crates/stemedb-admin/src/commands/debug.rs b/crates/stemedb-admin/src/commands/debug.rs new file mode 100644 index 0000000..134d800 --- /dev/null +++ b/crates/stemedb-admin/src/commands/debug.rs @@ -0,0 +1,36 @@ +use anyhow::Result; +use chrono::Utc; +use std::fs; +use tracing::info; + +use crate::client::AdminClient; +use crate::types::ClusterDebugExport; + +/// Export complete cluster state for debugging +pub async fn export_debug_state(client: &AdminClient, output_path: &str) -> Result<()> { + info!("Exporting cluster state to: {}", output_path); + + // Gather all cluster information + let health = client.health().await?; + let cluster = client.cluster_status().await?; + let shards = client.all_ranges().await?; + + let export = ClusterDebugExport { + timestamp: Utc::now().to_rfc3339(), + gateway_version: env!("CARGO_PKG_VERSION").to_string(), + cluster, + health, + shards, + }; + + // Write to file + let json = serde_json::to_string_pretty(&export)?; + fs::write(output_path, json)?; + + println!("✓ Cluster state exported to: {}", output_path); + println!(" Timestamp: {}", export.timestamp); + println!(" Nodes: {}", export.cluster.node_count); + println!(" Shards: {}", export.cluster.shard_count); + + Ok(()) +} diff --git a/crates/stemedb-admin/src/commands/mod.rs b/crates/stemedb-admin/src/commands/mod.rs new file mode 100644 index 0000000..d0fb3b6 --- /dev/null +++ b/crates/stemedb-admin/src/commands/mod.rs @@ -0,0 +1,4 @@ +pub mod cluster; +pub mod debug; +pub mod node; +pub mod shard; diff --git 
a/crates/stemedb-admin/src/commands/node.rs b/crates/stemedb-admin/src/commands/node.rs new file mode 100644 index 0000000..b61cc5d --- /dev/null +++ b/crates/stemedb-admin/src/commands/node.rs @@ -0,0 +1,104 @@ +use anyhow::{Context, Result}; +use tracing::info; + +use crate::client::AdminClient; +use crate::output::{format_json, format_node_detail, format_nodes_table, OutputFormat}; + +/// List all nodes in the cluster +pub async fn list_nodes(client: &AdminClient, format: OutputFormat) -> Result<()> { + info!("Fetching node list"); + let status = client.cluster_status().await?; + + match format { + OutputFormat::Table => { + println!("\nNODES"); + println!("{}", format_nodes_table(&status.nodes)); + } + OutputFormat::Json => { + println!("{}", format_json(&status.nodes)?); + } + } + + Ok(()) +} + +/// Show detailed information about a specific node +pub async fn node_info(client: &AdminClient, node_id: &str, format: OutputFormat) -> Result<()> { + info!("Fetching info for node: {}", node_id); + let status = client.cluster_status().await?; + let shards = client.all_ranges().await?; + + let node = status + .nodes + .iter() + .find(|n| n.id == node_id) + .context(format!("Node not found: {}", node_id))?; + + match format { + OutputFormat::Table => { + println!("{}", format_node_detail(node, &shards)); + } + OutputFormat::Json => { + println!("{}", format_json(node)?); + } + } + + Ok(()) +} + +/// Show shards assigned to a specific node +pub async fn node_shards( + client: &AdminClient, + node_id: &str, + leader_only: bool, + format: OutputFormat, +) -> Result<()> { + info!("Fetching shards for node: {} (leader_only: {})", node_id, leader_only); + let status = client.cluster_status().await?; + let all_shards = client.all_ranges().await?; + + let node = status + .nodes + .iter() + .find(|n| n.id == node_id) + .context(format!("Node not found: {}", node_id))?; + + // Filter shards for this node + let node_shards: Vec<_> = all_shards + .iter() + .filter(|s| 
node.shards.contains(&s.range_id)) + .filter(|s| !leader_only || s.leader_node == node_id) + .cloned() + .collect(); + + match format { + OutputFormat::Table => { + if leader_only { + println!("\nLEADER SHARDS FOR NODE {}", node_id); + } else { + println!("\nALL SHARDS FOR NODE {}", node_id); + } + + if node_shards.is_empty() { + println!(" (no shards)"); + } else { + for shard in &node_shards { + let role = if shard.leader_node == node_id { "Leader" } else { "Follower" }; + println!( + " Shard {}: {} - {:.2} MB, {} assertions ({})", + shard.range_id, + shard.start_key, + shard.size_bytes as f64 / 1_048_576.0, + shard.assertion_count, + role + ); + } + } + } + OutputFormat::Json => { + println!("{}", format_json(&node_shards)?); + } + } + + Ok(()) +} diff --git a/crates/stemedb-admin/src/commands/shard.rs b/crates/stemedb-admin/src/commands/shard.rs new file mode 100644 index 0000000..b25d2da --- /dev/null +++ b/crates/stemedb-admin/src/commands/shard.rs @@ -0,0 +1,68 @@ +use anyhow::Result; +use tracing::info; + +use crate::client::AdminClient; +use crate::output::{format_json, format_shard_detail, format_shards_table, OutputFormat}; + +/// List all shards +pub async fn list_shards(client: &AdminClient, format: OutputFormat) -> Result<()> { + info!("Fetching shard list"); + let shards = client.all_ranges().await?; + + match format { + OutputFormat::Table => { + println!("\nSHARDS"); + println!("{}", format_shards_table(&shards)); + } + OutputFormat::Json => { + println!("{}", format_json(&shards)?); + } + } + + Ok(()) +} + +/// Show detailed information about a specific shard +pub async fn shard_info(client: &AdminClient, shard_id: u32, format: OutputFormat) -> Result<()> { + info!("Fetching info for shard: {}", shard_id); + let shard = client.shard_info(shard_id).await?; + + match format { + OutputFormat::Table => { + println!("{}", format_shard_detail(&shard)); + } + OutputFormat::Json => { + println!("{}", format_json(&shard)?); + } + } + + Ok(()) +} + +/// 
Show replica nodes for a specific shard +pub async fn shard_replicas( + client: &AdminClient, + shard_id: u32, + format: OutputFormat, +) -> Result<()> { + info!("Fetching replicas for shard: {}", shard_id); + let shard = client.shard_info(shard_id).await?; + + match format { + OutputFormat::Table => { + println!("\nREPLICAS FOR SHARD {}", shard_id); + println!(" Leader: {}", shard.leader_node); + println!(" Followers: {}", shard.replica_nodes.join(", ")); + } + OutputFormat::Json => { + let replicas = serde_json::json!({ + "shard_id": shard_id, + "leader": shard.leader_node, + "replicas": shard.replica_nodes, + }); + println!("{}", format_json(&replicas)?); + } + } + + Ok(()) +} diff --git a/crates/stemedb-admin/src/lib.rs b/crates/stemedb-admin/src/lib.rs new file mode 100644 index 0000000..422b65a --- /dev/null +++ b/crates/stemedb-admin/src/lib.rs @@ -0,0 +1,6 @@ +// Library exports for testing +pub mod cli; +pub mod client; +pub mod commands; +pub mod output; +pub mod types; diff --git a/crates/stemedb-admin/src/main.rs b/crates/stemedb-admin/src/main.rs new file mode 100644 index 0000000..cfd3c03 --- /dev/null +++ b/crates/stemedb-admin/src/main.rs @@ -0,0 +1,63 @@ +use anyhow::Result; +use clap::Parser; +use tracing_subscriber::EnvFilter; + +use stemedb_admin::{ + cli::{Cli, ClusterCommands, Commands, DebugCommands, NodeCommands, ShardCommands}, + client::AdminClient, + commands, +}; + +#[tokio::main] +async fn main() -> Result<()> { + let cli = Cli::parse(); + + // Initialize logging + let filter = if cli.verbose { + EnvFilter::new("stemedb_admin=debug") + } else { + EnvFilter::new("stemedb_admin=info") + }; + + tracing_subscriber::fmt().with_env_filter(filter).with_target(false).without_time().init(); + + // Create admin client + let client = AdminClient::new(cli.gateway); + + // Execute command + match cli.command { + Commands::Cluster { cmd } => match cmd { + ClusterCommands::Status => { + commands::cluster::cluster_status(&client, cli.format).await? 
+ } + ClusterCommands::Health => { + commands::cluster::cluster_health(&client, cli.format).await? + } + }, + Commands::Node { cmd } => match cmd { + NodeCommands::List => commands::node::list_nodes(&client, cli.format).await?, + NodeCommands::Info { node_id } => { + commands::node::node_info(&client, &node_id, cli.format).await? + } + NodeCommands::Shards { node_id, leader } => { + commands::node::node_shards(&client, &node_id, leader, cli.format).await? + } + }, + Commands::Shard { cmd } => match cmd { + ShardCommands::List => commands::shard::list_shards(&client, cli.format).await?, + ShardCommands::Info { shard_id } => { + commands::shard::shard_info(&client, shard_id, cli.format).await? + } + ShardCommands::Replicas { shard_id } => { + commands::shard::shard_replicas(&client, shard_id, cli.format).await? + } + }, + Commands::Debug { cmd } => match cmd { + DebugCommands::Export { output } => { + commands::debug::export_debug_state(&client, &output).await? + } + }, + } + + Ok(()) +} diff --git a/crates/stemedb-admin/src/output.rs b/crates/stemedb-admin/src/output.rs new file mode 100644 index 0000000..e8068e4 --- /dev/null +++ b/crates/stemedb-admin/src/output.rs @@ -0,0 +1,236 @@ +use anyhow::{Context, Result}; +use colored::Colorize; +use comfy_table::{presets::UTF8_FULL, Table}; +use serde::Serialize; + +use crate::types::{ClusterStatusResponse, NodeStatusInfo, RangeInfoDto}; + +/// Output format for CLI commands +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum OutputFormat { + /// Human-readable table with colors + Table, + /// Machine-readable JSON + Json, +} + +impl std::str::FromStr for OutputFormat { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "table" => Ok(OutputFormat::Table), + "json" => Ok(OutputFormat::Json), + _ => anyhow::bail!("Invalid format: {}. 
Must be 'table' or 'json'", s), + } + } +} + +/// Format cluster summary as human-readable table +pub fn format_cluster_summary(status: &ClusterStatusResponse) -> String { + let mut output = String::new(); + + output.push_str(&format!("\n{}\n", "CLUSTER OVERVIEW".bold())); + output.push_str(&format!(" Node Count: {}\n", status.node_count)); + output.push_str(&format!(" Shard Count: {}\n", status.shard_count)); + output.push_str(&format!(" Meta Version: {}\n", status.meta_version)); + + output.push_str(&format!("\n{}\n", "NODES".bold())); + output.push_str(&format_nodes_table(&status.nodes)); + + output +} + +/// Format nodes list as table +pub fn format_nodes_table(nodes: &[NodeStatusInfo]) -> String { + if nodes.is_empty() { + return " (no nodes)\n".to_string(); + } + + let mut table = Table::new(); + table.load_preset(UTF8_FULL); + table.set_header(vec!["Node ID", "State", "Shards", "Leader", "Follower"]); + + for node in nodes { + let state_colored = match node.state.as_str() { + "Alive" => node.state.green(), + "Suspect" => node.state.yellow(), + "Dead" => node.state.red(), + _ => node.state.normal(), + }; + + let shard_count = node.shards.len(); + let shards_str = if shard_count <= 5 { + node.shards.iter().map(|s| s.to_string()).collect::>().join(",") + } else { + format!("{} shards", shard_count) + }; + + // For now, we don't have leader/follower breakdown in NodeStatusInfo + // This will be enhanced in Phase 2 with detailed member info + table.add_row(vec![ + node.id.clone(), + state_colored.to_string(), + shards_str, + "-".to_string(), + "-".to_string(), + ]); + } + + table.to_string() +} + +/// Format shards list as table +pub fn format_shards_table(shards: &[RangeInfoDto]) -> String { + if shards.is_empty() { + return " (no shards)\n".to_string(); + } + + let mut table = Table::new(); + table.load_preset(UTF8_FULL); + table.set_header(vec!["Shard ID", "Leader", "Replicas", "Size", "Assertions", "Generation"]); + + for shard in shards { + let size_mb = 
shard.size_bytes as f64 / 1_048_576.0; + let replicas_str = shard.replica_nodes.len().to_string(); + + table.add_row(vec![ + shard.range_id.to_string(), + shard.leader_node.clone(), + replicas_str, + format!("{:.2} MB", size_mb), + shard.assertion_count.to_string(), + shard.generation.to_string(), + ]); + } + + table.to_string() +} + +/// Format detailed shard info +pub fn format_shard_detail(shard: &RangeInfoDto) -> String { + let mut output = String::new(); + + output.push_str(&format!("\n{}\n", "SHARD DETAILS".bold())); + output.push_str(&format!(" Shard ID: {}\n", shard.range_id)); + output.push_str(&format!(" Leader Node: {}\n", shard.leader_node)); + output.push_str(&format!(" Replica Nodes: {}\n", shard.replica_nodes.join(", "))); + output.push_str(&format!(" Key Range: {} - {}\n", shard.start_key, shard.end_key)); + output.push_str(&format!(" Size: {:.2} MB\n", shard.size_bytes as f64 / 1_048_576.0)); + output.push_str(&format!(" Assertions: {}\n", shard.assertion_count)); + output.push_str(&format!(" Generation: {}\n", shard.generation)); + + output +} + +/// Format node detail with assigned shards +pub fn format_node_detail(node: &NodeStatusInfo, shards: &[RangeInfoDto]) -> String { + let mut output = String::new(); + + output.push_str(&format!("\n{}\n", "NODE DETAILS".bold())); + output.push_str(&format!(" Node ID: {}\n", node.id)); + + let state_colored = match node.state.as_str() { + "Alive" => node.state.green(), + "Suspect" => node.state.yellow(), + "Dead" => node.state.red(), + _ => node.state.normal(), + }; + output.push_str(&format!(" State: {}\n", state_colored)); + + output.push_str(&format!("\n{}\n", "ASSIGNED SHARDS".bold())); + + // Group shards by role (leader vs follower) + let mut leader_shards = Vec::new(); + let mut follower_shards = Vec::new(); + + for shard_id in &node.shards { + if let Some(shard) = shards.iter().find(|s| s.range_id == *shard_id) { + if shard.leader_node == node.id { + leader_shards.push(shard); + } else { + 
follower_shards.push(shard); + } + } + } + + output.push_str(&format!(" Leader for: {} shards\n", leader_shards.len())); + output.push_str(&format!(" Follower for: {} shards\n", follower_shards.len())); + + if !leader_shards.is_empty() { + output.push_str(&format!("\n{}\n", "LEADER SHARDS".bold())); + for shard in leader_shards { + output.push_str(&format!( + " Shard {}: {:.2} MB, {} assertions\n", + shard.range_id, + shard.size_bytes as f64 / 1_048_576.0, + shard.assertion_count + )); + } + } + + if !follower_shards.is_empty() { + output.push_str(&format!("\n{}\n", "FOLLOWER SHARDS".bold())); + for shard in follower_shards { + output.push_str(&format!( + " Shard {}: {:.2} MB, {} assertions (Leader: {})\n", + shard.range_id, + shard.size_bytes as f64 / 1_048_576.0, + shard.assertion_count, + shard.leader_node + )); + } + } + + output +} + +/// Format any value as JSON +pub fn format_json(value: &T) -> Result { + serde_json::to_string_pretty(value).context("Failed to serialize to JSON") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_format_cluster_summary() { + let status = ClusterStatusResponse { + node_count: 3, + shard_count: 32, + meta_version: 158, + nodes: vec![ + NodeStatusInfo { + id: "a3f2b1c4".to_string(), + state: "Alive".to_string(), + shards: vec![1, 2, 3], + }, + NodeStatusInfo { + id: "7d8e9f0a".to_string(), + state: "Dead".to_string(), + shards: vec![4, 5], + }, + ], + }; + + let output = format_cluster_summary(&status); + assert!(output.contains("Node Count: 3")); + assert!(output.contains("Shard Count: 32")); + assert!(output.contains("a3f2b1c4")); + } + + #[test] + fn test_format_json() { + let status = ClusterStatusResponse { + node_count: 1, + shard_count: 8, + meta_version: 10, + nodes: vec![], + }; + + let json = format_json(&status).expect("Failed to format JSON"); + assert!(json.contains("\"node_count\": 1")); + assert!(json.contains("\"shard_count\": 8")); + } +} diff --git a/crates/stemedb-admin/src/types.rs 
b/crates/stemedb-admin/src/types.rs new file mode 100644 index 0000000..e779738 --- /dev/null +++ b/crates/stemedb-admin/src/types.rs @@ -0,0 +1,114 @@ +use serde::{Deserialize, Deserializer, Serialize}; + +/// Health status response from gateway +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthResponse { + pub healthy: bool, + pub reachable_nodes: usize, + pub joined: bool, +} + +/// Cluster status overview +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ClusterStatusResponse { + pub node_count: usize, + pub shard_count: u32, + pub meta_version: u64, + pub nodes: Vec, +} + +/// Information about a single node in the cluster +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NodeStatusInfo { + pub id: String, // Short hex (first 8 chars of UUID) + pub state: String, // "Alive", "Suspect", "Dead" + pub shards: Vec, // Shard IDs assigned to this node +} + +/// Wrapper for gateway ranges response (gateway returns {"ranges": [...]}) +#[derive(Debug, Clone, Deserialize)] +pub(crate) struct RangesWrapper { + pub ranges: Vec, +} + +/// Gateway shard info response (different format than ranges) +#[derive(Debug, Clone, Deserialize)] +pub(crate) struct ShardInfoResponse { + pub shard_id: u32, + pub start_key: Option, + pub end_key: Option, + pub size_bytes: u64, + pub assertion_count: u64, + pub replicas: Vec, + pub generation: u64, +} + +impl From for RangeInfoDto { + fn from(info: ShardInfoResponse) -> Self { + Self { + range_id: info.shard_id, + start_key: info.start_key.unwrap_or_default(), + end_key: info.end_key.unwrap_or_default(), + size_bytes: info.size_bytes, + assertion_count: info.assertion_count, + leader_node: info.replicas.first().cloned().unwrap_or_default(), + replica_nodes: info.replicas, + generation: info.generation, + } + } +} + +/// Detailed information about a shard range +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RangeInfoDto { + #[serde(deserialize_with = "deserialize_range_id")] + pub 
range_id: u32, + pub start_key: String, + pub end_key: String, + pub size_bytes: u64, + pub assertion_count: u64, + pub leader_node: String, + pub replica_nodes: Vec, + pub generation: u64, +} + +/// Custom deserializer to handle both string "shard_X" and integer formats +fn deserialize_range_id<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + use serde::de::Error; + + #[derive(Deserialize)] + #[serde(untagged)] + enum RangeId { + String(String), + Integer(u32), + } + + match RangeId::deserialize(deserializer)? { + RangeId::Integer(id) => Ok(id), + RangeId::String(s) => { + // Handle "shard_X" format + if let Some(num_str) = s.strip_prefix("shard_") { + num_str + .parse::() + .map_err(|_| Error::custom(format!("Invalid shard ID format: {}", s))) + } else { + // Try parsing as plain number + s.parse::() + .map_err(|_| Error::custom(format!("Invalid shard ID format: {}", s))) + } + } + } +} + +/// Complete cluster state export for debugging +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ClusterDebugExport { + pub timestamp: String, + pub gateway_version: String, + pub cluster: ClusterStatusResponse, + pub health: HealthResponse, + pub shards: Vec, +} diff --git a/crates/stemedb-admin/tests/integration_test.rs b/crates/stemedb-admin/tests/integration_test.rs new file mode 100644 index 0000000..5a024a6 --- /dev/null +++ b/crates/stemedb-admin/tests/integration_test.rs @@ -0,0 +1,191 @@ +use stemedb_admin::{output, types}; + +#[test] +fn test_output_format_parsing() { + use std::str::FromStr; + use stemedb_admin::output::OutputFormat; + + let table = OutputFormat::from_str("table").expect("Failed to parse 'table'"); + assert_eq!(table, OutputFormat::Table); + + let json = OutputFormat::from_str("json").expect("Failed to parse 'json'"); + assert_eq!(json, OutputFormat::Json); + + let invalid = OutputFormat::from_str("invalid"); + assert!(invalid.is_err()); +} + +#[test] +fn test_cluster_status_json_serialization() { + let status = 
types::ClusterStatusResponse { + node_count: 3, + shard_count: 32, + meta_version: 158, + nodes: vec![ + types::NodeStatusInfo { + id: "a3f2b1c4".to_string(), + state: "Alive".to_string(), + shards: vec![1, 2, 3], + }, + types::NodeStatusInfo { + id: "7d8e9f0a".to_string(), + state: "Dead".to_string(), + shards: vec![4, 5], + }, + ], + }; + + let json = output::format_json(&status).expect("Failed to format as JSON"); + assert!(json.contains("\"node_count\": 3")); + assert!(json.contains("\"shard_count\": 32")); + assert!(json.contains("\"a3f2b1c4\"")); + + // Verify it's valid JSON + let parsed: serde_json::Value = serde_json::from_str(&json).expect("Invalid JSON produced"); + assert_eq!(parsed["node_count"], 3); + assert_eq!(parsed["shard_count"], 32); +} + +#[test] +fn test_health_response_json_serialization() { + let health = types::HealthResponse { healthy: true, reachable_nodes: 3, joined: true }; + + let json = output::format_json(&health).expect("Failed to format as JSON"); + assert!(json.contains("\"healthy\": true")); + assert!(json.contains("\"reachable_nodes\": 3")); + + // Verify it's valid JSON + let parsed: serde_json::Value = serde_json::from_str(&json).expect("Invalid JSON produced"); + assert_eq!(parsed["healthy"], true); + assert_eq!(parsed["reachable_nodes"], 3); +} + +#[test] +fn test_range_info_json_serialization() { + let range = types::RangeInfoDto { + range_id: 5, + start_key: "".to_string(), + end_key: "m".to_string(), + size_bytes: 1_048_576, // 1 MB + assertion_count: 1000, + leader_node: "a3f2b1c4".to_string(), + replica_nodes: vec!["7d8e9f0a".to_string(), "b1c2d3e4".to_string()], + generation: 10, + }; + + let json = output::format_json(&range).expect("Failed to format as JSON"); + assert!(json.contains("\"range_id\": 5")); + assert!(json.contains("\"assertion_count\": 1000")); + + // Verify it's valid JSON + let parsed: serde_json::Value = serde_json::from_str(&json).expect("Invalid JSON produced"); + assert_eq!(parsed["range_id"], 5); 
+ assert_eq!(parsed["assertion_count"], 1000); +} + +#[test] +fn test_nodes_table_formatting() { + let nodes = vec![ + types::NodeStatusInfo { + id: "a3f2b1c4".to_string(), + state: "Alive".to_string(), + shards: vec![1, 2, 3], + }, + types::NodeStatusInfo { + id: "7d8e9f0a".to_string(), + state: "Dead".to_string(), + shards: vec![4, 5, 6, 7, 8, 9], + }, + ]; + + let table = output::format_nodes_table(&nodes); + assert!(table.contains("a3f2b1c4")); + assert!(table.contains("7d8e9f0a")); + assert!(table.contains("Alive") || table.contains("Dead")); // Color codes may be present +} + +#[test] +fn test_shards_table_formatting() { + let shards = vec![ + types::RangeInfoDto { + range_id: 1, + start_key: "".to_string(), + end_key: "m".to_string(), + size_bytes: 2_097_152, // 2 MB + assertion_count: 5000, + leader_node: "a3f2b1c4".to_string(), + replica_nodes: vec!["7d8e9f0a".to_string()], + generation: 5, + }, + types::RangeInfoDto { + range_id: 2, + start_key: "m".to_string(), + end_key: "z".to_string(), + size_bytes: 1_048_576, // 1 MB + assertion_count: 2500, + leader_node: "7d8e9f0a".to_string(), + replica_nodes: vec!["a3f2b1c4".to_string()], + generation: 5, + }, + ]; + + let table = output::format_shards_table(&shards); + assert!(table.contains("a3f2b1c4")); + assert!(table.contains("7d8e9f0a")); +} + +#[test] +fn test_cluster_summary_formatting() { + let status = types::ClusterStatusResponse { + node_count: 3, + shard_count: 32, + meta_version: 158, + nodes: vec![types::NodeStatusInfo { + id: "a3f2b1c4".to_string(), + state: "Alive".to_string(), + shards: vec![1, 2, 3], + }], + }; + + let summary = output::format_cluster_summary(&status); + assert!(summary.contains("Node Count: 3")); + assert!(summary.contains("Shard Count: 32")); + assert!(summary.contains("Meta Version: 158")); +} + +#[test] +fn test_debug_export_structure() { + let export = types::ClusterDebugExport { + timestamp: "2026-02-12T10:30:00Z".to_string(), + gateway_version: "0.1.0".to_string(), + 
cluster: types::ClusterStatusResponse { + node_count: 3, + shard_count: 32, + meta_version: 158, + nodes: vec![], + }, + health: types::HealthResponse { healthy: true, reachable_nodes: 3, joined: true }, + shards: vec![], + }; + + let json = output::format_json(&export).expect("Failed to format debug export"); + assert!(json.contains("\"timestamp\"")); + assert!(json.contains("\"gateway_version\"")); + assert!(json.contains("\"cluster\"")); + assert!(json.contains("\"health\"")); + assert!(json.contains("\"shards\"")); +} + +#[test] +fn test_empty_nodes_table() { + let nodes: Vec<types::NodeStatusInfo> = vec![]; + let table = output::format_nodes_table(&nodes); + assert!(table.contains("(no nodes)")); +} + +#[test] +fn test_empty_shards_table() { + let shards: Vec<types::RangeInfoDto> = vec![]; + let table = output::format_shards_table(&shards); + assert!(table.contains("(no shards)")); +} diff --git a/docs/operations/deployment/install-admin-cli.md b/docs/operations/deployment/install-admin-cli.md new file mode 100644 index 0000000..b81e68f --- /dev/null +++ b/docs/operations/deployment/install-admin-cli.md @@ -0,0 +1,324 @@ +# Installing stemedb-admin CLI + +The `stemedb-admin` CLI tool provides cluster management capabilities for StemeDB operators. It connects to the gateway node via HTTP and provides human-friendly table output or machine-readable JSON. + +## Requirements + +- **Platform**: Linux, macOS, or Windows (WSL2) +- **Architecture**: x86_64 or ARM64 +- **Network**: HTTP access to gateway node (default port 18181) +- **Rust** (for building from source): 1.75 or later + +## Installation Methods + +### Option 1: Build from Source (Recommended) + +1. **Clone the repository**: + ```bash + git clone https://github.com/yourusername/stemedb.git + cd stemedb + ``` + +2. **Build the admin CLI**: + ```bash + cargo build --release --bin stemedb-admin + ``` + + The binary will be at: `target/release/stemedb-admin` + +3. 
**Install to system path**: + ```bash + # Linux/macOS + sudo cp target/release/stemedb-admin /usr/local/bin/ + sudo chmod +x /usr/local/bin/stemedb-admin + + # Or install via cargo + cargo install --path crates/stemedb-admin + ``` + +4. **Verify installation**: + ```bash + stemedb-admin --version + # Expected: stemedb-admin 0.1.0 + ``` + +### Option 2: Install via Cargo + +If you have the Rust toolchain installed: + +```bash +cargo install --git https://github.com/yourusername/stemedb.git stemedb-admin +``` + +### Option 3: Download Pre-built Binary (Future) + +Pre-built binaries will be available in GitHub Releases: + +```bash +# Linux x86_64 +wget https://github.com/yourusername/stemedb/releases/download/v0.1.0/stemedb-admin-linux-x86_64 +chmod +x stemedb-admin-linux-x86_64 +sudo mv stemedb-admin-linux-x86_64 /usr/local/bin/stemedb-admin + +# macOS ARM64 +wget https://github.com/yourusername/stemedb/releases/download/v0.1.0/stemedb-admin-macos-arm64 +chmod +x stemedb-admin-macos-arm64 +sudo mv stemedb-admin-macos-arm64 /usr/local/bin/stemedb-admin +``` + +--- + +## Configuration + +### Environment Variables + +The CLI respects the following environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `STEMEDB_GATEWAY_ADDR` | Gateway HTTP endpoint | `http://localhost:18181` | +| `RUST_LOG` | Logging level (debug, info, warn, error) | `info` | + +**Example**: +```bash +# Set gateway address +export STEMEDB_GATEWAY_ADDR=http://gateway.prod.example.com:18181 + +# Enable verbose logging +export RUST_LOG=stemedb_admin=debug +``` + +### Command-line Options + +All commands accept the following global options, which must be placed before the subcommand: + +- `--gateway <URL>` - Override gateway address +- `--format <table|json>` - Output format +- `--verbose` - Enable debug logging + +--- + +## Verification + +### Test Connection + +```bash +# Test gateway connectivity +stemedb-admin cluster health +``` + +Expected output: +``` +✓ Cluster is healthy + Reachable nodes: 3 + Joined: true +``` + +If you see an error: +``` 
+Error: Failed to connect to gateway at http://localhost:18181 +``` + +Check: +1. Gateway is running: `systemctl status stemedb-gateway` +2. Gateway port is reachable: `curl http://gateway:18181/v1/health` +3. Firewall rules allow HTTP traffic on port 18181 + +### Test Commands + +```bash +# List nodes +stemedb-admin node list + +# Show cluster status +stemedb-admin cluster status + +# List shards +stemedb-admin shard list + +# Export debug state +stemedb-admin debug export --output /tmp/cluster-state.json +cat /tmp/cluster-state.json +``` + +--- + +## Upgrading + +To upgrade to a newer version: + +```bash +# Pull latest code +cd stemedb +git pull + +# Rebuild +cargo build --release --bin stemedb-admin + +# Replace binary +sudo cp target/release/stemedb-admin /usr/local/bin/ +``` + +Or via cargo: +```bash +cargo install --git https://github.com/yourusername/stemedb.git stemedb-admin --force +``` + +--- + +## Uninstall + +```bash +# Remove binary +sudo rm /usr/local/bin/stemedb-admin + +# Or via cargo +cargo uninstall stemedb-admin +``` + +--- + +## Usage Examples + +### Basic Operations + +```bash +# Check cluster health (exit code 0 if healthy, 1 if unhealthy) +stemedb-admin cluster health +echo $? 
# 0 = healthy, 1 = unhealthy + +# Show cluster overview with node table +stemedb-admin cluster status + +# List all nodes with state and shard assignments +stemedb-admin node list + +# Show detailed info for specific node +stemedb-admin node info a3f2b1c4 + +# Show shards assigned to a node +stemedb-admin node shards a3f2b1c4 + +# Show only leader shards for a node +stemedb-admin node shards a3f2b1c4 --leader +``` + +### Shard Operations + +```bash +# List all shards +stemedb-admin shard list + +# Show detailed shard info +stemedb-admin shard info 5 + +# Show replica nodes for a shard +stemedb-admin shard replicas 5 +``` + +### Debug Export + +```bash +# Export complete cluster state for support tickets +stemedb-admin debug export --output cluster-state.json + +# Compress for sharing +gzip cluster-state.json +# Attach cluster-state.json.gz to support ticket +``` + +### JSON Output for Automation + +```bash +# Get node list as JSON +stemedb-admin --format json node list | jq '.[] | select(.state == "Dead")' + +# Monitor cluster health in script +if stemedb-admin --format json cluster health | jq -e '.healthy'; then + echo "Cluster OK" +else + echo "Cluster UNHEALTHY - alerting ops team" + # Trigger alert +fi +``` + +### Remote Gateway + +```bash +# Connect to production gateway +stemedb-admin --gateway https://gateway.prod.example.com:18181 cluster status + +# Or set environment variable +export STEMEDB_GATEWAY_ADDR=https://gateway.prod.example.com:18181 +stemedb-admin cluster status +``` + +--- + +## Troubleshooting + +### "Failed to connect to gateway" + +**Cause**: Gateway is unreachable or not running. + +**Fix**: +1. Check gateway is running: `systemctl status stemedb-gateway` +2. Test connectivity: `curl http://gateway:18181/v1/health` +3. Verify firewall rules: `sudo ufw status` + +### "Node not found: NODE_ID" + +**Cause**: Node ID is incorrect or node has left the cluster. + +**Fix**: +1. List all nodes: `stemedb-admin node list` +2. 
Verify node ID (first 8 characters of UUID) +3. Check node is in `Alive` state (not `Dead`) + +### "Gateway returned error status: 404" + +**Cause**: Gateway endpoint does not exist or API version mismatch. + +**Fix**: +1. Verify gateway version matches CLI version +2. Check gateway logs: `journalctl -u stemedb-gateway -n 50` +3. Ensure gateway is fully initialized (may take 10-30 seconds on startup) + +### Permission Denied + +**Cause**: CLI binary is not executable or requires elevated privileges. + +**Fix**: +```bash +# Make executable +chmod +x /usr/local/bin/stemedb-admin + +# Or run with sudo if accessing privileged resources +sudo stemedb-admin cluster status +``` + +--- + +## Next Steps + +- [Node Lifecycle Operations](../node-lifecycle.md) - Add, remove, replace nodes +- [Three-Node Cluster Setup](three-node-cluster.md) - Deploy production cluster +- [Monitoring & Observability](../monitoring/README.md) - Set up metrics and alerts + +--- + +## Getting Help + +```bash +# Show all commands +stemedb-admin --help + +# Show help for specific command +stemedb-admin cluster --help +stemedb-admin node --help +stemedb-admin shard --help +stemedb-admin debug --help +``` + +For issues or feature requests, open a GitHub issue: +https://github.com/yourusername/stemedb/issues diff --git a/docs/operations/node-lifecycle.md b/docs/operations/node-lifecycle.md new file mode 100644 index 0000000..8ab7d71 --- /dev/null +++ b/docs/operations/node-lifecycle.md @@ -0,0 +1,394 @@ +# Node Lifecycle Operations + +This guide covers adding, removing, and replacing nodes in a StemeDB cluster. All procedures use the `stemedb-admin` CLI tool for cluster inspection and management. + +## Prerequisites + +- `stemedb-admin` CLI installed (see [install-admin-cli.md](deployment/install-admin-cli.md)) +- Network access to the gateway node (default: `http://gateway:18181`) +- Appropriate credentials for cluster operations (Phase 2) + +## Table of Contents + +1. 
[Add Node Procedure](#add-node-procedure) +2. [Remove Node Procedure](#remove-node-procedure) +3. [Replace Failed Node Procedure](#replace-failed-node-procedure) +4. [Troubleshooting](#troubleshooting) + +--- + +## Add Node Procedure + +### Pre-flight Checks + +Before adding a node to the cluster, verify: + +1. **Network connectivity**: New node can reach existing cluster nodes + ```bash + # From new node, test connectivity to gateway + curl http://gateway:18181/v1/health + ``` + +2. **Port availability**: Required ports are not blocked + ```bash + # Check ports are open + nc -zv gateway 18181 # Gateway + nc -zv gateway 18182 # RPC + nc -zv gateway 18183 # SWIM gossip + ``` + +3. **Disk space**: Adequate storage for shard replicas + ```bash + df -h /var/lib/stemedb + # Recommendation: At least 100GB free per node + ``` + +4. **Configuration**: Node config matches cluster settings + ```bash + cat /etc/stemedb/node.toml + # Verify: cluster_name, seed_nodes, port settings + ``` + +### Add Node Steps + +1. **Start the new node** with seed node addresses: + ```bash + stemedb-node \ + --node-id $(uuidgen) \ + --seed-nodes gateway:18183,node-02:18183 \ + --data-dir /var/lib/stemedb + ``` + +2. **Verify node joined the cluster**: + ```bash + stemedb-admin node list + ``` + + Expected output: + ``` + NODES + ┌──────────┬────────┬──────────┬───────────┬──────────┐ + │ Node ID │ State │ Shards │ Leader │ Follower │ + ├──────────┼────────┼──────────┼───────────┼──────────┤ + │ a3f2b1c4 │ Alive │ 10,15,22 │ - │ - │ + │ 7d8e9f0a │ Alive │ 5,12,18 │ - │ - │ + │ NEW_NODE │ Alive │ │ - │ - │ ← New node appears + └──────────┴────────┴──────────┴───────────┴──────────┘ + ``` + +3. **Wait for shard assignment** (typically 30-60 seconds): + ```bash + # Watch for shards to be assigned + watch -n 5 'stemedb-admin node shards NEW_NODE' + ``` + +4. 
**Verify shard replication**: + ```bash + stemedb-admin node shards NEW_NODE + # Check that shards are being replicated (size_bytes > 0) + ``` + +5. **Check cluster health**: + ```bash + stemedb-admin cluster health + # Expected: ✓ Cluster is healthy + ``` + +### Post-Add Validation + +- [ ] Node appears in `stemedb-admin node list` with `Alive` state +- [ ] Node has been assigned shards (may take 1-2 minutes) +- [ ] Cluster health check passes +- [ ] Node logs show successful replication (no errors) + +**Timeline**: 2-5 minutes for full node addition and initial replication. + +--- + +## Remove Node Procedure + +### Pre-removal Checks + +1. **Check node is not critical for quorum**: + ```bash + stemedb-admin cluster status + # Verify: node_count >= 3 (for 3-node minimum) + ``` + +2. **Identify which shards will be affected**: + ```bash + stemedb-admin node shards NODE_ID + # Record: leader shards (need failover), follower shards (need replication) + ``` + +3. **Check if node is leader for critical shards**: + ```bash + stemedb-admin node shards NODE_ID --leader + ``` + +### Remove Node Steps + +**Phase 2 Feature**: Graceful node removal with `stemedb-admin node drain NODE_ID` is planned but not yet implemented. Current procedure is manual monitoring. + +1. **Stop the node gracefully**: + ```bash + # On the node being removed + systemctl stop stemedb-node + ``` + +2. **Wait for node to transition to Dead state** (30-60 seconds): + ```bash + watch -n 5 'stemedb-admin node list' + # Wait for state to change: Alive → Suspect → Dead + ``` + +3. **Verify leader election for affected shards**: + ```bash + # For each leader shard the removed node owned + stemedb-admin shard info SHARD_ID + # Check: leader_node is now a different node + ``` + +4. **Monitor shard rebalancing**: + ```bash + stemedb-admin cluster status + # Watch shard_count stabilize across remaining nodes + ``` + +5. 
**Verify cluster health**: + ```bash + stemedb-admin cluster health + # Expected: ✓ Cluster is healthy + ``` + +### Post-Removal Validation + +- [ ] Node shows `Dead` state in `stemedb-admin node list` +- [ ] All shards previously led by removed node have new leaders +- [ ] Cluster health check passes +- [ ] Remaining nodes have picked up replica duties + +**Timeline**: 2-5 minutes for failover and rebalancing. + +--- + +## Replace Failed Node Procedure + +When a node fails unexpectedly (hardware failure, network partition, etc.), follow this procedure to replace it. + +### Confirm Failure + +1. **Verify node is truly dead**: + ```bash + stemedb-admin node info NODE_ID + # Expected: State: Dead + ``` + +2. **Identify affected shards**: + ```bash + stemedb-admin node shards NODE_ID + # Record which shards were on the failed node + ``` + +3. **Check leader failover status**: + ```bash + # For each shard + stemedb-admin shard info SHARD_ID + # Verify: leader_node is NOT the dead node + ``` + +### Replace Failed Node + +1. **Provision replacement node** with same configuration: + ```bash + # Use original node config, but generate new node-id + stemedb-node \ + --node-id $(uuidgen) \ + --seed-nodes gateway:18183,node-02:18183 \ + --data-dir /var/lib/stemedb + ``` + +2. **Verify replacement node joins cluster**: + ```bash + stemedb-admin node list + # Check new node appears with Alive state + ``` + +3. **Monitor replica recovery**: + ```bash + # Watch shards being assigned to replacement + watch -n 10 'stemedb-admin node shards NEW_NODE_ID' + ``` + +4. **Verify data replication**: + ```bash + stemedb-admin node shards NEW_NODE_ID + # Check size_bytes matches expected values + ``` + +5. 
**Remove dead node from member list** (Phase 2 feature): + ```bash + # Planned: stemedb-admin node remove DEAD_NODE_ID + # Current: Dead nodes age out of membership after timeout + ``` + +### Post-Replacement Validation + +- [ ] Replacement node is `Alive` and has shards assigned +- [ ] All previously affected shards have proper replication factor +- [ ] Cluster health check passes +- [ ] No ongoing replication errors in logs + +**Timeline**: 5-10 minutes for full replacement and data sync. + +--- + +## Troubleshooting + +### Node Stuck in Suspect State + +**Symptom**: Node shows `Suspect` state for extended period (>2 minutes). + +**Possible Causes**: +- Network latency spikes +- Node under heavy load (CPU/disk saturation) +- SWIM gossip port blocked (18183) + +**Diagnosis**: +```bash +# Check network latency +ping -c 10 node-hostname + +# Check node resource usage +ssh node-hostname 'top -bn1 | head -20' + +# Check SWIM port +nc -zv node-hostname 18183 +``` + +**Resolution**: +1. If network issue: Fix network, node will transition back to `Alive` +2. If resource exhaustion: Scale up node resources or reduce load +3. If persistent: Consider replacing node (see above) + +### Shard Leader Election Issues + +**Symptom**: Shard has no leader after node failure. + +**Diagnosis**: +```bash +stemedb-admin shard info SHARD_ID +# Check: leader_node field +``` + +**Resolution**: +1. Check replica nodes are alive: + ```bash + stemedb-admin node list + # Verify replica nodes show Alive state + ``` + +2. Check logs for election failures: + ```bash + # On gateway node + journalctl -u stemedb-gateway | grep "election\|leader" + ``` + +3. If stuck, trigger a manual leader election (Phase 2): + ```bash + # Planned: stemedb-admin shard elect-leader SHARD_ID + ``` + +### Network Partition Scenarios + +**Symptom**: Cluster split into multiple segments, nodes in each segment see others as `Dead`. 
+ +**Diagnosis**: +```bash +# On each node segment +stemedb-admin cluster status +# Compare node counts and health status +``` + +**Resolution**: +1. **Restore network connectivity** between segments +2. **Wait for SWIM to reconcile** (30-60 seconds after connectivity restored) +3. **Verify cluster converges**: + ```bash + stemedb-admin node list + # All nodes should show Alive after partition heals + ``` + +4. **Check for data divergence**: + ```bash + # Trigger anti-entropy sync + # Planned: stemedb-admin cluster sync --force + ``` + +**Important**: During partition, writes may be accepted in multiple segments. After healing, conflict resolution via lenses will apply (Recency, Consensus, Authority). + +### Shard Rebalancing Not Occurring + +**Symptom**: New node added but no shards assigned after 5+ minutes. + +**Diagnosis**: +```bash +# Check cluster meta version is advancing +stemedb-admin cluster status +# meta_version should increment when topology changes + +# Check gateway logs +journalctl -u stemedb-gateway | grep "rebalance\|assign" +``` + +**Resolution**: +1. Verify node is truly `Alive`: + ```bash + stemedb-admin node info NEW_NODE_ID + ``` + +2. Check node has adequate disk space: + ```bash + ssh new-node-hostname 'df -h /var/lib/stemedb' + ``` + +3. 
Trigger manual rebalance (Phase 2): + ```bash + # Planned: stemedb-admin shard rebalance --target-node NEW_NODE_ID + ``` + +--- + +## Quick Reference: Common Commands + +```bash +# Check cluster health +stemedb-admin cluster health + +# List all nodes +stemedb-admin node list + +# Show node details +stemedb-admin node info NODE_ID + +# Show shards on a node +stemedb-admin node shards NODE_ID + +# List all shards +stemedb-admin shard list + +# Show shard details +stemedb-admin shard info SHARD_ID + +# Export debug state +stemedb-admin debug export --output cluster-state.json +``` + +--- + +## Related Documentation + +- [Three-Node Cluster Setup](deployment/three-node-cluster.md) +- [Install Admin CLI](deployment/install-admin-cli.md) +- [Monitoring & Observability](monitoring/README.md) +- [Disaster Recovery](disaster-recovery/README.md) diff --git a/roadmap.md b/roadmap.md index 0ca7815..f1a7475 100644 --- a/roadmap.md +++ b/roadmap.md @@ -21,7 +21,7 @@ | **MVP, Pilot 1-4** | ✅ Complete | Consumer Health demo, dashboard, API auth, metrics | | **Aphoria A1-A4** | ✅ Complete | Observations/claims/verify/corpus/authority lens | | **Aphoria A5** | 🎯 In Progress | Flywheel: 3/4 done, A5.3 suggest skill needs validation | -| **Pilot 5** | ⚡ Partial | **P5.1 Security 4/5 done**, **P5.2 Monitoring ✅**, **P5.3 Backup/DR ✅**, docs complete (P5.4, P5.6, P5.7), implementation pending (P5.5) | +| **Pilot 5** | ⚡ Partial | **P5.1 Security 4/5 done**, **P5.2 Monitoring ✅**, **P5.3 Backup/DR ✅**, **P5.4 Runbooks ✅**, **P5.5 Cluster Mgmt ✅**, docs pending (P5.6, P5.7) | | **8B-C** | Planned | Distributed observability, geo-distribution | | **9** | Planned | Disaster recovery, compliance, storage management | @@ -286,16 +286,32 @@ - [x] `docs/operations/troubleshooting-flowchart.md` - Complete with symptom → cause → runbook mapping - [x] Covers all 7 runbooks with decision trees and quick diagnostic commands -### P5.5 
Cluster Management Tooling (WEEK 4 - HIGH PRIORITY) ✅ COMPLETE **Priority: P1 - Manual SSH not scalable** +**Completed:** 2026-02-12 -- [ ] **`stemedb-admin` CLI** (new binary in `crates/stemedb-admin/`) - - [ ] `stemedb-admin node status` - Show cluster membership (alive/suspect/dead) - - [ ] `stemedb-admin node add ` - Join node with validation - - [ ] `stemedb-admin node drain ` - Graceful node removal (move shards first) - - [ ] `stemedb-admin shard list` - Show shard assignments, sizes, hot spots - - [ ] `stemedb-admin debug export ` - Capture state for support tickets +- [x] **`stemedb-admin` CLI** (new binary in `crates/stemedb-admin/`) + - [x] `stemedb-admin cluster status` - Overview: node count, shard count, meta version, node table + - [x] `stemedb-admin cluster health` - Quick health check (exit code 0/1) + - [x] `stemedb-admin node list` - List all nodes with states (Alive/Suspect/Dead) + - [x] `stemedb-admin node info` - Detailed node info with shard assignments + - [x] `stemedb-admin node shards` - Show shards assigned to node (with --leader filter) + - [x] `stemedb-admin shard list` - List all shards with leaders/replicas + - [x] `stemedb-admin shard info` - Detailed shard info (size, assertions, replicas) + - [x] `stemedb-admin shard replicas` - Show replica nodes for shard + - [x] `stemedb-admin debug export --output <FILE>` - Export complete cluster state as JSON + - [x] HTTP client connecting to gateway (default: http://localhost:18181) + - [x] Output formats: Table (human-friendly with colors) and JSON (machine-readable) + - [x] Environment variable support: `STEMEDB_GATEWAY_ADDR` + - [x] Proper error handling with helpful messages (no panics) + - [x] 12 integration tests covering all functionality + - [x] Node lifecycle documentation: `docs/operations/node-lifecycle.md` + - [x] Installation guide: `docs/operations/deployment/install-admin-cli.md` + +**Phase 2 Deferred:** +- [ ] `stemedb-admin node drain <NODE_ID>` - Graceful node removal (requires gateway 
endpoints) +- [ ] `stemedb-admin shard rebalance` - Manual rebalancing trigger (requires gateway endpoints) - [ ] **Node Operations Documentation** - [ ] `docs/operations/node-lifecycle.md`