# Task 06: `tidalctl recover --verify-only`

## Delivers

A `recover` subcommand for the `tidalctl` CLI that performs a dry-run WAL replay against a tidalDB data directory. Reports: total event count, last WAL sequence number, checkpoint sequence number, number of inconsistencies (corrupted batches, deserialization failures), estimated recovery time, and segment file inventory. Writes no state -- purely diagnostic.

Operators use this to answer "what happened?" after a crash, and "how long will recovery take?" before restarting the database.

## Complexity: S

## Dependencies

- Task 05 (recovery benchmark -- provides the recovery time estimation model)

## Technical Design

### 1. CLI argument parsing

The `tidalctl` binary already exists (from m0p2). Add a `recover` subcommand:

```rust
// In the tidalctl clap definition:
#[derive(clap::Subcommand)]
enum Commands {
    // ... existing subcommands ...

    /// Inspect WAL state and verify crash recovery without modifying data.
    Recover {
        /// Path to the tidalDB data directory.
        #[arg(long)]
        path: std::path::PathBuf,

        /// Dry-run: report WAL state without writing any changes.
        /// This is the only supported mode initially.
        #[arg(long, default_value_t = true)]
        verify_only: bool,
    },
}
```

### 2. Recovery report struct

```rust
// tidal/src/wal/diagnostics.rs (new file)

use std::path::Path;

use super::checkpoint::CheckpointManager;
use super::error::WalError;
use super::format::{self, BatchHeader, EventRecord, HEADER_SIZE, MAGIC};
use super::segment;

/// Diagnostic report from a dry-run WAL inspection.
#[derive(Debug)]
pub struct WalDiagnosticReport {
    /// Total number of valid signal events found in WAL segments.
    pub total_events: u64,
    /// Number of events that would be replayed (after checkpoint filtering).
    pub replay_events: u64,
    /// Last WAL sequence number seen in any segment.
    ///
    /// Computed as the highest event sequence number plus one, so 0 means
    /// that no valid events were found.
    pub last_wal_seq: u64,
    /// Checkpoint sequence number (from checkpoint.meta), or 0 if none.
    pub checkpoint_seq: u64,
    /// Checkpoint timestamp (nanos), or 0 if none.
    pub checkpoint_ts: u64,
    /// Number of WAL segment files.
    pub segment_count: usize,
    /// Total bytes across all segment files.
    pub total_segment_bytes: u64,
    /// Number of corrupted or truncated batches encountered.
    pub inconsistency_count: u64,
    /// Per-segment summary.
    pub segments: Vec<SegmentSummary>,
    /// Estimated recovery time in seconds (based on event count heuristic).
    pub estimated_recovery_secs: f64,
}

/// Summary of a single WAL segment file.
#[derive(Debug)]
pub struct SegmentSummary {
    /// First sequence number in this segment.
    pub first_seq: u64,
    /// File size in bytes.
    pub file_size: u64,
    /// Number of valid batches.
    pub batch_count: u64,
    /// Number of valid events.
    pub event_count: u64,
    /// Number of corrupted batches.
    pub corrupt_batches: u64,
}
```

### 3. Diagnostic scan implementation

```rust
/// Perform a dry-run WAL inspection and produce a diagnostic report.
///
/// This function reads WAL segments and the checkpoint file but writes
/// nothing. It is safe to run against a live database directory (reads
/// are independent of the writer thread's append-only operations).
///
/// Scanning of a segment stops at the first corrupted or truncated batch;
/// that batch is counted once as an inconsistency and the remainder of the
/// segment is not inspected.
///
/// # Errors
///
/// Returns `WalError::Io` on filesystem failure.
pub fn diagnose_wal(data_dir: &Path) -> Result<WalDiagnosticReport, WalError> {
    let wal_dir = data_dir.join("wal");

    // Read checkpoint.
    let checkpoint = CheckpointManager::read(&wal_dir)?;
    let (checkpoint_seq, checkpoint_ts) = checkpoint.unwrap_or((0, 0));

    // List and scan segments.
    let segment_list = segment::list_segments(&wal_dir)?;
    let segment_count = segment_list.len();

    let mut total_events = 0u64;
    let mut replay_events = 0u64;
    let mut last_wal_seq = 0u64;
    let mut inconsistency_count = 0u64;
    let mut total_segment_bytes = 0u64;
    let mut segments = Vec::with_capacity(segment_count);

    for (seg_first_seq, seg_path) in &segment_list {
        // Take the size from the bytes actually read, so the reported size
        // always matches what was scanned and a failed stat can never be
        // silently reported as a 0-byte segment.
        let data = std::fs::read(seg_path)?;
        let file_size = data.len() as u64;
        total_segment_bytes += file_size;

        let mut offset = 0usize;
        let mut batch_count = 0u64;
        let mut event_count = 0u64;
        let mut corrupt_batches = 0u64;

        while offset < data.len() {
            // Fewer bytes left than a header: truncated tail.
            if data.len() - offset < HEADER_SIZE {
                corrupt_batches += 1;
                inconsistency_count += 1;
                break;
            }

            if data[offset..offset + 4] != MAGIC {
                corrupt_batches += 1;
                inconsistency_count += 1;
                break;
            }

            // Payload length is read from bytes 24..28 of the batch header
            // (little-endian u32).
            let payload_len = u32::from_le_bytes(
                data[offset + 24..offset + 28]
                    .try_into()
                    .unwrap_or([0; 4]),
            ) as usize;

            let batch_end = offset + HEADER_SIZE + payload_len;
            if batch_end > data.len() {
                corrupt_batches += 1;
                inconsistency_count += 1;
                break;
            }

            match format::decode_batch(&data[offset..batch_end]) {
                Ok((header, events)) => {
                    batch_count += 1;
                    let n = events.len() as u64;
                    event_count += n;
                    total_events += n;

                    // Count replay events (after checkpoint).
                    for (i, _) in events.iter().enumerate() {
                        let event_seq = header.first_seq + i as u64;
                        if event_seq >= checkpoint_seq {
                            replay_events += 1;
                        }
                        // NOTE(review): this records event_seq + 1, not the
                        // sequence number itself -- confirm that this matches
                        // the convention used by the real recovery path.
                        let candidate = event_seq + 1;
                        if candidate > last_wal_seq {
                            last_wal_seq = candidate;
                        }
                    }

                    offset = batch_end;
                }
                Err(_) => {
                    corrupt_batches += 1;
                    inconsistency_count += 1;
                    break;
                }
            }
        }

        segments.push(SegmentSummary {
            first_seq: *seg_first_seq,
            file_size,
            batch_count,
            event_count,
            corrupt_batches,
        });
    }

    // Estimate recovery time.
    // Heuristic: ~100K events/sec replay throughput based on benchmarks.
    // Checkpoint restore adds ~10ms per 1000 entries.
    // Total = replay_events / 100_000 + checkpoint_restore_estimate.
    let replay_secs = replay_events as f64 / 100_000.0;
    // We don't know the checkpoint size from the WAL alone, so estimate
    // conservatively as 5 seconds for checkpoint restore.
    let estimated_recovery_secs = replay_secs + 5.0;

    Ok(WalDiagnosticReport {
        total_events,
        replay_events,
        last_wal_seq,
        checkpoint_seq,
        checkpoint_ts,
        segment_count,
        total_segment_bytes,
        inconsistency_count,
        segments,
        estimated_recovery_secs,
    })
}
```

### 4. CLI output formatting

```rust
// In tidalctl, in the recover subcommand handler:
fn handle_recover(path: &std::path::Path, verify_only: bool) -> anyhow::Result<()> {
    use tidaldb::wal::diagnostics::diagnose_wal;

    if !verify_only {
        anyhow::bail!("only --verify-only mode is supported");
    }

    let report = diagnose_wal(path)
        .map_err(|e| anyhow::anyhow!("WAL diagnosis failed: {e}"))?;

    println!("WAL Diagnostic Report");
    println!("=====================");
    println!();
    println!("Checkpoint:");
    if report.checkpoint_seq > 0 {
        println!(" Sequence: {}", report.checkpoint_seq);
        println!(" Timestamp: {} ns", report.checkpoint_ts);
    } else {
        println!(" (no checkpoint found)");
    }
    println!();
    println!("WAL Segments: {}", report.segment_count);
    println!("Total Segment Bytes: {} ({:.1} MB)",
        report.total_segment_bytes,
        report.total_segment_bytes as f64 / 1_048_576.0,
    );
    println!();
    println!("Events:");
    println!(" Total in WAL: {}", report.total_events);
    println!(" To replay: {}", report.replay_events);
    println!(" Last WAL seq: {}", report.last_wal_seq);
    println!(" Inconsistencies: {}", report.inconsistency_count);
    println!();
    println!("Estimated Recovery Time: {:.1}s", report.estimated_recovery_secs);
    println!();

    if !report.segments.is_empty() {
        println!("Segment Inventory:");
        println!(" {:>10} {:>10} {:>8} {:>8} {:>8}",
            "first_seq", "file_size", "batches", "events", "corrupt");
        for seg in &report.segments {
            println!(" {:>10} {:>10} {:>8} {:>8} {:>8}",
                seg.first_seq, seg.file_size, seg.batch_count,
                seg.event_count, seg.corrupt_batches);
        }
    }

    if report.inconsistency_count > 0 {
        println!();
        println!("WARNING: {} inconsistencies found. Recovery will truncate corrupted tails.",
            report.inconsistency_count);
    }

    Ok(())
}
```

### 5. Module registration

Add `pub mod diagnostics;` to `tidal/src/wal/mod.rs`.

## Acceptance Criteria

- [ ] `tidalctl recover --path <data-dir> --verify-only` runs without modifying any files
- [ ] Reports: total event count, replay event count, last WAL sequence, checkpoint sequence
- [ ] Reports: inconsistency count (corrupted/truncated batches)
- [ ] Reports: estimated recovery time in seconds
- [ ] Reports: per-segment inventory (first_seq, file_size, batch_count, event_count, corrupt_batches)
- [ ] `WalDiagnosticReport` struct in `tidal/src/wal/diagnostics.rs`
- [ ] `diagnose_wal(data_dir)` function scans WAL without writing state
- [ ] Handles missing WAL directory gracefully (reports 0 segments)
- [ ] Handles missing checkpoint file gracefully (reports checkpoint_seq=0)
- [ ] Unit tests: `diagnose_empty_dir`, `diagnose_single_segment`, `diagnose_with_checkpoint`, `diagnose_with_corruption`

## Test Strategy

```rust
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::wal::format::encode_batch;
    use crate::wal::format::EventRecord;
    use crate::wal::segment::segment_filename;

    fn sample_event(id: u64) -> EventRecord {
        EventRecord {
            entity_id: id,
            signal_type: 1,
            weight: 1.0,
            timestamp_nanos: id * 1_000_000_000,
        }
    }

    #[test]
    fn diagnose_empty_dir() {
        let dir = tempfile::tempdir().unwrap();
        // Create the wal subdirectory (diagnose_wal expects data_dir, not wal_dir).
        std::fs::create_dir_all(dir.path().join("wal")).unwrap();

        let report = diagnose_wal(dir.path()).unwrap();
        assert_eq!(report.total_events, 0);
        assert_eq!(report.segment_count, 0);
        assert_eq!(report.checkpoint_seq, 0);
        assert_eq!(report.inconsistency_count, 0);
    }

    #[test]
    fn diagnose_single_segment() {
        let dir = tempfile::tempdir().unwrap();
        let wal_dir = dir.path().join("wal");
        std::fs::create_dir_all(&wal_dir).unwrap();

        let events = vec![sample_event(1), sample_event(2), sample_event(3)];
        let batch = encode_batch(&events, 1, 1000).unwrap();
        let seg_name = segment_filename(1);
        std::fs::write(wal_dir.join(seg_name), &batch).unwrap();

        let report = diagnose_wal(dir.path()).unwrap();
        assert_eq!(report.total_events, 3);
        assert_eq!(report.replay_events, 3); // no checkpoint
        assert_eq!(report.segment_count, 1);
        assert_eq!(report.inconsistency_count, 0);
        assert_eq!(report.segments[0].event_count, 3);
    }

    #[test]
    fn diagnose_with_checkpoint() {
        let dir = tempfile::tempdir().unwrap();
        let wal_dir = dir.path().join("wal");
        std::fs::create_dir_all(&wal_dir).unwrap();

        // Write checkpoint at seq=2.
        CheckpointManager::write(&wal_dir, 2, 2000).unwrap();

        // Write events at seq 1, 2, 3.
        let events = vec![sample_event(1), sample_event(2), sample_event(3)];
        let batch = encode_batch(&events, 1, 1000).unwrap();
        std::fs::write(wal_dir.join(segment_filename(1)), &batch).unwrap();

        let report = diagnose_wal(dir.path()).unwrap();
        assert_eq!(report.total_events, 3);
        assert_eq!(report.replay_events, 2); // events at seq 2, 3
        assert_eq!(report.checkpoint_seq, 2);
    }

    #[test]
    fn diagnose_with_corruption() {
        let dir = tempfile::tempdir().unwrap();
        let wal_dir = dir.path().join("wal");
        std::fs::create_dir_all(&wal_dir).unwrap();

        let events = vec![sample_event(1)];
        let mut batch = encode_batch(&events, 1, 1000).unwrap();
        // Append garbage to simulate a torn write.
        batch.extend_from_slice(&[0xDE, 0xAD, 0xBE, 0xEF]);
        std::fs::write(wal_dir.join(segment_filename(1)), &batch).unwrap();

        let report = diagnose_wal(dir.path()).unwrap();
        assert_eq!(report.total_events, 1); // one valid event
        assert_eq!(report.inconsistency_count, 1); // one corrupt tail
    }
}
```