13 KiB
13 KiB
Task 06: tidalctl recover --verify-only
Delivers
A recover subcommand for the tidalctl CLI that performs a dry-run WAL replay against a tidalDB data directory. Reports: total event count, last WAL sequence number, checkpoint sequence number, number of inconsistencies (corrupted batches, deserialization failures), estimated recovery time, and segment file inventory. Writes no state -- purely diagnostic.
Operators use this to answer "what happened?" after a crash, and "how long will recovery take?" before restarting the database.
Complexity: S
Dependencies
- Task 05 (recovery benchmark -- provides the recovery time estimation model)
Technical Design
1. CLI argument parsing
The tidalctl binary already exists (from m0p2). Add a recover subcommand:
// In the tidalctl clap definition:
// Top-level tidalctl subcommands; only the new `Recover` variant is shown in
// this snippet.
#[derive(clap::Subcommand)]
enum Commands {
// ... existing subcommands ...
/// Inspect WAL state and verify crash recovery without modifying data.
Recover {
/// Path to the tidalDB data directory.
// This is the data directory itself, not its `wal/` subdirectory.
#[arg(long)]
path: std::path::PathBuf,
/// Dry-run: report WAL state without writing any changes.
/// This is the only supported mode initially.
// NOTE(review): a `bool` field with `default_value_t = true` is presumably
// parsed as a set-true flag, so the value would be `true` whether or not
// `--verify-only` is passed and could never be `false` -- confirm against
// the clap version in use before relying on a "not verify-only" branch.
#[arg(long, default_value_t = true)]
verify_only: bool,
},
}
2. Recovery report struct
// tidal/src/wal/diagnostics.rs (new file)
use std::path::Path;
use super::checkpoint::CheckpointManager;
use super::error::WalError;
use super::format::{self, BatchHeader, EventRecord, HEADER_SIZE, MAGIC};
use super::segment;
/// Diagnostic report from a dry-run WAL inspection.
///
/// All counters are plain values so the report can be cloned, compared in
/// tests, and built incrementally starting from `Default::default()`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct WalDiagnosticReport {
    /// Total number of valid signal events found in WAL segments.
    pub total_events: u64,
    /// Number of events that would be replayed (after checkpoint filtering).
    pub replay_events: u64,
    /// WAL sequence high-water mark across all segments.
    ///
    /// As populated by `diagnose_wal`, this is the highest event sequence
    /// number seen **plus one**, or 0 if no events were decoded.
    pub last_wal_seq: u64,
    /// Checkpoint sequence number (from checkpoint.meta), or 0 if none.
    pub checkpoint_seq: u64,
    /// Checkpoint timestamp (nanos), or 0 if none.
    pub checkpoint_ts: u64,
    /// Number of WAL segment files.
    pub segment_count: usize,
    /// Total bytes across all segment files.
    pub total_segment_bytes: u64,
    /// Number of corrupted or truncated batches encountered.
    pub inconsistency_count: u64,
    /// Per-segment summary, one entry per scanned segment file.
    pub segments: Vec<SegmentSummary>,
    /// Estimated recovery time in seconds (based on event count heuristic).
    pub estimated_recovery_secs: f64,
}
/// Summary of a single WAL segment file.
///
/// Derives `Clone`, `Default`, `PartialEq` and `Eq` so summaries can be
/// copied into reports and compared directly in tests.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SegmentSummary {
    /// First sequence number in this segment.
    pub first_seq: u64,
    /// File size in bytes.
    pub file_size: u64,
    /// Number of valid batches.
    pub batch_count: u64,
    /// Number of valid events.
    pub event_count: u64,
    /// Number of corrupted batches.
    ///
    /// As populated by `diagnose_wal`, scanning of a segment stops at the
    /// first bad batch, so this is either 0 or 1.
    pub corrupt_batches: u64,
}
3. Diagnostic scan implementation
/// Perform a dry-run WAL inspection and produce a diagnostic report.
///
/// Reads the checkpoint file and every segment under `<data_dir>/wal` and
/// writes nothing. Each segment is scanned batch by batch; scanning of a
/// segment stops at the first truncated, mis-framed, or undecodable batch,
/// which is counted as one inconsistency for that segment.
///
/// A missing `wal` directory is not an error: an all-zero report with no
/// segments is returned.
///
/// The function only reads, so it can be pointed at a live database
/// directory. Note that a batch which is mid-append at read time may then be
/// reported as a truncated tail.
///
/// `last_wal_seq` in the returned report is the highest event sequence number
/// seen plus one (0 if no events were decoded).
///
/// # Errors
///
/// Returns `WalError::Io` on filesystem failure (other than the `wal`
/// directory not existing), and propagates any error from reading the
/// checkpoint or listing segments.
pub fn diagnose_wal(data_dir: &Path) -> Result<WalDiagnosticReport, WalError> {
    // Heuristic replay throughput (events per second), based on benchmarks.
    const REPLAY_EVENTS_PER_SEC: f64 = 100_000.0;
    // The checkpoint size is not knowable from the WAL alone, so a flat,
    // conservative allowance is used for restoring it.
    const CHECKPOINT_RESTORE_SECS: f64 = 5.0;
    // Byte offset of the little-endian u32 payload length inside a batch
    // header. NOTE(review): assumes HEADER_SIZE >= 28 -- confirm against
    // `format`.
    const PAYLOAD_LEN_OFFSET: usize = 24;

    let wal_dir = data_dir.join("wal");

    // A data directory without a WAL has nothing to replay; report that
    // instead of failing. Any other metadata error is a real I/O failure.
    match std::fs::metadata(&wal_dir) {
        Ok(_) => {}
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            return Ok(WalDiagnosticReport {
                total_events: 0,
                replay_events: 0,
                last_wal_seq: 0,
                checkpoint_seq: 0,
                checkpoint_ts: 0,
                segment_count: 0,
                total_segment_bytes: 0,
                inconsistency_count: 0,
                segments: Vec::new(),
                estimated_recovery_secs: 0.0,
            });
        }
        Err(e) => return Err(e.into()),
    }

    // Read checkpoint.
    let checkpoint = CheckpointManager::read(&wal_dir)?;
    let has_checkpoint = checkpoint.is_some();
    let (checkpoint_seq, checkpoint_ts) = checkpoint.unwrap_or((0, 0));

    // List and scan segments.
    let segment_list = segment::list_segments(&wal_dir)?;
    let segment_count = segment_list.len();

    let mut total_events = 0u64;
    let mut replay_events = 0u64;
    let mut last_wal_seq = 0u64;
    let mut inconsistency_count = 0u64;
    let mut total_segment_bytes = 0u64;
    let mut segments = Vec::with_capacity(segment_count);

    for (seg_first_seq, seg_path) in &segment_list {
        let data = std::fs::read(seg_path)?;
        // Report the size of exactly the bytes that were scanned, rather than
        // a separate metadata lookup that can fail or race with the writer.
        let file_size = data.len() as u64;
        total_segment_bytes += file_size;

        let mut offset = 0usize;
        let mut batch_count = 0u64;
        let mut event_count = 0u64;
        let mut tail_corrupt = false;

        while offset < data.len() {
            let remaining = data.len() - offset;

            // Truncated header or wrong magic: the rest of the segment cannot
            // be framed, so stop here.
            if remaining < HEADER_SIZE || data[offset..offset + 4] != MAGIC {
                tail_corrupt = true;
                break;
            }

            let len_bytes: [u8; 4] = data
                [offset + PAYLOAD_LEN_OFFSET..offset + PAYLOAD_LEN_OFFSET + 4]
                .try_into()
                .unwrap_or([0; 4]);
            let payload_len = u32::from_le_bytes(len_bytes) as usize;

            // Compare against the space left after the header so a garbage
            // length can never overflow the end-offset computation.
            if payload_len > remaining - HEADER_SIZE {
                tail_corrupt = true;
                break;
            }
            let batch_end = offset + HEADER_SIZE + payload_len;

            match format::decode_batch(&data[offset..batch_end]) {
                Ok((header, events)) => {
                    batch_count += 1;
                    let n = events.len() as u64;
                    event_count += n;
                    total_events += n;

                    // Events occupy sequence numbers [first_seq, first_seq + n).
                    // Those below the checkpoint are already covered by it;
                    // the rest would be replayed.
                    let already_applied = checkpoint_seq.saturating_sub(header.first_seq).min(n);
                    replay_events += n - already_applied;

                    if n > 0 {
                        // High-water mark is one past the last event in the batch.
                        last_wal_seq = last_wal_seq.max(header.first_seq.saturating_add(n));
                    }
                    offset = batch_end;
                }
                Err(_) => {
                    tail_corrupt = true;
                    break;
                }
            }
        }

        // Scanning stops at the first bad batch, so at most one per segment.
        let corrupt_batches = u64::from(tail_corrupt);
        inconsistency_count += corrupt_batches;

        segments.push(SegmentSummary {
            first_seq: *seg_first_seq,
            file_size,
            batch_count,
            event_count,
            corrupt_batches,
        });
    }

    // Estimate recovery time: WAL replay plus, only when a checkpoint
    // actually exists, the flat allowance for restoring it.
    let replay_secs = replay_events as f64 / REPLAY_EVENTS_PER_SEC;
    let checkpoint_restore_secs = if has_checkpoint {
        CHECKPOINT_RESTORE_SECS
    } else {
        0.0
    };
    let estimated_recovery_secs = replay_secs + checkpoint_restore_secs;

    Ok(WalDiagnosticReport {
        total_events,
        replay_events,
        last_wal_seq,
        checkpoint_seq,
        checkpoint_ts,
        segment_count,
        total_segment_bytes,
        inconsistency_count,
        segments,
        estimated_recovery_secs,
    })
}
4. CLI output formatting
// In tidalctl, in the recover subcommand handler:
/// Handle `tidalctl recover`: run the read-only WAL diagnosis for `path` and
/// print a human-readable report to stdout.
///
/// Output goes through a single locked stdout handle with `writeln!`, so a
/// closed pipe (for example `tidalctl recover ... | head`) surfaces as an
/// error instead of the panic `println!` would raise.
///
/// # Errors
///
/// Fails if `verify_only` is `false` (no other mode is supported), if the WAL
/// diagnosis fails, or if writing to stdout fails.
fn handle_recover(path: &std::path::Path, verify_only: bool) -> anyhow::Result<()> {
    use std::io::Write as _;
    use tidaldb::wal::diagnostics::diagnose_wal;

    if !verify_only {
        anyhow::bail!("only --verify-only mode is supported");
    }

    let report = diagnose_wal(path)
        .map_err(|e| anyhow::anyhow!("WAL diagnosis failed: {e}"))?;

    // Lock once for the whole report instead of once per line.
    let stdout = std::io::stdout();
    let mut out = stdout.lock();

    writeln!(out, "WAL Diagnostic Report")?;
    writeln!(out, "=====================")?;
    writeln!(out)?;

    writeln!(out, "Checkpoint:")?;
    // The report uses sequence 0 to mean "no checkpoint".
    if report.checkpoint_seq > 0 {
        writeln!(out, " Sequence: {}", report.checkpoint_seq)?;
        writeln!(out, " Timestamp: {} ns", report.checkpoint_ts)?;
    } else {
        writeln!(out, " (no checkpoint found)")?;
    }
    writeln!(out)?;

    writeln!(out, "WAL Segments: {}", report.segment_count)?;
    writeln!(
        out,
        "Total Segment Bytes: {} ({:.1} MB)",
        report.total_segment_bytes,
        report.total_segment_bytes as f64 / 1_048_576.0,
    )?;
    writeln!(out)?;

    writeln!(out, "Events:")?;
    writeln!(out, " Total in WAL: {}", report.total_events)?;
    writeln!(out, " To replay: {}", report.replay_events)?;
    writeln!(out, " Last WAL seq: {}", report.last_wal_seq)?;
    writeln!(out, " Inconsistencies: {}", report.inconsistency_count)?;
    writeln!(out)?;

    writeln!(
        out,
        "Estimated Recovery Time: {:.1}s",
        report.estimated_recovery_secs
    )?;
    writeln!(out)?;

    if !report.segments.is_empty() {
        writeln!(out, "Segment Inventory:")?;
        writeln!(
            out,
            " {:>10} {:>10} {:>8} {:>8} {:>8}",
            "first_seq", "file_size", "batches", "events", "corrupt"
        )?;
        for seg in &report.segments {
            writeln!(
                out,
                " {:>10} {:>10} {:>8} {:>8} {:>8}",
                seg.first_seq, seg.file_size, seg.batch_count, seg.event_count, seg.corrupt_batches
            )?;
        }
    }

    if report.inconsistency_count > 0 {
        writeln!(out)?;
        writeln!(
            out,
            "WARNING: {} inconsistencies found. Recovery will truncate corrupted tails.",
            report.inconsistency_count
        )?;
    }

    Ok(())
}
5. Module registration
Add pub mod diagnostics; to tidal/src/wal/mod.rs.
Acceptance Criteria
- `tidalctl recover --path <dir> --verify-only` runs without modifying any files
- Reports: total event count, replay event count, last WAL sequence, checkpoint sequence
- Reports: inconsistency count (corrupted/truncated batches)
- Reports: estimated recovery time in seconds
- Reports: per-segment inventory (first_seq, file_size, batch_count, event_count, corrupt_batches)
- `WalDiagnosticReport` struct in `tidal/src/wal/diagnostics.rs`
- `diagnose_wal(data_dir)` function scans WAL without writing state
- Handles missing WAL directory gracefully (reports 0 segments)
- Handles missing checkpoint file gracefully (reports checkpoint_seq=0)
- Unit tests: `diagnose_empty_dir`, `diagnose_single_segment`, `diagnose_with_checkpoint`, `diagnose_with_corruption`
Test Strategy
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::wal::format::encode_batch;
    use crate::wal::format::EventRecord;
    use crate::wal::segment::segment_filename;

    /// Builds a deterministic event whose timestamp is `id` seconds in nanos.
    fn sample_event(id: u64) -> EventRecord {
        EventRecord {
            entity_id: id,
            signal_type: 1,
            weight: 1.0,
            timestamp_nanos: id * 1_000_000_000,
        }
    }

    /// Creates the `wal` subdirectory under `data_dir` and returns its path.
    /// `diagnose_wal` takes the data directory, not the WAL directory.
    fn make_wal_dir(data_dir: &std::path::Path) -> std::path::PathBuf {
        let wal_dir = data_dir.join("wal");
        std::fs::create_dir_all(&wal_dir).unwrap();
        wal_dir
    }

    /// Writes `bytes` as the segment file whose first sequence number is 1.
    fn write_first_segment(wal_dir: &std::path::Path, bytes: &[u8]) {
        std::fs::write(wal_dir.join(segment_filename(1)), bytes).unwrap();
    }

    #[test]
    fn diagnose_empty_dir() {
        let root = tempfile::tempdir().unwrap();
        make_wal_dir(root.path());

        let report = diagnose_wal(root.path()).unwrap();

        assert_eq!(report.total_events, 0);
        assert_eq!(report.segment_count, 0);
        assert_eq!(report.checkpoint_seq, 0);
        assert_eq!(report.inconsistency_count, 0);
    }

    #[test]
    fn diagnose_single_segment() {
        let root = tempfile::tempdir().unwrap();
        let wal_dir = make_wal_dir(root.path());

        let events: Vec<EventRecord> = (1..=3).map(sample_event).collect();
        let batch = encode_batch(&events, 1, 1000).unwrap();
        write_first_segment(&wal_dir, &batch);

        let report = diagnose_wal(root.path()).unwrap();

        assert_eq!(report.total_events, 3);
        assert_eq!(report.replay_events, 3); // no checkpoint
        assert_eq!(report.segment_count, 1);
        assert_eq!(report.inconsistency_count, 0);
        assert_eq!(report.segments[0].event_count, 3);
    }

    #[test]
    fn diagnose_with_checkpoint() {
        let root = tempfile::tempdir().unwrap();
        let wal_dir = make_wal_dir(root.path());

        // Checkpoint at seq=2, then a batch holding events at seq 1, 2, 3.
        CheckpointManager::write(&wal_dir, 2, 2000).unwrap();
        let events: Vec<EventRecord> = (1..=3).map(sample_event).collect();
        let batch = encode_batch(&events, 1, 1000).unwrap();
        write_first_segment(&wal_dir, &batch);

        let report = diagnose_wal(root.path()).unwrap();

        assert_eq!(report.total_events, 3);
        assert_eq!(report.replay_events, 2); // events at seq 2, 3
        assert_eq!(report.checkpoint_seq, 2);
    }

    #[test]
    fn diagnose_with_corruption() {
        let root = tempfile::tempdir().unwrap();
        let wal_dir = make_wal_dir(root.path());

        let events = vec![sample_event(1)];
        let mut batch = encode_batch(&events, 1, 1000).unwrap();
        // Trailing garbage after the valid batch simulates a torn write.
        batch.extend_from_slice(&[0xDE, 0xAD, 0xBE, 0xEF]);
        write_first_segment(&wal_dir, &batch);

        let report = diagnose_wal(root.path()).unwrap();

        assert_eq!(report.total_events, 1); // one valid event
        assert_eq!(report.inconsistency_count, 1); // one corrupt tail
    }
}