# Task 04: Checkpoint BLAKE3 Integrity

## Delivers

Extension of `CheckpointMeta` with a 32-byte BLAKE3 hash of the checkpoint payload. The hash is computed during `checkpoint()` and verified during `restore()`. On hash mismatch (corrupt checkpoint), the system falls back to WAL-only replay from the beginning, logging a warning. This catches silent data corruption (bit rot, partial writes, filesystem bugs) that would otherwise produce incorrect signal state on recovery.

## Complexity: M

## Dependencies

- Task 01 (CrashPoint enum -- for testing corruption fallback under crash conditions)

## Technical Design

### 1. Extend CheckpointMeta

Modify `tidal/src/signals/checkpoint/meta.rs`:

```rust
// ── Constants ─────────────────────────────────────────────────────────────────

pub(super) const VERSION: u8 = 0x02; // bumped from 0x01
pub(super) const META_SIZE_V1: usize = 17;
pub(super) const META_SIZE_V2: usize = 49; // 17 + 32 (BLAKE3 hash)
pub(crate) const META_SUFFIX: &[u8] = b"meta";

/// Checkpoint sequence metadata stored alongside the signal state.
///
/// V2 adds a BLAKE3 hash of the checkpoint payload (all serialized entry
/// values, length-prefixed and hashed in key order). If the hash does not
/// match on restore, the checkpoint is treated as corrupt and the system
/// falls back to WAL-only replay.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CheckpointMeta {
    /// Nanosecond timestamp when the checkpoint was taken.
    pub checkpoint_time_ns: u64,
    /// WAL sequence number at checkpoint time.
    pub wal_sequence: u64,
    /// BLAKE3 hash of the checkpoint payload (32 bytes).
    /// Set to `[0u8; 32]` for V1 compatibility (no hash verification).
    pub payload_hash: [u8; 32],
}
```

### 2. Serialization (V2 format)

```rust
/// Serialize `CheckpointMeta` to a 49-byte buffer (V2 format).
///
/// Format: `[version: 1][checkpoint_time_ns: 8 LE][wal_sequence: 8 LE][payload_hash: 32]`
#[must_use]
pub fn serialize_meta(meta: &CheckpointMeta) -> Vec<u8> {
    let mut buf = Vec::with_capacity(META_SIZE_V2);
    buf.push(VERSION);
    buf.extend_from_slice(&meta.checkpoint_time_ns.to_le_bytes());
    buf.extend_from_slice(&meta.wal_sequence.to_le_bytes());
    buf.extend_from_slice(&meta.payload_hash);
    debug_assert_eq!(buf.len(), META_SIZE_V2);
    buf
}
```

### 3. Deserialization (V1 + V2 compatible)

```rust
/// Deserialize `CheckpointMeta` from bytes.
///
/// Supports both V1 (17 bytes, no hash) and V2 (49 bytes, with BLAKE3 hash).
/// V1 checkpoints are deserialized with `payload_hash = [0u8; 32]`, which
/// disables hash verification on restore (backward compatible).
///
/// # Errors
///
/// Returns a descriptive message if `bytes` is empty, carries an unknown
/// version byte, or has the wrong length for its version.
pub fn deserialize_meta(bytes: &[u8]) -> Result<CheckpointMeta, String> {
    if bytes.is_empty() {
        return Err("empty checkpoint meta".to_string());
    }
    match bytes[0] {
        0x01 => {
            // V1: 17 bytes, no hash.
            if bytes.len() != META_SIZE_V1 {
                return Err(format!(
                    "V1 meta: expected {META_SIZE_V1} bytes, got {}",
                    bytes.len()
                ));
            }
            let checkpoint_time_ns = u64::from_le_bytes(
                bytes[1..9]
                    .try_into()
                    .map_err(|_| "V1 offset error at [1..9]".to_string())?,
            );
            let wal_sequence = u64::from_le_bytes(
                bytes[9..17]
                    .try_into()
                    .map_err(|_| "V1 offset error at [9..17]".to_string())?,
            );
            Ok(CheckpointMeta {
                checkpoint_time_ns,
                wal_sequence,
                payload_hash: [0u8; 32], // V1: no hash verification
            })
        }
        0x02 => {
            // V2: 49 bytes, with BLAKE3 hash.
            if bytes.len() != META_SIZE_V2 {
                return Err(format!(
                    "V2 meta: expected {META_SIZE_V2} bytes, got {}",
                    bytes.len()
                ));
            }
            let checkpoint_time_ns = u64::from_le_bytes(
                bytes[1..9]
                    .try_into()
                    .map_err(|_| "V2 offset error at [1..9]".to_string())?,
            );
            let wal_sequence = u64::from_le_bytes(
                bytes[9..17]
                    .try_into()
                    .map_err(|_| "V2 offset error at [9..17]".to_string())?,
            );
            // The length check above guarantees `bytes[17..49]` is exactly 32 bytes.
            let mut payload_hash = [0u8; 32];
            payload_hash.copy_from_slice(&bytes[17..49]);
            Ok(CheckpointMeta {
                checkpoint_time_ns,
                wal_sequence,
                payload_hash,
            })
        }
        v => Err(format!(
            "unknown checkpoint meta version 0x{v:02x}, expected 0x01 or 0x02"
        )),
    }
}
```

### 4. Integrity module

```rust
// tidal/src/signals/checkpoint/integrity.rs

/// Compute a BLAKE3 hash over the checkpoint entry payloads.
///
/// `entry_values` holds the serialized entry values (the meta record is
/// excluded). Only the values are hashed, not the keys (keys are
/// deterministic from entity_id + signal_type_id).
///
/// The hash is order-dependent, so `checkpoint()` and `restore()` must pass
/// the values in the same order. `restore()` sees them in storage scan order,
/// so `checkpoint()` must hash them in that same (key) order rather than in
/// whatever order the in-memory ledger happens to iterate.
///
/// Returns a 32-byte BLAKE3 hash.
pub fn hash_checkpoint_payload(entry_values: &[Vec<u8>]) -> [u8; 32] {
    let mut hasher = blake3::Hasher::new();
    for value in entry_values {
        // Length-prefix each value to prevent ambiguous concatenation.
        // `usize -> u64` is a widening cast on every target with <= 64-bit pointers.
        hasher.update(&(value.len() as u64).to_le_bytes());
        hasher.update(value);
    }
    *hasher.finalize().as_bytes()
}

/// Verify a checkpoint payload against its expected BLAKE3 hash.
///
/// Returns `true` if the hash matches, `false` if it does not.
/// Returns `true` if `expected_hash` is all zeros (V1 compatibility: no hash).
///
/// Note: the all-zero sentinel is checked regardless of meta version, so a V2
/// record whose hash field is all zeros also skips verification.
pub fn verify_checkpoint_payload(entry_values: &[Vec<u8>], expected_hash: &[u8; 32]) -> bool {
    // V1 compatibility: all-zero hash means "no verification".
    if expected_hash == &[0u8; 32] {
        return true;
    }
    let actual = hash_checkpoint_payload(entry_values);
    actual == *expected_hash
}
```

### 5.
Modify `SignalLedger::checkpoint()` to compute and store the hash

```rust
// In tidal/src/signals/checkpoint/mod.rs, inside checkpoint():

/// Write every ledger entry plus the checkpoint metadata to `storage` in one
/// batch, then flush.
///
/// `meta.payload_hash` is overwritten with the BLAKE3 hash of the serialized
/// entry values. The values are hashed in ascending key order, because
/// `restore()` re-hashes them in `scan_prefix` order and the hash is
/// order-dependent; hashing in ledger-iteration order would make valid
/// checkpoints fail verification.
pub fn checkpoint(
    &self,
    storage: &dyn StorageEngine,
    mut meta: CheckpointMeta,
) -> crate::Result<()> {
    // Serialize all entity-signal entries into (key, value) pairs.
    let mut pairs = Vec::new();
    for entry_ref in self.entries() {
        let &(entity_id, signal_type_id) = entry_ref.key();
        let entry = entry_ref.value();
        let suffix = signal_type_id.as_u16().to_be_bytes();
        let key = encode_key(entity_id, Tag::Sig, &suffix);
        let value = serialize_entry(entity_id, signal_type_id, entry);
        pairs.push((key, value));
    }

    // Put the pairs into the order `restore()` will observe them in.
    // NOTE(review): assumes encoded keys are `Ord` byte strings and that
    // `StorageEngine::scan_prefix` yields keys in ascending byte order -- confirm.
    pairs.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));

    // Split keys from values so the values can be hashed without cloning them.
    let (keys, entry_values): (Vec<_>, Vec<Vec<u8>>) = pairs.into_iter().unzip();

    // Compute BLAKE3 hash over all entry payloads.
    meta.payload_hash = integrity::hash_checkpoint_payload(&entry_values);

    let mut batch = WriteBatch::new();
    for (key, value) in keys.into_iter().zip(entry_values) {
        batch.put(key, value);
    }

    // NOTE(review): this batch only puts keys. If entries can ever be removed
    // from the ledger, keys left over from an earlier checkpoint would still be
    // scanned by `restore()` and would cause a hash mismatch -- confirm whether
    // stale keys need to be deleted here.

    // Write checkpoint metadata (now including the hash).
    let meta_key = encode_key(EntityId::new(0), Tag::Sig, META_SUFFIX);
    batch.put(meta_key, serialize_meta(&meta));

    #[cfg(any(test, feature = "test-utils"))]
    crate::testing::crash_injector::check_crash_point(
        crate::testing::CrashPoint::CheckpointPreFlush,
    );

    storage.write_batch(batch)?;
    storage.flush()?;

    #[cfg(any(test, feature = "test-utils"))]
    crate::testing::crash_injector::check_crash_point(
        crate::testing::CrashPoint::CheckpointPostFlush,
    );

    Ok(())
}
```

### 6. Modify `SignalLedger::restore()` to verify the hash

```rust
// In tidal/src/signals/checkpoint/mod.rs, inside restore():

/// Load the checkpoint from `storage` into the ledger.
///
/// Returns `Ok(Some(meta))` when a checkpoint was restored, and `Ok(None)`
/// when there is no checkpoint metadata or the payload hash does not match
/// (the caller then replays the whole WAL).
///
/// The hash is verified over the raw entry bytes *before* any entry is
/// deserialized, so a corrupted payload takes the fallback path instead of
/// surfacing as a deserialization error. Nothing is inserted into the ledger
/// unless every entry has been verified and decoded.
///
/// # Errors
///
/// Returns an error if storage access fails, if the metadata record cannot be
/// decoded, or if an entry fails to deserialize even though the hash check
/// passed (or was skipped for a V1 / zero-hash checkpoint).
pub fn restore(&self, storage: &dyn StorageEngine) -> crate::Result<Option<CheckpointMeta>> {
    // Read checkpoint metadata first.
    let meta_key = encode_key(EntityId::new(0), Tag::Sig, META_SUFFIX);
    let meta = match storage.get(&meta_key)? {
        None => None,
        Some(meta_bytes) => Some(
            deserialize_meta(&meta_bytes)
                .map_err(|e| TidalError::Internal(format!("corrupt checkpoint meta: {e}")))?,
        ),
    };

    // Collect the raw entry values (in scan order) for integrity verification.
    let mut entry_values: Vec<Vec<u8>> = Vec::new();
    for item in storage.scan_prefix(&[]) {
        let (key, value) = item?;
        if let Some((entity_id, Tag::Sig, suffix)) = parse_key(&key) {
            if entity_id == EntityId::new(0) && suffix == META_SUFFIX {
                continue;
            }
            entry_values.push(value);
        }
    }

    // Verify integrity if we have a meta with a non-zero hash.
    if let Some(meta) = &meta {
        if !integrity::verify_checkpoint_payload(&entry_values, &meta.payload_hash) {
            tracing::warn!(
                "checkpoint BLAKE3 hash mismatch; falling back to WAL-only replay"
            );
            // Return None to signal that the checkpoint is corrupt.
            // The caller (open.rs) will replay the entire WAL from the beginning.
            return Ok(None);
        }
    }

    // Decode every entry before touching the ledger, so a failure here leaves
    // the ledger unmodified.
    let mut entries_to_insert: Vec<(EntityId, SignalTypeId, EntitySignalEntry)> =
        Vec::with_capacity(entry_values.len());
    for value in &entry_values {
        let decoded = deserialize_entry(value)
            .map_err(|e| TidalError::Internal(format!("corrupt checkpoint entry: {e}")))?;
        entries_to_insert.push(decoded);
    }

    // All entries verified -- insert into the DashMap.
    for (eid, stid, entry) in entries_to_insert {
        self.entries.insert((eid, stid), entry);
    }

    Ok(meta)
}
```

### 7. Modify `open.rs` to handle corrupt checkpoint

The existing code in `open.rs` already handles `None` from `restore()` as "no checkpoint, replay all WAL events." When `restore()` returns `None` due to hash mismatch, the same path is taken: the ledger starts empty and all WAL events are replayed from the beginning. No change to `open.rs` is needed for the fallback path.

Note: a full replay can only rebuild the complete state if the WAL still holds every event from before the corrupt checkpoint. Confirm that WAL retention does not prune segments that a fallback replay would need.

The only addition is a log message at the call site:

```rust
match ledger.restore(storage.items_engine()) {
    Ok(Some(meta)) => {
        tracing::info!(
            wal_sequence = meta.wal_sequence,
            "signal ledger restored from checkpoint"
        );
    }
    Ok(None) => {
        // First boot or corrupt checkpoint -- WAL replay covers everything.
tracing::info!("no valid checkpoint; full WAL replay will be performed");
    }
    Err(e) => {
        tracing::warn!(
            error = %e,
            "signal ledger restore failed; starting from empty state"
        );
    }
}
```

## Acceptance Criteria

- [ ] `CheckpointMeta` extended with 32-byte `payload_hash` field
- [ ] `serialize_meta` produces V2 format (49 bytes, version 0x02)
- [ ] `deserialize_meta` supports both V1 (17 bytes) and V2 (49 bytes) formats
- [ ] V1 checkpoints deserialize with `payload_hash = [0u8; 32]` (no verification)
- [ ] `hash_checkpoint_payload` computes BLAKE3 over length-prefixed entry values
- [ ] `verify_checkpoint_payload` returns `true` for matching hash, `false` for mismatch, `true` for all-zero hash
- [ ] `checkpoint()` computes hash over all entry payloads and stores it in meta
- [ ] `checkpoint()` hashes entry payloads in the same order in which `restore()` scans them (the hash is order-dependent)
- [ ] `restore()` verifies hash before inserting entries; returns `None` on mismatch
- [ ] Corrupt checkpoint triggers fallback to WAL-only replay with warning log
- [ ] Clean checkpoint passes verification and restores normally
- [ ] Existing proptest `serialize_deserialize_meta_roundtrip` updated for V2
- [ ] New proptests: `v1_to_v2_upgrade`, `corrupt_hash_triggers_fallback`, `hash_changes_on_different_payload`
- [ ] `cargo test --manifest-path tidal/Cargo.toml` passes

## Test Strategy

The sketch below covers the meta format and the integrity helpers. The `v1_to_v2_upgrade` and `corrupt_hash_triggers_fallback` proptests named above need a ledger and storage fixture and are not shown here.

```rust
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    #[test]
    fn v2_serialize_deserialize_roundtrip() {
        let meta = CheckpointMeta {
            checkpoint_time_ns: 1_700_000_000_000_000_000,
            wal_sequence: 42_000,
            payload_hash: blake3::hash(b"test payload").into(),
        };
        let bytes = serialize_meta(&meta);
        assert_eq!(bytes.len(), META_SIZE_V2);
        assert_eq!(bytes[0], 0x02);
        let restored = deserialize_meta(&bytes).unwrap();
        assert_eq!(restored, meta);
    }

    #[test]
    fn v1_deserialization_backward_compatible() {
        // Simulate a V1 checkpoint (version 0x01, 17 bytes).
        let mut bytes = Vec::with_capacity(17);
        bytes.push(0x01);
        bytes.extend_from_slice(&1_000u64.to_le_bytes());
        bytes.extend_from_slice(&42u64.to_le_bytes());
        assert_eq!(bytes.len(), 17);

        let meta = deserialize_meta(&bytes).unwrap();
        assert_eq!(meta.checkpoint_time_ns, 1_000);
        assert_eq!(meta.wal_sequence, 42);
        assert_eq!(meta.payload_hash, [0u8; 32]); // V1 has no hash
    }

    #[test]
    fn hash_verification_catches_corruption() {
        let values = vec![vec![1u8, 2, 3], vec![4, 5, 6]];
        let hash = hash_checkpoint_payload(&values);

        // Correct values verify.
        assert!(verify_checkpoint_payload(&values, &hash));

        // Corrupted values fail verification.
        let corrupt = vec![vec![1u8, 2, 99], vec![4, 5, 6]];
        assert!(!verify_checkpoint_payload(&corrupt, &hash));
    }

    #[test]
    fn zero_hash_skips_verification() {
        let values = vec![vec![1u8, 2, 3]];
        let zero_hash = [0u8; 32];
        assert!(verify_checkpoint_payload(&values, &zero_hash));
    }

    #[test]
    fn hash_is_order_dependent() {
        let a = vec![vec![1u8, 2], vec![3, 4]];
        let b = vec![vec![3u8, 4], vec![1, 2]];
        let hash_a = hash_checkpoint_payload(&a);
        let hash_b = hash_checkpoint_payload(&b);
        assert_ne!(hash_a, hash_b);
    }

    #[test]
    fn length_prefix_prevents_ambiguous_concatenation() {
        // Same concatenated bytes, different entry boundaries.
        let a = vec![vec![1u8, 2], vec![3]];
        let b = vec![vec![1u8], vec![2, 3]];
        assert_ne!(hash_checkpoint_payload(&a), hash_checkpoint_payload(&b));
    }

    #[test]
    fn empty_payload_has_deterministic_hash() {
        let empty: Vec<Vec<u8>> = vec![];
        let hash1 = hash_checkpoint_payload(&empty);
        let hash2 = hash_checkpoint_payload(&empty);
        assert_eq!(hash1, hash2);
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod integrity_proptests {
    use proptest::prelude::*;

    use super::*;

    proptest! {
        #[test]
        fn hash_roundtrip(
            values in proptest::collection::vec(
                proptest::collection::vec(any::<u8>(), 0..100),
                0..50
            ),
        ) {
            let hash = hash_checkpoint_payload(&values);
            prop_assert!(verify_checkpoint_payload(&values, &hash));
        }

        #[test]
        fn hash_changes_on_different_payload(
            a in proptest::collection::vec(
                proptest::collection::vec(any::<u8>(), 0..100),
                0..50
            ),
            b in proptest::collection::vec(
                proptest::collection::vec(any::<u8>(), 0..100),
                0..50
            ),
        ) {
            prop_assume!(a != b);
            prop_assert_ne!(hash_checkpoint_payload(&a), hash_checkpoint_payload(&b));
        }

        #[test]
        fn v2_meta_roundtrip(
            checkpoint_time_ns: u64,
            wal_sequence: u64,
            hash_bytes in proptest::collection::vec(any::<u8>(), 32..=32),
        ) {
            let mut payload_hash = [0u8; 32];
            payload_hash.copy_from_slice(&hash_bytes);
            let meta = CheckpointMeta {
                checkpoint_time_ns,
                wal_sequence,
                payload_hash,
            };
            let bytes = serialize_meta(&meta);
            let restored = deserialize_meta(&bytes).unwrap();
            prop_assert_eq!(restored, meta);
        }
    }
}
```