14 KiB
Task 01: CrashPoint Enum + Fault Injection Hooks
Delivers
A CrashPoint enum identifying 8 write-path locations where crashes can occur, a CrashInjector struct that triggers controlled panics or early returns at those locations, and test-gated hooks threaded into the signal write path, checkpoint path, cohort ledger, collection index, and co-engagement index. The injector and every hook are gated behind #[cfg(any(test, feature = "test-utils"))] -- zero overhead in release builds that do not enable the "test-utils" feature.
Complexity: M
Dependencies
- None (this is the foundation task for the phase)
Technical Design
1. CrashPoint enum
// tidal/src/testing/crash_injector.rs
/// Locations in the write path where a crash can occur.
///
/// Each variant corresponds to a real boundary between durable and
/// in-memory state transitions. The names encode the operation and
/// whether the crash occurs before or after the durable write.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CrashPoint {
/// After WAL append returns but before the signal ledger hot tier is updated.
/// State: WAL has the event, ledger does not.
WalPreAggregate,
/// After both the WAL append and the signal ledger hot tier update have
/// completed, but before the write call returns to its caller.
/// State: WAL and ledger both have the event; the caller never observes
/// the successful return.
/// (The write path appends to the WAL first, so "ledger updated but WAL
/// not yet confirmed" cannot occur here; this point covers the process
/// dying between WAL confirm and caller return.)
WalPostAggregate,
/// Before the checkpoint WriteBatch is committed to the storage engine.
/// State: in-memory state is live, checkpoint is stale.
CheckpointPreFlush,
/// After the checkpoint WriteBatch is committed but before the WAL
/// checkpoint marker is written.
/// State: checkpoint is fresh, WAL still has old events.
CheckpointPostFlush,
/// During the signal aggregation update (hot tier on_signal call).
/// State: partial update -- some decay scores updated, others not.
SignalAggregationUpdate,
/// During CohortSignalLedger::record() -- after the global ledger
/// write succeeds but before the cohort write completes.
CohortLedgerUpdate,
/// During CollectionIndex::add_item() or create() -- after the
/// fjall put but before the in-memory bitmap is updated.
// NOTE(review): the hook-site list places this hook in `add_to_collection`
// (tidal/src/db/collections.rs); confirm which function name is current.
CollectionIndexUpdate,
/// During CoEngagementIndex::record_positive() -- after some edges
/// are written but before eviction completes.
CoEngagementUpdate,
}
2. CrashInjector struct
// tidal/src/testing/crash_injector.rs
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::Arc;
/// Configurable fault injector for crash recovery testing.
///
/// Counts crossings of a single target crash point and triggers a panic
/// when the configured threshold is reached. The panic is caught by the
/// test harness, simulating an unclean process exit.
///
/// All mutable state (the counter and both flags) is held in atomics.
pub struct CrashInjector {
/// Which crash point to trigger on; crossings of any other point are ignored.
target: CrashPoint,
/// Fire after this many crossings of the target crash point.
/// 0 means fire on the first crossing.
fire_after_n: u64,
/// Counter of crossings for the target crash point.
crossing_count: AtomicU64,
/// Whether the injector has already fired (one-shot).
fired: AtomicBool,
/// Whether the injector is armed (can be disarmed for setup phases).
armed: AtomicBool,
}
impl CrashInjector {
    /// Create a new, armed injector targeting the given crash point.
    ///
    /// `fire_after_n` is the number of target crossings that pass through
    /// without firing; the injector fires on the crossing after that
    /// (so `0` fires on the first crossing).
    #[must_use]
    pub fn new(target: CrashPoint, fire_after_n: u64) -> Arc<Self> {
        Arc::new(Self {
            target,
            fire_after_n,
            crossing_count: AtomicU64::new(0),
            fired: AtomicBool::new(false),
            armed: AtomicBool::new(true),
        })
    }

    /// Arm the injector (enable triggering).
    pub fn arm(&self) {
        self.armed.store(true, Ordering::Release);
    }

    /// Disarm the injector (disable triggering without resetting counters).
    ///
    /// Crossings that happen while disarmed are not counted.
    pub fn disarm(&self) {
        self.armed.store(false, Ordering::Release);
    }

    /// Whether the injector has fired.
    #[must_use]
    pub fn has_fired(&self) -> bool {
        self.fired.load(Ordering::Acquire)
    }

    /// Called at each crash point in the write path.
    ///
    /// Crossings of a non-target point, crossings while disarmed, and
    /// crossings after the injector has fired return immediately and are
    /// not counted. Otherwise the crossing is counted and, once the
    /// threshold is reached, the injector fires.
    ///
    /// # Panics
    ///
    /// Panics (exactly once per injector) when the threshold is reached.
    /// The test harness catches the panic via `std::panic::catch_unwind`.
    pub fn maybe_crash(&self, point: CrashPoint) {
        if point != self.target {
            return;
        }
        if !self.armed.load(Ordering::Acquire) {
            return;
        }
        if self.fired.load(Ordering::Acquire) {
            return; // one-shot: already fired
        }

        let count = self.crossing_count.fetch_add(1, Ordering::AcqRel);
        if count < self.fire_after_n {
            return;
        }

        // Claim the one-shot atomically. A separate load followed by a store
        // would let two threads sharing this injector both pass the check
        // above and both panic; `swap` guarantees only the thread that flips
        // the flag from false to true fires.
        if self.fired.swap(true, Ordering::AcqRel) {
            return;
        }

        panic!(
            "CrashInjector: simulated crash at {:?} after {} crossings",
            point,
            count + 1
        );
    }

    /// Return the number of times the target crash point has been crossed
    /// while the injector was armed and had not yet fired.
    #[must_use]
    pub fn crossing_count(&self) -> u64 {
        self.crossing_count.load(Ordering::Acquire)
    }
}
3. Module structure
// tidal/src/testing/mod.rs
//! Test-only utilities. Gated behind #[cfg(test)] or the "test-utils" feature.
pub mod crash_injector;
pub use crash_injector::{CrashInjector, CrashPoint};
Add to tidal/src/lib.rs:
#[cfg(any(test, feature = "test-utils"))]
pub mod testing;
4. Thread-local crash injector slot
To avoid threading an Arc<CrashInjector> through every function signature in the write path (which would pollute production code), we use a thread-local slot that is only populated during tests.
// tidal/src/testing/crash_injector.rs
use std::cell::RefCell;
// Per-thread slot holding the currently installed crash injector, if any.
// Starts out as `None`; written by `install_injector` / `clear_injector`
// and read by `check_crash_point`.
thread_local! {
static INJECTOR: RefCell<Option<Arc<CrashInjector>>> = const { RefCell::new(None) };
}
/// Make `injector` the active crash injector for the calling thread.
///
/// Any injector previously installed on this thread is replaced and
/// released. Other threads are unaffected: the slot is thread-local.
pub fn install_injector(injector: Arc<CrashInjector>) {
    INJECTOR.with(|slot| {
        // `Option::replace` hands back the old occupant; drop it explicitly.
        drop(slot.borrow_mut().replace(injector));
    });
}
/// Empty the calling thread's injector slot.
///
/// Does nothing if no injector is installed on this thread.
pub fn clear_injector() {
    INJECTOR.with(|slot| {
        // `take` leaves `None` behind; the removed injector (if any) is dropped.
        drop(slot.borrow_mut().take());
    });
}
/// Check the crash point against the installed injector (if any).
///
/// In production builds (without #[cfg(test)]), this compiles to nothing.
/// In test builds, it checks the thread-local injector.
#[inline(always)]
pub fn check_crash_point(point: CrashPoint) {
INJECTOR.with(|cell| {
if let Some(ref injector) = *cell.borrow() {
injector.maybe_crash(point);
}
});
}
5. Hook injection sites
Each hook is a single statement gated behind #[cfg(any(test, feature = "test-utils"))] (the same condition that gates the testing module):
Signal write path (tidal/src/signals/ledger/core.rs, in record_signal):
// After WAL append, before hot tier update:
#[cfg(any(test, feature = "test-utils"))]
crate::testing::crash_injector::check_crash_point(
crate::testing::CrashPoint::WalPreAggregate,
);
// After hot tier update:
#[cfg(any(test, feature = "test-utils"))]
crate::testing::crash_injector::check_crash_point(
crate::testing::CrashPoint::WalPostAggregate,
);
Checkpoint path (tidal/src/signals/checkpoint/mod.rs, in checkpoint):
// Before storage.write_batch(batch):
#[cfg(any(test, feature = "test-utils"))]
crate::testing::crash_injector::check_crash_point(
crate::testing::CrashPoint::CheckpointPreFlush,
);
// After storage.write_batch(batch) + flush, before returning:
#[cfg(any(test, feature = "test-utils"))]
crate::testing::crash_injector::check_crash_point(
crate::testing::CrashPoint::CheckpointPostFlush,
);
Signal aggregation (tidal/src/signals/hot.rs, in on_signal):
// Between score updates (between decay_scores[0] and decay_scores[1]):
#[cfg(any(test, feature = "test-utils"))]
crate::testing::crash_injector::check_crash_point(
crate::testing::CrashPoint::SignalAggregationUpdate,
);
Cohort ledger (tidal/src/cohort/ledger.rs, in record):
// After hot.on_signal, before warm.increment:
#[cfg(any(test, feature = "test-utils"))]
crate::testing::crash_injector::check_crash_point(
crate::testing::CrashPoint::CohortLedgerUpdate,
);
Collection index (tidal/src/db/collections.rs, in add_to_collection):
// After fjall put, before in-memory bitmap update:
#[cfg(any(test, feature = "test-utils"))]
crate::testing::crash_injector::check_crash_point(
crate::testing::CrashPoint::CollectionIndexUpdate,
);
Co-engagement index (tidal/src/entities/co_engagement.rs, in record_positive):
// After edge insertion loop, before eviction check:
#[cfg(any(test, feature = "test-utils"))]
crate::testing::crash_injector::check_crash_point(
crate::testing::CrashPoint::CoEngagementUpdate,
);
6. Test helper: run_with_crash
// tidal/src/testing/crash_injector.rs
use std::panic::{AssertUnwindSafe, catch_unwind};
/// Execute a closure with a crash injector installed on the current thread.
///
/// Returns `Ok(T)` if the closure completes normally, or
/// `Err(CrashPoint)` (the injector's target) if the injector fired during
/// this call (simulated crash). The thread-local slot is cleared before
/// this function returns or re-raises, whatever the outcome.
///
/// # Panics
///
/// Re-raises any panic from `f` that was not caused by the injector firing
/// during this call. That includes panics from `f` when the injector had
/// already fired before the call: a one-shot injector cannot fire twice,
/// so such a panic cannot be a simulated crash.
// Takes `&Arc` not `Arc` so callers can inspect `injector.has_fired()` after
// the call without needing to clone first.
pub fn run_with_crash<T, F: FnOnce() -> T>(
    injector: &Arc<CrashInjector>,
    f: F,
) -> Result<T, CrashPoint> {
    let target = injector.target;
    // Snapshot the flag so a reused, already-fired injector does not make an
    // unrelated panic look like a simulated crash.
    let fired_before = injector.has_fired();

    install_injector(Arc::clone(injector));
    let outcome = catch_unwind(AssertUnwindSafe(f));
    clear_injector();

    match outcome {
        Ok(val) => Ok(val),
        Err(_) if !fired_before && injector.has_fired() => Err(target),
        Err(payload) => std::panic::resume_unwind(payload),
    }
}
Acceptance Criteria
- `CrashPoint` enum with 8 variants: `WalPreAggregate`, `WalPostAggregate`, `CheckpointPreFlush`, `CheckpointPostFlush`, `SignalAggregationUpdate`, `CohortLedgerUpdate`, `CollectionIndexUpdate`, `CoEngagementUpdate`
- `CrashInjector::new(target, fire_after_n)` creates a one-shot injector
- `CrashInjector::maybe_crash(point)` panics when the crossing count reaches `fire_after_n`
- `CrashInjector::arm()` / `disarm()` control whether the injector can fire
- `CrashInjector::has_fired()` returns whether the injector has triggered
- Thread-local `install_injector` / `clear_injector` / `check_crash_point` functions
- `run_with_crash(injector, closure)` helper catches injector panics and returns `Err(CrashPoint)`
- Hook sites added at all 8 write-path locations behind `#[cfg(any(test, feature = "test-utils"))]`
- `tidal/src/testing/mod.rs` module gated behind `#[cfg(any(test, feature = "test-utils"))]`
- Zero overhead in release builds -- all hooks compile away
- Unit tests: `injector_fires_at_threshold`, `injector_ignores_wrong_crash_point`, `injector_one_shot`, `injector_disarm_prevents_fire`, `run_with_crash_returns_err_on_fire`, `run_with_crash_returns_ok_on_complete`
- `cargo clippy -- -D warnings` and `cargo fmt` pass
Test Strategy
#[cfg(test)]
mod tests {
    use super::*;

    /// With `fire_after_n = 3`, the first three crossings pass through and
    /// the fourth one fires.
    #[test]
    fn injector_fires_at_threshold() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 3);

        // Pre-increment counts 0, 1 and 2 are below the threshold.
        inj.maybe_crash(CrashPoint::WalPreAggregate);
        inj.maybe_crash(CrashPoint::WalPreAggregate);
        inj.maybe_crash(CrashPoint::WalPreAggregate);
        assert!(!inj.has_fired());
        assert_eq!(inj.crossing_count(), 3);

        // Fourth crossing: pre-increment count is 3, which meets the threshold.
        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            inj.maybe_crash(CrashPoint::WalPreAggregate);
        }));
        assert!(result.is_err());
        assert!(inj.has_fired());
        assert_eq!(inj.crossing_count(), 4);
    }

    /// Crossings of non-target crash points neither fire nor count.
    #[test]
    fn injector_ignores_wrong_crash_point() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 0);

        inj.maybe_crash(CrashPoint::CheckpointPreFlush);
        inj.maybe_crash(CrashPoint::CohortLedgerUpdate);

        assert!(!inj.has_fired());
        assert_eq!(inj.crossing_count(), 0);
    }

    /// Once fired, further crossings of the target are silently ignored.
    #[test]
    fn injector_one_shot() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 0);

        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            inj.maybe_crash(CrashPoint::WalPreAggregate);
        }));
        assert!(result.is_err());
        assert!(inj.has_fired());

        // Second crossing must not panic (one-shot).
        inj.maybe_crash(CrashPoint::WalPreAggregate);
        assert_eq!(inj.crossing_count(), 1);
    }

    /// A disarmed injector never fires; re-arming restores normal behavior.
    #[test]
    fn injector_disarm_prevents_fire() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 0);

        inj.disarm();
        inj.maybe_crash(CrashPoint::WalPreAggregate); // must not panic
        assert!(!inj.has_fired());

        inj.arm();
        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            inj.maybe_crash(CrashPoint::WalPreAggregate);
        }));
        assert!(result.is_err());
        assert!(inj.has_fired());
    }

    /// A closure that never reaches the threshold returns its value in `Ok`.
    #[test]
    fn run_with_crash_returns_ok_on_complete() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 1000);

        // `run_with_crash` borrows the `Arc`, so the injector stays usable.
        let result = run_with_crash(&inj, || 42);

        assert_eq!(result, Ok(42));
        assert!(!inj.has_fired());
    }

    /// A closure that crosses the target point yields `Err(target)`.
    #[test]
    fn run_with_crash_returns_err_on_fire() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 0);

        let result = run_with_crash(&inj, || {
            check_crash_point(CrashPoint::WalPreAggregate);
            42
        });

        assert_eq!(result, Err(CrashPoint::WalPreAggregate));
        assert!(inj.has_fired());
    }
}