420 lines
14 KiB
Markdown
420 lines
14 KiB
Markdown
# Task 01: CrashPoint Enum + Fault Injection Hooks
|
|
|
|
## Delivers
|
|
|
|
A `CrashPoint` enum identifying 8 write-path locations where crashes can occur, a `CrashInjector` struct that triggers a controlled panic at a chosen location, and test-gated hooks threaded into the signal write path, checkpoint path, cohort ledger, collection index, and co-engagement index. The injector and every hook sit behind `#[cfg(any(test, feature = "test-utils"))]` -- zero overhead in release builds that do not enable the `test-utils` feature.
|
|
|
|
## Complexity: M
|
|
|
|
## Dependencies
|
|
|
|
- None (this is the foundation task for the phase)
|
|
|
|
## Technical Design
|
|
|
|
### 1. CrashPoint enum
|
|
|
|
```rust
|
|
// tidal/src/testing/crash_injector.rs
|
|
|
|
/// Locations in the write path where a simulated crash can be injected.
///
/// Each variant corresponds to a real boundary between a durable write and
/// the matching in-memory state transition. The names encode the operation
/// and whether the crash occurs before or after the durable write.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CrashPoint {
    /// After WAL append returns but before the signal ledger hot tier is updated.
    /// State: WAL has the event, ledger does not.
    WalPreAggregate,

    /// After the WAL append has been confirmed and the signal ledger hot tier
    /// has been updated, but before `record_signal` returns to the caller.
    /// State: both the WAL and the ledger have the event; the caller never
    /// observed success.
    /// (The write path appends to the WAL first, so this point exercises a
    /// process death between WAL confirm and caller return, not a ledger
    /// that is ahead of the WAL.)
    WalPostAggregate,

    /// Before the checkpoint WriteBatch is committed to the storage engine.
    /// State: in-memory state is live, checkpoint is stale.
    CheckpointPreFlush,

    /// After the checkpoint WriteBatch is committed but before the WAL
    /// checkpoint marker is written.
    /// State: checkpoint is fresh, WAL still has old events.
    CheckpointPostFlush,

    /// During the signal aggregation update (hot tier `on_signal` call).
    /// State: partial update -- some decay scores updated, others not.
    SignalAggregationUpdate,

    /// During `CohortSignalLedger::record()` -- after the global ledger
    /// write succeeds but before the cohort write completes.
    CohortLedgerUpdate,

    /// During `CollectionIndex::add_item()` or `create()` -- after the
    /// fjall put but before the in-memory bitmap is updated.
    CollectionIndexUpdate,

    /// During `CoEngagementIndex::record_positive()` -- after some edges
    /// are written but before eviction completes.
    CoEngagementUpdate,
}
|
|
```
|
|
|
|
### 2. CrashInjector struct
|
|
|
|
```rust
|
|
// tidal/src/testing/crash_injector.rs
|
|
|
|
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
|
use std::sync::Arc;
|
|
|
|
/// Configurable fault injector for crash recovery testing.
|
|
///
|
|
/// Tracks how many times each crash point has been crossed and triggers
|
|
/// a panic when the configured threshold is reached. The panic is caught
|
|
/// by the test harness, simulating an unclean process exit.
|
|
///
|
|
/// Thread-safe: uses atomics for all counters.
|
|
pub struct CrashInjector {
|
|
/// Which crash point to trigger on.
|
|
target: CrashPoint,
|
|
/// Fire after this many crossings of the target crash point.
|
|
/// 0 means fire on the first crossing.
|
|
fire_after_n: u64,
|
|
/// Counter of crossings for the target crash point.
|
|
crossing_count: AtomicU64,
|
|
/// Whether the injector has already fired (one-shot).
|
|
fired: AtomicBool,
|
|
/// Whether the injector is armed (can be disarmed for setup phases).
|
|
armed: AtomicBool,
|
|
}
|
|
|
|
impl CrashInjector {
|
|
/// Create a new injector targeting the given crash point.
|
|
///
|
|
/// `fire_after_n`: trigger on the Nth crossing (0 = first crossing).
|
|
pub fn new(target: CrashPoint, fire_after_n: u64) -> Arc<Self> {
|
|
Arc::new(Self {
|
|
target,
|
|
fire_after_n,
|
|
crossing_count: AtomicU64::new(0),
|
|
fired: AtomicBool::new(false),
|
|
armed: AtomicBool::new(true),
|
|
})
|
|
}
|
|
|
|
/// Arm the injector (enable triggering).
|
|
pub fn arm(&self) {
|
|
self.armed.store(true, Ordering::Release);
|
|
}
|
|
|
|
/// Disarm the injector (disable triggering without resetting counters).
|
|
pub fn disarm(&self) {
|
|
self.armed.store(false, Ordering::Release);
|
|
}
|
|
|
|
/// Whether the injector has fired.
|
|
pub fn has_fired(&self) -> bool {
|
|
self.fired.load(Ordering::Acquire)
|
|
}
|
|
|
|
/// Called at each crash point in the write path.
|
|
///
|
|
/// If this crossing matches the configured trigger condition,
|
|
/// sets `fired = true` and panics with a descriptive message.
|
|
/// The test harness catches the panic via `std::panic::catch_unwind`.
|
|
pub fn maybe_crash(&self, point: CrashPoint) {
|
|
if point != self.target {
|
|
return;
|
|
}
|
|
if !self.armed.load(Ordering::Acquire) {
|
|
return;
|
|
}
|
|
if self.fired.load(Ordering::Acquire) {
|
|
return; // one-shot: already fired
|
|
}
|
|
|
|
let count = self.crossing_count.fetch_add(1, Ordering::AcqRel);
|
|
if count >= self.fire_after_n {
|
|
self.fired.store(true, Ordering::Release);
|
|
panic!(
|
|
"CrashInjector: simulated crash at {:?} after {} crossings",
|
|
point,
|
|
count + 1
|
|
);
|
|
}
|
|
}
|
|
|
|
/// Return the number of times the target crash point has been crossed.
|
|
pub fn crossing_count(&self) -> u64 {
|
|
self.crossing_count.load(Ordering::Acquire)
|
|
}
|
|
}
|
|
```
|
|
|
|
### 3. Module structure
|
|
|
|
```rust
|
|
// tidal/src/testing/mod.rs
|
|
//! Test-only utilities. Gated behind #[cfg(test)] or the "test-utils" feature.
|
|
|
|
pub mod crash_injector;
|
|
pub use crash_injector::{CrashInjector, CrashPoint};
|
|
```
|
|
|
|
Add to `tidal/src/lib.rs`:
|
|
|
|
```rust
|
|
#[cfg(any(test, feature = "test-utils"))]
|
|
pub mod testing;
|
|
```
|
|
|
|
### 4. Thread-local crash injector slot
|
|
|
|
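To avoid threading an `Arc<CrashInjector>` through every function signature in the write path (which would pollute production code), we use a thread-local slot that is only populated during tests. Because the slot is per-thread, a hook only fires on the thread that installed the injector; crash points crossed on any other thread are ignored unless that thread installs an injector as well.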
|
|
|
|
```rust
|
|
// tidal/src/testing/crash_injector.rs
|
|
|
|
use std::cell::RefCell;
|
|
|
|
// Per-thread slot holding the currently installed injector; starts out empty (`None`).
thread_local! {
|
|
static INJECTOR: RefCell<Option<Arc<CrashInjector>>> = const { RefCell::new(None) };
|
|
}
|
|
|
|
/// Install a crash injector for the current thread.
|
|
///
|
|
/// Only callable from test code. Production builds optimize this away.
|
|
pub fn install_injector(injector: Arc<CrashInjector>) {
|
|
INJECTOR.with(|cell| {
|
|
*cell.borrow_mut() = Some(injector);
|
|
});
|
|
}
|
|
|
|
/// Remove the crash injector from the current thread.
|
|
pub fn clear_injector() {
|
|
INJECTOR.with(|cell| {
|
|
*cell.borrow_mut() = None;
|
|
});
|
|
}
|
|
|
|
/// Check the crash point against the installed injector (if any).
|
|
///
|
|
/// In production builds (without #[cfg(test)]), this compiles to nothing.
|
|
/// In test builds, it checks the thread-local injector.
|
|
#[inline(always)]
|
|
pub fn check_crash_point(point: CrashPoint) {
|
|
INJECTOR.with(|cell| {
|
|
if let Some(ref injector) = *cell.borrow() {
|
|
injector.maybe_crash(point);
|
|
}
|
|
});
|
|
}
|
|
```
|
|
|
|
### 5. Hook injection sites
|
|
|
|
Each hook is a single call statement gated behind `#[cfg(any(test, feature = "test-utils"))]`:
|
|
|
|
**Signal write path** (`tidal/src/signals/ledger/core.rs`, in `record_signal`):
|
|
|
|
```rust
|
|
// After WAL append, before hot tier update:
|
|
#[cfg(any(test, feature = "test-utils"))]
|
|
crate::testing::crash_injector::check_crash_point(
|
|
crate::testing::CrashPoint::WalPreAggregate,
|
|
);
|
|
|
|
// After hot tier update:
|
|
#[cfg(any(test, feature = "test-utils"))]
|
|
crate::testing::crash_injector::check_crash_point(
|
|
crate::testing::CrashPoint::WalPostAggregate,
|
|
);
|
|
```
|
|
|
|
**Checkpoint path** (`tidal/src/signals/checkpoint/mod.rs`, in `checkpoint`):
|
|
|
|
```rust
|
|
// Before storage.write_batch(batch):
|
|
#[cfg(any(test, feature = "test-utils"))]
|
|
crate::testing::crash_injector::check_crash_point(
|
|
crate::testing::CrashPoint::CheckpointPreFlush,
|
|
);
|
|
|
|
// After storage.write_batch(batch) + flush, before returning:
|
|
#[cfg(any(test, feature = "test-utils"))]
|
|
crate::testing::crash_injector::check_crash_point(
|
|
crate::testing::CrashPoint::CheckpointPostFlush,
|
|
);
|
|
```
|
|
|
|
**Signal aggregation** (`tidal/src/signals/hot.rs`, in `on_signal`):
|
|
|
|
```rust
|
|
// Between score updates (between decay_scores[0] and decay_scores[1]):
|
|
#[cfg(any(test, feature = "test-utils"))]
|
|
crate::testing::crash_injector::check_crash_point(
|
|
crate::testing::CrashPoint::SignalAggregationUpdate,
|
|
);
|
|
```
|
|
|
|
**Cohort ledger** (`tidal/src/cohort/ledger.rs`, in `record`):
|
|
|
|
```rust
|
|
// After hot.on_signal, before warm.increment:
|
|
#[cfg(any(test, feature = "test-utils"))]
|
|
crate::testing::crash_injector::check_crash_point(
|
|
crate::testing::CrashPoint::CohortLedgerUpdate,
|
|
);
|
|
```
|
|
|
|
**Collection index** (`tidal/src/db/collections.rs`, in `add_to_collection`):
|
|
|
|
```rust
|
|
// After fjall put, before in-memory bitmap update:
|
|
#[cfg(any(test, feature = "test-utils"))]
|
|
crate::testing::crash_injector::check_crash_point(
|
|
crate::testing::CrashPoint::CollectionIndexUpdate,
|
|
);
|
|
```
|
|
|
|
**Co-engagement index** (`tidal/src/entities/co_engagement.rs`, in `record_positive`):
|
|
|
|
```rust
|
|
// After edge insertion loop, before eviction check:
|
|
#[cfg(any(test, feature = "test-utils"))]
|
|
crate::testing::crash_injector::check_crash_point(
|
|
crate::testing::CrashPoint::CoEngagementUpdate,
|
|
);
|
|
```
|
|
|
|
### 6. Test helper: `run_with_crash`
|
|
|
|
```rust
|
|
// tidal/src/testing/crash_injector.rs
|
|
|
|
use std::panic::{AssertUnwindSafe, catch_unwind};
|
|
|
|
/// Execute a closure with a crash injector installed.
|
|
///
|
|
/// Returns `Ok(T)` if the closure completes normally, or `Err(CrashPoint)`
|
|
/// if the injector fired (simulated crash). In the Err case, the
|
|
/// injector is automatically cleared from the thread-local slot.
|
|
// Takes `&Arc` not `Arc` so callers can inspect `injector.has_fired()` after
|
|
// the call without needing to clone first.
|
|
pub fn run_with_crash<T, F: FnOnce() -> T>(
|
|
injector: &Arc<CrashInjector>,
|
|
f: F,
|
|
) -> Result<T, CrashPoint> {
|
|
let target = injector.target;
|
|
install_injector(Arc::clone(injector));
|
|
|
|
let result = catch_unwind(AssertUnwindSafe(f));
|
|
|
|
clear_injector();
|
|
|
|
match result {
|
|
Ok(val) => Ok(val),
|
|
Err(_) if injector.has_fired() => Err(target),
|
|
Err(payload) => std::panic::resume_unwind(payload),
|
|
}
|
|
}
|
|
```
|
|
|
|
## Acceptance Criteria
|
|
|
|
- [ ] `CrashPoint` enum with 8 variants: `WalPreAggregate`, `WalPostAggregate`, `CheckpointPreFlush`, `CheckpointPostFlush`, `SignalAggregationUpdate`, `CohortLedgerUpdate`, `CollectionIndexUpdate`, `CoEngagementUpdate`
|
|
- [ ] `CrashInjector::new(target, fire_after_n)` creates a one-shot injector
|
|
- [ ] `CrashInjector::maybe_crash(point)` panics when the crossing count reaches `fire_after_n`
|
|
- [ ] `CrashInjector::arm()` / `disarm()` control whether the injector can fire
|
|
- [ ] `CrashInjector::has_fired()` returns whether the injector has triggered
|
|
- [ ] Thread-local `install_injector` / `clear_injector` / `check_crash_point` functions
|
|
- [ ] `run_with_crash(&injector, closure)` helper catches injector panics and returns `Err(CrashPoint)`
|
|
- [ ] Hook sites added at all 8 write-path locations behind `#[cfg(any(test, feature = "test-utils"))]`
|
|
- [ ] `tidal/src/testing/mod.rs` module gated behind `#[cfg(any(test, feature = "test-utils"))]`
|
|
- [ ] Zero overhead in release builds -- all hooks compile away
|
|
- [ ] Unit tests: `injector_fires_at_threshold`, `injector_ignores_wrong_crash_point`, `injector_one_shot`, `injector_disarm_prevents_fire`, `run_with_crash_returns_err_on_fire`, `run_with_crash_returns_ok_on_complete`
|
|
- [ ] `cargo clippy -D warnings` and `cargo fmt` pass
|
|
|
|
## Test Strategy
|
|
|
|
```rust
|
|
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// The injector lets `fire_after_n` crossings through and fires on the next one.
    #[test]
    fn injector_fires_at_threshold() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 3);
        // With `fire_after_n = 3` the injector fires once the pre-increment
        // count reaches 3: the first three crossings (counts 0, 1, 2) pass
        // through, and the fourth crossing (count 3) fires.
        inj.maybe_crash(CrashPoint::WalPreAggregate);
        inj.maybe_crash(CrashPoint::WalPreAggregate);
        inj.maybe_crash(CrashPoint::WalPreAggregate);
        assert!(!inj.has_fired());
        assert_eq!(inj.crossing_count(), 3);

        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            inj.maybe_crash(CrashPoint::WalPreAggregate);
        }));
        assert!(result.is_err());
        assert!(inj.has_fired());
    }

    /// Crossing a crash point other than the target never fires.
    #[test]
    fn injector_ignores_wrong_crash_point() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 0);
        // Different crash point -- should not fire.
        inj.maybe_crash(CrashPoint::CheckpointPreFlush);
        inj.maybe_crash(CrashPoint::CohortLedgerUpdate);
        assert!(!inj.has_fired());
    }

    /// After firing once, further crossings are harmless.
    #[test]
    fn injector_one_shot() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 0);
        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            inj.maybe_crash(CrashPoint::WalPreAggregate);
        }));
        assert!(result.is_err());
        assert!(inj.has_fired());

        // Second crossing should not panic (one-shot).
        inj.maybe_crash(CrashPoint::WalPreAggregate);
    }

    /// A disarmed injector does not fire; re-arming restores triggering.
    #[test]
    fn injector_disarm_prevents_fire() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 0);
        inj.disarm();
        inj.maybe_crash(CrashPoint::WalPreAggregate); // should not panic
        assert!(!inj.has_fired());

        inj.arm();
        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            inj.maybe_crash(CrashPoint::WalPreAggregate);
        }));
        assert!(result.is_err());
    }

    /// A closure that never reaches the threshold returns its value in `Ok`.
    #[test]
    fn run_with_crash_returns_ok_on_complete() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 1000);
        // `run_with_crash` borrows the `Arc`, so pass `&inj`.
        let result = run_with_crash(&inj, || 42);
        assert_eq!(result, Ok(42));
        assert!(!inj.has_fired());
    }

    /// A simulated crash inside the closure surfaces as `Err(target)`.
    #[test]
    fn run_with_crash_returns_err_on_fire() {
        let inj = CrashInjector::new(CrashPoint::WalPreAggregate, 0);
        let result = run_with_crash(&inj, || {
            check_crash_point(CrashPoint::WalPreAggregate);
            42
        });
        assert_eq!(result, Err(CrashPoint::WalPreAggregate));
        // The injector is still owned here because it was only borrowed.
        assert!(inj.has_fired());
    }
}
|
|
```
|