stemedb/crates/stemedb-wal/src/durability.rs
jordan 3cfaa1e1d3 feat: Complete Phase 1 (The Spine) - storage foundation
Phase 1 delivers the complete durability and storage layer:

- WAL with crash recovery: Append-only journal with BLAKE3 checksums,
  fsync guarantees, and proper seek-to-EOF on reopen
- Storage engine: sled-backed KVStore with scan_prefix for range queries
- Content-addressed storage: H:{hash}, V:{hash}, E:{hash} key patterns
- Ingestor: Background worker tailing WAL, writing to KV with 8-byte
  aligned record headers for rkyv zero-copy deserialization
- Comprehensive tests: 31 tests covering crash recovery, round-trips,
  and multi-cycle durability

New crates: stemedb-wal, stemedb-storage, stemedb-ingest

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 14:15:34 -07:00

376 lines
12 KiB
Rust

//! fsync semantics and durability primitives.
//!
//! This module provides the fsync discipline for quarantine journal files.
//! It defines when and how data is durably persisted to disk.
//!
//! # Durability Levels
//!
//! - **Immediate**: fsync after every write (safest, slowest)
//! - **Batched**: fsync after N writes or T time (balanced)
//! - **Eventual**: fsync only on close (fastest, least safe)
use crate::error::{QuarantineError, Result};
use fs2::FileExt;
use std::fs::File;
use std::io::{self, Write};
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
use tracing::{debug, instrument};
/// Default fsync timeout in seconds.
///
/// `FsyncGuard::new` turns this into the guard's initial `timeout` value.
/// The timeout is currently only stored on the guard; no sync path in this
/// module enforces it yet.
pub const DEFAULT_FSYNC_TIMEOUT_SECS: u64 = 5;
/// Default batch size for batched durability.
///
/// This is the `max_writes` threshold used by `DurabilityLevel::batched()`.
pub const DEFAULT_BATCH_SIZE: usize = 100;
/// Default batch time window.
///
/// This is the `max_duration` threshold used by `DurabilityLevel::batched()`.
pub const DEFAULT_BATCH_DURATION: Duration = Duration::from_millis(10);
/// Durability level for write operations.
///
/// Controls when fsync is called to ensure data is persisted to disk.
/// `FsyncGuard::maybe_sync` consults this value to decide whether a sync is
/// due. The default level is `Immediate`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum DurabilityLevel {
/// fsync after every write operation.
///
/// - Highest durability guarantee
/// - Lowest throughput
/// - Use for critical data that cannot be lost
#[default]
Immediate,
/// fsync once `max_writes` writes are pending OR `max_duration` has passed
/// since the last sync, whichever comes first.
///
/// - Good balance of durability and throughput
/// - Configurable trade-off
/// - Recommended for most use cases
Batched {
/// Maximum writes before fsync.
max_writes: usize,
/// Maximum time since the previous sync before fsync.
max_duration: Duration,
},
/// fsync only on explicit flush or close.
///
/// - Highest throughput
/// - Data may be lost on crash
/// - Use only for non-critical or reconstructible data
Eventual,
}
impl DurabilityLevel {
/// Create a batched durability level with defaults.
pub fn batched() -> Self {
Self::Batched { max_writes: DEFAULT_BATCH_SIZE, max_duration: DEFAULT_BATCH_DURATION }
}
/// Create a batched durability level with custom parameters.
pub fn batched_with(max_writes: usize, max_duration: Duration) -> Self {
Self::Batched { max_writes, max_duration }
}
}
/// Guard that ensures file is synced on drop.
///
/// This struct wraps a file handle and tracks pending writes.
/// When dropped, it attempts to sync any pending data.
pub struct FsyncGuard {
/// Handle that receives writes and is synced to disk.
file: File,
/// Path reported in errors and tracing fields; the guard never reopens it.
path: PathBuf,
/// Policy deciding when a write is followed by a sync.
level: DurabilityLevel,
/// Number of `write` calls since the last successful sync.
pending_writes: usize,
/// Time the guard was created or last synced, whichever is later.
last_sync: Instant,
/// Configured fsync timeout; stored but not yet consulted.
#[allow(dead_code)] // Reserved for future timeout logic
timeout: Duration,
}
impl FsyncGuard {
/// Create a new fsync guard for the given file.
pub fn new(file: File, path: PathBuf, level: DurabilityLevel) -> Self {
Self {
file,
path,
level,
pending_writes: 0,
last_sync: Instant::now(),
timeout: Duration::from_secs(DEFAULT_FSYNC_TIMEOUT_SECS),
}
}
/// Set the fsync timeout.
pub fn with_timeout(mut self, timeout: Duration) -> Self {
self.timeout = timeout;
self
}
/// Write data to the file and potentially sync based on durability level.
pub fn write(&mut self, data: &[u8]) -> Result<()> {
self.file.write_all(data).map_err(|e| QuarantineError::io(&self.path, e))?;
self.pending_writes += 1;
self.maybe_sync()
}
/// Check if sync is needed based on durability level and trigger if so.
pub fn maybe_sync(&mut self) -> Result<()> {
let should_sync = match self.level {
DurabilityLevel::Immediate => self.pending_writes > 0,
DurabilityLevel::Batched { max_writes, max_duration } => {
self.pending_writes >= max_writes || self.last_sync.elapsed() >= max_duration
}
DurabilityLevel::Eventual => false,
};
if should_sync {
self.force_sync()?;
}
Ok(())
}
/// Force an fsync regardless of durability level.
#[instrument(skip(self), fields(pending = self.pending_writes))]
pub fn force_sync(&mut self) -> Result<()> {
self.sync_file()?;
self.pending_writes = 0;
self.last_sync = Instant::now();
debug!("Forced sync complete");
Ok(())
}
/// Get the underlying file reference.
pub fn file(&self) -> &File {
&self.file
}
/// Get a mutable reference to the underlying file.
pub fn file_mut(&mut self) -> &mut File {
&mut self.file
}
/// Get the file path.
pub fn path(&self) -> &Path {
&self.path
}
/// Get the current durability level.
pub fn level(&self) -> DurabilityLevel {
self.level
}
/// Get the number of pending (unsynced) writes.
pub fn pending_writes(&self) -> usize {
self.pending_writes
}
/// Acquire an exclusive lock on the file.
#[instrument(skip(self), fields(path = %self.path.display()))]
pub fn lock_exclusive(&self) -> Result<()> {
self.file.lock_exclusive().map_err(|e| {
if e.kind() == io::ErrorKind::WouldBlock {
QuarantineError::FileLocked { path: self.path.clone() }
} else {
QuarantineError::io(&self.path, e)
}
})?;
debug!("Acquired exclusive lock");
Ok(())
}
/// Try to acquire an exclusive lock without blocking.
pub fn try_lock_exclusive(&self) -> Result<bool> {
match self.file.try_lock_exclusive() {
Ok(()) => Ok(true),
Err(e) if e.kind() == io::ErrorKind::WouldBlock => Ok(false),
Err(e) => Err(QuarantineError::io(&self.path, e)),
}
}
/// Release the file lock.
#[allow(clippy::incompatible_msrv)]
pub fn unlock(&self) -> Result<()> {
self.file.unlock().map_err(|e| QuarantineError::io(&self.path, e))
}
/// Perform the actual fsync operation.
fn sync_file(&self) -> Result<()> {
// Use sync_data (fdatasync) when we only need data durability,
// not metadata like modification time.
self.file
.sync_data()
.map_err(|e| QuarantineError::FsyncFailed { path: self.path.clone(), source: e })
}
}
impl Drop for FsyncGuard {
    /// Flush any unsynced writes when the guard goes out of scope.
    ///
    /// `Drop` cannot return an error, so a failed sync is reported through
    /// `tracing::error!` and otherwise ignored.
    fn drop(&mut self) {
        // Nothing was written since the last sync: no work to do.
        if self.pending_writes == 0 {
            return;
        }
        match self.force_sync() {
            Ok(()) => {}
            Err(e) => {
                tracing::error!(
                    path = %self.path.display(),
                    error = %e,
                    pending_writes = self.pending_writes,
                    "Failed to sync file on drop"
                );
            }
        }
    }
}
/// Flush a directory entry table to disk.
///
/// Opens `path` as a file handle and calls `sync_all` on it so that file
/// creations and renames inside the directory are made durable. This is
/// necessary for crash-safe file operations on some filesystems.
///
/// # Errors
///
/// Returns an I/O error tagged with `path` if the directory cannot be opened,
/// or `QuarantineError::FsyncFailed` if the sync itself fails.
pub fn sync_directory(path: &Path) -> Result<()> {
    let handle = File::open(path).map_err(|open_err| QuarantineError::io(path, open_err))?;
    handle
        .sync_all()
        .map_err(|source| QuarantineError::FsyncFailed { path: path.to_owned(), source })
}
/// Perform an atomic file write (write to temp, sync, rename).
///
/// This ensures the file either exists completely or not at all.
pub fn atomic_write(path: &Path, contents: &[u8]) -> Result<()> {
let temp_path = path.with_extension("tmp");
// Write to temporary file
let mut file = File::create(&temp_path).map_err(|e| QuarantineError::io(&temp_path, e))?;
file.write_all(contents).map_err(|e| QuarantineError::io(&temp_path, e))?;
file.sync_all()
.map_err(|e| QuarantineError::FsyncFailed { path: temp_path.clone(), source: e })?;
drop(file);
// Rename atomically
std::fs::rename(&temp_path, path).map_err(|e| QuarantineError::io(path, e))?;
// Sync parent directory
if let Some(parent) = path.parent() {
sync_directory(parent)?;
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::{tempdir, TempDir};

    /// Builds a scratch directory containing an empty `test.quarantine` file.
    ///
    /// The `TempDir` is handed back so the caller keeps the directory alive
    /// for the duration of the test.
    fn create_test_file() -> (TempDir, std::path::PathBuf, File) {
        let scratch = tempdir().unwrap();
        let file_path = scratch.path().join("test.quarantine");
        let handle = File::create(&file_path).unwrap();
        (scratch, file_path, handle)
    }

    /// The derived `Default` must pick the safest level.
    #[test]
    fn test_durability_level_default() {
        assert_eq!(DurabilityLevel::default(), DurabilityLevel::Immediate);
    }

    /// `batched()` must carry the crate-wide default thresholds.
    #[test]
    fn test_durability_level_batched() {
        if let DurabilityLevel::Batched { max_writes, max_duration } = DurabilityLevel::batched() {
            assert_eq!(max_writes, DEFAULT_BATCH_SIZE);
            assert_eq!(max_duration, DEFAULT_BATCH_DURATION);
        } else {
            panic!("Expected Batched");
        }
    }

    /// Under `Immediate`, a single write leaves nothing pending.
    #[test]
    fn test_fsync_guard_immediate() {
        let (_scratch, file_path, handle) = create_test_file();
        let mut guard = FsyncGuard::new(handle, file_path.clone(), DurabilityLevel::Immediate);

        guard.write(b"hello").unwrap();

        // The write must have been followed by a sync.
        assert_eq!(guard.pending_writes(), 0);
        // And the bytes must be readable back from disk.
        assert_eq!(std::fs::read(&file_path).unwrap(), b"hello");
    }

    /// Under `Batched`, the sync fires exactly when the write threshold is hit.
    #[test]
    fn test_fsync_guard_batched() {
        let (_scratch, file_path, handle) = create_test_file();
        let mut guard = FsyncGuard::new(
            handle,
            file_path,
            DurabilityLevel::batched_with(3, Duration::from_secs(60)),
        );

        guard.write(b"1").unwrap();
        assert_eq!(guard.pending_writes(), 1);

        guard.write(b"2").unwrap();
        assert_eq!(guard.pending_writes(), 2);

        // Third write reaches the threshold and triggers the sync.
        guard.write(b"3").unwrap();
        assert_eq!(guard.pending_writes(), 0);
    }

    /// Under `Eventual`, writes accumulate until an explicit `force_sync`.
    #[test]
    fn test_fsync_guard_eventual() {
        let (_scratch, file_path, handle) = create_test_file();
        let mut guard = FsyncGuard::new(handle, file_path, DurabilityLevel::Eventual);

        (0u8..100).for_each(|byte| guard.write(&[byte]).unwrap());
        // No automatic sync happened.
        assert_eq!(guard.pending_writes(), 100);

        guard.force_sync().unwrap();
        assert_eq!(guard.pending_writes(), 0);
    }

    /// Dropping a guard with pending writes must leave the data on disk.
    #[test]
    fn test_fsync_guard_drop_syncs() {
        let scratch = tempdir().unwrap();
        let file_path = scratch.path().join("test.quarantine");

        let handle = File::create(&file_path).unwrap();
        let mut guard = FsyncGuard::new(handle, file_path.clone(), DurabilityLevel::Eventual);
        guard.write(b"test data").unwrap();
        // Dropping the guard is what should trigger the sync.
        drop(guard);

        assert_eq!(std::fs::read(&file_path).unwrap(), b"test data");
    }

    /// `atomic_write` must publish the contents and leave no temp file behind.
    #[test]
    fn test_atomic_write() {
        let scratch = tempdir().unwrap();
        let target = scratch.path().join("atomic.txt");

        atomic_write(&target, b"atomic content").unwrap();

        assert_eq!(std::fs::read(&target).unwrap(), b"atomic content");
        // Temp file should not exist
        assert!(!target.with_extension("tmp").exists());
    }

    /// A second handle cannot take the lock until the first releases it.
    #[test]
    fn test_file_locking() {
        let scratch = tempdir().unwrap();
        let file_path = scratch.path().join("locked.quarantine");

        let first_handle = File::create(&file_path).unwrap();
        let holder = FsyncGuard::new(first_handle, file_path.clone(), DurabilityLevel::Immediate);
        holder.lock_exclusive().unwrap();

        // A separate handle on the same file must observe the contention.
        let second_handle = File::open(&file_path).unwrap();
        let contender = FsyncGuard::new(second_handle, file_path, DurabilityLevel::Immediate);
        assert!(!contender.try_lock_exclusive().unwrap());

        holder.unlock().unwrap();
        assert!(contender.try_lock_exclusive().unwrap());
    }
}