Phase 1 delivers the complete durability and storage layer:
- WAL with crash recovery: Append-only journal with BLAKE3 checksums,
fsync guarantees, and proper seek-to-EOF on reopen
- Storage engine: sled-backed KVStore with scan_prefix for range queries
- Content-addressed storage: H:{hash}, V:{hash}, E:{hash} key patterns
- Ingestor: Background worker tailing WAL, writing to KV with 8-byte
aligned record headers for rkyv zero-copy deserialization
- Comprehensive tests: 31 tests covering crash recovery, round-trips,
and multi-cycle durability
New crates: stemedb-wal, stemedb-storage, stemedb-ingest
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
376 lines
12 KiB
Rust
376 lines
12 KiB
Rust
//! fsync semantics and durability primitives.
|
|
//!
|
|
//! This module provides the fsync discipline for quarantine journal files.
|
|
//! It defines when and how data is durably persisted to disk.
|
|
//!
|
|
//! # Durability Levels
|
|
//!
|
|
//! - **Immediate**: fsync after every write (safest, slowest)
|
|
//! - **Batched**: fsync after N writes or T time (balanced)
|
|
//! - **Eventual**: fsync only on close (fastest, least safe)
|
|
|
|
use crate::error::{QuarantineError, Result};
|
|
use fs2::FileExt;
|
|
use std::fs::File;
|
|
use std::io::{self, Write};
|
|
use std::path::{Path, PathBuf};
|
|
use std::time::{Duration, Instant};
|
|
use tracing::{debug, instrument};
|
|
|
|
/// Number of seconds used as the fsync timeout when none is supplied.
pub const DEFAULT_FSYNC_TIMEOUT_SECS: u64 = 5;

/// Write count that triggers an fsync under default batched durability.
pub const DEFAULT_BATCH_SIZE: usize = 100;

/// Time window after which default batched durability triggers an fsync.
pub const DEFAULT_BATCH_DURATION: Duration = Duration::from_millis(10);
|
|
|
|
/// Policy that decides when written data is fsynced to disk.
///
/// The default policy is [`DurabilityLevel::Immediate`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum DurabilityLevel {
    /// Sync after each individual write.
    ///
    /// Strongest durability guarantee at the cost of the lowest throughput;
    /// intended for critical data that cannot be lost.
    #[default]
    Immediate,

    /// Sync once `max_writes` writes have accumulated OR `max_duration` has
    /// passed since the previous sync, whichever comes first.
    ///
    /// A configurable middle ground between durability and throughput,
    /// recommended for most use cases.
    Batched {
        /// Upper bound on the number of writes between syncs.
        max_writes: usize,
        /// Upper bound on the time between syncs.
        max_duration: Duration,
    },

    /// Sync only on an explicit flush or when the file is closed.
    ///
    /// Highest throughput, but data may be lost on a crash; use only for
    /// non-critical or reconstructible data.
    Eventual,
}
|
|
|
|
impl DurabilityLevel {
|
|
/// Create a batched durability level with defaults.
|
|
pub fn batched() -> Self {
|
|
Self::Batched { max_writes: DEFAULT_BATCH_SIZE, max_duration: DEFAULT_BATCH_DURATION }
|
|
}
|
|
|
|
/// Create a batched durability level with custom parameters.
|
|
pub fn batched_with(max_writes: usize, max_duration: Duration) -> Self {
|
|
Self::Batched { max_writes, max_duration }
|
|
}
|
|
}
|
|
|
|
/// Guard that ensures file is synced on drop.
|
|
///
|
|
/// This struct wraps a file handle and tracks pending writes.
|
|
/// When dropped, it attempts to sync any pending data.
|
|
pub struct FsyncGuard {
|
|
file: File,
|
|
path: PathBuf,
|
|
level: DurabilityLevel,
|
|
pending_writes: usize,
|
|
last_sync: Instant,
|
|
#[allow(dead_code)] // Reserved for future timeout logic
|
|
timeout: Duration,
|
|
}
|
|
|
|
impl FsyncGuard {
|
|
/// Create a new fsync guard for the given file.
|
|
pub fn new(file: File, path: PathBuf, level: DurabilityLevel) -> Self {
|
|
Self {
|
|
file,
|
|
path,
|
|
level,
|
|
pending_writes: 0,
|
|
last_sync: Instant::now(),
|
|
timeout: Duration::from_secs(DEFAULT_FSYNC_TIMEOUT_SECS),
|
|
}
|
|
}
|
|
|
|
/// Set the fsync timeout.
|
|
pub fn with_timeout(mut self, timeout: Duration) -> Self {
|
|
self.timeout = timeout;
|
|
self
|
|
}
|
|
|
|
/// Write data to the file and potentially sync based on durability level.
|
|
pub fn write(&mut self, data: &[u8]) -> Result<()> {
|
|
self.file.write_all(data).map_err(|e| QuarantineError::io(&self.path, e))?;
|
|
self.pending_writes += 1;
|
|
self.maybe_sync()
|
|
}
|
|
|
|
/// Check if sync is needed based on durability level and trigger if so.
|
|
pub fn maybe_sync(&mut self) -> Result<()> {
|
|
let should_sync = match self.level {
|
|
DurabilityLevel::Immediate => self.pending_writes > 0,
|
|
DurabilityLevel::Batched { max_writes, max_duration } => {
|
|
self.pending_writes >= max_writes || self.last_sync.elapsed() >= max_duration
|
|
}
|
|
DurabilityLevel::Eventual => false,
|
|
};
|
|
|
|
if should_sync {
|
|
self.force_sync()?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Force an fsync regardless of durability level.
|
|
#[instrument(skip(self), fields(pending = self.pending_writes))]
|
|
pub fn force_sync(&mut self) -> Result<()> {
|
|
self.sync_file()?;
|
|
self.pending_writes = 0;
|
|
self.last_sync = Instant::now();
|
|
debug!("Forced sync complete");
|
|
Ok(())
|
|
}
|
|
|
|
/// Get the underlying file reference.
|
|
pub fn file(&self) -> &File {
|
|
&self.file
|
|
}
|
|
|
|
/// Get a mutable reference to the underlying file.
|
|
pub fn file_mut(&mut self) -> &mut File {
|
|
&mut self.file
|
|
}
|
|
|
|
/// Get the file path.
|
|
pub fn path(&self) -> &Path {
|
|
&self.path
|
|
}
|
|
|
|
/// Get the current durability level.
|
|
pub fn level(&self) -> DurabilityLevel {
|
|
self.level
|
|
}
|
|
|
|
/// Get the number of pending (unsynced) writes.
|
|
pub fn pending_writes(&self) -> usize {
|
|
self.pending_writes
|
|
}
|
|
|
|
/// Acquire an exclusive lock on the file.
|
|
#[instrument(skip(self), fields(path = %self.path.display()))]
|
|
pub fn lock_exclusive(&self) -> Result<()> {
|
|
self.file.lock_exclusive().map_err(|e| {
|
|
if e.kind() == io::ErrorKind::WouldBlock {
|
|
QuarantineError::FileLocked { path: self.path.clone() }
|
|
} else {
|
|
QuarantineError::io(&self.path, e)
|
|
}
|
|
})?;
|
|
debug!("Acquired exclusive lock");
|
|
Ok(())
|
|
}
|
|
|
|
/// Try to acquire an exclusive lock without blocking.
|
|
pub fn try_lock_exclusive(&self) -> Result<bool> {
|
|
match self.file.try_lock_exclusive() {
|
|
Ok(()) => Ok(true),
|
|
Err(e) if e.kind() == io::ErrorKind::WouldBlock => Ok(false),
|
|
Err(e) => Err(QuarantineError::io(&self.path, e)),
|
|
}
|
|
}
|
|
|
|
/// Release the file lock.
|
|
#[allow(clippy::incompatible_msrv)]
|
|
pub fn unlock(&self) -> Result<()> {
|
|
self.file.unlock().map_err(|e| QuarantineError::io(&self.path, e))
|
|
}
|
|
|
|
/// Perform the actual fsync operation.
|
|
fn sync_file(&self) -> Result<()> {
|
|
// Use sync_data (fdatasync) when we only need data durability,
|
|
// not metadata like modification time.
|
|
self.file
|
|
.sync_data()
|
|
.map_err(|e| QuarantineError::FsyncFailed { path: self.path.clone(), source: e })
|
|
}
|
|
}
|
|
|
|
impl Drop for FsyncGuard {
|
|
fn drop(&mut self) {
|
|
// Best-effort sync on drop - we can't return errors from Drop
|
|
if self.pending_writes > 0 {
|
|
if let Err(e) = self.force_sync() {
|
|
tracing::error!(
|
|
path = %self.path.display(),
|
|
error = %e,
|
|
pending_writes = self.pending_writes,
|
|
"Failed to sync file on drop"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Sync a directory to ensure file creation is durable.
|
|
///
|
|
/// This is necessary for crash-safe file operations on some filesystems.
|
|
pub fn sync_directory(path: &Path) -> Result<()> {
|
|
let dir = File::open(path).map_err(|e| QuarantineError::io(path, e))?;
|
|
dir.sync_all().map_err(|e| QuarantineError::FsyncFailed { path: path.to_path_buf(), source: e })
|
|
}
|
|
|
|
/// Perform an atomic file write (write to temp, sync, rename).
|
|
///
|
|
/// This ensures the file either exists completely or not at all.
|
|
pub fn atomic_write(path: &Path, contents: &[u8]) -> Result<()> {
|
|
let temp_path = path.with_extension("tmp");
|
|
|
|
// Write to temporary file
|
|
let mut file = File::create(&temp_path).map_err(|e| QuarantineError::io(&temp_path, e))?;
|
|
file.write_all(contents).map_err(|e| QuarantineError::io(&temp_path, e))?;
|
|
file.sync_all()
|
|
.map_err(|e| QuarantineError::FsyncFailed { path: temp_path.clone(), source: e })?;
|
|
drop(file);
|
|
|
|
// Rename atomically
|
|
std::fs::rename(&temp_path, path).map_err(|e| QuarantineError::io(path, e))?;
|
|
|
|
// Sync parent directory
|
|
if let Some(parent) = path.parent() {
|
|
sync_directory(parent)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::{tempdir, TempDir};

    /// Builds a fresh temp directory containing an empty `test.quarantine`
    /// file and returns the directory handle, the file path and the file.
    ///
    /// The `TempDir` must be kept alive by the caller for the file to persist.
    fn create_test_file() -> (TempDir, PathBuf, File) {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.quarantine");
        let file = File::create(&path).unwrap();
        (dir, path, file)
    }

    /// The default durability level is `Immediate`.
    #[test]
    fn test_durability_level_default() {
        assert_eq!(DurabilityLevel::default(), DurabilityLevel::Immediate);
    }

    /// `batched()` uses the module-level default limits.
    #[test]
    fn test_durability_level_batched() {
        let expected = DurabilityLevel::Batched {
            max_writes: DEFAULT_BATCH_SIZE,
            max_duration: DEFAULT_BATCH_DURATION,
        };
        assert_eq!(DurabilityLevel::batched(), expected);
    }

    /// `Immediate` syncs after every write and the data reaches the file.
    #[test]
    fn test_fsync_guard_immediate() {
        let (_dir, path, file) = create_test_file();
        let mut guard = FsyncGuard::new(file, path.clone(), DurabilityLevel::Immediate);

        guard.write(b"hello").unwrap();
        // The write must have been synced right away.
        assert_eq!(guard.pending_writes(), 0);

        let on_disk = std::fs::read(&path).unwrap();
        assert_eq!(on_disk, b"hello");
    }

    /// `Batched` accumulates writes until the count threshold is reached.
    #[test]
    fn test_fsync_guard_batched() {
        let (_dir, path, file) = create_test_file();
        // Long duration so only the write count can trigger the sync.
        let level = DurabilityLevel::batched_with(3, Duration::from_secs(60));
        let mut guard = FsyncGuard::new(file, path, level);

        // (payload, pending writes expected afterwards); the third write syncs.
        let steps: [(&[u8], usize); 3] = [(b"1", 1), (b"2", 2), (b"3", 0)];
        for (payload, expected_pending) in steps {
            guard.write(payload).unwrap();
            assert_eq!(guard.pending_writes(), expected_pending);
        }
    }

    /// `Eventual` never syncs on its own; only `force_sync` clears the count.
    #[test]
    fn test_fsync_guard_eventual() {
        let (_dir, path, file) = create_test_file();
        let mut guard = FsyncGuard::new(file, path, DurabilityLevel::Eventual);

        for byte in 0u8..100 {
            guard.write(&[byte]).unwrap();
        }
        assert_eq!(guard.pending_writes(), 100);

        guard.force_sync().unwrap();
        assert_eq!(guard.pending_writes(), 0);
    }

    /// Dropping a guard with pending writes leaves the data in the file.
    #[test]
    fn test_fsync_guard_drop_syncs() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.quarantine");

        {
            let file = File::create(&path).unwrap();
            let mut guard = FsyncGuard::new(file, path.clone(), DurabilityLevel::Eventual);
            guard.write(b"test data").unwrap();
            // `guard` goes out of scope here and should sync.
        }

        let on_disk = std::fs::read(&path).unwrap();
        assert_eq!(on_disk, b"test data");
    }

    /// `atomic_write` produces the target file and leaves no temp file.
    #[test]
    fn test_atomic_write() {
        let dir = tempdir().unwrap();
        let target = dir.path().join("atomic.txt");

        atomic_write(&target, b"atomic content").unwrap();

        let on_disk = std::fs::read(&target).unwrap();
        assert_eq!(on_disk, b"atomic content");

        let leftover = target.with_extension("tmp");
        assert!(!leftover.exists());
    }

    /// A second handle cannot take the lock until the first releases it.
    #[test]
    fn test_file_locking() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("locked.quarantine");

        let holder =
            FsyncGuard::new(File::create(&path).unwrap(), path.clone(), DurabilityLevel::Immediate);
        holder.lock_exclusive().unwrap();

        // Independent handle on the same file.
        let second_handle = File::open(&path).unwrap();
        let contender = FsyncGuard::new(second_handle, path, DurabilityLevel::Immediate);
        assert!(!contender.try_lock_exclusive().unwrap());

        holder.unlock().unwrap();
        assert!(contender.try_lock_exclusive().unwrap());
    }
}
|