tidaldb/tidal/src/wal/writer.rs
jordan 29400d48db feat: implement Milestone 1 phases 1-3 — schema, WAL, and storage layer
Implements the foundation of tidalDB's data pipeline:

**Phase 1 – Schema primitives**
- EntityId newtype (u64, big-endian ordering)
- SignalTypeDefinition with pre-computed decay λ, deduped/sorted windows
- SchemaBuilder with full constraint validation (duplicates, identifiers,
  half-life, windows, velocity)
- LumenError wrapping all subsystems with required From impls

**Phase 2 – Write-Ahead Log**
- Length-prefixed, BLAKE3-protected entry format
- Group-commit writer (batch up to 100 events / 10 ms)
- Double-buffered content-hash deduplication
- Checkpoint, truncation, and crash-recovery with full replay
- Integration, property, and UAT tests (incl. 5,500-event deterministic UAT)
- Proptest coverage scaled to 10,000 events/run (was ≤500) to meet the
  acceptance criterion; cases reduced 100→10 to keep runtime comparable

**Phase 3 – Storage engine**
- StorageEngine trait (get/put/delete/scan/batch/flush)
- Key encoding: [EntityId][0x00][Tag][suffix] with ordering/prefix helpers
- InMemoryBackend (BTreeMap + RwLock)
- FjallStorage with three isolated keyspaces and atomic batch helper
- Property tests for key ordering and round-trip correctness

Also adds planning docs for phases 4-5, research docs, architecture
overview, and roadmap updates.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-20 16:43:24 -07:00

452 lines
16 KiB
Rust

use std::path::PathBuf;
use std::time::{Duration, Instant};
use crossbeam::channel::Receiver;
use super::dedup::DedupWindow;
use super::error::WalError;
use super::format::{self, EventRecord};
use super::segment::{self, SegmentWriter};
/// Commands sent from `WalHandle` to the writer thread.
///
/// Every variant is consumed by `run_writer`. Variants that expect an answer
/// carry their own reply channel; the writer ignores a failed reply send, so
/// a caller that has stopped listening does not disturb the writer.
pub enum WalCommand {
/// Append a signal event. The reply channel receives the assigned
/// sequence number (or an error) once the batch containing this
/// event has been durably fsynced.
///
/// Events that the writer's dedup window reports as duplicates are not
/// written; their reply receives `Ok(0)` as a sentinel instead of a real
/// sequence number.
Append {
/// The event to persist.
event: EventRecord,
/// Receives the assigned sequence number, `Ok(0)` for a deduplicated
/// event, or the error that prevented the batch from being written.
reply: crossbeam::channel::Sender<Result<u64, WalError>>,
},
/// Delete segments whose first sequence number is less than `before_seq`.
/// Runs inside the writer thread to avoid racing with concurrent writes.
TruncateBefore {
/// Exclusive upper bound passed to `segment::delete_segments_before`.
before_seq: u64,
/// Receives `Ok(())` once deletion finished, or the deletion error.
reply: crossbeam::channel::Sender<Result<(), WalError>>,
},
/// Graceful shutdown: flush remaining events and exit.
///
/// Appends already queued in the channel when the writer observes this
/// command are still drained, written, and answered before it returns.
Shutdown,
}
/// Configuration for the group commit writer.
///
/// All fields are plain standard-library values, so the struct derives
/// `Debug` (for logging/diagnostics) and `Clone` (so one configuration can be
/// handed to several components).
#[derive(Debug, Clone)]
pub struct WriterConfig {
    /// Directory holding the WAL segment files; also the directory that
    /// `TruncateBefore` commands delete old segments from.
    pub dir: PathBuf,
    /// Segment size setting.
    ///
    /// NOTE(review): not read by `run_writer` itself — presumably consumed
    /// where the `SegmentWriter` is opened; confirm against the caller.
    pub segment_size: u64,
    /// Maximum number of append commands collected into one batch before it
    /// is written and fsynced.
    pub batch_size: usize,
    /// Maximum time to keep collecting further commands after the first
    /// event of a batch has arrived.
    pub batch_timeout: Duration,
    /// Deduplication window length.
    ///
    /// NOTE(review): not read by `run_writer` itself — presumably consumed
    /// where the `DedupWindow` is constructed; confirm against the caller.
    pub dedup_window: Duration,
}
/// The group commit writer loop.
///
/// Runs on a dedicated thread. Receives events via crossbeam channel,
/// accumulates them into batches, writes batches to the WAL segment,
/// and fsyncs once per batch. Callers are notified of their sequence
/// numbers via per-event reply channels.
///
/// # Batch formation
///
/// 1. Block until the first event arrives.
/// 2. Drain additional events from the channel up to `batch_size` or
/// until `batch_timeout` elapses (whichever comes first).
/// 3. Deduplicate events, encode the batch, write to segment, fsync.
/// 4. Send sequence numbers back to all waiting callers.
///
/// # Errors
///
/// Returns `WalError::Io` on filesystem failure during batch writes or fsync.
/// Returns `WalError::Corruption` if batch encoding fails (should not happen
/// under normal operation).
///
/// # Panics
///
/// Panics if the system clock is before the Unix epoch (same as `Timestamp::now()`).
// The function exceeds 100 lines due to the shutdown-drain path (B-3 fix).
// Extracting a helper would require restructuring the module, which is outside
// the scope of these targeted fixes.
#[allow(clippy::too_many_lines)]
pub fn run_writer(
rx: &Receiver<WalCommand>,
config: &WriterConfig,
mut segment: SegmentWriter,
start_seq: u64,
mut dedup: DedupWindow,
) -> Result<(), WalError> {
let mut next_seq = start_seq;
let mut batch: Vec<(
EventRecord,
crossbeam::channel::Sender<Result<u64, WalError>>,
)> = Vec::with_capacity(config.batch_size);
let mut shutdown_requested = false;
loop {
// Block until the first event arrives (or shutdown/disconnect)
match rx.recv() {
Ok(WalCommand::Append { event, reply }) => {
batch.push((event, reply));
}
Ok(WalCommand::TruncateBefore { before_seq, reply }) => {
let result = segment::delete_segments_before(&config.dir, before_seq);
let _ = reply.send(result.map(|_| ()));
continue;
}
Ok(WalCommand::Shutdown) | Err(_) => {
break;
}
}
// Drain up to batch_size with deadline
let deadline = Instant::now() + config.batch_timeout;
while batch.len() < config.batch_size {
match rx.recv_deadline(deadline) {
Ok(WalCommand::Append { event, reply }) => {
batch.push((event, reply));
}
Ok(WalCommand::TruncateBefore { before_seq, reply }) => {
let result = segment::delete_segments_before(&config.dir, before_seq);
let _ = reply.send(result.map(|_| ()));
// Continue draining the batch; truncation is a side-effect,
// not a batch-terminating event.
}
Ok(WalCommand::Shutdown)
| Err(crossbeam::channel::RecvTimeoutError::Disconnected) => {
shutdown_requested = true;
break;
}
Err(crossbeam::channel::RecvTimeoutError::Timeout) => break,
}
}
// Deduplicate and separate into kept events and duplicate replies
let mut kept_events: Vec<EventRecord> = Vec::with_capacity(batch.len());
let mut kept_replies: Vec<crossbeam::channel::Sender<Result<u64, WalError>>> =
Vec::with_capacity(batch.len());
let mut dup_replies: Vec<crossbeam::channel::Sender<Result<u64, WalError>>> = Vec::new();
// drain(..) is intentional: we reuse batch's heap allocation across loop iterations.
#[allow(clippy::iter_with_drain)]
for (event, reply) in batch.drain(..) {
if dedup.is_duplicate(&event) {
dup_replies.push(reply);
} else {
kept_events.push(event);
kept_replies.push(reply);
}
}
// Notify duplicate senders with seq=0 (sentinel for dedup).
for reply in dup_replies {
let _ = reply.send(Ok(0));
}
// Write the batch if there are any non-duplicate events
if !kept_events.is_empty() {
let batch_seq = next_seq;
let batch_ts = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.expect("system clock is before Unix epoch")
.as_nanos();
#[allow(clippy::cast_possible_truncation)]
let batch_ts_u64 = batch_ts as u64;
// Wrap the write path in a closure so we can notify callers of
// the specific error before propagating it. Without this, an
// early `?` return would drop pending reply channels, leaving
// callers blocked forever (or receiving a generic Closed error
// instead of the real I/O error).
let write_result = (|| -> Result<u64, WalError> {
let encoded = format::encode_batch(&kept_events, batch_seq, batch_ts_u64)?;
if segment.needs_rotation() {
segment.rotate(batch_seq)?;
}
segment.write_batch_bytes(&encoded)?;
segment.sync()?;
Ok(batch_seq)
})();
match write_result {
Ok(_) => {
let event_count = kept_events.len() as u64;
segment.set_last_seq(batch_seq + event_count - 1);
for (i, reply) in kept_replies.into_iter().enumerate() {
let _ = reply.send(Ok(batch_seq + i as u64));
}
next_seq = batch_seq + event_count;
}
Err(ref err) => {
// Notify all waiting callers with the actual error before
// propagating. We cannot clone WalError, so we send a
// synthetic I/O error with the same description.
let err_msg = err.to_string();
for reply in kept_replies {
let _ =
reply.send(Err(WalError::Io(std::io::Error::other(err_msg.clone()))));
}
// write_result is known to be Err here; the Ok branch is
// handled above, so this else-branch is unreachable.
return Err(write_result
.expect_err("write_result is Err in this branch; Ok is handled above"));
}
}
}
if shutdown_requested {
break;
}
}
// Drain any remaining commands that arrived before senders observed
// the shutdown. This ensures in-flight append() calls are not silently
// dropped, which would cause callers to block forever or receive
// WalError::Closed instead of a real sequence number.
let mut final_batch: Vec<(
EventRecord,
crossbeam::channel::Sender<Result<u64, WalError>>,
)> = Vec::new();
loop {
match rx.try_recv() {
Ok(WalCommand::Append { event, reply }) => {
final_batch.push((event, reply));
}
Ok(WalCommand::TruncateBefore { before_seq, reply }) => {
let result = segment::delete_segments_before(&config.dir, before_seq);
let _ = reply.send(result.map(|_| ()));
}
Ok(WalCommand::Shutdown) => {
// Ignore duplicate shutdown commands
}
Err(
crossbeam::channel::TryRecvError::Empty
| crossbeam::channel::TryRecvError::Disconnected,
) => break,
}
}
// Flush the final drain batch if non-empty
if !final_batch.is_empty() {
let mut kept_events: Vec<EventRecord> = Vec::with_capacity(final_batch.len());
let mut kept_replies: Vec<crossbeam::channel::Sender<Result<u64, WalError>>> =
Vec::with_capacity(final_batch.len());
let mut dup_replies: Vec<crossbeam::channel::Sender<Result<u64, WalError>>> = Vec::new();
for (event, reply) in final_batch {
if dedup.is_duplicate(&event) {
dup_replies.push(reply);
} else {
kept_events.push(event);
kept_replies.push(reply);
}
}
for reply in dup_replies {
let _ = reply.send(Ok(0));
}
if !kept_events.is_empty() {
let batch_seq = next_seq;
let batch_ts = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.expect("system clock is before Unix epoch")
.as_nanos();
#[allow(clippy::cast_possible_truncation)]
let batch_ts_u64 = batch_ts as u64;
let encoded = format::encode_batch(&kept_events, batch_seq, batch_ts_u64)?;
if segment.needs_rotation() {
segment.rotate(batch_seq)?;
}
segment.write_batch_bytes(&encoded)?;
segment.sync()?;
let event_count = kept_events.len() as u64;
segment.set_last_seq(batch_seq + event_count - 1);
for (i, reply) in kept_replies.into_iter().enumerate() {
let _ = reply.send(Ok(batch_seq + i as u64));
}
}
}
// Final sync before exit
segment.sync()?;
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use crossbeam::channel::bounded;

    /// Handle of a writer thread started by `spawn_writer`.
    type WriterThread = std::thread::JoinHandle<Result<(), WalError>>;

    /// Builds a minimal event; only the entity id varies between tests.
    fn make_event(id: u64) -> EventRecord {
        EventRecord {
            entity_id: id,
            signal_type: 1,
            weight: 1.0,
            timestamp_nanos: 1_000_000_000,
        }
    }

    /// Opens a fresh segment in `dir` and returns it together with the dedup
    /// window and writer configuration shared by every test in this module.
    fn writer_parts(dir: &std::path::Path) -> (SegmentWriter, DedupWindow, WriterConfig) {
        let segment =
            SegmentWriter::open(dir, 1, 16 * 1024 * 1024).expect("open should succeed");
        let dedup = DedupWindow::new(Duration::from_secs(30));
        let config = WriterConfig {
            dir: dir.to_path_buf(),
            segment_size: 16 * 1024 * 1024,
            batch_size: 100,
            batch_timeout: Duration::from_millis(10),
            dedup_window: Duration::from_secs(30),
        };
        (segment, dedup, config)
    }

    /// Queues an append for `event` and returns the receiver for its reply.
    fn enqueue(
        tx: &crossbeam::channel::Sender<WalCommand>,
        event: EventRecord,
    ) -> Receiver<Result<u64, WalError>> {
        let (reply_tx, reply_rx) = bounded(1);
        tx.send(WalCommand::Append {
            event,
            reply: reply_tx,
        })
        .expect("send should succeed");
        reply_rx
    }

    /// Runs the writer on its own thread, starting at sequence number 1.
    fn spawn_writer(
        rx: Receiver<WalCommand>,
        config: WriterConfig,
        segment: SegmentWriter,
        dedup: DedupWindow,
    ) -> WriterThread {
        std::thread::spawn(move || run_writer(&rx, &config, segment, 1, dedup))
    }

    /// Waits for the writer thread and asserts that it exited cleanly.
    fn finish(handle: WriterThread) {
        handle
            .join()
            .expect("thread should join")
            .expect("writer should succeed");
    }

    #[test]
    fn writer_processes_single_event() {
        let dir = tempfile::tempdir().expect("tempdir creation should succeed");
        let (tx, rx) = bounded(100);
        let (segment, dedup, config) = writer_parts(dir.path());

        let reply_rx = enqueue(&tx, make_event(42));
        tx.send(WalCommand::Shutdown).expect("send should succeed");

        let handle = spawn_writer(rx, config, segment, dedup);
        let seq = reply_rx
            .recv()
            .expect("should receive reply")
            .expect("should be ok");
        assert_eq!(seq, 1);

        finish(handle);
    }

    #[test]
    fn writer_deduplicates_events() {
        let dir = tempfile::tempdir().expect("tempdir creation should succeed");
        let (tx, rx) = bounded(100);
        let (segment, dedup, config) = writer_parts(dir.path());

        // The same event twice: the second copy must be deduplicated.
        let event = make_event(42);
        let first_rx = enqueue(&tx, event.clone());
        let second_rx = enqueue(&tx, event);
        tx.send(WalCommand::Shutdown).expect("send should succeed");

        let handle = spawn_writer(rx, config, segment, dedup);
        let first_seq = first_rx
            .recv()
            .expect("should receive")
            .expect("should be ok");
        let second_seq = second_rx
            .recv()
            .expect("should receive")
            .expect("should be ok");
        assert_eq!(first_seq, 1);
        assert_eq!(second_seq, 0); // deduplicated

        finish(handle);
    }

    #[test]
    fn writer_handles_channel_disconnect() {
        let dir = tempfile::tempdir().expect("tempdir creation should succeed");
        let (tx, rx) = bounded(100);
        let (segment, dedup, config) = writer_parts(dir.path());

        drop(tx); // Disconnect immediately

        let outcome = run_writer(&rx, &config, segment, 1, dedup);
        assert!(outcome.is_ok());
    }

    #[test]
    fn writer_assigns_monotonic_sequences() {
        let dir = tempfile::tempdir().expect("tempdir creation should succeed");
        let (tx, rx) = bounded(100);
        let (segment, dedup, config) = writer_parts(dir.path());

        let reply_rxs: Vec<_> = (0..5).map(|id| enqueue(&tx, make_event(id))).collect();
        tx.send(WalCommand::Shutdown).expect("send should succeed");

        let handle = spawn_writer(rx, config, segment, dedup);
        let seqs: Vec<u64> = reply_rxs
            .into_iter()
            .map(|reply_rx| {
                reply_rx
                    .recv()
                    .expect("should receive")
                    .expect("should be ok")
            })
            .collect();

        // Verify monotonically increasing
        assert!(
            seqs.windows(2).all(|pair| pair[0] < pair[1]),
            "seqs not monotonic: {seqs:?}"
        );
        assert_eq!(seqs[0], 1);

        finish(handle);
    }
}