Milestone 8 (phases 1-4): - Shard-aware WAL segment naming, BatchHeader v2, ShardRouter - Transport trait, InProcessTransport, WalShipper, FollowerDb - HLC, PNCounter, LWWRegister, CrdtSignalState, ReconciliationEngine - Session replication bridge with SeqNo/HWM, idempotency store Forage application: - Multi-source discovery engine with MAB exploration - Embedding-based label system, server handlers, UI refresh Other: - QUICKSTART.md, README.md, milestone-8 planning docs - Hard negative union semantics, RLHF export enhancements - Recovery benchmark and visibility test expansions - Split 8 oversized source files per CODING_GUIDELINES §9 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
450 lines
14 KiB
Rust
450 lines
14 KiB
Rust
//! M8p2 WAL Replication integration tests.
|
|
//!
|
|
//! Tests the full replication pipeline: leader writes signals, segments are
|
|
//! shipped (or directly injected) to a follower, and the follower's ledger
|
|
//! reflects the replicated signals. Also verifies follower write rejection
|
|
//! and replication lag gauge.
|
|
#![allow(clippy::unwrap_used)]
|
|
|
|
use std::collections::HashMap;
|
|
use std::sync::Arc;
|
|
use std::time::Duration;
|
|
|
|
use tidaldb::db::config::{NodeConfig, NodeRole};
|
|
use tidaldb::replication::lag::ReplicationLagGauge;
|
|
use tidaldb::replication::shard::ShardId;
|
|
use tidaldb::replication::state::ReplicationState;
|
|
use tidaldb::replication::transport::{Transport, TransportError, WalSegmentPayload};
|
|
use tidaldb::replication::{InProcessTransportFactory, WalSegmentId};
|
|
use tidaldb::schema::{DecaySpec, EntityId, EntityKind, SchemaBuilder, Timestamp, Window};
|
|
use tidaldb::signals::{NoopWalWriter, SignalLedger};
|
|
use tidaldb::wal::format::batch::{EventRecord, encode_batch};
|
|
use tidaldb::{TidalDb, TidalError};
|
|
|
|
/// Build a minimal schema with one signal type.
|
|
fn make_schema() -> tidaldb::schema::Schema {
|
|
let mut builder = SchemaBuilder::new();
|
|
let _ = builder
|
|
.signal(
|
|
"view",
|
|
EntityKind::Item,
|
|
DecaySpec::Exponential {
|
|
half_life: Duration::from_secs(7 * 24 * 3600),
|
|
},
|
|
)
|
|
.windows(&[Window::AllTime])
|
|
.velocity(false)
|
|
.add();
|
|
builder.build().expect("schema must be valid")
|
|
}
|
|
|
|
/// Resolve the signal type ID for "view" using a throwaway ledger.
|
|
///
|
|
/// Signal type IDs are deterministic (alphabetically sorted, starting at 0).
|
|
/// For a schema with one signal type "view", the ID is always 0.
|
|
fn resolve_view_type_id(schema: &tidaldb::schema::Schema) -> tidaldb::signals::SignalTypeId {
|
|
let ledger = SignalLedger::new(schema.clone(), Box::new(NoopWalWriter));
|
|
ledger.resolve_signal_type("view").unwrap()
|
|
}
|
|
|
|
/// Open a follower TidalDb (ephemeral, follower role).
|
|
fn open_follower(schema: tidaldb::schema::Schema) -> TidalDb {
|
|
TidalDb::builder()
|
|
.ephemeral()
|
|
.with_schema(schema)
|
|
.with_cluster(NodeConfig {
|
|
role: NodeRole::Follower,
|
|
..NodeConfig::default()
|
|
})
|
|
.open()
|
|
.expect("follower should open")
|
|
}
|
|
|
|
/// Open a leader TidalDb (ephemeral, leader role).
|
|
fn open_leader(schema: tidaldb::schema::Schema) -> TidalDb {
|
|
TidalDb::builder()
|
|
.ephemeral()
|
|
.with_schema(schema)
|
|
.with_cluster(NodeConfig {
|
|
role: NodeRole::Leader,
|
|
..NodeConfig::default()
|
|
})
|
|
.open()
|
|
.expect("leader should open")
|
|
}
|
|
|
|
// ── Test 1: Follower rejects write calls ─────────────────────────────────
|
|
|
|
/// A follower must refuse signal writes with `TidalError::ReadOnly`.
#[test]
fn follower_rejects_signal_write() {
    let follower = open_follower(make_schema());

    let write_result = follower.signal("view", EntityId::new(1), 1.0, Timestamp::now());
    let err = write_result.expect_err("follower should reject signal writes");

    let is_read_only = matches!(err, TidalError::ReadOnly(_));
    assert!(is_read_only, "expected ReadOnly error, got: {err}");

    follower.close().unwrap();
}
|
|
|
|
/// A follower must refuse item metadata writes with `TidalError::ReadOnly`.
#[test]
fn follower_rejects_write_item() {
    let follower = open_follower(make_schema());

    let mut meta = HashMap::new();
    meta.insert("title".to_string(), "test".to_string());

    let write_result = follower.write_item_with_metadata(EntityId::new(1), &meta);
    let err = write_result.expect_err("follower should reject item writes");

    let is_read_only = matches!(err, TidalError::ReadOnly(_));
    assert!(is_read_only, "expected ReadOnly error, got: {err}");

    follower.close().unwrap();
}
|
|
|
|
// ── Test 2: Leader accepts writes ────────────────────────────────────────
|
|
|
|
/// A leader accepts a signal write, and the write is readable afterwards.
#[test]
fn leader_accepts_signal_write() {
    let leader = open_leader(make_schema());

    let write_result = leader.signal("view", EntityId::new(1), 1.0, Timestamp::now());
    write_result.expect("leader should accept signal writes");

    let recorded = leader
        .read_decay_score(EntityId::new(1), "view", 0)
        .expect("read should succeed")
        .is_some();
    assert!(recorded, "signal should have been recorded");

    leader.close().unwrap();
}
|
|
|
|
// ── Test 3: Direct payload injection into follower ledger ────────────────
|
|
|
|
/// A WAL segment delivered through a custom transport shows up in the
/// follower's ledger and advances its replication state.
///
/// `apply_payload` is private, so the payload is fed in through a
/// channel-backed `Transport` handed to `start_replication`.
#[test]
fn payload_injection_updates_follower_ledger() {
    /// Receive-only transport: `recv_segment` pulls from a crossbeam channel,
    /// `send_segment` accepts and discards.
    struct OneShotTransport {
        rx: crossbeam::channel::Receiver<WalSegmentPayload>,
    }

    impl Transport for OneShotTransport {
        fn send_segment(
            &self,
            _to: ShardId,
            _payload: WalSegmentPayload,
        ) -> Result<(), TransportError> {
            Ok(())
        }

        fn recv_segment(&self) -> Option<WalSegmentPayload> {
            // Yields `None` once every sender has been dropped.
            self.rx.recv().ok()
        }

        fn local_shard(&self) -> ShardId {
            ShardId::SINGLE
        }
    }

    let schema = make_schema();
    let follower = open_follower(schema.clone());

    // Resolve the signal type ID using a standalone ledger (same schema).
    let type_id = resolve_view_type_id(&schema);
    // Checked narrowing: fail loudly instead of silently truncating the ID.
    let signal_type =
        u8::try_from(type_id.as_u16()).expect("signal type id must fit in u8");

    let state = follower.replication_state().clone();

    // Build a WAL batch payload containing a single event.
    let events = vec![EventRecord {
        entity_id: 42,
        signal_type,
        weight: 3.0,
        timestamp_nanos: 1_000_000_000,
    }];
    let bytes = encode_batch(&events, 1, 1).unwrap();

    let (tx, rx) = crossbeam::channel::bounded(4);
    let transport = Arc::new(OneShotTransport { rx });
    follower.start_replication(Arc::clone(&transport)).unwrap();

    // Send the payload.
    tx.send(WalSegmentPayload {
        id: WalSegmentId::new(tidaldb::replication::RegionId::SINGLE, ShardId::SINGLE, 1),
        bytes,
        event_count: 1,
    })
    .unwrap();

    // Wait for the receiver with a bounded poll instead of a fixed sleep, so
    // the test is fast when replication is quick and tolerant when the
    // machine is loaded. On timeout the assertions below report the failure.
    let replicated = || {
        state.applied_seqno(ShardId::SINGLE) == Some(1)
            && matches!(
                follower.read_decay_score(EntityId::new(42), "view", 0),
                Ok(Some(_))
            )
    };
    let deadline = std::time::Instant::now() + Duration::from_secs(5);
    while !replicated() && std::time::Instant::now() < deadline {
        std::thread::sleep(Duration::from_millis(5));
    }

    // Verify the follower's ledger was updated.
    let score = follower
        .read_decay_score(EntityId::new(42), "view", 0)
        .expect("read should succeed");
    assert!(score.is_some(), "signal should be visible on follower");

    // Verify replication state advanced.
    let applied = state.applied_seqno(ShardId::SINGLE);
    assert_eq!(applied, Some(1), "replication state should have advanced");

    // Shutdown: drop sender so receiver exits.
    drop(tx);
    follower.close().unwrap();
}
|
|
|
|
// ── Test 4: Idempotent replay ────────────────────────────────────────────
|
|
|
|
/// Delivering the same segment twice leaves the applied seqno at 1 and the
/// entity present in the ledger.
#[test]
fn replay_is_idempotent() {
    /// Channel-backed transport that also counts `recv_segment` calls so the
    /// test can tell when the receiver has consumed both deliveries.
    struct MultiTransport {
        rx: crossbeam::channel::Receiver<WalSegmentPayload>,
        recv_calls: std::sync::atomic::AtomicUsize,
    }

    impl Transport for MultiTransport {
        fn send_segment(
            &self,
            _to: ShardId,
            _payload: WalSegmentPayload,
        ) -> Result<(), TransportError> {
            Ok(())
        }

        fn recv_segment(&self) -> Option<WalSegmentPayload> {
            self.recv_calls
                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
            // Yields `None` once every sender has been dropped.
            self.rx.recv().ok()
        }

        fn local_shard(&self) -> ShardId {
            ShardId::SINGLE
        }
    }

    let schema = make_schema();
    let ledger = Arc::new(SignalLedger::new(schema, Box::new(NoopWalWriter)));
    let state = Arc::new(ReplicationState::single());

    let type_id = ledger.resolve_signal_type("view").unwrap();
    // Checked narrowing: fail loudly instead of silently truncating the ID.
    let signal_type =
        u8::try_from(type_id.as_u16()).expect("signal type id must fit in u8");

    let events = vec![EventRecord {
        entity_id: 10,
        signal_type,
        weight: 5.0,
        timestamp_nanos: 1_000_000_000,
    }];
    let bytes = encode_batch(&events, 1, 1).unwrap();

    // Build a transport that delivers the same segment twice.
    let (tx, rx) = crossbeam::channel::bounded(4);
    let transport = Arc::new(MultiTransport {
        rx,
        recv_calls: std::sync::atomic::AtomicUsize::new(0),
    });
    let handle = tidaldb::replication::spawn_receiver(
        Arc::clone(&transport),
        Arc::clone(&ledger),
        Arc::clone(&state),
    );

    // Send the same segment twice.
    for _ in 0..2 {
        tx.send(WalSegmentPayload {
            id: WalSegmentId::new(tidaldb::replication::RegionId::SINGLE, ShardId::SINGLE, 1),
            bytes: bytes.clone(),
            event_count: 1,
        })
        .unwrap();
    }

    // Bounded wait instead of a fixed sleep: a third `recv_segment` call
    // means both deliveries were taken off the channel.
    // NOTE(review): assumes the receiver finishes applying a payload before
    // it asks for the next one — confirm against `spawn_receiver`.
    let deadline = std::time::Instant::now() + Duration::from_secs(5);
    while transport
        .recv_calls
        .load(std::sync::atomic::Ordering::SeqCst)
        < 3
        && std::time::Instant::now() < deadline
    {
        std::thread::sleep(Duration::from_millis(5));
    }

    // Dropping the sender makes `recv_segment` return `None`.
    drop(tx);
    handle.join();

    // The replayed segment must not move the applied seqno past 1.
    assert_eq!(state.applied_seqno(ShardId::SINGLE), Some(1));

    // The entity must exist in the ledger.
    // NOTE(review): this only checks presence. Asserting the stored weight is
    // 5.0 rather than 10.0 would pin idempotency more tightly — TODO once the
    // entry's weight accessor is confirmed.
    let entry = ledger.entries().get(&(EntityId::new(10), type_id));
    assert!(entry.is_some(), "entity should exist in ledger");
}
|
|
|
|
// ── Test 5: InProcessTransport end-to-end ────────────────────────────────
|
|
|
|
/// A segment sent from shard 0 over the in-process transport arrives at
/// shard 1 with its seqno, event count and bytes intact.
#[test]
fn in_process_transport_delivers_segment() {
    let shards = vec![ShardId(0), ShardId(1)];
    let mut transports = InProcessTransportFactory::new(&shards).build();

    let t0 = transports.remove(&ShardId(0)).unwrap();
    let t1 = transports.remove(&ShardId(1)).unwrap();

    // Resolve the type ID the same way the other tests in this file do.
    let schema = make_schema();
    let type_id = resolve_view_type_id(&schema);
    // Checked narrowing: fail loudly instead of silently truncating the ID.
    let signal_type =
        u8::try_from(type_id.as_u16()).expect("signal type id must fit in u8");

    // Shard 0 sends a segment to shard 1.
    let events = vec![EventRecord {
        entity_id: 99,
        signal_type,
        weight: 2.0,
        timestamp_nanos: 500,
    }];
    let bytes = encode_batch(&events, 1, 42).unwrap();

    t0.send_segment(
        ShardId(1),
        WalSegmentPayload {
            id: WalSegmentId::new(tidaldb::replication::RegionId::SINGLE, ShardId(0), 42),
            bytes: bytes.clone(),
            event_count: 1,
        },
    )
    .unwrap();

    // Shard 1 receives.
    let payload = t1
        .recv_segment()
        .expect("shard 1 should receive the segment");
    assert_eq!(payload.id.seqno, 42);
    assert_eq!(payload.event_count, 1);
    assert_eq!(payload.bytes, bytes);

    // Drop both transports to clean up.
    drop(t0);
    drop(t1);
}
|
|
|
|
// ── Test 6: ReplicationLagGauge ──────────────────────────────────────────
|
|
|
|
/// The lag gauge reports the gap between the leader seqno it was told about
/// and the seqno recorded as applied in the shared replication state.
#[test]
fn replication_lag_gauge_tracks_lag() {
    let shard = ShardId::SINGLE;
    let repl_state = Arc::new(ReplicationState::single());
    let lag = ReplicationLagGauge::new(shard, Arc::clone(&repl_state));

    // Nothing written and nothing applied yet: no lag.
    assert_eq!(lag.lag_segments(), 0);

    // Leader at 10, follower still at 0.
    lag.update_leader_seqno(10);
    assert_eq!(lag.lag_segments(), 10);

    // Follower applies up to 7: three segments behind.
    repl_state.advance(ShardId::SINGLE, 7);
    assert_eq!(lag.lag_segments(), 3);

    // Follower applies up to 10: fully caught up.
    repl_state.advance(ShardId::SINGLE, 10);
    assert_eq!(lag.lag_segments(), 0);
}
|
|
|
|
// ── Test 7: Full pipeline: leader -> transport -> follower ───────────────
|
|
|
|
/// End-to-end shape of the pipeline: signals are written on a leader, an
/// equivalent WAL batch is hand-built (standing in for the shipper) and
/// pushed through a channel-backed transport, and the follower is checked
/// for the replicated signals and the advanced replication state.
#[test]
fn full_pipeline_leader_to_follower() {
    /// Receive-only transport: `recv_segment` pulls from a crossbeam channel,
    /// `send_segment` accepts and discards.
    struct PipeTransport {
        rx: crossbeam::channel::Receiver<WalSegmentPayload>,
    }

    impl Transport for PipeTransport {
        fn send_segment(
            &self,
            _to: ShardId,
            _payload: WalSegmentPayload,
        ) -> Result<(), TransportError> {
            Ok(())
        }

        fn recv_segment(&self) -> Option<WalSegmentPayload> {
            // Yields `None` once every sender has been dropped.
            self.rx.recv().ok()
        }

        fn local_shard(&self) -> ShardId {
            ShardId::SINGLE
        }
    }

    let schema = make_schema();

    // Open leader and follower.
    let leader = open_leader(schema.clone());
    let follower = open_follower(schema.clone());

    // Resolve type ID using a standalone ledger (same schema).
    let type_id = resolve_view_type_id(&schema);
    // Checked narrowing, done once: fail loudly instead of silently truncating.
    let signal_type =
        u8::try_from(type_id.as_u16()).expect("signal type id must fit in u8");
    let follower_state = follower.replication_state().clone();

    // Wire up a channel-based transport for the follower.
    let (tx, rx) = crossbeam::channel::bounded(16);
    let transport = Arc::new(PipeTransport { rx });
    follower.start_replication(Arc::clone(&transport)).unwrap();

    // Write signals on the leader.
    let ts = Timestamp::from_nanos(2_000_000_000);
    leader.signal("view", EntityId::new(100), 1.0, ts).unwrap();
    leader.signal("view", EntityId::new(101), 2.0, ts).unwrap();

    // Simulate the shipper: build a WAL payload mirroring the leader's
    // signals and send it to the follower via the transport.
    let events = vec![
        EventRecord {
            entity_id: 100,
            signal_type,
            weight: 1.0,
            timestamp_nanos: 2_000_000_000,
        },
        EventRecord {
            entity_id: 101,
            signal_type,
            weight: 2.0,
            timestamp_nanos: 2_000_000_000,
        },
    ];
    let batch_bytes = encode_batch(&events, 1, 1).unwrap();

    tx.send(WalSegmentPayload {
        id: WalSegmentId::new(tidaldb::replication::RegionId::SINGLE, ShardId::SINGLE, 1),
        bytes: batch_bytes,
        event_count: 2,
    })
    .unwrap();

    // Wait for the follower with a bounded poll instead of a fixed sleep.
    // On timeout the assertions below report the failure.
    let replicated = || {
        follower_state.applied_seqno(ShardId::SINGLE) == Some(2)
            && matches!(
                follower.read_decay_score(EntityId::new(100), "view", 0),
                Ok(Some(_))
            )
            && matches!(
                follower.read_decay_score(EntityId::new(101), "view", 0),
                Ok(Some(_))
            )
    };
    let deadline = std::time::Instant::now() + Duration::from_secs(5);
    while !replicated() && std::time::Instant::now() < deadline {
        std::thread::sleep(Duration::from_millis(5));
    }

    // Verify the follower has the signals.
    let score_100 = follower
        .read_decay_score(EntityId::new(100), "view", 0)
        .unwrap();
    let score_101 = follower
        .read_decay_score(EntityId::new(101), "view", 0)
        .unwrap();

    assert!(
        score_100.is_some(),
        "entity 100 should be visible on follower"
    );
    assert!(
        score_101.is_some(),
        "entity 101 should be visible on follower"
    );

    // Verify replication state.
    // Batch has 2 events starting at seq 1, so last seq = 1 + 2 - 1 = 2.
    let applied = follower_state.applied_seqno(ShardId::SINGLE);
    assert_eq!(
        applied,
        Some(2),
        "replication state should reflect applied batch"
    );

    // Cleanup: dropping the sender lets the follower's receiver exit.
    drop(tx);
    leader.close().unwrap();
    follower.close().unwrap();
}
|