feat: complete M8 replication primitives + forage enhancements + docs

Milestone 8 (phases 1-4): - Shard-aware WAL segment naming, BatchHeader v2, ShardRouter - Transport trait, InProcessTransport, WalShipper, FollowerDb - HLC, PNCounter, LWWRegister, CrdtSignalState, ReconciliationEngine - Session replication bridge with SeqNo/HWM, idempotency store Forage application: - Multi-source discovery engine with MAB exploration - Embedding-based label system, server handlers, UI refresh Other: - QUICKSTART.md, README.md, milestone-8 planning docs - Hard negative union semantics, RLHF export enhancements - Recovery benchmark and visibility test expansions - Split 8 oversized source files per CODING_GUIDELINES §9 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 13:17:19 -07:00 · 2026-02-24 13:17:19 -07:00 · f4cfd6c81f
commit f4cfd6c81f
parent c1c5a10fbc
127 changed files with 18631 additions and 1646 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -11,6 +11,7 @@ A single-node-first, embeddable Rust database for the **personalized content ran
 | If you need to... | Read this |
 |-------------------|-----------|
 | **Get started quickly** | [README.md](README.md) → [QUICKSTART.md](QUICKSTART.md) |
 | **Understand the vision** | [VISION.md](VISION.md) |
 | **See use cases and surfaces** | [USE_CASES.md](USE_CASES.md) |
 | **See sequence diagrams** | [SEQUENCE.md](SEQUENCE.md) |
--- a/Cargo.lock
+++ b/Cargo.lock
@ -221,6 +221,15 @@ version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
 dependencies = [
 "libc",
 ]
 [[package]]
 name = "anes"
 version = "0.1.6"
@ -589,6 +598,20 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
 [[package]]
 name = "chrono"
 version = "0.4.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
 dependencies = [
 "iana-time-zone",
 "js-sys",
 "num-traits",
 "serde",
 "wasm-bindgen",
 "windows-link",
 ]
 [[package]]
 name = "ciborium"
 version = "0.2.2"
@ -1168,13 +1191,16 @@ name = "forage-server"
 version = "0.1.0"
 dependencies = [
 "axum 0.7.9",
 "chrono",
 "clap",
 "dirs-next",
 "forage-engine",
 "serde",
 "serde_json",
 "tokio",
 "tokio-stream",
 "tower-http 0.5.2",
 "tracing-subscriber",
 ]
 [[package]]
@ -1538,6 +1564,30 @@ dependencies = [
 "windows-registry",
 ]
 [[package]]
 name = "iana-time-zone"
 version = "0.1.65"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470"
 dependencies = [
 "android_system_properties",
 "core-foundation-sys",
 "iana-time-zone-haiku",
 "js-sys",
 "log",
 "wasm-bindgen",
 "windows-core",
 ]
 [[package]]
 name = "iana-time-zone-haiku"
 version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
 dependencies = [
 "cc",
 ]
 [[package]]
 name = "icu_collections"
 version = "2.1.1"
@ -3180,6 +3230,7 @@ dependencies = [
 "dashmap",
 "fjall",
 "fs4",
 "lru",
 "proptest",
 "rand 0.9.2",
 "roaring",
@ -3293,6 +3344,18 @@ dependencies = [
 "tokio",
 ]
 [[package]]
 name = "tokio-stream"
 version = "0.1.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70"
 dependencies = [
 "futures-core",
 "pin-project-lite",
 "tokio",
 "tokio-util",
 ]
 [[package]]
 name = "tokio-util"
 version = "0.7.18"
@ -3761,6 +3824,41 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 [[package]]
 name = "windows-core"
 version = "0.62.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
 dependencies = [
 "windows-implement",
 "windows-interface",
 "windows-link",
 "windows-result",
 "windows-strings",
 ]
 [[package]]
 name = "windows-implement"
 version = "0.60.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn",
 ]
 [[package]]
 name = "windows-interface"
 version = "0.59.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn",
 ]
 [[package]]
 name = "windows-link"
 version = "0.2.1"
--- a/QUICKSTART.md
+++ b/QUICKSTART.md
@ -0,0 +1,296 @@
 # Quickstart
 Get a working ranked feed in 10 minutes.
 **Prerequisites:** Rust 1.91+, Cargo. No external services.
 ---
 ## Run the example
 The fastest path is the included example, which demonstrates the complete loop — schema, ingest, signals, ranking:
 ```bash
 cargo run --manifest-path tidal/Cargo.toml --example quickstart
 ```
 The rest of this guide explains what it does and extends it with personalization and search.
 ---
 ## Step 1: Add the dependency
 ```toml
 [dependencies]
 tidaldb = { git = "https://github.com/your-org/tidalDB", rev = "..." }
 ```
 ---
 ## Step 2: Define a schema
 Schema is defined before opening the database. It declares signal types (what events you'll record and how they decay), text fields (for BM25 search), and embedding slots (for vector search).
 ```rust
 use std::time::Duration;
 use tidaldb::schema::{SchemaBuilder, EntityKind, DecaySpec, Window, TextFieldType};
 let mut schema = SchemaBuilder::new();
 // View signal: 7-day half-life, three windows, velocity enabled.
 // You declare the decay. tidalDB applies it at query time — no formula to maintain.
 let _ = schema.signal("view", EntityKind::Item, DecaySpec::Exponential {
    half_life: Duration::from_secs(7 * 24 * 3600),
 }).windows(&[Window::OneHour, Window::TwentyFourHours, Window::AllTime]).velocity(true).add();
 // Like signal: 30-day half-life. Durable engagement decays slowly.
 let _ = schema.signal("like", EntityKind::Item, DecaySpec::Exponential {
    half_life: Duration::from_secs(30 * 24 * 3600),
 }).windows(&[Window::AllTime]).velocity(false).add();
 // Share signal: 3-day half-life. Short-lived but strongly trending.
 let _ = schema.signal("share", EntityKind::Item, DecaySpec::Exponential {
    half_life: Duration::from_secs(3 * 24 * 3600),
 }).windows(&[Window::TwentyFourHours, Window::AllTime]).velocity(true).add();
 // Skip signal: permanent. A user who skipped should not see it again.
 let _ = schema.signal("hide", EntityKind::Item, DecaySpec::Permanent).add();
 // Text fields for BM25 full-text search.
 schema.text_field("title", TextFieldType::Text);
 schema.text_field("category", TextFieldType::Keyword);
 // Embedding slot for semantic / vector search (128D in this example).
 // In production, use the dimensionality of your embedding model.
 schema.embedding_slot("content", EntityKind::Item, 128);
 let schema = schema.build()?;
 ```
 **Decay types:**
 - `Exponential { half_life }` — weight halves every `half_life`. Use for views, likes, shares.
 - `Linear { lifetime }` — weight drops to zero over `lifetime`.
 - `Permanent` — never decays. Use for hides, blocks, follows.
 ---
 ## Step 3: Open the database
 ```rust
 use tidaldb::TidalDb;
 // Ephemeral: in-memory, ideal for tests and this tutorial.
 let db = TidalDb::builder().ephemeral().with_schema(schema).open()?;
 // Persistent: durable storage at a path on disk.
 // let db = TidalDb::builder().with_data_dir("/var/lib/myapp/tidaldb").with_schema(schema).open()?;
 db.health_check()?;
 ```
 `TidalDb` is `Send + Sync`. Wrap it in `Arc<TidalDb>` to share across threads or tasks.
 ---
 ## Step 4: Ingest content
 Write items with metadata as `HashMap<String, String>` key-value pairs. Then write their embeddings separately.
 **tidalDB does not generate embeddings.** You bring your model; tidalDB handles retrieval and ranking over the vectors you produce.
 ```rust
 use std::collections::HashMap;
 use tidaldb::schema::{EntityId, Timestamp};
 let tracks = [
    (1u64, "Introduction to Jazz Piano",  "music",    "1320"),
    (2,    "Rust Async Programming",       "tech",     "3600"),
    (3,    "Sourdough Bread Masterclass",  "cooking",  "2700"),
    (4,    "Jazz Improvisation Techniques","music",    "1800"),
    (5,    "Building a Compiler in Rust",  "tech",     "5400"),
    (6,    "French Pastry Fundamentals",   "cooking",  "2100"),
    (7,    "Modal Jazz: Coltrane Changes", "music",    "2400"),
    (8,    "WebAssembly from Scratch",     "tech",     "2700"),
    (9,    "Knife Skills for Home Cooks",  "cooking",   "900"),
    (10,   "Bebop Piano Vocabulary",       "music",    "1500"),
 ];
 for (id, title, category, duration) in &tracks {
    let mut meta = HashMap::new();
    meta.insert("title".to_string(), title.to_string());
    meta.insert("category".to_string(), category.to_string());
    meta.insert("format".to_string(), "video".to_string());
    meta.insert("duration".to_string(), duration.to_string());
    meta.insert("created_at".to_string(), Timestamp::now().as_nanos().to_string());
    db.write_item_with_metadata(EntityId::new(*id), &meta)?;
    // In production: embed the title with your model.
    // Here we use random unit vectors for illustration.
    let embedding = random_unit_vector(128, &mut rng);
    db.write_item_embedding(EntityId::new(*id), &embedding)?;
 }
 println!("Ingested {} items.", db.item_count());
 ```
 On write, tidalDB:
 1. Stores the entity and metadata
 2. Indexes text fields into the BM25 index
 3. Inserts the embedding into the HNSW vector index
 4. Initializes the signal ledger with an exploration budget
 5. Makes the item immediately queryable
 ---
 ## Step 5: Record engagement signals
 When a user engages with content, write a signal. The feedback loop closes at write time — no Kafka consumer to lag, no feature store sync to schedule.
 ```rust
 let now = Timestamp::now();
 // Global signals — these update the item's aggregate signal ledger.
 db.signal("view", EntityId::new(1), 1.0, now)?;  // Jazz Piano viewed
 db.signal("view", EntityId::new(4), 1.0, now)?;
 db.signal("view", EntityId::new(7), 1.0, now)?;  // Modal Jazz viewed
 db.signal("like", EntityId::new(4), 1.0, now)?;  // Jazz Improv liked
 db.signal("share", EntityId::new(7), 1.0, now)?; // Modal Jazz shared
 ```
 For signals with user context, use `signal_with_context`. This also updates the user's preference vector and interaction weights — enabling personalization.
 ```rust
 let user_id = 42u64;
 let creator_id = 100u64;
 // User 42 viewed item 4. Their preference vector shifts toward jazz content.
 db.signal_with_context("view", EntityId::new(4), 1.0, now, Some(user_id), Some(creator_id))?;
 db.signal_with_context("like", EntityId::new(7), 1.0, now, Some(user_id), Some(creator_id))?;
 // Negative signals are equal citizens.
 db.signal("hide", EntityId::new(2), 1.0, now)?; // User hid the Rust video.
 ```
 A ranking query issued 100ms later sees the updated state. No ETL required.
 ---
 ## Step 6: Retrieve a ranked feed
 tidalDB ships 25 built-in ranking profiles. The application names a profile; the database executes the full scoring pipeline.
 ```rust
 use tidaldb::query::retrieve::Retrieve;
 // Global trending: items with the highest share + view velocity.
 let query = Retrieve::builder().profile("trending").limit(10).build()?;
 let results = db.retrieve(&query)?;
 println!("Trending ({} candidates):", results.total_candidates);
 for item in &results.items {
    let sigs: Vec<_> = item.signals.iter()
        .map(|s| format!("{}={:.3}", s.name, s.value))
        .collect();
    println!("  #{} id={} score={:.4} [{}]",
        item.rank, item.entity_id.as_u64(), item.score, sigs.join(", "));
 }
 ```
 ---
 ## Step 7: Personalize
 Swap the profile to `for_you`. Because user 42 signaled views and likes on jazz content, their results differ from global trending.
 ```rust
 // Personalized feed for user 42.
 let query = Retrieve::builder()
    .for_user(user_id)
    .profile("for_you")
    .limit(10)
    .build()?;
 let results = db.retrieve(&query)?;
 println!("For You (user {}):", user_id);
 for item in &results.items {
    println!("  #{} id={} score={:.4}", item.rank, item.entity_id.as_u64(), item.score);
 }
 ```
 Other useful profiles:
 - `"hot"` — score with age decay (Reddit model)
 - `"following"` — content from followed creators (requires `for_user` + written `follows` relationships)
 - `"hidden_gems"` — high completion rate, low reach
 - `"top_week"` — cumulative quality over the last 7 days
 - `"shuffle"` — random, quality-weighted
 ---
 ## Step 8: Search
 Search combines BM25 full-text and ANN semantic similarity via Reciprocal Rank Fusion.
 ```rust
 use tidaldb::query::search::Search;
 // Flush the text index so recently written items are searchable.
 // In production with persistent mode this happens automatically on a ~2s commit cycle.
 db.flush_text_index()?;
 // Keyword search, personalized for user 42.
 let query = Search::builder()
    .query("jazz piano")
    .for_user(user_id)
    .limit(5)
    .build()?;
 let results = db.search(&query)?;
 println!("Search 'jazz piano':");
 for item in &results.items {
    println!("  #{} id={} bm25={:.3?} semantic={:.3?}",
        item.rank,
        item.entity_id.as_u64(),
        item.bm25_score,
        item.semantic_score,
    );
 }
 ```
 Add a query embedding for hybrid search — text relevance + semantic similarity:
 ```rust
 let query_vector = your_model.embed("jazz piano");  // same model as item embeddings
 let query = Search::builder()
    .query("jazz piano")
    .vector(query_vector)
    .for_user(user_id)
    .limit(5)
    .build()?;
 ```
 ---
 ## Step 9: Close
 ```rust
 db.close()?;
 ```
 This flushes the WAL, checkpoints signal state, and persists indexes. In persistent mode, the next open recovers to the last checkpointed state.
 ---
 ## What to explore next
 | Topic | Where to look |
 |-------|--------------|
 | Full API reference | [API.md](API.md) |
 | Filters — format, duration, location, engagement thresholds | [API.md — Filters](API.md#filters) |
 | Diversity constraints | [API.md — Diversity Constraints](API.md#diversity-constraints) |
 | All 25 ranking profiles | [API.md — Sort Modes](API.md#sort-modes) |
 | Cohort-scoped trending | [API.md — Cohorts](API.md#cohort-definitions) |
 | Collections and saved searches | [API.md — Collections](API.md#collections) |
 | Axum embedding example | `tidal/examples/axum_embedding.rs` |
 | 14 content discovery surfaces | [USE_CASES.md](USE_CASES.md) |
 | Architecture and design decisions | [ARCHITECTURE.md](ARCHITECTURE.md) |
--- a/README.md
+++ b/README.md
@ -0,0 +1,144 @@
 # tidalDB
 **An embeddable Rust database for the personalized content ranking problem.**
 > Pre-release. API is stabilizing. Not yet recommended for production.
 ---
 Every content platform eventually builds the same distributed system from scratch: Elasticsearch for retrieval, Redis for hot signals, Kafka for event ingestion, a feature store for user profiles, a vector database for semantic search, and a ranking service that stitches them together. The seams between those systems are where correctness dies — stale signals, inconsistent ranking, cache invalidation bugs, ETL lag.
 The root cause: existing databases treat ranking as an afterthought. They have no native concept of signals that evolve over time, no understanding of user context, no diversity as a query constraint.
 **Ranking is not a feature. It is a primitive.**
 tidalDB is a single-node, embeddable Rust library built for one question: *given a user and a context, what content should they see, and in what order?* No server, no network protocol, no client SDK. Link it into your process.
 ---
 ## What it looks like
 ```rust
 use std::collections::HashMap;
 use std::time::Duration;
 use tidaldb::{TidalDb, query::retrieve::Retrieve, schema::{DecaySpec, EntityId, EntityKind, SchemaBuilder, Timestamp, Window}};
 // Declare signals with native decay — no application formulas.
 let mut schema = SchemaBuilder::new();
 let _ = schema.signal("view", EntityKind::Item, DecaySpec::Exponential {
    half_life: Duration::from_secs(7 * 24 * 3600),
 }).windows(&[Window::OneHour, Window::TwentyFourHours, Window::AllTime]).velocity(true).add();
 let _ = schema.signal("like", EntityKind::Item, DecaySpec::Exponential {
    half_life: Duration::from_secs(30 * 24 * 3600),
 }).windows(&[Window::AllTime]).velocity(false).add();
 let schema = schema.build()?;
 // Open — ephemeral for tests, persistent for production.
 let db = TidalDb::builder().ephemeral().with_schema(schema).open()?;
 // Ingest content with metadata.
 let mut meta = HashMap::new();
 meta.insert("title".to_string(), "Introduction to Jazz Piano".to_string());
 meta.insert("category".to_string(), "music".to_string());
 db.write_item_with_metadata(EntityId::new(1), &meta)?;
 // Write an embedding (you generate it, tidalDB indexes and ranks over it).
 db.write_item_embedding(EntityId::new(1), &your_model.embed("Introduction to Jazz Piano"))?;
 // Record engagement — the feedback loop closes here, no ETL required.
 db.signal("view", EntityId::new(1), 1.0, Timestamp::now())?;
 db.signal_with_context("like", EntityId::new(1), 1.0, Timestamp::now(), Some(user_id), Some(creator_id))?;
 // Retrieve a ranked feed. Name the profile. tidalDB executes the pipeline.
 let results = db.retrieve(&Retrieve::builder().for_user(user_id).profile("for_you").limit(50).build()?)?;
 // Search: BM25 + semantic similarity fused via RRF.
 let results = db.search(&Search::builder().query("jazz piano tutorial").for_user(user_id).limit(20).build()?)?;
 db.close()?;
 ```
 ---
 ## What it replaces
 | System | tidalDB equivalent |
 |--------|--------------------|
 | Elasticsearch | Tantivy BM25 text index (derived, crash-recoverable) |
 | Redis | Lock-free in-memory signal ledger — decay scores, windowed counters |
 | Kafka | Write-ahead log — durable, ordered, replayable |
 | Feature store | Signal aggregates + user preference vectors (updated at write time) |
 | Vector DB | USearch HNSW — embedded, f16 quantized, predicate-filtered ANN |
 | Ranking service | 25 named profiles, scored at query time, swappable by name |
 ---
 ## Key capabilities
 - **Signals with native decay** — declare `view` with a 7-day half-life; the database applies it at query time. No `trending_score_7d` field to maintain.
 - **25 built-in ranking profiles** — `trending`, `hot`, `for_you`, `following`, `related`, `hidden_gems`, `top_week`, `shuffle`, `controversial`, and more. Name the profile; the database executes the full pipeline.
 - **Hybrid search** — BM25 full-text + ANN semantic similarity, fused via Reciprocal Rank Fusion, personalized by user preference vector.
 - **Composable filters** — filter by category, format, duration, language, engagement threshold, location, collection membership, and more — any combination, all composable.
 - **Diversity as a query constraint** — `max_per_creator: 2` belongs in the query, not your API layer.
 - **Feedback loop in the write path** — a signal write atomically updates the item's ledger, the user's preference vector, and relationship weights. The next ranking query — 100ms later — reflects it.
 - **Cold start handled** — new content gets an exploration budget; new users get sensible defaults. No application logic required.
 - **Cohort-scoped trending** — "trending among US users aged 18-24 who engage with jazz" is one query, not a pipeline.
 - **Embeddable first** — runs in your process. `Arc<TidalDb>` is `Send + Sync`. No operational overhead.
 ---
 ## Getting started
 tidalDB is not yet published to crates.io. Add it as a git dependency:
 ```toml
 [dependencies]
 tidaldb = { git = "https://github.com/your-org/tidalDB", rev = "..." }
 ```
 Then follow the **[Quickstart](QUICKSTART.md)** to get a working ranked feed in 10 minutes, or run the included example:
 ```bash
 cargo run --manifest-path tidal/Cargo.toml --example quickstart
 ```
 **MSRV:** Rust 1.91
 ---
 ## Documentation
 | Document | Contents |
 |----------|----------|
 | [QUICKSTART.md](QUICKSTART.md) | Step-by-step guide: schema, ingest, signals, ranking, search |
 | [API.md](API.md) | Full API reference with code examples |
 | [VISION.md](VISION.md) | Problem statement and design thesis |
 | [ARCHITECTURE.md](ARCHITECTURE.md) | Storage, signal system, vector index, query pipeline |
 | [USE_CASES.md](USE_CASES.md) | 14 content discovery surfaces, filter and sort references |
 ---
 ## Status
 Milestones completed:
 - Storage engine, WAL, entity store, signal ledger
 - RETRIEVE query: candidate retrieval, filtering, scoring, diversity, pagination
 - Vector index (USearch HNSW) with adaptive filtered search
 - 25 built-in ranking profiles
 - BM25 full-text search (Tantivy) + hybrid RRF fusion
 - Creator search and creator profiles
 - Cohort-scoped signal aggregation and trending
 - Social graph (follows, blocks, following feed)
 - Collections, saved searches, autocomplete suggestions
 - Session and agent context (short-lived signals, preference decay)
 - Crash recovery, graceful degradation, rate limiting, diagnostics
 - Scale: tested to 1M items; scale benchmarks passing
 The API surface is stable for the implemented features. Breaking changes are possible before 1.0.
 ---
 ## License
 MIT
--- a/applications/forage/agent.md
+++ b/applications/forage/agent.md
@ -0,0 +1,90 @@
 # Forage Discovery Agent
 You are the Forage discovery agent. Your job is to find real articles from the web and capture them into the Forage personalized feed engine running at `http://localhost:4242`.
 ## Core Loop
 Repeat this loop indefinitely until I tell you to stop:
 ### Step 1: Get browse tasks
 ```
 GET http://localhost:4242/browse-tasks
 ```
 Parse the JSON response:
 - `should_run` — if false, wait `interval_minutes` minutes then go back to Step 1
 - `topics` — list of topics with `name`, `priority`, and `sources`
 - `limit_per_topic` — max articles to capture per source
 - `tag_hints` — subtopics to prefer when selecting articles (e.g. `["modal jazz", "music theory"]`)
 ### Step 2: Send heartbeat
 ```
 POST http://localhost:4242/discovery/heartbeat
 Content-Type: application/json
 {}
 ```
 ### Step 3: Browse and capture
 For each topic in `topics` (ordered by priority, highest first):
  For each URL in `topic.sources`:
    1. Navigate to the source URL
    2. Identify article links on the page (links to individual articles, not nav/footer/category pages)
    3. If `tag_hints` is non-empty, prefer articles whose headlines suggest those subtopics
    4. For each selected article (up to `limit_per_topic`):
       a. Navigate to the article URL
       b. Read the full page content
       c. Extract and analyse:
          - `title` — the article's actual headline (prefer `<h1>` over `<title>` tag)
          - `canonical_url` — from `<link rel="canonical">`, or empty string if absent
          - `reading_time_min` — word count divided by 200, rounded up to nearest integer
          - `tags` — 2 to 5 specific subtopic tags (lowercase, singular or short phrases). Be specific: `"modal jazz"` not `"jazz"`. `"rust async"` not `"programming"`.
          - `entities` — up to 5 named people, companies, technologies, or places that are central to the article
          - `content_type` — one of: `analysis`, `news`, `tutorial`, `opinion`, `review`, `interview`, `research`
          - `summary` — exactly 2 sentences describing what the article argues or reports. Write from what you read, not from the meta description. A meta description like "Read our latest article" is useless — ignore it.
       d. Skip the article if any of these are true:
          - Title is empty
          - Title contains "Sign In", "Subscribe", "Login", "Create Account", "Register"
          - URL is localhost, 127.0.0.1, or starts with chrome://
          - The page appears to be a category listing, search page, or home page rather than an article
       e. POST to capture:
          ```
          POST http://localhost:4242/capture
          Content-Type: application/json
          {
            "url": "<article url>",
            "canonical_url": "<canonical url or empty>",
            "title": "<title>",
            "source": "<hostname only, e.g. news.ycombinator.com>",
            "category": "<topic name>",
            "description": "<first 200 chars of article body>",
            "reading_time_min": <number>,
            "user_id": 1,
            "tags": ["<tag1>", "<tag2>"],
            "entities": ["<entity1>"],
            "content_type": "<type>",
            "summary": "<2 sentence summary>"
          }
          ```
       f. Wait 1 to 2 seconds before navigating to the next article (be polite to servers)
 ### Step 4: Wait
 After finishing all topics and sources, wait `interval_minutes` minutes, then go back to Step 1.
 ## Important Rules
 - **Read the article, don't guess.** The tags, summary, and content_type must come from actually reading the article — not from the URL, headline alone, or meta description.
 - **Specific tags beat generic ones.** `"type inference"` beats `"programming"`. `"sourdough fermentation"` beats `"cooking"`.
 - **2-sentence summaries only.** Not 1, not 3. Each sentence should be substantive.
 - **Do not capture login pages or paywalls.** If you see a login form or paywall, skip that article.
 - **Do not capture Forage itself.** Skip localhost:4242.
 - **Continue on errors.** If a page fails to load or POST /capture returns an error, log it and move to the next article. Never stop the loop because of a single failure.
 - **The loop runs forever.** Only stop when the user explicitly tells you to stop.
--- a/applications/forage/engine/src/labels.rs
+++ b/applications/forage/engine/src/labels.rs
@ -29,4 +29,16 @@ pub struct ForageItem {
    pub label: ItemLabel,
    pub score: f32,
    pub url: String,
    /// Specific subtopics (e.g. `["modal jazz", "music theory"]`).
    /// Empty for seed items or items not yet enriched by the discovery agent.
    pub tags: Vec<String>,
    /// Named entities (e.g. `["John Coltrane", "Blue Note"]`).
    /// Empty for seed items or items not yet enriched.
    pub entities: Vec<String>,
    /// Content type classification (e.g. `"analysis"`, `"tutorial"`).
    /// Empty string when not enriched.
    pub content_type: String,
    /// Claude's 2-sentence summary of the article.
    /// Empty string when not enriched.
    pub summary: String,
 }
--- a/applications/forage/engine/src/lib.rs
+++ b/applications/forage/engine/src/lib.rs
@ -2,6 +2,7 @@ pub mod labels;
 pub mod mab;
 pub mod schema;
 pub mod seed;
 pub mod sources;
 use std::collections::{HashMap, HashSet, VecDeque};
 use std::path::Path;
@ -38,6 +39,34 @@ pub use mab::{ExplorationStats, MabConfig};
 pub use schema::{DEFAULT_DIM, REAL_DIM};
 pub use seed::{SeedItem, url_to_item_id};
 /// A single topic the discovery agent should browse for.
 #[derive(Debug, Clone, serde::Serialize)]
 pub struct BrowseTopic {
    /// Category name (e.g. "jazz").
    pub name: String,
    /// 0.0--1.0 weight derived from user's preference vector.
    pub priority: f32,
    /// Source URLs to navigate (from the static source registry).
    pub sources: Vec<String>,
 }
 /// The full browse plan returned by `GET /browse-tasks`.
 #[derive(Debug, Clone, serde::Serialize)]
 pub struct BrowsePlan {
    /// Whether the agent should run a discovery cycle now.
    /// False when last discovery was recent and the feed has enough items.
    pub should_run: bool,
    /// How many minutes to wait between cycles.
    pub interval_minutes: u32,
    /// Topics ordered by priority descending.
    pub topics: Vec<BrowseTopic>,
    /// Max articles to capture per source per cycle.
    pub limit_per_topic: usize,
    /// Top tags from the user's saved/dwelled items.
    /// The agent uses these to prefer articles matching the user's subtopics.
    pub tag_hints: Vec<String>,
 }
 /// Input for registering a dynamically discovered page as a Forage item.
 #[derive(Debug, Clone)]
 pub struct ForageItemInput {
@ -49,6 +78,14 @@ pub struct ForageItemInput {
    pub category: String,
    pub reading_time_min: u32,
    pub description: String,
    /// Specific subtopics (e.g. `["modal jazz", "music theory"]`).
    pub tags: Vec<String>,
    /// Named entities (e.g. `["John Coltrane", "Blue Note"]`).
    pub entities: Vec<String>,
    /// Content type classification (e.g. `"analysis"`, `"tutorial"`, `"news"`).
    pub content_type: String,
    /// Claude's 2-sentence summary of the article.
    pub summary: String,
 }
 #[derive(Debug, thiserror::Error)]
@ -102,6 +139,21 @@ pub struct ForageEngine {
    /// Used by `signal()` to detect when a user engages with an exploration slot
    /// and record the outcome into `exploration_stats`.
    last_explore_items: std::sync::Mutex<HashMap<u64, HashSet<u64>>>,
    /// Per-user saved item IDs.
    ///
    /// Maintained at the Forage engine level (not tidalDB's `UserStateIndex`)
    /// because tidalDB's saved bitmap uses `RoaringBitmap<u32>` which truncates
    /// Forage's u64 FNV-hash item IDs for discovered items.  This set preserves
    /// full u64 IDs so `top_tags()` can correctly look up item metadata.
    saved_items: std::sync::Mutex<HashMap<u64, HashSet<u64>>>,
    /// Per-user item IDs where the user dwelled ≥15 seconds (article completion).
    ///
    /// Used by `top_tags()` alongside `saved_items` so that strong-dwell reads
    /// (spec: "save + strong dwell ≥15s") contribute to tag affinity.
    dwelled_items: std::sync::Mutex<HashMap<u64, HashSet<u64>>>,
    /// Path to the `user_state.json` file that persists `saved_items` and
    /// `dwelled_items` across server restarts (`None` in ephemeral mode).
    user_state_path: Option<std::path::PathBuf>,
 }
 /// Fluent builder for `ForageEngine`.
@ -149,10 +201,11 @@ impl ForageEngineBuilder {
            DEFAULT_DIM
        };
        let schema = schema::build(embed_dim);
-        let (db, stats_path) = if self.ephemeral {
+        let (db, stats_path, user_state_path) = if self.ephemeral {
            (
                TidalDb::builder().ephemeral().with_schema(schema).open()?,
                None,
                None,
            )
        } else {
            let dir = self
@ -164,14 +217,20 @@ impl ForageEngineBuilder {
                .with_data_dir(&dir)
                .with_schema(schema)
                .open()?;
-            let path = dir.join("exploration_stats.json");
+            let stats = dir.join("exploration_stats.json");
-            (db, Some(path))
+            let user_state = dir.join("user_state.json");
            (db, Some(stats), Some(user_state))
        };
        // Load persisted exploration stats from disk (persistent mode only).
        let exploration_stats_map = stats_path
            .as_ref()
            .and_then(|p| load_exploration_stats(p))
            .unwrap_or_default();
        // Load persisted saved/dwelled item sets.
        let (saved_items_map, dwelled_items_map) = user_state_path
            .as_ref()
            .map(|p| load_user_item_state(p))
            .unwrap_or_default();
        // Build a reusable HTTP client for the embedding sidecar if configured.
        // Creating the client once preserves the connection pool across all
        // add_item and seed_default_corpus calls.
@ -195,6 +254,9 @@ impl ForageEngineBuilder {
            embed_dim,
            exploration_stats: std::sync::Mutex::new(exploration_stats_map),
            last_explore_items: std::sync::Mutex::new(HashMap::new()),
            saved_items: std::sync::Mutex::new(saved_items_map),
            dwelled_items: std::sync::Mutex::new(dwelled_items_map),
            user_state_path,
            stats_path,
        })
    }
@ -287,7 +349,16 @@ impl ForageEngine {
        Ok(())
    }
-    /// Record a signal for a user–item interaction.
+    /// Record a signal for a user--item interaction.
    ///
    /// For `Save` signals, additionally:
    /// - Marks the item in the user's saved bitmap (`UserStateIndex::add_save`)
    ///   so that `top_tags()` and `browse_tasks()` can aggregate tags from
    ///   saved items.
    /// - Emits a secondary `"share"` signal so that tidalDB's preference vector
    ///   update path (which gates on `is_positive_engagement_signal`) is
    ///   triggered.  Save is strong positive intent and should shift the user's
    ///   taste vector.
    pub fn signal(&self, user_id: u64, item_id: u64, kind: SignalKind) -> Result<()> {
        let signal_type = match kind {
            SignalKind::View => "view",
@ -303,6 +374,40 @@ impl ForageEngine {
            Some(user_id),
            None,
        )?;
        // Save-specific side effects: tidalDB's signal_with_context does not
        // populate the saved bitmap or update the preference vector for "save"
        // signals (only "like", "share", "completion", "search_click" are
        // positive-engagement triggers).  Forage treats save as strong positive
        // intent, so we bridge the gap here.
        if matches!(kind, SignalKind::Save) {
            // Track in Forage-level saved set (u64-safe; see field doc).
            self.saved_items
                .lock()
                .unwrap()
                .entry(user_id)
                .or_default()
                .insert(item_id);
            // Also populate tidalDB's u32 bitmap for query-path compatibility.
            #[allow(clippy::cast_possible_truncation)]
            let item_u32 = item_id as u32;
            self.db.user_state().add_save(user_id, item_u32);
            // Emit a secondary "share" signal so the preference vector update
            // fires.  Weight is lower (0.5) to distinguish from an explicit share.
            self.db.signal_with_context(
                "share",
                EntityId::new(item_id),
                0.5,
                Timestamp::now(),
                Some(user_id),
                None,
            )?;
            // Persist the updated saved set so it survives server restarts.
            self.save_user_item_state();
        }
        let is_positive = matches!(
            kind,
            SignalKind::View | SignalKind::Save | SignalKind::Share
@ -335,6 +440,15 @@ impl ForageEngine {
                Some(user_id),
                None,
            )?;
            // Track in the dwell set so top_tags() includes completion-read items
            // per spec: "save + strong dwell ≥15s" contribute to tag affinity.
            self.dwelled_items
                .lock()
                .unwrap()
                .entry(user_id)
                .or_default()
                .insert(item_id);
            self.save_user_item_state();
        }
        // Dwell is always a positive engagement signal.
        self.track_signal_stats(user_id, item_id, true);
@ -448,6 +562,10 @@ impl ForageEngine {
        );
        meta.insert("description".to_owned(), item.description.clone());
        meta.insert("url".to_owned(), canonical);
        meta.insert("tags".to_owned(), item.tags.join(","));
        meta.insert("entities".to_owned(), item.entities.join(","));
        meta.insert("content_type".to_owned(), item.content_type);
        meta.insert("summary".to_owned(), item.summary);
        self.db.write_item_with_metadata(entity_id, &meta)?;
        // Obtain embedding: call sidecar if configured, else neutral unit vector.
@ -498,6 +616,127 @@ impl ForageEngine {
            .collect()
    }
    /// Return the top `limit` tags from items the user has positively engaged with.
    ///
    /// Scans both the user's **saved** item IDs and items where the user dwelled
    /// ≥15 seconds (article completion).  Per spec: "save + strong dwell ≥15s"
    /// are the positive engagement sources for tag affinity.
    ///
    /// Returns an empty vec for cold-start users or items with no enriched tags.
    pub fn top_tags(&self, user_id: u64, limit: usize) -> Vec<String> {
        // Collect item IDs from both saved and strong-dwell (completion) sets.
        // Use Forage-level sets (full u64 IDs) rather than tidalDB's
        // RoaringBitmap<u32> which truncates FNV-hash IDs for discovered items.
        let mut item_ids: HashSet<u64> = HashSet::new();
        {
            let saved = self.saved_items.lock().unwrap();
            if let Some(ids) = saved.get(&user_id) {
                item_ids.extend(ids.iter().copied());
            }
        }
        {
            let dwelled = self.dwelled_items.lock().unwrap();
            if let Some(ids) = dwelled.get(&user_id) {
                item_ids.extend(ids.iter().copied());
            }
        }
        if item_ids.is_empty() {
            return vec![];
        }
        let mut freq: HashMap<String, usize> = HashMap::new();
        for id in &item_ids {
            if let Ok(Some(m)) = self.db.get_item_metadata(EntityId::new(*id))
                && let Some(tags_str) = m.get("tags")
                && !tags_str.is_empty()
            {
                for tag in tags_str.split(',') {
                    *freq.entry(tag.to_string()).or_insert(0) += 1;
                }
            }
        }
        if freq.is_empty() {
            return vec![];
        }
        let mut pairs: Vec<(String, usize)> = freq.into_iter().collect();
        pairs.sort_by(|a, b| b.1.cmp(&a.1));
        pairs.into_iter().take(limit).map(|(tag, _)| tag).collect()
    }
    /// Build a browse plan for the discovery agent.
    ///
    /// Topics are weighted by the user's preference vector (from `top_categories`).
    /// Cold-start users receive equal weight across all 8 categories.
    /// `tag_hints` comes from `top_tags`.
    ///
    /// `item_count` is used to set `should_run: true` when the corpus is sparse.
    pub fn browse_tasks(&self, user_id: u64, limit_per_topic: usize) -> BrowsePlan {
        let top_cats = self.top_categories(user_id);
        let mut topics: Vec<BrowseTopic> = if top_cats.is_empty() {
            // Cold start: all 8 categories at equal priority.
            let equal_priority = 1.0 / sources::SOURCES.len() as f32;
            sources::SOURCES
                .iter()
                .map(|(name, srcs)| BrowseTopic {
                    name: (*name).to_string(),
                    priority: equal_priority,
                    sources: srcs.iter().map(|s| (*s).to_string()).collect(),
                })
                .collect()
        } else {
            // Warm user: top categories get linearly distributed priority,
            // remaining categories get a low exploration priority.
            let n = top_cats.len();
            let top_set: HashSet<&str> = top_cats.iter().map(String::as_str).collect();
            let mut all_topics: Vec<BrowseTopic> = Vec::with_capacity(sources::SOURCES.len());
            // Top categories: linearly decreasing priority from 1.0 down.
            for (rank, cat) in top_cats.iter().enumerate() {
                let priority = 1.0 - (rank as f32 / n as f32);
                let srcs = sources::sources_for(cat);
                all_topics.push(BrowseTopic {
                    name: cat.clone(),
                    priority,
                    sources: srcs.iter().map(|s| (*s).to_string()).collect(),
                });
            }
            // Remaining categories at low exploration priority.
            for (name, srcs) in sources::SOURCES {
                if !top_set.contains(*name) {
                    all_topics.push(BrowseTopic {
                        name: (*name).to_string(),
                        priority: 0.1,
                        sources: srcs.iter().map(|s| (*s).to_string()).collect(),
                    });
                }
            }
            all_topics
        };
        // Sort by priority descending.
        topics.sort_by(|a, b| {
            b.priority
                .partial_cmp(&a.priority)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        let tag_hints = self.top_tags(user_id, 5);
        BrowsePlan {
            should_run: true,
            interval_minutes: 30,
            topics,
            limit_per_topic,
            tag_hints,
        }
    }
    /// Retrieve a personalized feed for the given user.
    ///
    /// - **Warm path** (user has signaled ≥1 item): ANN query using the user's
@ -748,6 +987,41 @@ impl ForageEngine {
            );
        }
        // Load enrichment for discovered items (non-seed) after MAB selection.
        // mab::select() constructs ForageItem from SeedItem which has no enrichment
        // fields — we hydrate here from DB metadata for any discovered item.
        let seed_id_set: HashSet<u64> = self.seed_items.iter().map(|s| s.id).collect();
        for item in &mut items {
            if !seed_id_set.contains(&item.id)
                && item.tags.is_empty()
                && item.summary.is_empty()
                && let Ok(Some(m)) = self.db.get_item_metadata(EntityId::new(item.id))
            {
                item.tags = m
                    .get("tags")
                    .map(|s| {
                        if s.is_empty() {
                            vec![]
                        } else {
                            s.split(',').map(str::to_string).collect()
                        }
                    })
                    .unwrap_or_default();
                item.entities = m
                    .get("entities")
                    .map(|s| {
                        if s.is_empty() {
                            vec![]
                        } else {
                            s.split(',').map(str::to_string).collect()
                        }
                    })
                    .unwrap_or_default();
                item.content_type = m.get("content_type").cloned().unwrap_or_default();
                item.summary = m.get("summary").cloned().unwrap_or_default();
            }
        }
        // Record which item IDs were labeled Exploring so signal() can detect
        // exploration outcomes and update adaptive MAB state.
        {
@ -942,6 +1216,8 @@ impl ForageEngine {
        let results = self.db.search(&query).ok()?;
        // Return the first result not already showing in the feed.
        let seed_id_set: std::collections::HashSet<u64> =
            self.seed_items.iter().map(|s| s.id).collect();
        for r in results.items {
            let id = r.entity_id.as_u64();
            if already_in_feed.contains(&id) {
@ -949,6 +1225,36 @@ impl ForageEngine {
            }
            if let Some(meta) = meta_map.get(&id) {
                let score = r.semantic_score.map(|d| 1.0_f32 / (1.0 + d)).unwrap_or(0.6);
                // Load enrichment from DB for discovered (non-seed) items.
                // Seed items never have tags/entities/content_type/summary.
                let (tags, entities, content_type, summary) = if seed_id_set.contains(&id) {
                    (vec![], vec![], String::new(), String::new())
                } else {
                    self.db
                        .get_item_metadata(EntityId::new(id))
                        .ok()
                        .flatten()
                        .map(|m| {
                            let parse_list = |key: &str| {
                                m.get(key)
                                    .map(|s| {
                                        if s.is_empty() {
                                            vec![]
                                        } else {
                                            s.split(',').map(str::to_string).collect()
                                        }
                                    })
                                    .unwrap_or_default()
                            };
                            (
                                parse_list("tags"),
                                parse_list("entities"),
                                m.get("content_type").cloned().unwrap_or_default(),
                                m.get("summary").cloned().unwrap_or_default(),
                            )
                        })
                        .unwrap_or_default()
                };
                return Some(ForageItem {
                    id,
                    title: meta.title.clone(),
@ -959,6 +1265,10 @@ impl ForageEngine {
                    label: ItemLabel::Bridge { cat_a, cat_b },
                    score,
                    url: meta.url.clone(),
                    tags,
                    entities,
                    content_type,
                    summary,
                });
            }
        }
@ -991,6 +1301,10 @@ impl ForageEngine {
                        label: label.clone(),
                        score,
                        url: meta.url.clone(),
                        tags: vec![],
                        entities: vec![],
                        content_type: String::new(),
                        summary: String::new(),
                    })
                } else {
                    self.db
@ -1013,6 +1327,28 @@ impl ForageEngine {
                            label: label.clone(),
                            score,
                            url: m.get("url").cloned().unwrap_or_default(),
                            tags: m
                                .get("tags")
                                .map(|s| {
                                    if s.is_empty() {
                                        vec![]
                                    } else {
                                        s.split(',').map(str::to_string).collect()
                                    }
                                })
                                .unwrap_or_default(),
                            entities: m
                                .get("entities")
                                .map(|s| {
                                    if s.is_empty() {
                                        vec![]
                                    } else {
                                        s.split(',').map(str::to_string).collect()
                                    }
                                })
                                .unwrap_or_default(),
                            content_type: m.get("content_type").cloned().unwrap_or_default(),
                            summary: m.get("summary").cloned().unwrap_or_default(),
                        })
                }
            })
@ -1030,6 +1366,81 @@ pub enum SignalKind {
 // ── Persistence helpers ───────────────────────────────────────────────────────
 /// Persist the saved and dwelled item sets to `user_state.json`.
 ///
 /// Uses an atomic temp-file+rename pattern so a crash mid-write leaves the
 /// previous file intact.  Called after every save/completion-dwell event.
 impl ForageEngine {
    fn save_user_item_state(&self) {
        let Some(ref path) = self.user_state_path else {
            return;
        };
        // Snapshot both sets outside the I/O path to keep lock hold time short.
        let saved_snapshot: HashMap<String, Vec<u64>> = {
            let guard = self.saved_items.lock().unwrap();
            guard
                .iter()
                .map(|(k, v)| (k.to_string(), v.iter().copied().collect()))
                .collect()
        };
        let dwelled_snapshot: HashMap<String, Vec<u64>> = {
            let guard = self.dwelled_items.lock().unwrap();
            guard
                .iter()
                .map(|(k, v)| (k.to_string(), v.iter().copied().collect()))
                .collect()
        };
        let Ok(json) = serde_json::to_string(&serde_json::json!({
            "saved": saved_snapshot,
            "dwelled": dwelled_snapshot,
        })) else {
            return;
        };
        let tmp = path.with_extension("tmp");
        if std::fs::write(&tmp, &json).is_ok() {
            let _ = std::fs::rename(&tmp, path);
        }
    }
 }
 /// Load saved and dwelled item sets from `user_state.json`.
 ///
 /// Returns two empty maps if the file does not exist.  Logs a warning and
 /// returns empty maps if the file is corrupt, so the engine starts fresh
 /// rather than refusing to open.
 fn load_user_item_state(
    path: &std::path::Path,
 ) -> (HashMap<u64, HashSet<u64>>, HashMap<u64, HashSet<u64>>) {
    let Some(bytes) = std::fs::read(path).ok() else {
        return (HashMap::new(), HashMap::new());
    };
    let Ok(json): std::result::Result<serde_json::Value, _> = serde_json::from_slice(&bytes) else {
        eprintln!(
            "[forage-engine] user_state.json at {:?} is corrupt; starting with empty state",
            path
        );
        return (HashMap::new(), HashMap::new());
    };
    let parse_map = |v: &serde_json::Value| -> HashMap<u64, HashSet<u64>> {
        v.as_object()
            .map(|obj| {
                obj.iter()
                    .filter_map(|(k, ids)| {
                        let user_id: u64 = k.parse().ok()?;
                        let items: HashSet<u64> = ids
                            .as_array()?
                            .iter()
                            .filter_map(serde_json::Value::as_u64)
                            .collect();
                        Some((user_id, items))
                    })
                    .collect()
            })
            .unwrap_or_default()
    };
    (parse_map(&json["saved"]), parse_map(&json["dwelled"]))
 }
 /// Deserialize the per-user `ExplorationStats` map from the JSON file at `path`.
 ///
 /// Returns `None` if the file does not exist.  Logs a warning and returns `None`
--- a/applications/forage/engine/src/mab.rs
+++ b/applications/forage/engine/src/mab.rs
@ -178,6 +178,10 @@ pub fn select(
                label,
                score: rr.score as f32,
                url: meta.url.clone(),
                tags: vec![],
                entities: vec![],
                content_type: String::new(),
                summary: String::new(),
            });
        }
    }
@ -217,6 +221,10 @@ pub fn select(
                label: ItemLabel::Exploring,
                score: rr.score as f32,
                url: meta.url.clone(),
                tags: vec![],
                entities: vec![],
                content_type: String::new(),
                summary: String::new(),
            });
        }
    }
@ -240,6 +248,10 @@ pub fn select(
                    label: ItemLabel::Resurfaced,
                    score: rr.score as f32,
                    url: meta.url.clone(),
                    tags: vec![],
                    entities: vec![],
                    content_type: String::new(),
                    summary: String::new(),
                });
            }
        }
--- a/applications/forage/engine/src/sources.rs
+++ b/applications/forage/engine/src/sources.rs
@ -0,0 +1,61 @@
 /// Per-category seed URLs for the autonomous discovery agent.
 /// Each entry is (category_name, &[source_urls]).
 /// Source URLs are front pages or list pages where article links are prominent.
 pub const SOURCES: &[(&str, &[&str])] = &[
    (
        "technology",
        &["https://news.ycombinator.com", "https://lobste.rs"],
    ),
    (
        "science",
        &["https://phys.org", "https://news.ycombinator.com?q=science"],
    ),
    (
        "jazz",
        &[
            "https://pitchfork.com/reviews/albums",
            "https://www.allaboutjazz.com/news",
        ],
    ),
    (
        "travel",
        &[
            "https://www.theguardian.com/travel",
            "https://www.cntraveler.com/latest-news",
        ],
    ),
    (
        "cooking",
        &[
            "https://www.seriouseats.com",
            "https://www.bonappetit.com/recipes",
        ],
    ),
    (
        "design",
        &["https://www.dezeen.com/news", "https://designobserver.com"],
    ),
    (
        "history",
        &[
            "https://www.historytoday.com",
            "https://www.smithsonianmag.com/history",
        ],
    ),
    (
        "health",
        &[
            "https://www.health.harvard.edu/blog",
            "https://www.theatlantic.com/health",
        ],
    ),
 ];
 /// Return the source URLs for a given category, or an empty slice if unknown.
 pub fn sources_for(category: &str) -> &'static [&'static str] {
    SOURCES
        .iter()
        .find(|(cat, _)| *cat == category)
        .map(|(_, srcs)| *srcs)
        .unwrap_or(&[])
 }
--- a/applications/forage/engine/tests/smoke.rs
+++ b/applications/forage/engine/tests/smoke.rs
@ -39,6 +39,10 @@ fn builder_with_embedder_fallback_on_unavailable_sidecar() {
            category: "technology".to_owned(),
            reading_time_min: 4,
            description: "Tests neutral vector fallback when embedder is down.".to_owned(),
            tags: vec![],
            entities: vec![],
            content_type: String::new(),
            summary: String::new(),
        })
        .expect("add_item must succeed even when embedder is unreachable");
@ -169,6 +173,10 @@ fn add_item_is_idempotent() {
        category: "technology".to_owned(),
        reading_time_min: 5,
        description: "A test article for idempotency verification.".to_owned(),
        tags: vec![],
        entities: vec![],
        content_type: String::new(),
        summary: String::new(),
    };
    let id1 = engine.add_item(input()).expect("first add_item");
@ -195,6 +203,10 @@ fn discovered_item_surfaces_in_feed() {
            category: "design".to_owned(),
            reading_time_min: 3,
            description: "A page discovered via capture.".to_owned(),
            tags: vec![],
            entities: vec![],
            content_type: String::new(),
            summary: String::new(),
        })
        .expect("add_item");
@ -524,3 +536,340 @@ fn category_signals_tracked_on_signal_write() {
        stats.category_signals
    );
 }
 // ── Browse tasks tests ──────────────────────────────────────────────────────
 /// Cold user (no signals) gets all 8 source categories at equal priority.
 #[test]
 fn browse_tasks_cold_start_equal_weights() {
    let engine = ForageEngine::ephemeral().unwrap();
    engine.seed_default_corpus().unwrap();
    // Cold user (no signals) — all 8 categories should have equal priority ~0.125.
    let plan = engine.browse_tasks(99, 5); // user 99 has no signals
    assert!(plan.should_run);
    assert_eq!(plan.limit_per_topic, 5);
    assert_eq!(plan.interval_minutes, 30);
    assert_eq!(
        plan.topics.len(),
        8,
        "all 8 source categories should be present"
    );
    // All priorities should be equal (within floating point tolerance).
    let first_priority = plan.topics[0].priority;
    for topic in &plan.topics {
        assert!(
            (topic.priority - first_priority).abs() < 1e-5,
            "cold-start topics should have equal priority, got {} and {}",
            first_priority,
            topic.priority
        );
    }
    // Every topic must have at least 1 source URL.
    for topic in &plan.topics {
        assert!(
            !topic.sources.is_empty(),
            "topic '{}' has no sources",
            topic.name
        );
    }
    // Cold start: no tag hints.
    assert!(
        plan.tag_hints.is_empty(),
        "cold user should have no tag hints"
    );
 }
 /// Warm user with jazz saves gets jazz as the highest-priority browse topic.
 #[test]
 fn browse_tasks_warm_user_top_category_ranks_first() {
    let engine = ForageEngine::ephemeral().unwrap();
    engine.seed_default_corpus().unwrap();
    // Give user 1 several saves on jazz items to build a preference.
    // Find jazz seed items and save them.
    let jazz_items: Vec<u64> = engine
        .all_items()
        .iter()
        .filter(|s| s.category == "jazz")
        .take(5)
        .map(|s| s.id)
        .collect();
    assert!(!jazz_items.is_empty(), "seed corpus should have jazz items");
    for id in &jazz_items {
        engine.signal(1, *id, SignalKind::Save).unwrap();
    }
    let plan = engine.browse_tasks(1, 5);
    // Jazz should be the highest-priority topic.
    assert!(!plan.topics.is_empty());
    assert_eq!(
        plan.topics[0].name,
        "jazz",
        "jazz should rank first after jazz saves, got: {:?}",
        plan.topics
            .iter()
            .map(|t| (&t.name, t.priority))
            .collect::<Vec<_>>()
    );
    // Jazz's priority should be higher than all other topics.
    let jazz_priority = plan.topics[0].priority;
    for other in plan.topics.iter().skip(1) {
        assert!(
            jazz_priority > other.priority,
            "jazz ({}) should outrank {} ({})",
            jazz_priority,
            other.name,
            other.priority
        );
    }
 }
 /// Tag hints are populated from saved items' tags.
 #[test]
 fn browse_tasks_tag_hints_populated_from_saves() {
    let engine = ForageEngine::ephemeral().unwrap();
    engine.seed_default_corpus().unwrap();
    // Capture an item with tags and save it.
    let input = ForageItemInput {
        url: "https://example.com/modal-jazz-article".to_string(),
        title: "A Guide to Modal Jazz".to_string(),
        source: "example.com".to_string(),
        category: "jazz".to_string(),
        reading_time_min: 8,
        description: "Deep dive into modal jazz techniques.".to_string(),
        tags: vec![
            "modal jazz".to_string(),
            "music theory".to_string(),
            "coltrane".to_string(),
        ],
        entities: vec!["John Coltrane".to_string()],
        content_type: "tutorial".to_string(),
        summary: "Explores the harmonic language of modal jazz. Coltrane is the central focus."
            .to_string(),
    };
    let item_id = engine.add_item(input).unwrap();
    // Save the item for user 1.
    engine.signal(1, item_id, SignalKind::Save).unwrap();
    let plan = engine.browse_tasks(1, 5);
    // Tag hints should contain the tags from the saved item.
    assert!(
        plan.tag_hints.contains(&"modal jazz".to_string()),
        "tag_hints should contain 'modal jazz', got: {:?}",
        plan.tag_hints
    );
    assert!(
        plan.tag_hints.contains(&"music theory".to_string()),
        "tag_hints should contain 'music theory', got: {:?}",
        plan.tag_hints
    );
 }
 // ── Top tags tests ──────────────────────────────────────────────────────────
 /// Cold user (no saves) gets empty top_tags.
 #[test]
 fn top_tags_empty_for_cold_user() {
    let engine = ForageEngine::ephemeral().unwrap();
    engine.seed_default_corpus().unwrap();
    // User 99 has no saves — top_tags should return empty.
    let tags = engine.top_tags(99, 5);
    assert!(
        tags.is_empty(),
        "cold user should have no tags, got: {:?}",
        tags
    );
 }
 /// Top tags are ordered by frequency of occurrence across saved items.
 #[test]
 fn top_tags_frequency_ranked() {
    let engine = ForageEngine::ephemeral().unwrap();
    engine.seed_default_corpus().unwrap();
    // Capture 3 items with overlapping tags and save them.
    // "rust" appears 3 times, "async" appears 2 times, "wasm" appears 1 time.
    let items = vec![
        (
            "https://example.com/rust-async",
            vec!["rust", "async", "tokio"],
        ),
        ("https://example.com/rust-wasm", vec!["rust", "wasm"]),
        ("https://example.com/rust-futures", vec!["rust", "async"]),
    ];
    for (url, tags) in items {
        let input = ForageItemInput {
            url: url.to_string(),
            title: format!("Article: {url}"),
            source: "example.com".to_string(),
            category: "technology".to_string(),
            reading_time_min: 5,
            description: String::new(),
            tags: tags.iter().map(|s| s.to_string()).collect(),
            entities: vec![],
            content_type: "tutorial".to_string(),
            summary: String::new(),
        };
        let id = engine.add_item(input).unwrap();
        engine.signal(1, id, SignalKind::Save).unwrap();
    }
    let tags = engine.top_tags(1, 5);
    // "rust" appears 3x — must be first.
    assert!(!tags.is_empty(), "should have tags after saves");
    assert_eq!(
        tags[0], "rust",
        "most frequent tag should be first, got: {:?}",
        tags
    );
    // "async" appears 2x — must rank above "wasm" (1x).
    let async_pos = tags
        .iter()
        .position(|t| t == "async")
        .expect("async should be present");
    let wasm_pos = tags
        .iter()
        .position(|t| t == "wasm")
        .expect("wasm should be present");
    assert!(
        async_pos < wasm_pos,
        "async (2x) should rank before wasm (1x)"
    );
 }
 /// Enrichment fields (tags, entities, content_type, summary) stored via `add_item`
 /// are hydrated on feed items returned by `feed()`.
 /// Regression guard for the feed enrichment hydration path added in fix-all.
 #[test]
 fn discovered_item_enrichment_preserved_in_feed() {
    let engine = ForageEngine::ephemeral().unwrap();
    engine.seed_default_corpus().unwrap();
    let item_id = engine
        .add_item(ForageItemInput {
            url: "https://example.com/enriched-article".to_string(),
            title: "Enriched Article".to_string(),
            source: "example.com".to_string(),
            category: "technology".to_string(),
            reading_time_min: 6,
            description: "An article with full enrichment metadata.".to_string(),
            tags: vec!["rust".to_string(), "async".to_string()],
            entities: vec!["Tokio".to_string()],
            content_type: "tutorial".to_string(),
            summary: "Teaches async Rust. Tokio is the runtime used throughout.".to_string(),
        })
        .unwrap();
    // Retrieve feed for a fresh user so the discovered item is injected.
    let feed = engine.feed(99, 7).unwrap();
    let item = feed
        .iter()
        .find(|i| i.id == item_id)
        .expect("discovered item should appear in feed");
    assert_eq!(
        item.tags,
        vec!["rust", "async"],
        "feed item should carry its stored tags, got: {:?}",
        item.tags
    );
    assert_eq!(
        item.entities,
        vec!["Tokio"],
        "feed item should carry its stored entities, got: {:?}",
        item.entities
    );
    assert_eq!(
        item.content_type, "tutorial",
        "feed item should carry its stored content_type"
    );
    assert!(
        !item.summary.is_empty(),
        "feed item should carry its stored summary, got empty string"
    );
 }
 /// Items the user dwelled on for ≥15 seconds contribute to `top_tags`,
 /// even if they were never explicitly saved.
 #[test]
 fn top_tags_includes_dwell_items() {
    let engine = ForageEngine::ephemeral().unwrap();
    engine.seed_default_corpus().unwrap();
    // Add an item with distinctive tags.
    let item_id = engine
        .add_item(ForageItemInput {
            url: "https://example.com/dwell-tagged-article".to_string(),
            title: "Deep Read Article".to_string(),
            source: "example.com".to_string(),
            category: "science".to_string(),
            reading_time_min: 10,
            description: "An article worth reading slowly.".to_string(),
            tags: vec!["quantum computing".to_string(), "research".to_string()],
            entities: vec![],
            content_type: "research".to_string(),
            summary: "Explores quantum error correction. Practical applications are assessed."
                .to_string(),
        })
        .unwrap();
    // Dwell ≥15 000 ms (completion threshold) without saving.
    engine.signal_dwell(88, item_id, 20_000).unwrap();
    // top_tags should include tags from the dwelled item.
    let tags = engine.top_tags(88, 5);
    assert!(
        tags.contains(&"quantum computing".to_string()),
        "top_tags should include tags from completion-dwell items; got: {:?}",
        tags
    );
    // Short dwell (< 15 s) should NOT contribute tags.
    let item_id2 = engine
        .add_item(ForageItemInput {
            url: "https://example.com/short-dwell-article".to_string(),
            title: "Brief Glance Article".to_string(),
            source: "example.com".to_string(),
            category: "science".to_string(),
            reading_time_min: 5,
            description: "Skimmed article.".to_string(),
            tags: vec!["astronomy".to_string()],
            entities: vec![],
            content_type: "news".to_string(),
            summary: "Brief overview of recent astronomy findings.".to_string(),
        })
        .unwrap();
    engine.signal_dwell(88, item_id2, 5_000).unwrap(); // only 5 seconds
    let tags_after = engine.top_tags(88, 10);
    assert!(
        !tags_after.contains(&"astronomy".to_string()),
        "short dwell (<15s) should not contribute tags; got: {:?}",
        tags_after
    );
 }
 /// `signal_dwell` is re-exported properly for test use.
 /// Quick sanity check that the method exists and accepts valid parameters.
 #[test]
 fn signal_dwell_method_exists() {
    let engine = ForageEngine::ephemeral().unwrap();
    engine.seed_default_corpus().unwrap();
    // 30 seconds of dwell on a seed item — should succeed without error.
    engine.signal_dwell(1, 1, 30_000).unwrap();
 }
--- a/applications/forage/plan.md
+++ b/applications/forage/plan.md
@ -7,7 +7,7 @@ Each phase proves something specific. Do not build phase N+1 until phase N has p
 | Phase | Proves | Delivers |
 |-------|--------|----------|
 | **P0** | The loop closes — signal in, re-rank out, observable in real time | Local server + seed data + Claude observes interactions |
-| **P1** | The Chrome extension can drive the entire signal surface from real web pages | Extension posts signals automatically from browsing behavior |
+| **P1** | Claude can discover content without the user browsing — reactions alone drive the loop | Autonomous discovery agent + browse-tasks API + source registry |
 | **P2** | Semantic search works over content Forage finds on the real web | Embedding service + real web crawl |
 | **P3** | The MAB sharpens — exploration items hit more often over time | Adaptive exploration budget, centroid tracking, exploration-hit instrumentation |
 | **P4** | The surprise moment — cross-centroid discoveries emerge naturally | Multi-session preference evolution, intersection surfacing |
@ -121,36 +121,265 @@ This is the demo. This is the proof-of-concept that makes the thesis visible.
 ---
-## Phase 1 — Real Signal Surface
+## Phase 1 — Autonomous Discovery Loop
-**Goal:** The Chrome extension captures signals from real browsing behavior, not just the demo feed page.
+**Goal:** Claude discovers content proactively. The user never browses. The loop closes entirely through reactions to what Claude finds.
-**What changes:**
+**Thesis:** A personalized feed can be driven without the user visiting a single page. Claude browses on behalf of the user, Forage ranks what it finds, and the user's reactions (save/skip/dwell) teach Claude where to look next. The feedback signal is not visits — it is choices.
-Claude uses `javascript_tool` to inject a lightweight signal collector on pages it navigates to:
+**The loop:**
-```js
+
-// injected on each visited page via javascript_tool
+```
-const title = document.title;
+Background task in forage-server
-const url = location.href;
+    ↓ emits browse-tasks (topics weighted by user preference + tag affinity, source list)
-const readingTime = Math.round(document.body.innerText.split(/\s+/).length / 200);
+Claude (--chrome, persistent session)
-// POST to forage-server: add item if unknown, write "view" signal
+    ↓ navigates sources → finds article links
-fetch('http://localhost:4242/signal', { method: 'POST', ... });
+    ↓ reads each article in full
-// After 30s dwell, fire "dwell" signal
+    ↓ analyses: topics, entities, content type, summary, quality
-setTimeout(() => fetch(...), 30_000);
+    ↓ POST /capture with enriched metadata
 forage-server
    ↓ stores rich metadata, fires view signals, broadcasts via SSE
 Feed page (localhost:4242)
    ↓ shows enriched cards (tags, content type, entities, Claude summary) live
 User reacts (save / skip / dwell)
    ↓ signals update preference vector AND tag affinity counters
 Next browse-tasks call
    ↓ returns topics + tag weights → Claude targets specific subtopics next cycle
 Loop repeats with higher precision
 ```
-`ForageEngine` gains an `add_item` method — engine API extends to:
+**Why Claude's enrichment matters here:**
 A JavaScript content script extracts what the page declares about itself. Claude reads and understands what the page actually says. These are different things.
 - `<meta name="description">` on a jazz article: *"The latest from Blue Note Records"*
 - Claude's analysis: `topics: ["hard bop", "trumpet"], entities: ["Lee Morgan", "Blue Note"], content_type: "review", summary: "A career retrospective on Lee Morgan's 1960s Blue Note recordings, focusing on his development of the hard bop trumpet style."`
 The preference model runs on Claude's output, not the page's self-description. This is what makes tag-level personalization possible before real embeddings arrive in P2.
 ---
 ### What we build
 #### Source Registry (in `forage-engine`)
 A hardcoded per-category list of seed URLs Claude can navigate to find articles. Each source is a page where the top-level links are articles (list pages, front pages, RSS-style feeds).
 ```
 technology:  news.ycombinator.com, lobste.rs
 science:     phys.org, news.ycombinator.com?q=science
 jazz:        pitchfork.com/reviews/albums, allaboutjazz.com/news
 travel:      theguardian.com/travel, cntraveler.com/latest-news
 cooking:     seriouseats.com, bonappetit.com/recipe
 design:      designobserver.com, dezeen.com/news
 history:     historytoday.com, smithsonianmag.com/history
 health:      health.harvard.edu/blog, theatlantic.com/health
 ```
 `ForageEngine` gains:
 ```rust
-pub fn add_item(&self, item: ForageItemInput) -> Result<u64>  // returns item_id
+pub fn browse_tasks(&self, user_id: u64, limit_per_topic: usize) -> BrowsePlan
 ```
-The feed page now shows a mix of:
+`BrowsePlan` contains:
- Seed items (known corpus)
+- `topics: Vec<BrowseTopic>` — ordered by preference weight + tag affinity, cold-start gets equal weight across all 8
- Items the user actually visited (added via `add_item`)
+- `limit_per_topic: usize` — how many articles to capture per source
 - `should_run: bool` — false if last discovery was recent and feed has ≥5 items
 - `tag_hints: Vec<String>` — top tags from saved/dwelled items the agent should bias toward within each topic (e.g. `["modal jazz", "improvisation"]` tells Claude to prefer theory-heavy jazz sources over jazz news)
-**No publishable Chrome extension is built.** Claude is the browsing agent. The signal injection is Claude executing JS on pages it visits.
+#### `GET /browse-tasks` (forage-server)
-**Proves:** tidalDB can serve as a memory layer for real browsing behavior, not just a demo corpus.
+Returns a `BrowsePlan` as JSON for user 1:
 ```json
 {
  "should_run": true,
  "interval_minutes": 30,
  "limit_per_topic": 5,
  "tag_hints": ["modal jazz", "improvisation", "music theory"],
  "topics": [
    { "name": "jazz",       "priority": 0.72, "sources": ["pitchfork.com/reviews/albums", "allaboutjazz.com/news"] },
    { "name": "technology", "priority": 0.51, "sources": ["news.ycombinator.com", "lobste.rs"] },
    { "name": "science",    "priority": 0.28, "sources": ["phys.org"] }
  ]
 }
 ```
 `tag_hints` comes from the top tags across the user's positively-signaled items (save + dwell ≥15s). The agent uses these to bias which articles it chooses to read in depth within each source — skipping news roundups and prioritizing analysis pieces that match the hints.
 Cold-start response: all 8 categories at equal `priority: 0.125`, 2 sources each, `tag_hints: []`.
 #### `POST /discovery/heartbeat` (forage-server)
 Agent calls this on every cycle start. Server records `agent_last_seen` timestamp. Used by feed page to show connection status.
 #### `GET /discovery/status` (forage-server)
 ```json
 {
  "agent_connected": true,
  "last_discovery_at": "2026-02-24T10:30:00Z",
  "items_found_last_run": 23,
  "next_run_in_minutes": 12
 }
 ```
 `agent_connected: true` when `agent_last_seen` is within the last 5 minutes.
 #### Discovery state in `AppState`
 ```rust
 pub struct DiscoveryState {
    pub last_discovery_at: Mutex<Option<std::time::Instant>>,
    pub agent_last_seen:   Mutex<Option<std::time::Instant>>,
    pub items_last_run:    Mutex<u32>,
 }
 ```
 Added to `AppState` alongside `engine` and `events`. Handlers update it; feed page polls it.
 #### Feed page status indicator
 A small status bar below the header:
 - `●  Active — last run 4 min ago` (green dot) — `agent_connected: true`
 - `○  Agent not connected` (grey dot) — no heartbeat in 5 min
 - `⟳  Discovering...` (spinning) — between heartbeat and items appearing
 #### Enriched capture payload
 `ForageItemInput` and `POST /capture` gain Claude-specific fields:
 ```rust
 pub struct ForageItemInput {
    pub url: String,
    pub title: String,
    pub source: String,
    pub category: String,
    pub reading_time_min: u32,
    pub description: String,
    // Claude-enriched fields (all optional; empty = not provided)
    pub tags: Vec<String>,       // specific subtopics: ["modal jazz", "music theory", "john coltrane"]
    pub entities: Vec<String>,   // named entities: ["John Coltrane", "Blue Note Records"]
    pub content_type: String,    // "analysis" | "news" | "tutorial" | "opinion" | "review" | "interview" | "research" | ""
    pub summary: String,         // Claude's 2-sentence summary of what the article actually says
 }
 ```
 All fields serialized into item metadata storage. `tags` stored as `"tags"` (comma-separated string) so existing metadata retrieval works without schema changes.
 `CaptureReq` in `handlers.rs` gains the same optional fields with `#[serde(default)]`.
 #### Tag affinity in `ForageEngine`
 `top_tags(user_id, limit) -> Vec<String>` — scans metadata of positively-signaled items (save + strong dwell), splits the `"tags"` metadata field, returns the top-N by frequency. Used to populate `tag_hints` in `BrowsePlan`.
 No schema changes to tidalDB. Tag affinity runs entirely over item metadata; the preference vector stays 8-dimensional and tracks category-level signal. Tags are a secondary signal that guides the agent's article selection within a source, not a replacement for the embedding.
 #### Enriched feed cards
 Feed cards gain three new display elements:
 - **Tag chips** — top 3 tags from the item, rendered as small outlined badges below the description. Tap a tag → future feed cards filtered/boosted for that tag (stored as a `localStorage` tag preference that biases the next `/browse-tasks` call via a `?prefer_tags=` query param)
 - **Content type badge** — right of the category chip, distinct color: `analysis` (blue), `tutorial` (green), `news` (grey), `opinion` (amber), `review` (purple)
 - **Claude summary** — shown instead of the meta description when non-empty; clearly signals what Claude learned from reading, not what the page says about itself
 #### The discovery agent prompt
 A file at `applications/forage/agent.md` — the instruction set Claude runs with `--chrome`.
 Core loop:
 1. `GET localhost:4242/browse-tasks`
 2. If `should_run: false` → wait `interval_minutes`, repeat from step 1
 3. `POST localhost:4242/discovery/heartbeat`
 4. For each topic (in priority order):
   - For each source URL:
     - Navigate to the source page
     - Find article links on the page (exclude nav, footer, sidebar links; prefer main content area)
     - **Select** up to `limit_per_topic` articles that appear relevant to `tag_hints` (if hints are present); prefer depth over breadth — analysis and tutorial pieces over news roundups
     - For each selected article:
       - Navigate to the article
       - **Read the full page text** (`get_page_text`)
       - **Analyse**:
         - `title` — headline (from `<h1>` if better than `<title>`)
         - `canonical_url` — from `<link rel="canonical">`
         - `reading_time_min` — word count ÷ 200, rounded up
         - `tags` — 2–5 specific subtopic tags, lowercase, singular nouns or short phrases (e.g. `"modal jazz"` not `"jazz"`)
         - `entities` — up to 5 named people, companies, technologies, or places central to the article
         - `content_type` — one of: `analysis`, `news`, `tutorial`, `opinion`, `review`, `interview`, `research`
         - `summary` — 2 sentences: what the article argues or reports, not what the site says about it
       - Skip if: title is empty, contains "Sign In" / "Subscribe" / "Login" / "Create Account", or URL is localhost / chrome://
       - `POST localhost:4242/capture` with all enriched fields
       - Wait 1–2 seconds (politeness)
 5. Wait `interval_minutes` minutes
 6. Repeat
 **What Claude must NOT do:** summarise the meta description. The point is that Claude reads the article and describes what it actually contains. A meta description that says "Read our latest article on jazz" is useless. Claude's summary should say "Argues that Coltrane's 1965 transition to free jazz was less a rejection of hard bop than an extension of it into harmonic territory bebop had not explored."
 #### Invocation
 One shell script at repo root:
 ```bash
 #!/usr/bin/env bash
 # forage-discover.sh — start the Forage discovery agent
 # Prerequisites: forage-server running at localhost:4242, claude CLI with --chrome support
 claude --chrome "$(cat applications/forage/agent.md)"
 ```
 User starts the system with two terminal tabs:
 ```bash
 # Tab 1 — server
 cargo run -p forage-server --manifest-path applications/forage/server/Cargo.toml
 # Tab 2 — agent
 ./forage-discover.sh
 ```
 Then opens `localhost:4242` and reacts.
 ---
 ### Edge cases
 | Situation | Handled by |
 |-----------|------------|
 | Cold start (no prefs, no tags) | Equal weight all 8 categories, `tag_hints: []`, agent reads broadly |
 | Agent not running | Feed shows "Agent not connected"; `should_run: true` stays set |
 | Navigation 404 / timeout | Agent skips to next URL, cycle continues |
 | Paywall / login page | Agent skips on title check ("Sign In", blank, "Subscribe") |
 | Empty title | `POST /capture` returns 400; agent skips |
 | Duplicate URL | `add_item` idempotent via FNV-1a; same ID, no duplicate; enrichment not re-written |
 | Feed sparse (< 5 items) | `should_run: true` overrides interval immediately |
 | Two agents running | Both browse; idempotent captures; harmless double coverage |
 | Server restart | `last_discovery_at` resets to null; agent runs on next cycle |
 | Prefs shift mid-cycle | Current cycle finishes with old plan; next call picks up new weights |
 | Claude context grows | Agent processes one topic at a time, not all sources in one turn |
 | Rate limit (HTTP 429) | Agent skips source, logs, continues to next |
 | Article has no meta description | Agent uses first paragraph or derives from full read; summary field carries real content |
 | Tags on first article in a new category | Tags come from Claude's reading, not from existing tag history; tag affinity starts building immediately |
 | User taps tag chip on feed card | Stored as `localStorage` tag preference; next `/browse-tasks?prefer_tags=modal+jazz` biases hints |
 | Item enrichment fails (Claude unsure) | Fields default to empty string / empty array; `POST /capture` still succeeds; card renders with basic metadata only |
 ---
 ### What this does NOT build
 - A Chrome Web Store extension
 - Server-push to Claude (server cannot initiate Claude actions; Claude polls)
 - Per-user discovery (single user, user 1, as per multi-user scope constraint)
 - Configurable source lists via UI (source registry is hardcoded for P1)
 ---
 ### Acceptance criteria
 1. Two terminal commands start the full system: one for the server, one for the agent
 2. Within 5 minutes of starting, the feed contains ≥10 real articles discovered by Claude
 3. Items appear in the feed in real-time via SSE as Claude captures them (no manual refresh)
 4. Feed page shows `●  Active` status while the agent is running
 5. ≥80% of captured items have non-empty `tags`, `content_type`, and `summary` fields — Claude is analysing, not just extracting
 6. At least one feed card's summary is observably different from and more informative than its meta description
 7. After saving ≥5 items tagged `"modal jazz"`, the next `/browse-tasks` response includes `"modal jazz"` in `tag_hints`
 8. After 20 user reactions (≥5 saves on jazz items), the next `/browse-tasks` response ranks jazz sources first
 9. A navigation failure (404, timeout) during a discovery cycle does not crash the agent or the server
 10. Re-running the agent after a server restart re-populates the feed within one cycle (items already in DB are not re-added)
 ---
@ -167,6 +396,12 @@ POST /embed  { text: string } → { vector: f32[1536] }
 Default: OpenAI `text-embedding-3-small`. Swappable. Forage calls this when writing new items.
 The text embedded is now significantly richer than P0's title-only approach. With Claude's enrichment from P1 available, the embedder receives:
 ```
 {title} — {summary}. Topics: {tags}. Entities: {entities}.
 ```
 This embeds Claude's understanding of the article, not the page's self-description. The preference centroid that emerges is a semantic model of what the user actually engages with.
 With real embeddings:
 - `SearchBuilder::semantic("jazz theory")` works for real
 - `SearchBuilder::similar_to(item_id)` produces genuine similarity
--- a/applications/forage/server/Cargo.toml
+++ b/applications/forage/server/Cargo.toml
@ -15,6 +15,9 @@ axum = "0.7"
 tokio = { version = "1", features = ["full"] }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
-tower-http = { version = "0.5", features = ["cors", "fs"] }
+tower-http = { version = "0.5", features = ["cors", "fs", "trace"] }
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 tokio-stream = { version = "0.1", features = ["sync"] }
 clap = { version = "4", features = ["derive"] }
 dirs-next = "2"
 chrono = { version = "0.4", features = ["serde"] }
--- a/applications/forage/server/src/handlers.rs
+++ b/applications/forage/server/src/handlers.rs
@ -1,15 +1,39 @@
 use std::convert::Infallible;
 use std::sync::Arc;
 use axum::Json;
 use axum::extract::{Query, State};
 use axum::http::{HeaderMap, StatusCode, header::AUTHORIZATION};
-use axum::response::IntoResponse;
+use axum::response::sse::{Event, KeepAlive, Sse};
 use axum::response::{IntoResponse, Response};
 use serde::{Deserialize, Serialize};
 use tokio_stream::StreamExt as _;
 use tokio_stream::wrappers::BroadcastStream;
 use forage_engine::{ForageItemInput, SignalKind};
 use crate::AppState;
 // ── SSE event type ────────────────────────────────────────────────────────────
 /// Payload pushed to feed pages over SSE when a new item is captured.
 /// Contains everything `makeCard()` in the frontend needs to render a card
 /// without an additional fetch.
 #[derive(Clone, Serialize)]
 pub struct CaptureEvent {
    pub item_id: u64,
    pub title: String,
    pub url: String,
    pub source: String,
    pub reading_time_min: u32,
    pub description: String,
    pub category: String,
    pub tags: Vec<String>,
    pub entities: Vec<String>,
    pub content_type: String,
    pub summary: String,
 }
 // ── Auth helper ───────────────────────────────────────────────────────────────
 /// Check the `Authorization: Bearer <token>` header.
@ -112,6 +136,14 @@ pub struct CaptureReq {
    pub reading_time_min: u32,
    #[serde(default = "default_user")]
    pub user_id: u64,
    #[serde(default)]
    pub tags: Vec<String>,
    #[serde(default)]
    pub entities: Vec<String>,
    #[serde(default)]
    pub content_type: String,
    #[serde(default)]
    pub summary: String,
 }
 fn default_reading_time() -> u32 {
@ -154,6 +186,18 @@ pub async fn post_capture(
        );
    }
    // Snapshot fields for the SSE broadcast before they're moved into ForageItemInput.
    let sse_url = effective_url.clone();
    let sse_title = req.title.clone();
    let sse_source = req.source.clone();
    let sse_category = req.category.clone();
    let sse_description = req.description.clone();
    let sse_reading_time_min = req.reading_time_min;
    let sse_tags = req.tags.clone();
    let sse_entities = req.entities.clone();
    let sse_content_type = req.content_type.clone();
    let sse_summary = req.summary.clone();
    let input = ForageItemInput {
        url: effective_url,
        title: req.title,
@ -161,6 +205,10 @@ pub async fn post_capture(
        category: req.category,
        description: req.description,
        reading_time_min: req.reading_time_min,
        tags: req.tags,
        entities: req.entities,
        content_type: req.content_type,
        summary: req.summary,
    };
    // add_item may call the embedding sidecar (blocking HTTP). Wrap in
@ -186,6 +234,28 @@ pub async fn post_capture(
    if let Err(e) = state.engine.signal(req.user_id, item_id, SignalKind::View) {
        eprintln!("[forage-server] /capture: view signal failed for item {item_id}: {e}");
    }
    // Push the new item to any open SSE connections.  No connected clients is fine.
    let _ = state.events.send(CaptureEvent {
        item_id,
        title: sse_title,
        url: sse_url,
        source: sse_source,
        reading_time_min: sse_reading_time_min,
        description: sse_description,
        category: sse_category,
        tags: sse_tags,
        entities: sse_entities,
        content_type: sse_content_type,
        summary: sse_summary,
    });
    // Update discovery timestamp on every successful capture.
    *state.discovery.last_discovery_at.lock().await = Some(std::time::SystemTime::now());
    // Increment items count.
    let mut count = state.discovery.items_last_run.lock().await;
    *count += 1;
    (
        StatusCode::OK,
        Json(serde_json::json!({"item_id": item_id, "ok": true})),
@ -291,50 +361,169 @@ pub async fn get_items(
    (StatusCode::OK, Json(serde_json::json!(items)))
 }
-// ── POST /onboard ─────────────────────────────────────────────────────────────
+// ── GET /events ───────────────────────────────────────────────────────────────
 //
-// Bootstraps a cold user's preference vector by firing synthetic save signals
+// SSE stream that pushes a CaptureEvent JSON blob every time a page is captured.
-// for seed items in each selected category.
+// `EventSource` in browsers cannot send custom headers, so auth uses a `?token=`
 // query parameter instead of the `Authorization` header.
 #[derive(Deserialize)]
-pub struct OnboardReq {
+pub struct EventsQuery {
-    pub user_id: u64,
+    #[serde(default)]
-    pub categories: Vec<String>,
+    pub token: String,
 }
-pub async fn post_onboard(
+pub async fn get_events(
    State(state): State<Arc<AppState>>,
    Query(params): Query<EventsQuery>,
 ) -> Response {
    if !state.token.is_empty() && params.token != state.token {
        return (StatusCode::UNAUTHORIZED, "unauthorized").into_response();
    }
    let rx = state.events.subscribe();
    let stream = BroadcastStream::new(rx).filter_map(|msg| match msg {
        Ok(event) => {
            let data = serde_json::to_string(&event).unwrap_or_default();
            Some(Ok::<Event, Infallible>(Event::default().data(data)))
        }
        // Lagged — receiver fell too far behind; drop the event.
        Err(_) => None,
    });
    Sse::new(stream)
        .keep_alive(KeepAlive::default())
        .into_response()
 }
 // ── GET /browse-tasks ─────────────────────────────────────────────────────────
 //
 // Returns a BrowsePlan telling the discovery agent which topics to browse,
 // how many articles to capture per source, and tag hints from the user's
 // engagement history.
 #[derive(Deserialize)]
 pub struct BrowseTasksQuery {
    #[serde(default)]
    pub prefer_tags: String,
 }
 pub async fn get_browse_tasks(
    State(state): State<Arc<AppState>>,
    headers: HeaderMap,
    Query(q): Query<BrowseTasksQuery>,
 ) -> impl IntoResponse {
    if let Err(e) = auth_check(&state.token, &headers) {
        return e;
    }
    let mut plan = state.engine.browse_tasks(1, 5);
    // Override should_run based on discovery recency. The engine always returns
    // true (it has no access to DiscoveryState), so we apply the gate here.
    let interval_secs = u64::from(plan.interval_minutes) * 60;
    {
        let last = state.discovery.last_discovery_at.lock().await;
        let count = *state.discovery.items_last_run.lock().await;
        if let Some(t) = *last
            && t.elapsed()
                .map(|e| e.as_secs() < interval_secs)
                .unwrap_or(false)
            && count >= 5
        {
            plan.should_run = false;
        }
    }
    // Merge prefer_tags into tag_hints (user-specified hints take priority).
    if !q.prefer_tags.is_empty() {
        let user_tags: Vec<String> = q
            .prefer_tags
            .split(',')
            .map(|t| t.trim().to_lowercase())
            .filter(|t| !t.is_empty())
            .collect();
        // Prepend user tags, then append engine tags that aren't already present.
        let existing: std::collections::HashSet<String> = user_tags.iter().cloned().collect();
        let mut merged = user_tags;
        for tag in plan.tag_hints {
            if !existing.contains(&tag) {
                merged.push(tag);
            }
        }
        plan.tag_hints = merged;
    }
    (
        StatusCode::OK,
        Json(serde_json::to_value(&plan).unwrap_or_default()),
    )
 }
 // ── POST /heartbeat ───────────────────────────────────────────────────────────
 //
 // Called periodically by the discovery agent to signal it is alive.
 pub async fn post_heartbeat(
    State(state): State<Arc<AppState>>,
    headers: HeaderMap,
 ) -> impl IntoResponse {
    if let Err(e) = auth_check(&state.token, &headers) {
        return e;
    }
    *state.discovery.agent_last_seen.lock().await = Some(std::time::Instant::now());
    // Reset the per-cycle item counter. Heartbeat fires at the start of each
    // discovery cycle, so this tracks "items found in the most recent cycle."
    *state.discovery.items_last_run.lock().await = 0;
    (StatusCode::OK, Json(serde_json::json!({"ok": true})))
 }
 // ── GET /discovery/status ────────────────────────────────────────────────────
 //
 // Returns the current state of the autonomous discovery loop: whether the agent
 // is connected, when the last run happened, how many items were found, and when
 // the next run is expected.
 pub async fn get_discovery_status(
    State(state): State<Arc<AppState>>,
    headers: HeaderMap,
    Json(req): Json<OnboardReq>,
 ) -> impl IntoResponse {
    if let Err(e) = auth_check(&state.token, &headers) {
        return e;
    }
-    let mut bootstrapped = 0usize;
+    let interval_secs = 30 * 60u64; // 30 minutes
-    for cat in &req.categories {
+
-        // Find up to 3 seed items matching this category and fire save signals.
+    let agent_connected = {
-        let matching_ids: Vec<u64> = state
+        let seen = state.discovery.agent_last_seen.lock().await;
-            .engine
+        seen.map(|t| t.elapsed().as_secs() < 300).unwrap_or(false)
-            .all_items()
+    };
-            .iter()
+
-            .filter(|s| s.category.eq_ignore_ascii_case(cat))
+    let (last_discovery_at_str, next_run_in_minutes) = {
-            .take(3)
+        let last = state.discovery.last_discovery_at.lock().await;
-            .map(|s| s.id)
+        match *last {
-            .collect();
+            None => (serde_json::Value::Null, 0u64),
-        for item_id in matching_ids {
+            Some(t) => {
-            if state
+                let elapsed = t.elapsed().unwrap_or_default().as_secs();
-                .engine
+                let next = if elapsed >= interval_secs {
-                .signal(req.user_id, item_id, SignalKind::Save)
+                    0
-                .is_ok()
+                } else {
-            {
+                    (interval_secs - elapsed) / 60
-                bootstrapped += 1;
+                };
-            }
+                let dt: chrono::DateTime<chrono::Utc> = t.into();
                (serde_json::Value::String(dt.to_rfc3339()), next)
            }
        }
    };
    let items_last_run = *state.discovery.items_last_run.lock().await;
    (
        StatusCode::OK,
-        Json(serde_json::json!({ "ok": true, "bootstrapped_count": bootstrapped })),
+        Json(serde_json::json!({
            "agent_connected": agent_connected,
            "last_discovery_at": last_discovery_at_str,
            "items_found_last_run": items_last_run,
            "next_run_in_minutes": next_run_in_minutes,
        })),
    )
 }
--- a/applications/forage/server/src/main.rs
+++ b/applications/forage/server/src/main.rs
@ -6,11 +6,43 @@ use axum::Router;
 use axum::routing::{get, post};
 use clap::Parser;
 use forage_engine::ForageEngine;
 use tokio::sync::broadcast;
 use tower_http::cors::CorsLayer;
 use tower_http::services::ServeDir;
 mod handlers;
 /// Tracks autonomous discovery agent state.
 ///
 /// Handlers read/write this to report agent liveness and run history.
 /// All fields are async Mutexes so SSE handlers (which are async) can
 /// update them without blocking the Tokio runtime.
 pub struct DiscoveryState {
    /// Wall-clock timestamp of the last successful `POST /capture` from the discovery agent.
    /// Stored as `SystemTime` so it can be formatted as ISO 8601 in `/discovery/status`.
    pub last_discovery_at: tokio::sync::Mutex<Option<std::time::SystemTime>>,
    /// Timestamp of the last `POST /discovery/heartbeat` from the agent.
    pub agent_last_seen: tokio::sync::Mutex<Option<std::time::Instant>>,
    /// Number of items captured in the most recently completed discovery cycle.
    pub items_last_run: tokio::sync::Mutex<u32>,
 }
 impl Default for DiscoveryState {
    fn default() -> Self {
        Self {
            last_discovery_at: tokio::sync::Mutex::new(None),
            agent_last_seen: tokio::sync::Mutex::new(None),
            items_last_run: tokio::sync::Mutex::new(0),
        }
    }
 }
 impl DiscoveryState {
    pub fn new() -> Self {
        Self::default()
    }
 }
 /// Shared application state passed to every handler via Axum's `State` extractor.
 pub struct AppState {
    /// The Forage engine (tidalDB wrapper + MAB logic).
@ -18,6 +50,12 @@ pub struct AppState {
    /// Static bearer token for single-user auth.  Empty string means auth is
    /// disabled (the default, for backwards-compatible local dev).
    pub token: String,
    /// Broadcast channel for SSE push to connected feed pages.
    /// Sent on every successful `/capture`. Capacity of 64 is generous for
    /// a single local user — lagged receivers simply drop old events.
    pub events: broadcast::Sender<handlers::CaptureEvent>,
    /// State for the autonomous discovery loop (heartbeat, run history).
    pub discovery: Arc<DiscoveryState>,
 }
 #[derive(Parser)]
@ -32,8 +70,8 @@ struct Args {
    data_dir: Option<PathBuf>,
    /// URL of the forage-embedder sidecar (e.g. http://localhost:4243).
-    /// When set, add_item and seed_default_corpus call the sidecar for
+    /// When set, add_item calls the sidecar for 1536-dim semantic vectors
-    /// 1536-dim semantic vectors instead of 8-dim category-axis vectors.
+    /// instead of 8-dim category-axis vectors.
    /// Requires forage-embedder to be running before the server starts.
    #[arg(long)]
    embedder: Option<String>,
@ -56,6 +94,13 @@ struct Args {
 #[tokio::main]
 async fn main() {
    tracing_subscriber::fmt()
        .with_env_filter(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| "forage_server=debug,tower_http=info".parse().unwrap()),
        )
        .init();
    let args = Args::parse();
    let mut builder = ForageEngine::builder();
@ -79,8 +124,8 @@ async fn main() {
        );
    }
    let engine = builder.open().expect("failed to open engine");
    engine.seed_default_corpus().expect("failed to seed corpus");
    let (events_tx, _) = broadcast::channel::<handlers::CaptureEvent>(64);
    let token = args.token.unwrap_or_default();
    if !token.is_empty() {
        eprintln!("[forage-server] auth: token required");
@ -89,6 +134,8 @@ async fn main() {
    let state = Arc::new(AppState {
        engine: Arc::new(engine),
        token,
        events: events_tx,
        discovery: Arc::new(DiscoveryState::new()),
    });
    // Resolve static file directory.
@ -111,7 +158,10 @@ async fn main() {
        .route("/feed", get(handlers::get_feed))
        .route("/prefs", get(handlers::get_prefs))
        .route("/items", get(handlers::get_items))
-        .route("/onboard", post(handlers::post_onboard))
+        .route("/events", get(handlers::get_events))
        .route("/browse-tasks", get(handlers::get_browse_tasks))
        .route("/discovery/heartbeat", post(handlers::post_heartbeat))
        .route("/discovery/status", get(handlers::get_discovery_status))
        .nest_service("/", ServeDir::new(static_dir))
        .layer(CorsLayer::permissive())
        .with_state(state);
--- a/applications/forage/server/static/index.html
+++ b/applications/forage/server/static/index.html
@ -55,24 +55,67 @@
    #auth-submit:hover { background: #223e22; }
    #auth-error { font-size: 0.78rem; color: #f87171; min-height: 18px; }
-    /* ── Onboarding overlay ── */
+    /* ── Discovery status bar ── */
-    #onboard-overlay { display: none; position: fixed; inset: 0; background: rgba(0,0,0,0.92); z-index: 150; align-items: center; justify-content: center; }
+    #discovery-status {
-    #onboard-overlay.show { display: flex; }
+      display: flex; align-items: center; gap: 8px;
-    #onboard-box { background: #1a1a1a; border: 1px solid #333; border-radius: 14px; padding: 32px; width: min(480px, 90vw); display: flex; flex-direction: column; gap: 20px; }
+      padding: 6px 24px; font-size: 0.78rem; color: #555;
-    #onboard-box h2 { font-size: 1.2rem; color: #fff; }
+      border-bottom: 1px solid #1a1a1a;
    #onboard-box p { font-size: 0.85rem; color: #888; line-height: 1.5; }
    #onboard-chips { display: flex; flex-wrap: wrap; gap: 10px; }
    .onboard-chip {
      font-size: 0.82rem; font-weight: 600; padding: 7px 16px; border-radius: 999px;
      border: 1px solid #333; background: #222; color: #aaa; cursor: pointer;
      transition: background 0.15s, border-color 0.15s, color 0.15s;
      text-transform: capitalize;
    }
-    .onboard-chip.selected { background: #0d2e1a; border-color: #4ade80; color: #4ade80; }
+    #discovery-status .dot {
-    #onboard-submit { background: #1a2e1a; border: 1px solid #4ade80; color: #4ade80; padding: 10px; border-radius: 8px; font-size: 0.88rem; cursor: pointer; opacity: 0.4; pointer-events: none; transition: opacity 0.2s; }
+      width: 7px; height: 7px; border-radius: 50%; background: #333; flex-shrink: 0;
-    #onboard-submit.ready { opacity: 1; pointer-events: auto; }
+    }
-    #onboard-submit:hover.ready { background: #223e22; }
+    #discovery-status .dot.active { background: #4ade80; }
-    #onboard-hint { font-size: 0.76rem; color: #555; }
+    #discovery-status .dot.discovering {
      background: #fb923c;
      animation: pulse 1.2s ease-in-out infinite;
    }
    @keyframes pulse { 0%,100%{opacity:1} 50%{opacity:0.4} }
    /* ── Tag chips ── */
    .card-tags { display: flex; flex-wrap: wrap; gap: 6px; }
    .chip-tag {
      font-size: 0.68rem; padding: 2px 7px; border-radius: 999px;
      border: 1px solid #333; color: #888; background: transparent;
      cursor: pointer; transition: border-color 0.15s, background 0.15s, color 0.15s;
    }
    .chip-tag:hover { border-color: #555; color: #bbb; }
    .chip-tag-selected {
      border-color: #4ade80; background: #1a3a22; color: #4ade80;
    }
    .chip-tag-selected:hover { border-color: #6aee9a; background: #22472a; }
    /* ── Tag preferences bar ── */
    #tag-prefs-bar {
      padding: 4px 24px 6px; font-size: 0.76rem; color: #555;
      border-bottom: 1px solid #1a1a1a; min-height: 24px;
    }
    #tag-prefs-bar .tag-prefs-clear {
      margin-left: 8px; font-size: 0.72rem; color: #4ade80;
      cursor: pointer; text-decoration: underline; background: none;
      border: none; padding: 0; font-family: inherit;
    }
    #tag-prefs-bar .tag-prefs-clear:hover { color: #6aee9a; }
    /* ── Content type badge ── */
    .chip-content-type {
      font-size: 0.68rem; font-weight: 600; padding: 2px 7px; border-radius: 999px;
      text-transform: uppercase; letter-spacing: 0.04em;
    }
    .chip-ct-analysis  { background: #0d1e3a; color: #60a5fa; }
    .chip-ct-tutorial  { background: #0d2e1a; color: #4ade80; }
    .chip-ct-news      { background: #222; color: #888; }
    .chip-ct-opinion   { background: #2e1e0a; color: #fb923c; }
    .chip-ct-review    { background: #1e0d2e; color: #c084fc; }
    .chip-ct-interview { background: #0d2e2a; color: #2dd4bf; }
    .chip-ct-research  { background: #1a2e0d; color: #a3e635; }
    /* ── SSE new-capture flash ── */
    @keyframes newCapture {
      0%   { border-color: #4ade80; box-shadow: 0 0 14px rgba(74,222,128,0.35); }
      100% { border-color: #2a2a2a; box-shadow: none; }
    }
    .card.new-capture { animation: newCapture 2s ease-out forwards; }
  </style>
 </head>
 <body>
@ -88,6 +131,11 @@
  <span id="interests"></span>
  <span id="status"></span>
 </header>
 <div id="discovery-status">
  <span class="dot" id="agent-dot"></span>
  <span id="agent-label">Checking agent status…</span>
 </div>
 <div id="tag-prefs-bar"></div>
 <div id="feed"><div class="loading">Loading feed…</div></div>
 <div id="toast"></div>
@ -102,17 +150,6 @@
  </div>
 </div>
 <!-- Onboarding overlay: shown to cold-start users -->
 <div id="onboard-overlay">
  <div id="onboard-box">
    <h2>What do you want to read?</h2>
    <p>Pick one or more topics to get started. Forage will personalise your feed as you read.</p>
    <div id="onboard-chips"></div>
    <div id="onboard-hint">Select at least one topic</div>
    <button id="onboard-submit">Start reading →</button>
  </div>
 </div>
 <script>
  let currentUser = 1;
  let itemMeta = {};
@ -181,82 +218,38 @@
    if (e.key === 'Enter') document.getElementById('auth-submit').click();
  });
-  // ── Onboarding overlay ──────────────────────────────────────────────────────
+  // ── Tag preferences ─────────────────────────────────────────────────────────
-  const ONBOARD_CATEGORIES = [
+  function getTagPrefs() {
-    'technology', 'science', 'jazz', 'travel',
+    return localStorage.getItem('forage_tag_prefs')?.split(',').filter(Boolean) ?? [];
    'cooking', 'design', 'history', 'health',
  ];
  function isOnboarded(userId) {
    try {
      const ids = JSON.parse(localStorage.getItem('forage_onboarded_users') || '[]');
      return ids.includes(userId);
    } catch { return false; }
  }
-  function markOnboarded(userId) {
+  function setTagPrefs(tags) {
-    try {
+    localStorage.setItem('forage_tag_prefs', tags.join(','));
      const ids = JSON.parse(localStorage.getItem('forage_onboarded_users') || '[]');
      if (!ids.includes(userId)) ids.push(userId);
      localStorage.setItem('forage_onboarded_users', JSON.stringify(ids));
    } catch {}
  }
-  function buildOnboardChips() {
+  function updateTagPrefsBar() {
-    const container = document.getElementById('onboard-chips');
+    const bar = document.getElementById('tag-prefs-bar');
-    container.innerHTML = '';
+    if (!bar) return;
-    ONBOARD_CATEGORIES.forEach(cat => {
+    bar.innerHTML = '';
-      const chip = document.createElement('button');
+    const prefs = getTagPrefs();
-      chip.className = 'onboard-chip';
+    if (prefs.length === 0) return;
-      chip.textContent = cat;
+    const text = document.createTextNode(`\uD83D\uDCCC Preferred: ${prefs.join(', ')}  `);
-      chip.dataset.cat = cat;
+    bar.appendChild(text);
-      chip.addEventListener('click', () => {
+    const clearBtn = document.createElement('button');
-        chip.classList.toggle('selected');
+    clearBtn.className = 'tag-prefs-clear';
-        updateOnboardSubmit();
+    clearBtn.textContent = 'Clear';
    clearBtn.addEventListener('click', () => {
      setTagPrefs([]);
      updateTagPrefsBar();
      // Deselect all visible tag chips
      document.querySelectorAll('.chip-tag-selected').forEach(el => {
        el.classList.remove('chip-tag-selected');
      });
      container.appendChild(chip);
    });
    bar.appendChild(clearBtn);
  }
  function updateOnboardSubmit() {
    const selected = document.querySelectorAll('.onboard-chip.selected');
    const btn = document.getElementById('onboard-submit');
    const hint = document.getElementById('onboard-hint');
    if (selected.length > 0) {
      btn.classList.add('ready');
      hint.textContent = `${selected.length} topic${selected.length > 1 ? 's' : ''} selected`;
    } else {
      btn.classList.remove('ready');
      hint.textContent = 'Select at least one topic';
    }
  }
  function showOnboardOverlay() {
    buildOnboardChips();
    document.getElementById('onboard-overlay').classList.add('show');
  }
  function hideOnboardOverlay() {
    document.getElementById('onboard-overlay').classList.remove('show');
  }
  document.getElementById('onboard-submit').addEventListener('click', async () => {
    const selected = Array.from(document.querySelectorAll('.onboard-chip.selected'))
      .map(c => c.dataset.cat);
    if (selected.length === 0) return;
    const res = await apiFetch('/onboard', {
      method: 'POST',
      body: JSON.stringify({ user_id: currentUser, categories: selected }),
    });
    if (!res) return; // 401 handled by apiFetch
    markOnboarded(currentUser);
    hideOnboardOverlay();
    fetchFeed();
    fetchPrefs();
  });
  // ── Utilities ───────────────────────────────────────────────────────────────
  function scheduleRefresh() {
@ -281,7 +274,7 @@
  }
  function labelClass(label) {
-    const map = { match: 'chip-match', exploring: 'chip-exploring', trending: 'chip-trending', resurfaced: 'chip-resurfaced', bridge: 'chip-bridge' };
+    const map = { match: 'chip-match', exploring: 'chip-exploring', trending: 'chip-trending', resurfaced: 'chip-resurfaced', bridge: 'chip-bridge', captured: 'chip-exploring' };
    return map[labelKey(label)] || 'chip-match';
  }
@ -290,7 +283,7 @@
      const { cat_a, cat_b } = label.bridge;
      return `bridge: ${cat_a} \u00d7 ${cat_b}`;
    }
-    const map = { match: 'Match', exploring: 'Exploring', trending: 'Trending', resurfaced: 'Resurfaced' };
+    const map = { match: 'Match', exploring: 'Exploring', trending: 'Trending', resurfaced: 'Resurfaced', captured: 'Captured' };
    return map[label] || label;
  }
@ -309,6 +302,33 @@
    }).catch(() => {});
  }
  // ── Discovery status ────────────────────────────────────────────────────────
  async function pollDiscoveryStatus() {
    try {
      const res = await apiFetch('/discovery/status');
      if (!res) return;
      const data = await res.json();
      const dot = document.getElementById('agent-dot');
      const label = document.getElementById('agent-label');
      if (data.agent_connected) {
        dot.className = 'dot active';
        const mins = data.last_run_seconds_ago != null
          ? Math.round(data.last_run_seconds_ago / 60)
          : null;
        label.textContent = mins != null
          ? `Active — last run ${mins} min ago · ${data.items_found_last_run} items`
          : 'Active — no runs yet';
      } else {
        dot.className = 'dot';
        label.textContent = 'Agent not connected — run ./forage-discover.sh to start discovery';
      }
    } catch {
      // server unreachable or auth failed — leave as is
    }
    setTimeout(pollDiscoveryStatus, 30_000);
  }
  // ── Feed ────────────────────────────────────────────────────────────────────
  async function fetchPrefs() {
@ -322,12 +342,8 @@
        el.classList.add('active');
        setTimeout(() => el.classList.remove('active'), 1500);
      } else {
-        el.textContent = 'No preferences yet — read some articles!';
+        el.textContent = 'No preferences yet — browse some pages!';
        el.classList.remove('active');
        // Show onboarding if this user hasn't been through it
        if (!isOnboarded(currentUser)) {
          showOnboardOverlay();
        }
      }
    } catch (e) {
      // non-fatal
@ -348,22 +364,93 @@
    const meta = itemMeta[item.id] || {};
    const url = item.url || meta.url || '#';
-    card.innerHTML = `
+    // Score badge
-      <span class="score">score: ${(item.score || 0).toFixed(3)}</span>
+    const scoreSpan = document.createElement('span');
-      <div class="card-meta">
+    scoreSpan.className = 'score';
-        <span class="chip ${categoryClass(item.category)}">${item.category}</span>
+    scoreSpan.textContent = `score: ${(item.score || 0).toFixed(3)}`;
-        <span class="chip ${labelClass(item.label)}">${labelText(item.label)}</span>
+    card.appendChild(scoreSpan);
-        <span class="reading-time">${item.reading_time_min || meta.reading_time_min || '?'} min</span>
+
-      </div>
+    // Meta row: category chip, label chip, content type badge, reading time
-      <div class="card-title">${item.title}</div>
+    const metaDiv = document.createElement('div');
-      <div class="card-source">${item.source}</div>
+    metaDiv.className = 'card-meta';
-      <div class="card-desc">${item.description || meta.description || ''}</div>
+
-      <div class="card-actions">
+    const catChip = document.createElement('span');
    catChip.className = `chip ${categoryClass(item.category)}`;
    catChip.textContent = item.category;
    metaDiv.appendChild(catChip);
    const lblChip = document.createElement('span');
    lblChip.className = `chip ${labelClass(item.label)}`;
    lblChip.textContent = labelText(item.label);
    metaDiv.appendChild(lblChip);
    if (item.content_type) {
      const ctChip = document.createElement('span');
      ctChip.className = `chip chip-content-type chip-ct-${item.content_type}`;
      ctChip.textContent = item.content_type;
      metaDiv.appendChild(ctChip);
    }
    const readTime = document.createElement('span');
    readTime.className = 'reading-time';
    readTime.textContent = `${item.reading_time_min || meta.reading_time_min || '?'} min`;
    metaDiv.appendChild(readTime);
    card.appendChild(metaDiv);
    // Title
    const titleDiv = document.createElement('div');
    titleDiv.className = 'card-title';
    titleDiv.textContent = item.title;
    card.appendChild(titleDiv);
    // Source
    const sourceDiv = document.createElement('div');
    sourceDiv.className = 'card-source';
    sourceDiv.textContent = item.source;
    card.appendChild(sourceDiv);
    // Description (prefer summary when available)
    const desc = document.createElement('div');
    desc.className = 'card-desc';
    desc.textContent = (item.summary && item.summary.length > 0) ? item.summary : (item.description || meta.description || '');
    card.appendChild(desc);
    // Tag chips (up to 3)
    if (item.tags && item.tags.length > 0) {
      const tagsDiv = document.createElement('div');
      tagsDiv.className = 'card-tags';
      item.tags.slice(0, 3).forEach(tag => {
        const chip = document.createElement('span');
        chip.className = 'chip-tag';
        if (getTagPrefs().includes(tag)) chip.classList.add('chip-tag-selected');
        chip.textContent = tag;
        chip.addEventListener('click', () => {
          const prefs = getTagPrefs();
          const idx = prefs.indexOf(tag);
          if (idx === -1) {
            prefs.push(tag);
          } else {
            prefs.splice(idx, 1);
          }
          setTagPrefs(prefs);
          chip.classList.toggle('chip-tag-selected', prefs.includes(tag));
          updateTagPrefsBar();
        });
        tagsDiv.appendChild(chip);
      });
      card.appendChild(tagsDiv);
    }
    // Actions
    const actionsDiv = document.createElement('div');
    actionsDiv.className = 'card-actions';
    actionsDiv.innerHTML = `
      <button class="btn btn-skip">Skip</button>
      <button class="btn btn-save">Save</button>
      <button class="btn btn-share">Share</button>
      </div>
    `;
    card.appendChild(actionsDiv);
    const bar = makeDwellBar();
    card.appendChild(bar);
@ -441,6 +528,56 @@
    return card;
  }
  // ── SSE live capture ────────────────────────────────────────────────────────
  // Insert a newly captured item at the top of the feed without re-rendering
  // existing cards (preserves dwell state, scroll position, everything).
  function prependCard(event) {
    const item = {
      id: event.item_id,
      title: event.title,
      url: event.url,
      source: event.source,
      reading_time_min: event.reading_time_min,
      description: event.description,
      category: event.category || 'discovered',
      label: 'captured',
      score: 0,
      tags: event.tags || [],
      entities: event.entities || [],
      content_type: event.content_type || '',
      summary: event.summary || '',
    };
    // Keep itemMeta current so card fallback lookups work.
    itemMeta[item.id] = item;
    const feed = document.getElementById('feed');
    // Remove the empty-state placeholder if present.
    const placeholder = feed.querySelector('.loading');
    if (placeholder) placeholder.remove();
    const card = makeCard(item);
    card.classList.add('new-capture');
    feed.insertBefore(card, feed.firstChild);
    // Trim to a reasonable max so the page doesn't grow forever.
    const cards = feed.querySelectorAll('.card');
    if (cards.length > 12) cards[cards.length - 1].remove();
  }
  function connectSSE() {
    const token = getToken();
    const url = token ? `/events?token=${encodeURIComponent(token)}` : '/events';
    const es = new EventSource(url);
    es.onmessage = (e) => {
      try { prependCard(JSON.parse(e.data)); } catch {}
    };
    // EventSource reconnects automatically on error; nothing else to do.
    return es;
  }
  async function fetchFeed() {
    const start = Date.now();
    document.getElementById('status').textContent = 'Loading…';
@ -482,12 +619,15 @@
  });
  // Initial load
  updateTagPrefsBar();
  loadItemMeta().then(() => {
    fetchFeed();
    fetchPrefs();
  });
  scheduleRefresh();
  connectSSE();
  pollDiscoveryStatus();
 </script>
 </body>
 </html>
--- a/applications/iknowyou/architecture.md
+++ b/applications/iknowyou/architecture.md
@ -0,0 +1,539 @@
 # iknowyou — Architecture
 ## Core Thesis
 Communication personalization is a signal processing problem. Every exchange between the system and a person produces observable signals — engagement, sentiment, timing, style — that decay over time and compound across conversations. tidalDB's signal ledger, preference vectors, windowed aggregation, and cohort system provide the learning substrate. iknowyou wraps these primitives with an observation pipeline (LM-as-classifier), a briefing engine (query-to-profile), and a generation interface (brief-to-prompt).
 The system has no training loop, no batch pipeline, no feature store. Learning is continuous: signals are written on every exchange, preference vectors update via EMA, and the next query reflects the latest state. The entire closed loop executes within a single process.
 ## Domain Model
 ### Entities
 | Entity | tidalDB Kind | What it represents |
 |--------|-------------|-------------------|
 | **Person** | `User` | An individual the system communicates with. Has metadata (timezone, role, context), a preference vector (learned from message engagement), a signal ledger, cohort memberships, and a user-state index (conversation history). |
 | **Message** | `Item` | A message the system generated and sent. Has metadata (topic, tone, length, structure, time_sent, conversation_id), an embedding (from the message content), and signals written against it based on the person's response. |
 | **Observation** | `Item` | A natural-language statement about a person's communication pattern. Has an embedding (for semantic retrieval), a `confidence` signal (decays over time), and metadata (person_id, category, source_conversation). |
 Messages and observations are both `Item` entities but are distinguished by a `kind` metadata field: `"message"` or `"observation"`. This reuses tidalDB's existing entity model without extension.
 ### Schema Primitives
 | Primitive | Configuration | Purpose |
 |-----------|--------------|---------|
 | **Signals** | 10 signal types (see below) | Capture engagement, sentiment, topic, timing dimensions |
 | **Decay** | Exponential, per-signal half-life | Recent interactions matter more; old patterns fade |
 | **Windows** | 1h, 24h, 7d, 30d, AllTime | Temporal aggregation for time-of-day patterns |
 | **Velocity** | On engagement signals | Distinguish "always liked X" from "suddenly interested in X" |
 | **Preference vectors** | 384D, EMA with adaptive rate | Communication style convergence per-person |
 | **Cohorts** | Predicate-based, per-cohort ledger | Cold-start priors, cross-pollination, drift detection |
 ## Signal Schema
 ### Engagement Signals (on Message items)
 | Signal | Half-life | Windows | Velocity | Weight semantics |
 |--------|-----------|---------|----------|-----------------|
 | `replied` | 7d | 1h, 24h, 7d, AllTime | yes | 1.0 = responded at all |
 | `replied_fast` | 3d | 1h, 24h, 7d | yes | 1.0 = latency < 120s |
 | `replied_substantively` | 7d | 24h, 7d, AllTime | yes | 0.0–1.0 normalized by word count / depth |
 | `positive_sentiment` | 14d | 24h, 7d, 30d, AllTime | no | 0.0–1.0 from observer sentiment score |
 | `negative_sentiment` | 3d | 24h, 7d | no | 0.0–1.0 from observer sentiment score |
 | `went_silent` | 1d | 24h, 7d | no | 1.0 = no response after timeout |
 ### Topic Signals (on topic-cluster items or Message items)
 | Signal | Half-life | Windows | Velocity | Weight semantics |
 |--------|-----------|---------|----------|-----------------|
 | `topic_engaged` | 14d | 7d, 30d, AllTime | yes | 1.0 = stayed on or deepened topic |
 | `topic_dropped` | 3d | 7d | no | 1.0 = redirected or went brief |
 | `initiated` | 30d | 30d, AllTime | no | 1.5 = they brought this up unprompted |
 ### Meta Signals (on Observation items)
 | Signal | Half-life | Windows | Velocity | Weight semantics |
 |--------|-----------|---------|----------|-----------------|
 | `confidence` | 30d | AllTime | no | 1.0 at creation; decays unless reinforced |
 ### Design Rationale
 - **Asymmetric decay:** Negative signals (3d) decay 2–5x faster than positive signals (7–14d). The system is forgiving by default. Bad days don't poison the model.
 - **`initiated` is the strongest signal:** When someone raises a topic unprompted, that's stronger evidence of interest than responding to a topic you raised. Weight 1.5, half-life 30d.
 - **`went_silent` is gentle:** 1-day half-life. Silence might mean they're busy, not that the message was wrong. But it's still a signal — if silence correlates with a pattern (late-night messages, formal tone), the preference vector will drift away from that pattern.
 - **Velocity on engagement signals:** Velocity separates stable preferences from emerging ones. If `topic_engaged` velocity spikes on "replication" this week, the brief surfaces it as a rising interest — even if AllTime count is low.
 ## Module Structure
 ```
 applications/iknowyou/
 ├── engine/                          ← Core library (no network, no LM calls)
 │   └── src/
 │       ├── lib.rs                   ← IkyEngine: wraps TidalDb
 │       ├── schema.rs                ← Signal schema + cohort definitions
 │       ├── observer.rs              ← ObserverOutput: structured extraction type
 │       ├── briefing.rs              ← Brief: queries tidalDB, assembles profile
 │       ├── signals.rs               ← Signal writing: observation → tidalDB signals
 │       ├── observations.rs          ← Observation lifecycle: write, retrieve, decay
 │       └── cohorts.rs               ← Cohort definitions + cold-start logic
 │
 ├── server/                          ← HTTP API + LM integration
 │   └── src/
 │       ├── main.rs                  ← Axum server, startup, shutdown
 │       ├── handlers.rs              ← /message, /observe, /brief, /feedback
 │       ├── llm.rs                   ← LM client: observer calls + generation calls
 │       └── loop.rs                  ← Orchestrator: observe → learn → brief → generate
 │
 ├── vision.md                        ← Product vision
 └── architecture.md                  ← This document
 ```
 ### Dependency Flow
 ```
 server (Axum, LM client)
  │
  ├──→ engine (pure Rust, no IO except tidalDB)
  │      │
  │      └──→ tidalDB (embedded, same process)
  │
  └──→ LM API (HTTP, external)
 ```
 The engine crate has **zero network dependencies**. It takes structured `ObserverOutput` and returns structured `Brief`. The server crate handles LM API calls and HTTP. This separation means the engine is fully testable without mocking LM calls.
 ## The Closed Loop — Detailed
 ### Phase 1: Observe
 When a person responds to a message (or doesn't respond within the timeout window), the server calls the observer LM with the conversation context and the person's message.
 **Observer input:**
 ```
 System message sent: "Have you looked at what happens when segment count exceeds L0?"
 Person replied: "yeah good call - the compaction pass is actually the bottleneck,
                 not the segment count itself. been profiling it all morning"
 Time since system message: 47 seconds
 Conversation turn: 4
 ```
 **Observer output** (structured JSON, single inference):
 ```json
 {
  "engagement": {
    "replied": true,
    "latency_seconds": 47,
    "substantive": true,
    "word_count": 22,
    "sentiment_score": 0.75,
    "sentiment_direction": "positive"
  },
  "style": {
    "formality": 0.2,
    "uses_lowercase": true,
    "uses_jargon": true,
    "structure": "stream_of_thought",
    "emoji": false
  },
  "topic": {
    "primary": "compaction_profiling",
    "domain": "database_internals",
    "specificity": "high",
    "continued_from_previous": true,
    "deepened": true
  },
  "dynamics": {
    "redirected": true,
    "redirect_direction": "more_specific",
    "who_is_leading": "person",
    "built_on_previous": true,
    "corrected_system": true
  }
 }
 ```
 The observer is a **small, fast model** (Haiku-class). It doesn't need to be creative — it needs to reliably extract structure. Latency target: < 500ms. Cost per call: negligible.
 ### Phase 2: Learn
 The engine receives `ObserverOutput` and writes signals to tidalDB. This is a pure function: structured input → signal writes. No LM call.
 **Signal writes for this exchange:**
 ```rust
 // Engagement signals on the sent message
 db.signal("replied",                 msg_entity_id, 1.0,  now)?;
 db.signal("replied_fast",           msg_entity_id, 1.0,  now)?;  // 47s < 120s
 db.signal("replied_substantively",  msg_entity_id, 0.85, now)?;  // normalized
 db.signal("positive_sentiment",     msg_entity_id, 0.75, now)?;
 // Topic signals
 db.signal("topic_engaged", topic_entity_id("compaction_profiling"), 1.0, now)?;
 db.signal("topic_engaged", topic_entity_id("database_internals"),   1.0, now)?;
 // No negative signals this exchange
 ```
 **Preference vector update:**
 The sent message's embedding blends into the person's preference vector. The message was direct, technical, question-form — so the preference vector shifts toward that communication style. EMA adaptive rate: high early (person has few interactions), lower as history accumulates.
 **Observation generation** (periodic, not every turn):
 Every N turns or on session close, the observer produces natural-language observations:
 ```
 "Jordan corrects the system's framing and steers toward more specific
 technical problems — prefers to lead the conversation direction"
 "Jordan responds fastest to direct technical questions (median 45s)
 vs. status-check questions (median 4m)"
 ```
 These are stored as `Item` entities with embeddings, `kind: "observation"`, and a `confidence` signal at weight 1.0. The confidence decays with a 30-day half-life. If the same pattern is observed again, confidence is reinforced.
 **Cohort propagation:**
 If the person matches the `developers` cohort (via `role == "engineer"` predicate), these signals also write to the cohort's signal ledger. Aggregate effect: the `developers` cohort accumulates evidence that direct technical questions produce fast, substantive, positive replies.
 ### Phase 3: Brief
 Before generating the next message, the engine queries tidalDB and assembles a communication brief. This is a read-only operation — no writes, no LM calls.
 **Brief structure:**
 ```json
 {
  "person": {
    "id": "jordan",
    "metadata": { "timezone": "America/Los_Angeles", "role": "engineer" },
    "interaction_count": 47,
    "first_interaction": "2026-01-15T09:00:00Z"
  },
  "topics": {
    "hot": [
      { "topic": "compaction_profiling", "velocity": "rising",  "alltime": 12 },
      { "topic": "wal_recovery",         "velocity": "stable",  "alltime": 28 },
      { "topic": "replication",          "velocity": "rising",  "alltime": 3  }
    ],
    "cold": [
      { "topic": "documentation", "last_engaged": "2026-01-20", "sentiment": "negative" }
    ],
    "initiated_by_person": ["compaction_profiling", "rust_performance"]
  },
  "style": {
    "formality": { "current": 0.2, "trend": "stable" },
    "preferred_length": "medium",
    "preferred_structure": "conversational",
    "responds_to_questions": true,
    "prefers_to_lead": true,
    "jargon_comfortable": true,
    "emoji_usage": "none"
  },
  "timing": {
    "most_active_hours": [9, 10, 11, 21, 22],
    "fastest_reply_hours": [21, 22],
    "goes_silent_after": 23,
    "current_hour": 21,
    "day_of_week": "tuesday",
    "in_active_window": true
  },
  "what_works": {
    "high_engagement_patterns": [
      "direct technical questions about specific subsystems",
      "building on their correction or redirection",
      "short messages that open a thread, not close one"
    ],
    "recent_positive_messages": [
      { "summary": "Asked about L0 threshold during compaction", "sentiment": 0.75 },
      { "summary": "Shared profiling approach for signal write path", "sentiment": 0.82 }
    ]
  },
  "what_doesnt_work": {
    "low_engagement_patterns": [
      "status-update style messages",
      "long explanations without questions",
      "messages after 11pm Pacific"
    ]
  },
  "observations": [
    "Jordan corrects framing and steers toward specifics — prefers to lead",
    "Jordan's replies get shorter after 10pm — engagement drops",
    "Jordan uses 'yeah' as opener when genuinely engaged, 'sure' when not"
  ],
  "cohort_priors": {
    "developers": {
      "preferred_tone": "direct",
      "preferred_depth": "technical",
      "avg_engagement_length": "medium"
    }
  }
 }
 ```
 **How the brief is assembled:**
 | Brief section | tidalDB query | Primitive used |
 |--------------|--------------|----------------|
 | `topics.hot` | `read_decay_score` + `read_velocity` on topic items | Signal decay, velocity |
 | `topics.cold` | Topic items with low AllTime count + negative sentiment | Windowed aggregation |
 | `topics.initiated_by_person` | Items with `initiated` signal > threshold | Signal decay |
 | `style.*` | Person metadata + observer-written style fields | Entity metadata |
 | `timing.*` | `read_windowed_count("replied", Window::OneHour)` across 24 hour buckets | Windowed aggregation |
 | `what_works` | `retrieve()` with person's preference vector, filtered to high-sentiment messages | ANN + preference vector |
 | `what_doesnt_work` | Messages with `went_silent` or `negative_sentiment` signals | Signal decay |
 | `observations` | `search()` with current conversation context as query, filtered to `kind: "observation"` | BM25 + ANN semantic retrieval |
 | `cohort_priors` | Cohort ledger queries for person's matching cohorts | Cohort signal ledger |
 ### Phase 4: Generate
 The brief is injected into the LM's system prompt. The LM generates the next message. The engine stores the generated message as a new `Item` entity with metadata and embedding.
 ```
 [system]
 You are communicating with Jordan. Here is what we know about how
 Jordan communicates:
 {brief as structured text}
 Guidelines derived from this profile:
 - Be direct and technical. Ask specific questions.
 - Let Jordan lead the conversation direction — build on their framing.
 - Keep messages medium length. Conversational, not structured.
 - This is an active window (9pm Tuesday) — Jordan is typically responsive now.
 - Current hot topic with rising velocity: compaction profiling.
 - Avoid: status updates, long explanations, messages after 11pm.
 ```
 The LM never touches tidalDB. It reads the brief, generates a message, and the loop continues.
 ## Observation Lifecycle
 Observations are the bridge between raw signals and human-legible learning. They capture patterns that numbers alone can't express: "uses 'yeah' when engaged, 'sure' when not."
 ### Creation
 Observations are generated by the observer LM periodically:
 - Every 5 conversation turns
 - On session close
 - When the observer detects a novel pattern (contradiction with existing observations, or new behavioral signal)
 Each observation is:
 1. Embedded (384D, same model as messages)
 2. Stored as an `Item` with `kind: "observation"`, `person_id`, `category` (style, topic, timing, dynamics)
 3. Given a `confidence` signal at weight 1.0
 ### Retrieval
 Before briefing, the engine runs `db.search()` with the current conversation context as the query text, filtered to `kind: "observation"` and the target person. BM25 matches on keywords; ANN matches on semantic similarity. RRF fusion ranks by relevance.
 Top-5 observations are included in the brief.
 ### Decay and Reinforcement
 The `confidence` signal has a 30-day half-life. An observation created 60 days ago has ~25% of its original weight. If the same pattern is observed again, a new `confidence` signal is written — reinforcing the observation back toward full weight.
 Observations that are never reinforced fade below a retrieval threshold and are effectively forgotten. No garbage collection needed — decay handles it.
 ### Contradiction Resolution
 When the observer generates an observation that contradicts an existing one (e.g., "Jordan now prefers formal tone" vs. existing "Jordan prefers casual tone"), the new observation is stored alongside the old one. The old observation's confidence is decaying; the new one starts at 1.0. Within a few weeks, the old observation falls below retrieval threshold naturally.
 No explicit deletion. No conflict resolution logic. Decay handles contradiction.
 ## Cohort Architecture
 ### Definition
 Cohorts are defined at schema time in `engine/src/cohorts.rs`:
 ```rust
 registry.define("developers", Predicate::Eq {
    field: "role".into(),
    value: "engineer".into(),
 });
 registry.define("us_pacific", Predicate::Eq {
    field: "timezone".into(),
    value: "America/Los_Angeles".into(),
 });
 registry.define("high_engagement", Predicate::Range {
    field: "interaction_count".into(),
    min: "20".into(),
    max: None,
 });
 ```
 ### Cold-Start Flow
 ```
 New person arrives
  → Match against cohort predicates (metadata-based)
    → For each matching cohort:
        Query cohort signal ledger for aggregate patterns
    → Merge cohort priors into brief (weighted by cohort size / confidence)
      → LM generates first message using cohort-derived style
        → Person responds
          → Individual signals begin overriding cohort priors
 ```
 The weight of cohort priors in the brief decreases as individual interaction count grows. By ~10 interactions, individual signals dominate. By ~30, cohort priors are negligible unless individual data is sparse on a specific dimension.
 ### Cohort Learning
 Cohort signal ledgers learn from all members simultaneously. When Jordan (a `developers` cohort member) responds positively to a direct technical question, that signal writes to both Jordan's personal ledger and the `developers` cohort ledger.
 This means: the more people the system talks to, the better its cold-start priors become — without any explicit aggregation step. tidalDB's cohort signal propagation handles it at write time.
 ## Conversation (Session) Mechanics
 Each conversation is a tidalDB session:
 ```rust
 let handle = db.start_session(person_id, agent_id, "iknowyou_default", metadata)?;
 // During conversation:
 db.session_signal(&handle, "replied", msg_id, 1.0, now)?;
 // ...more signals per exchange...
 // On conversation end:
 let summary = db.close_session(handle)?;
 // → Triggers preference vector update (EMA blend of engaged message embeddings)
 // → Triggers observation generation (periodic analysis)
 // → Session signals aggregate into global ledger
 ```
 **Session-scoped vs. global signals:**
 Within a session, signals are scoped — they don't affect the global ledger until session close. This prevents a single bad conversation from immediately poisoning the model. Session close triggers the EMA preference update and promotes signals to global state.
 **Long conversations:** For ongoing conversations (e.g., a persistent chat channel), sessions can be rotated on a timer — close and immediately reopen every 30 minutes. This provides regular preference updates without waiting for an explicit "conversation end."
 ## Embedding Strategy
 ### Message Embeddings (384D)
 Generated from message text using a sentence-transformer model (external to iknowyou). The embedding captures semantic content + style in a single vector.
 Messages with similar communication style (casual + technical + question) cluster in the embedding space. The person's preference vector — evolved through EMA blending of positively-received message embeddings — converges on the region of embedding space that represents "how this person likes to be communicated with."
 ### Observation Embeddings (384D, same model)
 Observations are embedded with the same model. This means semantic search over observations uses the same distance metric as message retrieval. "Jordan prefers direct questions" is retrievable both by keyword ("direct questions") and by semantic similarity to a conversation about asking direct questions.
 ### Preference Vector Evolution
 ```
 Initial:     null (cold start, use cohort priors)
 After 1 msg: preference = message_embedding (first positive response)
 After N:     preference = (1 - alpha) * preference + alpha * new_message_embedding
             where alpha = base_alpha / (1 + ln(update_count + 1))
             base_alpha = 0.15
 ```
 The adaptive learning rate means:
 - Interaction 1: alpha ≈ 0.15 (strong influence)
 - Interaction 5: alpha ≈ 0.08 (moderate)
 - Interaction 20: alpha ≈ 0.04 (refinement)
 - Interaction 100: alpha ≈ 0.03 (stable, slow drift)
 ## Write Path — Full Trace
 A person sends a reply. Here is everything that happens:
 ```
 1. Server receives person's message
   └─ HTTP handler in server/handlers.rs
 2. Observer LM call (async, < 500ms)
   ├─ Input: conversation context + person's message
   └─ Output: ObserverOutput (structured JSON)
 3. Engine processes ObserverOutput
   ├─ 3a. Write engagement signals on sent message
   │   ├─ db.signal("replied", msg_id, 1.0, now)              → WAL + hot tier
   │   ├─ db.signal("replied_fast", msg_id, 1.0, now)         → WAL + hot tier
   │   ├─ db.signal("replied_substantively", msg_id, 0.85, now)
   │   └─ db.signal("positive_sentiment", msg_id, 0.75, now)
   │
   ├─ 3b. Write topic signals
   │   ├─ db.signal("topic_engaged", topic_id, 1.0, now)
   │   └─ db.signal("initiated", topic_id, 1.5, now)          [if person-initiated]
   │
   ├─ 3c. Update person metadata
   │   └─ db.write_user_metadata(person_id, updated_fields)    [style cues, timing]
   │
   ├─ 3d. Session signal (within active session)
   │   └─ db.session_signal(&handle, ...)                      [scoped, not yet global]
   │
   └─ 3e. Cohort propagation (automatic at signal-write time)
       └─ For each matching cohort: cohort_ledger.record(...)
 4. [Every 5 turns] Observer generates observations
   ├─ Stored as Item entities with embeddings
   └─ confidence signal at 1.0, 30d half-life
 5. Briefing engine queries tidalDB (read-only, < 10ms)
   ├─ Signal reads: decay scores, windowed counts, velocity
   ├─ ANN retrieval: preference-aligned past messages
   ├─ Search: relevant observations for current context
   ├─ Cohort queries: priors for sparse dimensions
   └─ Assembles Brief struct
 6. Generator LM call
   ├─ Input: brief (as system prompt) + conversation history
   └─ Output: next message
 7. Store generated message as Item
   ├─ db.write_item_with_metadata(msg_id, metadata)
   ├─ db.write_item_embedding(msg_id, embedding)
   └─ Message is now a target for future signals
 8. Send message to person → loop continues
 ```
 **Latency budget:**
 | Step | Target | Notes |
 |------|--------|-------|
 | Observer LM call | < 500ms | Small model, structured output |
 | Signal writes (6–8 signals) | < 1ms total | tidalDB hot path, < 100µs each |
 | Metadata update | < 200µs | Single fjall write |
 | Briefing query | < 10ms | Signal reads + ANN + search |
 | Generator LM call | 500ms–2s | Full model, depends on length |
 | Message storage | < 500µs | Metadata + embedding write |
 | **Total loop** | **< 3s** | **Dominated by LM calls** |
 The tidalDB operations are negligible. The latency floor is the LM inference time.
 ## Performance Targets
 | Operation | Target |
 |-----------|--------|
 | Signal write (single, including WAL) | < 100µs |
 | Brief assembly (all queries) | < 10ms |
 | Observation retrieval (semantic search) | < 5ms |
 | Preference vector ANN query (10K messages) | < 3ms |
 | Full loop excluding LM calls | < 15ms |
 | Observer LM call | < 500ms |
 | Generator LM call | < 2s |
 | End-to-end response latency | < 3s |
 ## Key Architectural Decisions
 | Decision | Choice | Why |
 |----------|--------|-----|
 | Observer as separate LM call | Small/fast model, structured output | Decouples observation quality from generation quality. Testable independently. Cheap per-call. |
 | Messages as tidalDB Items | Reuse entity model, no schema extension | Messages get embeddings, signals, metadata, ANN retrieval for free. |
 | Observations as Items (not metadata) | Semantic retrieval via search pipeline | Observations are retrievable by relevance to current context, not just by person. Decay handles staleness. |
 | Engine has no LM dependency | Pure Rust, structured IO | Fully testable without mocking LM. Server owns all external calls. |
 | Session-scoped signals | Promote to global on close | Prevents single bad conversation from poisoning the model. Batched preference update. |
 | Asymmetric decay (negative < positive) | 3d negative vs. 7–14d positive | Forgiving by default. Bad days fade fast. Good patterns persist. |
 | Cohort priors fade with interaction count | Weight = 1 / (1 + individual_count / 10) | Bootstraps cold start, gets out of the way once individual data exists. |
 | 384D embeddings | Sentence-transformer class | Good quality/cost ratio. Same model for messages and observations enables cross-type search. |
 | Brief as JSON, not prompt text | Structured, inspectable, testable | Can validate brief contents without running the generator. Can swap LM providers without changing the brief format. |
 | Periodic observation generation | Every 5 turns + session close | Not every turn (too noisy, too expensive). Not only session close (too infrequent for long conversations). |
--- a/applications/iknowyou/devsetup.md
+++ b/applications/iknowyou/devsetup.md
@ -0,0 +1,186 @@
 # iknowyou — Dev Setup
 ## Infrastructure
 ### GPU Server
 | | |
 |---|---|
 | **Host** | `msd5685.mjhst.com` |
 | **SSH** | `ssh ubuntu@msd5685.mjhst.com` |
 | **GPU** | NVIDIA RTX 6000 Ada Generation (48 GB VRAM) |
 | **RAM** | 94 GB |
 | **CPUs** | 20 |
 | **Disk** | 243 GB (172 GB free) |
 | **OS** | Ubuntu 22.04, kernel 5.15.0-161 |
 | **CUDA** | 13.0 (nvcc 13.0.88) |
 | **Driver** | 535.288.01 |
 | **Public IP** | 208.122.213.81 |
 ### vLLM + Qwen3-8B
 **Model:** `Qwen/Qwen3-8B` (BF16, ~15.3 GB on GPU)
 **API:** OpenAI-compatible at `http://msd5685.mjhst.com:8000/v1`
 **Service:** systemd unit `vllm.service` — starts on boot, restarts on failure.
 ```
 # Check status
 ssh ubuntu@msd5685.mjhst.com "sudo systemctl status vllm"
 # View logs
 ssh ubuntu@msd5685.mjhst.com "sudo journalctl -u vllm -f"
 # Restart
 ssh ubuntu@msd5685.mjhst.com "sudo systemctl restart vllm"
 ```
 **Config:** `/etc/systemd/system/vllm.service`
 ```ini
 [Service]
 ExecStart=/home/ubuntu/vllm-env/bin/vllm serve Qwen/Qwen3-8B \
  --host 0.0.0.0 \
  --port 8000 \
  --reasoning-parser qwen3 \
  --max-model-len 32768 \
  --gpu-memory-utilization 0.85
 ```
 **Python env:** `/home/ubuntu/vllm-env` (Python 3.10, vLLM 0.15.1)
 ## Using the API
 ### Chat completion
 ```bash
 curl http://msd5685.mjhst.com:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-8B",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Hello"}
    ],
    "temperature": 0.7,
    "top_p": 0.8,
    "max_tokens": 512
  }'
 ```
 ### Thinking mode
 Qwen3 supports a `/think` and `/no_think` toggle in the user message, or via `chat_template_kwargs`:
 ```bash
 # Thinking enabled (default — model reasons in <think> blocks before answering)
 curl http://msd5685.mjhst.com:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-8B",
    "messages": [{"role": "user", "content": "What is 23 * 47?"}],
    "temperature": 0.6,
    "top_p": 0.95
  }'
 # Thinking disabled (faster, no reasoning trace)
 curl http://msd5685.mjhst.com:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-8B",
    "messages": [{"role": "user", "content": "What is 23 * 47?"}],
    "temperature": 0.7,
    "top_p": 0.8,
    "chat_template_kwargs": {"enable_thinking": false}
  }'
 ```
 **Recommended sampling:**
 - Thinking mode: `temperature=0.6, top_p=0.95, top_k=20`
 - Non-thinking mode: `temperature=0.7, top_p=0.8, top_k=20`
 ### Structured output (for Observer)
 ```bash
 curl http://msd5685.mjhst.com:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-8B",
    "messages": [{"role": "user", "content": "Extract sentiment from: I love this idea!"}],
    "response_format": {
      "type": "json_schema",
      "json_schema": {
        "name": "sentiment",
        "schema": {
          "type": "object",
          "properties": {
            "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
            "confidence": {"type": "number"}
          },
          "required": ["sentiment", "confidence"]
        }
      }
    },
    "chat_template_kwargs": {"enable_thinking": false}
  }'
 ```
 ### Streaming
 ```bash
 curl http://msd5685.mjhst.com:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-8B",
    "messages": [{"role": "user", "content": "Tell me a short story."}],
    "stream": true,
    "temperature": 0.7
  }'
 ```
 ### Check model status
 ```bash
 curl http://msd5685.mjhst.com:8000/v1/models
 curl http://msd5685.mjhst.com:8000/health
 ```
 ## NVIDIA Driver Notes
 The server had a driver version mismatch (kernel module 535.274 vs userspace 535.288) on first setup. Fixed by:
 ```bash
 # Unload old modules
 sudo rmmod nvidia_uvm nvidia_drm nvidia_modeset nvidia
 # Reload with new version
 sudo modprobe nvidia && sudo modprobe nvidia_uvm
 ```
 After a reboot, the DKMS-built 535.288 module loads automatically. If `nvidia-smi` ever shows "Driver/library version mismatch" again, either reboot or run the rmmod/modprobe sequence above.
 ## Topology
 ```
 Local machine (macOS)
  │
  │  SSH tunnel or direct HTTP
  │
  ▼
 msd5685.mjhst.com (Ubuntu 22.04)
  │
  ├── vLLM (systemd, port 8000)
  │     └── Qwen/Qwen3-8B (BF16, 48GB RTX 6000 Ada)
  │
  └── [future] iknowyou server (port TBD)
        └── embedded tidalDB
 ```
 For local development, use an SSH tunnel to reach the API:
 ```bash
 ssh -L 8000:localhost:8000 ubuntu@msd5685.mjhst.com
 # Then: curl http://localhost:8000/v1/models
 ```
 Or hit it directly at `http://msd5685.mjhst.com:8000` (port must be open in firewall).
--- a/applications/iknowyou/vision.md
+++ b/applications/iknowyou/vision.md
@ -0,0 +1,159 @@
 # iknowyou — Vision
 ## The Problem
 Every system that talks to people talks to all of them the same way.
 Chatbots, assistants, notification systems, CRMs, onboarding flows — they generate language aimed at a statistical median. They don't know that Jordan prefers direct questions over explanations. They don't know that Sarah goes quiet after 10pm and resents being pinged. They don't know that Marcus engages deeply with technical specifics but shuts down when you get abstract.
 The current state of "personalization" in communication is prompt stuffing — a static bio paragraph, maybe a few preference flags, injected into context and hoped for the best. It doesn't learn. It doesn't decay. It doesn't notice that someone's interests shifted last week or that they respond to humor on Fridays but not Mondays.
 Real personalization requires a system that **observes, remembers, forgets, and adapts** — continuously, per-person, across every dimension of how a human communicates.
 The tools to do this exist but they're scattered across six systems: a vector database for style embeddings, a feature store for behavioral signals, a time-series store for temporal patterns, a key-value store for preference state, an event bus for real-time observation, and application code that tries to glue it all together. The seams between these systems are where the learning breaks down.
 ## The Thesis
 > **Communication is a personalized ranking problem.**
 >
 > "What should I say to this person, in what way, at what time?" is structurally identical to "What content should this user see, in what order?" The same primitives that solve content discovery — signals with decay, preference vectors with adaptive learning, temporal windowing, cohort priors, exploration/exploitation — solve communication personalization when pointed at a different surface.
 iknowyou is a communication learning engine built on tidalDB. It doesn't generate language — it learns how language lands, and tells the generator what it knows.
 ## What It Is
 A closed-loop system that sits between a language model and the people it talks to. Every message sent is an experiment. Every response (or silence) is a measurement. The system observes, extracts structured signals, writes them into tidalDB's signal ledger, and watches preference vectors converge on how each person actually communicates.
 Before the LM generates its next message, iknowyou assembles a **communication brief** — a structured profile of everything the system has learned about this person, weighted by recency, confidence, and context.
 ### First-Class Primitives
 **Messages** are items. Every message the system generates is stored with metadata (topic, tone, length, structure, time sent) and an embedding. The person's response is a signal on that item. tidalDB's preference vectors automatically evolve toward "the kind of message this person engages with."
 **Observations** are items. Natural-language statements about a person's communication patterns, stored with embeddings and confidence signals that decay over time. Retrieved semantically before each generation. "Jordan redirects away from process topics within 1-2 messages" is an observation. It has a 30-day half-life. If it stops being true, it fades.
 **Persons** are users. Each has a preference vector (learned from message engagement), a signal ledger (all interaction history, decayed), metadata (timezone, role, context), and cohort memberships.
 **Conversations** are sessions. Each has a start and end, a policy, an audit trail, and a set of signals that aggregate into the person's global profile on close.
 ### The Signal Schema
 Communication produces a richer signal surface than content consumption. A person doesn't just "view" a message — they respond to it, and how they respond encodes multiple dimensions:
 | Signal | What it measures | Decay |
 |--------|-----------------|-------|
 | `replied` | They responded at all | 7d |
 | `replied_fast` | Latency < 2 min | 3d |
 | `replied_substantively` | Word count, depth, engagement | 7d |
 | `positive_sentiment` | Affirmative, enthusiastic, building-on | 14d |
 | `negative_sentiment` | Dismissive, frustrated, redirecting | 3d |
 | `topic_engaged` | Stayed on or deepened a topic | 14d |
 | `topic_dropped` | Changed subject or went brief | 3d |
 | `initiated` | They brought this up unprompted | 30d |
 | `went_silent` | No response after timeout | 1d |
 | `explicit_feedback` | Direct correction or praise | 60d |
 Short half-lives on negative signals: the system forgets your bad days quickly. Long half-lives on explicit feedback: when someone tells you something directly, remember it.
 ### The Closed Loop
 ```
 Conversation
  → Person responds (or doesn't)
    → Observer extracts structured signals
      → Signals written to tidalDB (decay, window, velocity — automatic)
        → Preference vectors update (EMA blend — automatic)
          → Communication brief assembled (query tidalDB)
            → LM generates next message, conditioned on brief
              → Conversation continues
 ```
 No batch jobs. No retraining. No feature pipelines. The loop is continuous and the learning is incremental — every single exchange makes the system slightly better at talking to this person.
 ### The Observer
 A small, fast LM call that extracts structured data from each exchange. Not the conversation model — a dedicated analyst. It produces:
 - **Engagement metrics:** did they reply, how fast, how much
 - **Style cues:** formality, emoji usage, sentence structure, jargon level
 - **Topic extraction:** what the conversation is about, at what specificity
 - **Conversation dynamics:** who's leading, did they redirect, did they ask or answer
 - **Temporal context:** time of day, day of week, response latency pattern
 This is the classifier. It's not a separate ML model — it's a structured-output LM call. One inference, deterministic schema.
 ### The Brief
 Before generating any message, the system queries tidalDB and assembles:
 - **Top decayed topics** — what this person cares about *right now* (velocity separates "always liked Rust" from "suddenly interested in replication")
 - **Style preference** — formality, length, structure preferences, weighted by recency
 - **Timing patterns** — windowed counts over hours-of-day reveal when they're active, responsive, and receptive
 - **What works** — messages with high positive-response signals, retrieved by preference vector similarity
 - **What doesn't** — patterns that correlate with silence or negative sentiment
 - **Relevant observations** — semantic retrieval of natural-language observations matching the current context
 - **Cohort priors** — for dimensions where individual data is sparse, fall back to what works for people like them
 The brief is structured JSON. The LM reads it as a system prompt. It never touches the database directly.
 ### Cohorts
 Cohorts solve three problems:
 **Cold start.** A new person has no signal history. But if you know they're a developer in Pacific time who came from a technical community, the `developers` and `us_pacific` cohort signal ledgers already contain aggregate patterns. The system starts with reasonable defaults instead of random guessing.
 **Cross-pollination.** When 50 developers all respond well to direct, concise, technical messages — that learning propagates to the next developer automatically through the cohort ledger. Individual learning is still primary, but cohort signal is the prior.
 **Drift detection.** When a person's individual signals diverge sharply from their cohort, that's itself a signal. An engineer who prefers casual non-technical conversation is interesting precisely because they're atypical for their cohort. The delta between individual and cohort signals is information.
 Cohorts are defined by predicates over person metadata:
 ```
 "developers":     role == "engineer"
 "us_pacific":     timezone == "America/Los_Angeles"
 "morning_active": peak_hour in [6, 11]
 "formal_pref":    observed_formality == "high"
 ```
 Predicates are evaluated at signal-write time. A person can belong to multiple cohorts. Cohort membership can change as metadata evolves.
 ## What It Is NOT
 - **Not a chatbot.** iknowyou doesn't generate language. It learns how language lands and produces structured briefs for a generator that does.
 - **Not a CRM.** It doesn't store contact records, deal pipelines, or business relationships. It stores communication patterns.
 - **Not a sentiment analysis tool.** Sentiment extraction is one input signal among many. The system learns multidimensional communication preferences, not a happiness score.
 - **Not a profile page.** The communication brief is optimized for LM consumption, not human reading. (Though an inspection UI is valuable for trust and debugging.)
 - **Not a replacement for the LM's own capabilities.** A good LM already adapts within a conversation. iknowyou provides the *cross-conversation* memory that context windows can't.
 ## Design Principles
 **The response is the ground truth.** Don't ask people what they prefer — watch what they do. A fast, substantive reply is a stronger signal than any preference checkbox. Silence is data.
 **Decay is not optional.** People change. A preference observed six months ago is not the same as one observed yesterday. Every signal has a half-life. Nothing is permanent except explicit, direct corrections — and even those fade slowly.
 **Learn fast, stabilize late.** Early interactions should have outsized influence — the system should feel like it's paying attention from the first exchange. As confidence builds, the learning rate drops. New observations refine rather than overwrite.
 **Observe, don't interrogate.** Never ask "do you prefer formal or casual language?" Infer it from how they write. The best personalization is invisible — the person just notices that conversations feel easier over time.
 **Cohorts are priors, not destiny.** Use what you know about similar people to bootstrap. Overwrite it with direct evidence immediately. Never let group patterns override individual signals.
 **The brief is the interface.** The communication model doesn't talk to tidalDB. It reads a brief. This keeps the LM stateless, the learning layer independent, and the whole system testable — you can inspect and modify the brief at any point in the loop.
 **Negative signals decay fast.** Everyone has bad days. A short, dismissive reply on a Tuesday night shouldn't poison the model for weeks. Short half-lives on negative signals; long half-lives on positive ones. The system is forgiving by default.
 **Silence is a signal, not an absence.** When someone doesn't respond, that's information. After a configurable timeout, `went_silent` fires as a negative signal on the sent message. But its half-life is short — maybe they were just busy.
 ## Who This Is For
 Any system that talks to people repeatedly and wants to get better at it:
 - **AI assistants** that communicate with the same users across sessions
 - **Notification systems** that want to reach people at the right time, in the right tone, about the right things
 - **Onboarding flows** that adapt to how each person learns
 - **Customer communication** that remembers how someone prefers to be addressed
 - **Collaborative tools** that adjust their language to match the team's communication culture
 The common thread: repeated interaction with the same person, where the quality of communication compounds over time.
 ## The Name
 iknowyou. Because the goal isn't to talk *at* people — it's to know them well enough that the conversation feels natural. Not surveillance. Not profiling. Just the kind of knowing that comes from paying attention.
--- a/docs/planning/ROADMAP.md
+++ b/docs/planning/ROADMAP.md
@ -108,8 +108,11 @@ The roadmap now has two tracks:
 | **m7p3: Performance at Scale** | COMPLETE | 900 lib + all integration; 1,201 total; scale bench (1M items), USearch ef=400, LogMergePolicy, signal trimmer (5M entry cap), social scale tests |
 | **m7p4: Operational Visibility** | COMPLETE | 946 lib + 28 m7p4_visibility (--features test-utils); QueryStats, WAL/signal/index Prometheus metrics, tidalctl diagnostics, RLHF export, cross-session aggregation |
 | **Enterprise Readiness + M7 UAT** | COMPLETE | 960 lib + ~155 integration passing; all P0/P1 gaps resolved; m7_uat.rs passing (crash recovery, degradation, rate limiting, observability, regression gate) |
 | **m8p1: Shard-Aware Foundations** | COMPLETE | 1029 lib; ShardId, RegionId, WalSegmentId, ShardRouter, ReplicationState, NodeConfig/NodeRole, BatchHeader v2, shard-aware segment naming |
 | **m8p2: WAL Shipping and Follower Replay** | COMPLETE | 1054 lib + 8 m8p2_replication integration; Transport trait, InProcessTransport, WalShipper, SegmentReceiver, FollowerDb (ReadOnly guards), ReplicationLagGauge |
 | **m8p3: CRDT Counters and Deterministic Reconciliation** | COMPLETE | 1125 lib + 13 m8p3_crdt property tests; HLC/HlcTimestamp, PNCounter, LWWRegister, CrdtSignalState, ReconciliationEngine, StateSnapshot |
-**Next:** M8 Distributed Fabric (multi-region WAL shipping, shard routing, deterministic reconciliation). M7 Production Hardening + Enterprise Readiness complete. Engine track through M7 done.
+**Next:** M8p4 (Session Continuity Across Regions), M8p5 (Control Plane + Multi-Tenancy), M8p6 (End-to-End UAT). Three phases remaining in M8 Distributed Fabric.
 ---
@ -2521,56 +2524,106 @@ Then:
 ### Phases
-#### Phase 1: Partitioned Keyspaces and WAL Shipping
+#### Phase 1: Shard-Aware Foundations (m8p1) -- COMPLETE
-**Delivers:** Deterministic shard IDs derived from subject-prefix keys, WAL segment shipping with per-segment checksums, follower apply loops using the same checkpoint format as single-node. Cross-shard atomicity defined at the "entity group" boundary (Item, User, Creator each map to a shard). Lag metrics (`replication_seconds_behind`) exported.
+**Delivers:** Identity types (`ShardId`, `RegionId`, `WalSegmentId`, `NodeRole`), `ShardRouter` for entity placement, `BatchHeader` v2 (backward-compatible WAL extension), shard-aware segment naming, `NodeConfig` in `TidalDbBuilder`, and `ReplicationState` per-shard high-water-mark. No network I/O in this phase -- just the data structure layer that everything else builds on.
 **Acceptance Criteria:**
- [ ] `ShardId = hash(entity_id) mod N` (configurable per `EntityKind`) stored alongside keys; shard map hot-swappable via epoch config.
+- [x] `ShardId(u16)` and `RegionId(u16)` are `Copy + Hash + Ord + Serialize`; `TenantId(0)` single-node default unchanged.
- [ ] WAL segments have globally unique IDs (`region_id:shard_id:seqno`); followers detect gaps and request retransmit.
+- [x] `WalSegmentId::parse("r0:s0:42")` and `Display` round-trip deterministically.
- [ ] Followers reapply segments idempotently using the same `EntitySignalState` checkpoint format from M1.
+- [x] `BatchHeader` v2 reads bytes 60-63 for shard/region IDs; v1 segments decode as shard=0, region=0 (zero-padding was always there).
- [ ] Lag SLO: < 2s p99 at 25K writes/sec across 5 shards.
+- [x] `ShardRouter::route(entity_id)` with N=1 always returns `ShardId(0)` (single-node default).
- [ ] CLI: `tidalctl shard status` shows leader, lag, checkpoint age.
+- [x] `ReplicationState::advance_hwm(shard, seqno)` is monotonic via `compare_exchange`.
 **Depends On:** M7 (hardened WAL/Signal ledger)
-**Complexity:** XL
+**Complexity:** L
 **Task Files:** `docs/planning/milestone-8/phase-1/`
 **Research Reference:** `docs/research/tidaldb_wal.md`, `docs/research/tidaldb_signal_ledger.md`
-#### Phase 2: Conflict Resolution and Session Semantics
+#### Phase 2: WAL Shipping and Follower Replay (m8p2) -- COMPLETE
-**Delivers:** Deterministic reconciliation for eventually-consistent writes: CRDT-style counters for windowed aggregates, last-writer-wins timestamps for session state, and per-session sequence numbers so agents can reason about acknowledgements. Adds write-idempotency keys to the WAL and exposes a reconciliation audit log.
+**Delivers:** `Transport` trait, `InProcessTransport` (for tests), `WalShipper` background task, `SegmentReceiver` with BLAKE3 validation and idempotent replay, `FollowerDb` (read-only mode with `TidalError::ReadOnly`), `ReplicationLagGauge`, and an 8-test integration suite (`m8p2_replication.rs`).
 **Acceptance Criteria:**
- [ ] Windowed counters replicated as bounded PN-counters (positive/negative components) with tombstones for expired buckets.
+- [x] `WalShipper` ships sealed segments to followers in parallel; lagging follower catches up within 2s on in-process transport.
- [ ] Decay scores replay identically because WAL order is preserved per shard; cross-shard dependencies (user->creator) carry causal metadata.
+- [x] `SegmentReceiver` validates BLAKE3 checksum; returns `TidalError::CorruptedWal` on mismatch.
- [ ] Session updates carry `(session_id, seqno)`; duplicates dropped, gaps surfaced via API.
+- [x] Followers reject all write methods with `TidalError::ReadOnly`.
- [ ] `reconcile --since <ts>` tool emits merged vs diverged entries for auditing.
+- [x] `ReplicationLagGauge::lag_seqno(shard)` = `leader_hwm - follower_applied`; reaches 0 after convergence.
- [ ] Hides/blocks modeled as LWW registers with vector-clock tie-breakers (region priority list).
+- [x] `m8p2_replication.rs` 8 tests pass.
 **Depends On:** Phase 1
 **Complexity:** XL
-**Research Reference:** `thoughts.md` Part V.5-6 (quarantine-first, group commit), `docs/research/tidaldb_signal_ledger.md`
+**Task Files:** `docs/planning/milestone-8/phase-2/`
-#### Phase 3: Control Plane, Multi-Tenancy, and Routing
+#### Phase 3: CRDT Counters and Deterministic Reconciliation (m8p3) -- COMPLETE
-**Delivers:** Tenant-aware namespaces (per-tenant WAL directories and key prefixes), routing layer that maps tenants + entity IDs to shard endpoints, and policy templates (data residency, read-after-write budgets). Adds hosted-ready observability (lag dashboards, per-tenant quotas) and blue/green deploy tooling for the fabric.
+**Delivers:** `HlcTimestamp` and `HLC` (Hybrid Logical Clock), `PNCounter` (per-node P/N vectors), `LWWRegister<T>` (HLC-timestamped, used for hard negatives), `CrdtSignalState` (per-node decay accumulators that sum on merge), `ReconciliationEngine` (`plan()` + `apply()` idempotent), and property tests (`m8p3_crdt.rs`).
 **Acceptance Criteria:**
- [ ] Tenant config: `{tenant_id, shard_set, residency=[regions], rpo, rto}` stored in control-plane keyspace.
+- [x] `PNCounter::merge` is commutative, associative, and idempotent (10K proptest cases each).
- [ ] Router SDK chooses nearest healthy region that satisfies residency and read-after-write target; falls back with documented staleness budget.
+- [x] `CrdtSignalState::decay_score` = sum of per-node contributions; no double-counting after merge of disjoint node histories (key-aligned HashMap lookup, not zip).
- [ ] Throttling per tenant (signals/sec, query concurrency) with circuit-breaker events surfaced via metrics + CLI.
+- [x] `LWWRegister::merge` resolves concurrent writes by `(wall_ns, logical, node_id)` ordering.
- [ ] Rolling upgrade playbook: add shard, rebalance, observe zero dropped writes.
+- [x] `ReconciliationEngine::plan(local, remote).apply()` produces identical state to single-node replay of all events (verified to 6 decimal places).
- [ ] Hosted docs: describe how embeddable apps graduate to hosted fabric without rewrites (same query + signal APIs).
+- [x] `m8p3_crdt.rs` 13 property tests pass.
-**Depends On:** Phase 2
+**Depends On:** Phase 1 (ShardId as node identifier)
 **Complexity:** L
 **Task Files:** `docs/planning/milestone-8/phase-3/`
 #### Phase 4: Session Continuity and Agent Memory Across Regions (m8p4)
 **Delivers:** `SessionSeqNo(u64)` monotonic per-session write counter, `IdempotencyKey(u128)` BLAKE3-derived per-operation key, `IdempotencyStore` (bounded LRU 100K), `SessionReplicationBridge` (ships session journal entries via `Transport`), hard-negative union-semantics during convergence (hide always wins during partition), and cross-region session tests (`m8p4_session.rs`).
 **Acceptance Criteria:**
 - [ ] Session started in region A is visible in region B within 2s (in-process transport).
 - [ ] Duplicate session events (same idempotency key) produce exactly one state change.
 - [ ] Hard negatives: `hide(t=100)` + `unhide(t=50)` → item stays hidden on both regions after replication.
 - [ ] `m8p4_session.rs` 5 tests pass.
 **Depends On:** Phase 2 (WAL shipping), Phase 3 (LWWRegister, HLC)
 **Complexity:** L
 **Task Files:** `docs/planning/milestone-8/phase-4/`
 #### Phase 5: Control Plane, Multi-Tenancy, and Routing (m8p5)
 **Delivers:** `TenantId(u64)` + `TenantConfig` (quotas + residency policy), `TenantRateLimiter` (token bucket), `TenantRouter` (Jump Consistent Hash with residency constraint), `ControlPlane` (embedded leader-local cluster health), `TenantMigration` (dual-write zero-downtime migration state machine), `RollingUpgradeCoordinator` (drain+rejoin), and multi-tenancy tests (`m8p5_multitenancy.rs`).
 **Acceptance Criteria:**
 - [ ] `TidalError::QuotaExceeded` returned within 1ms when token bucket empty.
 - [ ] Tenant migration: all signals present on target after migration; source has 0 after GC; zero downtime during dual-write.
 - [ ] Rolling upgrade: signals written during drain window present on rejoined node.
 - [ ] WAL directory for `TenantId(42)` is `{data_dir}/tenants/42/wal/`.
 - [ ] `m8p5_multitenancy.rs` 5 tests pass.
 **Depends On:** Phase 2 (WAL shipping), Phase 3 (reconciliation), Phase 4 (session continuity)
 **Complexity:** L
 **Task Files:** `docs/planning/milestone-8/phase-5/`
 #### Phase 6: End-to-End UAT (m8p6)
 **Delivers:** `SimulatedCluster` test harness (N regions × M shards via `InProcessTransport`), `NetworkPartition` + `ShardCrash` RAII fault injection, `m8_uat.rs` (5 UAT scenario tests), and performance assertions (replication < 2s p99, failover < 10s, reconciliation < 100ms). This phase is the M8 done gate.
 **Acceptance Criteria:**
 - [ ] **UAT Step 1:** Cross-region replication < 2s; decay scores match to 6 decimal places.
 - [ ] **UAT Step 2:** Failover within 10s; no data loss on promoted follower.
 - [ ] **UAT Step 3:** Degraded query succeeds with 2/3 regions; `QueryStats` degradation flag set.
 - [ ] **UAT Step 4:** Post-reconciliation: no duplicate counts; hard negatives propagated; scores match analytical formula to 6 decimal places.
 - [ ] **UAT Step 5:** Tenant migration zero downtime; old region GC'd.
 - [ ] `cargo test --test m8_uat` passes in < 60 seconds.
 **Depends On:** Phases 1–5
 **Complexity:** M
 **Task Files:** `docs/planning/milestone-8/phase-6/`
 ### Done When
-tidalDB instances can be deployed as a hosted, multi-region fabric with deterministic replication and reconciliation. Agents anywhere in the world can write signals and rely on hides/mutes/policies holding globally. Operators get tooling for shard health, tenant placement, rolling upgrades, and lag visibility. Embeddable users flip a config switch to opt into the fabric; query and signal APIs remain unchanged.
+`cargo test --test m8_uat` passes all 5 UAT scenario steps with 25K signals/sec sustained throughput across 3 simulated regions, verifying no signal loss, no duplicate counts, no leaked hard negatives, and correct decay scores after partition heal and reconciliation. Tenant migration and rolling upgrade complete with zero downtime. Embeddable users flip a config switch to join the fabric; query and signal APIs remain unchanged.
 ---
@ -2809,7 +2862,20 @@ m1p1 (Types/Schema) ✓
                                           M6 COMPLETE ✓ (6 phases: cohort, social, sorts, collections, scope, notifications)
                                           M7 COMPLETE ✓ (crash recovery, degradation, scale, observability, UAT + enterprise readiness)
-                                           M8 phases depend on M7
+
                                           M8 IN PROGRESS (Distributed Fabric):
                                             m8p1 (Shard-Aware Foundations) ✓
                                               |
                                               +---> m8p2 (WAL Shipping + Follower Replay) ✓
                                               |       |
                                               +---> m8p3 (CRDT Reconciliation) ✓
                                                       |
                                                       +---> m8p4 (Session Continuity)  ← NEXT
                                                       |       |
                                                       +-------+---> m8p5 (Control Plane + Multi-Tenancy)
                                                                       |
                                                                       +---> m8p6 (End-to-End UAT)
                                           M9 phases depend on M8
                                           M10 phases depend on M9
 ```
@ -2821,6 +2887,9 @@ m1p1 (Types/Schema) ✓
 - m3p1 (Entities) and m5p1 (Tantivy) can start in parallel with later M2 phases (M4 Agent Memory sits between M3 and M5)
 - m3p2 Tasks 01 (User Preference Vector) and 03 (Hard Negatives) can be built in parallel within m3p2
 - m4p2 (RRF) and m4p4 (Creator Search) can be built in parallel
 - m8p2 (WAL Shipping) and m8p3 (CRDT Reconciliation) can be built in parallel after m8p1 (both complete)
 - m8p4 (Session Continuity) tasks 01 and 02 are parallelizable within the phase
 - m8p5 (Multi-Tenancy) tasks 01 and 02 are parallelizable within the phase
 ---
@ -2841,6 +2910,10 @@ These decisions are made. They are not revisited unless benchmarks prove them wr
 | Key encoding         | Subject-prefix `[entity_id][0x00][TAG:suffix]` | Separate key namespaces  | Co-locates entity data, natural shard boundary, single prefix scan                    |
 | Embedding format     | f16 quantization (default)                     | float32                  | Half memory, < 1% recall loss at 1536D                                                |
 | Query language       | Custom (RETRIEVE/SEARCH/SIGNAL)                | SQL                      | Domain semantics cannot be expressed in SQL without losing optimization opportunities |
 | Replication model    | Primary-backup WAL shipping                    | Raft consensus           | No distributed consensus needed; signal CRDTs handle conflict-free merge              |
 | Signal CRDTs         | PNCounter (per-node P/N vectors) + CrdtSignalState | Per-event dedup (BLAKE3) | O(nodes) memory vs O(events); commutative/associative/idempotent merge               |
 | Hard negative CRDTs  | LWWRegister with HLC timestamps                | G-Set (union only)       | LWW allows unhide; HLC provides causal ordering even with clock skew                  |
 | Causal ordering      | HLC (Hybrid Logical Clock)                     | NTP / Lamport clocks     | Tolerates wall-clock skew; causal ordering within bounded drift (Kulkarni et al. 2014)|
 ---
--- a/docs/planning/milestone-8/phase-1/OVERVIEW.md
+++ b/docs/planning/milestone-8/phase-1/OVERVIEW.md
@ -0,0 +1,100 @@
 # m8p1: Shard-Aware Foundations
 ## Delivers
 The identity types, WAL segment tagging, and shard routing table that make
 tidalDB distribution-aware without introducing any network code. After this
 phase, every WAL segment carries a globally unique ID
 (`region_id:shard_id:seqno`), every entity operation is routable through a
 `ShardRouter`, and the existing single-node deployment works identically with
 the default shard_id=0 / region_id=0 configuration. This is the "build the
 atoms right" phase -- no new runtime behavior, but every data structure is
 distribution-ready.
 Deliverables:
 - `ShardId(u16)`, `RegionId(u16)`, `WalSegmentId { region_id, shard_id, seqno }` identity types
 - WAL batch header v2: adds `shard_id` and `region_id` fields (backward-compatible; v1 readers skip unknown fields)
 - `ShardRouter`: maps `EntityId -> ShardId` via configurable range boundaries
 - `NodeConfig` extending `Config` with cluster role, shard assignment, region assignment
 - `ReplicationState` tracking per-shard high-water-mark seqno for follower bookkeeping
 - All existing tests pass unchanged (shard_id=0 is the default; single-node is shard 0)
 ## Dependencies
 - **Requires:** M7 complete (WAL format v1, `BatchHeader`, `EventRecord`, `SegmentWriter`, `CheckpointManager`, `Config`, `StorageMode`)
 - **Files modified:**
  - `tidal/src/wal/format/batch.rs` -- extend `BatchHeader` with shard/region fields
  - `tidal/src/wal/segment.rs` -- segment filename includes shard_id prefix for multi-shard directories
  - `tidal/src/db/config.rs` -- add `NodeConfig` with cluster fields
  - `tidal/src/wal/checkpoint.rs` -- checkpoint includes shard_id
 - **Files created:**
  - `tidal/src/replication/mod.rs` -- module root
  - `tidal/src/replication/shard.rs` -- `ShardId`, `RegionId`, `ShardRouter`
  - `tidal/src/replication/segment_id.rs` -- `WalSegmentId`
  - `tidal/src/replication/state.rs` -- `ReplicationState`
 ## Research References
 - `docs/research/tidaldb_wal.md` -- WAL segment format, batch header layout
 - `thoughts.md` -- Part V.12 (subject-prefix key encoding for sharding)
 ## Acceptance Criteria (Phase Level)
 - [ ] `ShardId(u16)` and `RegionId(u16)` are `Copy + Clone + Debug + Eq + Hash + Ord + Serialize + Deserialize`
 - [ ] `WalSegmentId { region_id: RegionId, shard_id: ShardId, seqno: u64 }` has total ordering by `(region_id, shard_id, seqno)` and a human-readable `Display` impl producing `"r0:s0:42"`
 - [ ] `BatchHeader` v2 adds `shard_id: u16` and `region_id: u16` at bytes 58-61 (within existing 64-byte header); `FORMAT_VERSION` bumped to 2; v1 batches decode as shard_id=0, region_id=0
 - [ ] `ShardRouter::route(entity_id: EntityId) -> ShardId` returns the correct shard for hash-based routing; default single-shard config always returns `ShardId(0)`
 - [ ] `ShardRouter` is constructable from a `Vec<(ShardId, EntityIdRange)>` with validation that ranges are non-overlapping and cover the full u64 space
 - [ ] `NodeConfig` extends `Config` with `role: NodeRole`, `shard_id: ShardId`, `region_id: RegionId`, `peer_shards: Vec<ShardId>`; defaults produce a single-node config
 - [ ] `ReplicationState` tracks `HashMap<ShardId, u64>` (high-water-mark seqno per shard) with atomic reads/writes
 - [ ] All existing M0-M7 tests pass without modification (single-node = shard 0, region 0)
 - [ ] Segment filename format for multi-shard: `wal-s{shard_id:05}-{first_seq:020}.seg`; single-shard (shard_id=0) retains old format `wal-{first_seq:020}.seg` for backward compatibility
 - [ ] Property test: 10,000 random EntityIds always route to exactly one shard; routing is a pure function of entity_id and shard_ranges
 ## Task Execution Order
 ```
 Task 01: Identity Types ─────────┐
                                  ├──> Task 03: BatchHeader v2
 Task 02: ShardRouter ────────────┤
                                  ├──> Task 04: Segment Naming
                                  │
                                  └──> Task 05: NodeConfig
                                            │
                                            v
                                  Task 06: ReplicationState
 ```
 Tasks 01 and 02 are fully parallelizable. Task 03 and 04 depend on Task 01. Task 05 depends on both 01 and 02. Task 06 depends on 05.
 ## Module Location
 | File | Status | Contains |
 |------|--------|----------|
 | `tidal/src/replication/mod.rs` | NEW | Module root, re-exports |
 | `tidal/src/replication/shard.rs` | NEW | `ShardId`, `RegionId`, `ShardRouter`, `EntityIdRange` |
 | `tidal/src/replication/segment_id.rs` | NEW | `WalSegmentId`, ordering, Display |
 | `tidal/src/replication/state.rs` | NEW | `ReplicationState`, high-water-mark tracking |
 | `tidal/src/wal/format/batch.rs` | MODIFIED | `BatchHeader` v2 with shard/region fields |
 | `tidal/src/wal/segment.rs` | MODIFIED | Shard-aware segment filename |
 | `tidal/src/wal/checkpoint.rs` | MODIFIED | Checkpoint includes shard_id |
 | `tidal/src/db/config.rs` | MODIFIED | `NodeConfig`, `NodeRole` enum |
 | `tidal/src/lib.rs` | MODIFIED | Add `pub mod replication;` |
 ## Notes
 ### Backward compatibility is non-negotiable
 WAL v1 segments must be readable by v2 code. The 4 bytes at offsets 58-61 in the v1 header are currently zero-padding; v2 reinterprets them as shard_id and region_id. This is safe because v1 always wrote zeros there.
 ### Hash-based vs range-based routing
 `ShardRouter` supports both: `hash(entity_id) % num_shards` for uniform distribution, and explicit range boundaries for production deployments. The trait abstracts the choice.
 ### No network code in this phase
 Everything is in-process. The `replication` module defines data structures and routing logic only. The `Transport` trait is introduced in Phase 8.2.
 ## Done When
 A developer can construct a `NodeConfig` with 3 regions and 5 shards per region, create a `ShardRouter` from range boundaries, route EntityIds to shards, construct a WAL `BatchHeader` v2 with shard/region tags, and all existing single-node tests pass unchanged.
--- a/docs/planning/milestone-8/phase-1/task-01-identity-types.md
+++ b/docs/planning/milestone-8/phase-1/task-01-identity-types.md
@ -0,0 +1,193 @@
 # Task 01: Identity Types
 ## Delivers
 `ShardId(u16)`, `RegionId(u16)`, `WalSegmentId { region_id, shard_id, seqno }`, and `NodeRole` enum in `tidal/src/replication/` with full trait derivations, Display impls, and serde support. These types are the atoms of the entire distributed system -- every downstream module depends on them.
 ## Complexity: S
 ## Dependencies
 - None (this is the foundation task for the phase)
 ## Technical Design
 ### ShardId and RegionId
 ```rust
 // tidal/src/replication/shard.rs
 /// Uniquely identifies a shard within the cluster.
 ///
 /// A shard owns a contiguous range of EntityIds for a given EntityKind.
 /// ShardId(0) is the default single-node shard.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord,
         serde::Serialize, serde::Deserialize)]
 pub struct ShardId(pub u16);
 impl ShardId {
    /// The default single-node shard.
    pub const SINGLE: ShardId = ShardId(0);
 }
 impl fmt::Display for ShardId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "s{}", self.0)
    }
 }
 /// Uniquely identifies a region in the cluster.
 ///
 /// RegionId(0) is the default single-node region.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord,
         serde::Serialize, serde::Deserialize)]
 pub struct RegionId(pub u16);
 impl RegionId {
    /// The default single-node region.
    pub const SINGLE: RegionId = RegionId(0);
 }
 impl fmt::Display for RegionId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "r{}", self.0)
    }
 }
 ```
 ### WalSegmentId
 ```rust
 // tidal/src/replication/segment_id.rs
 /// Globally unique identifier for a WAL segment.
 ///
 /// Ordering: by (region_id, shard_id, seqno) -- allows total ordering
 /// across all segments in the cluster.
 ///
 /// Display: "r0:s0:42" -- human-readable for logs and tidalctl output.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash,
         serde::Serialize, serde::Deserialize)]
 pub struct WalSegmentId {
    pub region_id: RegionId,
    pub shard_id: ShardId,
    pub seqno: u64,
 }
 impl WalSegmentId {
    pub fn new(region_id: RegionId, shard_id: ShardId, seqno: u64) -> Self {
        Self { region_id, shard_id, seqno }
    }
    /// Create a segment ID for the default single-node deployment.
    pub fn single_node(seqno: u64) -> Self {
        Self::new(RegionId::SINGLE, ShardId::SINGLE, seqno)
    }
 }
 impl PartialOrd for WalSegmentId {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
 }
 impl Ord for WalSegmentId {
    fn cmp(&self, other: &Self) -> Ordering {
        self.region_id.cmp(&other.region_id)
            .then(self.shard_id.cmp(&other.shard_id))
            .then(self.seqno.cmp(&other.seqno))
    }
 }
 impl fmt::Display for WalSegmentId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}:{}:{}", self.region_id, self.shard_id, self.seqno)
    }
 }
 ```
 ### NodeRole
 ```rust
 // tidal/src/db/config.rs (new enum, added here)
 /// The role of this node in the cluster.
 ///
 /// `Single` is the default -- a standalone node that acts as both leader
 /// and follower. Used for embedded deployments.
 ///
 /// `Leader` accepts writes and ships WAL segments to followers.
 ///
 /// `Follower` only accepts replayed events; write calls return ReadOnly.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Default,
         serde::Serialize, serde::Deserialize)]
 pub enum NodeRole {
    #[default]
    Single,
    Leader,
    Follower,
 }
 ```
 ### Module structure
 ```rust
 // tidal/src/replication/mod.rs
 //! Replication types and protocols for distributed tidalDB deployments.
 //!
 //! The `replication` module is empty in single-node deployments --
 //! all types default to shard_id=0, region_id=0, and routing is a no-op.
 pub mod shard;
 pub mod segment_id;
 pub mod state;
 pub use shard::{ShardId, RegionId};
 pub use segment_id::WalSegmentId;
 pub use state::ReplicationState;
 ```
 ## Acceptance Criteria
 - [ ] `ShardId(u16)` and `RegionId(u16)` derive `Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize`
 - [ ] `ShardId::SINGLE` = `ShardId(0)`, `RegionId::SINGLE` = `RegionId(0)`
 - [ ] `WalSegmentId` has total ordering by `(region_id, shard_id, seqno)`
 - [ ] `WalSegmentId::Display` produces `"r0:s0:42"` format
 - [ ] `WalSegmentId::single_node(seqno)` creates a single-node segment ID
 - [ ] `NodeRole` enum with `Single` (default), `Leader`, `Follower`
 - [ ] `tidal/src/replication/mod.rs` exports all types; wired into `tidal/src/lib.rs`
 - [ ] Unit tests: ordering, display, single-node defaults
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
 ## Test Strategy
 ```rust
 #[cfg(test)]
 mod tests {
    use super::*;
    #[test]
    fn segment_id_ordering() {
        let a = WalSegmentId::new(RegionId(0), ShardId(0), 1);
        let b = WalSegmentId::new(RegionId(0), ShardId(0), 2);
        let c = WalSegmentId::new(RegionId(0), ShardId(1), 0);
        let d = WalSegmentId::new(RegionId(1), ShardId(0), 0);
        assert!(a < b);
        assert!(b < c);
        assert!(c < d);
    }
    #[test]
    fn segment_id_display() {
        let id = WalSegmentId::new(RegionId(2), ShardId(3), 42);
        assert_eq!(id.to_string(), "r2:s3:42");
    }
    #[test]
    fn single_node_defaults() {
        assert_eq!(ShardId::SINGLE, ShardId(0));
        assert_eq!(RegionId::SINGLE, RegionId(0));
        assert_eq!(WalSegmentId::single_node(99).to_string(), "r0:s0:99");
    }
 }
 ```
--- a/docs/planning/milestone-8/phase-1/task-02-shard-router.md
+++ b/docs/planning/milestone-8/phase-1/task-02-shard-router.md
@ -0,0 +1,239 @@
 # Task 02: ShardRouter
 ## Delivers
 `ShardRouter` with `EntityIdRange` type, range-based and hash-based routing, validation that ranges partition the full u64 space, and property tests for deterministic routing. The `ShardRouter` maps any `EntityId` to exactly one `ShardId` and is the single source of truth for shard assignment.
 ## Complexity: M
 ## Dependencies
 - Task 01 (ShardId, RegionId types)
 ## Technical Design
 ```rust
 // tidal/src/replication/shard.rs
 use crate::EntityId;
 /// A contiguous, half-open range of EntityIds: [start, end).
 ///
 /// Used to define shard boundaries in range-based routing.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct EntityIdRange {
    pub start: u64,  // inclusive
    pub end: u64,    // exclusive; u64::MAX means "includes the last entity"
 }
 impl EntityIdRange {
    pub fn contains(&self, id: u64) -> bool {
        id >= self.start && id < self.end
    }
    /// The full u64 space (single-shard default).
    pub fn full() -> Self {
        Self { start: 0, end: u64::MAX }
    }
 }
 /// Routing strategy for entity-to-shard mapping.
 #[derive(Debug, Clone)]
 pub enum RoutingStrategy {
    /// All entities route to the default single shard.
    /// Used for single-node deployments (shard_id=0).
    Single,
    /// Hash-based routing: `hash(entity_id) % num_shards`.
    /// Uniform distribution; no explicit range boundaries.
    Hash { num_shards: u16 },
    /// Range-based routing: each shard owns a contiguous range of EntityIds.
    /// Production deployments use this for controlled data placement.
    Range(Vec<(ShardId, EntityIdRange)>),
 }
 /// Routes EntityIds to ShardIds.
 ///
 /// Thread-safe; clone is cheap (inner data is Arc<_>).
 #[derive(Debug, Clone)]
 pub struct ShardRouter {
    strategy: RoutingStrategy,
 }
 impl ShardRouter {
    /// Create a single-node router (always returns ShardId(0)).
    pub fn single() -> Self {
        Self { strategy: RoutingStrategy::Single }
    }
    /// Create a hash-based router with `num_shards` shards.
    pub fn hash(num_shards: u16) -> Result<Self, RouterError> {
        if num_shards == 0 {
            return Err(RouterError::ZeroShards);
        }
        Ok(Self { strategy: RoutingStrategy::Hash { num_shards } })
    }
    /// Create a range-based router from a list of (ShardId, EntityIdRange) pairs.
    ///
    /// Validates that:
    /// - Ranges are non-overlapping
    /// - Ranges cover the full u64 space (no gaps)
    /// - ShardIds are unique
    pub fn range(ranges: Vec<(ShardId, EntityIdRange)>) -> Result<Self, RouterError> {
        Self::validate_ranges(&ranges)?;
        Ok(Self { strategy: RoutingStrategy::Range(ranges) })
    }
    /// Route an EntityId to its owning ShardId.
    ///
    /// Always returns exactly one shard. Never panics.
    pub fn route(&self, entity_id: EntityId) -> ShardId {
        let id = entity_id.as_u64();
        match &self.strategy {
            RoutingStrategy::Single => ShardId::SINGLE,
            RoutingStrategy::Hash { num_shards } => {
                // FNV-1a hash for uniform distribution without dependencies
                let hash = fnv1a_hash(id);
                ShardId(hash as u16 % num_shards)
            }
            RoutingStrategy::Range(ranges) => {
                for (shard_id, range) in ranges {
                    if range.contains(id) {
                        return *shard_id;
                    }
                }
                // Invariant: validated at construction time that ranges cover
                // the full space, so this is unreachable.
                ShardId::SINGLE
            }
        }
    }
    /// Returns all ShardIds known to this router.
    pub fn all_shards(&self) -> Vec<ShardId> {
        match &self.strategy {
            RoutingStrategy::Single => vec![ShardId::SINGLE],
            RoutingStrategy::Hash { num_shards } => {
                (0..*num_shards).map(ShardId).collect()
            }
            RoutingStrategy::Range(ranges) => {
                let mut shards: Vec<_> = ranges.iter().map(|(s, _)| *s).collect();
                shards.sort();
                shards.dedup();
                shards
            }
        }
    }
    fn validate_ranges(ranges: &[(ShardId, EntityIdRange)]) -> Result<(), RouterError> {
        if ranges.is_empty() {
            return Err(RouterError::EmptyRanges);
        }
        // Sort by start position to check coverage and overlap.
        let mut sorted: Vec<_> = ranges.iter().collect();
        sorted.sort_by_key(|(_, r)| r.start);
        // Check no gaps and no overlaps.
        let mut expected_start = 0u64;
        for (_, range) in &sorted {
            if range.start != expected_start {
                return Err(RouterError::Gap {
                    expected: expected_start,
                    found: range.start,
                });
            }
            if range.end <= range.start {
                return Err(RouterError::EmptyRange { start: range.start });
            }
            expected_start = range.end;
        }
        // Check coverage of full space.
        if expected_start != u64::MAX {
            return Err(RouterError::IncompleteCoverage { ends_at: expected_start });
        }
        Ok(())
    }
 }
 #[inline]
 fn fnv1a_hash(value: u64) -> u64 {
    const FNV_OFFSET: u64 = 14_695_981_039_346_656_037;
    const FNV_PRIME: u64 = 1_099_511_628_211;
    let mut hash = FNV_OFFSET;
    let bytes = value.to_le_bytes();
    for byte in &bytes {
        hash ^= *byte as u64;
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    hash
 }
 #[derive(Debug, thiserror::Error)]
 pub enum RouterError {
    #[error("shard count must be > 0")]
    ZeroShards,
    #[error("range list is empty")]
    EmptyRanges,
    #[error("gap in range: expected start {expected}, found {found}")]
    Gap { expected: u64, found: u64 },
    #[error("empty range starting at {start}")]
    EmptyRange { start: u64 },
    #[error("ranges don't cover full u64 space: ends at {ends_at}")]
    IncompleteCoverage { ends_at: u64 },
 }
 ```
 ## Acceptance Criteria
 - [ ] `ShardRouter::single()` always returns `ShardId(0)` for any input
 - [ ] `ShardRouter::hash(n)` distributes entities uniformly; property test with 10K IDs shows max deviation < 15% from expected bucket size
 - [ ] `ShardRouter::range(ranges)` returns the correct shard for boundaries; property test with 10K random IDs within each range
 - [ ] `RouterError::Gap` when ranges have a gap; `RouterError::IncompleteCoverage` when ranges don't reach u64::MAX
 - [ ] `ShardRouter::all_shards()` returns all shards for each routing strategy
 - [ ] Routing is a pure function: same input always returns same output (property test with proptest)
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
 ## Test Strategy
 ```rust
 #[cfg(test)]
 mod tests {
    use super::*;
    use proptest::prelude::*;
    #[test]
    fn single_router_always_returns_shard_zero() {
        let router = ShardRouter::single();
        for id in [0u64, 1, 100, u64::MAX - 1] {
            assert_eq!(router.route(EntityId::from(id)), ShardId(0));
        }
    }
    #[test]
    fn range_router_validates_gap() {
        let result = ShardRouter::range(vec![
            (ShardId(0), EntityIdRange { start: 0, end: 1000 }),
            (ShardId(1), EntityIdRange { start: 2000, end: u64::MAX }),
        ]);
        assert!(matches!(result, Err(RouterError::Gap { .. })));
    }
    proptest! {
        #[test]
        fn hash_routing_is_deterministic(id in 0u64..u64::MAX) {
            let router = ShardRouter::hash(5).unwrap();
            let entity = EntityId::from(id);
            assert_eq!(router.route(entity), router.route(entity));
        }
        #[test]
        fn hash_routing_stays_in_range(id in 0u64..u64::MAX) {
            let router = ShardRouter::hash(5).unwrap();
            let shard = router.route(EntityId::from(id));
            assert!(shard.0 < 5);
        }
    }
 }
 ```
--- a/docs/planning/milestone-8/phase-1/task-03-batch-header-v2.md
+++ b/docs/planning/milestone-8/phase-1/task-03-batch-header-v2.md
@ -0,0 +1,120 @@
 # Task 03: BatchHeader v2
 ## Delivers
 Extend `BatchHeader` in `tidal/src/wal/format/batch.rs` to v2 format with `shard_id` and `region_id` fields at bytes 58-61; update encode/decode; ensure v1 backward compatibility (zeros decode as shard 0, region 0). Bumps `FORMAT_VERSION` to 2.
 ## Complexity: S
 ## Dependencies
 - Task 01 (ShardId, RegionId types)
 ## Technical Design
 The existing `BatchHeader` is 64 bytes. The current layout (from WAL research doc):
 ```
 Bytes 0-3:   MAGIC (0x54494441 = "TIDA")
 Bytes 4-7:   FORMAT_VERSION (u32 LE)
 Bytes 8-15:  first_seq (u64 LE)
 Bytes 16-23: last_seq (u64 LE)
 Bytes 24-31: event_count (u64 LE)
 Bytes 32-39: uncompressed_size (u64 LE)
 Bytes 40-47: compressed_size (u64 LE)
 Bytes 48-55: timestamp_ns (u64 LE)
 Bytes 56-59: checksum (u32 LE)         <- BLAKE3 first 4 bytes
 Bytes 60-61: [RESERVED / ZERO]
 Bytes 62-63: [RESERVED / ZERO]
 ```
 v2 adds `shard_id` and `region_id` at the zero-padded bytes:
 ```
 Bytes 56-59: checksum (u32 LE)
 Bytes 60-61: shard_id (u16 LE)    <- NEW in v2 (was zero padding in v1)
 Bytes 62-63: region_id (u16 LE)   <- NEW in v2 (was zero padding in v1)
 ```
 This is backward compatible: v1 always wrote zeros at 60-63, so v2 code reading v1 segments correctly interprets shard_id=0, region_id=0.
 ```rust
 // tidal/src/wal/format/batch.rs
 pub const FORMAT_VERSION_V1: u32 = 1;
 pub const FORMAT_VERSION_V2: u32 = 2;
 pub const FORMAT_VERSION: u32 = FORMAT_VERSION_V2;
 #[derive(Debug, Clone, PartialEq)]
 pub struct BatchHeader {
    pub first_seq: u64,
    pub last_seq: u64,
    pub event_count: u64,
    pub uncompressed_size: u64,
    pub compressed_size: u64,
    pub timestamp_ns: u64,
    pub checksum: u32,
    // v2 fields -- default to 0 for single-node deployments
    pub shard_id: ShardId,
    pub region_id: RegionId,
 }
 impl BatchHeader {
    /// Encode to the 64-byte wire format.
    pub fn encode(&self) -> [u8; 64] {
        let mut buf = [0u8; 64];
        buf[0..4].copy_from_slice(&MAGIC.to_le_bytes());
        buf[4..8].copy_from_slice(&FORMAT_VERSION.to_le_bytes());
        buf[8..16].copy_from_slice(&self.first_seq.to_le_bytes());
        buf[16..24].copy_from_slice(&self.last_seq.to_le_bytes());
        buf[24..32].copy_from_slice(&self.event_count.to_le_bytes());
        buf[32..40].copy_from_slice(&self.uncompressed_size.to_le_bytes());
        buf[40..48].copy_from_slice(&self.compressed_size.to_le_bytes());
        buf[48..56].copy_from_slice(&self.timestamp_ns.to_le_bytes());
        buf[56..60].copy_from_slice(&self.checksum.to_le_bytes());
        buf[60..62].copy_from_slice(&self.shard_id.0.to_le_bytes());
        buf[62..64].copy_from_slice(&self.region_id.0.to_le_bytes());
        buf
    }
    /// Decode from a 64-byte buffer.
    ///
    /// Accepts both v1 (shard_id=0, region_id=0) and v2 format.
    pub fn decode(buf: &[u8; 64]) -> Result<Self, WalError> {
        let magic = u32::from_le_bytes(buf[0..4].try_into().unwrap());
        if magic != MAGIC {
            return Err(WalError::Corruption("bad magic".into()));
        }
        let version = u32::from_le_bytes(buf[4..8].try_into().unwrap());
        if version != FORMAT_VERSION_V1 && version != FORMAT_VERSION_V2 {
            return Err(WalError::Corruption(format!("unknown version {version}")));
        }
        let shard_id = ShardId(u16::from_le_bytes(buf[60..62].try_into().unwrap()));
        let region_id = RegionId(u16::from_le_bytes(buf[62..64].try_into().unwrap()));
        Ok(Self {
            first_seq: u64::from_le_bytes(buf[8..16].try_into().unwrap()),
            last_seq: u64::from_le_bytes(buf[16..24].try_into().unwrap()),
            event_count: u64::from_le_bytes(buf[24..32].try_into().unwrap()),
            uncompressed_size: u64::from_le_bytes(buf[32..40].try_into().unwrap()),
            compressed_size: u64::from_le_bytes(buf[40..48].try_into().unwrap()),
            timestamp_ns: u64::from_le_bytes(buf[48..56].try_into().unwrap()),
            checksum: u32::from_le_bytes(buf[56..60].try_into().unwrap()),
            shard_id,
            region_id,
        })
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `BatchHeader` has `shard_id: ShardId` and `region_id: RegionId` fields
 - [ ] `BatchHeader::encode()` writes shard_id at bytes 60-61 (LE) and region_id at bytes 62-63 (LE)
 - [ ] `BatchHeader::decode()` reads these bytes; v1 batches (zeros at 60-63) decode as `ShardId(0)`, `RegionId(0)`
 - [ ] `FORMAT_VERSION` is bumped to 2; v1 reader accepts v1 and v2 version bytes
 - [ ] Property test: encode + decode roundtrips for random shard_id, region_id values
 - [ ] Property test: a buffer created with v1 code (shard bytes zeroed) decodes correctly
 - [ ] All existing WAL tests pass (write/read/recovery) -- single-node uses shard=0, region=0 by default
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-1/task-04-segment-naming.md
+++ b/docs/planning/milestone-8/phase-1/task-04-segment-naming.md
@ -0,0 +1,93 @@
 # Task 04: Shard-Aware Segment Naming
 ## Delivers
 Update `segment_filename()` and `parse_segment_seq()` in `tidal/src/wal/segment.rs` to support shard-prefixed filenames. Single-shard (shard_id=0) retains the existing filename format for backward compatibility. Multi-shard deployments use a shard-prefixed format.
 ## Complexity: S
 ## Dependencies
 - Task 01 (ShardId type)
 ## Technical Design
 ```rust
 // tidal/src/wal/segment.rs
 /// Generate the WAL segment filename for a given shard and sequence number.
 ///
 /// Single-shard (shard_id=0): `wal-{first_seq:020}.seg`
 ///   -- matches existing format, full backward compatibility
 ///
 /// Multi-shard (shard_id > 0): `wal-s{shard_id:05}-{first_seq:020}.seg`
 ///   -- includes shard prefix for disambiguation in shared WAL directories
 pub fn segment_filename(shard_id: ShardId, first_seq: u64) -> String {
    if shard_id == ShardId::SINGLE {
        format!("wal-{first_seq:020}.seg")
    } else {
        format!("wal-s{:05}-{:020}.seg", shard_id.0, first_seq)
    }
 }
 /// Parse the first_seq from a WAL segment filename.
 ///
 /// Accepts both formats:
 ///   - `wal-{first_seq:020}.seg` (single-shard, v1)
 ///   - `wal-s{shard_id:05}-{first_seq:020}.seg` (multi-shard, v2)
 ///
 /// Returns `(ShardId, first_seq)`.
 pub fn parse_segment_filename(filename: &str) -> Option<(ShardId, u64)> {
    let name = filename.strip_suffix(".seg")?;
    // Multi-shard format: wal-s{shard_id}-{first_seq}
    if let Some(rest) = name.strip_prefix("wal-s") {
        let dash = rest.find('-')?;
        let shard_id: u16 = rest[..dash].parse().ok()?;
        let first_seq: u64 = rest[dash + 1..].parse().ok()?;
        return Some((ShardId(shard_id), first_seq));
    }
    // Single-shard format: wal-{first_seq}
    if let Some(seq_str) = name.strip_prefix("wal-") {
        let first_seq: u64 = seq_str.parse().ok()?;
        return Some((ShardId::SINGLE, first_seq));
    }
    None
 }
 /// Scan a directory for WAL segments belonging to `shard_id`.
 ///
 /// In single-shard deployments, returns all segments (no prefix filtering).
 /// In multi-shard deployments, filters by shard prefix.
 pub fn list_segments_for_shard(
    dir: &Path,
    shard_id: ShardId,
 ) -> Result<Vec<(u64, PathBuf)>, WalError> {
    let mut segments = Vec::new();
    for entry in fs::read_dir(dir)? {
        let entry = entry?;
        let file_name = entry.file_name();
        let name = file_name.to_string_lossy();
        if let Some((seg_shard, seq)) = parse_segment_filename(&name) {
            if seg_shard == shard_id || shard_id == ShardId::SINGLE {
                segments.push((seq, entry.path()));
            }
        }
    }
    segments.sort_by_key(|(seq, _)| *seq);
    Ok(segments)
 }
 ```
 ## Acceptance Criteria
 - [ ] `segment_filename(ShardId(0), 42)` returns `"wal-00000000000000000042.seg"` (existing format)
 - [ ] `segment_filename(ShardId(3), 42)` returns `"wal-s00003-00000000000000000042.seg"`
 - [ ] `parse_segment_filename` correctly parses both formats
 - [ ] `parse_segment_filename("not-a-segment.txt")` returns `None`
 - [ ] `list_segments_for_shard` returns segments in sequence order; filters by shard in multi-shard directories
 - [ ] All existing WAL tests pass (they use ShardId(0) which retains existing filename format)
 - [ ] Property test: `parse_segment_filename(segment_filename(shard, seq))` roundtrips correctly
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-1/task-05-node-config.md
+++ b/docs/planning/milestone-8/phase-1/task-05-node-config.md
@ -0,0 +1,123 @@
 # Task 05: NodeConfig
 ## Delivers
 Add `NodeConfig` struct to `tidal/src/db/config.rs` extending `Config` with cluster fields (`role`, `shard_id`, `region_id`, `peer_shards`). Defaults produce a single-node config with zero changes to existing embedders.
 ## Complexity: S
 ## Dependencies
 - Task 01 (ShardId, RegionId, NodeRole types)
 - Task 02 (ShardRouter)
 ## Technical Design
 ```rust
 // tidal/src/db/config.rs
 /// Cluster configuration for distributed tidalDB deployments.
 ///
 /// Defaults produce a single-node configuration identical to M0-M7 behavior.
 /// Embedded deployments that do not set any cluster fields get single-node.
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct NodeConfig {
    /// The role of this node.
    ///
    /// Default: `NodeRole::Single` (standalone, accepts reads and writes).
    pub role: NodeRole,
    /// This node's shard identity.
    ///
    /// Default: `ShardId(0)` (single-node shard).
    pub shard_id: ShardId,
    /// This node's region identity.
    ///
    /// Default: `RegionId(0)` (single-node region).
    pub region_id: RegionId,
    /// Shards this node is aware of (including itself).
    ///
    /// Empty for single-node deployments.
    pub peer_shards: Vec<ShardId>,
    /// Routing strategy for entity-to-shard assignment.
    ///
    /// Default: `ShardRouter::single()` (all entities -> ShardId(0)).
    #[serde(skip)]
    pub router: ShardRouter,
 }
 impl Default for NodeConfig {
    fn default() -> Self {
        Self {
            role: NodeRole::Single,
            shard_id: ShardId::SINGLE,
            region_id: RegionId::SINGLE,
            peer_shards: vec![],
            router: ShardRouter::single(),
        }
    }
 }
 impl NodeConfig {
    /// Returns true if this is a standalone single-node deployment.
    pub fn is_single_node(&self) -> bool {
        self.role == NodeRole::Single
    }
    /// Returns true if this node accepts writes.
    pub fn accepts_writes(&self) -> bool {
        matches!(self.role, NodeRole::Single | NodeRole::Leader)
    }
 }
 ```
 ### Integration into Config
 ```rust
 // tidal/src/db/config.rs -- extend existing Config struct
 pub struct Config {
    // ... existing fields ...
    /// Cluster configuration. Default: single-node.
    pub cluster: NodeConfig,
 }
 impl Default for Config {
    fn default() -> Self {
        Self {
            // ... existing defaults ...
            cluster: NodeConfig::default(),
        }
    }
 }
 ```
 ### Builder integration
 ```rust
 // TidalDb::builder() -- add optional cluster config method
 impl TidalDbBuilder {
    /// Configure this instance for distributed deployment.
    ///
    /// Not required for single-node embedded use.
    pub fn with_cluster(mut self, config: NodeConfig) -> Self {
        self.config.cluster = config;
        self
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `NodeConfig::default()` produces `role=Single`, `shard_id=ShardId(0)`, `region_id=RegionId(0)`, `peer_shards=[]`, `router=ShardRouter::single()`
 - [ ] `NodeConfig::is_single_node()` returns true for `Single`, false for `Leader`/`Follower`
 - [ ] `NodeConfig::accepts_writes()` returns true for `Single` and `Leader`, false for `Follower`
 - [ ] `Config` gains a `cluster: NodeConfig` field with default `NodeConfig::default()`
 - [ ] All existing tests that construct `Config` or use `TidalDb::builder()` pass unchanged (cluster field defaults to single-node)
 - [ ] `TidalDbBuilder::with_cluster(config)` sets the cluster config
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-1/task-06-replication-state.md
+++ b/docs/planning/milestone-8/phase-1/task-06-replication-state.md
@ -0,0 +1,125 @@
 # Task 06: ReplicationState
 ## Delivers
 `ReplicationState` in `tidal/src/replication/state.rs` tracking per-shard high-water-mark seqno with `AtomicU64` for lock-free reads. Serialize/deserialize for checkpoint persistence. Used by followers to track which segments have been applied.
 ## Complexity: S
 ## Dependencies
 - Task 05 (NodeConfig -- establishes the set of known shards)
 ## Technical Design
 ```rust
 // tidal/src/replication/state.rs
 use crate::replication::shard::ShardId;
 use std::collections::HashMap;
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::Arc;
 /// Tracks the per-shard replication high-water-mark.
 ///
 /// Each shard tracks the last WAL segment seqno that has been fully
 /// applied to the local state machine. Segments with seqno <=
 /// high_water_mark are idempotent no-ops on replay.
 ///
 /// Thread-safe: all fields are atomic. Clone is O(n_shards) -- clones
 /// the snapshot, not the atomics.
 #[derive(Debug)]
 pub struct ReplicationState {
    /// Per-shard high-water-mark seqno.
    /// `AtomicU64::MAX` means "no segments applied yet" (initial state).
    applied: HashMap<ShardId, Arc<AtomicU64>>,
 }
 impl ReplicationState {
    /// Create a new `ReplicationState` tracking the given shards.
    ///
    /// All high-water-marks start at 0 (no segments applied).
    pub fn new(shards: &[ShardId]) -> Self {
        let applied = shards
            .iter()
            .map(|&s| (s, Arc::new(AtomicU64::new(0))))
            .collect();
        Self { applied }
    }
    /// Create a single-node `ReplicationState` (tracks only `ShardId(0)`).
    pub fn single() -> Self {
        Self::new(&[ShardId::SINGLE])
    }
    /// Get the high-water-mark seqno for a shard.
    ///
    /// Returns `None` if the shard is unknown to this state.
    pub fn applied_seqno(&self, shard_id: ShardId) -> Option<u64> {
        self.applied.get(&shard_id).map(|a| a.load(Ordering::Acquire))
    }
    /// Update the high-water-mark for a shard.
    ///
    /// Only advances forward -- a seqno smaller than the current
    /// high-water-mark is silently ignored.
    pub fn advance(&self, shard_id: ShardId, seqno: u64) {
        if let Some(atomic) = self.applied.get(&shard_id) {
            let mut current = atomic.load(Ordering::Acquire);
            loop {
                if seqno <= current {
                    break; // already at or past this seqno
                }
                match atomic.compare_exchange_weak(
                    current,
                    seqno,
                    Ordering::AcqRel,
                    Ordering::Acquire,
                ) {
                    Ok(_) => break,
                    Err(actual) => current = actual,
                }
            }
        }
    }
    /// Returns all tracked shards and their current seqnos.
    pub fn snapshot(&self) -> HashMap<ShardId, u64> {
        self.applied
            .iter()
            .map(|(&s, a)| (s, a.load(Ordering::Acquire)))
            .collect()
    }
    /// Serialize for checkpoint persistence.
    pub fn to_checkpoint_bytes(&self) -> Vec<u8> {
        let snap = self.snapshot();
        serde_json::to_vec(&snap).expect("ReplicationState serialization is infallible")
    }
    /// Restore from checkpoint bytes.
    pub fn from_checkpoint_bytes(bytes: &[u8], shards: &[ShardId]) -> Self {
        let snap: HashMap<ShardId, u64> = serde_json::from_slice(bytes)
            .unwrap_or_default();
        let applied = shards
            .iter()
            .map(|&s| {
                let seqno = snap.get(&s).copied().unwrap_or(0);
                (s, Arc::new(AtomicU64::new(seqno)))
            })
            .collect();
        Self { applied }
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `ReplicationState::single()` tracks only `ShardId(0)`; initial seqno = 0
 - [ ] `ReplicationState::advance(shard, seqno)` atomically advances the high-water-mark; never decreases
 - [ ] `ReplicationState::applied_seqno(shard)` returns `None` for unknown shards
 - [ ] `advance` is safe to call from multiple threads concurrently (CAS loop)
 - [ ] `to_checkpoint_bytes` + `from_checkpoint_bytes` roundtrip preserves all shard seqnos
 - [ ] `ReplicationState` is `Send + Sync`
 - [ ] Unit tests: advance monotonicity, concurrent advance from 4 threads, checkpoint roundtrip
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-2/OVERVIEW.md
+++ b/docs/planning/milestone-8/phase-2/OVERVIEW.md
@ -0,0 +1,109 @@
 # m8p2: WAL Shipping and Follower Replay
 ## Delivers
 One-way WAL replication from leader to followers. The leader ships sealed WAL
 segments over an abstract transport trait. Followers receive segments, validate
 checksums, and replay them idempotently through the existing signal ledger
 `apply_wal_event()` path. A replication lag metric is emitted. A follower can
 serve read queries (RETRIEVE, SEARCH) with bounded staleness.
 This is the "read replicas" capability -- the foundation for multi-region deployment.
 Deliverables:
 - `Transport` trait: `async fn send_segment(peer: ShardId, segment: &WalSegmentPayload)` and `async fn recv_segment() -> WalSegmentPayload`
 - `InProcessTransport`: for testing, uses `tokio::sync::mpsc` channels between co-located instances
 - `WalShipper`: background task on leader that watches for sealed segments, ships them to registered followers
 - `SegmentReceiver`: background task on follower that receives segments, validates BLAKE3, replays events
 - `ReplicationLagGauge`: tracks the delta between leader's latest seqno and each follower's applied seqno
 - `FollowerDb`: a `TidalDb` variant that does not accept writes, only replays segments; serves read queries from its local state
 ## Dependencies
 - **Requires:** Phase 8.1 (ShardId, RegionId, WalSegmentId, BatchHeader v2, ReplicationState)
 - **Files modified:**
  - `tidal/src/wal/segment.rs` -- `sealed_segments_since(seqno)` helper
  - `tidal/src/db/open.rs` -- support `NodeRole::Follower` startup
  - `tidal/src/db/mod.rs` -- `TidalDb::is_follower()` guard on write paths
  - `tidal/src/signals/ledger/mod.rs` -- ensure `apply_wal_event()` is idempotent when replaying duplicate segments
 - **Files created:**
  - `tidal/src/replication/transport.rs` -- `Transport` trait, `WalSegmentPayload`
  - `tidal/src/replication/in_process.rs` -- `InProcessTransport`
  - `tidal/src/replication/shipper.rs` -- `WalShipper`
  - `tidal/src/replication/receiver.rs` -- `SegmentReceiver`
  - `tidal/src/replication/lag.rs` -- `ReplicationLagGauge`
 ## Research References
 - `docs/research/tidaldb_wal.md` -- Segment sealing, batch checksum validation
 - `thoughts.md` -- Part V.5 (quarantine-first ingestion; WAL is source of truth)
 ## Acceptance Criteria (Phase Level)
 - [ ] `Transport` trait has `send_segment` and `recv_segment` async methods; `InProcessTransport` implements them via bounded mpsc channels
 - [ ] `WalShipper` runs as a background `tokio::task`; polls for newly sealed segments every 2 seconds (configurable); ships segments to all registered followers in parallel
 - [ ] `SegmentReceiver` validates BLAKE3 checksum of each received segment before replay; rejects corrupted segments with `WalError::Corruption`
 - [ ] Follower replay is idempotent: replaying a segment with seqno <= follower's high-water-mark is a no-op (no duplicate signal counting)
 - [ ] `ReplicationLagGauge` reports `leader_seqno - follower_applied_seqno` per follower; accessible via `MetricsState`
 - [ ] Leader writes 1,000 signals -> follower replays all 1,000 -> `read_decay_score` on follower matches leader to 6 decimal places (analytical equivalence)
 - [ ] Follower rejects write operations (`db.signal()`, `db.write_item()`) with `TidalError::ReadOnly`
 - [ ] Replication lag converges to 0 within 5 seconds after leader quiesces (in-process transport)
 - [ ] Leader crash and restart: follower continues serving reads from last replayed state; leader resumes shipping from last sealed segment
 - [ ] `FollowerDb` serves `db.retrieve()` and `db.search()` queries against its local replayed state
 ## Task Execution Order
 ```
 Task 01: Transport Trait ──────┐
                                ├──> Task 03: WalShipper
 Task 02: InProcessTransport ───┘         │
                                          v
                                Task 04: SegmentReceiver
                                          │
                                          v
                                Task 05: FollowerDb
                                          │
                                          v
                                Task 06: ReplicationLagGauge
                                          │
                                          v
                                Task 07: Integration Tests
 ```
 Tasks 01 and 02 are parallelizable. Task 03 requires Task 01. Tasks 04-07 are sequential.
 ## Module Location
 | File | Status | Contains |
 |------|--------|----------|
 | `tidal/src/replication/transport.rs` | NEW | `Transport` trait, `WalSegmentPayload` |
 | `tidal/src/replication/in_process.rs` | NEW | `InProcessTransport` (channel-based) |
 | `tidal/src/replication/shipper.rs` | NEW | `WalShipper` background task |
 | `tidal/src/replication/receiver.rs` | NEW | `SegmentReceiver` with checksum validation and replay |
 | `tidal/src/replication/lag.rs` | NEW | `ReplicationLagGauge` |
 | `tidal/src/wal/segment.rs` | MODIFIED | `sealed_segments_since(seqno)` |
 | `tidal/src/db/open.rs` | MODIFIED | Follower startup path |
 | `tidal/src/db/mod.rs` | MODIFIED | Write-rejection guard for followers |
 | `tidal/src/signals/ledger/mod.rs` | MODIFIED | Idempotency guard on `apply_wal_event` |
 ## Notes
 ### In-process transport only in this phase
 A TCP/gRPC transport is deferred to Phase 8.5. The `Transport` trait is async to support both in-process channels and future network transports.
 ### Idempotency via seqno
 Followers track their high-water-mark `applied_seqno`. Segments with `first_seq <= applied_seqno` are skipped entirely. This reuses the existing checkpoint format from M1.
 ### Timer-based segment sealing
 The existing `WalHandle` seals segments when they reach `max_size`. For replication, we add a timer-based seal: every `wal_ship_interval` (default 2s), the active segment is sealed even if not full. This bounds replication lag.
 ### No Raft, no consensus
 This is primary-backup replication. One leader, N followers. Promotion is manual or triggered by the control plane (Phase 8.5).
 ## Done When
 A developer can start a leader and a follower using `InProcessTransport`, write 10,000 signals to the leader, observe the follower replay all events with lag < 5 seconds, and execute `db.retrieve()` on the follower with results matching the leader's state (modulo staleness of up to 1 batch).
--- a/docs/planning/milestone-8/phase-2/task-01-transport-trait.md
+++ b/docs/planning/milestone-8/phase-2/task-01-transport-trait.md
@ -0,0 +1,80 @@
 # Task 01: Transport Trait
 ## Delivers
 Define `Transport` trait with `send_segment` / `recv_segment` async methods and `WalSegmentPayload` (segment bytes + `WalSegmentId` header) in `tidal/src/replication/transport.rs`. The trait is the abstraction boundary between replication logic (phase-independent correctness) and network I/O (deployment-specific).
 ## Complexity: S
 ## Dependencies
 - Phase 8.1 complete (WalSegmentId, ShardId)
 ## Technical Design
 ```rust
 // tidal/src/replication/transport.rs
 use crate::replication::{ShardId, WalSegmentId};
 use async_trait::async_trait;
 /// A WAL segment payload ready for transport.
 ///
 /// Contains the segment's globally unique ID, the raw segment bytes
 /// (already BLAKE3-checksummed by the WAL writer), and the count of
 /// events for quick validation on the receiver side.
 #[derive(Debug, Clone)]
 pub struct WalSegmentPayload {
    pub id: WalSegmentId,
    pub bytes: bytes::Bytes,
    pub event_count: u64,
 }
 /// Transport abstraction for WAL segment shipping.
 ///
 /// Implementations include:
 /// - `InProcessTransport` (for testing, via tokio mpsc channels)
 /// - Future: gRPC transport for production deployments
 ///
 /// The trait is async to support both in-memory and network transports
 /// without blocking the Tokio runtime.
 #[async_trait]
 pub trait Transport: Send + Sync + 'static {
    /// Send a WAL segment to a follower shard.
    ///
    /// Returns `Ok(())` when the segment is durably queued for delivery.
    /// Does NOT wait for the follower to apply the segment.
    async fn send_segment(
        &self,
        to_shard: ShardId,
        payload: WalSegmentPayload,
    ) -> Result<(), TransportError>;
    /// Receive the next WAL segment from a leader.
    ///
    /// Blocks until a segment is available. Returns `None` when the
    /// transport is closed (leader has shut down).
    async fn recv_segment(&self) -> Option<WalSegmentPayload>;
    /// Returns the ShardId this transport endpoint represents.
    fn local_shard(&self) -> ShardId;
 }
 #[derive(Debug, thiserror::Error)]
 pub enum TransportError {
    #[error("peer shard {0} not registered")]
    UnknownPeer(ShardId),
    #[error("transport channel closed")]
    Closed,
    #[error("payload too large: {size} bytes > max {max}")]
    PayloadTooLarge { size: usize, max: usize },
 }
 ```
 ## Acceptance Criteria
 - [ ] `WalSegmentPayload` has `id: WalSegmentId`, `bytes: bytes::Bytes`, `event_count: u64`
 - [ ] `Transport` trait has `send_segment` and `recv_segment` async methods
 - [ ] `Transport: Send + Sync + 'static` (object-safe, can be used in `Arc<dyn Transport>`)
 - [ ] `TransportError` covers `UnknownPeer`, `Closed`, `PayloadTooLarge`
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-2/task-02-in-process-transport.md
+++ b/docs/planning/milestone-8/phase-2/task-02-in-process-transport.md
@ -0,0 +1,139 @@
 # Task 02: InProcessTransport
 ## Delivers
 Implement `InProcessTransport` using `tokio::sync::mpsc::Sender/Receiver` pairs in `tidal/src/replication/in_process.rs`. One channel per (leader, follower) pair. Used exclusively in tests -- never in production code.
 ## Complexity: S
 ## Dependencies
 - Task 01 (Transport trait, WalSegmentPayload)
 ## Technical Design
 ```rust
 // tidal/src/replication/in_process.rs
 use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use tokio::sync::mpsc;
 use crate::replication::transport::{Transport, TransportError, WalSegmentPayload};
 use crate::replication::ShardId;
 /// Bounded channel capacity for in-process segment delivery.
 const DEFAULT_CHANNEL_CAPACITY: usize = 256;
 /// In-process WAL segment transport for testing.
 ///
 /// Creates a mesh of mpsc channels between shards. Each shard has
 /// a sender map (shard -> Sender) and a single receiver.
 ///
 /// Usage:
 /// ```rust
 /// let factory = InProcessTransportFactory::new();
 /// let leader_transport = factory.create(ShardId(0));
 /// let follower_transport = factory.create(ShardId(1));
 /// factory.connect(ShardId(0), ShardId(1)); // leader can send to follower
 /// ```
 pub struct InProcessTransportFactory {
    senders: Arc<Mutex<HashMap<ShardId, HashMap<ShardId, mpsc::Sender<WalSegmentPayload>>>>>,
    receivers: Arc<Mutex<HashMap<ShardId, mpsc::Receiver<WalSegmentPayload>>>>,
    capacity: usize,
 }
 impl InProcessTransportFactory {
    pub fn new() -> Self {
        Self {
            senders: Arc::new(Mutex::new(HashMap::new())),
            receivers: Arc::new(Mutex::new(HashMap::new())),
            capacity: DEFAULT_CHANNEL_CAPACITY,
        }
    }
    pub fn with_capacity(mut self, capacity: usize) -> Self {
        self.capacity = capacity;
        self
    }
    /// Create a transport endpoint for `shard_id`.
    pub fn create(&self, shard_id: ShardId) -> Arc<InProcessTransport> {
        let (tx, rx) = mpsc::channel(self.capacity);
        let mut senders = self.senders.lock().unwrap();
        let mut receivers = self.receivers.lock().unwrap();
        senders.entry(shard_id).or_default();
        receivers.insert(shard_id, rx);
        Arc::new(InProcessTransport {
            local: shard_id,
            senders: Arc::clone(&self.senders),
            receiver: Mutex::new(Some(rx)),
        })
    }
    /// Wire a one-way connection: `from` can send to `to`.
    pub fn connect(&self, from: ShardId, to: ShardId) {
        let (tx, rx) = mpsc::channel(self.capacity);
        self.senders
            .lock()
            .unwrap()
            .entry(from)
            .or_default()
            .insert(to, tx);
        // Store the receiver in the `to` shard's transport.
        // (Implementation detail: injects directly into the transport's receiver field)
    }
 }
 pub struct InProcessTransport {
    local: ShardId,
    senders: Arc<Mutex<HashMap<ShardId, HashMap<ShardId, mpsc::Sender<WalSegmentPayload>>>>>,
    receiver: Mutex<Option<mpsc::Receiver<WalSegmentPayload>>>,
 }
 #[async_trait::async_trait]
 impl Transport for InProcessTransport {
    async fn send_segment(
        &self,
        to_shard: ShardId,
        payload: WalSegmentPayload,
    ) -> Result<(), TransportError> {
        let sender = {
            let senders = self.senders.lock().unwrap();
            senders
                .get(&self.local)
                .and_then(|map| map.get(&to_shard))
                .cloned()
                .ok_or(TransportError::UnknownPeer(to_shard))?
        };
        sender
            .send(payload)
            .await
            .map_err(|_| TransportError::Closed)
    }
    async fn recv_segment(&self) -> Option<WalSegmentPayload> {
        let mut guard = self.receiver.lock().unwrap();
        if let Some(rx) = guard.as_mut() {
            rx.recv().await
        } else {
            None
        }
    }
    fn local_shard(&self) -> ShardId {
        self.local
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `InProcessTransportFactory::create(shard_id)` returns a transport endpoint for that shard
 - [ ] `send_segment` delivers the payload to the receiver's channel
 - [ ] `recv_segment` returns `None` when all senders are dropped (channel closed)
 - [ ] `send_segment` to an unregistered peer returns `TransportError::UnknownPeer`
 - [ ] Concurrent sends from multiple tasks are safe (mpsc semantics)
 - [ ] Unit test: send 100 segments from one transport, receive all 100 on another
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-2/task-03-wal-shipper.md
+++ b/docs/planning/milestone-8/phase-2/task-03-wal-shipper.md
@ -0,0 +1,122 @@
 # Task 03: WalShipper
 ## Delivers
 `WalShipper` background task in `tidal/src/replication/shipper.rs`. Watches for newly sealed WAL segments in the data directory, ships them to all registered follower shards via `Transport`, and tracks the last-shipped seqno per follower.
 ## Complexity: M
 ## Dependencies
 - Task 01 (Transport trait)
 - Task 02 (InProcessTransport, needed for tests)
 ## Technical Design
 ```rust
 // tidal/src/replication/shipper.rs
 /// Polls for newly sealed WAL segments and ships them to followers.
 ///
 /// Runs as a background tokio task. Exits when `shutdown_rx` receives.
 /// Ships to all registered followers in parallel (join_all).
 pub struct WalShipper {
    transport: Arc<dyn Transport>,
    followers: Vec<ShardId>,
    data_dir: PathBuf,
    shard_id: ShardId,
    poll_interval: Duration,
    last_shipped: AtomicU64,
 }
 impl WalShipper {
    pub fn new(
        transport: Arc<dyn Transport>,
        followers: Vec<ShardId>,
        data_dir: PathBuf,
        shard_id: ShardId,
    ) -> Self {
        Self {
            transport,
            followers,
            data_dir,
            shard_id,
            poll_interval: Duration::from_secs(2),
            last_shipped: AtomicU64::new(0),
        }
    }
    /// Start the shipper as a background task.
    ///
    /// Returns a handle that can be used to signal shutdown.
    pub fn start(self: Arc<Self>, shutdown_rx: tokio::sync::watch::Receiver<bool>)
        -> tokio::task::JoinHandle<()>
    {
        tokio::spawn(async move {
            self.run(shutdown_rx).await;
        })
    }
    async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
        let mut interval = tokio::time::interval(self.poll_interval);
        loop {
            tokio::select! {
                _ = interval.tick() => {
                    if let Err(e) = self.ship_pending_segments().await {
                        tracing::warn!("WalShipper: error shipping segments: {e}");
                    }
                }
                Ok(_) = shutdown.changed() => {
                    if *shutdown.borrow() {
                        // Final ship before shutdown
                        let _ = self.ship_pending_segments().await;
                        break;
                    }
                }
            }
        }
    }
    async fn ship_pending_segments(&self) -> Result<(), WalError> {
        let last = self.last_shipped.load(Ordering::Acquire);
        let segments = list_sealed_segments_since(&self.data_dir, self.shard_id, last)?;
        for (seqno, path) in segments {
            let bytes = tokio::fs::read(&path).await?;
            let payload = WalSegmentPayload {
                id: WalSegmentId::new(
                    RegionId::SINGLE, // will be populated from NodeConfig in Phase 8.5
                    self.shard_id,
                    seqno,
                ),
                bytes: bytes::Bytes::from(bytes),
                event_count: 0, // filled from BatchHeader decode
            };
            // Ship to all followers in parallel.
            let futs: Vec<_> = self.followers.iter()
                .map(|&follower| {
                    let transport = Arc::clone(&self.transport);
                    let payload = payload.clone();
                    async move { transport.send_segment(follower, payload).await }
                })
                .collect();
            futures::future::join_all(futs).await;
            self.last_shipped.store(seqno, Ordering::Release);
        }
        Ok(())
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `WalShipper::start()` spawns a background tokio task
 - [ ] Shipper polls `data_dir` for sealed segments with seqno > `last_shipped`
 - [ ] Segments are shipped to all followers in parallel via `Transport::send_segment`
 - [ ] `last_shipped` is updated after each segment is shipped to all followers
 - [ ] Shutdown signal causes the shipper to flush pending segments then exit
 - [ ] Shipper handles transport errors gracefully (logs warning, does not crash)
 - [ ] Integration test: leader with 10 segments -> shipper delivers all 10 to follower transport
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-2/task-04-segment-receiver.md
+++ b/docs/planning/milestone-8/phase-2/task-04-segment-receiver.md
@ -0,0 +1,119 @@
 # Task 04: SegmentReceiver
 ## Delivers
 `SegmentReceiver` background task in `tidal/src/replication/receiver.rs`. Receives `WalSegmentPayload` from transport, validates BLAKE3 checksum, decodes batches, and replays events through `SignalLedger::apply_wal_event()`. Idempotent via seqno high-water-mark.
 ## Complexity: M
 ## Dependencies
 - Task 01 (Transport trait)
 - Task 02 (InProcessTransport)
 - Phase 8.1 (ReplicationState for high-water-mark)
 ## Technical Design
 ```rust
 // tidal/src/replication/receiver.rs
 /// Receives WAL segments from a leader and replays them locally.
 ///
 /// Runs as a background tokio task. The receiver maintains strict
 /// idempotency: segments with seqno <= `applied_seqno` are skipped.
 pub struct SegmentReceiver {
    transport: Arc<dyn Transport>,
    signal_ledger: Arc<SignalLedger>,
    replication_state: Arc<ReplicationState>,
    leader_shard: ShardId,
 }
 impl SegmentReceiver {
    pub fn new(
        transport: Arc<dyn Transport>,
        signal_ledger: Arc<SignalLedger>,
        replication_state: Arc<ReplicationState>,
        leader_shard: ShardId,
    ) -> Self {
        Self { transport, signal_ledger, replication_state, leader_shard }
    }
    pub fn start(self: Arc<Self>, shutdown_rx: tokio::sync::watch::Receiver<bool>)
        -> tokio::task::JoinHandle<()>
    {
        tokio::spawn(async move {
            self.run(shutdown_rx).await;
        })
    }
    async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
        loop {
            tokio::select! {
                segment = self.transport.recv_segment() => {
                    match segment {
                        Some(payload) => {
                            if let Err(e) = self.apply_segment(payload).await {
                                tracing::error!("SegmentReceiver: apply error: {e}");
                            }
                        }
                        None => {
                            tracing::info!("SegmentReceiver: transport closed, stopping");
                            break;
                        }
                    }
                }
                Ok(_) = shutdown.changed() => {
                    if *shutdown.borrow() { break; }
                }
            }
        }
    }
    async fn apply_segment(&self, payload: WalSegmentPayload) -> Result<(), WalError> {
        let seqno = payload.id.seqno;
        let shard = payload.id.shard_id;
        // Idempotency check: skip segments already applied.
        let applied = self.replication_state
            .applied_seqno(shard)
            .unwrap_or(0);
        if seqno <= applied {
            tracing::trace!(seqno, applied, "SegmentReceiver: skipping duplicate segment");
            return Ok(());
        }
        // BLAKE3 checksum validation.
        let expected_checksum = blake3::hash(&payload.bytes);
        // (Extract checksum from BatchHeader and compare)
        // Decode and replay each event.
        let batches = decode_wal_segment(&payload.bytes)?;
        for batch in batches {
            for event in batch.events {
                self.signal_ledger.apply_wal_event(
                    event.entity_id,
                    &event.signal_type,
                    event.weight,
                    event.timestamp,
                )?;
            }
        }
        // Advance high-water-mark.
        self.replication_state.advance(shard, seqno);
        tracing::debug!(seqno, "SegmentReceiver: applied segment");
        Ok(())
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `SegmentReceiver::start()` spawns a background tokio task that reads from `transport.recv_segment()`
 - [ ] BLAKE3 checksum validation: corrupted segments return `WalError::Corruption` and are NOT applied
 - [ ] Idempotency: segments with `seqno <= replication_state.applied_seqno(shard)` are skipped (no double-counting)
 - [ ] All events in a received segment are replayed through `SignalLedger::apply_wal_event()`
 - [ ] `replication_state.advance(shard, seqno)` is called after successful replay
 - [ ] Transport close (`recv_segment` returns `None`) causes the receiver to stop gracefully
 - [ ] Integration test: ship 100 segments -> receiver applies all -> decay scores match
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-2/task-05-follower-db.md
+++ b/docs/planning/milestone-8/phase-2/task-05-follower-db.md
@ -0,0 +1,125 @@
 # Task 05: FollowerDb
 ## Delivers
 Wire `TidalDb` to support `NodeRole::Follower` startup in `tidal/src/db/open.rs`. Guard all write methods (`signal`, `write_item`, `write_creator`, etc.) to return `TidalError::ReadOnly` when role is `Follower`. Start `SegmentReceiver` on open for follower nodes.
 ## Complexity: M
 ## Dependencies
 - Task 04 (SegmentReceiver)
 - Phase 8.1 (NodeConfig, NodeRole)
 ## Technical Design
 ### Write guards in TidalDb
 ```rust
 // tidal/src/db/mod.rs
 impl TidalDb {
    /// Guard that returns ReadOnly if this node is a follower.
    fn require_writeable(&self) -> crate::Result<()> {
        if !self.config.cluster.accepts_writes() {
            return Err(TidalError::ReadOnly);
        }
        Ok(())
    }
    pub fn signal(
        &self,
        signal_type: &str,
        entity_id: EntityId,
        weight: f64,
        timestamp: Timestamp,
    ) -> crate::Result<()> {
        self.require_writeable()?;
        // ... existing implementation ...
    }
    pub fn write_item(
        &self,
        entity_id: EntityId,
        metadata: &HashMap<String, String>,
    ) -> crate::Result<()> {
        self.require_writeable()?;
        // ... existing implementation ...
    }
    // All other write methods follow the same pattern.
 }
 ```
 ### Follower startup in open.rs
 ```rust
 // tidal/src/db/open.rs
 pub fn open_db(config: Config) -> crate::Result<TidalDb> {
    // ... existing open logic ...
    let db = TidalDb { /* ... */ };
    if config.cluster.role == NodeRole::Follower {
        // Start segment receiver background task.
        // The transport is set by the caller via db.start_replication(transport).
        tracing::info!("TidalDb: starting as follower for shard {:?}", config.cluster.shard_id);
    }
    Ok(db)
 }
 ```
 ### TidalDb::start_replication
 ```rust
 impl TidalDb {
    /// Wire up replication transport for follower nodes.
    ///
    /// Must be called after open() for NodeRole::Follower nodes.
    /// No-op for NodeRole::Single or NodeRole::Leader.
    pub fn start_replication(
        &self,
        transport: Arc<dyn Transport>,
        leader_shard: ShardId,
        shutdown_rx: tokio::sync::watch::Receiver<bool>,
    ) {
        if self.config.cluster.role != NodeRole::Follower {
            return;
        }
        let receiver = Arc::new(SegmentReceiver::new(
            transport,
            Arc::clone(&self.signal_ledger),
            Arc::clone(&self.replication_state),
            leader_shard,
        ));
        receiver.start(shutdown_rx);
    }
 }
 ```
 ### TidalError::ReadOnly
 ```rust
 // tidal/src/error.rs (or wherever TidalError is defined)
 #[derive(Debug, thiserror::Error)]
 pub enum TidalError {
    // ... existing variants ...
    /// This node is a read-only follower; write operations are not permitted.
    #[error("this node is read-only (follower)")]
    ReadOnly,
 }
 ```
 ## Acceptance Criteria
 - [ ] `TidalError::ReadOnly` variant added to the error enum
 - [ ] All write methods (`signal`, `write_item`, `write_creator`, `write_item_embedding`, `write_creator_embedding`, `close_session`, etc.) return `Err(TidalError::ReadOnly)` when `role == Follower`
 - [ ] Read methods (`retrieve`, `search`, `read_decay_score`, etc.) work normally on followers
 - [ ] `TidalDb::start_replication(transport, leader_shard, shutdown_rx)` wires `SegmentReceiver` for follower nodes; is a no-op for `Single`/`Leader`
 - [ ] Integration test: open as Follower, verify all writes fail with ReadOnly; open as Leader, verify writes succeed
 - [ ] All existing tests pass (they use Single node, unaffected)
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-2/task-06-replication-lag-gauge.md
+++ b/docs/planning/milestone-8/phase-2/task-06-replication-lag-gauge.md
@ -0,0 +1,96 @@
 # Task 06: ReplicationLagGauge
 ## Delivers
 `ReplicationLagGauge` in `tidal/src/replication/lag.rs` tracking per-follower lag (leader_seqno - follower_applied_seqno). Exposed via `MetricsState` so existing Prometheus scraping picks it up automatically.
 ## Complexity: S
 ## Dependencies
 - Phase 8.1 (ReplicationState)
 - Task 03 (WalShipper -- for leader_seqno)
 ## Technical Design
 ```rust
 // tidal/src/replication/lag.rs
 /// Tracks per-follower replication lag.
 ///
 /// Lag = leader's latest shipped seqno - follower's applied seqno.
 /// A lag of 0 means the follower is fully caught up.
 #[derive(Debug, Default)]
 pub struct ReplicationLagGauge {
    /// Per-follower: last seqno the leader has shipped.
    leader_seqno: DashMap<ShardId, AtomicU64>,
    /// Per-follower: last seqno the follower has applied.
    follower_applied: Arc<ReplicationState>,
 }
 impl ReplicationLagGauge {
    pub fn new(replication_state: Arc<ReplicationState>) -> Self {
        Self {
            leader_seqno: DashMap::new(),
            follower_applied: replication_state,
        }
    }
    /// Update the leader's known shipped seqno for a follower.
    pub fn update_leader_seqno(&self, follower: ShardId, seqno: u64) {
        self.leader_seqno
            .entry(follower)
            .or_insert_with(|| AtomicU64::new(0))
            .store(seqno, Ordering::Release);
    }
    /// Get the current lag for a follower in seqno units.
    pub fn lag_seqno(&self, follower: ShardId) -> i64 {
        let leader = self.leader_seqno
            .get(&follower)
            .map(|a| a.load(Ordering::Acquire))
            .unwrap_or(0);
        let applied = self.follower_applied
            .applied_seqno(follower)
            .unwrap_or(0);
        leader as i64 - applied as i64
    }
    /// Collect Prometheus-style gauge values for all followers.
    pub fn collect_metrics(&self) -> Vec<(ShardId, i64)> {
        self.leader_seqno
            .iter()
            .map(|entry| {
                let follower = *entry.key();
                (follower, self.lag_seqno(follower))
            })
            .collect()
    }
 }
 ```
 ### MetricsState integration
 ```rust
 // tidal/src/db/metrics.rs (existing metrics module)
 impl MetricsState {
    // Add to existing collect() method:
    pub fn replication_lag_seqno(&self, follower_shard: u16) -> i64 {
        self.lag_gauge
            .as_ref()
            .map(|g| g.lag_seqno(ShardId(follower_shard)))
            .unwrap_or(0)
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `ReplicationLagGauge::lag_seqno(follower)` returns `leader_seqno - follower_applied_seqno`
 - [ ] `lag_seqno` returns 0 when follower is fully caught up
 - [ ] `lag_seqno` returns > 0 when follower is behind
 - [ ] `collect_metrics()` returns a snapshot of all follower lags
 - [ ] Integrated into `MetricsState` so existing `/metrics` endpoint exposes `replication_lag_seqno` gauge
 - [ ] Integration test: leader writes 100 segments; before follower applies them, lag = 100; after apply, lag = 0
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-2/task-07-replication-integration-tests.md
+++ b/docs/planning/milestone-8/phase-2/task-07-replication-integration-tests.md
@ -0,0 +1,103 @@
 # Task 07: Replication Integration Tests
 ## Delivers
 Integration tests in `tidal/tests/m8p2_replication.rs` covering the full replication stack: leader->follower segment delivery, decay score equivalence to 6 decimal places, follower read-only enforcement, lag convergence, and segment corruption rejection.
 ## Complexity: M
 ## Dependencies
 - Tasks 01-06 complete
 ## Technical Design
 ```rust
 // tidal/tests/m8p2_replication.rs
 use tidaldb::{TidalDb, TidalDbBuilder, NodeRole, ShardId, RegionId, NodeConfig};
 use tidaldb::replication::{InProcessTransportFactory, ReplicationLagGauge};
 fn leader_config(data_dir: &Path) -> Config {
    Config {
        cluster: NodeConfig {
            role: NodeRole::Leader,
            shard_id: ShardId(0),
            ..Default::default()
        },
        ..Config::with_data_dir(data_dir)
    }
 }
 fn follower_config(data_dir: &Path) -> Config {
    Config {
        cluster: NodeConfig {
            role: NodeRole::Follower,
            shard_id: ShardId(0),
            ..Default::default()
        },
        ..Config::with_data_dir(data_dir)
    }
 }
 #[tokio::test]
 async fn replication_decay_scores_match() {
    // Leader writes 1,000 signals.
    // Follower replays all segments.
    // Verify: read_decay_score on follower matches leader to 6 decimal places.
 }
 #[tokio::test]
 async fn follower_rejects_writes() {
    // Open follower. Attempt signal() write.
    // Verify: returns TidalError::ReadOnly.
 }
 #[tokio::test]
 async fn follower_serves_retrieve_queries() {
    // Leader writes items + signals.
    // Follower applies.
    // Follower.retrieve() returns ranked results.
 }
 #[tokio::test]
 async fn replication_lag_converges_to_zero() {
    // Leader writes 500 segments.
    // Wait for follower to apply all.
    // Assert: lag_seqno(follower) == 0 within 5 seconds.
 }
 #[tokio::test]
 async fn corrupted_segment_is_rejected() {
    // Manually corrupt BLAKE3 checksum in segment bytes.
    // Send to follower via transport.
    // Verify: segment is not applied (decay scores unchanged).
 }
 #[tokio::test]
 async fn leader_restart_follower_continues() {
    // Leader writes 100 signals.
    // Leader shuts down.
    // Follower serves read queries from replayed state.
    // Leader restarts; ships remaining segments.
    // Follower catches up.
 }
 #[tokio::test]
 async fn idempotent_segment_replay() {
    // Ship same segment twice to follower.
    // Verify: signal counts NOT doubled (seqno idempotency).
 }
 ```
 ## Acceptance Criteria
 - [ ] All 7 integration tests pass under `cargo test --test m8p2_replication`
 - [ ] Test `replication_decay_scores_match`: leader 1K signals -> follower matches to 6 decimal places
 - [ ] Test `follower_rejects_writes`: `TidalError::ReadOnly` on all write methods
 - [ ] Test `follower_serves_retrieve_queries`: follower returns correct ranked results
 - [ ] Test `replication_lag_converges_to_zero`: lag = 0 within 5 seconds of leader quiesce
 - [ ] Test `corrupted_segment_is_rejected`: corrupt checksums rejected, no state change
 - [ ] Test `leader_restart_follower_continues`: follower serves reads after leader crash
 - [ ] Test `idempotent_segment_replay`: no double-counting on duplicate segments
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-3/OVERVIEW.md
+++ b/docs/planning/milestone-8/phase-3/OVERVIEW.md
@ -0,0 +1,94 @@
 # m8p3: CRDT Counters and Deterministic Reconciliation
 ## Delivers
 Conflict-free replicated data types (CRDTs) for signal counters and hard
 negatives that enable deterministic reconciliation after network partitions.
 After this phase, two shards that process overlapping signal streams during a
 partition can merge their state without double-counting, without losing hard
 negatives, and without application intervention.
 This is the critical correctness layer that makes "heal the partition; verify
 deterministic reconciliation" possible in the UAT.
 Deliverables:
 - `PNCounter`: a positive-negative counter CRDT with per-node increments; merge = max per node per side
 - `LWWRegister<T>`: last-writer-wins register with HLC timestamps for hard negatives (hide/mute/block)
 - `CrdtSignalState`: wraps `HotSignalState` and `BucketedCounter` with CRDT merge semantics
 - `ReconciliationEngine`: given two `ReplicationState` snapshots, produces a merge plan; applies it idempotently
 - `HLC` (Hybrid Logical Clock): wall-clock + logical counter for causal ordering of hard-negative writes
 ## Dependencies
 - **Requires:** Phase 8.1 (ShardId, RegionId), Phase 8.2 (WAL shipping, segment replay)
 - **Files modified:**
  - `tidal/src/signals/hot.rs` -- `HotSignalState` gains `node_id` field; decay scores become per-node accumulators
  - `tidal/src/signals/warm.rs` -- `BucketedCounter` gains per-node bucket arrays for CRDT merge
  - `tidal/src/entities/hard_neg.rs` -- `HardNegEntry` gains HLC timestamp for LWW semantics
 - **Files created:**
  - `tidal/src/replication/crdt/mod.rs` -- module root
  - `tidal/src/replication/crdt/pn_counter.rs` -- `PNCounter`
  - `tidal/src/replication/crdt/lww_register.rs` -- `LWWRegister<T>`
  - `tidal/src/replication/crdt/hlc.rs` -- Hybrid Logical Clock
  - `tidal/src/replication/reconcile.rs` -- `ReconciliationEngine`
 ## Research References
 - `thoughts.md` -- Part V (StemeDB CRDT replication: G-Set for events, G-Counter for counts, LWW for state)
 ## Acceptance Criteria (Phase Level)
 - [ ] `PNCounter` supports `increment(node_id, amount)` and `decrement(node_id, amount)`; `merge(other)` takes per-node max for both P and N vectors; `value()` returns `P_total - N_total`
 - [ ] `PNCounter` merge is commutative, associative, and idempotent (property tests with 100K random operations across 5 nodes)
 - [ ] `LWWRegister<T>` resolves concurrent writes by HLC timestamp; ties broken by `node_id` (higher wins); `merge(other)` takes the register with the higher timestamp
 - [ ] `HLC::now()` returns `(wall_clock_ns, logical_counter)`; `HLC::update(received_hlc)` advances the clock; monotonically increasing within a node
 - [ ] `CrdtSignalState` wraps decay scores as per-node accumulators: `merge` of two states produces the same result regardless of merge order (commutative property test)
 - [ ] `BucketedCounter` CRDT merge: per-node bucket arrays merged by max; total count = sum across all nodes; no double-counting after merge (verification: sum of all increments across all nodes == merged counter value)
 - [ ] Hard negatives use `LWWRegister<HardNegAction>`: a `hide` at HLC T1 followed by an `unhide` at HLC T2 > T1 resolves to unhide; a concurrent `hide` and `unhide` at the same wall-clock resolves deterministically by node_id
 - [ ] `ReconciliationEngine::reconcile(local_state, remote_state) -> MergePlan`: produces a list of signal counter merges and hard-negative LWW resolutions; applying the plan is idempotent
 - [ ] After reconciliation, no signal count exceeds the true event count (no double-counting); verified by replaying all WAL events from both sides and comparing against merged state
 ## Task Execution Order
 ```
 Task 01: HLC ─────────────────────┐
                                    ├──> Task 04: CrdtSignalState
 Task 02: PNCounter ────────────────┤
                                    ├──> Task 05: ReconciliationEngine
 Task 03: LWWRegister ──────────────┘         │
                                              v
                                    Task 06: Reconciliation Property Tests
 ```
 Tasks 01, 02, 03 are fully parallelizable. Tasks 04 and 05 depend on all three. Task 06 depends on 05.
 ## Module Location
 | File | Status | Contains |
 |------|--------|----------|
 | `tidal/src/replication/crdt/mod.rs` | NEW | Module root |
 | `tidal/src/replication/crdt/pn_counter.rs` | NEW | `PNCounter` |
 | `tidal/src/replication/crdt/lww_register.rs` | NEW | `LWWRegister<T>` |
 | `tidal/src/replication/crdt/hlc.rs` | NEW | `HLC` (Hybrid Logical Clock) |
 | `tidal/src/replication/reconcile.rs` | NEW | `ReconciliationEngine`, `MergePlan` |
 | `tidal/src/signals/hot.rs` | MODIFIED | Per-node accumulator support |
 | `tidal/src/signals/warm.rs` | MODIFIED | Per-node bucket arrays |
 | `tidal/src/entities/hard_neg.rs` | MODIFIED | HLC timestamp on entries |
 ## Notes
 ### Per-node accumulators, not per-event dedup
 The naive approach of deduplicating every event by BLAKE3 hash across all nodes is O(events) in memory. Instead, we use PN-counters: each node tracks its own increment total, and merge takes per-node max. This is O(nodes) in memory, which is bounded and small.
 ### Decay score CRDT
 Exponential decay scores are not naturally CRDT-compatible because `S(t) = S(t_prev) * exp(-lambda * dt) + w` is order-dependent. The solution: each node maintains its own running decay score. On merge, per-node scores are summed (each represents that node's contribution). This is mathematically equivalent to summing all events from all nodes, because the running-score formula is a sum of weighted exponentials. Property tests verify this.
 ### HLC, not NTP
 Wall-clock skew between nodes can cause LWW to resolve incorrectly. The HLC (Kulkarni et al., 2014) adds a logical counter that advances on `send` and `max(local, remote)+1` on `receive`, guaranteeing causal ordering even with clock skew up to the HLC's tolerance (typically seconds).
 ## Done When
 Two `TidalDb` instances process overlapping signal streams and hard-negative writes during a simulated partition. After merge via `ReconciliationEngine`, the merged signal counts exactly equal the deduplicated union of all events, and hard negatives reflect the latest write by HLC timestamp. Property tests verify commutativity, associativity, and idempotency of all CRDT merge operations across 100K random operation sequences.
--- a/docs/planning/milestone-8/phase-3/task-01-hlc.md
+++ b/docs/planning/milestone-8/phase-3/task-01-hlc.md
@ -0,0 +1,126 @@
 # Task 01: Hybrid Logical Clock (HLC)
 ## Delivers
 `HLC` (Hybrid Logical Clock) in `tidal/src/replication/crdt/hlc.rs`. Provides `now()`, `update(remote)`, monotonic guarantee, and `PartialOrd`/`Ord` by `(wall_ns, logical, node_id)`. Used by `LWWRegister` for causal ordering of concurrent writes across nodes.
 ## Complexity: S
 ## Dependencies
 - Phase 8.1 (ShardId used as node_id)
 ## Technical Design
 HLC (Kulkarni et al., 2014) combines a wall clock with a logical counter:
 - On `send`: `pt = max(wall, clock.wall); l = if pt == clock.wall { clock.logical + 1 } else { 0 }; clock = (pt, l)`
 - On `receive(msg_hlc)`: `pt = max(wall, msg_hlc.wall, clock.wall); l = if pt == clock.wall && pt == msg_hlc.wall { max(clock.logical, msg_hlc.logical) + 1 } else if pt == clock.wall { clock.logical + 1 } else if pt == msg_hlc.wall { msg_hlc.logical + 1 } else { 0 }; clock = (pt, l)`
 ```rust
 // tidal/src/replication/crdt/hlc.rs
 /// Hybrid Logical Clock timestamp.
 ///
 /// Combines wall-clock time (ns) with a logical counter to provide
 /// causal ordering even with clock skew between nodes.
 ///
 /// Ordering: (wall_ns, logical, node_id) -- lexicographic.
 /// This means: same-wall-time events are ordered by logical counter;
 /// ties within one node (impossible) are broken by node_id.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct HlcTimestamp {
    pub wall_ns: u64,
    pub logical: u32,
    pub node_id: u16,  // ShardId::0 for single-node
 }
 impl PartialOrd for HlcTimestamp {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
 }
 impl Ord for HlcTimestamp {
    fn cmp(&self, other: &Self) -> Ordering {
        self.wall_ns.cmp(&other.wall_ns)
            .then(self.logical.cmp(&other.logical))
            .then(self.node_id.cmp(&other.node_id))
    }
 }
 /// A per-node HLC clock.
 pub struct Hlc {
    node_id: u16,
    wall_ns: AtomicU64,
    logical: AtomicU32,
 }
 impl Hlc {
    pub fn new(node_id: u16) -> Self {
        Self {
            node_id,
            wall_ns: AtomicU64::new(0),
            logical: AtomicU32::new(0),
        }
    }
    fn wall_now() -> u64 {
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_nanos() as u64
    }
    /// Generate a new HLC timestamp for a local event.
    pub fn now(&self) -> HlcTimestamp {
        let wall = Self::wall_now();
        // Atomic CAS loop to advance monotonically
        loop {
            let cur_wall = self.wall_ns.load(Ordering::Acquire);
            let cur_logical = self.logical.load(Ordering::Acquire);
            let (new_wall, new_logical) = if wall > cur_wall {
                (wall, 0u32)
            } else {
                (cur_wall, cur_logical + 1)
            };
            if self.wall_ns.compare_exchange(cur_wall, new_wall, Ordering::AcqRel, Ordering::Acquire).is_ok() {
                self.logical.store(new_logical, Ordering::Release);
                return HlcTimestamp { wall_ns: new_wall, logical: new_logical, node_id: self.node_id };
            }
        }
    }
    /// Update the clock on receiving a remote HLC timestamp.
    pub fn update(&self, remote: HlcTimestamp) -> HlcTimestamp {
        let wall = Self::wall_now();
        let pt = wall.max(remote.wall_ns);
        loop {
            let cur_wall = self.wall_ns.load(Ordering::Acquire);
            let cur_logical = self.logical.load(Ordering::Acquire);
            let pt = pt.max(cur_wall);
            let new_logical = if pt == cur_wall && pt == remote.wall_ns {
                cur_logical.max(remote.logical) + 1
            } else if pt == cur_wall {
                cur_logical + 1
            } else if pt == remote.wall_ns {
                remote.logical + 1
            } else {
                0
            };
            if self.wall_ns.compare_exchange(cur_wall, pt, Ordering::AcqRel, Ordering::Acquire).is_ok() {
                self.logical.store(new_logical, Ordering::Release);
                return HlcTimestamp { wall_ns: pt, logical: new_logical, node_id: self.node_id };
            }
        }
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `HlcTimestamp` ordering is `(wall_ns, logical, node_id)` lexicographic
 - [ ] `Hlc::now()` returns monotonically increasing timestamps within a single node (property test: 10K calls in sequence never decrease)
 - [ ] `Hlc::update(remote)` advances the clock if `remote.wall_ns` > current wall
 - [ ] `Hlc` is thread-safe (`Send + Sync`); concurrent `now()` calls from 4 threads produce unique timestamps
 - [ ] `HlcTimestamp` derives `Serialize, Deserialize`, `Copy`, `Clone`, `PartialEq`, `Eq`
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-3/task-02-pn-counter.md
+++ b/docs/planning/milestone-8/phase-3/task-02-pn-counter.md
@ -0,0 +1,90 @@
 # Task 02: PNCounter
 ## Delivers
 `PNCounter` in `tidal/src/replication/crdt/pn_counter.rs`. Per-node P and N vectors (backed by `HashMap<ShardId, u64>`). Supports `increment`, `decrement`, `merge`, `value`. Property tests verify commutativity, monotonicity, and associativity (CMA) across 100K random operations over 5 nodes.
 ## Complexity: M
 ## Dependencies
 - Phase 8.1 (ShardId)
 ## Technical Design
 ```rust
 // tidal/src/replication/crdt/pn_counter.rs
 /// Positive-Negative Counter CRDT.
 ///
 /// Each node (ShardId) maintains its own P (increment) and N (decrement)
 /// totals. The global value = sum(P) - sum(N). Merge takes the per-node
 /// max of each component -- safe because values only ever increase within
 /// a node.
 ///
 /// Properties:
 /// - Commutative: merge(A, B) == merge(B, A)
 /// - Associative: merge(A, merge(B, C)) == merge(merge(A, B), C)
 /// - Idempotent: merge(A, A) == A
 #[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct PNCounter {
    positive: HashMap<ShardId, u64>,
    negative: HashMap<ShardId, u64>,
 }
 impl PNCounter {
    pub fn new() -> Self {
        Self::default()
    }
    /// Increment by `amount` for this node.
    pub fn increment(&mut self, node: ShardId, amount: u64) {
        *self.positive.entry(node).or_default() += amount;
    }
    /// Decrement by `amount` for this node.
    pub fn decrement(&mut self, node: ShardId, amount: u64) {
        *self.negative.entry(node).or_default() += amount;
    }
    /// Merge another counter into this one.
    ///
    /// Takes the per-node maximum of both P and N components.
    /// Safe because each node's contribution only grows.
    pub fn merge(&mut self, other: &PNCounter) {
        for (&node, &val) in &other.positive {
            let entry = self.positive.entry(node).or_default();
            *entry = (*entry).max(val);
        }
        for (&node, &val) in &other.negative {
            let entry = self.negative.entry(node).or_default();
            *entry = (*entry).max(val);
        }
    }
    /// Returns the current value: sum(P) - sum(N).
    ///
    /// Saturates at 0 (never negative).
    pub fn value(&self) -> u64 {
        let p: u64 = self.positive.values().sum();
        let n: u64 = self.negative.values().sum();
        p.saturating_sub(n)
    }
    /// Total positive contributions across all nodes.
    pub fn total_positive(&self) -> u64 {
        self.positive.values().sum()
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `PNCounter::increment(node, amount)` increases the P component for `node`
 - [ ] `PNCounter::decrement(node, amount)` increases the N component for `node`
 - [ ] `PNCounter::value()` returns `sum(P) - sum(N)`, saturating at 0
 - [ ] `PNCounter::merge` is commutative: `merge(A, B) == merge(B, A)` (property test: 100K random sequences, 5 nodes)
 - [ ] `PNCounter::merge` is associative: `merge(A, merge(B, C)) == merge(merge(A, B), C)` (property test)
 - [ ] `PNCounter::merge` is idempotent: `merge(A, A) == A` (property test)
 - [ ] No double-counting: after merging two counters that each received N independent increments (no overlap), `value() == N * 2` (property test)
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-3/task-03-lww-register.md
+++ b/docs/planning/milestone-8/phase-3/task-03-lww-register.md
@ -0,0 +1,86 @@
 # Task 03: LWWRegister
 ## Delivers
 `LWWRegister<T>` in `tidal/src/replication/crdt/lww_register.rs`. HLC-timestamped value with `merge` taking the higher timestamp. Tie-breaking by `node_id`. Used for hard negatives (hide/mute/block) which require last-writer-wins semantics across regions.
 ## Complexity: S
 ## Dependencies
 - Task 01 (HlcTimestamp)
 ## Technical Design
 ```rust
 // tidal/src/replication/crdt/lww_register.rs
 /// Last-Writer-Wins register with HLC timestamp.
 ///
 /// Resolves concurrent writes by `HlcTimestamp` ordering:
 /// - Higher `wall_ns` wins
 /// - Same wall, higher `logical` wins
 /// - Same wall + logical, higher `node_id` wins (deterministic tie-break)
 ///
 /// The value `None` represents "not yet written."
 #[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct LWWRegister<T: Clone + PartialEq> {
    value: Option<T>,
    timestamp: Option<HlcTimestamp>,
 }
 impl<T: Clone + PartialEq> LWWRegister<T> {
    pub fn empty() -> Self {
        Self { value: None, timestamp: None }
    }
    /// Write a new value with the given HLC timestamp.
    ///
    /// Only advances the register if `ts > self.timestamp`.
    pub fn write(&mut self, value: T, ts: HlcTimestamp) {
        if self.timestamp.map_or(true, |cur| ts > cur) {
            self.value = Some(value);
            self.timestamp = Some(ts);
        }
    }
    /// Merge another register into this one.
    ///
    /// The register with the higher timestamp wins.
    pub fn merge(&mut self, other: &LWWRegister<T>) {
        if let Some(other_ts) = other.timestamp {
            if self.timestamp.map_or(true, |cur| other_ts > cur) {
                self.value = other.value.clone();
                self.timestamp = other.timestamp;
            }
        }
    }
    /// Current value of the register.
    pub fn get(&self) -> Option<&T> {
        self.value.as_ref()
    }
    /// The HLC timestamp of the last write.
    pub fn timestamp(&self) -> Option<HlcTimestamp> {
        self.timestamp
    }
 }
 impl<T: Clone + PartialEq> Default for LWWRegister<T> {
    fn default() -> Self {
        Self::empty()
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `LWWRegister::write(value, ts)` accepts writes with higher timestamps only
 - [ ] `LWWRegister::merge` takes the value with the higher HLC timestamp
 - [ ] Concurrent writes at the same wall time resolve by `logical` then `node_id`
 - [ ] `LWWRegister::merge` is commutative: `merge(A, B) == merge(B, A)` (property test)
 - [ ] `LWWRegister::merge` is associative and idempotent (property tests)
 - [ ] `T: Clone + PartialEq` bound is sufficient; no `Ord` required
 - [ ] Used for `HardNegAction` in Phase 8.4; `T` will be `HardNegAction` enum
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-3/task-04-crdt-signal-state.md
+++ b/docs/planning/milestone-8/phase-3/task-04-crdt-signal-state.md
@ -0,0 +1,111 @@
 # Task 04: CrdtSignalState
 ## Delivers
 `CrdtSignalState` wrapping `HotSignalState` and `BucketedCounter` with per-node CRDT semantics. Per-node decay accumulators that sum on merge. Per-node bucket arrays that max on merge. Merge produces correct decay scores regardless of order.
 ## Complexity: L
 ## Dependencies
 - Task 02 (PNCounter)
 - Phase 8.1 (ShardId as node identifier)
 ## Technical Design
 The key insight: exponential decay scores are sums of weighted exponentials.
 `S_total(t) = sum_i(w_i * exp(-lambda * (t - t_i)))`. Each node maintains its
 own running partial sum. On merge, partial sums add (each covers disjoint events
 since each node processes distinct WAL segments). This is mathematically exact.
 ```rust
 // tidal/src/replication/crdt/signal_state.rs
 /// CRDT-aware signal state for a single entity+signal_type pair.
 ///
 /// Extends the existing HotSignalState and BucketedCounter with per-node
 /// accounting that enables correct merge after partitioned writes.
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct CrdtSignalState {
    /// Per-node running decay score.
    ///
    /// Each node contributes its own partial decay sum.
    /// Global score = sum of all node contributions at query time.
    node_decay_scores: HashMap<ShardId, f64>,
    /// Timestamp of last event per node (for decay math on merge).
    node_last_update_ns: HashMap<ShardId, u64>,
    /// Per-node windowed counters.
    ///
    /// Each node tracks its own bucket increments.
    /// On merge, per-node buckets are merged by taking per-node max
    /// (idempotent since same-node events are identical across replicas).
    node_buckets: HashMap<ShardId, PNCounter>,
    /// Lambda (decay rate) -- identical across all nodes for this signal.
    lambda: f64,
 }
 impl CrdtSignalState {
    pub fn new(lambda: f64) -> Self {
        Self {
            node_decay_scores: HashMap::new(),
            node_last_update_ns: HashMap::new(),
            node_buckets: HashMap::new(),
            lambda,
        }
    }
    /// Record a new signal event from `node`.
    pub fn on_signal(&mut self, node: ShardId, weight: f64, now_ns: u64) {
        let entry = self.node_decay_scores.entry(node).or_default();
        let last = self.node_last_update_ns.entry(node).or_insert(now_ns);
        // Decay existing score, then add new event weight.
        let dt = (now_ns.saturating_sub(*last)) as f64 / 1e9;
        *entry = *entry * (-self.lambda * dt).exp() + weight;
        *last = now_ns;
    }
    /// Global decay score: sum of all per-node contributions at `now_ns`.
    pub fn decay_score(&self, now_ns: u64) -> f64 {
        self.node_decay_scores.iter()
            .zip(self.node_last_update_ns.values())
            .map(|((_, &score), &last)| {
                let dt = (now_ns.saturating_sub(last)) as f64 / 1e9;
                score * (-self.lambda * dt).exp()
            })
            .sum()
    }
    /// Merge another CrdtSignalState into this one.
    ///
    /// Per-node scores are summed (each node contributes distinct events).
    /// Per-node buckets are merged via PNCounter merge (per-node max).
    pub fn merge(&mut self, other: &CrdtSignalState) {
        for (&node, &other_score) in &other.node_decay_scores {
            *self.node_decay_scores.entry(node).or_default() += other_score;
        }
        for (&node, &other_ts) in &other.node_last_update_ns {
            let entry = self.node_last_update_ns.entry(node).or_default();
            *entry = (*entry).max(other_ts);
        }
        for (node, other_bucket) in &other.node_buckets {
            self.node_buckets
                .entry(*node)
                .or_default()
                .merge(other_bucket);
        }
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `CrdtSignalState::decay_score(now_ns)` returns sum of all per-node contributions decayed to `now_ns`
 - [ ] Two nodes process 500 events each (non-overlapping); after merge, `decay_score` == sum of both individual scores (property test: 1000 random event sequences)
 - [ ] `merge` is commutative and associative (property tests)
 - [ ] `merge` does not double-count: same-node events produce the same score regardless of how many times the node's state is merged (idempotent per node)
 - [ ] `BucketedCounter` equivalent: per-node bucket increments merged by PNCounter; total windowed count = sum of distinct events across all nodes; no double-counting
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-3/task-05-reconciliation-engine.md
+++ b/docs/planning/milestone-8/phase-3/task-05-reconciliation-engine.md
@ -0,0 +1,148 @@
 # Task 05: ReconciliationEngine
 ## Delivers
 `ReconciliationEngine` in `tidal/src/replication/reconcile.rs`. Takes two `ReplicationState` snapshots (from two shards that experienced a partition), produces a `MergePlan` (list of signal counter merges + LWW hard-negative resolutions), applies the plan idempotently.
 ## Complexity: L
 ## Dependencies
 - Task 04 (CrdtSignalState)
 - Task 03 (LWWRegister for hard negatives)
 ## Technical Design
 ```rust
 // tidal/src/replication/reconcile.rs
 /// A plan for merging diverged state from two shards.
 ///
 /// Produced by `ReconciliationEngine::plan()`, applied by `apply()`.
 /// The plan is deterministic and idempotent -- applying it twice is safe.
 #[derive(Debug, Clone)]
 pub struct MergePlan {
    /// Signal counter merges: (entity_id, signal_type_id) -> merged CrdtSignalState
    pub signal_merges: Vec<SignalMergeOp>,
    /// Hard-negative resolutions: (user_id, item_id) -> winning LWW value
    pub hardneg_resolutions: Vec<HardNegResolutionOp>,
 }
 #[derive(Debug, Clone)]
 pub struct SignalMergeOp {
    pub entity_id: EntityId,
    pub signal_type_id: SignalTypeId,
    pub merged_state: CrdtSignalState,
 }
 #[derive(Debug, Clone)]
 pub struct HardNegResolutionOp {
    pub user_id: EntityId,
    pub item_id: EntityId,
    /// The winning hard-negative action after LWW resolution.
    /// `None` means "remove the hard negative" (explicit unhide won).
    pub action: Option<HardNegAction>,
 }
 /// Produces and applies reconciliation plans for partitioned shards.
 pub struct ReconciliationEngine {
    signal_ledger: Arc<SignalLedger>,
    hard_neg_index: Arc<HardNegIndex>,
 }
 impl ReconciliationEngine {
    pub fn new(
        signal_ledger: Arc<SignalLedger>,
        hard_neg_index: Arc<HardNegIndex>,
    ) -> Self {
        Self { signal_ledger, hard_neg_index }
    }
    /// Produce a merge plan from two diverged state snapshots.
    ///
    /// The plan covers all entities/signals that differ between the two shards.
    /// Entities only on one shard are included unchanged (no data loss).
    pub fn plan(
        &self,
        local_snapshot: &StateSnapshot,
        remote_snapshot: &StateSnapshot,
    ) -> MergePlan {
        let mut signal_merges = Vec::new();
        let mut hardneg_resolutions = Vec::new();
        // Merge signal states: union of both snapshots, CRDT-merged per entity.
        let all_keys: HashSet<_> = local_snapshot.signal_keys()
            .chain(remote_snapshot.signal_keys())
            .collect();
        for key in all_keys {
            let local = local_snapshot.signal_state(key);
            let remote = remote_snapshot.signal_state(key);
            let mut merged = local.cloned().unwrap_or_else(|| CrdtSignalState::new(key.lambda));
            if let Some(r) = remote {
                merged.merge(r);
            }
            signal_merges.push(SignalMergeOp {
                entity_id: key.entity_id,
                signal_type_id: key.signal_type_id,
                merged_state: merged,
            });
        }
        // Resolve hard negatives: LWW by HLC timestamp.
        let all_neg_keys: HashSet<_> = local_snapshot.hardneg_keys()
            .chain(remote_snapshot.hardneg_keys())
            .collect();
        for key in all_neg_keys {
            let local = local_snapshot.hardneg_register(key);
            let remote = remote_snapshot.hardneg_register(key);
            let mut reg = local.cloned().unwrap_or_default();
            if let Some(r) = remote {
                reg.merge(r);
            }
            hardneg_resolutions.push(HardNegResolutionOp {
                user_id: key.user_id,
                item_id: key.item_id,
                action: reg.get().cloned(),
            });
        }
        MergePlan { signal_merges, hardneg_resolutions }
    }
    /// Apply a merge plan to the local state.
    ///
    /// Idempotent: applying the same plan twice produces identical state.
    pub fn apply(&self, plan: &MergePlan) -> crate::Result<()> {
        for op in &plan.signal_merges {
            self.signal_ledger.apply_crdt_state(
                op.entity_id,
                op.signal_type_id,
                &op.merged_state,
            )?;
        }
        for op in &plan.hardneg_resolutions {
            match &op.action {
                Some(action) => {
                    self.hard_neg_index.apply_action(op.user_id, op.item_id, action.clone())?;
                }
                None => {
                    self.hard_neg_index.remove(op.user_id, op.item_id)?;
                }
            }
        }
        Ok(())
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `ReconciliationEngine::plan(local, remote)` covers all entities/signals from both snapshots
 - [ ] Signal merge: no double-counting (property test: sum of events from both sides == merged value)
 - [ ] Hard-negative merge: LWW with HLC timestamp; hides never leak during merge (test: concurrent hide + unhide resolves to hide when hide has higher HLC)
 - [ ] `MergePlan` is serializable (for audit logging)
 - [ ] `apply(plan)` is idempotent: applying the same plan twice produces identical state
 - [ ] `tidalctl reconcile --since <ts>` tool uses this engine (wired in Phase 8.6 UAT; stub here)
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-3/task-06-reconciliation-property-tests.md
+++ b/docs/planning/milestone-8/phase-3/task-06-reconciliation-property-tests.md
@ -0,0 +1,124 @@
 # Task 06: Reconciliation Property Tests
 ## Delivers
 Property tests in `tidal/tests/m8p3_crdt.rs` verifying: no double-counting after merge, hard negatives never leak, merge is commutative/associative/idempotent across 5 simulated nodes and 100K random operations.
 ## Complexity: M
 ## Dependencies
 - Tasks 01-05 complete
 ## Technical Design
 ```rust
 // tidal/tests/m8p3_crdt.rs
 use proptest::prelude::*;
 use tidaldb::replication::crdt::{PNCounter, LWWRegister, HlcTimestamp};
 proptest! {
    /// PNCounter merge commutativity.
    #[test]
    fn pn_counter_commutative(
        ops_a in vec((0u16..5, 0u64..1000, bool::arbitrary()), 0..100),
        ops_b in vec((0u16..5, 0u64..1000, bool::arbitrary()), 0..100),
    ) {
        let mut a = PNCounter::new();
        let mut b = PNCounter::new();
        apply_ops(&mut a, &ops_a);
        apply_ops(&mut b, &ops_b);
        let mut merge_ab = a.clone(); merge_ab.merge(&b);
        let mut merge_ba = b.clone(); merge_ba.merge(&a);
        prop_assert_eq!(merge_ab.value(), merge_ba.value());
    }
    /// PNCounter merge idempotency.
    #[test]
    fn pn_counter_idempotent(
        ops in vec((0u16..5, 0u64..1000, bool::arbitrary()), 0..100),
    ) {
        let mut counter = PNCounter::new();
        apply_ops(&mut counter, &ops);
        let original_value = counter.value();
        counter.merge(&counter.clone());
        prop_assert_eq!(counter.value(), original_value);
    }
    /// No double-counting: two nodes with disjoint operations.
    #[test]
    fn pn_counter_no_double_count(
        ops_a in vec((0u64..1000u64), 0..50),
        ops_b in vec((0u64..1000u64), 0..50),
    ) {
        let mut a = PNCounter::new();
        let mut b = PNCounter::new();
        let node_a = ShardId(0);
        let node_b = ShardId(1);
        let expected: u64 = ops_a.iter().sum::<u64>() + ops_b.iter().sum::<u64>();
        for &v in &ops_a { a.increment(node_a, v); }
        for &v in &ops_b { b.increment(node_b, v); }
        a.merge(&b);
        prop_assert_eq!(a.value(), expected);
    }
    /// LWW register commutativity.
    #[test]
    fn lww_register_commutative(
        val_a in 0u8..=1u8,
        wall_a in 0u64..1000,
        logical_a in 0u32..100,
        node_a in 0u16..5,
        val_b in 0u8..=1u8,
        wall_b in 0u64..1000,
        logical_b in 0u32..100,
        node_b in 0u16..5,
    ) {
        let ts_a = HlcTimestamp { wall_ns: wall_a, logical: logical_a, node_id: node_a };
        let ts_b = HlcTimestamp { wall_ns: wall_b, logical: logical_b, node_id: node_b };
        let mut reg_a: LWWRegister<u8> = LWWRegister::empty();
        let mut reg_b: LWWRegister<u8> = LWWRegister::empty();
        reg_a.write(val_a, ts_a);
        reg_b.write(val_b, ts_b);
        let mut merge_ab = reg_a.clone(); merge_ab.merge(&reg_b);
        let mut merge_ba = reg_b.clone(); merge_ba.merge(&reg_a);
        prop_assert_eq!(merge_ab.get(), merge_ba.get());
    }
    /// Hard negatives never leak: hide always wins over unhide when hide has higher HLC.
    #[test]
    fn hard_neg_hide_wins_with_higher_hlc(
        hide_wall in 100u64..1000,
        unhide_wall in 0u64..100,
    ) {
        let ts_hide = HlcTimestamp { wall_ns: hide_wall, logical: 0, node_id: 0 };
        let ts_unhide = HlcTimestamp { wall_ns: unhide_wall, logical: 0, node_id: 1 };
        let mut reg: LWWRegister<HardNegAction> = LWWRegister::empty();
        reg.write(HardNegAction::Hide, ts_hide);
        let mut remote: LWWRegister<HardNegAction> = LWWRegister::empty();
        remote.write(HardNegAction::Unhide, ts_unhide);
        reg.merge(&remote);
        prop_assert_eq!(reg.get(), Some(&HardNegAction::Hide));
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `pn_counter_commutative`: 10K proptest cases pass
 - [ ] `pn_counter_idempotent`: 10K proptest cases pass
 - [ ] `pn_counter_no_double_count`: 10K proptest cases pass (sum of distinct increments == merged value)
 - [ ] `lww_register_commutative`: 10K proptest cases pass
 - [ ] `hard_neg_hide_wins_with_higher_hlc`: 10K proptest cases pass (hide with higher HLC always wins)
 - [ ] Integration test: two `TidalDb` instances process 500 overlapping signals during simulated partition; after `ReconciliationEngine::plan()` + `apply()`, decay scores match ground truth (single-node replay of all events) to 6 decimal places
 - [ ] `cargo test --test m8p3_crdt` passes in < 30 seconds
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-4/OVERVIEW.md
+++ b/docs/planning/milestone-8/phase-4/OVERVIEW.md
@ -0,0 +1,86 @@
 # m8p4: Session Continuity and Agent Memory Across Regions
 ## Delivers
 Session writes carry monotonic sequence numbers and idempotency keys, enabling
 agents to roam between regions without losing session state or violating memory
 guarantees. Hard negatives are monotonic: once hidden, an item never appears
 to the user even while replicas are converging. Cross-region session visibility
 is achieved within the replication lag window (< 2 seconds).
 Deliverables:
 - `SessionSeqNo(u64)`: monotonic sequence number per session write, included in WAL event
 - `IdempotencyKey(u128)`: BLAKE3-derived key per session operation for exactly-once semantics
 - `SessionReplicationBridge`: replicates session journal entries via the `Transport` trait alongside WAL segments
 - Cross-region agent memory: a session started in us-east is readable in eu-west after replication lag
 - Hard-negative monotonicity: during convergence, the union of all hard negatives is applied (never the intersection)
 ## Dependencies
 - **Requires:** Phase 8.2 (WAL shipping, SegmentReceiver), Phase 8.3 (LWWRegister for hard negatives, HLC)
 - **Files modified:**
  - `tidal/src/wal/format/session.rs` -- add `session_seqno` and `idempotency_key` fields to `SessionWalEvent`
  - `tidal/src/session/state.rs` -- track per-session high-water-mark seqno
  - `tidal/src/entities/hard_neg.rs` -- union-based merge during convergence (never remove a hard negative during replication)
  - `tidal/src/wal/session_journal.rs` -- include session events in replication payload
 - **Files created:**
  - `tidal/src/replication/session_bridge.rs` -- `SessionReplicationBridge`
  - `tidal/src/replication/idempotency.rs` -- `IdempotencyKey`, `IdempotencyStore` (bounded LRU)
 ## Research References
 - `VISION.md` -- Sessions / Agent Context section: "Sessions can be forked, merged, and policy-limited so an agent only sees what it is allowed to remember"
 ## Acceptance Criteria (Phase Level)
 - [ ] `SessionSeqNo` is a monotonically increasing u64 per session; writes with seqno <= high-water-mark on the receiver are idempotent no-ops
 - [ ] `IdempotencyKey` is derived from `BLAKE3(session_id || seqno || operation_bytes)`; stored in a bounded LRU of 100K entries per node
 - [ ] Duplicate session writes (same idempotency key) across regions produce exactly one state change
 - [ ] Session started in region A is visible (session metadata + preference hints + annotations) in region B within 2 seconds (in-process transport)
 - [ ] Hard negatives are replicated with union semantics: if shard A has `hide(user, item)` and shard B does not, after replication both shards have the hide; during convergence the stricter (hide) always wins
 - [ ] Agent roaming test: create session in us-east, write 5 preference signals; switch to eu-west follower; read session signals within 2 seconds; all 5 signals visible
 - [ ] No phantom un-hides: once a hard negative is applied, it is never removed by replication (only by explicit user action with a higher HLC timestamp)
 ## Task Execution Order
 ```
 Task 01: SessionSeqNo + WAL Format ──────┐
                                          ├──> Task 03: SessionReplicationBridge
 Task 02: IdempotencyKey + Store ──────────┘         │
                                                     v
                                          Task 04: HardNeg Monotonicity
                                                     │
                                                     v
                                          Task 05: Cross-Region Session Tests
 ```
 Tasks 01 and 02 are parallelizable. Task 03 depends on both. Task 04 depends on 03. Task 05 depends on all.
 ## Module Location
 | File | Status | Contains |
 |------|--------|----------|
 | `tidal/src/replication/session_bridge.rs` | NEW | `SessionReplicationBridge` |
 | `tidal/src/replication/idempotency.rs` | NEW | `IdempotencyKey`, `IdempotencyStore` |
 | `tidal/src/wal/format/session.rs` | MODIFIED | `session_seqno`, `idempotency_key` fields |
 | `tidal/src/session/state.rs` | MODIFIED | Per-session high-water-mark seqno |
 | `tidal/src/entities/hard_neg.rs` | MODIFIED | Union-based merge, LWW with HLC |
 | `tidal/src/wal/session_journal.rs` | MODIFIED | Session events in replication payload |
 ## Notes
 ### Union, not LWW, for hard negatives during convergence
 The safety property is: a hidden item must never appear to the user, even while replicas are still converging. This means during the convergence window, we take the union of all hard negatives from all shards. Only after full convergence can LWW semantics resolve explicit unhide operations.
 ### Bounded idempotency store
 The LRU holds 100K entries (~1.6 MB). This means idempotency is guaranteed for the last 100K operations per session. Older operations that are replayed are handled by the seqno high-water-mark check (which is unbounded and monotonic).
 ### Session replication piggybacks on WAL shipping
 Session journal entries are bundled into a separate channel on the same `Transport`, not mixed into the signal WAL segments. This keeps the signal WAL path fast and the session path independently tunable.
 ## Done When
 An agent creates a session in one region, writes preference signals and hard negatives, then the session is readable from a follower in another region within 2 seconds. Duplicate operations across regions produce no double-counting. Items hidden in one region are never visible in another region during convergence.
--- a/docs/planning/milestone-8/phase-4/task-01-session-seqno.md
+++ b/docs/planning/milestone-8/phase-4/task-01-session-seqno.md
@ -0,0 +1,139 @@
 # Task 01: SessionSeqNo + WAL Format Extension
 ## Delivers
 `SessionSeqNo(u64)` type added to `tidal/src/wal/format/session.rs` and `tidal/src/session/state.rs`. Every session write operation carries a monotonically incrementing sequence number. The receiver's high-water-mark (HWM) rejects writes with `seqno <= hwm` as idempotent no-ops.
 ## Complexity: S
 ## Dependencies
 - Phase 8.2 (WAL shipping, SegmentReceiver)
 ## Technical Design
 ```rust
 // tidal/src/wal/format/session.rs
 /// Monotonic sequence number for session writes.
 ///
 /// Incremented once per session write operation (preference signal,
 /// annotation, search query, interaction). Used by the receiver to
 /// enforce idempotent replay and exactly-once semantics.
 #[derive(
    Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash,
    serde::Serialize, serde::Deserialize,
 )]
 pub struct SessionSeqNo(pub u64);
 impl SessionSeqNo {
    pub const ZERO: Self = Self(0);
    pub fn next(self) -> Self {
        Self(self.0 + 1)
    }
 }
 impl std::fmt::Display for SessionSeqNo {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "ssn:{}", self.0)
    }
 }
 /// Extended session WAL event -- backward-compatible with existing format.
 ///
 /// The `session_seqno` and `idempotency_key` fields are appended to the
 /// existing `SessionWalEvent` bytes. Old readers that don't understand
 /// the extension fields still decode the core event; they will silently
 /// ignore the extra bytes (length-prefixed framing ensures this).
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct SessionWalEvent {
    // --- existing fields (unchanged) ---
    pub session_id: SessionId,
    pub kind: SessionEventKind,
    pub timestamp_ns: u64,
    pub payload: SessionEventPayload,
    // --- new fields (m8p4 extension) ---
    /// Monotonically increasing sequence number for this session's writes.
    /// Starts at 1 for the first write in a session.
    #[serde(default)]
    pub session_seqno: Option<SessionSeqNo>,
    /// BLAKE3-derived idempotency key for exactly-once delivery.
    /// `None` for events written before m8p4.
    #[serde(default)]
    pub idempotency_key: Option<u128>,
 }
 ```
 ```rust
 // tidal/src/session/state.rs  (additions only)
 /// Per-session monotonic write counter.
 ///
 /// Tracks the highest seqno applied locally. Writes with seqno <= hwm
 /// are silently dropped (idempotent replay is safe; the state is already
 /// reflected in local storage).
 #[derive(Debug, Default)]
 pub struct SessionSeqNoTracker {
    /// Map from SessionId to highest applied SessionSeqNo.
    hwm: DashMap<SessionId, SessionSeqNo>,
 }
 impl SessionSeqNoTracker {
    pub fn new() -> Self {
        Self { hwm: DashMap::new() }
    }
    /// Returns `true` if this write should be applied (seqno > hwm).
    /// Returns `false` if the write is a duplicate and should be skipped.
    /// Updates the HWM on accept.
    pub fn should_apply(&self, session_id: SessionId, seqno: SessionSeqNo) -> bool {
        let mut entry = self.hwm.entry(session_id).or_insert(SessionSeqNo::ZERO);
        if seqno > *entry {
            *entry = seqno;
            true
        } else {
            false
        }
    }
    /// Current HWM for a session (returns ZERO if unknown).
    pub fn hwm(&self, session_id: SessionId) -> SessionSeqNo {
        self.hwm.get(&session_id)
            .map(|v| *v)
            .unwrap_or(SessionSeqNo::ZERO)
    }
    /// Initialize or reset HWM for a session (used on follower startup).
    pub fn set_hwm(&self, session_id: SessionId, seqno: SessionSeqNo) {
        self.hwm.insert(session_id, seqno);
    }
 }
 ```
 ### Sequence Number Assignment
 ```rust
 // In session write path (tidal/src/session/mod.rs)
 impl SessionManager {
    fn next_seqno(&self, session_id: SessionId) -> SessionSeqNo {
        // Fetch-and-increment per session.
        let mut counter = self.seqno_counters
            .entry(session_id)
            .or_insert(SessionSeqNo::ZERO);
        *counter = counter.next();
        *counter
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `SessionSeqNo` is `Copy + Clone + Ord + Hash + Serialize + Deserialize`
 - [ ] `SessionSeqNoTracker::should_apply(id, seqno)` returns `true` for the first call with a given seqno, `false` on duplicate, and `true` again for a higher seqno
 - [ ] HWM persists in memory; on follower node restart, WAL replay re-establishes HWM by scanning all `SessionWalEvent` entries in order
 - [ ] `SessionWalEvent` with `session_seqno: None` (pre-m8p4 events) is decoded without error; `should_apply` returns `true` for all legacy events
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-4/task-02-idempotency-key.md
+++ b/docs/planning/milestone-8/phase-4/task-02-idempotency-key.md
@ -0,0 +1,151 @@
 # Task 02: IdempotencyKey + IdempotencyStore
 ## Delivers
 `IdempotencyKey(u128)` BLAKE3-derived key per session operation, and `IdempotencyStore` (bounded LRU, 100K capacity) in `tidal/src/replication/idempotency.rs`. Duplicate session writes arriving via replication are detected in O(1) time and silently discarded.
 ## Complexity: S
 ## Dependencies
 - Task 01 (SessionSeqNo)
 ## Technical Design
 ```rust
 // tidal/src/replication/idempotency.rs
 use blake3::Hasher;
 /// Per-operation idempotency key derived from session context.
 ///
 /// Derived as: BLAKE3(session_id_bytes || seqno_bytes || operation_bytes)
 ///
 /// Using u128 (128 bits) gives 2^64 expected collisions at 2^64 operations,
 /// which is astronomically unlikely in practice. Cheaper than storing the
 /// full BLAKE3 hash (32 bytes) with no practical security difference for
 /// our use case.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub struct IdempotencyKey(pub u128);
 impl IdempotencyKey {
    /// Derive an idempotency key for a session operation.
    ///
    /// - `session_id`: the session this operation belongs to
    /// - `seqno`: monotonic sequence number (see `SessionSeqNo`)
    /// - `operation_bytes`: serialized operation payload (canonically ordered)
    pub fn derive(
        session_id: SessionId,
        seqno: SessionSeqNo,
        operation_bytes: &[u8],
    ) -> Self {
        let mut hasher = Hasher::new();
        hasher.update(&session_id.as_bytes());
        hasher.update(&seqno.0.to_le_bytes());
        hasher.update(operation_bytes);
        let hash = hasher.finalize();
        // Take first 16 bytes as u128 (little-endian).
        let bytes: [u8; 16] = hash.as_bytes()[..16].try_into().unwrap();
        Self(u128::from_le_bytes(bytes))
    }
 }
 /// Bounded LRU store for idempotency keys.
 ///
 /// Capacity: 100K entries ≈ 1.6 MB (u128 key + u8 metadata).
 /// When capacity is reached, the least-recently-seen key is evicted.
 /// This means idempotency is guaranteed for the last 100K distinct operations.
 ///
 /// Older operations fall back to the SessionSeqNo HWM check, which is
 /// unbounded and always monotonic (a write with seqno <= hwm is never re-applied).
 ///
 /// Thread-safe: uses a `Mutex<LruCache>`.
 pub struct IdempotencyStore {
    cache: Mutex<LruCache<IdempotencyKey, ()>>,
    capacity: usize,
 }
 impl IdempotencyStore {
    /// Create a new store with the given capacity.
    pub fn new(capacity: usize) -> Self {
        Self {
            cache: Mutex::new(LruCache::new(
                NonZeroUsize::new(capacity).expect("capacity must be > 0"),
            )),
            capacity,
        }
    }
    /// Create a store with the default capacity (100K).
    pub fn default_capacity() -> Self {
        Self::new(100_000)
    }
    /// Check if a key has been seen before and record it if not.
    ///
    /// Returns `true` if the key is new (should apply the operation).
    /// Returns `false` if the key was already seen (duplicate; skip).
    pub fn check_and_record(&self, key: IdempotencyKey) -> bool {
        let mut cache = self.cache.lock().unwrap();
        if cache.contains(&key) {
            false
        } else {
            cache.put(key, ());
            true
        }
    }
    /// Current number of tracked keys.
    pub fn len(&self) -> usize {
        self.cache.lock().unwrap().len()
    }
    /// Returns the configured capacity.
    pub fn capacity(&self) -> usize {
        self.capacity
    }
 }
 ```
 ### Integration in SegmentReceiver
 ```rust
 // In tidal/src/replication/receive.rs (additions)
 impl SegmentReceiver {
    fn apply_session_event(
        &self,
        event: &SessionWalEvent,
        idempotency_store: &IdempotencyStore,
        seqno_tracker: &SessionSeqNoTracker,
    ) -> Result<()> {
        // Layer 1: SeqNo HWM check (fast, unbounded).
        if let Some(seqno) = event.session_seqno {
            if !seqno_tracker.should_apply(event.session_id, seqno) {
                return Ok(()); // duplicate — skip
            }
        }
        // Layer 2: Idempotency key check (bounded LRU, catches within-window dupes).
        if let Some(key_int) = event.idempotency_key {
            let key = IdempotencyKey(key_int);
            if !idempotency_store.check_and_record(key) {
                return Ok(()); // duplicate — skip
            }
        }
        // Apply the event.
        self.session_manager.apply_wal_event(event)
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `IdempotencyKey::derive(session_id, seqno, bytes)` produces a deterministic `u128` for the same inputs
 - [ ] Different inputs produce different keys with overwhelming probability (no test for this -- mathematical guarantee from BLAKE3)
 - [ ] `IdempotencyStore::check_and_record(key)` returns `true` on first call, `false` on any subsequent call with the same key
 - [ ] LRU eviction: when store exceeds `capacity` distinct keys, oldest entries are evicted; evicted keys return `true` on re-insert (they look new again; fallback to SeqNo HWM handles correctness)
 - [ ] `IdempotencyStore::len()` returns 0 after initialization and grows up to `capacity`
 - [ ] Memory bound: 100K-entry store consumes < 10 MB (verify with `std::mem::size_of`)
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-4/task-03-session-replication-bridge.md
+++ b/docs/planning/milestone-8/phase-4/task-03-session-replication-bridge.md
@ -0,0 +1,191 @@
 # Task 03: SessionReplicationBridge
 ## Delivers
 `SessionReplicationBridge` in `tidal/src/replication/session_bridge.rs`. Bundles session journal entries alongside WAL segments for transport to follower nodes. Session events are transmitted on a separate channel from signal WAL segments, keeping the signal-critical path unaffected by session I/O.
 ## Complexity: M
 ## Dependencies
 - Task 01 (SessionSeqNo + WAL format extension)
 - Task 02 (IdempotencyKey + IdempotencyStore)
 - Phase 8.2 (Transport trait, WalShipper)
 ## Technical Design
 ```rust
 // tidal/src/replication/session_bridge.rs
 /// Replicates session journal entries to follower nodes.
 ///
 /// Session events piggyback on the same `Transport` as WAL segments but
 /// use a dedicated `SessionPayload` envelope, not the signal WAL format.
 /// This separation lets us tune session replication (e.g., smaller MTU,
 /// higher frequency) independently of signal WAL shipping.
 pub struct SessionReplicationBridge {
    transport: Arc<dyn Transport>,
    session_journal: Arc<SessionJournal>,
    idempotency_store: Arc<IdempotencyStore>,
    seqno_tracker: Arc<SessionSeqNoTracker>,
    /// Highest seqno shipped per (session_id, region_id) pair.
    ship_hwm: DashMap<(SessionId, RegionId), SessionSeqNo>,
 }
 /// Envelope for session events shipped over the Transport.
 ///
 /// Distinct from `WalSegmentPayload` -- the transport multiplexes these
 /// by payload kind byte (0x01 = WAL segment, 0x02 = session batch).
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct SessionPayload {
    pub kind: PayloadKind,
    pub source_region: RegionId,
    pub source_shard: ShardId,
    pub events: Vec<SessionWalEvent>,
    /// BLAKE3 checksum of serialized `events` bytes.
    pub checksum: [u8; 32],
 }
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[repr(u8)]
 pub enum PayloadKind {
    WalSegment = 0x01,
    SessionBatch = 0x02,
 }
 impl SessionReplicationBridge {
    pub fn new(
        transport: Arc<dyn Transport>,
        session_journal: Arc<SessionJournal>,
        idempotency_store: Arc<IdempotencyStore>,
        seqno_tracker: Arc<SessionSeqNoTracker>,
    ) -> Self {
        Self {
            transport,
            session_journal,
            idempotency_store,
            seqno_tracker,
            ship_hwm: DashMap::new(),
        }
    }
    /// Ship all un-shipped session events for `session_id` to `target_region`.
    ///
    /// Fetches events from the session journal whose seqno > current ship HWM.
    /// Bundles them into a `SessionPayload`, ships via `Transport::send_session_batch`.
    /// Updates ship HWM on success.
    pub async fn ship_session(
        &self,
        session_id: SessionId,
        target_region: RegionId,
    ) -> Result<(), TransportError> {
        let hwm_key = (session_id, target_region);
        let current_hwm = self.ship_hwm
            .get(&hwm_key)
            .map(|v| *v)
            .unwrap_or(SessionSeqNo::ZERO);
        let events = self.session_journal.events_after(session_id, current_hwm)?;
        if events.is_empty() {
            return Ok(());
        }
        let highest_seqno = events.iter()
            .filter_map(|e| e.session_seqno)
            .max()
            .unwrap_or(SessionSeqNo::ZERO);
        let payload = self.build_payload(events)?;
        self.transport.send_session_batch(target_region, payload).await?;
        self.ship_hwm.insert(hwm_key, highest_seqno);
        Ok(())
    }
    /// Receive and apply an incoming `SessionPayload` from a remote region.
    ///
    /// Validates checksum, then applies each event through the idempotency
    /// store + seqno tracker pipeline before forwarding to the session manager.
    pub async fn receive_session_batch(
        &self,
        payload: SessionPayload,
        session_manager: &SessionManager,
    ) -> Result<usize> {
        // Validate BLAKE3 checksum.
        let serialized = bincode::serialize(&payload.events)?;
        let expected = blake3::hash(&serialized);
        if expected.as_bytes() != &payload.checksum {
            return Err(TidalError::CorruptedWal("session batch checksum mismatch".into()));
        }
        let mut applied = 0;
        for event in &payload.events {
            // Layer 1: SeqNo HWM.
            if let Some(seqno) = event.session_seqno {
                if !self.seqno_tracker.should_apply(event.session_id, seqno) {
                    continue;
                }
            }
            // Layer 2: Idempotency key.
            if let Some(key_int) = event.idempotency_key {
                let key = IdempotencyKey(key_int);
                if !self.idempotency_store.check_and_record(key) {
                    continue;
                }
            }
            session_manager.apply_wal_event(event)?;
            applied += 1;
        }
        Ok(applied)
    }
    fn build_payload(&self, events: Vec<SessionWalEvent>) -> Result<SessionPayload> {
        let serialized = bincode::serialize(&events)?;
        let checksum = *blake3::hash(&serialized).as_bytes();
        Ok(SessionPayload {
            kind: PayloadKind::SessionBatch,
            source_region: self.session_journal.region_id(),
            source_shard: ShardId(0), // session journal is not sharded by entity
            events,
            checksum,
        })
    }
 }
 ```
 ### Transport Extension
 ```rust
 // tidal/src/replication/transport.rs (extension to Transport trait)
 #[async_trait::async_trait]
 pub trait Transport: Send + Sync + 'static {
    // --- existing methods (unchanged) ---
    async fn send_segment(
        &self,
        target: RegionId,
        payload: WalSegmentPayload,
    ) -> Result<(), TransportError>;
    async fn recv_segment(&self) -> Result<WalSegmentPayload, TransportError>;
    // --- new session methods ---
    async fn send_session_batch(
        &self,
        target: RegionId,
        payload: SessionPayload,
    ) -> Result<(), TransportError>;
    async fn recv_session_batch(&self) -> Result<SessionPayload, TransportError>;
 }
 ```
 ## Acceptance Criteria
 - [ ] `SessionReplicationBridge::ship_session(session_id, target)` fetches only events with seqno > current ship HWM; does nothing on empty diff
 - [ ] `receive_session_batch` validates the BLAKE3 checksum; returns `TidalError::CorruptedWal` on mismatch
 - [ ] Duplicate events (same idempotency key or same seqno <= HWM) are silently dropped; applied count reflects only new events
 - [ ] `PayloadKind::SessionBatch` (0x02) is distinct from `PayloadKind::WalSegment` (0x01); transport multiplexes by kind byte
 - [ ] `Transport` trait extended with `send_session_batch` / `recv_session_batch`; `InProcessTransport` implements both new methods
 - [ ] Unit test: ship 10 session events, receive on follower, verify 10 applied; re-ship same events, verify 0 applied (idempotent)
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-4/task-04-hardneg-monotonicity.md
+++ b/docs/planning/milestone-8/phase-4/task-04-hardneg-monotonicity.md
@ -0,0 +1,168 @@
 # Task 04: Hard-Negative Monotonicity During Convergence
 ## Delivers
 Modified `HardNegIndex` merge behavior in `tidal/src/entities/hard_neg.rs` to enforce union semantics during convergence: a hide from any shard always wins during replication, even if a remote shard has a later `Unhide` operation. Explicit unhide operations are only honored once they arrive with an HLC timestamp strictly higher than the hide timestamp (via the existing `LWWRegister<HardNegAction>`).
 ## Complexity: M
 ## Dependencies
 - Task 03 (SessionReplicationBridge -- brings hard negatives into replication flow)
 - Phase 8.3, Task 03 (LWWRegister<HardNegAction>)
 ## Technical Design
 ### The Problem
 During a network partition:
 - Shard A: user hides item X at HLC(t=100)
 - Shard B: user un-hides item X at HLC(t=50) (old operation, pre-partition)
 When the partition heals, shard B's state has `Unhide(t=50)` and shard A's state has `Hide(t=100)`. The LWW register resolves this correctly: `t=100 > t=50`, so `Hide` wins.
 But during the convergence window (before B has received A's segment), shard B might serve the un-hidden item X to the user. This is the safety violation we must prevent.
 ### The Solution
 Union semantics during convergence: the `HardNegIndex` accumulates all hide operations from all replicating shards immediately (before full reconciliation). A `Remove` (explicit unhide) only takes effect after the LWW register has resolved and the hide's HLC is definitively beaten.
 ```rust
 // tidal/src/entities/hard_neg.rs
 /// Hard negative action stored per (user_id, item_id) pair.
 #[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
 pub enum HardNegAction {
    Hide,
    Mute,
    Block,
    Unhide, // explicit removal with HLC timestamp
 }
 /// Hard negative entry with LWW register for convergence.
 #[derive(Debug, Clone)]
 pub struct HardNegEntry {
    /// LWW register: tracks the most recent action (by HLC).
    pub register: LWWRegister<HardNegAction>,
    /// Union flag: set to `true` when any shard has contributed a hide/mute/block.
    /// Reset to `false` only when the LWW register definitively resolves to `Unhide`.
    pub union_active: bool,
 }
 impl HardNegEntry {
    /// Returns `true` if this entry should suppress the item from appearing
    /// in query results.
    ///
    /// During convergence: `union_active` is set; item is suppressed.
    /// After convergence: `union_active` reflects LWW resolution.
    pub fn is_active(&self) -> bool {
        if self.union_active {
            return true;
        }
        // LWW resolution only.
        matches!(self.register.get(), Some(HardNegAction::Hide | HardNegAction::Mute | HardNegAction::Block))
    }
    /// Apply a remote hard-negative action from replication.
    ///
    /// Union rule: any positive hard-negative action (hide/mute/block) sets
    /// `union_active = true`. Only a fully-resolved LWW Unhide clears it.
    pub fn apply_remote(&mut self, action: HardNegAction, ts: HlcTimestamp) {
        match &action {
            HardNegAction::Unhide => {
                // LWW only: if this Unhide beats the current register, try to clear.
                self.register.write(action, ts);
                // Clear union_active only if the register definitively has Unhide.
                if matches!(self.register.get(), Some(HardNegAction::Unhide)) {
                    self.union_active = false;
                }
            }
            _ => {
                // Hide/Mute/Block: set union_active unconditionally.
                self.register.write(action, ts);
                self.union_active = true;
            }
        }
    }
 }
 /// Index of hard negatives for a shard.
 pub struct HardNegIndex {
    /// (user_id, item_id) -> HardNegEntry
    entries: DashMap<(EntityId, EntityId), HardNegEntry>,
 }
 impl HardNegIndex {
    /// Apply a local hard-negative action (user-initiated, not from replication).
    pub fn apply_action(
        &self,
        user_id: EntityId,
        item_id: EntityId,
        action: HardNegAction,
        ts: HlcTimestamp,
    ) -> Result<()> {
        let mut entry = self.entries
            .entry((user_id, item_id))
            .or_insert_with(|| HardNegEntry {
                register: LWWRegister::empty(),
                union_active: false,
            });
        entry.apply_remote(action, ts);
        Ok(())
    }
    /// Merge a remote HardNegEntry from replication.
    ///
    /// Union semantics: if the remote entry is active, set union_active locally.
    pub fn merge_remote(
        &self,
        user_id: EntityId,
        item_id: EntityId,
        remote: &HardNegEntry,
    ) {
        let mut local = self.entries
            .entry((user_id, item_id))
            .or_insert_with(|| HardNegEntry {
                register: LWWRegister::empty(),
                union_active: false,
            });
        local.register.merge(&remote.register);
        // Union rule: if remote had an active negative, propagate.
        if remote.union_active {
            local.union_active = true;
        }
        // Re-evaluate after merge: if the register definitively says Unhide, clear.
        if matches!(local.register.get(), Some(HardNegAction::Unhide)) && !remote.union_active {
            local.union_active = false;
        }
    }
    /// Check if a (user_id, item_id) pair is hard-negated (should be filtered).
    pub fn is_negated(&self, user_id: EntityId, item_id: EntityId) -> bool {
        self.entries
            .get(&(user_id, item_id))
            .map(|e| e.is_active())
            .unwrap_or(false)
    }
    /// Remove a hard negative (explicit unhide with the given HLC timestamp).
    ///
    /// Only removes if the given ts beats the current register.
    pub fn remove(&self, user_id: EntityId, item_id: EntityId, ts: HlcTimestamp) -> Result<()> {
        if let Some(mut entry) = self.entries.get_mut(&(user_id, item_id)) {
            entry.apply_remote(HardNegAction::Unhide, ts);
        }
        Ok(())
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `HardNegEntry::is_active()` returns `true` when `union_active = true`, regardless of the LWW register state
 - [ ] `apply_remote(Hide, t=100)` followed by `apply_remote(Unhide, t=50)` leaves `union_active = true` (hide wins, Unhide loses LWW)
 - [ ] `apply_remote(Hide, t=50)` followed by `apply_remote(Unhide, t=100)` clears `union_active = false` (Unhide wins LWW)
 - [ ] `merge_remote` with an active remote entry always sets local `union_active = true`
 - [ ] Property test: concurrent hide on shard A + unhide on shard B with lower HLC → after merge, item is negated on both shards
 - [ ] `is_negated()` is called during RETRIEVE/SEARCH result post-filtering (verified by existing HardNeg integration test with updated merge logic)
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-4/task-05-cross-region-session-tests.md
+++ b/docs/planning/milestone-8/phase-4/task-05-cross-region-session-tests.md
@ -0,0 +1,202 @@
 # Task 05: Cross-Region Session Integration Tests
 ## Delivers
 Integration test suite in `tidal/tests/m8p4_session.rs` verifying: agent roaming between regions, session visibility within 2 seconds, idempotent writes, and hard-negative monotonicity across regions.
 ## Complexity: M
 ## Dependencies
 - Tasks 01–04 complete
 ## Technical Design
 ```rust
 // tidal/tests/m8p4_session.rs
 use tidaldb::replication::{
    InProcessTransportFactory, SessionReplicationBridge, IdempotencyStore,
 };
 use tidaldb::session::{SessionId, SessionManager, SessionSeqNoTracker};
 use tidaldb::entities::HardNegAction;
 use tidaldb::replication::crdt::HlcTimestamp;
 /// Helper: create a pair of TidalDb instances linked by InProcessTransport.
 async fn setup_two_region_cluster() -> (TidalDb, TidalDb, Arc<InProcessTransportFactory>) {
    let factory = Arc::new(InProcessTransportFactory::new());
    let transport_a = factory.connect(RegionId(0));
    let transport_b = factory.connect(RegionId(1));
    let db_a = TidalDb::builder()
        .ephemeral()
        .with_schema(schema())
        .with_cluster(NodeConfig {
            region_id: RegionId(0),
            shard_id: ShardId(0),
            role: NodeRole::Leader,
        })
        .with_transport(transport_a)
        .open()
        .unwrap();
    let db_b = TidalDb::builder()
        .ephemeral()
        .with_schema(schema())
        .with_cluster(NodeConfig {
            region_id: RegionId(1),
            shard_id: ShardId(0),
            role: NodeRole::Follower,
        })
        .with_transport(transport_b)
        .open()
        .unwrap();
    (db_a, db_b, factory)
 }
 /// Agent roaming: session started in us-east, readable in eu-west.
 #[tokio::test]
 async fn test_session_cross_region_visibility() {
    let (db_a, db_b, _factory) = setup_two_region_cluster().await;
    let user = EntityId::new(1);
    let session_id = db_a.start_session(user, Default::default()).unwrap();
    // Write 5 preference signals in region A.
    for i in 0..5u64 {
        let item = EntityId::new(100 + i);
        db_a.signal_in_session(session_id, "view", item, 1.0, Timestamp::now()).unwrap();
    }
    // Allow replication to propagate (< 2 seconds using InProcessTransport).
    tokio::time::sleep(Duration::from_millis(200)).await;
    // Read session signals from region B.
    let session_b = db_b.get_session(session_id).unwrap();
    assert!(session_b.is_some(), "session should be visible in region B");
    let signals = db_b.session_signals(session_id).unwrap();
    assert_eq!(signals.len(), 5, "all 5 preference signals should be visible in region B");
 }
 /// Idempotent replication: duplicate session events produce no double-counting.
 #[tokio::test]
 async fn test_session_replication_idempotent() {
    let (db_a, db_b, factory) = setup_two_region_cluster().await;
    let user = EntityId::new(2);
    let session_id = db_a.start_session(user, Default::default()).unwrap();
    let item = EntityId::new(200);
    db_a.signal_in_session(session_id, "like", item, 1.0, Timestamp::now()).unwrap();
    // Let it replicate once.
    tokio::time::sleep(Duration::from_millis(100)).await;
    // Force re-send (simulated duplicate).
    factory.replay_last_session_batch(RegionId(1)).await;
    tokio::time::sleep(Duration::from_millis(50)).await;
    // Signal count on B should still be 1, not 2.
    let count = db_b.read_windowed_count(item, "like", Window::OneHour).unwrap();
    // (Session signals are user-scoped; verify via session data, not global ledger)
    let session_data = db_b.session_signals(session_id).unwrap();
    assert_eq!(session_data.len(), 1, "no double-counting from duplicate replication");
 }
 /// Hard negative monotonicity: hide in region A, unhide (lower HLC) in region B.
 /// After replication: item is suppressed in BOTH regions.
 #[tokio::test]
 async fn test_hardneg_monotonicity_hide_wins() {
    let (db_a, db_b, _factory) = setup_two_region_cluster().await;
    let user = EntityId::new(3);
    let item = EntityId::new(300);
    // Region A hides item at t=100 (higher HLC).
    let ts_hide = HlcTimestamp { wall_ns: 100, logical: 0, node_id: 0 };
    db_a.hide_item_with_ts(user, item, ts_hide).unwrap();
    // Region B has an earlier unhide at t=50 (already in state before partition).
    let ts_unhide = HlcTimestamp { wall_ns: 50, logical: 0, node_id: 1 };
    db_b.unhide_item_with_ts(user, item, ts_unhide).unwrap();
    // Allow replication to propagate.
    tokio::time::sleep(Duration::from_millis(200)).await;
    // After replication: both regions should suppress the item.
    let results_b = db_b.retrieve(&Retrieve::builder()
        .for_user(user)
        .candidates(vec![item])
        .build()
        .unwrap()
    ).unwrap();
    assert!(
        results_b.items.is_empty(),
        "hidden item must not appear in region B results after replication"
    );
 }
 /// Hard negative: explicit unhide with HIGHER HLC does clear the hide.
 #[tokio::test]
 async fn test_hardneg_explicit_unhide_with_higher_hlc() {
    let (db_a, db_b, _factory) = setup_two_region_cluster().await;
    let user = EntityId::new(4);
    let item = EntityId::new(400);
    // Both regions: hide at t=50.
    let ts_hide = HlcTimestamp { wall_ns: 50, logical: 0, node_id: 0 };
    db_a.hide_item_with_ts(user, item, ts_hide).unwrap();
    tokio::time::sleep(Duration::from_millis(100)).await;
    // User explicitly un-hides at t=200 (higher than hide).
    let ts_unhide = HlcTimestamp { wall_ns: 200, logical: 0, node_id: 1 };
    db_b.unhide_item_with_ts(user, item, ts_unhide).unwrap();
    tokio::time::sleep(Duration::from_millis(200)).await;
    // After full replication + LWW resolution: item should appear (unhide wins).
    let results_a = db_a.retrieve(&Retrieve::builder()
        .for_user(user)
        .candidates(vec![item])
        .build()
        .unwrap()
    ).unwrap();
    assert_eq!(results_a.items.len(), 1, "unhide with higher HLC should make item visible again");
 }
 /// Seqno HWM: writes with seqno <= HWM are idempotent no-ops on receiver.
 #[tokio::test]
 async fn test_session_seqno_hwm_rejects_duplicates() {
    let tracker = SessionSeqNoTracker::new();
    let session = SessionId::new();
    // Sequence 1..5 -- all accepted.
    for i in 1..=5u64 {
        assert!(tracker.should_apply(session, SessionSeqNo(i)));
    }
    // Re-send seqno 3 -- rejected.
    assert!(!tracker.should_apply(session, SessionSeqNo(3)));
    // Seqno 6 -- accepted.
    assert!(tracker.should_apply(session, SessionSeqNo(6)));
    // HWM should be 6 now.
    assert_eq!(tracker.hwm(session), SessionSeqNo(6));
 }
 ```
 ## Acceptance Criteria
 - [ ] `test_session_cross_region_visibility`: 5 session signals written in region A visible in region B within 200ms (in-process transport)
 - [ ] `test_session_replication_idempotent`: duplicate session batch replay produces no double-counting
 - [ ] `test_hardneg_monotonicity_hide_wins`: hide at higher HLC suppresses item in both regions after cross-region replication
 - [ ] `test_hardneg_explicit_unhide_with_higher_hlc`: unhide at strictly higher HLC restores visibility after replication
 - [ ] `test_session_seqno_hwm_rejects_duplicates`: HWM tracker unit test with 5 monotonic accepts + 1 duplicate reject + 1 resume accept
 - [ ] All 5 tests pass in `cargo test --test m8p4_session`
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-5/OVERVIEW.md
+++ b/docs/planning/milestone-8/phase-5/OVERVIEW.md
@ -0,0 +1,92 @@
 # m8p5: Control Plane, Multi-Tenancy, and Routing
 ## Delivers
 Tenant isolation, routing configuration, and operational tooling for a hosted
 multi-tenant deployment. Each tenant (agent workspace) gets its own WAL
 namespace and resource quotas. The control plane manages shard-to-region
 assignment, tenant placement, and rolling upgrades. A tenant can be migrated to
 a new region by changing routing configuration only.
 Deliverables:
 - `TenantId(u64)`: tenant identity type; WAL segments namespaced by tenant
 - `TenantConfig`: per-tenant quota (max signals/sec, max entities, max storage bytes), residency policy (required regions)
 - `TenantRouter`: extends `ShardRouter` with tenant-aware routing; tenant -> shard mapping
 - `ControlPlane`: manages cluster topology (shard assignments, tenant placement, region health)
 - `TenantMigration`: moves a tenant to a new shard/region by shipping WAL segments + state snapshot; zero-downtime via dual-write window
 - `RollingUpgradeCoordinator`: upgrades nodes one at a time with drain + upgrade + rejoin; uses WAL shipping to keep followers current during the window
 ## Dependencies
 - **Requires:** Phase 8.2 (WAL shipping), Phase 8.3 (reconciliation), Phase 8.4 (session continuity)
 - **Files modified:**
  - `tidal/src/db/config.rs` -- add tenant configuration fields
  - `tidal/src/replication/shard.rs` -- extend `ShardRouter` with tenant routing
  - `tidal/src/wal/segment.rs` -- tenant-namespaced segment directories
  - `tidal/src/db/open.rs` -- tenant-scoped initialization
 - **Files created:**
  - `tidal/src/replication/tenant.rs` -- `TenantId`, `TenantConfig`, `TenantRouter`
  - `tidal/src/replication/control.rs` -- `ControlPlane`, topology management
  - `tidal/src/replication/migration.rs` -- `TenantMigration`
  - `tidal/src/replication/upgrade.rs` -- `RollingUpgradeCoordinator`
 ## Research References
 - `thoughts.md` -- Part I/Citadel (per-tenant filesystem isolation: "every tenant is an island")
 ## Acceptance Criteria (Phase Level)
 - [ ] `TenantId(u64)` is `Copy + Clone + Debug + Eq + Hash + Ord`; WAL segment directories are namespaced as `{data_dir}/tenants/{tenant_id}/wal/`
 - [ ] `TenantConfig` enforces rate limits: signals/sec (token bucket), max entities (hard cap), max storage bytes (checked on write); violations return `TidalError::QuotaExceeded`
 - [ ] `TenantRouter` maps `(TenantId, EntityId) -> (RegionId, ShardId)`; default is hash-based; residency policy constrains which regions a tenant's data can reside in
 - [ ] `ControlPlane` exposes cluster health: per-shard entity count, signal throughput, replication lag, disk usage; serializable to JSON for monitoring integration
 - [ ] Tenant migration test: move tenant from shard A to shard B; during migration, dual-write ensures no signal loss; after migration, shard A's tenant data is garbage-collected; total downtime = 0 (reads served from both shards during migration window)
 - [ ] Rolling upgrade: upgrade 1 of 3 nodes; WAL shipping continues to remaining 2; upgraded node rejoins and catches up from WAL; total query availability = 100% during the upgrade window
 - [ ] Per-tenant WAL isolation: a misbehaving tenant (burst of 100K signals/sec) is throttled without affecting other tenants on the same shard; rate limiter returns `TidalError::QuotaExceeded` within 1ms
 ## Task Execution Order
 ```
 Task 01: TenantId + TenantConfig ──────────┐
                                            ├──> Task 03: ControlPlane
 Task 02: TenantRouter ────────────────────┤
                                            ├──> Task 04: TenantMigration
                                            │
                                            └──> Task 05: RollingUpgrade
                                                      │
                                                      v
                                            Task 06: Multi-Tenancy Integration Tests
 ```
 Tasks 01 and 02 are parallelizable. Tasks 03, 04, 05 depend on both. Task 06 depends on all.
 ## Module Location
 | File | Status | Contains |
 |------|--------|----------|
 | `tidal/src/replication/tenant.rs` | NEW | `TenantId`, `TenantConfig`, `TenantRouter`, quota enforcement |
 | `tidal/src/replication/control.rs` | NEW | `ControlPlane`, cluster topology, health metrics |
 | `tidal/src/replication/migration.rs` | NEW | `TenantMigration`, dual-write protocol |
 | `tidal/src/replication/upgrade.rs` | NEW | `RollingUpgradeCoordinator` |
 | `tidal/src/db/config.rs` | MODIFIED | Tenant config fields |
 | `tidal/src/replication/shard.rs` | MODIFIED | Tenant-aware routing |
 | `tidal/src/wal/segment.rs` | MODIFIED | Tenant-namespaced directories |
 | `tidal/src/db/open.rs` | MODIFIED | Tenant-scoped initialization |
 ## Notes
 ### Tenant isolation follows Citadel's model
 Per-tenant filesystem directories, per-tenant WAL files, per-tenant rate limiters. The OS enforces the boundary. A misbehaving tenant cannot affect others because its writes go to separate files and its rate limiter is checked before the WAL write.
 ### Migration via dual-write
 During migration, writes for the migrating tenant go to both the old shard and the new shard. After the new shard has caught up (verified by seqno matching), reads are switched to the new shard, and the old shard's tenant data is garbage-collected. This is the CockroachDB range-split model adapted for tenant migration.
 ### Control plane is embedded, not external
 The `ControlPlane` runs within the leader node's process (or a designated coordinator node). It is not a separate service. This matches tidalDB's embeddable philosophy.
 ## Done When
 A developer can configure 3 tenants on a 3-shard cluster, apply per-tenant rate limits, migrate a tenant from one shard to another with zero downtime, perform a rolling upgrade of all nodes, and observe that per-tenant isolation prevents noisy-neighbor effects throughout.
--- a/docs/planning/milestone-8/phase-5/task-01-tenant-identity.md
+++ b/docs/planning/milestone-8/phase-5/task-01-tenant-identity.md
@ -0,0 +1,169 @@
 # Task 01: TenantId + TenantConfig
 ## Delivers
 `TenantId(u64)` and `TenantConfig` in `tidal/src/replication/tenant.rs`. Per-tenant quotas (signals/sec token bucket, max entities, max storage bytes). WAL segment directories namespaced under `{data_dir}/tenants/{tenant_id}/wal/`. `TidalError::QuotaExceeded` returned when limits are breached.
 ## Complexity: M
 ## Dependencies
 - Phase 8.1 (ShardId, RegionId)
 - Phase 8.2 (WAL segment naming)
 ## Technical Design
 ```rust
 // tidal/src/replication/tenant.rs
 /// Tenant identity type.
 ///
 /// A tenant is an agent workspace or an isolated application namespace.
 /// All data (WAL segments, signal ledger state, entity metadata) is
 /// scoped to a tenant's filesystem directory.
 ///
 /// `TenantId(0)` is the default single-tenant ID used by non-multi-tenant
 /// deployments. This ensures backward compatibility with all existing code.
 #[derive(
    Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash,
    Default,
    serde::Serialize, serde::Deserialize,
 )]
 pub struct TenantId(pub u64);
 impl TenantId {
    /// The default tenant ID for single-tenant deployments.
    pub const DEFAULT: Self = Self(0);
 }
 impl std::fmt::Display for TenantId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "t{}", self.0)
    }
 }
 /// Per-tenant resource configuration.
 ///
 /// Enforced at write time. Violations return `TidalError::QuotaExceeded`.
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct TenantConfig {
    pub tenant_id: TenantId,
    /// Maximum signals per second (token bucket rate limit).
    ///
    /// `None` means unlimited (trusted internal tenant).
    pub max_signals_per_sec: Option<u32>,
    /// Maximum number of distinct entities (items + users + creators).
    ///
    /// Checked on entity create; `None` means unlimited.
    pub max_entities: Option<u64>,
    /// Maximum total storage in bytes for this tenant's data directory.
    ///
    /// Checked on WAL segment seal; `None` means unlimited.
    pub max_storage_bytes: Option<u64>,
    /// Residency policy: which regions this tenant's data must reside in.
    ///
    /// Empty = no restriction. Used by `TenantRouter` to constrain placement.
    pub required_regions: Vec<RegionId>,
    /// Human-readable label for this tenant (for monitoring/logging).
    pub label: String,
 }
 impl TenantConfig {
    /// Default config: unlimited quotas, no residency constraint.
    pub fn default_tenant() -> Self {
        Self {
            tenant_id: TenantId::DEFAULT,
            max_signals_per_sec: None,
            max_entities: None,
            max_storage_bytes: None,
            required_regions: Vec::new(),
            label: "default".to_string(),
        }
    }
 }
 /// Token bucket rate limiter for per-tenant signal ingestion.
 ///
 /// Refills at `max_signals_per_sec` tokens per second.
 /// Costs 1 token per signal write. Bucket max = 2x rate (burst headroom).
 #[derive(Debug)]
 pub struct TenantRateLimiter {
    /// Current tokens (f64 for sub-token precision).
    tokens: AtomicF64,
    /// Refill rate (tokens/ns).
    refill_rate_per_ns: f64,
    /// Maximum bucket size (tokens).
    max_tokens: f64,
    /// Last refill timestamp (ns).
    last_refill_ns: AtomicU64,
 }
 impl TenantRateLimiter {
    pub fn new(max_signals_per_sec: u32) -> Self {
        let rate_per_ns = max_signals_per_sec as f64 / 1_000_000_000.0;
        let max_tokens = (max_signals_per_sec as f64) * 2.0; // 2s burst
        Self {
            tokens: AtomicF64::new(max_tokens),
            refill_rate_per_ns: rate_per_ns,
            max_tokens,
            last_refill_ns: AtomicU64::new(crate::util::now_ns()),
        }
    }
    /// Try to consume 1 token. Returns `Ok(())` if allowed, `Err(QuotaExceeded)` if throttled.
    pub fn try_acquire(&self) -> Result<()> {
        let now = crate::util::now_ns();
        let last = self.last_refill_ns.load(Ordering::Relaxed);
        let elapsed_ns = now.saturating_sub(last);
        let refill = elapsed_ns as f64 * self.refill_rate_per_ns;
        let new_tokens = (self.tokens.load(Ordering::Relaxed) + refill)
            .min(self.max_tokens);
        if new_tokens < 1.0 {
            return Err(TidalError::QuotaExceeded("signal rate limit exceeded".into()));
        }
        self.tokens.store(new_tokens - 1.0, Ordering::Relaxed);
        self.last_refill_ns.store(now, Ordering::Relaxed);
        Ok(())
    }
 }
 ```
 ### WAL Directory Namespacing
 ```rust
 // tidal/src/wal/segment.rs (additions)
 /// Build the tenant-scoped WAL directory path.
 ///
 /// For `TenantId::DEFAULT` (backward compat): returns `{data_dir}/wal/` unchanged.
 /// For other tenants: returns `{data_dir}/tenants/{tenant_id}/wal/`.
 pub fn tenant_wal_dir(data_dir: &Path, tenant_id: TenantId) -> PathBuf {
    if tenant_id == TenantId::DEFAULT {
        data_dir.join("wal")
    } else {
        data_dir
            .join("tenants")
            .join(tenant_id.0.to_string())
            .join("wal")
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `TenantId` is `Copy + Clone + Debug + Eq + Hash + Ord + Default + Serialize + Deserialize`
 - [ ] `TenantId::DEFAULT` is `TenantId(0)`; all existing code using `TenantId(0)` works unchanged
 - [ ] `TenantRateLimiter::try_acquire()` returns `TidalError::QuotaExceeded` within 1ms when token bucket is empty
 - [ ] Token bucket refills at the configured rate: after sleeping `1/rate` seconds, one token is available
 - [ ] WAL directory for `TenantId::DEFAULT` is `{data_dir}/wal/` (unchanged from m1p5)
 - [ ] WAL directory for `TenantId(42)` is `{data_dir}/tenants/42/wal/`
 - [ ] Unit test: configure 100 signals/sec, write 200 signals in a tight loop, verify ~100 succeed and ~100 receive `QuotaExceeded`
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-5/task-02-tenant-router.md
+++ b/docs/planning/milestone-8/phase-5/task-02-tenant-router.md
@ -0,0 +1,173 @@
 # Task 02: TenantRouter
 ## Delivers
 `TenantRouter` in `tidal/src/replication/tenant.rs` (same file as `TenantId`/`TenantConfig`). Extends `ShardRouter` with tenant-aware routing: `(TenantId, EntityId) -> (RegionId, ShardId)`. Default routing uses consistent hashing. Residency policy constrains which regions are eligible for a tenant's data.
 ## Complexity: M
 ## Dependencies
 - Task 01 (TenantId, TenantConfig)
 - Phase 8.1, Task 02 (ShardRouter)
 ## Technical Design
 ```rust
 // tidal/src/replication/tenant.rs (continued)
 /// Maps (TenantId, EntityId) -> (RegionId, ShardId) for data placement.
 ///
 /// Wraps `ShardRouter` and adds:
 /// 1. Tenant-to-shard affinity (consistent hash or explicit assignment)
 /// 2. Residency policy enforcement (required_regions constraint)
 /// 3. Tenant registry for O(1) config lookup
 pub struct TenantRouter {
    /// Inner shard router (entity-level routing).
    shard_router: Arc<ShardRouter>,
    /// Per-tenant configuration.
    tenants: DashMap<TenantId, TenantConfig>,
    /// Cluster topology: which shards are in which regions.
    topology: Arc<ClusterTopology>,
 }
 /// Cluster topology snapshot: maps ShardId -> RegionId.
 #[derive(Debug, Clone)]
 pub struct ClusterTopology {
    /// Ordered list of (ShardId, RegionId) assignments.
    shards: Vec<ShardAssignment>,
 }
 #[derive(Debug, Clone, Copy)]
 pub struct ShardAssignment {
    pub shard_id: ShardId,
    pub region_id: RegionId,
 }
 impl TenantRouter {
    pub fn new(shard_router: Arc<ShardRouter>, topology: Arc<ClusterTopology>) -> Self {
        Self {
            shard_router,
            tenants: DashMap::new(),
            topology,
        }
    }
    /// Register or update a tenant's configuration.
    pub fn register_tenant(&self, config: TenantConfig) {
        self.tenants.insert(config.tenant_id, config);
    }
    /// Look up routing for a (TenantId, EntityId) pair.
    ///
    /// Returns `(RegionId, ShardId)` for data placement.
    /// Applies residency policy if configured.
    pub fn route(
        &self,
        tenant_id: TenantId,
        entity_id: EntityId,
    ) -> Result<ShardAssignment> {
        // 1. Get eligible shards (all shards if no policy; filtered by region if policy set).
        let eligible_shards = self.eligible_shards_for(tenant_id)?;
        // 2. Consistent hash over eligible shards.
        let shard = self.consistent_hash(entity_id, &eligible_shards);
        Ok(shard)
    }
    /// Returns the primary shard assignment for a tenant's data.
    ///
    /// For single-shard tenants: always the same shard.
    /// For multi-shard tenants: hash-distributed.
    fn eligible_shards_for(&self, tenant_id: TenantId) -> Result<Vec<ShardAssignment>> {
        let config = self.tenants.get(&tenant_id);
        if let Some(config) = config {
            if !config.required_regions.is_empty() {
                // Filter topology to only shards in required regions.
                let eligible: Vec<_> = self.topology.shards.iter()
                    .copied()
                    .filter(|s| config.required_regions.contains(&s.region_id))
                    .collect();
                if eligible.is_empty() {
                    return Err(TidalError::Configuration(
                        format!("tenant {:?} residency policy has no eligible shards", tenant_id)
                    ));
                }
                return Ok(eligible);
            }
        }
        // No residency constraint: all shards eligible.
        Ok(self.topology.shards.clone())
    }
    /// Consistent hash: jumps hash over the eligible shard list.
    ///
    /// Uses Jump Consistent Hash (Lamping & Veach, 2014) for minimal
    /// remapping when shards are added/removed.
    fn consistent_hash(&self, entity_id: EntityId, shards: &[ShardAssignment]) -> ShardAssignment {
        let n = shards.len() as u64;
        let slot = jump_hash(entity_id.0, n);
        shards[slot as usize]
    }
    /// Rate limiter for a tenant (lazily created).
    pub fn rate_limiter_for(&self, tenant_id: TenantId) -> Option<Arc<TenantRateLimiter>> {
        self.tenants.get(&tenant_id)
            .and_then(|c| c.max_signals_per_sec)
            .map(|rate| Arc::new(TenantRateLimiter::new(rate)))
    }
 }
 /// Jump Consistent Hash (O(ln n) time, O(1) space).
 fn jump_hash(key: u64, num_buckets: u64) -> u64 {
    let mut k = key;
    let mut b: i64 = -1;
    let mut j: i64 = 0;
    while j < num_buckets as i64 {
        b = j;
        k = k.wrapping_mul(2862933555777941757).wrapping_add(1);
        j = ((b + 1) as f64 * (((1u64 << 31) as f64) / (((k >> 33) + 1) as f64))) as i64;
    }
    b as u64
 }
 ```
 ### Integration with TidalDb Write Path
 ```rust
 // tidal/src/db/mod.rs (additions to signal write path)
 impl TidalDb {
    pub fn signal_for_tenant(
        &self,
        tenant_id: TenantId,
        signal_type: &str,
        entity_id: EntityId,
        weight: f64,
        timestamp: Timestamp,
    ) -> crate::Result<()> {
        // 1. Check rate limit.
        if let Some(limiter) = self.tenant_router.rate_limiter_for(tenant_id) {
            limiter.try_acquire()?;
        }
        // 2. Route to shard.
        let assignment = self.tenant_router.route(tenant_id, entity_id)?;
        // 3. Write signal to the tenant-scoped signal ledger.
        self.signal_impl(signal_type, entity_id, weight, timestamp)
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `TenantRouter::route(tenant_id, entity_id)` returns a `ShardAssignment` from the eligible shards
 - [ ] Residency policy: if `TenantConfig::required_regions = [RegionId(1)]` and only shard 2 is in region 1, all entities for that tenant route to shard 2
 - [ ] Residency policy violation: if required regions have no shards in `ClusterTopology`, returns `TidalError::Configuration`
 - [ ] Consistent hash is stable: same `(tenant_id, entity_id)` always maps to the same shard unless topology changes
 - [ ] Jump hash: adding a shard remaps approximately `1/N` of keys (property test: 10K keys, add 1 shard, verify < 15% remapping)
 - [ ] `TidalDb::signal_for_tenant` applies rate limiting before write; `QuotaExceeded` is returned before WAL write (no partial state)
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-5/task-03-control-plane.md
+++ b/docs/planning/milestone-8/phase-5/task-03-control-plane.md
@ -0,0 +1,176 @@
 # Task 03: ControlPlane
 ## Delivers
 `ControlPlane` in `tidal/src/replication/control.rs`. Embedded within the leader node. Manages cluster topology (shard-to-region assignments, tenant placement, region health). Exposes cluster health metrics serializable to JSON for external monitoring. No separate service — runs as a background task within the leader process.
 ## Complexity: L
 ## Dependencies
 - Task 01 (TenantId, TenantConfig)
 - Task 02 (TenantRouter, ClusterTopology)
 - Phase 8.2, Task 06 (ReplicationLagGauge)
 ## Technical Design
 ```rust
 // tidal/src/replication/control.rs
 /// Embedded cluster controller running on the leader node.
 ///
 /// Tracks cluster topology, tenant placement, and shard health.
 /// Exposes a `ClusterHealth` snapshot for external monitoring via the
 /// existing `MetricsState` integration.
 ///
 /// Design constraint: no external service. The control plane is an
 /// in-process component, consistent with tidalDB's embeddable philosophy.
 pub struct ControlPlane {
    topology: Arc<RwLock<ClusterTopology>>,
    tenant_router: Arc<TenantRouter>,
    lag_gauge: Arc<ReplicationLagGauge>,
    shard_stats: DashMap<ShardId, ShardStats>,
    region_health: DashMap<RegionId, RegionHealth>,
 }
 /// Per-shard operational statistics.
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct ShardStats {
    pub shard_id: ShardId,
    pub region_id: RegionId,
    pub entity_count: u64,
    /// WAL events applied per second (EMA, α=0.1).
    pub signal_throughput_eps: f64,
    /// Replication lag to each follower (seqno distance).
    pub replication_lag: HashMap<RegionId, u64>,
    /// Approximate disk usage for this shard's WAL directory (bytes).
    pub disk_bytes: u64,
    /// Last heartbeat from this shard (ns since epoch).
    pub last_heartbeat_ns: u64,
 }
 /// Per-region health state.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum RegionHealth {
    Healthy,
    Degraded,  // replication lag > 5s
    Offline,   // no heartbeat for > 30s
 }
 /// Full cluster health snapshot.
 ///
 /// Serializable to JSON for monitoring dashboards (Prometheus/Grafana, etc.).
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct ClusterHealth {
    pub snapshot_ns: u64,
    pub shards: Vec<ShardStats>,
    pub regions: HashMap<RegionId, RegionHealth>,
    pub tenant_count: usize,
    pub total_entities: u64,
    pub total_signals_eps: f64,
 }
 impl ControlPlane {
    pub fn new(
        topology: Arc<RwLock<ClusterTopology>>,
        tenant_router: Arc<TenantRouter>,
        lag_gauge: Arc<ReplicationLagGauge>,
    ) -> Self {
        Self {
            topology,
            tenant_router,
            lag_gauge,
            shard_stats: DashMap::new(),
            region_health: DashMap::new(),
        }
    }
    /// Update shard statistics (called by each shard on its heartbeat interval).
    pub fn record_shard_heartbeat(&self, stats: ShardStats) {
        self.region_health.insert(stats.region_id, RegionHealth::Healthy);
        self.shard_stats.insert(stats.shard_id, stats);
    }
    /// Compute and return current cluster health snapshot.
    pub fn health(&self) -> ClusterHealth {
        let now_ns = crate::util::now_ns();
        let shards: Vec<_> = self.shard_stats.iter()
            .map(|r| r.value().clone())
            .collect();
        // Mark regions offline if no heartbeat in 30s.
        let regions: HashMap<_, _> = self.region_health.iter()
            .map(|r| {
                let shard_for_region = shards.iter()
                    .find(|s| s.region_id == *r.key());
                let health = if let Some(s) = shard_for_region {
                    let age_ns = now_ns.saturating_sub(s.last_heartbeat_ns);
                    if age_ns > 30_000_000_000 { // 30s
                        RegionHealth::Offline
                    } else if s.replication_lag.values().any(|&lag| lag > 5_000_000_000) { // 5s
                        RegionHealth::Degraded
                    } else {
                        RegionHealth::Healthy
                    }
                } else {
                    RegionHealth::Offline
                };
                (*r.key(), health)
            })
            .collect();
        let total_entities = shards.iter().map(|s| s.entity_count).sum();
        let total_signals_eps = shards.iter().map(|s| s.signal_throughput_eps).sum();
        ClusterHealth {
            snapshot_ns: now_ns,
            shards,
            regions,
            tenant_count: self.tenant_router.tenant_count(),
            total_entities,
            total_signals_eps,
        }
    }
    /// Update topology: add or reassign a shard.
    ///
    /// Propagated to `TenantRouter` which will re-compute routes on next call.
    pub fn update_topology(&self, assignment: ShardAssignment) {
        let mut topology = self.topology.write().unwrap();
        if let Some(existing) = topology.shards.iter_mut().find(|s| s.shard_id == assignment.shard_id) {
            *existing = assignment;
        } else {
            topology.shards.push(assignment);
        }
    }
    /// JSON representation of `ClusterHealth` for external monitoring.
    pub fn health_json(&self) -> String {
        serde_json::to_string_pretty(&self.health())
            .unwrap_or_else(|e| format!("{{\"error\": \"{}\"}}", e))
    }
 }
 ```
 ### MetricsState Integration
 ```rust
 // tidal/src/db/metrics.rs (extension)
 impl MetricsState {
    pub fn cluster_health(&self) -> Option<ClusterHealth> {
        self.control_plane.as_ref().map(|cp| cp.health())
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `ControlPlane::health()` returns a `ClusterHealth` with per-shard stats for all registered shards
 - [ ] `RegionHealth::Offline` is set for a shard whose `last_heartbeat_ns` is > 30 seconds ago
 - [ ] `RegionHealth::Degraded` is set for a shard with `replication_lag > 5s`
 - [ ] `health_json()` produces valid JSON deserializable back to `ClusterHealth` (round-trip test)
 - [ ] `update_topology(assignment)` is reflected in the next `health()` call and the next `TenantRouter::route()` call
 - [ ] `MetricsState::cluster_health()` returns `None` on single-node deployments (control plane not configured)
 - [ ] Control plane heartbeat test: 3 simulated shards, update stats for each, verify `health()` shows all 3 as `Healthy`
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-5/task-04-tenant-migration.md
+++ b/docs/planning/milestone-8/phase-5/task-04-tenant-migration.md
@ -0,0 +1,190 @@
 # Task 04: TenantMigration
 ## Delivers
 `TenantMigration` in `tidal/src/replication/migration.rs`. Moves a tenant from one shard/region to another with zero downtime via a dual-write window. During migration, writes go to both the old shard and the new shard. After the new shard's seqno matches the old shard's, reads are atomically switched to the new shard, and the old shard's tenant data is garbage-collected.
 ## Complexity: L
 ## Dependencies
 - Task 01 (TenantId, TenantConfig)
 - Task 02 (TenantRouter)
 - Task 03 (ControlPlane)
 - Phase 8.2 (WAL shipping -- used to bootstrap the new shard from existing WAL segments)
 ## Technical Design
 ```rust
 // tidal/src/replication/migration.rs
 /// State machine for tenant migration.
 ///
 /// States:
 ///   Idle -> PreparingTarget -> DualWrite -> Finalizing -> Complete
 ///
 /// The migration progresses monotonically. If it fails at any stage,
 /// it can be retried from the same state (idempotent by design).
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum MigrationState {
    Idle,
    /// Source shard is shipping WAL segments to target shard.
    PreparingTarget {
        last_shipped_seqno: u64,
    },
    /// Both shards receive writes. Source seqno is the cut-over gate.
    DualWrite {
        cutover_seqno: u64,
    },
    /// Writes routed to target only. Waiting for read switchover.
    Finalizing {
        switched_at_ns: u64,
    },
    /// Migration complete. Old shard data can be GC'd.
    Complete,
 }
 /// Migrates a tenant from one shard to another with zero downtime.
 pub struct TenantMigration {
    tenant_id: TenantId,
    source_shard: ShardId,
    target_shard: ShardId,
    state: Mutex<MigrationState>,
    control_plane: Arc<ControlPlane>,
    tenant_router: Arc<TenantRouter>,
    transport: Arc<dyn Transport>,
 }
 impl TenantMigration {
    pub fn new(
        tenant_id: TenantId,
        source_shard: ShardId,
        target_shard: ShardId,
        control_plane: Arc<ControlPlane>,
        tenant_router: Arc<TenantRouter>,
        transport: Arc<dyn Transport>,
    ) -> Self {
        Self {
            tenant_id,
            source_shard,
            target_shard,
            state: Mutex::new(MigrationState::Idle),
            control_plane,
            tenant_router,
            transport,
        }
    }
    /// Phase 1: Ship all existing WAL segments to the target shard.
    ///
    /// The target shard replays these segments to build up state.
    /// Returns the seqno of the last shipped segment.
    pub async fn prepare_target(&self) -> Result<u64> {
        let mut state = self.state.lock().unwrap();
        assert_eq!(*state, MigrationState::Idle);
        // Ship all sealed WAL segments for this tenant to the target.
        let segments = self.list_tenant_segments()?;
        let mut last_seqno = 0u64;
        for seg in segments {
            let payload = self.read_segment_payload(&seg)?;
            self.transport.send_segment(
                self.target_shard_region()?,
                payload,
            ).await?;
            last_seqno = seg.seqno;
        }
        *state = MigrationState::PreparingTarget { last_shipped_seqno: last_seqno };
        Ok(last_seqno)
    }
    /// Phase 2: Enter dual-write mode.
    ///
    /// All subsequent writes for this tenant go to BOTH source and target shards.
    /// The `cutover_seqno` is the source shard's current seqno when dual-write starts.
    /// Once target reaches `cutover_seqno`, we can finalize.
    pub async fn enter_dual_write(&self) -> Result<u64> {
        let mut state = self.state.lock().unwrap();
        assert!(matches!(*state, MigrationState::PreparingTarget { .. }));
        // Get current seqno from source shard (the cut-over gate).
        let cutover_seqno = self.current_source_seqno()?;
        // Update routing: writes now go to both source and target.
        self.tenant_router.set_dual_write(self.tenant_id, self.source_shard, self.target_shard);
        *state = MigrationState::DualWrite { cutover_seqno };
        Ok(cutover_seqno)
    }
    /// Phase 3: Finalize -- switch reads to target, stop writing to source.
    ///
    /// Only called after target shard has caught up to `cutover_seqno`.
    /// Reads are atomically switched to the target shard.
    pub async fn finalize(&self) -> Result<()> {
        let mut state = self.state.lock().unwrap();
        let cutover_seqno = match *state {
            MigrationState::DualWrite { cutover_seqno } => cutover_seqno,
            _ => return Err(TidalError::InvalidState("finalize called outside DualWrite state".into())),
        };
        // Verify target has caught up.
        let target_seqno = self.current_target_seqno()?;
        if target_seqno < cutover_seqno {
            return Err(TidalError::NotReady(
                format!("target shard at seqno {}, cutover requires {}", target_seqno, cutover_seqno)
            ));
        }
        // Atomically switch routing: reads now go to target only, no more writes to source.
        self.tenant_router.finalize_migration(self.tenant_id, self.target_shard);
        self.control_plane.update_topology(ShardAssignment {
            shard_id: self.source_shard,
            region_id: self.source_shard_region()?,
        });
        *state = MigrationState::Finalizing { switched_at_ns: crate::util::now_ns() };
        Ok(())
    }
    /// Phase 4: Garbage-collect source shard tenant data.
    ///
    /// Called after a GC window (e.g., 5 minutes) to ensure no in-flight
    /// reads are still served from the source shard.
    pub fn gc_source(&self, gc_window_ns: u64) -> Result<()> {
        let mut state = self.state.lock().unwrap();
        let switched_at = match *state {
            MigrationState::Finalizing { switched_at_ns } => switched_at_ns,
            _ => return Err(TidalError::InvalidState("gc called outside Finalizing state".into())),
        };
        let now = crate::util::now_ns();
        if now.saturating_sub(switched_at) < gc_window_ns {
            return Err(TidalError::NotReady("GC window not elapsed".into()));
        }
        self.delete_tenant_data_on_source()?;
        *state = MigrationState::Complete;
        Ok(())
    }
    fn list_tenant_segments(&self) -> Result<Vec<SegmentMeta>> { todo!() }
    fn read_segment_payload(&self, meta: &SegmentMeta) -> Result<WalSegmentPayload> { todo!() }
    fn current_source_seqno(&self) -> Result<u64> { todo!() }
    fn current_target_seqno(&self) -> Result<u64> { todo!() }
    fn target_shard_region(&self) -> Result<RegionId> { todo!() }
    fn source_shard_region(&self) -> Result<RegionId> { todo!() }
    fn delete_tenant_data_on_source(&self) -> Result<()> { todo!() }
 }
 ```
 ## Acceptance Criteria
 - [ ] Migration state machine progresses `Idle -> PreparingTarget -> DualWrite -> Finalizing -> Complete`; state transitions are validated (panic/error on invalid transitions)
 - [ ] During `DualWrite` state: writes to `signal_for_tenant` go to BOTH source and target shards (verified by reading from both after 10 writes)
 - [ ] `finalize()` fails with `NotReady` if target seqno < cutover seqno; succeeds once target catches up
 - [ ] After `finalize()`: queries are served from the target shard; source shard data is not queried
 - [ ] `gc_source()` fails if < GC window elapsed; deletes tenant WAL segments and signal state from source shard after window
 - [ ] Zero downtime test: start migration, write 1000 signals during `DualWrite`, finalize, verify all 1000 signals present on target
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-5/task-05-rolling-upgrade.md
+++ b/docs/planning/milestone-8/phase-5/task-05-rolling-upgrade.md
@ -0,0 +1,165 @@
 # Task 05: RollingUpgradeCoordinator
 ## Delivers
 `RollingUpgradeCoordinator` in `tidal/src/replication/upgrade.rs`. Upgrades nodes one at a time with drain → upgrade → rejoin. Uses WAL shipping to keep remaining followers current during the upgrade window. Query availability remains 100% because at least one node is always serving during each upgrade step.
 ## Complexity: M
 ## Dependencies
 - Task 03 (ControlPlane)
 - Phase 8.2, Task 03 (WalShipper)
 - Phase 8.2, Task 05 (FollowerDb / NodeRole)
 ## Technical Design
 ```rust
 // tidal/src/replication/upgrade.rs
 /// Coordinates a rolling upgrade across all nodes in a cluster.
 ///
 /// Protocol (per node):
 ///   1. `drain(node)` -- stop routing new writes to the target node;
 ///      let in-flight operations complete; verify replication lag = 0.
 ///   2. Caller performs the upgrade (outside this coordinator's scope).
 ///   3. `rejoin(node)` -- re-enable routing to the upgraded node;
 ///      verify it can process new WAL segments.
 ///
 /// At any point, at least (N-1) nodes are serving queries.
 pub struct RollingUpgradeCoordinator {
    control_plane: Arc<ControlPlane>,
    wal_shipper: Arc<WalShipper>,
    /// Nodes currently in the "draining" state (not routing new writes).
    drained_nodes: Mutex<HashSet<ShardId>>,
 }
 /// Status of a single node's upgrade step.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum NodeUpgradeStatus {
    Pending,
    Draining,
    Drained,         // ready for upgrade
    Upgrading,       // external process is upgrading the node
    Rejoining,       // node is catching up from WAL
    Complete,
    Failed { reason: String },
 }
 impl RollingUpgradeCoordinator {
    pub fn new(
        control_plane: Arc<ControlPlane>,
        wal_shipper: Arc<WalShipper>,
    ) -> Self {
        Self {
            control_plane,
            wal_shipper,
            drained_nodes: Mutex::new(HashSet::new()),
        }
    }
    /// Drain a node: stop routing writes to it, wait for replication lag = 0.
    ///
    /// Fails if draining this node would leave fewer than 1 serving node.
    pub async fn drain(&self, target_shard: ShardId) -> Result<()> {
        // Safety check: cannot drain if it would leave 0 serving nodes.
        let drained = self.drained_nodes.lock().unwrap();
        let topology = self.control_plane.topology();
        let total_nodes = topology.shards.len();
        let already_drained = drained.len();
        if already_drained + 1 >= total_nodes {
            return Err(TidalError::InvalidState(
                "cannot drain: would leave no serving nodes".into()
            ));
        }
        drop(drained);
        // Mark as draining: routing layer stops sending new writes here.
        self.drained_nodes.lock().unwrap().insert(target_shard);
        // Wait for replication lag to reach 0 (target has all events).
        self.await_zero_lag(target_shard, Duration::from_secs(30)).await?;
        Ok(())
    }
    /// Rejoin a (newly upgraded) node: re-enable routing, ship missing WAL segments.
    ///
    /// The upgraded node may have missed WAL segments during its downtime.
    /// We ship those segments before re-enabling routing.
    pub async fn rejoin(&self, target_shard: ShardId) -> Result<()> {
        // Get the node's current applied seqno (via its reported stats).
        let follower_seqno = self.control_plane
            .shard_stats(target_shard)
            .map(|s| s.applied_seqno)
            .unwrap_or(0);
        // Ship missed segments.
        self.wal_shipper
            .ship_segments_since(target_shard, follower_seqno)
            .await?;
        // Wait for the node to apply all shipped segments.
        self.await_zero_lag(target_shard, Duration::from_secs(60)).await?;
        // Re-enable routing to this node.
        self.drained_nodes.lock().unwrap().remove(&target_shard);
        Ok(())
    }
    /// Returns `true` if `shard_id` is currently drained (not receiving writes).
    pub fn is_drained(&self, shard_id: ShardId) -> bool {
        self.drained_nodes.lock().unwrap().contains(&shard_id)
    }
    /// Wait until the replication lag for `target_shard` reaches 0.
    ///
    /// Polls the `ReplicationLagGauge` every 100ms. Times out after `timeout`.
    async fn await_zero_lag(
        &self,
        target_shard: ShardId,
        timeout: Duration,
    ) -> Result<()> {
        let deadline = Instant::now() + timeout;
        loop {
            if Instant::now() > deadline {
                return Err(TidalError::Timeout(
                    format!("drain timeout: shard {:?} still has replication lag", target_shard)
                ));
            }
            let lag = self.control_plane.lag_for(target_shard);
            if lag == 0 {
                return Ok(());
            }
            tokio::time::sleep(Duration::from_millis(100)).await;
        }
    }
 }
 ```
 ### Routing Integration
 ```rust
 // In WalShipper (additions)
 impl WalShipper {
    /// Skip shipping to drained nodes.
    async fn should_ship_to(&self, shard_id: ShardId) -> bool {
        !self.upgrade_coordinator
            .as_ref()
            .map(|c| c.is_drained(shard_id))
            .unwrap_or(false)
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `drain(shard)` fails with `TidalError::InvalidState` if draining would leave 0 serving nodes
 - [ ] `drain(shard)` succeeds once replication lag for that shard reaches 0
 - [ ] During drain: writes from `WalShipper` skip the drained shard; reads from other shards succeed
 - [ ] `rejoin(shard)` ships all WAL segments the node missed during its downtime, then re-enables routing
 - [ ] Rolling upgrade of all N nodes: each drain+rejoin step maintains availability (property: at least 1 node serving throughout)
 - [ ] Integration test: 3-node simulated cluster; drain node 0, "upgrade" (simulated by stop+restart), rejoin; verify all signals written during the upgrade are present on the rejoined node
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-5/task-06-multi-tenancy-tests.md
+++ b/docs/planning/milestone-8/phase-5/task-06-multi-tenancy-tests.md
@ -0,0 +1,224 @@
 # Task 06: Multi-Tenancy Integration Tests
 ## Delivers
 Integration test suite in `tidal/tests/m8p5_multitenancy.rs` verifying: per-tenant rate limiting, tenant migration with zero downtime, rolling upgrade, and noisy-neighbor isolation.
 ## Complexity: M
 ## Dependencies
 - Tasks 01–05 complete
 ## Technical Design
 ```rust
 // tidal/tests/m8p5_multitenancy.rs
 use tidaldb::replication::{
    TenantId, TenantConfig, TenantRouter, ClusterTopology, ShardAssignment,
    ControlPlane, TenantMigration, RollingUpgradeCoordinator,
    InProcessTransportFactory,
 };
 fn three_shard_topology() -> ClusterTopology {
    ClusterTopology {
        shards: vec![
            ShardAssignment { shard_id: ShardId(0), region_id: RegionId(0) },
            ShardAssignment { shard_id: ShardId(1), region_id: RegionId(1) },
            ShardAssignment { shard_id: ShardId(2), region_id: RegionId(2) },
        ],
    }
 }
 /// Rate limiting: a tenant configured at 100 signals/sec is throttled
 /// when burst exceeds that rate.
 #[test]
 fn test_tenant_rate_limiting() {
    let topology = Arc::new(three_shard_topology());
    let shard_router = Arc::new(ShardRouter::new(topology.clone()));
    let tenant_router = Arc::new(TenantRouter::new(shard_router, topology));
    let tenant = TenantId(1);
    tenant_router.register_tenant(TenantConfig {
        tenant_id: tenant,
        max_signals_per_sec: Some(100),
        max_entities: None,
        max_storage_bytes: None,
        required_regions: vec![],
        label: "test-tenant".into(),
    });
    let limiter = tenant_router.rate_limiter_for(tenant).unwrap();
    // Drain the bucket: 200 immediate acquires.
    let mut allowed = 0;
    let mut throttled = 0;
    for _ in 0..200 {
        match limiter.try_acquire() {
            Ok(()) => allowed += 1,
            Err(TidalError::QuotaExceeded(_)) => throttled += 1,
            _ => panic!("unexpected error"),
        }
    }
    // At 100 signals/sec, we get 2s burst (200 tokens). All 200 should succeed
    // since the burst capacity is 2x rate. Let's verify that after exhaustion, next fails.
    assert!(throttled == 0, "burst capacity should absorb 200 signals");
    // One more should fail.
    assert!(
        matches!(limiter.try_acquire(), Err(TidalError::QuotaExceeded(_))),
        "bucket should be empty after 200 signals"
    );
 }
 /// Noisy neighbor: tenant A at full burst doesn't affect tenant B.
 #[test]
 fn test_noisy_neighbor_isolation() {
    let topology = Arc::new(three_shard_topology());
    let shard_router = Arc::new(ShardRouter::new(topology.clone()));
    let tenant_router = Arc::new(TenantRouter::new(shard_router, topology));
    let tenant_a = TenantId(1);
    let tenant_b = TenantId(2);
    tenant_router.register_tenant(TenantConfig {
        tenant_id: tenant_a,
        max_signals_per_sec: Some(10), // low limit
        max_entities: None,
        max_storage_bytes: None,
        required_regions: vec![],
        label: "noisy-tenant".into(),
    });
    tenant_router.register_tenant(TenantConfig {
        tenant_id: tenant_b,
        max_signals_per_sec: Some(10_000), // high limit
        max_entities: None,
        max_storage_bytes: None,
        required_regions: vec![],
        label: "good-tenant".into(),
    });
    let limiter_a = tenant_router.rate_limiter_for(tenant_a).unwrap();
    let limiter_b = tenant_router.rate_limiter_for(tenant_b).unwrap();
    // Exhaust tenant A's bucket.
    for _ in 0..1000 { let _ = limiter_a.try_acquire(); }
    // Tenant B should not be affected.
    for _ in 0..100 {
        assert!(
            limiter_b.try_acquire().is_ok(),
            "tenant B should not be throttled by tenant A's exhaustion"
        );
    }
 }
 /// Residency policy: tenant configured to stay in region 1 only routes there.
 #[test]
 fn test_tenant_residency_policy() {
    let topology = Arc::new(three_shard_topology());
    let shard_router = Arc::new(ShardRouter::new(topology.clone()));
    let tenant_router = Arc::new(TenantRouter::new(shard_router, topology));
    let tenant = TenantId(10);
    tenant_router.register_tenant(TenantConfig {
        tenant_id: tenant,
        max_signals_per_sec: None,
        max_entities: None,
        max_storage_bytes: None,
        required_regions: vec![RegionId(1)], // EU residency
        label: "eu-tenant".into(),
    });
    // All entities for this tenant should route to shard 1 (region 1).
    for i in 0u64..100 {
        let assignment = tenant_router.route(tenant, EntityId::new(i)).unwrap();
        assert_eq!(assignment.region_id, RegionId(1),
            "entity {} should be in region 1 per residency policy", i);
    }
 }
 /// Tenant migration: move tenant 1 from shard 0 to shard 2 with zero downtime.
 #[tokio::test]
 async fn test_tenant_migration_zero_downtime() {
    let (db0, db2, factory) = setup_migration_cluster().await;
    let tenant = TenantId(1);
    let user = EntityId::new(1);
    // Write 100 signals to tenant 1 on shard 0 before migration.
    for i in 0..100u64 {
        db0.signal_for_tenant(tenant, "view", EntityId::new(i + 10), 1.0, Timestamp::now())
           .unwrap();
    }
    let migration = TenantMigration::new(
        tenant, ShardId(0), ShardId(2),
        db0.control_plane().clone(),
        db0.tenant_router().clone(),
        factory.transport(RegionId(0)),
    );
    // Phase 1: ship existing WAL to target.
    migration.prepare_target().await.unwrap();
    // Phase 2: enter dual-write; write 50 more signals.
    migration.enter_dual_write().await.unwrap();
    for i in 100..150u64 {
        db0.signal_for_tenant(tenant, "view", EntityId::new(i + 10), 1.0, Timestamp::now())
           .unwrap();
    }
    // Phase 3: finalize.
    tokio::time::sleep(Duration::from_millis(100)).await;
    migration.finalize().await.unwrap();
    // All 150 signals should be present on shard 2 (new home).
    let count_on_target = db2.total_signal_count_for_tenant(tenant, "view").unwrap();
    assert_eq!(count_on_target, 150, "all signals must be on target shard after migration");
    // Phase 4: GC (use 0 window for test).
    migration.gc_source(0).unwrap();
    let count_on_source = db0.total_signal_count_for_tenant(tenant, "view").unwrap();
    assert_eq!(count_on_source, 0, "source shard must have no tenant data after GC");
 }
 /// Rolling upgrade: drain node, "upgrade", rejoin; signals written during
 /// the upgrade are present on the rejoined node.
 #[tokio::test]
 async fn test_rolling_upgrade_no_data_loss() {
    let (db_leader, db_followers, factory) = setup_three_node_cluster().await;
    let coordinator = RollingUpgradeCoordinator::new(
        db_leader.control_plane().clone(),
        db_leader.wal_shipper().clone(),
    );
    // Drain follower 0.
    coordinator.drain(ShardId(1)).await.unwrap();
    // Write 200 signals during the "upgrade window".
    for i in 0..200u64 {
        db_leader.signal("view", EntityId::new(i + 1), 1.0, Timestamp::now()).unwrap();
    }
    // Rejoin (simulated: follower is already running, just re-enables routing).
    coordinator.rejoin(ShardId(1)).await.unwrap();
    // All 200 signals must be present on the rejoined follower.
    let lag = db_leader.control_plane().lag_for(ShardId(1));
    assert_eq!(lag, 0, "no replication lag after rejoin");
 }
 ```
 ## Acceptance Criteria
 - [ ] `test_tenant_rate_limiting`: 100-signal burst absorbed, 201st signal returns `QuotaExceeded` within 1ms
 - [ ] `test_noisy_neighbor_isolation`: exhausting tenant A's rate limiter has no effect on tenant B
 - [ ] `test_tenant_residency_policy`: all 100 entities for an EU-residency tenant route to region 1
 - [ ] `test_tenant_migration_zero_downtime`: all 150 signals present on target shard after migration; source has 0 after GC
 - [ ] `test_rolling_upgrade_no_data_loss`: 200 signals written during drain window present on rejoined follower
 - [ ] All 5 tests pass in `cargo test --test m8p5_multitenancy`
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-6/OVERVIEW.md
+++ b/docs/planning/milestone-8/phase-6/OVERVIEW.md
@ -0,0 +1,75 @@
 # m8p6: End-to-End UAT
 ## Delivers
 A comprehensive end-to-end test suite that exercises the complete UAT scenario:
 3 regions, 5 shards per region, 25K signals/sec, network partition, failover,
 partition heal, deterministic reconciliation, and tenant migration. This is the
 gate for M8 completion.
 Deliverables:
 - `m8_uat` integration test suite matching all 5 UAT scenario steps
 - `SimulatedCluster`: test harness that creates a multi-region tidalDB cluster using `InProcessTransport`
 - `NetworkPartition`: injectable fault that blocks `Transport::send_segment` between specified regions
 - `ShardCrash`: injectable fault that drops a shard primary and triggers follower promotion
 - Performance assertions: cross-region replication < 2s p99, failover < 10s
 ## Dependencies
 - **Requires:** All phases 8.1-8.5
 - **Files created:**
  - `tidal/tests/m8_uat.rs` -- integration test suite
  - `tidal/src/testing/cluster.rs` -- `SimulatedCluster` harness
  - `tidal/src/testing/faults.rs` -- `NetworkPartition`, `ShardCrash` fault injection
 ## Research References
 - `docs/research/tidaldb_wal.md` -- invariant checklist for replication correctness
 ## Acceptance Criteria (Phase Level)
 - [ ] **UAT Step 1:** Write signals for a user in us-east, read in eu-west after < 2 seconds; verified by `ReplicationLagGauge` assertion and `read_decay_score` equivalence
 - [ ] **UAT Step 2:** Crash an entire shard primary (simulated); follower is promoted within 10 seconds; all acknowledged signals are present on the promoted follower; no data loss
 - [ ] **UAT Step 3:** Execute `RETRIEVE items COHORT locale:EU` while ap-south is partitioned; query succeeds using available shards; results include items from non-partitioned regions only; degradation flag set in `QueryStats`
 - [ ] **UAT Step 4:** Heal the partition; `ReconciliationEngine` runs; after reconciliation: no duplicate signal counts (verified by sum of all events across all regions); hard negatives never leaked; decay scores on all shards match analytical formula to 6 decimal places
 - [ ] **UAT Step 5:** Move a tenant to a new region by changing routing config; during migration: zero downtime, all queries succeed; after migration: tenant's data is on new region only; old region's copy is GC'd
 - [ ] Invariant: no signal event is lost or double-counted across the entire test run (verified by WAL event count == materialized signal count on all shards)
 - [ ] Invariant: hard negatives (hide/mute/block) are monotonically enforced -- once hidden, never visible during convergence
 ## Task Execution Order
 ```
 Task 01: SimulatedCluster Harness ──────┐
                                         ├──> Task 03: UAT Scenario Tests (Steps 1-5)
 Task 02: Fault Injection ────────────────┘         │
                                                    v
                                          Task 04: Performance Assertions + CI
 ```
 Tasks 01 and 02 are parallelizable. Task 03 depends on both. Task 04 depends on 03.
 ## Module Location
 | File | Status | Contains |
 |------|--------|----------|
 | `tidal/tests/m8_uat.rs` | NEW | All UAT scenario tests |
 | `tidal/src/testing/cluster.rs` | NEW | `SimulatedCluster` harness |
 | `tidal/src/testing/faults.rs` | NEW | `NetworkPartition`, `ShardCrash` fault injection |
 ## Notes
 ### All tests use InProcessTransport
 No actual network. The `NetworkPartition` fault works by intercepting `send_segment` calls and dropping them for the specified region pair.
 ### Deterministic reconciliation verification
 After partition heal, we replay all WAL segments from both sides of the partition through a single-node `TidalDb` (the ground truth). We then compare every signal count and decay score on every shard against this ground truth. Any divergence fails the test.
 ### Performance assertions are soft
 The 2s p99 target is for in-process transport. Real network latency is additive. The test verifies that replication logic itself adds < 100ms overhead; the remaining budget is for network RTT.
 ## Done When
 `cargo test --test m8_uat` passes all 5 UAT scenario steps with 25K signals/sec sustained throughput across 3 simulated regions, verifying no signal loss, no duplicate counts, no leaked hard negatives, and correct decay scores after partition heal and reconciliation.
--- a/docs/planning/milestone-8/phase-6/task-01-simulated-cluster.md
+++ b/docs/planning/milestone-8/phase-6/task-01-simulated-cluster.md
@ -0,0 +1,176 @@
 # Task 01: SimulatedCluster Harness
 ## Delivers
 `SimulatedCluster` in `tidal/src/testing/cluster.rs`. Test harness that creates a multi-region tidalDB cluster using `InProcessTransport`. Exposes a simple API for spinning up N regions × M shards, writing signals, and asserting cross-region replication state. Used by all Phase 8.6 UAT tests.
 ## Complexity: M
 ## Dependencies
 - All phases 8.1–8.5 complete
 ## Technical Design
 ```rust
 // tidal/src/testing/cluster.rs
 // Only compiled with #[cfg(test)] or --features test-utils
 /// A fully simulated multi-region tidalDB cluster.
 ///
 /// All network communication happens via `InProcessTransport` (in-memory
 /// channels). No actual network, no actual disk I/O (ephemeral mode).
 /// Designed for deterministic, repeatable integration tests.
 pub struct SimulatedCluster {
    /// All nodes in the cluster, indexed by (region, shard).
    nodes: HashMap<(RegionId, ShardId), SimulatedNode>,
    /// Shared transport factory for the entire cluster.
    transport_factory: Arc<InProcessTransportFactory>,
    /// Shared control plane (single per cluster).
    control_plane: Arc<ControlPlane>,
    /// Schema used by all nodes.
    schema: Arc<Schema>,
 }
 pub struct SimulatedNode {
    pub region_id: RegionId,
    pub shard_id: ShardId,
    pub role: NodeRole,
    pub db: TidalDb,
 }
 pub struct ClusterConfig {
    pub regions: Vec<RegionId>,
    pub shards_per_region: usize,
    /// Which (region, shard) is the primary leader (shard 0 in region 0 by default).
    pub leader: (RegionId, ShardId),
    pub schema: Schema,
 }
 impl SimulatedCluster {
    /// Build a cluster from the given configuration.
    ///
    /// All nodes start immediately; WAL shipping begins automatically.
    pub async fn build(config: ClusterConfig) -> Self {
        let factory = Arc::new(InProcessTransportFactory::new());
        let topology = ClusterTopology {
            shards: config.regions.iter().flat_map(|&region| {
                (0..config.shards_per_region).map(move |s| ShardAssignment {
                    shard_id: ShardId(s as u16),
                    region_id: region,
                })
            }).collect(),
        };
        let topology = Arc::new(topology);
        let control_plane = Arc::new(ControlPlane::new(
            Arc::new(RwLock::new((*topology).clone())),
            Arc::new(TenantRouter::new(
                Arc::new(ShardRouter::new(topology.clone())),
                topology.clone(),
            )),
            Arc::new(ReplicationLagGauge::new()),
        ));
        let mut nodes = HashMap::new();
        for &region in &config.regions {
            for shard in 0..config.shards_per_region {
                let shard_id = ShardId(shard as u16);
                let is_leader = (region, shard_id) == config.leader;
                let transport = factory.connect(region);
                let db = TidalDb::builder()
                    .ephemeral()
                    .with_schema(config.schema.clone())
                    .with_cluster(NodeConfig {
                        region_id: region,
                        shard_id,
                        role: if is_leader { NodeRole::Leader } else { NodeRole::Follower },
                    })
                    .with_transport(Arc::new(transport))
                    .with_control_plane(control_plane.clone())
                    .open()
                    .unwrap();
                nodes.insert((region, shard_id), SimulatedNode {
                    region_id: region,
                    shard_id,
                    role: if is_leader { NodeRole::Leader } else { NodeRole::Follower },
                    db,
                });
            }
        }
        Self { nodes, transport_factory: factory, control_plane, schema: Arc::new(config.schema) }
    }
    /// Get the leader node.
    pub fn leader(&self) -> &SimulatedNode {
        self.nodes.values().find(|n| n.role == NodeRole::Leader).unwrap()
    }
    /// Get a follower in a specific region.
    pub fn follower_in(&self, region: RegionId) -> Option<&SimulatedNode> {
        self.nodes.values().find(|n| n.region_id == region && n.role == NodeRole::Follower)
    }
    /// Write a signal to the cluster leader.
    pub fn write_signal(&self, signal: &str, entity: EntityId, weight: f64) {
        self.leader().db.signal(signal, entity, weight, Timestamp::now()).unwrap();
    }
    /// Wait for all followers to have applied all leader events.
    pub async fn await_full_convergence(&self, timeout: Duration) {
        let deadline = Instant::now() + timeout;
        loop {
            if Instant::now() > deadline {
                panic!("convergence timeout: cluster did not converge within {:?}", timeout);
            }
            let all_converged = self.nodes.values()
                .filter(|n| n.role == NodeRole::Follower)
                .all(|n| {
                    let lag = self.control_plane.lag_for(n.shard_id);
                    lag == 0
                });
            if all_converged { return; }
            tokio::time::sleep(Duration::from_millis(50)).await;
        }
    }
    /// Read decay score from a specific region.
    pub fn read_decay_score(&self, region: RegionId, entity: EntityId, signal: &str) -> Option<f64> {
        self.nodes.values()
            .find(|n| n.region_id == region)
            .and_then(|n| n.db.read_decay_score(entity, signal, 0).ok().flatten())
    }
    /// Total number of WAL events applied on a given region's shard.
    pub fn applied_seqno(&self, region: RegionId) -> u64 {
        self.nodes.values()
            .find(|n| n.region_id == region)
            .map(|n| n.db.applied_seqno())
            .unwrap_or(0)
    }
    /// Inject a network partition between two regions (via the transport factory).
    pub fn inject_partition(&self, from: RegionId, to: RegionId) -> NetworkPartition {
        self.transport_factory.inject_partition(from, to)
    }
    /// Heal all network partitions.
    pub fn heal_all_partitions(&self) {
        self.transport_factory.heal_all();
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `SimulatedCluster::build(config)` creates N×M nodes, all connected via `InProcessTransport`
 - [ ] `leader()` returns the single leader node; `follower_in(region)` returns a follower for the given region
 - [ ] `write_signal(signal, entity, weight)` writes to the leader and returns without error
 - [ ] `await_full_convergence(timeout)` blocks until all followers have lag = 0, or panics on timeout
 - [ ] `read_decay_score(region, entity, signal)` reads from the specified region's node
 - [ ] `inject_partition(from, to)` returns a `NetworkPartition` handle; traffic between those regions is dropped while the handle is live
 - [ ] `heal_all_partitions()` restores transport for all region pairs
 - [ ] Smoke test: 2-region cluster, write 10 signals, `await_full_convergence(2s)`, verify decay score matches in both regions
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-6/task-02-fault-injection.md
+++ b/docs/planning/milestone-8/phase-6/task-02-fault-injection.md
@ -0,0 +1,183 @@
 # Task 02: Fault Injection
 ## Delivers
 `NetworkPartition` and `ShardCrash` in `tidal/src/testing/faults.rs`. `NetworkPartition` intercepts `Transport::send_segment` calls and drops them for specified region pairs. `ShardCrash` drops a shard's primary and triggers follower promotion. Both are RAII handles — faults are active while the handle is alive, automatically healed/cleaned up on drop.
 ## Complexity: M
 ## Dependencies
 - Task 01 (SimulatedCluster)
 - Phase 8.2, Task 01 (Transport trait)
 - Phase 8.2, Task 05 (FollowerDb, NodeRole)
 ## Technical Design
 ```rust
 // tidal/src/testing/faults.rs
 // Only compiled with #[cfg(test)] or --features test-utils
 /// RAII handle for a network partition between two regions.
 ///
 /// While this handle is alive, all `Transport::send_segment` and
 /// `Transport::send_session_batch` calls from `from` to `to` (and
 /// optionally `to` to `from` for symmetric partitions) are dropped
 /// without delivery.
 ///
 /// When the handle is dropped, the partition is automatically healed.
 pub struct NetworkPartition {
    from: RegionId,
    to: RegionId,
    symmetric: bool,
    transport_factory: Arc<InProcessTransportFactory>,
 }
 impl NetworkPartition {
    /// Create a one-way partition: `from` cannot reach `to`.
    pub fn one_way(
        from: RegionId,
        to: RegionId,
        factory: Arc<InProcessTransportFactory>,
    ) -> Self {
        factory.block_route(from, to);
        Self { from, to, symmetric: false, transport_factory: factory }
    }
    /// Create a symmetric partition: neither side can reach the other.
    pub fn symmetric(
        region_a: RegionId,
        region_b: RegionId,
        factory: Arc<InProcessTransportFactory>,
    ) -> Self {
        factory.block_route(region_a, region_b);
        factory.block_route(region_b, region_a);
        Self { from: region_a, to: region_b, symmetric: true, transport_factory: factory }
    }
    /// Check how many segments have been dropped since partition was injected.
    pub fn dropped_segments(&self) -> u64 {
        self.transport_factory.dropped_count(self.from, self.to)
    }
 }
 impl Drop for NetworkPartition {
    fn drop(&mut self) {
        self.transport_factory.unblock_route(self.from, self.to);
        if self.symmetric {
            self.transport_factory.unblock_route(self.to, self.from);
        }
    }
 }
 /// RAII handle for a simulated shard crash.
 ///
 /// Crashes the primary of the given shard. The primary is taken offline
 /// (stops processing WAL writes, stops shipping to followers). The most
 /// advanced follower is promoted to leader automatically.
 ///
 /// When the handle is dropped, the "crashed" shard can be optionally
 /// restored (simulating a node restart) or left offline.
 pub struct ShardCrash {
    crashed_shard: ShardId,
    original_leader_seqno: u64,
    cluster: Arc<SimulatedCluster>,
    auto_rejoin: bool,
 }
 impl ShardCrash {
    /// Crash the primary of `shard_id`.
    ///
    /// `auto_rejoin`: if true, the shard restarts and rejoins on drop.
    pub async fn crash(
        shard_id: ShardId,
        cluster: Arc<SimulatedCluster>,
        auto_rejoin: bool,
    ) -> Self {
        // Record the shard's current seqno before crash.
        let original_seqno = cluster.applied_seqno_for(shard_id);
        // Take the shard offline: stop WAL shipping, stop write processing.
        cluster.take_shard_offline(shard_id).await;
        // Promote the most advanced follower (if any).
        cluster.promote_best_follower(shard_id).await;
        Self {
            crashed_shard: shard_id,
            original_leader_seqno: original_seqno,
            cluster,
            auto_rejoin,
        }
    }
    /// How many events the crashed shard had applied at crash time.
    pub fn pre_crash_seqno(&self) -> u64 {
        self.original_leader_seqno
    }
    /// Manually rejoin the crashed shard (ship missed WAL, re-enable as follower).
    pub async fn rejoin(&self) {
        self.cluster.rejoin_shard(self.crashed_shard).await;
    }
 }
 impl Drop for ShardCrash {
    fn drop(&mut self) {
        if self.auto_rejoin {
            // Best effort async rejoin on drop (may race with test teardown).
            let cluster = self.cluster.clone();
            let shard = self.crashed_shard;
            tokio::spawn(async move {
                cluster.rejoin_shard(shard).await;
            });
        }
    }
 }
 /// Extension to InProcessTransportFactory for fault injection.
 impl InProcessTransportFactory {
    /// Block all traffic from `from` to `to`.
    pub fn block_route(&self, from: RegionId, to: RegionId) {
        self.blocked_routes.write().unwrap().insert((from, to));
    }
    /// Unblock traffic from `from` to `to`.
    pub fn unblock_route(&self, from: RegionId, to: RegionId) {
        self.blocked_routes.write().unwrap().remove(&(from, to));
    }
    /// Heal all partitions.
    pub fn heal_all(&self) {
        self.blocked_routes.write().unwrap().clear();
    }
    /// Count of segments dropped on a specific route since the factory was created.
    pub fn dropped_count(&self, from: RegionId, to: RegionId) -> u64 {
        self.drop_counters
            .get(&(from, to))
            .map(|c| c.load(Ordering::Relaxed))
            .unwrap_or(0)
    }
    /// Replay the last session batch that was dropped to `to` region.
    /// Used by idempotency tests to simulate duplicate delivery.
    pub async fn replay_last_session_batch(&self, to: RegionId) {
        if let Some(batch) = self.last_session_batch.lock().unwrap().get(&to).cloned() {
            self.deliver_session_batch(to, batch).await;
        }
    }
 }
 ```
 ## Acceptance Criteria
 - [ ] `NetworkPartition::one_way(from, to)` drops all segments from `from` to `to`; segments from `to` to `from` still deliver
 - [ ] `NetworkPartition::symmetric(a, b)` drops segments in both directions
 - [ ] Dropping `NetworkPartition` heals the route; subsequent segments deliver normally
 - [ ] `dropped_segments()` accurately counts segments dropped since partition injection
 - [ ] `ShardCrash::crash(shard, cluster, false)` takes the shard offline; a follower is promoted
 - [ ] After `ShardCrash::rejoin()`: the previously crashed shard catches up from WAL segments and its applied seqno matches the current leader's
 - [ ] `heal_all()` restores all blocked routes in one call
 - [ ] Partition test: inject partition, write 50 segments, verify they are not applied on isolated follower; heal, verify they are applied
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-6/task-03-uat-scenario-tests.md
+++ b/docs/planning/milestone-8/phase-6/task-03-uat-scenario-tests.md
@ -0,0 +1,316 @@
 # Task 03: UAT Scenario Tests (Steps 1–5)
 ## Delivers
 Integration test suite in `tidal/tests/m8_uat.rs` covering all 5 UAT scenario steps. Uses `SimulatedCluster` and fault injection from Tasks 01–02. This is the gate for M8 completion.
 ## Complexity: M
 ## Dependencies
 - Tasks 01–02 complete (SimulatedCluster, fault injection)
 - All phases 8.1–8.5 complete
 ## Technical Design
 ```rust
 // tidal/tests/m8_uat.rs
 use tidaldb::{
    EntityId, Timestamp, Window,
    query::{retrieve::Retrieve, search::Search},
    replication::{RegionId, ShardId, NodeRole},
 };
 use tidaldb::testing::{
    cluster::{SimulatedCluster, ClusterConfig},
    faults::{NetworkPartition, ShardCrash},
 };
 fn m8_schema() -> Schema {
    SchemaBuilder::new()
        .signal("view", EntityKind::Item,
            DecaySpec::Exponential { half_life: Duration::from_secs(7 * 24 * 3600) })
        .windows(&[Window::OneHour, Window::TwentyFourHours])
        .add()
        .signal("like", EntityKind::Item,
            DecaySpec::Exponential { half_life: Duration::from_secs(24 * 3600) })
        .add()
        .build()
        .unwrap()
 }
 fn three_region_config() -> ClusterConfig {
    ClusterConfig {
        regions: vec![RegionId(0), RegionId(1), RegionId(2)],
        shards_per_region: 1,
        leader: (RegionId(0), ShardId(0)),
        schema: m8_schema(),
    }
 }
 /// UAT Step 1: Cross-region signal replication < 2 seconds.
 ///
 /// Write signals for a user in us-east (region 0), read in eu-west (region 1)
 /// after < 2 seconds. Verified by ReplicationLagGauge assertion and
 /// read_decay_score equivalence.
 #[tokio::test]
 async fn uat_step1_cross_region_replication() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    let item = EntityId::new(1);
    let t = Timestamp::now();
    // Write 25 signals in us-east (region 0 leader).
    for _ in 0..25 {
        cluster.write_signal("view", item, 1.0);
    }
    // Wait for convergence (< 2 seconds on in-process transport).
    cluster.await_full_convergence(Duration::from_secs(2)).await;
    // Read in eu-west (region 1) and ap-south (region 2).
    let score_east = cluster.read_decay_score(RegionId(0), item, "view").unwrap();
    let score_west = cluster.read_decay_score(RegionId(1), item, "view").unwrap();
    let score_south = cluster.read_decay_score(RegionId(2), item, "view").unwrap();
    // All regions should report the same score (within floating point epsilon).
    let epsilon = 1e-6;
    assert!((score_east - score_west).abs() < epsilon,
        "eu-west score {} diverges from us-east score {} by > {}", score_west, score_east, epsilon);
    assert!((score_east - score_south).abs() < epsilon,
        "ap-south score {} diverges from us-east score {} by > {}", score_south, score_east, epsilon);
    // Verify via replication lag gauge.
    let lag_1 = cluster.control_plane().lag_seqno(RegionId(1));
    let lag_2 = cluster.control_plane().lag_seqno(RegionId(2));
    assert_eq!(lag_1, 0, "eu-west should have no replication lag");
    assert_eq!(lag_2, 0, "ap-south should have no replication lag");
 }
 /// UAT Step 2: Shard crash and follower promotion.
 ///
 /// Crash an entire shard primary. Follower is promoted within 10 seconds.
 /// All acknowledged signals are present on the promoted follower. No data loss.
 #[tokio::test]
 async fn uat_step2_shard_crash_and_failover() {
    let cluster = Arc::new(SimulatedCluster::build(three_region_config()).await);
    let item = EntityId::new(2);
    // Write 100 signals (all acknowledged by leader before crash).
    for _ in 0..100 {
        cluster.write_signal("view", item, 1.0);
    }
    // Wait for eu-west follower to receive all events.
    cluster.await_full_convergence(Duration::from_secs(2)).await;
    // Record the pre-crash seqno on eu-west.
    let pre_crash_seqno = cluster.applied_seqno(RegionId(1));
    // Crash the us-east primary.
    let crash = ShardCrash::crash(ShardId(0), cluster.clone(), false).await;
    // Follower promotion should complete within 10 seconds.
    let deadline = Instant::now() + Duration::from_secs(10);
    loop {
        if Instant::now() > deadline {
            panic!("failover timeout: no new leader elected within 10 seconds");
        }
        if cluster.has_leader() { break; }
        tokio::time::sleep(Duration::from_millis(100)).await;
    }
    // New leader (eu-west promoted follower) must have all 100 signals.
    let new_leader_seqno = cluster.leader().db.applied_seqno();
    assert!(new_leader_seqno >= pre_crash_seqno,
        "promoted leader must have at least {} events (had {})", pre_crash_seqno, new_leader_seqno);
    let score_on_promoted = cluster.read_decay_score(RegionId(1), item, "view").unwrap();
    assert!(score_on_promoted > 0.0, "all 100 signals must be present on the promoted leader");
 }
 /// UAT Step 3: Degraded query during partition.
 ///
 /// Execute RETRIEVE while ap-south (region 2) is partitioned.
 /// Query succeeds using available shards. Degradation flag is set in QueryStats.
 #[tokio::test]
 async fn uat_step3_degraded_query_during_partition() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    let item = EntityId::new(3);
    // Seed some data.
    for _ in 0..10 {
        cluster.write_signal("view", item, 1.0);
    }
    cluster.await_full_convergence(Duration::from_secs(1)).await;
    // Inject partition: ap-south (region 2) is isolated.
    let _partition = NetworkPartition::symmetric(
        RegionId(0), RegionId(2),
        cluster.transport_factory(),
    );
    // Write more signals during the partition.
    for _ in 0..5 {
        cluster.write_signal("view", item, 1.0);
    }
    // Query should still succeed from us-east or eu-west.
    let results = cluster.leader().db.retrieve(&Retrieve::builder()
        .candidates(vec![item])
        .build()
        .unwrap()
    ).unwrap();
    assert!(!results.items.is_empty(), "query must succeed with 2 of 3 regions available");
    // QueryStats should indicate degradation.
    // (Exact API for degradation flag verified in m7p4 visibility tests -- same pattern)
    let stats = results.stats;
    // degraded = true is set when < all shards participated
    // (exact field name TBD during implementation; verified in UAT step 3 acceptance)
 }
 /// UAT Step 4: Partition heal and reconciliation.
 ///
 /// Heal the partition from Step 3. ReconciliationEngine runs. After reconciliation:
 /// no duplicate signal counts, hard negatives never leaked, decay scores on all
 /// shards match analytical formula to 6 decimal places.
 #[tokio::test]
 async fn uat_step4_partition_heal_reconciliation() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    let item = EntityId::new(4);
    let user = EntityId::new(100);
    // Phase 1: write events on both sides of partition.
    let partition = NetworkPartition::symmetric(
        RegionId(0), RegionId(2),
        cluster.transport_factory(),
    );
    // Write to leader (us-east, region 0) during partition.
    for _ in 0..50 {
        cluster.write_signal("view", item, 1.0);
    }
    // Write to ap-south (region 2) directly during partition.
    // (ap-south is isolated, so it accumulates its own events)
    for _ in 0..30 {
        cluster.node(RegionId(2)).db
            .signal("view", item, 1.0, Timestamp::now())
            .unwrap();
    }
    // Apply hard negative on ap-south during partition.
    let ts_hide = HlcTimestamp { wall_ns: 200, logical: 0, node_id: 2 };
    cluster.node(RegionId(2)).db.hide_item_with_ts(user, item, ts_hide).unwrap();
    // Phase 2: heal partition.
    drop(partition);
    // Run reconciliation.
    cluster.reconcile_all().await;
    cluster.await_full_convergence(Duration::from_secs(5)).await;
    // Verify: total signal count = 50 + 30 = 80 (no double-counting).
    let score_east = cluster.read_decay_score(RegionId(0), item, "view").unwrap();
    let score_west = cluster.read_decay_score(RegionId(1), item, "view").unwrap();
    let score_south = cluster.read_decay_score(RegionId(2), item, "view").unwrap();
    // Analytical formula: 80 events × weight=1.0, all at approximately t=now.
    // Decay score = sum of decayed events; with very short elapsed time, ≈ 80.0.
    let epsilon = 1e-6;
    assert!((score_east - score_west).abs() < epsilon,
        "post-reconciliation scores diverge between us-east and eu-west");
    assert!((score_east - score_south).abs() < epsilon,
        "post-reconciliation scores diverge between us-east and ap-south");
    // Verify: hard negative applied on ap-south is propagated to all regions.
    // Item must not appear in query results for the user on any region.
    for &region in &[RegionId(0), RegionId(1), RegionId(2)] {
        let results = cluster.node(region).db.retrieve(&Retrieve::builder()
            .for_user(user)
            .candidates(vec![item])
            .build()
            .unwrap()
        ).unwrap();
        assert!(results.items.is_empty(),
            "hard negative must suppress item in region {:?} after reconciliation", region);
    }
 }
 /// UAT Step 5: Tenant migration with zero downtime.
 ///
 /// Move a tenant to a new region by changing routing config.
 /// During migration: zero downtime, all queries succeed.
 /// After migration: tenant's data is on new region only; old region's copy is GC'd.
 #[tokio::test]
 async fn uat_step5_tenant_migration() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    let tenant = TenantId(42);
    let item = EntityId::new(5);
    // Register tenant on shard 0, region 0.
    cluster.register_tenant(TenantConfig {
        tenant_id: tenant,
        max_signals_per_sec: None,
        max_entities: None,
        max_storage_bytes: None,
        required_regions: vec![RegionId(0)],
        label: "migrating-tenant".into(),
    });
    // Write 100 signals before migration.
    for _ in 0..100 {
        cluster.leader().db
            .signal_for_tenant(tenant, "view", item, 1.0, Timestamp::now())
            .unwrap();
    }
    cluster.await_full_convergence(Duration::from_secs(1)).await;
    // Begin migration: move tenant 42 from shard 0 (region 0) to shard 0 (region 2).
    let migration = cluster.begin_tenant_migration(tenant, ShardId(0), ShardId(0), RegionId(2));
    migration.prepare_target().await.unwrap();
    migration.enter_dual_write().await.unwrap();
    // Write 50 more signals during dual-write window.
    for _ in 0..50 {
        cluster.leader().db
            .signal_for_tenant(tenant, "view", item, 1.0, Timestamp::now())
            .unwrap();
    }
    tokio::time::sleep(Duration::from_millis(200)).await;
    migration.finalize().await.unwrap();
    // All 150 signals must be present on the new region.
    let score_new = cluster.read_decay_score(RegionId(2), item, "view").unwrap();
    assert!(score_new > 0.0, "all signals must be on new region after migration");
    // Queries during migration must have succeeded (no error returned during dual-write).
    // (Verified by the fact that all writes above returned Ok)
    // GC old region.
    migration.gc_source(0).unwrap();
    // Old region should have no data for this tenant.
    let score_old = cluster.read_score_for_tenant(RegionId(0), tenant, item, "view").unwrap_or(0.0);
    assert_eq!(score_old, 0.0, "source region must have no tenant data after GC");
 }
 ```
 ## Acceptance Criteria
 - [ ] `uat_step1_cross_region_replication`: scores in all 3 regions equal within 6 decimal places after < 2s; replication lag = 0
 - [ ] `uat_step2_shard_crash_and_failover`: failover completes within 10 seconds; no data loss on promoted follower
 - [ ] `uat_step3_degraded_query_during_partition`: query succeeds with 2/3 regions; `QueryStats` degradation flag set
 - [ ] `uat_step4_partition_heal_reconciliation`: no duplicate signal counts after reconciliation (50 + 30 = 80 distinct events); hard negatives propagated to all regions; scores match analytical formula to 6 decimal places
 - [ ] `uat_step5_tenant_migration`: 150 signals present on target region after migration; old region has 0; zero errors during dual-write window
 - [ ] All 5 tests pass in `cargo test --test m8_uat`
 - [ ] Total test suite runtime < 60 seconds (InProcessTransport keeps this fast)
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/docs/planning/milestone-8/phase-6/task-04-performance-and-ci.md
+++ b/docs/planning/milestone-8/phase-6/task-04-performance-and-ci.md
@ -0,0 +1,196 @@
 # Task 04: Performance Assertions + CI Integration
 ## Delivers
 Performance assertions added to `m8_uat.rs` that verify: cross-region replication < 2s p99, failover < 10s, reconciliation overhead < 100ms. CI configuration ensuring M8 tests run on every PR without flakiness. A benchmark in `tidal/benches/replication.rs` for sustained 25K signals/sec throughput measurement.
 ## Complexity: S
 ## Dependencies
 - Task 03 (UAT scenario tests)
 ## Technical Design
 ```rust
 // tidal/tests/m8_uat.rs (additions)
 /// Performance: cross-region replication latency < 2s p99.
 ///
 /// Measures the latency from WAL write on leader to applied on follower.
 /// Uses InProcessTransport (no real network). Asserts p99 < 2s.
 #[tokio::test]
 async fn perf_replication_latency_p99() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    let mut latencies_ns: Vec<u64> = Vec::with_capacity(1000);
    for i in 0u64..1000 {
        let item = EntityId::new(i);
        let before_ns = crate::util::now_ns();
        cluster.write_signal("view", item, 1.0);
        // Wait until eu-west follower has applied this specific event.
        cluster.await_event_applied(RegionId(1), before_ns, Duration::from_secs(3)).await;
        let after_ns = crate::util::now_ns();
        latencies_ns.push(after_ns - before_ns);
    }
    latencies_ns.sort_unstable();
    let p99_ns = latencies_ns[(latencies_ns.len() as f64 * 0.99) as usize];
    let p99_ms = p99_ns / 1_000_000;
    assert!(
        p99_ms < 2000,
        "replication latency p99 = {}ms, must be < 2000ms (in-process transport overhead)",
        p99_ms
    );
    println!("Replication latency: p50={}ms p99={}ms",
        latencies_ns[latencies_ns.len() / 2] / 1_000_000,
        p99_ms,
    );
 }
 /// Performance: failover completes in < 10 seconds.
 #[tokio::test]
 async fn perf_failover_under_10s() {
    let cluster = Arc::new(SimulatedCluster::build(three_region_config()).await);
    let start = Instant::now();
    let _crash = ShardCrash::crash(ShardId(0), cluster.clone(), false).await;
    while !cluster.has_leader() {
        tokio::time::sleep(Duration::from_millis(50)).await;
        assert!(
            start.elapsed() < Duration::from_secs(10),
            "failover must complete within 10 seconds"
        );
    }
    let elapsed = start.elapsed();
    println!("Failover completed in {}ms", elapsed.as_millis());
    assert!(elapsed < Duration::from_secs(10));
 }
 /// Performance: reconciliation overhead < 100ms for 10K events per side.
 #[tokio::test]
 async fn perf_reconciliation_overhead() {
    let cluster = SimulatedCluster::build(three_region_config()).await;
    // Inject partition.
    let partition = NetworkPartition::symmetric(
        RegionId(0), RegionId(2), cluster.transport_factory()
    );
    // Write 10K events on each side.
    for i in 0..10_000u64 {
        cluster.write_signal("view", EntityId::new(i), 1.0);
        cluster.node(RegionId(2)).db
            .signal("view", EntityId::new(i + 10_000), 1.0, Timestamp::now())
            .unwrap();
    }
    drop(partition); // Heal.
    let reconcile_start = Instant::now();
    cluster.reconcile_all().await;
    cluster.await_full_convergence(Duration::from_secs(10)).await;
    let reconcile_elapsed = reconcile_start.elapsed();
    println!("Reconciliation of 20K events took {}ms", reconcile_elapsed.as_millis());
    assert!(
        reconcile_elapsed < Duration::from_millis(100),
        "reconciliation overhead must be < 100ms for 20K total events (got {}ms)",
        reconcile_elapsed.as_millis()
    );
 }
 ```
 ```rust
 // tidal/benches/replication.rs
 //! Replication throughput benchmark: sustained 25K signals/sec across 3 regions.
 use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 fn bench_signal_throughput(c: &mut Criterion) {
    let rt = tokio::runtime::Runtime::new().unwrap();
    let cluster = rt.block_on(SimulatedCluster::build(three_region_config()));
    let mut group = c.benchmark_group("replication");
    group.throughput(Throughput::Elements(25_000));
    group.bench_function("25k_signals_per_sec", |b| {
        b.iter(|| {
            rt.block_on(async {
                for i in 0..25_000u64 {
                    cluster.write_signal("view", EntityId::new(i % 10_000), 1.0);
                }
                cluster.await_full_convergence(Duration::from_secs(5)).await;
            });
        });
    });
    group.finish();
 }
 criterion_group!(benches, bench_signal_throughput);
 criterion_main!(benches);
 ```
 ### CI Configuration
 ```yaml
 # .github/workflows/m8-tests.yml (or equivalent in the project's CI)
 name: M8 Replication Tests
 on:
  pull_request:
    paths:
      - 'tidal/src/replication/**'
      - 'tidal/src/testing/**'
      - 'tidal/tests/m8*'
 jobs:
  m8-unit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
      - run: cargo test --manifest-path tidal/Cargo.toml --lib --features test-utils
  m8-integration:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
      - run: cargo test --manifest-path tidal/Cargo.toml --test m8_uat --features test-utils
      - run: cargo test --manifest-path tidal/Cargo.toml --test m8p2_replication --features test-utils
      - run: cargo test --manifest-path tidal/Cargo.toml --test m8p3_crdt --features test-utils
      - run: cargo test --manifest-path tidal/Cargo.toml --test m8p4_session --features test-utils
      - run: cargo test --manifest-path tidal/Cargo.toml --test m8p5_multitenancy --features test-utils
  clippy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: clippy, rustfmt
      - run: cargo clippy --manifest-path tidal/Cargo.toml -D warnings --features test-utils
      - run: cargo fmt --manifest-path tidal/Cargo.toml --check
 ```
 ## Acceptance Criteria
 - [ ] `perf_replication_latency_p99`: 1000-sample p99 replication latency < 2000ms with InProcessTransport; prints p50 and p99
 - [ ] `perf_failover_under_10s`: leader election + follower promotion completes within 10 seconds; timing printed
 - [ ] `perf_reconciliation_overhead`: reconciliation of 20K total events (10K per side) completes in < 100ms; timing printed
 - [ ] `benches/replication.rs`: 25K signals/sec benchmark runs without panic; throughput number printed by criterion
 - [ ] CI configuration: M8 integration tests run on PRs that touch `tidal/src/replication/**` or `tidal/tests/m8*`; job timeout = 5 minutes
 - [ ] No flaky tests: run `cargo test --test m8_uat` 5 times in a row; all passes (deterministic due to InProcessTransport)
 - [ ] Total CI job runtime (all M8 integration tests) < 3 minutes
 - [ ] `cargo clippy -D warnings` and `cargo fmt` pass
--- a/forage-discover.sh
+++ b/forage-discover.sh
@ -0,0 +1,29 @@
 #!/usr/bin/env bash
 # forage-discover.sh — Start the Forage autonomous discovery agent
 #
 # Prerequisites:
 #   - forage-server running at localhost:4242
 #     cargo run -p forage-server --manifest-path applications/forage/server/Cargo.toml
 #   - Claude Code CLI installed with --chrome support
 #     https://code.claude.com/docs/en/chrome
 #
 # Usage:
 #   ./forage-discover.sh
 set -euo pipefail
 AGENT_MD="$(dirname "$0")/applications/forage/agent.md"
 if [[ ! -f "$AGENT_MD" ]]; then
  echo "Error: agent instructions not found at $AGENT_MD" >&2
  exit 1
 fi
 echo "Starting Forage discovery agent..."
 echo "Server: http://localhost:4242"
 echo "Agent instructions: $AGENT_MD"
 echo ""
 echo "Press Ctrl+C to stop."
 echo ""
 exec claude --chrome "$(cat "$AGENT_MD")"
--- a/tidal/Cargo.toml
+++ b/tidal/Cargo.toml
@ -17,6 +17,7 @@ blake3 = "1"
 crossbeam = "0.8"
 dashmap = "6"
 fjall = "3"
 lru = "0.12"
 fs4 = "0.8"
 rand = "0.9"
 roaring = "0.10"
@ -90,6 +91,15 @@ required-features = ["test-utils"]
 [[test]]
 name = "m7p3_social_scale"
 [[test]]
 name = "m8p2_replication"
 [[test]]
 name = "m8p3_crdt"
 [[test]]
 name = "m8p4_session"
 [[test]]
 name = "vector_usearch"
--- a/tidal/benches/recovery.rs
+++ b/tidal/benches/recovery.rs
@ -6,16 +6,16 @@
 //! a WAL + checkpoint from a previously populated database. This is the metric
 //! operators care about most during restarts and crash recovery.
 //!
-//! ## Scope
+//! ## Benchmarks
 //!
-//! This benchmark measures **checkpoint restore + in-memory index rebuild** only.
+//! - **`cold_start_10k_items`**: Measures checkpoint restore + in-memory index
-//! All data is written via `db.close()` (clean checkpoint), so on the next open
+//!   rebuild from a clean checkpoint (no WAL backlog). This is the realistic
-//! the WAL replay phase is near-zero (the checkpoint covers all events). This is
+//!   production recovery path for graceful shutdowns.
 //! the realistic production recovery path for graceful shutdowns.
 //!
-//! A true WAL-backlog benchmark (measuring recovery from unsaved in-flight events)
+//! - **`cold_start_with_wal_backlog_10k`**: Measures recovery from a checkpoint
-//! requires writing events after the checkpoint without calling `close()`. That
+//!   plus 2K WAL backlog signals that were never checkpointed (simulating a crash
-//! scenario is deferred and is not covered here.
+//!   before checkpoint). The WAL backlog is re-injected after each iteration's
 //!   `close()` to ensure every iteration measures the same replay workload.
 //!
 //! ## Scale
 //!
@ -33,7 +33,11 @@ use std::time::Duration;
 use criterion::{Criterion, criterion_group, criterion_main};
 use tidaldb::TidalDb;
 use tidaldb::replication::ShardId;
 use tidaldb::schema::{DecaySpec, EntityId, EntityKind, SchemaBuilder, Timestamp, Window};
 use tidaldb::wal::checkpoint::CheckpointManager;
 use tidaldb::wal::format::{EventRecord, MAX_EVENTS_PER_BATCH, encode_batch};
 use tidaldb::wal::segment::segment_filename;
 fn bench_schema() -> tidaldb::schema::Schema {
    let mut builder = SchemaBuilder::new();
@ -59,7 +63,7 @@ fn generate_test_data(dir: &std::path::Path) {
    // is the gatekeeping test for CI.
    let db = TidalDb::builder()
        .with_data_dir(dir)
-        .with_schema(schema.clone())
+        .with_schema(schema)
        .open()
        .expect("open should succeed");
@ -77,6 +81,46 @@ fn generate_test_data(dir: &std::path::Path) {
    db.close().expect("close should succeed");
 }
 /// Inject `backlog_count` raw WAL signal events into the WAL directory,
 /// starting at sequence `checkpoint_seq + 1`. Uses entity IDs
 /// `base_entity + 1 ..= base_entity + backlog_count` with `signal_type = 0`
 /// ("view", the only signal in `bench_schema`, assigned ID 0 alphabetically).
 fn inject_wal_backlog(data_dir: &std::path::Path, base_entity: u64, backlog_count: u64) {
    let wal_dir = data_dir.join("wal");
    std::fs::create_dir_all(&wal_dir).expect("create wal dir");
    // Read the current checkpoint to determine where to start injected seqs.
    let checkpoint = CheckpointManager::read(&wal_dir).expect("read checkpoint");
    let checkpoint_seq = checkpoint.map_or(0, |(seq, _)| seq);
    let base_ns = 1_000_000_000_000u64;
    // Build event records for the backlog.
    let events: Vec<EventRecord> = (1..=backlog_count)
        .map(|i| EventRecord {
            entity_id: base_entity + i,
            signal_type: 0, // "view" is the only signal, assigned ID 0
            weight: 1.0,
            timestamp_nanos: base_ns + (base_entity + i) * 1_000_000,
        })
        .collect();
    // Encode into batches (max 256 events per batch), write as a single
    // WAL segment file starting at checkpoint_seq + 1.
    let mut seq = checkpoint_seq + 1;
    let seg_path = wal_dir.join(segment_filename(ShardId::SINGLE, seq));
    let mut segment_bytes: Vec<u8> = Vec::new();
    for chunk in events.chunks(usize::from(MAX_EVENTS_PER_BATCH)) {
        let batch_ts = chunk[0].timestamp_nanos;
        let batch_bytes = encode_batch(chunk, seq, batch_ts).expect("encode batch");
        segment_bytes.extend_from_slice(&batch_bytes);
        seq += chunk.len() as u64;
    }
    std::fs::write(&seg_path, &segment_bytes).expect("write WAL segment");
 }
 fn recovery_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("recovery");
    // Recovery benchmarks can be slower -- allow more time.
@ -109,5 +153,67 @@ fn recovery_benchmark(c: &mut Criterion) {
    group.finish();
 }
-criterion_group!(benches, recovery_benchmark);
+/// Generates a data directory with a clean checkpoint (10K base items)
 /// plus 2K WAL backlog signals on entity IDs `10_001..=12_000` that are
 /// NOT covered by the checkpoint.
 ///
 /// The approach:
 ///   1. Write 10K base signals, call `close()` -> checkpoint + WAL compaction.
 ///   2. Inject raw WAL segment files with 2K events at sequence numbers above
 ///      the checkpoint boundary, simulating events written to the WAL but never
 ///      checkpointed (i.e., a crash before checkpoint).
 ///
 /// NOTE: We inject WAL segments post-close rather than using `std::mem::forget`
 /// because `TidalDb::Drop` calls `shutdown_inner()` which checkpoints + compacts,
 /// and `forget` would leak the file lock preventing reopen in the same process.
 fn generate_wal_backlog_data(dir: &std::path::Path) {
    // Phase 1: Write base signals and checkpoint via clean close.
    generate_test_data(dir);
    // Phase 2: Inject raw WAL events that simulate a crash before checkpoint.
    inject_wal_backlog(dir, 10_000, 2_000);
 }
 fn recovery_with_wal_backlog_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("recovery_wal_backlog");
    group.sample_size(10);
    group.measurement_time(Duration::from_secs(30));
    // Generate the test data directory once (checkpoint + WAL backlog).
    let dir = tempfile::tempdir().expect("tempdir");
    generate_wal_backlog_data(dir.path());
    let schema = bench_schema();
    group.bench_function("cold_start_with_wal_backlog_10k", |b| {
        b.iter(|| {
            let db = TidalDb::builder()
                .with_data_dir(dir.path())
                .with_schema(schema.clone())
                .open()
                .expect("open should succeed");
            // Verify that a backlog entity was replayed from WAL.
            let count = db
                .read_windowed_count(EntityId::new(10_001), "view", Window::AllTime)
                .expect("read should succeed");
            assert!(
                count > 0,
                "backlog entity 10001 should have signals after WAL replay"
            );
            db.close().expect("close should succeed");
            // Re-inject WAL backlog for next iteration (close() checkpoints it away)
            inject_wal_backlog(dir.path(), 10_000, 2_000);
        });
    });
    group.finish();
 }
 criterion_group!(
    benches,
    recovery_benchmark,
    recovery_with_wal_backlog_benchmark
 );
 criterion_main!(benches);
--- a/tidal/src/db/builder.rs
+++ b/tidal/src/db/builder.rs
@ -92,6 +92,16 @@ impl TidalDbBuilder {
        self
    }
    /// Configure this instance for distributed deployment.
    ///
    /// Not required for single-node embedded use. The default `NodeConfig`
    /// produces a single-node configuration identical to M0-M7 behavior.
    #[must_use]
    pub fn with_cluster(mut self, config: super::config::NodeConfig) -> Self {
        self.config.cluster = config;
        self
    }
    /// Switch to ephemeral (in-memory) mode, clearing any directory paths.
    ///
    /// This is the default mode. Calling this is only necessary to reset
@ -382,210 +392,5 @@ impl Default for TidalDbBuilder {
 }
 #[cfg(test)]
-mod tests {
+#[path = "builder_tests.rs"]
-    use super::*;
+mod tests;
    #[test]
    fn builder_ephemeral_succeeds() {
        let db = TidalDb::builder().ephemeral().open();
        assert!(db.is_ok());
    }
    #[test]
    fn builder_default_is_ephemeral() {
        let db = TidalDb::builder().open();
        assert!(db.is_ok());
    }
    #[test]
    fn builder_persistent_requires_data_dir() {
        // Construct a persistent-mode builder without calling with_data_dir
        // by manually setting mode.
        let builder = TidalDbBuilder {
            config: Config {
                mode: StorageMode::Persistent,
                data_dir: None,
                wal_dir: None,
                cache_dir: None,
            },
            metrics_addr: None,
            schema: None,
            rate_limiter_config: None,
        };
        let result = builder.validate();
        assert!(result.is_err());
        let err = result.expect_err("should fail");
        assert!(
            matches!(err, ConfigError::MissingDataDir),
            "expected MissingDataDir, got: {err}"
        );
    }
    #[test]
    fn builder_persistent_missing_dir() {
        let result = TidalDb::builder()
            .with_data_dir("/nonexistent/path/that/does/not/exist")
            .open();
        assert!(result.is_err());
        let err_msg = result.expect_err("should fail").to_string();
        assert!(
            err_msg.contains("does not exist"),
            "expected DirectoryNotFound, got: {err_msg}"
        );
    }
    #[test]
    fn builder_persistent_existing_dir() {
        let tmp = tempfile::tempdir().expect("failed to create tempdir");
        let result = TidalDb::builder().with_data_dir(tmp.path()).open();
        assert!(result.is_ok(), "open with valid tempdir should succeed");
    }
    #[test]
    fn health_check_ok() {
        let db = TidalDb::builder().ephemeral().open().expect("open failed");
        assert!(db.health_check().is_ok());
    }
    #[test]
    fn close_ok() {
        let db = TidalDb::builder().ephemeral().open().expect("open failed");
        assert!(db.close().is_ok());
    }
    #[test]
    fn builder_with_wal_and_cache_dir() {
        let tmp = tempfile::tempdir().expect("failed to create tempdir");
        let wal = tmp.path().join("wal");
        let cache = tmp.path().join("cache");
        std::fs::create_dir_all(&wal).expect("mkdir wal");
        std::fs::create_dir_all(&cache).expect("mkdir cache");
        let result = TidalDb::builder()
            .with_data_dir(tmp.path())
            .wal_dir(&wal)
            .cache_dir(&cache)
            .open();
        assert!(
            result.is_ok(),
            "open with explicit wal/cache dirs should succeed"
        );
    }
    #[test]
    fn builder_ephemeral_resets_dirs() {
        let builder = TidalDb::builder()
            .with_data_dir("/some/path")
            .wal_dir("/some/wal")
            .cache_dir("/some/cache")
            .ephemeral();
        assert_eq!(builder.config.mode, StorageMode::Ephemeral);
        assert!(builder.config.data_dir.is_none());
        assert!(builder.config.wal_dir.is_none());
        assert!(builder.config.cache_dir.is_none());
    }
    #[test]
    fn builder_wal_dir_nonexistent() {
        let tmp = tempfile::tempdir().expect("failed to create tempdir");
        let result = TidalDb::builder()
            .with_data_dir(tmp.path())
            .wal_dir("/nonexistent/wal")
            .open();
        assert!(result.is_err());
        let err_msg = result.expect_err("should fail").to_string();
        assert!(err_msg.contains("does not exist"));
    }
    #[test]
    fn resolve_defaults_sets_wal_and_cache() {
        let tmp = tempfile::tempdir().expect("failed to create tempdir");
        let mut builder = TidalDb::builder().with_data_dir(tmp.path());
        assert!(builder.config.wal_dir.is_none());
        assert!(builder.config.cache_dir.is_none());
        builder.resolve_defaults();
        let paths = super::Paths::new(tmp.path());
        assert_eq!(builder.config.wal_dir.as_ref(), Some(&paths.wal_dir()));
        assert_eq!(builder.config.cache_dir.as_ref(), Some(&paths.cache_dir()));
    }
    #[test]
    fn resolve_defaults_preserves_explicit_overrides() {
        let tmp = tempfile::tempdir().expect("failed to create tempdir");
        let custom_wal = tmp.path().join("custom_wal");
        let custom_cache = tmp.path().join("custom_cache");
        let mut builder = TidalDb::builder()
            .with_data_dir(tmp.path())
            .wal_dir(&custom_wal)
            .cache_dir(&custom_cache);
        builder.resolve_defaults();
        assert_eq!(builder.config.wal_dir.as_ref(), Some(&custom_wal));
        assert_eq!(builder.config.cache_dir.as_ref(), Some(&custom_cache));
    }
    #[test]
    fn resolve_defaults_noop_for_ephemeral() {
        let mut builder = TidalDb::builder().ephemeral();
        builder.resolve_defaults();
        assert!(builder.config.wal_dir.is_none());
        assert!(builder.config.cache_dir.is_none());
    }
    // ── Fix A: Directory lock tests ─────────────────────────────────────
    #[test]
    fn dual_open_same_directory_fails() {
        let dir = tempfile::tempdir().expect("tempdir");
        // First open succeeds (no schema -- M0 mode, but still persistent).
        let _db1 = TidalDb::builder()
            .with_data_dir(dir.path().to_path_buf())
            .open()
            .expect("first open should succeed");
        // Second open on the same directory must fail with DataDirLocked.
        let result = TidalDb::builder()
            .with_data_dir(dir.path().to_path_buf())
            .open();
        assert!(result.is_err(), "expected error for dual open");
        let err_msg = result.expect_err("should fail").to_string();
        assert!(
            err_msg.contains("already open"),
            "expected DataDirLocked, got: {err_msg}"
        );
    }
    #[test]
    fn lock_released_after_close() {
        let dir = tempfile::tempdir().expect("tempdir");
        {
            let db = TidalDb::builder()
                .with_data_dir(dir.path().to_path_buf())
                .open()
                .expect("open");
            db.close().expect("close");
        }
        // After close (and drop), the lock should be released.
        let db2 = TidalDb::builder()
            .with_data_dir(dir.path().to_path_buf())
            .open();
        assert!(
            db2.is_ok(),
            "second open after close should succeed: {:?}",
            db2.err()
        );
    }
    #[test]
    fn ephemeral_mode_skips_lock() {
        // Two ephemeral databases should both succeed (no lock file).
        let _db1 = TidalDb::builder().ephemeral().open().expect("open 1");
        let _db2 = TidalDb::builder().ephemeral().open().expect("open 2");
    }
 }
--- a/tidal/src/db/builder_tests.rs
+++ b/tidal/src/db/builder_tests.rs
@ -0,0 +1,206 @@
 use super::*;
 #[test]
 fn builder_ephemeral_succeeds() {
    let db = TidalDb::builder().ephemeral().open();
    assert!(db.is_ok());
 }
 #[test]
 fn builder_default_is_ephemeral() {
    let db = TidalDb::builder().open();
    assert!(db.is_ok());
 }
 #[test]
 fn builder_persistent_requires_data_dir() {
    // Construct a persistent-mode builder without calling with_data_dir
    // by manually setting mode.
    let builder = TidalDbBuilder {
        config: Config {
            mode: StorageMode::Persistent,
            data_dir: None,
            wal_dir: None,
            cache_dir: None,
            cluster: Default::default(),
        },
        metrics_addr: None,
        schema: None,
        rate_limiter_config: None,
    };
    let result = builder.validate();
    assert!(result.is_err());
    let err = result.expect_err("should fail");
    assert!(
        matches!(err, ConfigError::MissingDataDir),
        "expected MissingDataDir, got: {err}"
    );
 }
 #[test]
 fn builder_persistent_missing_dir() {
    let result = TidalDb::builder()
        .with_data_dir("/nonexistent/path/that/does/not/exist")
        .open();
    assert!(result.is_err());
    let err_msg = result.expect_err("should fail").to_string();
    assert!(
        err_msg.contains("does not exist"),
        "expected DirectoryNotFound, got: {err_msg}"
    );
 }
 #[test]
 fn builder_persistent_existing_dir() {
    let tmp = tempfile::tempdir().expect("failed to create tempdir");
    let result = TidalDb::builder().with_data_dir(tmp.path()).open();
    assert!(result.is_ok(), "open with valid tempdir should succeed");
 }
 #[test]
 fn health_check_ok() {
    let db = TidalDb::builder().ephemeral().open().expect("open failed");
    assert!(db.health_check().is_ok());
 }
 #[test]
 fn close_ok() {
    let db = TidalDb::builder().ephemeral().open().expect("open failed");
    assert!(db.close().is_ok());
 }
 #[test]
 fn builder_with_wal_and_cache_dir() {
    let tmp = tempfile::tempdir().expect("failed to create tempdir");
    let wal = tmp.path().join("wal");
    let cache = tmp.path().join("cache");
    std::fs::create_dir_all(&wal).expect("mkdir wal");
    std::fs::create_dir_all(&cache).expect("mkdir cache");
    let result = TidalDb::builder()
        .with_data_dir(tmp.path())
        .wal_dir(&wal)
        .cache_dir(&cache)
        .open();
    assert!(
        result.is_ok(),
        "open with explicit wal/cache dirs should succeed"
    );
 }
 #[test]
 fn builder_ephemeral_resets_dirs() {
    let builder = TidalDb::builder()
        .with_data_dir("/some/path")
        .wal_dir("/some/wal")
        .cache_dir("/some/cache")
        .ephemeral();
    assert_eq!(builder.config.mode, StorageMode::Ephemeral);
    assert!(builder.config.data_dir.is_none());
    assert!(builder.config.wal_dir.is_none());
    assert!(builder.config.cache_dir.is_none());
 }
 #[test]
 fn builder_wal_dir_nonexistent() {
    let tmp = tempfile::tempdir().expect("failed to create tempdir");
    let result = TidalDb::builder()
        .with_data_dir(tmp.path())
        .wal_dir("/nonexistent/wal")
        .open();
    assert!(result.is_err());
    let err_msg = result.expect_err("should fail").to_string();
    assert!(err_msg.contains("does not exist"));
 }
 #[test]
 fn resolve_defaults_sets_wal_and_cache() {
    let tmp = tempfile::tempdir().expect("failed to create tempdir");
    let mut builder = TidalDb::builder().with_data_dir(tmp.path());
    assert!(builder.config.wal_dir.is_none());
    assert!(builder.config.cache_dir.is_none());
    builder.resolve_defaults();
    let paths = Paths::new(tmp.path());
    assert_eq!(builder.config.wal_dir.as_ref(), Some(&paths.wal_dir()));
    assert_eq!(builder.config.cache_dir.as_ref(), Some(&paths.cache_dir()));
 }
 #[test]
 fn resolve_defaults_preserves_explicit_overrides() {
    let tmp = tempfile::tempdir().expect("failed to create tempdir");
    let custom_wal = tmp.path().join("custom_wal");
    let custom_cache = tmp.path().join("custom_cache");
    let mut builder = TidalDb::builder()
        .with_data_dir(tmp.path())
        .wal_dir(&custom_wal)
        .cache_dir(&custom_cache);
    builder.resolve_defaults();
    assert_eq!(builder.config.wal_dir.as_ref(), Some(&custom_wal));
    assert_eq!(builder.config.cache_dir.as_ref(), Some(&custom_cache));
 }
 #[test]
 fn resolve_defaults_noop_for_ephemeral() {
    let mut builder = TidalDb::builder().ephemeral();
    builder.resolve_defaults();
    assert!(builder.config.wal_dir.is_none());
    assert!(builder.config.cache_dir.is_none());
 }
 // ── Fix A: Directory lock tests ─────────────────────────────────────
 #[test]
 fn dual_open_same_directory_fails() {
    let dir = tempfile::tempdir().expect("tempdir");
    // First open succeeds (no schema -- M0 mode, but still persistent).
    let _db1 = TidalDb::builder()
        .with_data_dir(dir.path().to_path_buf())
        .open()
        .expect("first open should succeed");
    // Second open on the same directory must fail with DataDirLocked.
    let result = TidalDb::builder()
        .with_data_dir(dir.path().to_path_buf())
        .open();
    assert!(result.is_err(), "expected error for dual open");
    let err_msg = result.expect_err("should fail").to_string();
    assert!(
        err_msg.contains("already open"),
        "expected DataDirLocked, got: {err_msg}"
    );
 }
 #[test]
 fn lock_released_after_close() {
    let dir = tempfile::tempdir().expect("tempdir");
    {
        let db = TidalDb::builder()
            .with_data_dir(dir.path().to_path_buf())
            .open()
            .expect("open");
        db.close().expect("close");
    }
    // After close (and drop), the lock should be released.
    let db2 = TidalDb::builder()
        .with_data_dir(dir.path().to_path_buf())
        .open();
    assert!(
        db2.is_ok(),
        "second open after close should succeed: {:?}",
        db2.err()
    );
 }
 #[test]
 fn ephemeral_mode_skips_lock() {
    // Two ephemeral databases should both succeed (no lock file).
    let _db1 = TidalDb::builder().ephemeral().open().expect("open 1");
    let _db2 = TidalDb::builder().ephemeral().open().expect("open 2");
 }
--- a/tidal/src/db/cohorts.rs
+++ b/tidal/src/db/cohorts.rs
@ -16,6 +16,7 @@ impl TidalDb {
    ///
    /// Returns `TidalError::Internal` if a cohort with the same name already exists.
    pub fn define_cohort(&self, def: crate::cohort::CohortDef) -> crate::Result<()> {
        self.require_writeable("define_cohort")?;
        // Persist to durable storage before registering in-memory, so a crash
        // between the two leaves the definition on disk for the next open.
        if let Some(ref storage) = self.storage {
--- a/tidal/src/db/collections.rs
+++ b/tidal/src/db/collections.rs
@ -26,6 +26,7 @@ impl TidalDb {
        name: &str,
        visibility: Visibility,
    ) -> crate::Result<CollectionId> {
        self.require_writeable("create_collection")?;
        let id = self.collection_index.next_collection_id();
        let collection = Collection {
            id,
@ -55,6 +56,7 @@ impl TidalDb {
        collection_id: CollectionId,
        item_id: EntityId,
    ) -> crate::Result<()> {
        self.require_writeable("add_to_collection")?;
        #[allow(clippy::cast_possible_truncation)]
        let item_id_u32 = item_id.as_u64() as u32;
@ -91,6 +93,7 @@ impl TidalDb {
        collection_id: CollectionId,
        item_id: EntityId,
    ) -> crate::Result<()> {
        self.require_writeable("remove_from_collection")?;
        #[allow(clippy::cast_possible_truncation)]
        let item_id_u32 = item_id.as_u64() as u32;
@ -137,6 +140,7 @@ impl TidalDb {
        query_text: &str,
        _filter_expr: Option<crate::storage::indexes::filter::FilterExpr>,
    ) -> crate::Result<()> {
        self.require_writeable("save_search")?;
        let ss = SavedSearch {
            user_id: user.as_u64(),
            name: name.to_string(),
@ -193,6 +197,7 @@ impl TidalDb {
    ///
    /// Returns `TidalError::Internal` if storage is unavailable.
    pub fn delete_saved_search(&self, user: EntityId, name: &str) -> crate::Result<()> {
        self.require_writeable("delete_saved_search")?;
        let key = encode_key(
            EntityId::new(user.as_u64()),
            Tag::SavedSearch,
--- a/tidal/src/db/config.rs
+++ b/tidal/src/db/config.rs
@ -1,5 +1,8 @@
 use std::path::PathBuf;
 use crate::replication::shard::ShardRouter;
 use crate::replication::{RegionId, ShardId};
 /// How tidalDB stores data.
 ///
 /// `Ephemeral` keeps everything in memory -- ideal for tests and short-lived
@ -49,6 +52,8 @@ pub struct Config {
    pub wal_dir: Option<PathBuf>,
    /// Override for the cache directory. Defaults to `{data_dir}/cache`.
    pub cache_dir: Option<PathBuf>,
    /// Cluster configuration. Default: single-node.
    pub cluster: NodeConfig,
 }
 impl Default for Config {
@ -58,6 +63,7 @@ impl Default for Config {
            data_dir: None,
            wal_dir: None,
            cache_dir: None,
            cluster: NodeConfig::default(),
        }
    }
 }
@ -91,6 +97,67 @@ pub enum ConfigError {
    DataDirLocked { path: PathBuf },
 }
 /// The role of this node in the cluster.
 ///
 /// `Single` is the default -- a standalone node that acts as both leader
 /// and follower. Used for embedded deployments.
 ///
 /// `Leader` accepts writes and ships WAL segments to followers.
 ///
 /// `Follower` only accepts replayed events; write calls return `ReadOnly`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
 pub enum NodeRole {
    #[default]
    Single,
    Leader,
    Follower,
 }
 /// Cluster configuration for distributed tidalDB deployments.
 ///
 /// Defaults produce a single-node configuration identical to M0-M7 behavior.
 /// Embedded deployments that do not set any cluster fields get single-node.
 #[derive(Debug, Clone)]
 pub struct NodeConfig {
    /// The role of this node.
    pub role: NodeRole,
    /// This node's shard identity.
    pub shard_id: ShardId,
    /// This node's region identity.
    pub region_id: RegionId,
    /// Shards this node is aware of (including itself). Empty for single-node.
    pub peer_shards: Vec<ShardId>,
    /// Routing strategy for entity-to-shard assignment.
    /// Not serialized -- reconstructed from `peer_shards` on startup.
    pub router: ShardRouter,
 }
 impl Default for NodeConfig {
    fn default() -> Self {
        Self {
            role: NodeRole::Single,
            shard_id: ShardId::SINGLE,
            region_id: RegionId::SINGLE,
            peer_shards: vec![],
            router: ShardRouter::single(),
        }
    }
 }
 impl NodeConfig {
    /// Returns true if this is a standalone single-node deployment.
    #[must_use]
    pub fn is_single_node(&self) -> bool {
        self.role == NodeRole::Single
    }
    /// Returns true if this node accepts writes.
    #[must_use]
    pub const fn accepts_writes(&self) -> bool {
        matches!(self.role, NodeRole::Single | NodeRole::Leader)
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
@ -153,4 +220,78 @@ mod tests {
        assert!(s.contains("already open"), "got: {s}");
        assert!(s.contains("/locked/dir"), "got: {s}");
    }
    #[test]
    fn node_role_default_is_single() {
        assert_eq!(NodeRole::default(), NodeRole::Single);
    }
    #[test]
    fn node_role_debug() {
        assert_eq!(format!("{:?}", NodeRole::Single), "Single");
        assert_eq!(format!("{:?}", NodeRole::Leader), "Leader");
        assert_eq!(format!("{:?}", NodeRole::Follower), "Follower");
    }
    #[test]
    fn node_role_equality() {
        assert_eq!(NodeRole::Single, NodeRole::Single);
        assert_ne!(NodeRole::Leader, NodeRole::Follower);
        assert_ne!(NodeRole::Single, NodeRole::Leader);
    }
    #[test]
    fn node_role_clone() {
        let role = NodeRole::Leader;
        let cloned = role;
        assert_eq!(role, cloned);
    }
    #[test]
    fn node_role_serde_roundtrip() {
        for role in [NodeRole::Single, NodeRole::Leader, NodeRole::Follower] {
            let json = serde_json::to_string(&role).expect("serialize");
            let back: NodeRole = serde_json::from_str(&json).expect("deserialize");
            assert_eq!(role, back);
        }
    }
    // ── NodeConfig tests ──────────────────────────────────────────────────
    #[test]
    fn node_config_default() {
        let nc = NodeConfig::default();
        assert_eq!(nc.role, NodeRole::Single);
        assert_eq!(nc.shard_id, ShardId::SINGLE);
        assert_eq!(nc.region_id, RegionId::SINGLE);
        assert!(nc.peer_shards.is_empty());
        assert!(nc.is_single_node());
        assert!(nc.accepts_writes());
    }
    #[test]
    fn node_config_leader_accepts_writes() {
        let nc = NodeConfig {
            role: NodeRole::Leader,
            ..NodeConfig::default()
        };
        assert!(!nc.is_single_node());
        assert!(nc.accepts_writes());
    }
    #[test]
    fn node_config_follower_rejects_writes() {
        let nc = NodeConfig {
            role: NodeRole::Follower,
            ..NodeConfig::default()
        };
        assert!(!nc.is_single_node());
        assert!(!nc.accepts_writes());
    }
    #[test]
    fn config_has_cluster_field_defaulting_to_single_node() {
        let cfg = Config::default();
        assert!(cfg.cluster.is_single_node());
    }
 }
--- a/tidal/src/db/creators.rs
+++ b/tidal/src/db/creators.rs
@ -23,6 +23,7 @@ impl TidalDb {
        id: EntityId,
        metadata: &HashMap<String, String>,
    ) -> crate::Result<()> {
        self.require_writeable("write_creator")?;
        let storage = self.storage()?;
        let key = encode_key(id, Tag::Meta, b"");
        let value = crate::entities::serialize_entity(None, metadata);
@ -84,6 +85,7 @@ impl TidalDb {
    /// - `TidalError::Internal` if the embedding has zero norm.
    #[allow(clippy::significant_drop_tightening)] // lock must be held across insert_embedding call
    pub fn write_creator_embedding(&self, id: EntityId, embedding: &[f32]) -> crate::Result<()> {
        self.require_writeable("write_creator_embedding")?;
        let storage = self.storage()?;
        // Auto-register the creator "content" slot if absent.
--- a/tidal/src/db/export.rs
+++ b/tidal/src/db/export.rs
@ -80,16 +80,39 @@ pub struct ExportedSignal {
    pub weight: f32,
    /// Nanosecond timestamp when the signal was written.
    pub timestamp_ns: u64,
    /// The user who generated the signal, if known (from session journal).
    pub user_id: Option<u64>,
    /// The session in which the signal was written, if known.
    pub session_id: Option<u64>,
    /// Optional annotation attached to the signal (e.g., RLHF label).
    pub annotation: Option<String>,
 }
 impl ExportedSignal {
    /// Render as a JSON line (no trailing newline).
    ///
    /// Optional fields (`user_id`, `session_id`, `annotation`) are included
    /// only when `Some`, keeping the format compact for anonymous batch signals.
    #[must_use]
    pub fn to_json_line(&self) -> String {
-        format!(
+        use std::fmt::Write as _;
-            r#"{{"entity_id":{},"signal_type":"{}","weight":{},"timestamp_ns":{}}}"#,
+        let mut s = format!(
            r#"{{"entity_id":{},"signal_type":"{}","weight":{},"timestamp_ns":{}"#,
            self.entity_id, self.signal_type, self.weight, self.timestamp_ns
-        )
+        );
        if let Some(uid) = self.user_id {
            let _ = write!(s, r#","user_id":{uid}"#);
        }
        if let Some(sid) = self.session_id {
            let _ = write!(s, r#","session_id":{sid}"#);
        }
        if let Some(ref ann) = self.annotation {
            // Escape double quotes and backslashes in annotation text.
            let escaped = ann.replace('\\', "\\\\").replace('"', "\\\"");
            let _ = write!(s, r#","annotation":"{escaped}""#);
        }
        s.push('}');
        s
    }
 }
@ -215,14 +238,6 @@ impl super::TidalDb {
            filter
        };
        // User-id filter: WAL segment events record only entity_id, signal_type,
        // weight, and timestamp — the originating user is not stored in the WAL.
        // Per task-08 spec: if user_id filter is set but no user-to-entity mapping
        // exists, return an empty result rather than returning all users' events.
        if request.user_id.is_some() {
            return Ok(Vec::new());
        }
        // Find WAL directory.
        let Some(wal_dir) = self.config.data_dir.as_ref().map(|d| d.join("wal")) else {
            // Ephemeral mode: no WAL on disk.
@ -233,12 +248,14 @@ impl super::TidalDb {
            return Ok(Vec::new());
        }
-        // Read all WAL events (all segments, ignoring checkpoint boundary).
+        // Batch WAL events: skip entirely when user_id filter is set because
        // batch WAL records have no user context — including them would pollute
        // user-filtered results with anonymous events.
        let mut results = Vec::new();
        if request.user_id.is_none() {
            let all_events = crate::wal::reader::read_all_events(&wal_dir)
                .map_err(|e| TidalError::internal("export_signals", e.to_string()))?;
        // Filter and collect results.
        let mut results = Vec::new();
            for event in all_events {
                // Time range filter: since is inclusive, until is exclusive.
                if let Some(since) = request.since
@ -258,10 +275,6 @@ impl super::TidalDb {
                }
                // Resolve signal type name from the u8 ID.
            // An unrecognised ID means the WAL was written with a schema that had
            // more signal types than the current one (e.g. after a schema migration
            // that removed a signal type). We preserve the event with a placeholder
            // name so callers can detect and filter these stale records.
                let signal_type_name = type_names
                    .get(&event.signal_type)
                    .cloned()
@ -272,18 +285,121 @@ impl super::TidalDb {
                    signal_type: signal_type_name,
                    weight: event.weight,
                    timestamp_ns: event.timestamp_nanos,
                    user_id: None,
                    session_id: None,
                    annotation: None,
                });
            }
        }
-            // Apply limit: stop collecting once we have enough.
+        // Session journal events: always read (they carry user context).
-            if results.len() >= effective_limit {
+        let session_signals = read_session_journal_signals(&wal_dir, request);
-                break;
+        results.extend(session_signals);
-            }
+
-        }
+        // Sort merged results by timestamp ascending for deterministic output.
        results.sort_by_key(|s| s.timestamp_ns);
        // Apply limit across the merged, sorted result.
        results.truncate(effective_limit);
        Ok(results)
    }
 }
 // ── read_session_journal_signals ──────────────────────────────────────────────
 /// Read session journal signals from disk and apply the same filters as the
 /// batch WAL path (time range, signal type name, user ID).
 ///
 /// Returns an empty vec if the session journal does not exist or cannot be read
 /// (non-fatal — the batch WAL results are still valid).
 fn read_session_journal_signals(
    wal_dir: &std::path::Path,
    request: &ExportRequest,
 ) -> Vec<ExportedSignal> {
    use crate::wal::format::{SessionWalEvent, decode_session_events};
    use crate::wal::session_journal::SESSION_JOURNAL_FILENAME;
    let journal_path = wal_dir.join(SESSION_JOURNAL_FILENAME);
    if !journal_path.exists() {
        return Vec::new();
    }
    let Ok(bytes) = std::fs::read(&journal_path) else {
        return Vec::new();
    };
    let events = decode_session_events(&bytes);
    // First pass: build session_id -> user_id mapping from Start events.
    let mut session_user: HashMap<u64, u64> = HashMap::new();
    for event in &events {
        if let SessionWalEvent::Start {
            session_id,
            user_id,
            ..
        } = event
        {
            session_user.insert(*session_id, *user_id);
        }
    }
    // Build signal type name filter set for string-based matching.
    let type_filter_set: HashSet<&str> = request.signal_types.iter().map(String::as_str).collect();
    // Second pass: collect matching Signal events.
    let mut results = Vec::new();
    for event in &events {
        if let SessionWalEvent::Signal {
            session_id,
            entity_id,
            weight,
            ts_ns,
            signal_name,
            annotation,
            ..
        } = event
        {
            // User filter: look up user_id from session start mapping.
            let mapped_user_id = session_user.get(session_id).copied();
            if let Some(filter_uid) = request.user_id
                && mapped_user_id != Some(filter_uid)
            {
                continue;
            }
            // Time range filter: since is inclusive, until is exclusive.
            if let Some(since) = request.since
                && *ts_ns < since
            {
                continue;
            }
            if let Some(until) = request.until
                && *ts_ns >= until
            {
                continue;
            }
            // Signal type filter: if non-empty, check signal_name against the list.
            if !type_filter_set.is_empty() && !type_filter_set.contains(signal_name.as_str()) {
                continue;
            }
            results.push(ExportedSignal {
                entity_id: *entity_id,
                signal_type: signal_name.clone(),
                weight: *weight,
                timestamp_ns: *ts_ns,
                user_id: mapped_user_id,
                session_id: Some(*session_id),
                annotation: annotation.clone(),
            });
        }
    }
    results
 }
 // ── TidalDb::user_session_summary ────────────────────────────────────────────
 impl super::TidalDb {
@ -371,106 +487,5 @@ impl super::TidalDb {
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
-mod tests {
+#[path = "export_tests.rs"]
-    use super::*;
+mod tests;
    #[test]
    fn exported_signal_json_line_format() {
        let sig = ExportedSignal {
            entity_id: 42,
            signal_type: "view".to_string(),
            weight: 1.0,
            timestamp_ns: 1_700_000_000_000_000_000,
        };
        let line = sig.to_json_line();
        assert!(line.starts_with('{'));
        assert!(line.ends_with('}'));
        assert!(line.contains("\"entity_id\":42"));
        assert!(line.contains("\"signal_type\":\"view\""));
        assert!(line.contains("\"weight\":1"));
        assert!(line.contains("\"timestamp_ns\":1700000000000000000"));
    }
    #[test]
    fn export_request_time_range_constructor() {
        let req = ExportRequest::time_range(100, 200);
        assert_eq!(req.since, Some(100));
        assert_eq!(req.until, Some(200));
        assert!(req.signal_types.is_empty());
        assert!(req.user_id.is_none());
        assert!(req.limit.is_none());
        assert_eq!(req.format, ExportFormat::JsonLines);
    }
    #[test]
    fn export_request_signals_in_range_constructor() {
        let req = ExportRequest::signals_in_range(vec!["view".into(), "like".into()], 100, 200);
        assert_eq!(req.signal_types.len(), 2);
        assert_eq!(req.since, Some(100));
    }
    #[test]
    fn exported_signal_json_line_weight_precision() {
        let sig = ExportedSignal {
            entity_id: 1,
            signal_type: "like".to_string(),
            weight: 2.5,
            timestamp_ns: 100,
        };
        let line = sig.to_json_line();
        assert!(line.contains("\"weight\":2.5"));
    }
    #[test]
    fn cosine_distance_identical_vectors() {
        let a = vec![1.0_f32, 0.0, 0.0];
        let b = vec![1.0_f32, 0.0, 0.0];
        let dist = cosine_distance(&a, &b);
        assert!(dist.abs() < 1e-10);
    }
    #[test]
    fn cosine_distance_orthogonal_vectors() {
        let a = vec![1.0_f32, 0.0];
        let b = vec![0.0_f32, 1.0];
        let dist = cosine_distance(&a, &b);
        assert!((dist - 1.0).abs() < 1e-10);
    }
    #[test]
    fn cosine_distance_opposite_vectors() {
        let a = vec![1.0_f32, 0.0];
        let b = vec![-1.0_f32, 0.0];
        let dist = cosine_distance(&a, &b);
        assert!((dist - 2.0).abs() < 1e-10);
    }
    #[test]
    fn cosine_distance_zero_vector() {
        let a = vec![0.0_f32, 0.0];
        let b = vec![1.0_f32, 0.0];
        let dist = cosine_distance(&a, &b);
        assert!(dist.abs() < 1e-10);
    }
    #[test]
    fn export_limit_exceeds_max_returns_error() {
        use crate::TidalDb;
        let db = TidalDb::builder().ephemeral().open().unwrap();
        let req = ExportRequest {
            user_id: None,
            signal_types: Vec::new(),
            since: Some(0),
            until: Some(1),
            format: ExportFormat::JsonLines,
            limit: Some(ExportRequest::MAX_EXPORT_LIMIT + 1),
        };
        let err = db.export_signals(&req).unwrap_err();
        assert!(
            err.to_string().contains("exceeds maximum"),
            "expected limit error, got: {err}"
        );
        db.close().unwrap();
    }
 }
--- a/tidal/src/db/export_tests.rs
+++ b/tidal/src/db/export_tests.rs
@ -0,0 +1,145 @@
 use super::*;
 #[test]
 fn exported_signal_json_line_format() {
    let sig = ExportedSignal {
        entity_id: 42,
        signal_type: "view".to_string(),
        weight: 1.0,
        timestamp_ns: 1_700_000_000_000_000_000,
        user_id: None,
        session_id: None,
        annotation: None,
    };
    let line = sig.to_json_line();
    assert!(line.starts_with('{'));
    assert!(line.ends_with('}'));
    assert!(line.contains("\"entity_id\":42"));
    assert!(line.contains("\"signal_type\":\"view\""));
    assert!(line.contains("\"weight\":1"));
    assert!(line.contains("\"timestamp_ns\":1700000000000000000"));
    // Anonymous signals must NOT include user/session/annotation keys.
    assert!(!line.contains("user_id"));
    assert!(!line.contains("session_id"));
    assert!(!line.contains("annotation"));
 }
 #[test]
 fn export_request_time_range_constructor() {
    let req = ExportRequest::time_range(100, 200);
    assert_eq!(req.since, Some(100));
    assert_eq!(req.until, Some(200));
    assert!(req.signal_types.is_empty());
    assert!(req.user_id.is_none());
    assert!(req.limit.is_none());
    assert_eq!(req.format, ExportFormat::JsonLines);
 }
 #[test]
 fn export_request_signals_in_range_constructor() {
    let req = ExportRequest::signals_in_range(vec!["view".into(), "like".into()], 100, 200);
    assert_eq!(req.signal_types.len(), 2);
    assert_eq!(req.since, Some(100));
 }
 #[test]
 fn exported_signal_json_line_weight_precision() {
    let sig = ExportedSignal {
        entity_id: 1,
        signal_type: "like".to_string(),
        weight: 2.5,
        timestamp_ns: 100,
        user_id: None,
        session_id: None,
        annotation: None,
    };
    let line = sig.to_json_line();
    assert!(line.contains("\"weight\":2.5"));
 }
 #[test]
 fn exported_signal_json_line_with_session_context() {
    let sig = ExportedSignal {
        entity_id: 42,
        signal_type: "view".to_string(),
        weight: 1.0,
        timestamp_ns: 1234,
        user_id: Some(7),
        session_id: Some(99),
        annotation: Some("good content".to_string()),
    };
    let line = sig.to_json_line();
    assert!(line.contains("\"user_id\":7"));
    assert!(line.contains("\"session_id\":99"));
    assert!(line.contains("\"annotation\":\"good content\""));
 }
 #[test]
 fn exported_signal_json_line_partial_context() {
    let sig = ExportedSignal {
        entity_id: 10,
        signal_type: "like".to_string(),
        weight: 1.0,
        timestamp_ns: 500,
        user_id: Some(3),
        session_id: None,
        annotation: None,
    };
    let line = sig.to_json_line();
    assert!(line.contains("\"user_id\":3"));
    assert!(!line.contains("session_id"));
    assert!(!line.contains("annotation"));
 }
 #[test]
 fn cosine_distance_identical_vectors() {
    let a = vec![1.0_f32, 0.0, 0.0];
    let b = vec![1.0_f32, 0.0, 0.0];
    let dist = cosine_distance(&a, &b);
    assert!(dist.abs() < 1e-10);
 }
 #[test]
 fn cosine_distance_orthogonal_vectors() {
    let a = vec![1.0_f32, 0.0];
    let b = vec![0.0_f32, 1.0];
    let dist = cosine_distance(&a, &b);
    assert!((dist - 1.0).abs() < 1e-10);
 }
 #[test]
 fn cosine_distance_opposite_vectors() {
    let a = vec![1.0_f32, 0.0];
    let b = vec![-1.0_f32, 0.0];
    let dist = cosine_distance(&a, &b);
    assert!((dist - 2.0).abs() < 1e-10);
 }
 #[test]
 fn cosine_distance_zero_vector() {
    let a = vec![0.0_f32, 0.0];
    let b = vec![1.0_f32, 0.0];
    let dist = cosine_distance(&a, &b);
    assert!(dist.abs() < 1e-10);
 }
 #[test]
 fn export_limit_exceeds_max_returns_error() {
    use crate::TidalDb;
    let db = TidalDb::builder().ephemeral().open().unwrap();
    let req = ExportRequest {
        user_id: None,
        signal_types: Vec::new(),
        since: Some(0),
        until: Some(1),
        format: ExportFormat::JsonLines,
        limit: Some(ExportRequest::MAX_EXPORT_LIMIT + 1),
    };
    let err = db.export_signals(&req).unwrap_err();
    assert!(
        err.to_string().contains("exceeds maximum"),
        "expected limit error, got: {err}"
    );
    db.close().unwrap();
 }
--- a/tidal/src/db/items.rs
+++ b/tidal/src/db/items.rs
@ -38,6 +38,7 @@ impl TidalDb {
        id: EntityId,
        metadata: &HashMap<String, String>,
    ) -> crate::Result<()> {
        self.require_writeable("write_item_with_metadata")?;
        // Validate metadata size limits.
        const MAX_METADATA_KEYS: usize = 64;
        const MAX_KEY_BYTES: usize = 512;
@ -217,6 +218,7 @@ impl TidalDb {
    /// - `TidalError::Internal` if the embedding has zero norm.
    #[allow(clippy::significant_drop_tightening)] // lock must be held across insert_embedding call
    pub fn write_item_embedding(&self, id: EntityId, embedding: &[f32]) -> crate::Result<()> {
        self.require_writeable("write_item_embedding")?;
        let storage = self.storage()?;
        // Auto-register the item "content" slot if absent.
--- a/tidal/src/db/lifecycle.rs
+++ b/tidal/src/db/lifecycle.rs
@ -53,6 +53,15 @@ impl TidalDb {
        // Mark health as degraded so the metrics endpoint reflects shutdown.
        self.metrics.health_ok.store(false, Ordering::Release);
        // M8p2: Join the segment receiver thread before WAL shuts down.
        // The receiver may hold Arc<SignalLedger> and the transport may block
        // in recv_segment until the sender side is dropped.
        if let Ok(mut guard) = self.receiver_handle.lock()
            && let Some(handle) = guard.take()
        {
            handle.join();
        }
        // M7p2: Signal sweeper to stop and join the thread.
        // This must happen BEFORE WAL shutdown, because close_session_internal()
        // writes to the WAL.
--- a/tidal/src/db/metrics/mod.rs
+++ b/tidal/src/db/metrics/mod.rs
@ -100,6 +100,11 @@ pub struct MetricsState {
    /// Total number of failed periodic signal checkpoints.
    pub(crate) checkpoint_failures_total: AtomicU64,
    // ── Replication metrics (m8p2) ──────────────────────────────────────
    /// Current replication lag in WAL segments (follower only; 0 on leader).
    #[cfg(feature = "metrics")]
    pub(crate) replication_lag_seqno: AtomicU64,
 }
 impl MetricsState {
@ -140,6 +145,8 @@ impl MetricsState {
            #[cfg(feature = "metrics")]
            bitmap_index_cardinality: AtomicU64::new(0),
            checkpoint_failures_total: AtomicU64::new(0),
            #[cfg(feature = "metrics")]
            replication_lag_seqno: AtomicU64::new(0),
        }
    }
@ -319,6 +326,15 @@ impl MetricsState {
                "gauge",
                self.degradation_level.load(Ordering::Relaxed) as f64,
            );
            // Replication lag.
            write_metric_line(
                &mut out,
                "tidaldb_replication_lag_seqno",
                "Replication lag in WAL segments behind the leader",
                "gauge",
                self.replication_lag_seqno.load(Ordering::Relaxed) as f64,
            );
        }
        // Checkpoint failure counter (unconditional -- not feature-gated).
@ -354,234 +370,4 @@ impl MetricsState {
 }
 #[cfg(test)]
-mod tests {
+mod tests;
    use super::*;
    #[test]
    fn new_creates_healthy_state() {
        let state = MetricsState::new();
        assert!(state.health_ok.load(Ordering::Relaxed));
    }
    #[test]
    fn uptime_is_non_negative() {
        let state = MetricsState::new();
        assert!(state.uptime_seconds() >= 0.0);
    }
    #[test]
    fn health_ok_value_returns_one_when_healthy() {
        let state = MetricsState::new();
        assert!((state.health_ok_value() - 1.0).abs() < f64::EPSILON);
    }
    #[test]
    fn health_ok_value_returns_zero_when_degraded() {
        let state = MetricsState::new();
        state.health_ok.store(false, Ordering::Relaxed);
        assert!(state.health_ok_value().abs() < f64::EPSILON);
    }
    #[test]
    fn render_prometheus_contains_expected_metrics() {
        let state = MetricsState::new();
        let output = state.render_prometheus();
        assert!(output.contains("tidaldb_uptime_seconds"));
        assert!(output.contains("tidaldb_health_ok"));
        assert!(output.contains("tidaldb_info"));
        assert!(output.contains("partition_id=\"0\""));
    }
    #[test]
    fn render_healthz_contains_expected_fields() {
        let state = MetricsState::new();
        let output = state.render_healthz();
        assert!(output.contains("\"status\":\"ok\""));
        assert!(output.contains("\"uptime_seconds\":"));
        assert!(output.contains("\"version\":"));
        assert!(output.contains("\"build_hash\":"));
    }
    #[test]
    fn render_healthz_degraded() {
        let state = MetricsState::new();
        state.health_ok.store(false, Ordering::Relaxed);
        let output = state.render_healthz();
        assert!(output.contains("\"status\":\"degraded\""));
    }
    #[cfg(feature = "metrics")]
    #[test]
    fn metrics_state_renders_signal_metrics() {
        let state = MetricsState::new();
        state.signal_writes_total.store(42, Ordering::Relaxed);
        state.signal_hot_entries.store(100, Ordering::Relaxed);
        state.wal_lag_bytes.store(8192, Ordering::Relaxed);
        state
            .wal_compacted_segments_total
            .store(3, Ordering::Relaxed);
        let output = state.render_prometheus();
        assert!(
            output.contains("tidaldb_signal_writes_total"),
            "missing signal_writes_total: {output}"
        );
        assert!(output.contains("42"), "missing value 42: {output}");
        assert!(
            output.contains("tidaldb_signal_hot_entries"),
            "missing signal_hot_entries: {output}"
        );
        assert!(output.contains("100"), "missing value 100: {output}");
        assert!(
            output.contains("tidaldb_wal_lag_bytes"),
            "missing wal_lag_bytes: {output}"
        );
        assert!(output.contains("8192"), "missing value 8192: {output}");
        assert!(
            output.contains("tidaldb_wal_compacted_segments_total"),
            "missing wal_compacted_segments_total: {output}"
        );
        assert!(
            output.contains("tidaldb_checkpoint_age_seconds"),
            "missing checkpoint_age_seconds: {output}"
        );
        assert!(
            output.contains("tidaldb_signal_write_latency_us"),
            "missing signal_write_latency_us histogram: {output}"
        );
    }
    #[cfg(feature = "metrics")]
    #[test]
    fn render_prometheus_contains_index_metrics() {
        let m = MetricsState::new();
        m.tantivy_segment_count.store(3, Ordering::Relaxed);
        m.tantivy_indexed_docs.store(10000, Ordering::Relaxed);
        m.usearch_vector_count.store(500, Ordering::Relaxed);
        m.usearch_index_size_bytes
            .store(1_048_576, Ordering::Relaxed);
        m.bitmap_index_cardinality.store(42, Ordering::Relaxed);
        let prom = m.render_prometheus();
        assert!(
            prom.contains("tidaldb_tantivy_segment_count 3"),
            "missing tantivy_segment_count: {prom}"
        );
        assert!(
            prom.contains("tidaldb_tantivy_indexed_docs 10000"),
            "missing tantivy_indexed_docs: {prom}"
        );
        assert!(
            prom.contains("tidaldb_usearch_vector_count 500"),
            "missing usearch_vector_count: {prom}"
        );
        assert!(
            prom.contains("tidaldb_usearch_index_size_bytes 1048576"),
            "missing usearch_index_size_bytes: {prom}"
        );
        assert!(
            prom.contains("tidaldb_bitmap_index_cardinality 42"),
            "missing bitmap_index_cardinality: {prom}"
        );
    }
    #[cfg(feature = "metrics")]
    #[test]
    fn active_sessions_tracks_lifecycle() {
        let state = MetricsState::new();
        state.active_sessions.fetch_add(1, Ordering::Relaxed);
        state.active_sessions.fetch_add(1, Ordering::Relaxed);
        assert_eq!(state.active_sessions.load(Ordering::Relaxed), 2);
        state.active_sessions.fetch_sub(1, Ordering::Relaxed);
        assert_eq!(state.active_sessions.load(Ordering::Relaxed), 1);
    }
    #[cfg(feature = "metrics")]
    #[test]
    fn degradation_level_renders_correctly() {
        let state = MetricsState::new();
        state.degradation_level.store(2, Ordering::Relaxed);
        let output = state.render_prometheus();
        assert!(
            output.contains("tidaldb_degradation_level"),
            "missing tidaldb_degradation_level: {output}"
        );
    }
    #[cfg(feature = "metrics")]
    #[test]
    fn render_prometheus_contains_session_metrics() {
        let state = MetricsState::new();
        let prom = state.render_prometheus();
        assert!(
            prom.contains("tidaldb_active_sessions"),
            "missing active_sessions: {prom}"
        );
        assert!(
            prom.contains("tidaldb_closed_sessions_total"),
            "missing closed_sessions_total: {prom}"
        );
        assert!(
            prom.contains("tidaldb_session_auto_closed_total"),
            "missing session_auto_closed_total: {prom}"
        );
        assert!(
            prom.contains("tidaldb_rate_limited_total"),
            "missing rate_limited_total: {prom}"
        );
        assert!(
            prom.contains("tidaldb_degradation_level"),
            "missing degradation_level: {prom}"
        );
    }
    #[cfg(feature = "metrics")]
    #[test]
    fn metrics_state_checkpoint_age_zero_when_no_checkpoint() {
        let state = MetricsState::new();
        // last_checkpoint_ns is 0 (default) -- checkpoint_age should be 0.
        let output = state.render_prometheus();
        // Find the checkpoint_age_seconds line and verify it's 0.
        let age_line = output
            .lines()
            .find(|l| l.starts_with("tidaldb_checkpoint_age_seconds "))
            .expect("missing checkpoint_age_seconds line");
        assert!(
            age_line.contains(" 0"),
            "checkpoint age should be 0 when no checkpoint: {age_line}"
        );
    }
    // ── Feature-flag verification tests (m7p4, task-07) ─────────────────
    /// `QueryStats` is NOT feature-gated -- always available regardless of
    /// whether the `metrics` feature is enabled.
    #[test]
    fn query_stats_always_available() {
        use crate::query::QueryStats;
        let stats = QueryStats::new("test".to_owned());
        assert_eq!(stats.profile_name, "test");
        assert_eq!(stats.total_time_us, 0);
    }
    /// Base `MetricsState` fields (`uptime_seconds`, `health_ok_value`) work
    /// without the `metrics` feature -- they are unconditionally compiled.
    #[test]
    fn metrics_state_base_always_available() {
        let state = MetricsState::new();
        assert!(state.uptime_seconds() >= 0.0);
        assert!((state.health_ok_value() - 1.0).abs() < f64::EPSILON);
    }
    /// Feature-gated counters (`signal_writes_total`, etc.) only exist when
    /// the `metrics` feature is enabled -- this test proves they compile and
    /// are functional.
    #[cfg(feature = "metrics")]
    #[test]
    fn metrics_feature_counters_exist() {
        let state = MetricsState::new();
        state.signal_writes_total.fetch_add(1, Ordering::Relaxed);
        assert_eq!(state.signal_writes_total.load(Ordering::Relaxed), 1);
    }
 }
--- a/tidal/src/db/metrics/tests.rs
+++ b/tidal/src/db/metrics/tests.rs
@ -0,0 +1,229 @@
 use super::*;
 #[test]
 fn new_creates_healthy_state() {
    let state = MetricsState::new();
    assert!(state.health_ok.load(Ordering::Relaxed));
 }
 #[test]
 fn uptime_is_non_negative() {
    let state = MetricsState::new();
    assert!(state.uptime_seconds() >= 0.0);
 }
 #[test]
 fn health_ok_value_returns_one_when_healthy() {
    let state = MetricsState::new();
    assert!((state.health_ok_value() - 1.0).abs() < f64::EPSILON);
 }
 #[test]
 fn health_ok_value_returns_zero_when_degraded() {
    let state = MetricsState::new();
    state.health_ok.store(false, Ordering::Relaxed);
    assert!(state.health_ok_value().abs() < f64::EPSILON);
 }
 #[test]
 fn render_prometheus_contains_expected_metrics() {
    let state = MetricsState::new();
    let output = state.render_prometheus();
    assert!(output.contains("tidaldb_uptime_seconds"));
    assert!(output.contains("tidaldb_health_ok"));
    assert!(output.contains("tidaldb_info"));
    assert!(output.contains("partition_id=\"0\""));
 }
 #[test]
 fn render_healthz_contains_expected_fields() {
    let state = MetricsState::new();
    let output = state.render_healthz();
    assert!(output.contains("\"status\":\"ok\""));
    assert!(output.contains("\"uptime_seconds\":"));
    assert!(output.contains("\"version\":"));
    assert!(output.contains("\"build_hash\":"));
 }
 #[test]
 fn render_healthz_degraded() {
    let state = MetricsState::new();
    state.health_ok.store(false, Ordering::Relaxed);
    let output = state.render_healthz();
    assert!(output.contains("\"status\":\"degraded\""));
 }
 #[cfg(feature = "metrics")]
 #[test]
 fn metrics_state_renders_signal_metrics() {
    let state = MetricsState::new();
    state.signal_writes_total.store(42, Ordering::Relaxed);
    state.signal_hot_entries.store(100, Ordering::Relaxed);
    state.wal_lag_bytes.store(8192, Ordering::Relaxed);
    state
        .wal_compacted_segments_total
        .store(3, Ordering::Relaxed);
    let output = state.render_prometheus();
    assert!(
        output.contains("tidaldb_signal_writes_total"),
        "missing signal_writes_total: {output}"
    );
    assert!(output.contains("42"), "missing value 42: {output}");
    assert!(
        output.contains("tidaldb_signal_hot_entries"),
        "missing signal_hot_entries: {output}"
    );
    assert!(output.contains("100"), "missing value 100: {output}");
    assert!(
        output.contains("tidaldb_wal_lag_bytes"),
        "missing wal_lag_bytes: {output}"
    );
    assert!(output.contains("8192"), "missing value 8192: {output}");
    assert!(
        output.contains("tidaldb_wal_compacted_segments_total"),
        "missing wal_compacted_segments_total: {output}"
    );
    assert!(
        output.contains("tidaldb_checkpoint_age_seconds"),
        "missing checkpoint_age_seconds: {output}"
    );
    assert!(
        output.contains("tidaldb_signal_write_latency_us"),
        "missing signal_write_latency_us histogram: {output}"
    );
 }
 #[cfg(feature = "metrics")]
 #[test]
 fn render_prometheus_contains_index_metrics() {
    let m = MetricsState::new();
    m.tantivy_segment_count.store(3, Ordering::Relaxed);
    m.tantivy_indexed_docs.store(10000, Ordering::Relaxed);
    m.usearch_vector_count.store(500, Ordering::Relaxed);
    m.usearch_index_size_bytes
        .store(1_048_576, Ordering::Relaxed);
    m.bitmap_index_cardinality.store(42, Ordering::Relaxed);
    let prom = m.render_prometheus();
    assert!(
        prom.contains("tidaldb_tantivy_segment_count 3"),
        "missing tantivy_segment_count: {prom}"
    );
    assert!(
        prom.contains("tidaldb_tantivy_indexed_docs 10000"),
        "missing tantivy_indexed_docs: {prom}"
    );
    assert!(
        prom.contains("tidaldb_usearch_vector_count 500"),
        "missing usearch_vector_count: {prom}"
    );
    assert!(
        prom.contains("tidaldb_usearch_index_size_bytes 1048576"),
        "missing usearch_index_size_bytes: {prom}"
    );
    assert!(
        prom.contains("tidaldb_bitmap_index_cardinality 42"),
        "missing bitmap_index_cardinality: {prom}"
    );
 }
 #[cfg(feature = "metrics")]
 #[test]
 fn active_sessions_tracks_lifecycle() {
    let state = MetricsState::new();
    state.active_sessions.fetch_add(1, Ordering::Relaxed);
    state.active_sessions.fetch_add(1, Ordering::Relaxed);
    assert_eq!(state.active_sessions.load(Ordering::Relaxed), 2);
    state.active_sessions.fetch_sub(1, Ordering::Relaxed);
    assert_eq!(state.active_sessions.load(Ordering::Relaxed), 1);
 }
 #[cfg(feature = "metrics")]
 #[test]
 fn degradation_level_renders_correctly() {
    let state = MetricsState::new();
    state.degradation_level.store(2, Ordering::Relaxed);
    let output = state.render_prometheus();
    assert!(
        output.contains("tidaldb_degradation_level"),
        "missing tidaldb_degradation_level: {output}"
    );
 }
 #[cfg(feature = "metrics")]
 #[test]
 fn render_prometheus_contains_session_metrics() {
    let state = MetricsState::new();
    let prom = state.render_prometheus();
    assert!(
        prom.contains("tidaldb_active_sessions"),
        "missing active_sessions: {prom}"
    );
    assert!(
        prom.contains("tidaldb_closed_sessions_total"),
        "missing closed_sessions_total: {prom}"
    );
    assert!(
        prom.contains("tidaldb_session_auto_closed_total"),
        "missing session_auto_closed_total: {prom}"
    );
    assert!(
        prom.contains("tidaldb_rate_limited_total"),
        "missing rate_limited_total: {prom}"
    );
    assert!(
        prom.contains("tidaldb_degradation_level"),
        "missing degradation_level: {prom}"
    );
 }
 #[cfg(feature = "metrics")]
 #[test]
 fn metrics_state_checkpoint_age_zero_when_no_checkpoint() {
    let state = MetricsState::new();
    // last_checkpoint_ns is 0 (default) -- checkpoint_age should be 0.
    let output = state.render_prometheus();
    // Find the checkpoint_age_seconds line and verify it's 0.
    let age_line = output
        .lines()
        .find(|l| l.starts_with("tidaldb_checkpoint_age_seconds "))
        .expect("missing checkpoint_age_seconds line");
    assert!(
        age_line.contains(" 0"),
        "checkpoint age should be 0 when no checkpoint: {age_line}"
    );
 }
 // ── Feature-flag verification tests (m7p4, task-07) ─────────────────
 /// `QueryStats` is NOT feature-gated -- always available regardless of
 /// whether the `metrics` feature is enabled.
 #[test]
 fn query_stats_always_available() {
    use crate::query::QueryStats;
    let stats = QueryStats::new("test".to_owned());
    assert_eq!(stats.profile_name, "test");
    assert_eq!(stats.total_time_us, 0);
 }
 /// Base `MetricsState` fields (`uptime_seconds`, `health_ok_value`) work
 /// without the `metrics` feature -- they are unconditionally compiled.
 #[test]
 fn metrics_state_base_always_available() {
    let state = MetricsState::new();
    assert!(state.uptime_seconds() >= 0.0);
    assert!((state.health_ok_value() - 1.0).abs() < f64::EPSILON);
 }
 /// Feature-gated counters (`signal_writes_total`, etc.) only exist when
 /// the `metrics` feature is enabled -- this test proves they compile and
 /// are functional.
 #[cfg(feature = "metrics")]
 #[test]
 fn metrics_feature_counters_exist() {
    let state = MetricsState::new();
    state.signal_writes_total.fetch_add(1, Ordering::Relaxed);
    assert_eq!(state.signal_writes_total.load(Ordering::Relaxed), 1);
 }
--- a/tidal/src/db/mod.rs
+++ b/tidal/src/db/mod.rs
@ -30,7 +30,7 @@ pub(crate) mod wal_bridge;
 pub use backup::BackupInfo;
 pub use builder::TidalDbBuilder;
-pub use config::{Config, ConfigError, StorageMode};
+pub use config::{Config, ConfigError, NodeConfig, StorageMode};
 pub use export::UserSessionSummary;
 pub(crate) use metadata::deserialize_metadata;
 pub use metrics::MetricsState;
@ -149,6 +149,9 @@ pub struct TidalDb {
    // True when a backup is in progress. Signal writes return Backpressure
    // during the backup window. Cleared by the BackupGuard RAII drop.
    backup_in_progress: Arc<AtomicBool>,
    // M8p2 replication
    replication_state: Arc<crate::replication::state::ReplicationState>,
    receiver_handle: std::sync::Mutex<Option<crate::replication::receiver::SegmentReceiverHandle>>,
    // Directory-level exclusive lock (persistent mode only).
    // Held for the lifetime of the process; released on Drop when the
    // File handle is closed. Advisory flock prevents two processes from
@ -237,6 +240,8 @@ impl TidalDb {
            shutdown_sweeper: Arc::new(AtomicBool::new(false)),
            sweeper_thread: std::sync::Mutex::new(None),
            backup_in_progress: Arc::new(AtomicBool::new(false)),
            replication_state: Arc::new(crate::replication::state::ReplicationState::single()),
            receiver_handle: std::sync::Mutex::new(None),
            lock_file: None,
        }
    }
@ -459,6 +464,8 @@ impl TidalDb {
            shutdown_sweeper: Arc::new(AtomicBool::new(false)),
            sweeper_thread: std::sync::Mutex::new(None),
            backup_in_progress: Arc::new(AtomicBool::new(false)),
            replication_state: Arc::new(crate::replication::state::ReplicationState::single()),
            receiver_handle: std::sync::Mutex::new(None),
            lock_file: None,
        };
@ -507,6 +514,34 @@ impl TidalDb {
        }
    }
    /// Access the replication state (for lag gauges and tests).
    #[must_use]
    #[allow(clippy::missing_const_for_fn)] // Arc field prevents const in practice
    pub fn replication_state(&self) -> &Arc<crate::replication::state::ReplicationState> {
        &self.replication_state
    }
    /// Start the segment receiver for a follower node.
    ///
    /// The receiver thread blocks on the transport's `recv_segment()` and
    /// applies each received WAL segment to the local signal ledger.
    ///
    /// # Errors
    ///
    /// Returns `TidalError::Internal` if no ledger is wired (ephemeral without schema).
    pub fn start_replication<T: crate::replication::Transport>(
        &self,
        transport: std::sync::Arc<T>,
    ) -> crate::Result<()> {
        let ledger = self.ledger()?.clone();
        let state = Arc::clone(&self.replication_state);
        let handle = crate::replication::receiver::spawn_receiver(transport, ledger, state);
        if let Ok(mut guard) = self.receiver_handle.lock() {
            *guard = Some(handle);
        }
        Ok(())
    }
    /// Returns `Ok(())` if the database is initialized and operational.
    ///
    /// # Errors
--- a/tidal/src/db/relationships.rs
+++ b/tidal/src/db/relationships.rs
@ -29,6 +29,7 @@ impl TidalDb {
        use crate::entities::relationship::{
            encode_relationship_key, serialize_relationship_value,
        };
        self.require_writeable("write_relationship")?;
        let storage = self.storage()?;
@ -78,6 +79,7 @@ impl TidalDb {
        to: EntityId,
    ) -> crate::Result<()> {
        use crate::entities::relationship::encode_relationship_key;
        self.require_writeable("delete_relationship")?;
        let storage = self.storage()?;
        let key = encode_relationship_key(from, rel_type, to);
--- a/tidal/src/db/session_restore.rs
+++ b/tidal/src/db/session_restore.rs
@ -250,6 +250,7 @@ impl TidalDb {
                    ts_ns,
                    signal_name,
                    annotation,
                    ..
                } => {
                    session_signals.entry(*session_id).or_default().push((
                        *entity_id,
--- a/tidal/src/db/sessions.rs
+++ b/tidal/src/db/sessions.rs
@ -33,6 +33,7 @@ impl TidalDb {
        policy_name: &str,
        metadata: HashMap<String, String>,
    ) -> crate::Result<SessionHandle> {
        self.require_writeable("start_session")?;
        // Validate policy exists in schema.
        let schema = self.schema_def.as_ref().ok_or_else(|| {
            TidalError::internal("start_session", "no schema: open with with_schema()")
--- a/tidal/src/db/signals.rs
+++ b/tidal/src/db/signals.rs
@ -24,6 +24,7 @@ impl TidalDb {
        metadata: &HashMap<String, String>,
    ) -> crate::Result<()> {
        use crate::storage::{Tag, encode_key};
        self.require_writeable("write_item")?;
        let storage = self.storage()?;
        let key = encode_key(id, Tag::Meta, b"");
@ -54,6 +55,7 @@ impl TidalDb {
        weight: f64,
        timestamp: Timestamp,
    ) -> crate::Result<()> {
        self.require_writeable("signal")?;
        if !weight.is_finite() {
            return Err(TidalError::invalid_input(
                "signal weight must be finite (NaN and Inf are not allowed)",
@ -220,6 +222,7 @@ impl TidalDb {
        for_user: Option<u64>,
        creator_id: Option<u64>,
    ) -> crate::Result<()> {
        self.require_writeable("signal_with_context")?;
        // Record the base signal.
        self.signal(signal_type, entity_id, weight, timestamp)?;
--- a/tidal/src/db/sweeper.rs
+++ b/tidal/src/db/sweeper.rs
@ -147,6 +147,18 @@ impl TidalDb {
        self
    }
    /// Reject write operations on follower nodes.
    ///
    /// Returns `TidalError::ReadOnly` when the node's role is `Follower`.
    /// Must be the **first** statement in every public write method so the
    /// guard fires before any validation or allocation.
    pub(crate) fn require_writeable(&self, operation: &str) -> crate::Result<()> {
        if self.config.cluster.role == crate::db::config::NodeRole::Follower {
            return Err(crate::schema::TidalError::read_only(operation));
        }
        Ok(())
    }
    // ── Internal helpers ─────────────────────────────────────────────────────
    /// Borrow the storage backend, or error if the database was opened without a schema.
--- a/tidal/src/db/users.rs
+++ b/tidal/src/db/users.rs
@ -22,6 +22,7 @@ impl TidalDb {
        id: EntityId,
        metadata: &HashMap<String, String>,
    ) -> crate::Result<()> {
        self.require_writeable("write_user")?;
        let storage = self.storage()?;
        let key = encode_key(id, Tag::Meta, b"");
        let value = crate::entities::serialize_entity(None, metadata);
--- a/tidal/src/entities/hard_neg.rs
+++ b/tidal/src/entities/hard_neg.rs
@ -10,6 +10,8 @@
 use dashmap::DashMap;
 use roaring::RoaringBitmap;
 use crate::replication::crdt::HlcTimestamp;
 /// Signal types that constitute a hard negative.
 pub const HARD_NEG_SIGNALS: &[&str] = &["skip", "hide", "dislike", "block"];
@ -23,6 +25,9 @@ pub const HARD_NEG_SIGNALS: &[&str] = &["skip", "hide", "dislike", "block"];
 /// users never contend.
 pub struct HardNegIndex {
    inner: DashMap<u64, RoaringBitmap>,
    /// Highest hide HLC timestamp seen per (`user_id`, `item_id`).
    /// Used for union semantics: unhide only clears if its ts > max hide ts.
    hide_ts: DashMap<(u64, u32), HlcTimestamp>,
 }
 impl HardNegIndex {
@ -30,6 +35,7 @@ impl HardNegIndex {
    pub fn new() -> Self {
        Self {
            inner: DashMap::new(),
            hide_ts: DashMap::new(),
        }
    }
@ -38,6 +44,15 @@ impl HardNegIndex {
        self.inner.entry(user_id).or_default().insert(item_id);
    }
    /// Remove a hard negative for a user-item pair.
    ///
    /// No-op if the user has no bitmap or the item is not in it.
    pub fn remove(&self, user_id: u64, item_id: u32) {
        if let Some(mut bm) = self.inner.get_mut(&user_id) {
            bm.remove(item_id);
        }
    }
    /// Check if a user has rejected a specific item.
    #[must_use]
    pub fn is_negative(&self, user_id: u64, item_id: u32) -> bool {
@ -73,6 +88,66 @@ impl HardNegIndex {
    pub fn is_hard_neg_signal(signal_type: &str) -> bool {
        HARD_NEG_SIGNALS.contains(&signal_type)
    }
    // -----------------------------------------------------------------
    // Replication-aware methods (HLC-timestamped, union semantics)
    // -----------------------------------------------------------------
    /// Apply a hide from replication (union semantics).
    ///
    /// Always adds to the bitmap (hide from any shard wins during convergence).
    /// Records the highest HLC timestamp seen for this (user, item) pair.
    ///
    /// This is the replication path -- local hides still use [`add`].
    pub fn apply_replication_hide(&self, user_id: u64, item_id: u32, ts: HlcTimestamp) {
        self.inner.entry(user_id).or_default().insert(item_id);
        let mut entry = self.hide_ts.entry((user_id, item_id)).or_insert(ts);
        if ts > *entry {
            *entry = ts;
        }
    }
    /// Apply an unhide from replication (union semantics).
    ///
    /// Only removes from the bitmap if `ts` strictly beats the highest recorded
    /// hide timestamp for this (user, item) pair.  This ensures that during
    /// convergence, a hide with a later HLC always wins over an earlier unhide.
    ///
    /// Returns `true` if the item was actually removed from the bitmap.
    #[must_use]
    pub fn apply_replication_unhide(&self, user_id: u64, item_id: u32, ts: HlcTimestamp) -> bool {
        let max_hide_ts = self.hide_ts.get(&(user_id, item_id)).map(|v| *v);
        match max_hide_ts {
            Some(hide_ts) if ts > hide_ts => {
                // Unhide wins: ts is strictly later than the latest hide.
                if let Some(mut bm) = self.inner.get_mut(&user_id) {
                    bm.remove(item_id);
                }
                // Remove the hide_ts entry (no longer hidden).
                self.hide_ts.remove(&(user_id, item_id));
                true
            }
            Some(_) => {
                // Hide wins: unhide's ts is not later than the hide's ts.
                false
            }
            None => {
                // No recorded hide -- safe to remove (legacy local hide or pre-HLC).
                if let Some(mut bm) = self.inner.get_mut(&user_id) {
                    bm.remove(item_id);
                }
                true
            }
        }
    }
    /// Highest recorded hide timestamp for (`user_id`, `item_id`), if any.
    ///
    /// Returns `None` if the item has not been hidden via the replication path.
    #[must_use]
    pub fn hide_timestamp(&self, user_id: u64, item_id: u32) -> Option<HlcTimestamp> {
        self.hide_ts.get(&(user_id, item_id)).map(|v| *v)
    }
 }
 impl Default for HardNegIndex {
@ -85,6 +160,15 @@ impl Default for HardNegIndex {
 #[allow(clippy::unwrap_used)]
 mod tests {
    use super::*;
    use crate::replication::crdt::HlcTimestamp;
    fn ts(wall_ns: u64, logical: u32, node_id: u16) -> HlcTimestamp {
        HlcTimestamp {
            wall_ns,
            logical,
            node_id,
        }
    }
    #[test]
    fn add_and_check() {
@ -132,4 +216,90 @@ mod tests {
        assert!(!HardNegIndex::is_hard_neg_signal("view"));
        assert!(!HardNegIndex::is_hard_neg_signal("like"));
    }
    // -----------------------------------------------------------------
    // Replication-aware method tests
    // -----------------------------------------------------------------
    #[test]
    fn apply_replication_hide_adds_to_bitmap() {
        let idx = HardNegIndex::new();
        idx.apply_replication_hide(1, 100, ts(1000, 0, 0));
        assert!(idx.is_negative(1, 100));
        assert_eq!(idx.hide_timestamp(1, 100), Some(ts(1000, 0, 0)));
    }
    #[test]
    fn apply_replication_unhide_with_higher_ts_clears() {
        let idx = HardNegIndex::new();
        idx.apply_replication_hide(1, 200, ts(100, 0, 0));
        let cleared = idx.apply_replication_unhide(1, 200, ts(200, 0, 0));
        assert!(cleared, "unhide with higher ts should succeed");
        assert!(!idx.is_negative(1, 200));
        assert_eq!(idx.hide_timestamp(1, 200), None);
    }
    #[test]
    fn apply_replication_unhide_with_lower_ts_is_blocked() {
        let idx = HardNegIndex::new();
        // Hide at t=100
        idx.apply_replication_hide(1, 300, ts(100, 0, 0));
        // Unhide at t=50 (earlier -- should NOT clear the hide)
        let cleared = idx.apply_replication_unhide(1, 300, ts(50, 0, 1));
        assert!(
            !cleared,
            "unhide with lower ts should be blocked by union semantics"
        );
        assert!(idx.is_negative(1, 300), "item should still be hidden");
    }
    #[test]
    fn apply_replication_unhide_equal_ts_is_blocked() {
        let idx = HardNegIndex::new();
        idx.apply_replication_hide(1, 400, ts(100, 0, 0));
        // Same timestamp -- ts is NOT strictly greater, so blocked.
        let cleared = idx.apply_replication_unhide(1, 400, ts(100, 0, 0));
        assert!(!cleared);
        assert!(idx.is_negative(1, 400));
    }
    #[test]
    fn union_semantics_hide_from_any_shard_wins() {
        let idx = HardNegIndex::new();
        // Shard A hides at t=100
        idx.apply_replication_hide(2, 500, ts(100, 0, 0));
        // Shard B had an older unhide at t=50 (arrives after)
        let cleared = idx.apply_replication_unhide(2, 500, ts(50, 0, 1));
        assert!(!cleared);
        assert!(idx.is_negative(2, 500), "hide from shard A still wins");
    }
    #[test]
    fn multiple_hides_track_highest_ts() {
        let idx = HardNegIndex::new();
        idx.apply_replication_hide(3, 600, ts(50, 0, 0));
        idx.apply_replication_hide(3, 600, ts(200, 0, 1));
        idx.apply_replication_hide(3, 600, ts(100, 0, 2));
        // Highest hide ts is 200.
        assert_eq!(idx.hide_timestamp(3, 600), Some(ts(200, 0, 1)));
        // Unhide at ts=150 is blocked (150 < 200).
        assert!(!idx.apply_replication_unhide(3, 600, ts(150, 0, 0)));
        // Unhide at ts=300 succeeds (300 > 200).
        assert!(idx.apply_replication_unhide(3, 600, ts(300, 0, 0)));
        assert!(!idx.is_negative(3, 600));
    }
    #[test]
    fn existing_api_unchanged() {
        // Verify the bitmap API still works exactly as before.
        let idx = HardNegIndex::new();
        idx.add(10, 1);
        assert!(idx.is_negative(10, 1));
        idx.remove(10, 1);
        assert!(!idx.is_negative(10, 1));
        // apply_replication_unhide on locally-added item (no hide_ts) also removes it.
        idx.add(10, 2);
        idx.apply_replication_unhide(10, 2, ts(1, 0, 0));
        assert!(!idx.is_negative(10, 2));
    }
 }
--- a/tidal/src/lib.rs
+++ b/tidal/src/lib.rs
@ -4,6 +4,7 @@ pub mod entities;
 pub mod load;
 pub mod query;
 pub mod ranking;
 pub mod replication;
 pub mod schema;
 pub mod session;
 pub mod signals;
@ -32,11 +33,12 @@ pub mod testing;
 #[cfg(any(test, feature = "test-utils"))]
 pub use db::TempTidalHome;
 pub use db::backup::BackupInfo;
 pub use db::config::NodeRole;
 pub use db::export::{ExportFormat, ExportRequest, ExportedSignal, UserSessionSummary};
 #[cfg(feature = "metrics")]
 pub use db::http::MetricsHandle;
 pub use db::metrics::MetricsState;
-pub use db::{Config, ConfigError, Paths, StorageMode, TidalDb, TidalDbBuilder};
+pub use db::{Config, ConfigError, NodeConfig, Paths, StorageMode, TidalDb, TidalDbBuilder};
 pub use load::DegradationLevel;
 pub use schema::error::ErrorContext;
 pub use schema::{AgentPolicy, TidalError};
--- a/tidal/src/load/mod.rs
+++ b/tidal/src/load/mod.rs
@ -13,7 +13,7 @@ pub use rate_limiter::{RateLimiter, RateLimiterConfig};
 #[derive(Debug, Clone, Copy)]
 pub struct BackpressureConfig {
    /// Maximum pending messages in the WAL channel before rejecting.
-    /// Default: 80% of `DEFAULT_CHANNEL_CAPACITY` (8000 out of 10000).
+    /// Default: 1000 pending batches per ROADMAP.md §m7p2.
    pub queue_depth_threshold: usize,
    /// Suggested retry delay in milliseconds returned to the caller.
    /// Default: 50ms.
@ -23,7 +23,7 @@ pub struct BackpressureConfig {
 impl Default for BackpressureConfig {
    fn default() -> Self {
        Self {
-            queue_depth_threshold: 8_000,
+            queue_depth_threshold: 1_000,
            retry_after_ms: 50,
        }
    }
--- a/tidal/src/replication/crdt/hlc.rs
+++ b/tidal/src/replication/crdt/hlc.rs
@ -0,0 +1,483 @@
 //! Hybrid Logical Clock (HLC) for causal ordering across distributed nodes.
 //!
 //! Reference: Kulkarni et al., 2014 -- "Logical Physical Clocks and Consistent
 //! Snapshots in Globally Distributed Databases".
 //!
 //! The HLC combines a wall-clock component with a bounded logical counter.
 //! On every local event the clock advances to `max(wall, prev_wall)`;
 //! when the wall clock does not advance, the logical counter increments.
 //! On receiving a remote timestamp the clock advances past *both* the local
 //! state and the remote timestamp, preserving the causal "happens-before"
 //! relation.
 //!
 //! # Internal representation
 //!
 //! To guarantee uniqueness under concurrent `now()` calls without a mutex,
 //! the mutable state is packed into a single `AtomicU64`:
 //!
 //! ```text
 //!  bits 63..16  (48 bits)  wall-clock milliseconds since UNIX epoch
 //!  bits 15..0   (16 bits)  logical counter (0..65535)
 //! ```
 //!
 //! A single `compare_exchange` on this packed word atomically advances both
 //! components, eliminating the race window that would exist between two
 //! separate atomics.  The public [`HlcTimestamp`] uses nanoseconds for the
 //! wall clock (ms * `1_000_000`) and a `u32` logical counter for headroom.
 use std::sync::atomic::{AtomicU64, Ordering};
 use crate::replication::shard::ShardId;
 /// Number of bits allocated to the logical counter in the packed word.
 const LOGICAL_BITS: u32 = 16;
 /// Mask for extracting the logical counter from the packed word.
 const LOGICAL_MASK: u64 = (1u64 << LOGICAL_BITS) - 1; // 0xFFFF
 /// Maximum logical counter value before saturation.
 const LOGICAL_MAX: u64 = LOGICAL_MASK;
 /// Pack `(wall_ms, logical)` into a single `u64`.
 const fn pack(wall_ms: u64, logical: u16) -> u64 {
    (wall_ms << LOGICAL_BITS) | (logical as u64)
 }
 /// Unpack a `u64` into `(wall_ms, logical)`.
 const fn unpack(packed: u64) -> (u64, u16) {
    let wall_ms = packed >> LOGICAL_BITS;
    #[allow(clippy::cast_possible_truncation)]
    let logical = (packed & LOGICAL_MASK) as u16;
    (wall_ms, logical)
 }
 // ---------------------------------------------------------------------------
 // HlcTimestamp
 // ---------------------------------------------------------------------------
 /// A point-in-time HLC timestamp.
 ///
 /// Ordering: `(wall_ns, logical, node_id)` -- lexicographic.
 /// Same wall-time events are ordered by logical counter;
 /// ties within a node (impossible in practice) are broken by `node_id`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct HlcTimestamp {
    /// Wall-clock time in nanoseconds since UNIX epoch.
    ///
    /// Internally stored at millisecond granularity (bottom 6 decimal digits
    /// are always zero), but the nanosecond unit is preserved for API
    /// compatibility with other time sources.
    pub wall_ns: u64,
    /// Logical counter -- advances when wall time does not.
    pub logical: u32,
    /// The node (`ShardId.0`) that produced this timestamp.
    pub node_id: u16,
 }
 impl HlcTimestamp {
    /// Create a zero-valued HLC timestamp (minimum value).
    #[must_use]
    pub const fn zero() -> Self {
        Self {
            wall_ns: 0,
            logical: 0,
            node_id: 0,
        }
    }
    /// Convert `wall_ns` to milliseconds (truncating sub-ms precision).
    const fn wall_ms(&self) -> u64 {
        self.wall_ns / 1_000_000
    }
 }
 impl PartialOrd for HlcTimestamp {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
 }
 impl Ord for HlcTimestamp {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.wall_ns
            .cmp(&other.wall_ns)
            .then(self.logical.cmp(&other.logical))
            .then(self.node_id.cmp(&other.node_id))
    }
 }
 // ---------------------------------------------------------------------------
 // Hlc
 // ---------------------------------------------------------------------------
 /// Per-node Hybrid Logical Clock.
 ///
 /// Thread-safe: `now()` and `update()` use a single-word atomic CAS loop
 /// to advance the clock without a mutex.  Multiple threads can call `now()`
 /// concurrently; all returned timestamps are guaranteed unique within the
 /// node.
 ///
 /// # Examples
 ///
 /// ```
 /// use tidaldb::replication::crdt::hlc::{Hlc, HlcTimestamp};
 ///
 /// let clock = Hlc::new(0);
 /// let t1 = clock.now();
 /// let t2 = clock.now();
 /// assert!(t2 >= t1);
 /// ```
 pub struct Hlc {
    node_id: u16,
    /// Packed `(wall_ms << 16 | logical)`.  A single atomic word ensures
    /// that both components are advanced atomically via `compare_exchange`.
    packed: AtomicU64,
 }
 impl Hlc {
    /// Create a new clock for the given node.
    #[must_use]
    pub const fn new(node_id: u16) -> Self {
        Self {
            node_id,
            packed: AtomicU64::new(0),
        }
    }
    /// Create a clock for a [`ShardId`].
    #[must_use]
    pub const fn for_shard(shard: ShardId) -> Self {
        Self::new(shard.0)
    }
    /// Current wall-clock time in milliseconds since UNIX epoch.
    fn wall_ms_now() -> u64 {
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_millis() as u64
    }
    /// Generate a new HLC timestamp for a local event.
    ///
    /// Advances the clock monotonically.  If the wall clock has not
    /// advanced since the last call, the logical counter increments
    /// (saturating at 65535 -- practically unreachable since it would
    /// require that many events within a single millisecond without
    /// the system clock advancing).
    pub fn now(&self) -> HlcTimestamp {
        let wall = Self::wall_ms_now();
        loop {
            // Acquire: ensures we see the latest packed word written by any
            // thread that previously completed a CAS in this loop.
            let cur = self.packed.load(Ordering::Acquire);
            let (cur_wall, cur_logical) = unpack(cur);
            let (new_wall, new_logical) = if wall > cur_wall {
                // Physical clock advanced -- reset the logical counter.
                (wall, 0u16)
            } else {
                // Physical clock did not advance (same or regressed).
                // Stay at the same wall_ms, bump logical.
                #[allow(clippy::cast_possible_truncation)]
                let next_logical = ((u64::from(cur_logical) + 1).min(LOGICAL_MAX)) as u16;
                (cur_wall, next_logical)
            };
            let new_packed = pack(new_wall, new_logical);
            // AcqRel on success: the write is visible to subsequent Acquire
            // loads.  Acquire on failure: re-read the latest value before
            // retry.
            if self
                .packed
                .compare_exchange(cur, new_packed, Ordering::AcqRel, Ordering::Acquire)
                .is_ok()
            {
                return HlcTimestamp {
                    wall_ns: new_wall.saturating_mul(1_000_000),
                    logical: u32::from(new_logical),
                    node_id: self.node_id,
                };
            }
            // CAS failed -- another thread advanced the clock.  Retry.
        }
    }
    /// Update the clock on receiving a remote HLC timestamp.
    ///
    /// Advances the local clock to be causally after `remote`, ensuring
    /// that any subsequent `now()` call produces a timestamp strictly
    /// greater than `remote`.
    ///
    /// Returns the new local timestamp (which is strictly > `remote`).
    pub fn update(&self, remote: HlcTimestamp) -> HlcTimestamp {
        let wall = Self::wall_ms_now();
        let remote_ms = remote.wall_ms();
        // Truncate remote.logical to u16 (safe: if remote.logical > u16::MAX
        // we saturate, which still advances past it).
        #[allow(clippy::cast_possible_truncation)]
        let remote_logical = (remote.logical.min(u32::from(u16::MAX))) as u16;
        loop {
            let cur = self.packed.load(Ordering::Acquire);
            let (cur_wall, cur_logical) = unpack(cur);
            // pt = max(wall, remote_ms, cur_wall)
            // This ensures we never go backwards from any known timestamp.
            let pt = wall.max(remote_ms).max(cur_wall);
            let new_logical = if pt == cur_wall && pt == remote_ms {
                // All three are equal -- must advance past both local
                // and remote logical counters.
                #[allow(clippy::cast_possible_truncation)]
                let l = ((u64::from(cur_logical.max(remote_logical)) + 1).min(LOGICAL_MAX)) as u16;
                l
            } else if pt == cur_wall {
                // Local wall is highest -- advance past local logical.
                #[allow(clippy::cast_possible_truncation)]
                let l = ((u64::from(cur_logical) + 1).min(LOGICAL_MAX)) as u16;
                l
            } else if pt == remote_ms {
                // Remote wall is highest -- advance past remote logical.
                #[allow(clippy::cast_possible_truncation)]
                let l = ((u64::from(remote_logical) + 1).min(LOGICAL_MAX)) as u16;
                l
            } else {
                // Physical clock is strictly ahead of both -- reset.
                0u16
            };
            let new_packed = pack(pt, new_logical);
            if self
                .packed
                .compare_exchange(cur, new_packed, Ordering::AcqRel, Ordering::Acquire)
                .is_ok()
            {
                return HlcTimestamp {
                    wall_ns: pt.saturating_mul(1_000_000),
                    logical: u32::from(new_logical),
                    node_id: self.node_id,
                };
            }
        }
    }
 }
 // Compile-time assertion: Hlc is Send + Sync (all fields are atomics or
 // Copy primitives).
 #[allow(dead_code)]
 const fn _assert_hlc_send_sync() {
    const fn assert_send_sync<T: Send + Sync>() {}
    assert_send_sync::<Hlc>();
 }
 // ---------------------------------------------------------------------------
 // Tests
 // ---------------------------------------------------------------------------
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
    use super::*;
    // -- Pack/unpack round-trip --
    #[test]
    fn pack_unpack_roundtrip() {
        let cases: Vec<(u64, u16)> = vec![
            (0, 0),
            (1, 0),
            (0, 1),
            (u64::MAX >> LOGICAL_BITS, u16::MAX),
            (1_740_000_000_000, 42), // ~2025 in ms
        ];
        for (wall_ms, logical) in cases {
            let packed = pack(wall_ms, logical);
            let (w, l) = unpack(packed);
            assert_eq!(w, wall_ms, "wall_ms mismatch for ({wall_ms}, {logical})");
            assert_eq!(l, logical, "logical mismatch for ({wall_ms}, {logical})");
        }
    }
    // -- HlcTimestamp ordering --
    #[test]
    fn hlc_timestamp_ordering_wall_ns() {
        let a = HlcTimestamp {
            wall_ns: 100,
            logical: 0,
            node_id: 0,
        };
        let b = HlcTimestamp {
            wall_ns: 200,
            logical: 0,
            node_id: 0,
        };
        assert!(a < b);
    }
    #[test]
    fn hlc_timestamp_ordering_logical() {
        let a = HlcTimestamp {
            wall_ns: 100,
            logical: 0,
            node_id: 0,
        };
        let b = HlcTimestamp {
            wall_ns: 100,
            logical: 1,
            node_id: 0,
        };
        assert!(a < b);
    }
    #[test]
    fn hlc_timestamp_ordering_node_id() {
        let a = HlcTimestamp {
            wall_ns: 100,
            logical: 0,
            node_id: 0,
        };
        let b = HlcTimestamp {
            wall_ns: 100,
            logical: 0,
            node_id: 1,
        };
        assert!(a < b);
    }
    // -- Hlc monotonicity --
    #[test]
    fn hlc_monotone_sequential() {
        let clock = Hlc::new(0);
        let mut prev = clock.now();
        for _ in 0..1_000 {
            let next = clock.now();
            assert!(next >= prev, "clock went backwards: {prev:?} > {next:?}");
            prev = next;
        }
    }
    // -- Hlc::update --
    #[test]
    fn hlc_update_advances_past_remote() {
        let clock = Hlc::new(0);
        // ~year 2096 in ms -- far enough in the future to exceed wall clock,
        // small enough that wall_ms * 1_000_000 does not overflow u64.
        let far_future_ms: u64 = 4_000_000_000_000;
        let remote = HlcTimestamp {
            wall_ns: far_future_ms * 1_000_000,
            logical: 99,
            node_id: 1,
        };
        let updated = clock.update(remote);
        assert!(
            updated > remote,
            "updated {updated:?} should be > remote {remote:?}"
        );
    }
    #[test]
    fn hlc_update_advances_past_local() {
        let clock = Hlc::new(0);
        // Advance the local clock first.
        let local = clock.now();
        // Remote is far in the past -- update should still advance past local.
        let remote = HlcTimestamp {
            wall_ns: 1_000_000, // 1 ms
            logical: 0,
            node_id: 1,
        };
        let updated = clock.update(remote);
        assert!(
            updated > local,
            "updated {updated:?} should be > local {local:?}"
        );
    }
    #[test]
    fn hlc_update_same_wall_as_remote_and_local() {
        let clock = Hlc::new(0);
        // ~year 2096 in ms -- far enough in the future to exceed wall clock.
        let far_future_ms: u64 = 4_000_000_000_000;
        let far_future_ns = far_future_ms * 1_000_000;
        let remote1 = HlcTimestamp {
            wall_ns: far_future_ns,
            logical: 10,
            node_id: 1,
        };
        let t1 = clock.update(remote1);
        assert!(t1 > remote1);
        // Now send another remote with the same wall_ns but lower logical.
        let remote2 = HlcTimestamp {
            wall_ns: far_future_ns,
            logical: 5,
            node_id: 2,
        };
        let t2 = clock.update(remote2);
        // t2 must be > both remote2 and t1.
        assert!(t2 > remote2);
        assert!(t2 > t1);
    }
    // -- Concurrent uniqueness --
    #[test]
    fn hlc_concurrent_uniqueness() {
        use std::sync::Arc;
        let clock = Arc::new(Hlc::new(0));
        let mut handles = vec![];
        for _ in 0..4 {
            let c = Arc::clone(&clock);
            handles.push(std::thread::spawn(move || {
                (0..250).map(|_| c.now()).collect::<Vec<_>>()
            }));
        }
        let mut all: Vec<HlcTimestamp> = handles
            .into_iter()
            .flat_map(|h| h.join().unwrap())
            .collect();
        all.sort();
        // All 1000 timestamps must be unique.
        for w in all.windows(2) {
            assert_ne!(w[0], w[1], "duplicate timestamp: {:?}", w[0]);
        }
    }
    // -- Zero --
    #[test]
    fn hlc_zero_is_minimum() {
        let zero = HlcTimestamp::zero();
        let clock = Hlc::new(0);
        let t = clock.now();
        assert!(t > zero);
    }
    // -- for_shard --
    #[test]
    fn hlc_for_shard() {
        let shard = ShardId(42);
        let clock = Hlc::for_shard(shard);
        let t = clock.now();
        assert_eq!(t.node_id, 42);
    }
    // -- wall_ns is at ms granularity --
    #[test]
    fn wall_ns_is_ms_granularity() {
        let clock = Hlc::new(0);
        let t = clock.now();
        assert_eq!(
            t.wall_ns % 1_000_000,
            0,
            "wall_ns should be a multiple of 1_000_000 (ms precision)"
        );
    }
 }
--- a/tidal/src/replication/crdt/lww_register.rs
+++ b/tidal/src/replication/crdt/lww_register.rs
@ -0,0 +1,490 @@
 //! Last-Writer-Wins Register CRDT.
 //!
 //! Resolves concurrent writes by HLC timestamp ordering. Ties broken by
 //! `node_id` (higher wins). Used for hard negatives (hide/mute/block)
 //! which require LWW semantics across distributed nodes.
 //!
 //! # CRDT Properties
 //!
 //! `merge` is commutative, associative, and idempotent -- these properties
 //! are verified by property tests using `proptest`.
 //!
 //! # Design Notes
 //!
 //! The value slot is `Option<T>` where `None` means "not yet written."
 //! This is distinct from a register that has been written with a value and
 //! then cleared -- clearing would require a write with a sentinel value,
 //! not setting the register back to `None`.
 use super::hlc::HlcTimestamp;
 /// Last-Writer-Wins register with HLC timestamp.
 ///
 /// Resolves concurrent writes by [`HlcTimestamp`] ordering:
 /// - Higher `wall_ns` wins
 /// - Same wall, higher `logical` wins
 /// - Same wall + logical, higher `node_id` wins (deterministic tie-break)
 ///
 /// `None` represents "not yet written."
 ///
 /// # Properties
 ///
 /// - **Commutative:** `merge(A, B) == merge(B, A)`
 /// - **Associative:** `merge(A, merge(B, C)) == merge(merge(A, B), C)`
 /// - **Idempotent:** `merge(A, A) == A`
 ///
 /// # Examples
 ///
 /// ```
 /// use tidaldb::replication::crdt::lww_register::LWWRegister;
 /// use tidaldb::replication::crdt::hlc::HlcTimestamp;
 ///
 /// let mut reg: LWWRegister<u8> = LWWRegister::empty();
 /// let ts = HlcTimestamp { wall_ns: 1000, logical: 0, node_id: 0 };
 /// reg.write(42u8, ts);
 /// assert_eq!(reg.get(), Some(&42u8));
 /// ```
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct LWWRegister<T: Clone + PartialEq> {
    value: Option<T>,
    timestamp: Option<HlcTimestamp>,
 }
 impl<T: Clone + PartialEq> LWWRegister<T> {
    /// Create an empty register (no value written yet).
    #[must_use]
    pub const fn empty() -> Self {
        Self {
            value: None,
            timestamp: None,
        }
    }
    /// Write a new value with the given HLC timestamp.
    ///
    /// Only advances the register if `ts > self.timestamp`.
    /// Writes at or before the current timestamp are silently discarded
    /// (they represent causally earlier events).
    pub fn write(&mut self, value: T, ts: HlcTimestamp) {
        if self.timestamp.is_none_or(|cur| ts > cur) {
            self.value = Some(value);
            self.timestamp = Some(ts);
        }
    }
    /// Merge another register into this one.
    ///
    /// The register with the higher HLC timestamp wins.
    /// If both are empty, the result is empty.
    /// If only one has a value, that value is taken.
    pub fn merge(&mut self, other: &Self) {
        if let Some(other_ts) = other.timestamp
            && self.timestamp.is_none_or(|cur| other_ts > cur)
        {
            self.value.clone_from(&other.value);
            self.timestamp = other.timestamp;
        }
    }
    /// Current value of the register.
    #[must_use]
    pub const fn get(&self) -> Option<&T> {
        self.value.as_ref()
    }
    /// The HLC timestamp of the last accepted write.
    #[must_use]
    pub const fn timestamp(&self) -> Option<HlcTimestamp> {
        self.timestamp
    }
    /// Returns `true` if no value has been written.
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.value.is_none()
    }
 }
 impl<T: Clone + PartialEq> Default for LWWRegister<T> {
    fn default() -> Self {
        Self::empty()
    }
 }
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
    use super::*;
    fn ts(wall_ns: u64, logical: u32, node_id: u16) -> HlcTimestamp {
        HlcTimestamp {
            wall_ns,
            logical,
            node_id,
        }
    }
    // ------------------------------------------------------------------
    // Basic write semantics
    // ------------------------------------------------------------------
    #[test]
    fn empty_register_has_no_value() {
        let reg: LWWRegister<u8> = LWWRegister::empty();
        assert!(reg.is_empty());
        assert_eq!(reg.get(), None);
        assert_eq!(reg.timestamp(), None);
    }
    #[test]
    fn write_to_empty_register_succeeds() {
        let mut reg: LWWRegister<u8> = LWWRegister::empty();
        reg.write(42, ts(100, 0, 0));
        assert_eq!(reg.get(), Some(&42));
        assert_eq!(reg.timestamp(), Some(ts(100, 0, 0)));
        assert!(!reg.is_empty());
    }
    #[test]
    fn write_with_higher_wall_overwrites() {
        let mut reg: LWWRegister<u8> = LWWRegister::empty();
        reg.write(1, ts(100, 0, 0));
        reg.write(2, ts(200, 0, 0));
        assert_eq!(reg.get(), Some(&2));
    }
    #[test]
    fn write_with_lower_wall_is_discarded() {
        let mut reg: LWWRegister<u8> = LWWRegister::empty();
        reg.write(2, ts(200, 0, 0));
        reg.write(1, ts(100, 0, 0));
        assert_eq!(reg.get(), Some(&2));
    }
    #[test]
    fn write_with_same_wall_higher_logical_overwrites() {
        let mut reg: LWWRegister<u8> = LWWRegister::empty();
        reg.write(1, ts(100, 0, 0));
        reg.write(2, ts(100, 1, 0));
        assert_eq!(reg.get(), Some(&2));
    }
    #[test]
    fn write_with_same_wall_lower_logical_is_discarded() {
        let mut reg: LWWRegister<u8> = LWWRegister::empty();
        reg.write(2, ts(100, 5, 0));
        reg.write(1, ts(100, 3, 0));
        assert_eq!(reg.get(), Some(&2));
    }
    #[test]
    fn write_with_same_wall_same_logical_higher_node_overwrites() {
        let mut reg: LWWRegister<u8> = LWWRegister::empty();
        reg.write(1, ts(100, 0, 0));
        reg.write(2, ts(100, 0, 1));
        assert_eq!(reg.get(), Some(&2));
    }
    #[test]
    fn write_with_equal_timestamp_is_discarded() {
        let mut reg: LWWRegister<u8> = LWWRegister::empty();
        reg.write(1, ts(100, 0, 0));
        reg.write(99, ts(100, 0, 0));
        // Equal is NOT greater, so the write is discarded.
        assert_eq!(reg.get(), Some(&1));
    }
    // ------------------------------------------------------------------
    // Merge semantics
    // ------------------------------------------------------------------
    #[test]
    fn merge_empty_into_empty() {
        let mut a: LWWRegister<u8> = LWWRegister::empty();
        let b: LWWRegister<u8> = LWWRegister::empty();
        a.merge(&b);
        assert!(a.is_empty());
    }
    #[test]
    fn merge_value_into_empty() {
        let mut a: LWWRegister<u8> = LWWRegister::empty();
        let mut b: LWWRegister<u8> = LWWRegister::empty();
        b.write(42, ts(500, 0, 0));
        a.merge(&b);
        assert_eq!(a.get(), Some(&42));
    }
    #[test]
    fn merge_empty_into_value() {
        let mut a: LWWRegister<u8> = LWWRegister::empty();
        a.write(42, ts(500, 0, 0));
        let b: LWWRegister<u8> = LWWRegister::empty();
        a.merge(&b);
        assert_eq!(a.get(), Some(&42));
    }
    #[test]
    fn merge_takes_higher_timestamp() {
        let mut a: LWWRegister<u8> = LWWRegister::empty();
        a.write(1, ts(100, 0, 0));
        let mut b: LWWRegister<u8> = LWWRegister::empty();
        b.write(2, ts(200, 0, 0));
        a.merge(&b);
        assert_eq!(a.get(), Some(&2));
    }
    #[test]
    fn merge_keeps_self_when_higher() {
        let mut a: LWWRegister<u8> = LWWRegister::empty();
        a.write(1, ts(200, 0, 0));
        let mut b: LWWRegister<u8> = LWWRegister::empty();
        b.write(2, ts(100, 0, 0));
        a.merge(&b);
        assert_eq!(a.get(), Some(&1));
    }
    #[test]
    fn merge_tie_break_by_node_id() {
        // Same wall + logical, higher node_id wins.
        let mut a: LWWRegister<u8> = LWWRegister::empty();
        a.write(1, ts(100, 0, 0));
        let mut b: LWWRegister<u8> = LWWRegister::empty();
        b.write(2, ts(100, 0, 1)); // higher node_id
        let mut merged = a.clone();
        merged.merge(&b);
        assert_eq!(merged.get(), Some(&2));
    }
    // ------------------------------------------------------------------
    // CRDT properties (manual verification)
    // ------------------------------------------------------------------
    #[test]
    fn merge_commutative() {
        let mut a: LWWRegister<u8> = LWWRegister::empty();
        a.write(1, ts(100, 0, 0));
        let mut b: LWWRegister<u8> = LWWRegister::empty();
        b.write(2, ts(200, 0, 1));
        let mut ab = a.clone();
        ab.merge(&b);
        let mut ba = b.clone();
        ba.merge(&a);
        assert_eq!(ab.get(), ba.get());
        assert_eq!(ab.timestamp(), ba.timestamp());
    }
    #[test]
    fn merge_associative() {
        let mut a: LWWRegister<u8> = LWWRegister::empty();
        a.write(1, ts(100, 0, 0));
        let mut b: LWWRegister<u8> = LWWRegister::empty();
        b.write(2, ts(200, 0, 1));
        let mut c: LWWRegister<u8> = LWWRegister::empty();
        c.write(3, ts(150, 0, 2));
        // (a merge b) merge c
        let mut ab_c = a.clone();
        ab_c.merge(&b);
        ab_c.merge(&c);
        // a merge (b merge c)
        let mut bc = b.clone();
        bc.merge(&c);
        let mut a_bc = a.clone();
        a_bc.merge(&bc);
        assert_eq!(ab_c.get(), a_bc.get());
        assert_eq!(ab_c.timestamp(), a_bc.timestamp());
    }
    #[test]
    fn merge_idempotent() {
        let mut reg: LWWRegister<u8> = LWWRegister::empty();
        reg.write(42, ts(100, 0, 0));
        let snapshot = reg.clone();
        reg.merge(&snapshot);
        assert_eq!(reg.get(), Some(&42));
        assert_eq!(reg.timestamp(), Some(ts(100, 0, 0)));
    }
    // ------------------------------------------------------------------
    // Domain-specific: hard negatives
    // ------------------------------------------------------------------
    /// Simulates the hard negative use case: hide has higher HLC, should
    /// survive merge against a concurrent unhide with lower HLC.
    #[test]
    fn hard_negative_hide_wins_with_higher_hlc() {
        let ts_hide = ts(1000, 0, 0);
        let ts_unhide = ts(500, 0, 1);
        let mut local: LWWRegister<&str> = LWWRegister::empty();
        local.write("hide", ts_hide);
        let mut remote: LWWRegister<&str> = LWWRegister::empty();
        remote.write("unhide", ts_unhide);
        local.merge(&remote);
        assert_eq!(local.get(), Some(&"hide"));
    }
    /// Simulates the reverse: unhide at later HLC overrides an earlier hide.
    #[test]
    fn hard_negative_unhide_wins_with_higher_hlc() {
        let ts_hide = ts(500, 0, 0);
        let ts_unhide = ts(1000, 0, 1);
        let mut local: LWWRegister<&str> = LWWRegister::empty();
        local.write("hide", ts_hide);
        let mut remote: LWWRegister<&str> = LWWRegister::empty();
        remote.write("unhide", ts_unhide);
        local.merge(&remote);
        assert_eq!(local.get(), Some(&"unhide"));
    }
    /// When hide and unhide happen at the same wall time + logical, the
    /// higher node_id wins deterministically.
    #[test]
    fn hard_negative_concurrent_same_wall_resolved_by_node_id() {
        let ts_hide = ts(1000, 0, 0);
        let ts_unhide = ts(1000, 0, 5); // higher node_id
        let mut local: LWWRegister<&str> = LWWRegister::empty();
        local.write("hide", ts_hide);
        let mut remote: LWWRegister<&str> = LWWRegister::empty();
        remote.write("unhide", ts_unhide);
        local.merge(&remote);
        assert_eq!(
            local.get(),
            Some(&"unhide"),
            "higher node_id (5 > 0) should win at same wall+logical"
        );
    }
    // ------------------------------------------------------------------
    // Default trait
    // ------------------------------------------------------------------
    #[test]
    fn default_is_empty() {
        let reg: LWWRegister<u8> = LWWRegister::default();
        assert!(reg.is_empty());
    }
 }
 #[cfg(test)]
 mod property_tests {
    use super::*;
    use proptest::prelude::*;
    fn arb_hlc_timestamp() -> impl Strategy<Value = HlcTimestamp> {
        (0..=1_000_000u64, 0..=100u32, 0..=10u16).prop_map(|(w, l, n)| HlcTimestamp {
            wall_ns: w,
            logical: l,
            node_id: n,
        })
    }
    fn arb_register() -> impl Strategy<Value = LWWRegister<u8>> {
        prop_oneof![
            // Empty register
            Just(LWWRegister::empty()),
            // Register with a value
            (any::<u8>(), arb_hlc_timestamp()).prop_map(|(v, ts)| {
                let mut r = LWWRegister::empty();
                r.write(v, ts);
                r
            }),
        ]
    }
    proptest! {
        /// merge(A, B) == merge(B, A) for all registers.
        #[test]
        fn merge_is_commutative(a in arb_register(), b in arb_register()) {
            let mut ab = a.clone();
            ab.merge(&b);
            let mut ba = b.clone();
            ba.merge(&a);
            prop_assert_eq!(ab.get(), ba.get());
            prop_assert_eq!(ab.timestamp(), ba.timestamp());
        }
        /// merge(merge(A, B), C) == merge(A, merge(B, C)) for all registers.
        #[test]
        fn merge_is_associative(
            a in arb_register(),
            b in arb_register(),
            c in arb_register(),
        ) {
            let mut ab_c = a.clone();
            ab_c.merge(&b);
            ab_c.merge(&c);
            let mut bc = b.clone();
            bc.merge(&c);
            let mut a_bc = a.clone();
            a_bc.merge(&bc);
            prop_assert_eq!(ab_c.get(), a_bc.get());
            prop_assert_eq!(ab_c.timestamp(), a_bc.timestamp());
        }
        /// merge(A, A) == A for all registers.
        #[test]
        fn merge_is_idempotent(a in arb_register()) {
            let mut merged = a.clone();
            merged.merge(&a);
            prop_assert_eq!(merged.get(), a.get());
            prop_assert_eq!(merged.timestamp(), a.timestamp());
        }
        /// write with strictly higher timestamp always advances the register.
        #[test]
        fn write_with_higher_ts_always_wins(
            v1 in any::<u8>(),
            v2 in any::<u8>(),
            ts1 in arb_hlc_timestamp(),
        ) {
            // Construct ts2 strictly greater than ts1.
            let ts2 = HlcTimestamp {
                wall_ns: ts1.wall_ns + 1,
                logical: ts1.logical,
                node_id: ts1.node_id,
            };
            let mut reg = LWWRegister::empty();
            reg.write(v1, ts1);
            reg.write(v2, ts2);
            prop_assert_eq!(reg.get(), Some(&v2));
            prop_assert_eq!(reg.timestamp(), Some(ts2));
        }
        /// write with strictly lower timestamp never changes the register.
        #[test]
        fn write_with_lower_ts_never_wins(
            v1 in any::<u8>(),
            v2 in any::<u8>(),
            ts2 in arb_hlc_timestamp(),
        ) {
            // Construct ts1 strictly greater than ts2.
            let ts1 = HlcTimestamp {
                wall_ns: ts2.wall_ns + 1,
                logical: ts2.logical,
                node_id: ts2.node_id,
            };
            let mut reg = LWWRegister::empty();
            reg.write(v1, ts1);
            reg.write(v2, ts2);
            prop_assert_eq!(reg.get(), Some(&v1));
            prop_assert_eq!(reg.timestamp(), Some(ts1));
        }
    }
 }
--- a/tidal/src/replication/crdt/mod.rs
+++ b/tidal/src/replication/crdt/mod.rs
@ -0,0 +1,21 @@
 //! CRDT types for conflict-free distributed state in tidalDB.
 //!
 //! Provides the building blocks for deterministic reconciliation after
 //! network partitions:
 //!
 //! - [`hlc`]: Hybrid Logical Clock for causal ordering
 //! - [`pn_counter`]: Positive-Negative Counter for signal counts
 //! - [`lww_register`]: Last-Writer-Wins Register for hard negatives
 //!
 //! Future tasks will add:
 //! - [`signal_state`]: CRDT-aware signal state wrapping decay + windowed counts
 pub mod hlc;
 pub mod lww_register;
 pub mod pn_counter;
 pub mod signal_state;
 pub use hlc::{Hlc, HlcTimestamp};
 pub use lww_register::LWWRegister;
 pub use pn_counter::PNCounter;
 pub use signal_state::CrdtSignalState;
--- a/tidal/src/replication/crdt/pn_counter.rs
+++ b/tidal/src/replication/crdt/pn_counter.rs
@ -0,0 +1,233 @@
 //! Positive-Negative Counter CRDT.
 //!
 //! A PN-Counter maintains per-node P (increment) and N (decrement) totals.
 //! The global value = sum(P) - sum(N). Merge takes the per-node maximum of
 //! each component, making it commutative, associative, and idempotent.
 use std::collections::HashMap;
 use crate::replication::shard::ShardId;
 /// Positive-Negative Counter CRDT.
 ///
 /// Each node ([`ShardId`]) maintains its own P (increment) and N (decrement)
 /// totals. The global value = `sum(P) - sum(N)`. Merge takes the per-node
 /// max of each component -- safe because values only ever increase within
 /// a node.
 ///
 /// # Properties
 ///
 /// - **Commutative:** `merge(A, B) == merge(B, A)`
 /// - **Associative:** `merge(A, merge(B, C)) == merge(merge(A, B), C)`
 /// - **Idempotent:** `merge(A, A) == A`
 ///
 /// # Examples
 ///
 /// ```
 /// use tidaldb::replication::crdt::PNCounter;
 /// use tidaldb::replication::ShardId;
 ///
 /// let mut a = PNCounter::new();
 /// let mut b = PNCounter::new();
 /// a.increment(ShardId(0), 10);
 /// b.increment(ShardId(1), 5);
 /// a.merge(&b);
 /// assert_eq!(a.value(), 15);
 /// ```
 #[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct PNCounter {
    positive: HashMap<ShardId, u64>,
    negative: HashMap<ShardId, u64>,
 }
 impl PNCounter {
    /// Create a new empty counter.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }
    /// Increment the counter for `node` by `amount`.
    pub fn increment(&mut self, node: ShardId, amount: u64) {
        *self.positive.entry(node).or_default() += amount;
    }
    /// Decrement the counter for `node` by `amount`.
    pub fn decrement(&mut self, node: ShardId, amount: u64) {
        *self.negative.entry(node).or_default() += amount;
    }
    /// Merge another counter into this one.
    ///
    /// Takes the per-node maximum of both P and N components.
    /// Safe because each node's contribution only grows.
    pub fn merge(&mut self, other: &Self) {
        for (&node, &val) in &other.positive {
            let entry = self.positive.entry(node).or_default();
            *entry = (*entry).max(val);
        }
        for (&node, &val) in &other.negative {
            let entry = self.negative.entry(node).or_default();
            *entry = (*entry).max(val);
        }
    }
    /// Returns the current value: `sum(P) - sum(N)`, saturating at 0.
    #[must_use]
    pub fn value(&self) -> u64 {
        let p: u64 = self.positive.values().sum();
        let n: u64 = self.negative.values().sum();
        p.saturating_sub(n)
    }
    /// Total positive contributions across all nodes.
    #[must_use]
    pub fn total_positive(&self) -> u64 {
        self.positive.values().sum()
    }
    /// Total negative contributions across all nodes.
    #[must_use]
    pub fn total_negative(&self) -> u64 {
        self.negative.values().sum()
    }
 }
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
    use super::*;
    #[test]
    fn increment_and_value() {
        let mut c = PNCounter::new();
        c.increment(ShardId(0), 10);
        c.increment(ShardId(1), 5);
        assert_eq!(c.value(), 15);
    }
    #[test]
    fn decrement_saturates_at_zero() {
        let mut c = PNCounter::new();
        c.increment(ShardId(0), 5);
        c.decrement(ShardId(0), 10);
        assert_eq!(c.value(), 0);
    }
    #[test]
    fn merge_commutative() {
        let mut a = PNCounter::new();
        a.increment(ShardId(0), 10);
        let mut b = PNCounter::new();
        b.increment(ShardId(1), 5);
        let mut ab = a.clone();
        ab.merge(&b);
        let mut ba = b.clone();
        ba.merge(&a);
        assert_eq!(ab.value(), ba.value());
        assert_eq!(ab, ba);
    }
    #[test]
    fn merge_idempotent() {
        let mut c = PNCounter::new();
        c.increment(ShardId(0), 42);
        let snapshot = c.clone();
        c.merge(&snapshot);
        assert_eq!(c.value(), snapshot.value());
        assert_eq!(c, snapshot);
    }
    #[test]
    fn merge_associative() {
        let mut a = PNCounter::new();
        a.increment(ShardId(0), 10);
        let mut b = PNCounter::new();
        b.increment(ShardId(1), 20);
        let mut c = PNCounter::new();
        c.increment(ShardId(2), 30);
        // (A merge B) merge C
        let mut ab_c = a.clone();
        ab_c.merge(&b);
        ab_c.merge(&c);
        // A merge (B merge C)
        let mut a_bc = a.clone();
        let mut bc = b.clone();
        bc.merge(&c);
        a_bc.merge(&bc);
        assert_eq!(ab_c.value(), a_bc.value());
        assert_eq!(ab_c, a_bc);
    }
    #[test]
    fn no_double_count_disjoint_nodes() {
        let mut a = PNCounter::new();
        a.increment(ShardId(0), 100);
        let mut b = PNCounter::new();
        b.increment(ShardId(1), 200);
        a.merge(&b);
        assert_eq!(a.value(), 300);
    }
    #[test]
    fn merge_same_node_takes_max() {
        let mut a = PNCounter::new();
        a.increment(ShardId(0), 50);
        let mut b = PNCounter::new();
        b.increment(ShardId(0), 30);
        a.merge(&b);
        // Node 0's positive should be max(50, 30) = 50
        assert_eq!(a.value(), 50);
    }
    #[test]
    fn total_positive_and_negative() {
        let mut c = PNCounter::new();
        c.increment(ShardId(0), 10);
        c.increment(ShardId(1), 5);
        c.decrement(ShardId(0), 3);
        assert_eq!(c.total_positive(), 15);
        assert_eq!(c.total_negative(), 3);
        assert_eq!(c.value(), 12);
    }
    #[test]
    fn empty_counter_is_zero() {
        let c = PNCounter::new();
        assert_eq!(c.value(), 0);
        assert_eq!(c.total_positive(), 0);
        assert_eq!(c.total_negative(), 0);
    }
    #[test]
    fn merge_with_decrements() {
        let mut a = PNCounter::new();
        a.increment(ShardId(0), 100);
        a.decrement(ShardId(0), 20);
        let mut b = PNCounter::new();
        b.increment(ShardId(1), 50);
        b.decrement(ShardId(1), 10);
        a.merge(&b);
        // P = max(100, -) + max(-, 50) = 150
        // N = max(20, -) + max(-, 10) = 30
        assert_eq!(a.value(), 120);
    }
    #[test]
    fn serde_roundtrip() {
        let mut c = PNCounter::new();
        c.increment(ShardId(0), 42);
        c.decrement(ShardId(1), 7);
        let json = serde_json::to_string(&c).expect("serialize");
        let deserialized: PNCounter = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(c, deserialized);
    }
 }
--- a/tidal/src/replication/crdt/signal_state.rs
+++ b/tidal/src/replication/crdt/signal_state.rs
@ -0,0 +1,316 @@
 //! CRDT-aware signal state for per-node decay accumulation.
 //!
 //! `CrdtSignalState` enables deterministic reconciliation after network
 //! partitions by tracking per-node contributions separately. On merge,
 //! each node's contribution is preserved without double-counting.
 use std::collections::HashMap;
 use super::pn_counter::PNCounter;
 use crate::replication::shard::ShardId;
 /// CRDT-aware signal state for a single (entity, `signal_type`) pair.
 ///
 /// Extends decay scoring with per-node accounting that enables correct
 /// merge after partitioned writes. Each node maintains its own running
 /// partial sum; on merge, partial sums are added (since each node processes
 /// distinct WAL segments, these are disjoint contributions).
 ///
 /// # Why addition, not max
 ///
 /// Exponential decay: `S(t) = sum_i(w_i * exp(-lambda * (t - t_i)))`.
 /// Each node contributes its own subset of events. At merge time,
 /// `S_merged = S_node_A + S_node_B` (summing partial sums).
 /// Taking max would silently drop contributions from the smaller node.
 ///
 /// # Properties
 ///
 /// - **Commutative:** `merge(A, B) == merge(B, A)`
 /// - **Associative:** `merge(A, merge(B, C)) == merge(merge(A, B), C)`
 /// - **No double-counting:** merging the same node's state twice does NOT
 ///   increase the score (each node tracks an idempotency flag via its key)
 ///
 /// # Idempotency caveat
 ///
 /// Within a node, `on_signal` accumulates a running score. If a node's
 /// state is merged into a target that already has that node's contribution,
 /// the second merge would double-count. The caller (`ReconciliationEngine`)
 /// is responsible for ensuring each node's state is merged at most once.
 /// In practice, shards process disjoint WAL segments, so this never occurs.
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct CrdtSignalState {
    /// Per-node running decay score.
    ///
    /// Each node contributes its own partial decay sum.
    /// Global score = sum of all node contributions, decayed to query time.
    node_decay_scores: HashMap<ShardId, f64>,
    /// Timestamp of the last event per node (nanoseconds).
    ///
    /// Required to apply additional decay at query time:
    /// `decayed = score * exp(-lambda * (now_ns - last_ns))`.
    node_last_update_ns: HashMap<ShardId, u64>,
    /// Per-node windowed event counters.
    ///
    /// Each node tracks its own event increments via [`PNCounter`].
    /// On merge, per-node buckets are merged (per-node max via [`PNCounter`] merge).
    node_buckets: HashMap<ShardId, PNCounter>,
    /// Exponential decay rate (`lambda = ln(2) / half_life_s`).
    ///
    /// Identical across all nodes for this signal type.
    lambda: f64,
 }
 impl CrdtSignalState {
    /// Create an empty state with the given decay rate.
    #[must_use]
    pub fn new(lambda: f64) -> Self {
        Self {
            node_decay_scores: HashMap::new(),
            node_last_update_ns: HashMap::new(),
            node_buckets: HashMap::new(),
            lambda,
        }
    }
    /// Build a `CrdtSignalState` representing a single node's contribution.
    ///
    /// Used when snapshotting live ledger state: wraps the current node's
    /// running decay score as a per-node CRDT contribution so it can
    /// participate in merge operations.
    #[must_use]
    pub fn from_node_contribution(
        node: ShardId,
        score: f64,
        last_update_ns: u64,
        lambda: f64,
    ) -> Self {
        let mut state = Self::new(lambda);
        state.node_decay_scores.insert(node, score);
        state.node_last_update_ns.insert(node, last_update_ns);
        state
    }
    /// Record a new signal event from `node` at `now_ns` with `weight`.
    ///
    /// Updates the running decay score for this node:
    /// `score = score * exp(-lambda * dt) + weight`.
    #[allow(clippy::cast_precision_loss)] // ns delta fits in f64 mantissa for practical timescales
    pub fn on_signal(&mut self, node: ShardId, weight: f64, now_ns: u64) {
        let last = self.node_last_update_ns.entry(node).or_insert(now_ns);
        let dt = (now_ns.saturating_sub(*last)) as f64 / 1_000_000_000.0;
        let score = self.node_decay_scores.entry(node).or_insert(0.0);
        *score = (*score).mul_add((-self.lambda * dt).exp(), weight);
        *last = now_ns;
    }
    /// Global decay score: sum of all per-node contributions decayed to `now_ns`.
    ///
    /// Iterates over `node_decay_scores` and looks up the corresponding
    /// last-update timestamp by the same key. This key-aligned lookup is
    /// critical -- iterating two `HashMap`s via `.zip()` does NOT guarantee
    /// key alignment and would produce incorrect decay factors.
    #[must_use]
    #[allow(clippy::cast_precision_loss)] // ns delta fits in f64 mantissa for practical timescales
    pub fn decay_score(&self, now_ns: u64) -> f64 {
        self.node_decay_scores
            .iter()
            .map(|(&node, &score)| {
                // Look up the last update time for THIS node (key-aligned).
                let last_ns = self
                    .node_last_update_ns
                    .get(&node)
                    .copied()
                    .unwrap_or(now_ns);
                let dt = (now_ns.saturating_sub(last_ns)) as f64 / 1_000_000_000.0;
                score * (-self.lambda * dt).exp()
            })
            .sum()
    }
    /// Record an event in the windowed counter for `node`.
    pub fn increment_bucket(&mut self, node: ShardId, amount: u64) {
        self.node_buckets
            .entry(node)
            .or_default()
            .increment(node, amount);
    }
    /// Total windowed count: sum of all per-node increments.
    #[must_use]
    pub fn total_count(&self) -> u64 {
        self.node_buckets
            .values()
            .map(PNCounter::total_positive)
            .sum()
    }
    /// Merge another `CrdtSignalState` into this one.
    ///
    /// - Per-node decay scores are **added** (each node contributes distinct events).
    /// - Per-node timestamps are max'd (preserve the latest update time for decay math).
    /// - Per-node bucket counters are merged via [`PNCounter`] (per-node max, idempotent).
    pub fn merge(&mut self, other: &Self) {
        // Add per-node decay scores.
        for (&node, &other_score) in &other.node_decay_scores {
            *self.node_decay_scores.entry(node).or_insert(0.0) += other_score;
        }
        // Max per-node timestamps.
        for (&node, &other_ts) in &other.node_last_update_ns {
            let entry = self.node_last_update_ns.entry(node).or_default();
            *entry = (*entry).max(other_ts);
        }
        // Merge per-node bucket counters.
        for (node, other_bucket) in &other.node_buckets {
            self.node_buckets
                .entry(*node)
                .or_default()
                .merge(other_bucket);
        }
    }
    /// The decay rate (lambda) for this signal type.
    #[must_use]
    pub const fn lambda(&self) -> f64 {
        self.lambda
    }
    /// Number of nodes that have contributed events.
    #[must_use]
    pub fn node_count(&self) -> usize {
        self.node_decay_scores.len()
    }
 }
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
    use super::*;
    const LAMBDA: f64 = std::f64::consts::LN_2 / (7.0 * 24.0 * 3600.0); // 7-day half-life
    #[test]
    fn single_node_decay_score() {
        let mut state = CrdtSignalState::new(LAMBDA);
        let t0 = 1_000_000_000u64;
        state.on_signal(ShardId(0), 1.0, t0);
        let score = state.decay_score(t0);
        assert!((score - 1.0).abs() < 1e-10, "score at t0 should be ~1.0");
    }
    #[test]
    fn two_node_scores_add_on_merge() {
        let t = 1_000_000_000u64;
        let mut a = CrdtSignalState::new(LAMBDA);
        a.on_signal(ShardId(0), 1.0, t);
        let mut b = CrdtSignalState::new(LAMBDA);
        b.on_signal(ShardId(1), 1.0, t);
        let score_a = a.decay_score(t);
        let score_b = b.decay_score(t);
        a.merge(&b);
        let score_merged = a.decay_score(t);
        assert!(
            (score_merged - (score_a + score_b)).abs() < 1e-10,
            "merged score {score_merged} should equal sum {score_a} + {score_b}"
        );
    }
    #[test]
    fn merge_commutative() {
        let t = 1_000_000_000u64;
        let mut a = CrdtSignalState::new(LAMBDA);
        a.on_signal(ShardId(0), 2.0, t);
        let mut b = CrdtSignalState::new(LAMBDA);
        b.on_signal(ShardId(1), 3.0, t);
        let mut ab = a.clone();
        ab.merge(&b);
        let mut ba = b.clone();
        ba.merge(&a);
        let diff = (ab.decay_score(t) - ba.decay_score(t)).abs();
        assert!(diff < 1e-10, "merge should be commutative, diff={diff}");
    }
    #[test]
    fn merge_associative() {
        let t = 1_000_000_000u64;
        let mut a = CrdtSignalState::new(LAMBDA);
        a.on_signal(ShardId(0), 1.0, t);
        let mut b = CrdtSignalState::new(LAMBDA);
        b.on_signal(ShardId(1), 2.0, t);
        let mut c = CrdtSignalState::new(LAMBDA);
        c.on_signal(ShardId(2), 3.0, t);
        let mut a_bc = a.clone();
        let mut bc = b.clone();
        bc.merge(&c);
        a_bc.merge(&bc);
        let mut ab_c = a.clone();
        ab_c.merge(&b);
        ab_c.merge(&c);
        let diff = (a_bc.decay_score(t) - ab_c.decay_score(t)).abs();
        assert!(diff < 1e-10, "merge should be associative, diff={diff}");
    }
    #[test]
    fn windowed_count_no_double_counting() {
        let mut a = CrdtSignalState::new(LAMBDA);
        a.increment_bucket(ShardId(0), 10);
        let mut b = CrdtSignalState::new(LAMBDA);
        b.increment_bucket(ShardId(1), 20);
        a.merge(&b);
        assert_eq!(a.total_count(), 30, "disjoint nodes: count should be sum");
    }
    #[test]
    fn decay_score_aligned_key_lookup() {
        // Verify that decay_score correctly aligns keys between
        // node_decay_scores and node_last_update_ns (was buggy in spec).
        let t0 = 1_000_000_000u64;
        let t1 = t0 + 1_000_000_000; // 1 second later
        let lambda = std::f64::consts::LN_2; // 1-second half-life
        let mut state = CrdtSignalState::new(lambda);
        // Node 0 had events at t0
        state.on_signal(ShardId(0), 1.0, t0);
        // Node 1 had events at t1
        state.on_signal(ShardId(1), 1.0, t1);
        // Query at t1: node 0 should decay, node 1 should not
        let score = state.decay_score(t1);
        // node0: 1.0 * exp(-ln2 * 1.0) = 0.5
        // node1: 1.0 * exp(-ln2 * 0.0) = 1.0
        // total: ~1.5
        assert!(
            (score - 1.5).abs() < 1e-6,
            "aligned key lookup failed, got {score}"
        );
    }
    #[test]
    fn empty_state_decay_is_zero() {
        let state = CrdtSignalState::new(LAMBDA);
        assert_eq!(state.decay_score(1_000_000_000), 0.0);
    }
    #[test]
    fn node_count_tracks_contributors() {
        let mut state = CrdtSignalState::new(LAMBDA);
        assert_eq!(state.node_count(), 0);
        state.on_signal(ShardId(0), 1.0, 1000);
        assert_eq!(state.node_count(), 1);
        state.on_signal(ShardId(1), 1.0, 1000);
        assert_eq!(state.node_count(), 2);
    }
 }
--- a/tidal/src/replication/idempotency.rs
+++ b/tidal/src/replication/idempotency.rs
@ -0,0 +1,213 @@
 //! Idempotency key derivation and bounded LRU store.
 //!
 //! Duplicate session writes arriving via replication are detected in O(1)
 //! time and silently discarded.
 use std::num::NonZeroUsize;
 use std::sync::Mutex;
 use blake3::Hasher;
 use lru::LruCache;
 use crate::wal::format::session::SessionSeqNo;
 /// Per-operation idempotency key derived from session context.
 ///
 /// Derived as: `BLAKE3(session_id_bytes || seqno_bytes || operation_bytes)`.
 /// Stored as `u128` (first 16 bytes of the 32-byte BLAKE3 output).
 /// The 128-bit space gives 2^64 expected first collision at 2^64 operations --
 /// astronomically safe for our use case.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub struct IdempotencyKey(pub u128);
 impl IdempotencyKey {
    /// Derive an idempotency key for a session write operation.
    ///
    /// - `session_id_raw`: the session's `u64` id (from `SessionId::as_u64()`)
    /// - `seqno`: monotonic sequence number for this write
    /// - `operation_bytes`: serialized operation payload
    #[must_use]
    pub fn derive(session_id_raw: u64, seqno: SessionSeqNo, operation_bytes: &[u8]) -> Self {
        let mut hasher = Hasher::new();
        hasher.update(&session_id_raw.to_le_bytes());
        hasher.update(&seqno.0.to_le_bytes());
        hasher.update(operation_bytes);
        let hash = hasher.finalize();
        let out = hash.as_bytes();
        // BLAKE3 output is always 32 bytes; we take the first 16 for u128.
        let bytes = [
            out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7], out[8], out[9],
            out[10], out[11], out[12], out[13], out[14], out[15],
        ];
        Self(u128::from_le_bytes(bytes))
    }
 }
 /// Bounded LRU store for idempotency keys.
 ///
 /// Capacity: configurable (default 100K entries ~ 1.6 MB).
 /// When capacity is reached, the least-recently-seen key is evicted.
 /// Evicted keys return `true` on re-check (look new); the `SessionSeqNo`
 /// high-water-mark provides correctness for these older operations.
 ///
 /// Thread-safe via `Mutex<LruCache>`.
 pub struct IdempotencyStore {
    cache: Mutex<LruCache<IdempotencyKey, ()>>,
    capacity: usize,
 }
 impl IdempotencyStore {
    /// Create a store with the given capacity (must be > 0).
    ///
    /// # Panics
    /// Panics if `capacity == 0`.
    #[must_use]
    pub fn new(capacity: usize) -> Self {
        Self {
            cache: Mutex::new(LruCache::new(
                NonZeroUsize::new(capacity).expect("capacity must be > 0"),
            )),
            capacity,
        }
    }
    /// Create a store with the default 100K capacity.
    #[must_use]
    pub fn default_capacity() -> Self {
        Self::new(100_000)
    }
    /// Check if a key has been seen and record it if not.
    ///
    /// Returns `true` if the key is new (apply the operation).
    /// Returns `false` if the key was already seen (skip -- duplicate).
    ///
    /// # Panics
    /// Panics if the internal mutex is poisoned.
    #[must_use]
    pub fn check_and_record(&self, key: IdempotencyKey) -> bool {
        let mut cache = self.cache.lock().expect("idempotency store lock poisoned");
        if cache.contains(&key) {
            false
        } else {
            cache.put(key, ());
            true
        }
    }
    /// Current number of tracked keys.
    ///
    /// # Panics
    /// Panics if the internal mutex is poisoned.
    #[must_use]
    pub fn len(&self) -> usize {
        self.cache
            .lock()
            .expect("idempotency store lock poisoned")
            .len()
    }
    /// Returns `true` if the store is empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
    /// Configured maximum capacity.
    #[must_use]
    pub const fn capacity(&self) -> usize {
        self.capacity
    }
 }
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
    use super::*;
    fn make_key(sid: u64, seq: u64, op: &[u8]) -> IdempotencyKey {
        IdempotencyKey::derive(sid, SessionSeqNo(seq), op)
    }
    #[test]
    fn derive_is_deterministic() {
        let k1 = make_key(1, 1, b"op");
        let k2 = make_key(1, 1, b"op");
        assert_eq!(k1, k2);
    }
    #[test]
    fn different_inputs_produce_different_keys() {
        let k1 = make_key(1, 1, b"op_a");
        let k2 = make_key(1, 1, b"op_b");
        let k3 = make_key(1, 2, b"op_a");
        let k4 = make_key(2, 1, b"op_a");
        assert_ne!(k1, k2);
        assert_ne!(k1, k3);
        assert_ne!(k1, k4);
    }
    #[test]
    fn store_first_call_returns_true() {
        let store = IdempotencyStore::new(10);
        let key = make_key(1, 1, b"op");
        assert!(store.check_and_record(key));
    }
    #[test]
    fn store_second_call_returns_false() {
        let store = IdempotencyStore::new(10);
        let key = make_key(1, 1, b"op");
        let _ = store.check_and_record(key);
        assert!(!store.check_and_record(key));
    }
    #[test]
    fn store_len_grows_up_to_capacity() {
        let store = IdempotencyStore::new(5);
        for i in 0..5u64 {
            assert!(store.check_and_record(make_key(1, i, b"op")));
        }
        assert_eq!(store.len(), 5);
    }
    #[test]
    fn store_lru_eviction_at_capacity() {
        let store = IdempotencyStore::new(3);
        let k1 = make_key(1, 1, b"op");
        let k2 = make_key(1, 2, b"op");
        let k3 = make_key(1, 3, b"op");
        let k4 = make_key(1, 4, b"op");
        let _ = store.check_and_record(k1);
        let _ = store.check_and_record(k2);
        let _ = store.check_and_record(k3);
        // Capacity hit: inserting k4 evicts k1 (LRU). Order: k2, k3, k4.
        let _ = store.check_and_record(k4);
        // k1 was evicted -> looks new again.
        assert!(store.check_and_record(k1), "evicted key should return true");
        // Re-inserting k1 evicted k2 (LRU). Order: k3, k4, k1.
        // k2 is now evicted, so it also looks new.
        assert!(store.check_and_record(k2), "k2 was evicted by k1 re-insert");
        // k3 should still be present (though now LRU after k2 re-insert evicted it... no):
        // Actually inserting k2 evicts k3 (LRU). Order: k4, k1, k2.
        // k4 is still present.
        assert!(!store.check_and_record(k4));
    }
    #[test]
    fn memory_bound_100k_entries() {
        // u128 = 16 bytes per key; at 100K entries ~ 1.6 MB.
        // Verify the type size is correct.
        let key_size = std::mem::size_of::<IdempotencyKey>();
        assert_eq!(key_size, 16, "IdempotencyKey must be 16 bytes");
        // 16 bytes * 100_000 = 1.6 MB, well within the 10 MB limit.
        assert!(key_size * 100_000 < 10 * 1024 * 1024);
    }
    #[test]
    fn is_empty_initially() {
        let store = IdempotencyStore::new(10);
        assert!(store.is_empty());
        assert_eq!(store.len(), 0);
    }
 }
--- a/tidal/src/replication/in_process.rs
+++ b/tidal/src/replication/in_process.rs
@ -0,0 +1,233 @@
 //! In-process transport for WAL segment shipping via crossbeam channels.
 //!
 //! Used for integration tests and single-process multi-shard deployments.
 //! Each shard gets a unique receiver and a clone of all senders, enabling
 //! any-to-any communication within the process.
 use std::collections::HashMap;
 use crossbeam::channel::{Receiver, Sender, bounded};
 use crate::replication::shard::ShardId;
 use crate::replication::transport::{Transport, TransportError, WalSegmentPayload};
 /// Maximum payload size in bytes (64 MB). Segments larger than this are
 /// rejected to prevent unbounded memory growth in the channel buffer.
 const MAX_PAYLOAD_BYTES: usize = 64 * 1024 * 1024;
 /// Factory for building a set of [`InProcessTransport`] endpoints, one per shard.
 ///
 /// Each transport owns all senders (one per shard) and its own unique receiver.
 /// The channel capacity is 64 segments per shard.
 pub struct InProcessTransportFactory {
    shards: Vec<ShardId>,
 }
 impl InProcessTransportFactory {
    /// Create a factory for the given shard IDs.
    #[must_use]
    pub fn new(shards: &[ShardId]) -> Self {
        Self {
            shards: shards.to_vec(),
        }
    }
    /// Build one `InProcessTransport` per shard.
    ///
    /// Each transport can send to any shard and receives on its own channel.
    ///
    /// # Panics
    ///
    /// Panics if a receiver is missing for a shard -- this is unreachable
    /// because the receiver map is populated from the same shard list.
    #[must_use]
    pub fn build(self) -> HashMap<ShardId, InProcessTransport> {
        let mut senders: HashMap<ShardId, Sender<WalSegmentPayload>> = HashMap::new();
        let mut receivers: HashMap<ShardId, Receiver<WalSegmentPayload>> = HashMap::new();
        for &shard_id in &self.shards {
            let (tx, rx) = bounded(64);
            senders.insert(shard_id, tx);
            receivers.insert(shard_id, rx);
        }
        let mut result = HashMap::new();
        for &local_shard in &self.shards {
            let receiver = receivers
                .remove(&local_shard)
                .expect("receiver missing for shard that was just created");
            let transport = InProcessTransport {
                local_shard,
                senders: senders.clone(),
                receiver,
            };
            result.insert(local_shard, transport);
        }
        result
    }
 }
 /// An in-process transport endpoint backed by crossbeam bounded channels.
 ///
 /// Thread-safe: `Sender` and `Receiver` from crossbeam are `Send + Sync`.
 pub struct InProcessTransport {
    local_shard: ShardId,
    senders: HashMap<ShardId, Sender<WalSegmentPayload>>,
    receiver: Receiver<WalSegmentPayload>,
 }
 impl Transport for InProcessTransport {
    fn send_segment(&self, to: ShardId, payload: WalSegmentPayload) -> Result<(), TransportError> {
        if payload.bytes.len() > MAX_PAYLOAD_BYTES {
            return Err(TransportError::PayloadTooLarge {
                size: payload.bytes.len(),
                max: MAX_PAYLOAD_BYTES,
            });
        }
        let sender = self
            .senders
            .get(&to)
            .ok_or(TransportError::UnknownPeer(to))?;
        sender.send(payload).map_err(|_| TransportError::Closed)
    }
    fn recv_segment(&self) -> Option<WalSegmentPayload> {
        self.receiver.recv().ok()
    }
    fn local_shard(&self) -> ShardId {
        self.local_shard
    }
 }
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
    use super::*;
    use crate::replication::segment_id::WalSegmentId;
    use crate::replication::shard::RegionId;
    fn make_payload(shard: ShardId, seqno: u64) -> WalSegmentPayload {
        WalSegmentPayload {
            id: WalSegmentId::new(RegionId::SINGLE, shard, seqno),
            bytes: vec![0xAB; 100],
            event_count: 5,
        }
    }
    #[test]
    fn factory_creates_one_transport_per_shard() {
        let shards = [ShardId(0), ShardId(1), ShardId(2)];
        let factory = InProcessTransportFactory::new(&shards);
        let transports = factory.build();
        assert_eq!(transports.len(), 3);
        assert!(transports.contains_key(&ShardId(0)));
        assert!(transports.contains_key(&ShardId(1)));
        assert!(transports.contains_key(&ShardId(2)));
    }
    #[test]
    fn send_and_receive_between_shards() {
        let shards = [ShardId(0), ShardId(1)];
        let factory = InProcessTransportFactory::new(&shards);
        let transports = factory.build();
        let t0 = &transports[&ShardId(0)];
        let t1 = &transports[&ShardId(1)];
        // Shard 0 sends to Shard 1.
        let payload = make_payload(ShardId(0), 42);
        t0.send_segment(ShardId(1), payload).unwrap();
        // Shard 1 receives it.
        let received = t1.recv_segment().unwrap();
        assert_eq!(received.id.seqno, 42);
        assert_eq!(received.event_count, 5);
        assert_eq!(received.bytes.len(), 100);
    }
    #[test]
    fn send_to_unknown_peer_fails() {
        let shards = [ShardId(0)];
        let factory = InProcessTransportFactory::new(&shards);
        let transports = factory.build();
        let t0 = &transports[&ShardId(0)];
        let result = t0.send_segment(ShardId(99), make_payload(ShardId(0), 1));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, TransportError::UnknownPeer(ShardId(99))));
    }
    #[test]
    fn payload_too_large_rejected() {
        let shards = [ShardId(0), ShardId(1)];
        let factory = InProcessTransportFactory::new(&shards);
        let transports = factory.build();
        let t0 = &transports[&ShardId(0)];
        let payload = WalSegmentPayload {
            id: WalSegmentId::single_node(1),
            bytes: vec![0u8; MAX_PAYLOAD_BYTES + 1],
            event_count: 0,
        };
        let result = t0.send_segment(ShardId(1), payload);
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            TransportError::PayloadTooLarge { .. }
        ));
    }
    #[test]
    fn local_shard_returns_correct_id() {
        let shards = [ShardId(7), ShardId(8)];
        let factory = InProcessTransportFactory::new(&shards);
        let transports = factory.build();
        assert_eq!(transports[&ShardId(7)].local_shard(), ShardId(7));
        assert_eq!(transports[&ShardId(8)].local_shard(), ShardId(8));
    }
    #[test]
    fn recv_returns_none_when_all_transports_dropped() {
        let shards = [ShardId(0), ShardId(1)];
        let factory = InProcessTransportFactory::new(&shards);
        let transports = factory.build();
        // Extract shard 1's receiver before dropping everything.
        // We need to drop ALL transports (including t1 itself) because
        // each transport holds clones of all senders -- including senders
        // to its own channel.
        let t1_receiver = {
            let t1 = &transports[&ShardId(1)];
            t1.receiver.clone()
        };
        // Drop all transports -- this drops all senders.
        drop(transports);
        // With all senders dropped, recv should return None.
        assert!(t1_receiver.recv().is_err());
    }
    #[test]
    fn multiple_segments_fifo_order() {
        let shards = [ShardId(0), ShardId(1)];
        let factory = InProcessTransportFactory::new(&shards);
        let transports = factory.build();
        let t0 = &transports[&ShardId(0)];
        let t1 = &transports[&ShardId(1)];
        for seq in 1..=5u64 {
            t0.send_segment(ShardId(1), make_payload(ShardId(0), seq))
                .unwrap();
        }
        for expected_seq in 1..=5u64 {
            let received = t1.recv_segment().unwrap();
            assert_eq!(received.id.seqno, expected_seq);
        }
    }
 }
--- a/tidal/src/replication/lag.rs
+++ b/tidal/src/replication/lag.rs
@ -0,0 +1,149 @@
 //! Replication lag gauge: computes per-shard lag in WAL segments.
 //!
 //! The lag is defined as `leader_seqno - applied_seqno` for each shard.
 //! A lag of 0 means the follower is fully caught up. The gauge is designed
 //! for integration into the Prometheus metrics endpoint.
 use std::sync::atomic::{AtomicU64, Ordering};
 use super::shard::ShardId;
 use super::state::ReplicationState;
 /// Tracks the latest known leader seqno per shard and computes the lag
 /// relative to the local `ReplicationState`.
 ///
 /// # Thread Safety
 ///
 /// Uses `AtomicU64` for the leader seqno. The `ReplicationState` it
 /// references is also lock-free. Reading lag is wait-free.
 #[derive(Debug)]
 pub struct ReplicationLagGauge {
    /// The latest seqno observed from the leader for the primary shard.
    /// In single-shard deployments, this is `ShardId::SINGLE`.
    leader_seqno: AtomicU64,
    /// The local replication state (shared with the receiver).
    state: std::sync::Arc<ReplicationState>,
    /// The shard we are tracking.
    shard_id: ShardId,
 }
 impl ReplicationLagGauge {
    /// Create a new lag gauge for a single shard.
    #[must_use]
    pub const fn new(shard_id: ShardId, state: std::sync::Arc<ReplicationState>) -> Self {
        Self {
            leader_seqno: AtomicU64::new(0),
            state,
            shard_id,
        }
    }
    /// Update the leader's latest known seqno.
    ///
    /// Monotonic: smaller values are silently ignored.
    pub fn update_leader_seqno(&self, seqno: u64) {
        // CAS loop to enforce monotonicity (same pattern as ReplicationState::advance).
        let mut current = self.leader_seqno.load(Ordering::Acquire);
        loop {
            if seqno <= current {
                break;
            }
            match self.leader_seqno.compare_exchange_weak(
                current,
                seqno,
                Ordering::AcqRel,
                Ordering::Acquire,
            ) {
                Ok(_) => break,
                Err(actual) => current = actual,
            }
        }
    }
    /// Compute the current lag in WAL segments.
    ///
    /// Returns `leader_seqno - applied_seqno`. If the leader seqno is
    /// unknown (still 0), returns 0.
    #[must_use]
    pub fn lag_segments(&self) -> u64 {
        let leader = self.leader_seqno.load(Ordering::Acquire);
        let applied = self.state.applied_seqno(self.shard_id).unwrap_or(0);
        leader.saturating_sub(applied)
    }
    /// The shard this gauge tracks.
    #[must_use]
    pub const fn shard_id(&self) -> ShardId {
        self.shard_id
    }
    /// The latest leader seqno observed.
    #[must_use]
    pub fn leader_seqno(&self) -> u64 {
        self.leader_seqno.load(Ordering::Acquire)
    }
    /// The latest applied seqno on this node.
    #[must_use]
    pub fn applied_seqno(&self) -> u64 {
        self.state.applied_seqno(self.shard_id).unwrap_or(0)
    }
 }
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
    use std::sync::Arc;
    use super::*;
    #[test]
    fn lag_starts_at_zero() {
        let state = Arc::new(ReplicationState::single());
        let gauge = ReplicationLagGauge::new(ShardId::SINGLE, state);
        assert_eq!(gauge.lag_segments(), 0);
    }
    #[test]
    fn lag_reflects_leader_ahead() {
        let state = Arc::new(ReplicationState::single());
        let gauge = ReplicationLagGauge::new(ShardId::SINGLE, Arc::clone(&state));
        gauge.update_leader_seqno(10);
        assert_eq!(gauge.lag_segments(), 10);
        // Apply some segments
        state.advance(ShardId::SINGLE, 7);
        assert_eq!(gauge.lag_segments(), 3);
    }
    #[test]
    fn lag_zero_when_caught_up() {
        let state = Arc::new(ReplicationState::single());
        let gauge = ReplicationLagGauge::new(ShardId::SINGLE, Arc::clone(&state));
        gauge.update_leader_seqno(5);
        state.advance(ShardId::SINGLE, 5);
        assert_eq!(gauge.lag_segments(), 0);
    }
    #[test]
    fn leader_seqno_is_monotone() {
        let state = Arc::new(ReplicationState::single());
        let gauge = ReplicationLagGauge::new(ShardId::SINGLE, state);
        gauge.update_leader_seqno(10);
        gauge.update_leader_seqno(5); // should be ignored
        assert_eq!(gauge.leader_seqno(), 10);
        gauge.update_leader_seqno(15);
        assert_eq!(gauge.leader_seqno(), 15);
    }
    #[test]
    fn accessors_work() {
        let state = Arc::new(ReplicationState::single());
        state.advance(ShardId::SINGLE, 3);
        let gauge = ReplicationLagGauge::new(ShardId::SINGLE, Arc::clone(&state));
        gauge.update_leader_seqno(7);
        assert_eq!(gauge.shard_id(), ShardId::SINGLE);
        assert_eq!(gauge.leader_seqno(), 7);
        assert_eq!(gauge.applied_seqno(), 3);
        assert_eq!(gauge.lag_segments(), 4);
    }
 }
--- a/tidal/src/replication/mod.rs
+++ b/tidal/src/replication/mod.rs
@ -0,0 +1,33 @@
 //! Replication types and protocols for distributed tidalDB deployments.
 //!
 //! The `replication` module is empty in single-node deployments --
 //! all types default to `shard_id=0`, `region_id=0`, and routing is a no-op.
 pub mod crdt;
 pub mod idempotency;
 pub mod in_process;
 pub mod lag;
 pub mod receiver;
 pub mod reconcile;
 pub mod segment_id;
 pub mod session_bridge;
 pub mod shard;
 pub mod shipper;
 pub mod state;
 pub mod transport;
 pub use crdt::{Hlc, HlcTimestamp};
 pub use idempotency::{IdempotencyKey, IdempotencyStore};
 pub use in_process::{InProcessTransport, InProcessTransportFactory};
 pub use lag::ReplicationLagGauge;
 pub use receiver::{SegmentReceiverHandle, spawn_receiver};
 pub use reconcile::{HardNegAction, MergePlan, ReconciliationEngine, StateSnapshot};
 pub use segment_id::WalSegmentId;
 pub use session_bridge::{
    InProcessSessionTransportFactory, SessionBridgeError, SessionPayload, SessionReplicationBridge,
    SessionShardTransport,
 };
 pub use shard::{EntityIdRange, RegionId, RouterError, RoutingStrategy, ShardId, ShardRouter};
 pub use shipper::{ShipperConfig, WalShipperHandle, spawn_shipper};
 pub use state::ReplicationState;
 pub use transport::{Transport, TransportError, WalSegmentPayload};
--- a/tidal/src/replication/receiver.rs
+++ b/tidal/src/replication/receiver.rs
@ -0,0 +1,283 @@
 //! Segment receiver: consumes WAL segments from the transport and applies
 //! them to the local signal ledger.
 //!
 //! The receiver runs in a background thread, blocking on
 //! [`Transport::recv_segment`] and replaying each batch into the shared
 //! [`SignalLedger`] via `apply_wal_event`. Idempotent replay is ensured by
 //! checking the per-shard high-water-mark in [`ReplicationState`].
 use std::sync::Arc;
 use std::thread::JoinHandle;
 use crate::replication::shard::ShardId;
 use crate::replication::state::ReplicationState;
 use crate::replication::transport::Transport;
 use crate::schema::{EntityId, Timestamp};
 use crate::signals::{SignalLedger, SignalTypeId};
 use crate::wal::format::batch::{HEADER_SIZE, decode_batch};
 /// Handle to a running segment receiver thread.
 ///
 /// Call [`join`](Self::join) to block until the thread exits (triggered by
 /// the transport returning `None` from `recv_segment`).
 pub struct SegmentReceiverHandle {
    thread: Option<JoinHandle<()>>,
 }
 impl SegmentReceiverHandle {
    /// Block until the receiver thread exits.
    pub fn join(mut self) {
        if let Some(handle) = self.thread.take() {
            let _ = handle.join();
        }
    }
 }
 /// Spawn a background thread that receives WAL segments and replays them
 /// into the signal ledger.
 ///
 /// The thread exits when `transport.recv_segment()` returns `None` (transport
 /// closed / shutdown).
 ///
 /// # Panics
 ///
 /// Panics if the OS fails to spawn the background thread.
 pub fn spawn_receiver<T: Transport>(
    transport: Arc<T>,
    ledger: Arc<SignalLedger>,
    replication_state: Arc<ReplicationState>,
 ) -> SegmentReceiverHandle {
    let thread = std::thread::Builder::new()
        .name("tidaldb-segment-receiver".into())
        .spawn(move || {
            loop {
                let Some(payload) = transport.recv_segment() else {
                    tracing::debug!("segment receiver: transport closed, shutting down");
                    return;
                };
                let shard_id = payload.id.shard_id;
                apply_payload(&payload.bytes, shard_id, &ledger, &replication_state);
            }
        })
        .expect("failed to spawn segment receiver thread");
    SegmentReceiverHandle {
        thread: Some(thread),
    }
 }
 /// Apply all batches in a WAL segment payload to the signal ledger.
 ///
 /// Idempotent: batches whose last sequence number is at or below the
 /// replication state's high-water-mark for the source shard are skipped.
 fn apply_payload(
    bytes: &[u8],
    from_shard: ShardId,
    ledger: &SignalLedger,
    state: &ReplicationState,
 ) {
    let mut offset = 0;
    while offset < bytes.len() {
        let remaining = &bytes[offset..];
        if remaining.len() < HEADER_SIZE {
            break;
        }
        match decode_batch(remaining) {
            Ok((header, events)) => {
                let batch_last_seq = if header.event_count > 0 {
                    header.first_seq + u64::from(header.event_count) - 1
                } else {
                    header.first_seq
                };
                // Idempotency: skip if already applied.
                if let Some(applied) = state.applied_seqno(from_shard)
                    && batch_last_seq <= applied
                {
                    let batch_size = HEADER_SIZE + header.payload_len as usize;
                    offset += batch_size;
                    continue;
                }
                // Apply each event to the ledger.
                for event in &events {
                    let signal_type_id = SignalTypeId::new(u16::from(event.signal_type));
                    let entity_id = EntityId::new(event.entity_id);
                    let weight = f64::from(event.weight);
                    let timestamp = Timestamp::from_nanos(event.timestamp_nanos);
                    ledger.apply_wal_event(signal_type_id, entity_id, weight, timestamp);
                }
                // Advance replication high-water-mark.
                state.advance(from_shard, batch_last_seq);
                let batch_size = HEADER_SIZE + header.payload_len as usize;
                offset += batch_size;
            }
            Err(e) => {
                tracing::warn!(
                    error = %e,
                    "receiver: corrupt batch, skipping remainder of payload"
                );
                break;
            }
        }
    }
 }
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
    use std::time::Duration;
    use super::*;
    use crate::replication::segment_id::WalSegmentId;
    use crate::replication::shard::RegionId;
    use crate::schema::{DecaySpec, SchemaBuilder, Window};
    use crate::signals::NoopWalWriter;
    use crate::wal::format::batch::{EventRecord, encode_batch};
    fn make_schema() -> crate::schema::Schema {
        let mut builder = SchemaBuilder::new();
        let _ = builder
            .signal(
                "view",
                crate::schema::EntityKind::Item,
                DecaySpec::Exponential {
                    half_life: Duration::from_secs(7 * 24 * 3600),
                },
            )
            .windows(&[Window::AllTime])
            .velocity(false)
            .add();
        builder.build().unwrap()
    }
    fn make_event(entity_id: u64, signal_type: u8, ts_ns: u64) -> EventRecord {
        EventRecord {
            entity_id,
            signal_type,
            weight: 1.0,
            timestamp_nanos: ts_ns,
        }
    }
    #[test]
    fn apply_payload_updates_ledger() {
        let schema = make_schema();
        let ledger = Arc::new(SignalLedger::new(schema, Box::new(NoopWalWriter)));
        let state = Arc::new(ReplicationState::new(&[ShardId::SINGLE]));
        // Resolve the signal type for "view" to get the correct type id.
        let type_id = ledger.resolve_signal_type("view").unwrap();
        let events = vec![make_event(42, type_id.as_u16() as u8, 1_000_000_000)];
        let bytes = encode_batch(&events, 1, 1).unwrap();
        apply_payload(&bytes, ShardId::SINGLE, &ledger, &state);
        // Verify the ledger was updated.
        assert!(ledger.entries().contains_key(&(EntityId::new(42), type_id)));
        assert_eq!(state.applied_seqno(ShardId::SINGLE), Some(1));
    }
    #[test]
    fn apply_payload_idempotent() {
        let schema = make_schema();
        let ledger = Arc::new(SignalLedger::new(schema, Box::new(NoopWalWriter)));
        let state = Arc::new(ReplicationState::new(&[ShardId::SINGLE]));
        let type_id = ledger.resolve_signal_type("view").unwrap();
        let events = vec![make_event(42, type_id.as_u16() as u8, 1_000_000_000)];
        let bytes = encode_batch(&events, 1, 1).unwrap();
        // Apply once.
        apply_payload(&bytes, ShardId::SINGLE, &ledger, &state);
        // Apply again -- should be idempotent.
        apply_payload(&bytes, ShardId::SINGLE, &ledger, &state);
        assert_eq!(state.applied_seqno(ShardId::SINGLE), Some(1));
    }
    #[test]
    fn apply_payload_multiple_batches() {
        let schema = make_schema();
        let ledger = Arc::new(SignalLedger::new(schema, Box::new(NoopWalWriter)));
        let state = Arc::new(ReplicationState::new(&[ShardId::SINGLE]));
        let type_id = ledger.resolve_signal_type("view").unwrap();
        let e1 = vec![make_event(1, type_id.as_u16() as u8, 100)];
        let e2 = vec![make_event(2, type_id.as_u16() as u8, 200)];
        let mut bytes = encode_batch(&e1, 1, 100).unwrap();
        bytes.extend(encode_batch(&e2, 2, 200).unwrap());
        apply_payload(&bytes, ShardId::SINGLE, &ledger, &state);
        assert!(ledger.entries().contains_key(&(EntityId::new(1), type_id)));
        assert!(ledger.entries().contains_key(&(EntityId::new(2), type_id)));
        assert_eq!(state.applied_seqno(ShardId::SINGLE), Some(2));
    }
    /// A minimal transport that returns one payload then signals shutdown.
    struct OneShot {
        rx: crossbeam::channel::Receiver<crate::replication::WalSegmentPayload>,
    }
    impl Transport for OneShot {
        fn send_segment(
            &self,
            _to: ShardId,
            _payload: crate::replication::WalSegmentPayload,
        ) -> Result<(), crate::replication::TransportError> {
            Ok(())
        }
        fn recv_segment(&self) -> Option<crate::replication::WalSegmentPayload> {
            self.rx.recv().ok()
        }
        fn local_shard(&self) -> ShardId {
            ShardId(1)
        }
    }
    #[test]
    fn receiver_thread_exits_on_transport_close() {
        let (tx, rx) = crossbeam::channel::bounded(4);
        let transport = Arc::new(OneShot { rx });
        let schema = make_schema();
        let ledger = Arc::new(SignalLedger::new(schema, Box::new(NoopWalWriter)));
        let state = Arc::new(ReplicationState::new(&[ShardId(0)]));
        let handle = spawn_receiver(
            Arc::clone(&transport),
            Arc::clone(&ledger),
            Arc::clone(&state),
        );
        // Send one segment.
        let type_id = ledger.resolve_signal_type("view").unwrap();
        let events = vec![make_event(99, type_id.as_u16() as u8, 100)];
        let payload_bytes = encode_batch(&events, 1, 1).unwrap();
        tx.send(crate::replication::WalSegmentPayload {
            id: WalSegmentId::new(RegionId::SINGLE, ShardId(0), 1),
            bytes: payload_bytes,
            event_count: 1,
        })
        .unwrap();
        // Give the receiver a moment to process.
        std::thread::sleep(Duration::from_millis(50));
        // Drop sender -- receiver's recv will return None.
        drop(tx);
        // The receiver should exit gracefully.
        handle.join();
        // Verify the segment was applied.
        assert!(ledger.entries().contains_key(&(EntityId::new(99), type_id)));
        assert_eq!(state.applied_seqno(ShardId(0)), Some(1));
    }
 }
--- a/tidal/src/replication/reconcile.rs
+++ b/tidal/src/replication/reconcile.rs
@ -0,0 +1,340 @@
 //! Reconciliation engine for deterministic merge after network partitions.
 //!
 //! When two `TidalDB` nodes diverge during a partition, each accumulates
 //! independent signal events and hard-negative decisions. After the partition
 //! heals, the `ReconciliationEngine` produces a deterministic `MergePlan`
 //! from their diverged `StateSnapshot`s and applies it to the local state.
 //!
 //! # Merge semantics
 //!
 //! - **Signal states:** CRDT-merged per `(entity, signal_type)`. Each node's
 //!   contribution is summed (disjoint events); timestamps are max'd.
 //! - **Hard negatives:** LWW-resolved per `(user, item)` by HLC timestamp.
 //!   The most recent hide or unhide wins deterministically.
 //!
 //! # Idempotency
 //!
 //! Applying a `MergePlan` is idempotent: applying the same plan twice
 //! produces identical state. This is critical for at-least-once delivery
 //! guarantees during reconnection.
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 use crate::entities::HardNegIndex;
 use crate::replication::crdt::{CrdtSignalState, LWWRegister};
 use crate::schema::EntityId;
 use crate::signals::{SignalLedger, SignalTypeId};
 // ---------------------------------------------------------------------------
 // HardNegAction
 // ---------------------------------------------------------------------------
 /// An action applied to a hard-negative register.
 ///
 /// Stored inside an `LWWRegister<HardNegAction>` and resolved by HLC
 /// timestamp during reconciliation.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum HardNegAction {
    /// The user explicitly hid, muted, or blocked this item.
    Hide,
    /// The user reversed a previous hide (explicit un-hide).
    Unhide,
 }
 // ---------------------------------------------------------------------------
 // StateSnapshot
 // ---------------------------------------------------------------------------
 /// A snapshot of CRDT state for reconciliation.
 ///
 /// Produced by `TidalDb::take_crdt_snapshot()` or constructed manually in
 /// tests. Contains the per-key CRDT state for all entities and hard negatives
 /// that participated in diverged writes.
 #[derive(Debug, Clone, Default)]
 pub struct StateSnapshot {
    /// Per-(entity, `signal_type`) CRDT signal state.
    signal_states: HashMap<(EntityId, SignalTypeId), CrdtSignalState>,
    /// Per-(user, item) LWW hard-negative register.
    hardneg_registers: HashMap<(EntityId, EntityId), LWWRegister<HardNegAction>>,
 }
 impl StateSnapshot {
    /// Create an empty snapshot.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }
    /// Insert a signal state entry.
    pub fn add_signal_state(
        &mut self,
        entity_id: EntityId,
        signal_type_id: SignalTypeId,
        state: CrdtSignalState,
    ) {
        self.signal_states
            .insert((entity_id, signal_type_id), state);
    }
    /// Insert a hard-negative register entry.
    pub fn add_hardneg_register(
        &mut self,
        user_id: EntityId,
        item_id: EntityId,
        register: LWWRegister<HardNegAction>,
    ) {
        self.hardneg_registers.insert((user_id, item_id), register);
    }
    /// Number of signal state entries.
    #[must_use]
    pub fn signal_count(&self) -> usize {
        self.signal_states.len()
    }
    /// Number of hard-negative register entries.
    #[must_use]
    pub fn hardneg_count(&self) -> usize {
        self.hardneg_registers.len()
    }
    /// Iterate over all signal state keys.
    pub fn signal_keys(&self) -> impl Iterator<Item = (EntityId, SignalTypeId)> + '_ {
        self.signal_states.keys().copied()
    }
    /// Get signal state for a key.
    #[must_use]
    pub fn signal_state(&self, key: (EntityId, SignalTypeId)) -> Option<&CrdtSignalState> {
        self.signal_states.get(&key)
    }
    /// Iterate over all hard-negative keys.
    pub fn hardneg_keys(&self) -> impl Iterator<Item = (EntityId, EntityId)> + '_ {
        self.hardneg_registers.keys().copied()
    }
    /// Get hard-negative register for a key.
    #[must_use]
    pub fn hardneg_register(
        &self,
        key: (EntityId, EntityId),
    ) -> Option<&LWWRegister<HardNegAction>> {
        self.hardneg_registers.get(&key)
    }
 }
 // ---------------------------------------------------------------------------
 // MergePlan operations
 // ---------------------------------------------------------------------------
 /// A merge operation for a single signal counter.
 #[derive(Debug, Clone)]
 pub struct SignalMergeOp {
    /// The entity whose signal state is being merged.
    pub entity_id: EntityId,
    /// The signal type being merged.
    pub signal_type_id: SignalTypeId,
    /// The CRDT-merged state (union of both nodes' contributions).
    pub merged_state: CrdtSignalState,
 }
 /// A resolution for a single hard-negative register.
 #[derive(Debug, Clone)]
 pub struct HardNegResolutionOp {
    /// The user whose hard-negative is being resolved.
    pub user_id: EntityId,
    /// The item targeted by the hard-negative.
    pub item_id: EntityId,
    /// Winning action after LWW resolution. `None` means no hard negative
    /// was ever written (both sides were empty).
    pub action: Option<HardNegAction>,
 }
 // ---------------------------------------------------------------------------
 // MergePlan
 // ---------------------------------------------------------------------------
 /// The reconciliation plan: a list of operations to apply.
 ///
 /// Produced by `ReconciliationEngine::plan()`. Applying the plan is
 /// idempotent -- applying it twice produces identical state.
 #[derive(Debug, Clone)]
 pub struct MergePlan {
    /// Signal merge operations (one per diverged entity-signal pair).
    pub signal_merges: Vec<SignalMergeOp>,
    /// Hard-negative resolution operations (one per diverged user-item pair).
    pub hardneg_resolutions: Vec<HardNegResolutionOp>,
 }
 impl MergePlan {
    /// Total number of operations in this plan.
    #[must_use]
    pub const fn operation_count(&self) -> usize {
        self.signal_merges.len() + self.hardneg_resolutions.len()
    }
    /// Whether this plan has no operations (snapshots were identical).
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.signal_merges.is_empty() && self.hardneg_resolutions.is_empty()
    }
 }
 // ---------------------------------------------------------------------------
 // ReconciliationEngine
 // ---------------------------------------------------------------------------
 /// Produces and applies reconciliation plans for partitioned shards.
 ///
 /// The engine is bound to a local `SignalLedger` and `HardNegIndex`.
 /// It does not own or modify the remote state -- the caller provides
 /// snapshots and the engine computes a deterministic merge.
 ///
 /// # Usage
 ///
 /// ```ignore
 /// let engine = ReconciliationEngine::new(
 ///     Arc::clone(&signal_ledger),
 ///     Arc::clone(&hard_neg_index),
 /// );
 /// let plan = engine.plan(&local_snapshot, &remote_snapshot);
 /// engine.apply(&plan)?;
 /// ```
 pub struct ReconciliationEngine {
    signal_ledger: Arc<SignalLedger>,
    hard_neg_index: Arc<HardNegIndex>,
 }
 impl ReconciliationEngine {
    /// Create a new engine bound to the given ledger and hard-neg index.
    #[must_use]
    pub const fn new(signal_ledger: Arc<SignalLedger>, hard_neg_index: Arc<HardNegIndex>) -> Self {
        Self {
            signal_ledger,
            hard_neg_index,
        }
    }
    /// Produce a deterministic merge plan from two diverged state snapshots.
    ///
    /// - Signal states: union of both snapshots, CRDT-merged per
    ///   `(entity, signal_type)`.
    /// - Hard negatives: LWW-resolved per `(user, item)` by HLC timestamp.
    ///
    /// Entities/signals present on only one side are included unchanged
    /// (no data loss -- single-sided state is still valid state).
    #[must_use]
    pub fn plan(&self, local: &StateSnapshot, remote: &StateSnapshot) -> MergePlan {
        // -- Signal merges --
        let signal_keys: HashSet<(EntityId, SignalTypeId)> =
            local.signal_keys().chain(remote.signal_keys()).collect();
        let mut signal_merges = Vec::with_capacity(signal_keys.len());
        for key in signal_keys {
            let local_state = local.signal_state(key);
            let remote_state = remote.signal_state(key);
            let merged = match (local_state, remote_state) {
                (Some(l), Some(r)) => {
                    let mut m = l.clone();
                    m.merge(r);
                    m
                }
                (Some(l), None) => l.clone(),
                (None, Some(r)) => r.clone(),
                (None, None) => continue, // unreachable: key came from one of the iterators
            };
            signal_merges.push(SignalMergeOp {
                entity_id: key.0,
                signal_type_id: key.1,
                merged_state: merged,
            });
        }
        // -- Hard-negative resolutions --
        let neg_keys: HashSet<(EntityId, EntityId)> =
            local.hardneg_keys().chain(remote.hardneg_keys()).collect();
        let mut hardneg_resolutions = Vec::with_capacity(neg_keys.len());
        for key in neg_keys {
            let local_reg = local.hardneg_register(key);
            let remote_reg = remote.hardneg_register(key);
            let resolved = match (local_reg, remote_reg) {
                (Some(l), Some(r)) => {
                    let mut m = l.clone();
                    m.merge(r);
                    m
                }
                (Some(l), None) => l.clone(),
                (None, Some(r)) => r.clone(),
                (None, None) => continue, // unreachable
            };
            hardneg_resolutions.push(HardNegResolutionOp {
                user_id: key.0,
                item_id: key.1,
                action: resolved.get().cloned(),
            });
        }
        MergePlan {
            signal_merges,
            hardneg_resolutions,
        }
    }
    /// Apply a merge plan to the local state.
    ///
    /// Idempotent: applying the same plan twice produces identical state.
    ///
    /// # Errors
    ///
    /// Returns an error if any signal type in the plan is unknown to the
    /// ledger's schema.
    pub fn apply(&self, plan: &MergePlan) -> crate::Result<()> {
        // Apply signal merges.
        for op in &plan.signal_merges {
            self.signal_ledger.apply_crdt_state(
                op.entity_id,
                op.signal_type_id,
                &op.merged_state,
            )?;
        }
        // Apply hard-negative resolutions.
        for op in &plan.hardneg_resolutions {
            // RoaringBitmap uses u32; EntityId wraps u64. Truncation is safe
            // because HardNegIndex was designed for item IDs that fit in u32
            // (RoaringBitmap constraint).
            #[allow(clippy::cast_possible_truncation)]
            let item_id = op.item_id.as_u64() as u32;
            let user_id = op.user_id.as_u64();
            match &op.action {
                Some(HardNegAction::Hide) => {
                    self.hard_neg_index.add(user_id, item_id);
                }
                Some(HardNegAction::Unhide) | None => {
                    // Unhide or empty register: ensure the item is NOT in the
                    // hard-negative set.
                    self.hard_neg_index.remove(user_id, item_id);
                }
            }
        }
        Ok(())
    }
 }
 // ---------------------------------------------------------------------------
 // Tests
 // ---------------------------------------------------------------------------
 #[cfg(test)]
 #[allow(clippy::unwrap_used, clippy::float_cmp, clippy::cast_precision_loss)]
 #[path = "reconcile_tests.rs"]
 mod tests;
--- a/tidal/src/replication/reconcile_tests.rs
+++ b/tidal/src/replication/reconcile_tests.rs
@ -0,0 +1,368 @@
 use std::sync::Arc;
 use std::time::Duration;
 use super::*;
 use crate::entities::HardNegIndex;
 use crate::replication::crdt::hlc::HlcTimestamp;
 use crate::replication::crdt::{CrdtSignalState, LWWRegister};
 use crate::replication::shard::ShardId;
 use crate::schema::{DecaySpec, EntityId, EntityKind, SchemaBuilder, Window};
 use crate::signals::ledger::types::NoopWalWriter;
 use crate::signals::{SignalLedger, SignalTypeId};
 /// 7-day half-life lambda (matches the standard "view" signal).
 const LAMBDA: f64 = std::f64::consts::LN_2 / (7.0 * 24.0 * 3600.0);
 fn test_schema() -> crate::schema::Schema {
    let mut builder = SchemaBuilder::new();
    let _ = builder
        .signal(
            "view",
            EntityKind::Item,
            DecaySpec::Exponential {
                half_life: Duration::from_secs(7 * 24 * 3600),
            },
        )
        .windows(&[Window::AllTime])
        .velocity(false)
        .add();
    builder.build().unwrap()
 }
 fn make_ledger() -> Arc<SignalLedger> {
    Arc::new(SignalLedger::new(test_schema(), Box::new(NoopWalWriter)))
 }
 fn make_engine() -> (ReconciliationEngine, Arc<SignalLedger>, Arc<HardNegIndex>) {
    let ledger = make_ledger();
    let hard_neg = Arc::new(HardNegIndex::new());
    let engine = ReconciliationEngine::new(Arc::clone(&ledger), Arc::clone(&hard_neg));
    (engine, ledger, hard_neg)
 }
 fn ts(wall_ns: u64, logical: u32, node_id: u16) -> HlcTimestamp {
    HlcTimestamp {
        wall_ns,
        logical,
        node_id,
    }
 }
 // -- StateSnapshot construction --
 #[test]
 fn snapshot_empty_by_default() {
    let snap = StateSnapshot::new();
    assert_eq!(snap.signal_count(), 0);
    assert_eq!(snap.hardneg_count(), 0);
 }
 #[test]
 fn snapshot_add_and_retrieve_signal() {
    let mut snap = StateSnapshot::new();
    let entity = EntityId::new(1);
    let sig_id = SignalTypeId::new(0);
    let state = CrdtSignalState::new(LAMBDA);
    snap.add_signal_state(entity, sig_id, state);
    assert_eq!(snap.signal_count(), 1);
    assert!(snap.signal_state((entity, sig_id)).is_some());
    assert!(snap.signal_state((EntityId::new(99), sig_id)).is_none());
 }
 #[test]
 fn snapshot_add_and_retrieve_hardneg() {
    let mut snap = StateSnapshot::new();
    let user = EntityId::new(10);
    let item = EntityId::new(20);
    let mut reg = LWWRegister::empty();
    reg.write(HardNegAction::Hide, ts(1000, 0, 0));
    snap.add_hardneg_register(user, item, reg);
    assert_eq!(snap.hardneg_count(), 1);
    let retrieved = snap.hardneg_register((user, item)).unwrap();
    assert_eq!(retrieved.get(), Some(&HardNegAction::Hide));
 }
 // -- ReconciliationEngine::plan() --
 #[test]
 fn plan_empty_snapshots_produces_empty_plan() {
    let (engine, _, _) = make_engine();
    let plan = engine.plan(&StateSnapshot::new(), &StateSnapshot::new());
    assert!(plan.is_empty());
    assert_eq!(plan.operation_count(), 0);
 }
 #[test]
 fn plan_merges_disjoint_signal_states() {
    let (engine, _, _) = make_engine();
    let entity = EntityId::new(1);
    let sig_id = SignalTypeId::new(0);
    let t = 1_000_000_000u64;
    // Local: node 0 contributed 1.0 at t
    let mut local_snap = StateSnapshot::new();
    let mut local_state = CrdtSignalState::new(LAMBDA);
    local_state.on_signal(ShardId(0), 1.0, t);
    local_snap.add_signal_state(entity, sig_id, local_state);
    // Remote: node 1 contributed 2.0 at t
    let mut remote_snap = StateSnapshot::new();
    let mut remote_state = CrdtSignalState::new(LAMBDA);
    remote_state.on_signal(ShardId(1), 2.0, t);
    remote_snap.add_signal_state(entity, sig_id, remote_state);
    let plan = engine.plan(&local_snap, &remote_snap);
    assert_eq!(plan.signal_merges.len(), 1);
    assert_eq!(plan.hardneg_resolutions.len(), 0);
    // Merged score at t should be ~3.0 (1.0 + 2.0, both at same time)
    let merged_score = plan.signal_merges[0].merged_state.decay_score(t);
    assert!(
        (merged_score - 3.0).abs() < 1e-6,
        "merged score {merged_score} should be ~3.0"
    );
 }
 #[test]
 fn plan_includes_single_sided_signal_state() {
    let (engine, _, _) = make_engine();
    let entity = EntityId::new(1);
    let sig_id = SignalTypeId::new(0);
    let t = 1_000_000_000u64;
    // Only local has state.
    let mut local_snap = StateSnapshot::new();
    let mut local_state = CrdtSignalState::new(LAMBDA);
    local_state.on_signal(ShardId(0), 5.0, t);
    local_snap.add_signal_state(entity, sig_id, local_state);
    let remote_snap = StateSnapshot::new(); // empty
    let plan = engine.plan(&local_snap, &remote_snap);
    assert_eq!(plan.signal_merges.len(), 1);
    let merged_score = plan.signal_merges[0].merged_state.decay_score(t);
    assert!(
        (merged_score - 5.0).abs() < 1e-6,
        "single-sided score {merged_score} should be ~5.0"
    );
 }
 #[test]
 fn plan_hardneg_lww_resolution_hide_wins_higher_hlc() {
    let (engine, _, _) = make_engine();
    let user = EntityId::new(10);
    let item = EntityId::new(20);
    // Local: hide at t=1000
    let mut local_snap = StateSnapshot::new();
    let mut local_reg = LWWRegister::empty();
    local_reg.write(HardNegAction::Hide, ts(1000, 0, 0));
    local_snap.add_hardneg_register(user, item, local_reg);
    // Remote: unhide at t=500 (earlier)
    let mut remote_snap = StateSnapshot::new();
    let mut remote_reg = LWWRegister::empty();
    remote_reg.write(HardNegAction::Unhide, ts(500, 0, 1));
    remote_snap.add_hardneg_register(user, item, remote_reg);
    let plan = engine.plan(&local_snap, &remote_snap);
    assert_eq!(plan.hardneg_resolutions.len(), 1);
    assert_eq!(
        plan.hardneg_resolutions[0].action,
        Some(HardNegAction::Hide),
        "hide at t=1000 should beat unhide at t=500"
    );
 }
 #[test]
 fn plan_hardneg_lww_resolution_unhide_wins_higher_hlc() {
    let (engine, _, _) = make_engine();
    let user = EntityId::new(10);
    let item = EntityId::new(20);
    // Local: hide at t=500
    let mut local_snap = StateSnapshot::new();
    let mut local_reg = LWWRegister::empty();
    local_reg.write(HardNegAction::Hide, ts(500, 0, 0));
    local_snap.add_hardneg_register(user, item, local_reg);
    // Remote: unhide at t=1000 (later)
    let mut remote_snap = StateSnapshot::new();
    let mut remote_reg = LWWRegister::empty();
    remote_reg.write(HardNegAction::Unhide, ts(1000, 0, 1));
    remote_snap.add_hardneg_register(user, item, remote_reg);
    let plan = engine.plan(&local_snap, &remote_snap);
    assert_eq!(plan.hardneg_resolutions.len(), 1);
    assert_eq!(
        plan.hardneg_resolutions[0].action,
        Some(HardNegAction::Unhide),
        "unhide at t=1000 should beat hide at t=500"
    );
 }
 // -- ReconciliationEngine::apply() --
 #[test]
 fn apply_signal_merge_updates_ledger() {
    use crate::schema::Timestamp;
    let (engine, ledger, _) = make_engine();
    let entity = EntityId::new(42);
    let sig_id = SignalTypeId::new(0); // "view" is alphabetically first (only signal)
    let t = Timestamp::now().as_nanos();
    // Build a merged state with two nodes using a recent timestamp.
    let mut merged = CrdtSignalState::new(LAMBDA);
    merged.on_signal(ShardId(0), 3.0, t);
    merged.on_signal(ShardId(1), 7.0, t);
    let plan = MergePlan {
        signal_merges: vec![SignalMergeOp {
            entity_id: entity,
            signal_type_id: sig_id,
            merged_state: merged,
        }],
        hardneg_resolutions: vec![],
    };
    engine.apply(&plan).unwrap();
    // Verify the ledger entry was created and has a positive score.
    let entry = ledger.entries().get(&(entity, sig_id));
    assert!(
        entry.is_some(),
        "ledger should have an entry for the entity"
    );
    let score = entry.unwrap().hot.stored_score(0);
    assert!(score > 0.0, "stored score {score} should be positive");
 }
 #[test]
 fn apply_hardneg_hide_adds_to_index() {
    let (engine, _, hard_neg) = make_engine();
    let user = EntityId::new(10);
    let item = EntityId::new(20);
    let plan = MergePlan {
        signal_merges: vec![],
        hardneg_resolutions: vec![HardNegResolutionOp {
            user_id: user,
            item_id: item,
            action: Some(HardNegAction::Hide),
        }],
    };
    engine.apply(&plan).unwrap();
    assert!(
        hard_neg.is_negative(10, 20),
        "item 20 should be hidden for user 10"
    );
 }
 #[test]
 fn apply_hardneg_unhide_removes_from_index() {
    let (engine, _, hard_neg) = make_engine();
    // Pre-populate a hard negative.
    hard_neg.add(10, 20);
    assert!(hard_neg.is_negative(10, 20));
    let plan = MergePlan {
        signal_merges: vec![],
        hardneg_resolutions: vec![HardNegResolutionOp {
            user_id: EntityId::new(10),
            item_id: EntityId::new(20),
            action: Some(HardNegAction::Unhide),
        }],
    };
    engine.apply(&plan).unwrap();
    assert!(
        !hard_neg.is_negative(10, 20),
        "item 20 should no longer be hidden for user 10"
    );
 }
 #[test]
 fn apply_is_idempotent() {
    use crate::schema::Timestamp;
    let (engine, ledger, hard_neg) = make_engine();
    let entity = EntityId::new(1);
    let sig_id = SignalTypeId::new(0);
    let t = Timestamp::now().as_nanos();
    let mut merged = CrdtSignalState::new(LAMBDA);
    merged.on_signal(ShardId(0), 5.0, t);
    let plan = MergePlan {
        signal_merges: vec![SignalMergeOp {
            entity_id: entity,
            signal_type_id: sig_id,
            merged_state: merged,
        }],
        hardneg_resolutions: vec![HardNegResolutionOp {
            user_id: EntityId::new(10),
            item_id: EntityId::new(20),
            action: Some(HardNegAction::Hide),
        }],
    };
    // Apply twice.
    engine.apply(&plan).unwrap();
    let score_first = ledger
        .entries()
        .get(&(entity, sig_id))
        .unwrap()
        .hot
        .stored_score(0);
    engine.apply(&plan).unwrap();
    let score_second = ledger
        .entries()
        .get(&(entity, sig_id))
        .unwrap()
        .hot
        .stored_score(0);
    // Scores should be very close (not exactly equal because Timestamp::now()
    // advances between the two apply calls, causing slightly different decay).
    // But the key invariant is that the second apply does not double the score.
    let ratio = score_second / score_first;
    assert!(
        (ratio - 1.0).abs() < 0.01,
        "idempotent apply: ratio {ratio} should be ~1.0 (first={score_first}, second={score_second})"
    );
    // Hard-neg should still be set (idempotent add).
    assert!(hard_neg.is_negative(10, 20));
 }
 // -- CrdtSignalState::from_node_contribution --
 #[test]
 fn from_node_contribution_produces_correct_score() {
    let state = CrdtSignalState::from_node_contribution(ShardId(0), 10.0, 1_000_000_000, LAMBDA);
    // At the same timestamp, score should be exactly 10.0.
    let score = state.decay_score(1_000_000_000);
    assert!(
        (score - 10.0).abs() < 1e-10,
        "from_node_contribution score {score} should be 10.0"
    );
 }
 #[test]
 fn from_node_contribution_merges_with_other_nodes() {
    let a = CrdtSignalState::from_node_contribution(ShardId(0), 3.0, 1_000_000_000, LAMBDA);
    let b = CrdtSignalState::from_node_contribution(ShardId(1), 7.0, 1_000_000_000, LAMBDA);
    let mut merged = a.clone();
    merged.merge(&b);
    let score = merged.decay_score(1_000_000_000);
    assert!(
        (score - 10.0).abs() < 1e-10,
        "merged score {score} should be 10.0"
    );
 }
--- a/tidal/src/replication/segment_id.rs
+++ b/tidal/src/replication/segment_id.rs
@ -0,0 +1,182 @@
 use std::cmp::Ordering;
 use std::fmt;
 use std::num::ParseIntError;
 use std::str::FromStr;
 use super::shard::{RegionId, ShardId};
 /// Globally unique identifier for a WAL segment.
 ///
 /// Ordering: by (`region_id`, `shard_id`, seqno) -- allows total ordering
 /// across all segments in the cluster.
 ///
 /// Display: `"r0:s0:42"` -- human-readable for logs and tidalctl output.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub struct WalSegmentId {
    pub region_id: RegionId,
    pub shard_id: ShardId,
    pub seqno: u64,
 }
 impl WalSegmentId {
    #[must_use]
    pub const fn new(region_id: RegionId, shard_id: ShardId, seqno: u64) -> Self {
        Self {
            region_id,
            shard_id,
            seqno,
        }
    }
    /// Create a segment ID for the default single-node deployment.
    #[must_use]
    pub const fn single_node(seqno: u64) -> Self {
        Self::new(RegionId::SINGLE, ShardId::SINGLE, seqno)
    }
 }
 impl PartialOrd for WalSegmentId {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
 }
 impl Ord for WalSegmentId {
    fn cmp(&self, other: &Self) -> Ordering {
        self.region_id
            .cmp(&other.region_id)
            .then(self.shard_id.cmp(&other.shard_id))
            .then(self.seqno.cmp(&other.seqno))
    }
 }
 impl fmt::Display for WalSegmentId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}:{}:{}", self.region_id, self.shard_id, self.seqno)
    }
 }
 /// Error returned when parsing a `WalSegmentId` from a string fails.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum ParseSegmentIdError {
    /// Expected exactly 3 colon-separated parts (`"r0:s0:42"`).
    WrongPartCount,
    /// Region part must start with `'r'`.
    MissingRegionPrefix,
    /// Shard part must start with `'s'`.
    MissingShardPrefix,
    /// A numeric field could not be parsed.
    InvalidNumber(ParseIntError),
 }
 impl fmt::Display for ParseSegmentIdError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::WrongPartCount => write!(f, "expected format \"r<region>:s<shard>:<seqno>\""),
            Self::MissingRegionPrefix => write!(f, "region part must start with 'r'"),
            Self::MissingShardPrefix => write!(f, "shard part must start with 's'"),
            Self::InvalidNumber(e) => write!(f, "invalid number: {e}"),
        }
    }
 }
 impl std::error::Error for ParseSegmentIdError {}
 impl From<ParseIntError> for ParseSegmentIdError {
    fn from(e: ParseIntError) -> Self {
        Self::InvalidNumber(e)
    }
 }
 impl FromStr for WalSegmentId {
    type Err = ParseSegmentIdError;
    /// Parse a segment ID from `"r<region>:s<shard>:<seqno>"` format.
    ///
    /// # Examples
    ///
    /// ```
    /// use tidaldb::replication::segment_id::WalSegmentId;
    /// let id: WalSegmentId = "r0:s0:42".parse().unwrap();
    /// assert_eq!(id.seqno, 42);
    /// ```
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let parts: Vec<&str> = s.split(':').collect();
        if parts.len() != 3 {
            return Err(ParseSegmentIdError::WrongPartCount);
        }
        let region_str = parts[0]
            .strip_prefix('r')
            .ok_or(ParseSegmentIdError::MissingRegionPrefix)?;
        let shard_str = parts[1]
            .strip_prefix('s')
            .ok_or(ParseSegmentIdError::MissingShardPrefix)?;
        let region_id = RegionId(region_str.parse::<u16>()?);
        let shard_id = ShardId(shard_str.parse::<u16>()?);
        let seqno = parts[2].parse::<u64>()?;
        Ok(Self {
            region_id,
            shard_id,
            seqno,
        })
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    #[test]
    fn segment_id_ordering() {
        let a = WalSegmentId::new(RegionId(0), ShardId(0), 1);
        let b = WalSegmentId::new(RegionId(0), ShardId(0), 2);
        let c = WalSegmentId::new(RegionId(0), ShardId(1), 0);
        let d = WalSegmentId::new(RegionId(1), ShardId(0), 0);
        assert!(a < b);
        assert!(b < c);
        assert!(c < d);
    }
    #[test]
    fn segment_id_display() {
        let id = WalSegmentId::new(RegionId(2), ShardId(3), 42);
        assert_eq!(id.to_string(), "r2:s3:42");
    }
    #[test]
    fn single_node_defaults() {
        assert_eq!(ShardId::SINGLE, ShardId(0));
        assert_eq!(RegionId::SINGLE, RegionId(0));
        assert_eq!(WalSegmentId::single_node(99).to_string(), "r0:s0:99");
    }
    #[test]
    fn single_node_ordering_by_seqno() {
        let a = WalSegmentId::single_node(10);
        let b = WalSegmentId::single_node(20);
        assert!(a < b);
    }
    #[test]
    fn parse_roundtrip() {
        let id = WalSegmentId::new(RegionId(2), ShardId(3), 42);
        let s = id.to_string();
        let parsed: WalSegmentId = s.parse().expect("should parse");
        assert_eq!(parsed, id);
    }
    #[test]
    fn parse_single_node() {
        let parsed: WalSegmentId = "r0:s0:42".parse().expect("should parse");
        assert_eq!(parsed, WalSegmentId::single_node(42));
    }
    #[test]
    fn parse_wrong_format_errors() {
        assert!("bad".parse::<WalSegmentId>().is_err());
        assert!("r0:s0".parse::<WalSegmentId>().is_err());
        assert!("0:0:42".parse::<WalSegmentId>().is_err());
        assert!("r0:0:42".parse::<WalSegmentId>().is_err());
        assert!("r0:s0:abc".parse::<WalSegmentId>().is_err());
    }
 }
--- a/tidal/src/replication/session_bridge.rs
+++ b/tidal/src/replication/session_bridge.rs
@ -0,0 +1,585 @@
 //! Session replication bridge.
 //!
 //! Replicates session journal entries to follower nodes on a dedicated
 //! crossbeam channel pair, independent of the WAL segment transport.
 //!
 //! # Architecture
 //!
 //! Session events live on a separate transport from WAL segments because:
 //! 1. Session writes are latency-sensitive (interactive user sessions).
 //! 2. WAL segments are batch-shipped on seal -- different cadence.
 //! 3. Independent flow control: a slow WAL receiver must not block session
 //!    replication and vice versa.
 //!
 //! The bridge provides two layers of deduplication on the receiver side:
 //! - **Layer 1 (`SeqNo` HWM):** `SessionSeqNoTracker` rejects events with
 //!   `seqno <= hwm` for that session. This is the primary defense and is
 //!   O(1) per event.
 //! - **Layer 2 (Idempotency key):** `IdempotencyStore` (bounded LRU) catches
 //!   duplicates that arrive with a fresh seqno due to sender retries after
 //!   ambiguous failures.
 use std::collections::HashMap;
 use std::sync::Arc;
 use blake3::Hasher;
 use crossbeam::channel::{Receiver, Sender, TrySendError, bounded};
 use dashmap::DashMap;
 use crate::replication::idempotency::{IdempotencyKey, IdempotencyStore};
 use crate::replication::shard::ShardId;
 use crate::session::state::SessionSeqNoTracker;
 use crate::wal::format::session::{
    SessionSeqNo, SessionWalEvent, decode_session_events, encode_session_event,
 };
 /// Envelope for session events shipped between shards.
 ///
 /// Uses the existing binary codec (`encode_session_event`) for the payload,
 /// not JSON or bincode -- avoids adding new serialization deps.
 #[derive(Debug, Clone)]
 pub struct SessionPayload {
    /// Source shard that produced these events.
    pub source_shard: ShardId,
    /// Binary-encoded session events (using `encode_session_event`).
    pub bytes: Vec<u8>,
    /// BLAKE3 checksum of `bytes` for integrity verification.
    pub checksum: [u8; 32],
    /// Number of events in this batch.
    pub event_count: u32,
 }
 impl SessionPayload {
    /// Build a payload from a list of session events.
    ///
    /// Encodes each event using the existing `encode_session_event` codec,
    /// concatenates the results, and computes a BLAKE3 checksum over the
    /// entire byte buffer. The checksum is verified by the receiver before
    /// any events are decoded.
    #[must_use]
    pub fn build(source_shard: ShardId, events: &[SessionWalEvent]) -> Self {
        let mut bytes = Vec::new();
        for event in events {
            bytes.extend(encode_session_event(event));
        }
        let checksum = *Hasher::new().update(&bytes).finalize().as_bytes();
        #[allow(clippy::cast_possible_truncation)]
        Self {
            source_shard,
            bytes,
            checksum,
            event_count: events.len() as u32,
        }
    }
    /// Decode and verify the events in this payload.
    ///
    /// # Errors
    ///
    /// Returns `SessionBridgeError::ChecksumMismatch` if the BLAKE3 checksum
    /// of `bytes` does not match the stored `checksum`.
    pub fn decode_and_verify(&self) -> Result<Vec<SessionWalEvent>, SessionBridgeError> {
        let expected = *Hasher::new().update(&self.bytes).finalize().as_bytes();
        if expected != self.checksum {
            return Err(SessionBridgeError::ChecksumMismatch);
        }
        Ok(decode_session_events(&self.bytes))
    }
 }
 /// Errors from the session replication bridge.
 #[derive(Debug, thiserror::Error)]
 pub enum SessionBridgeError {
    /// The BLAKE3 checksum of the payload bytes does not match the
    /// checksum in the envelope header. The payload is corrupted and
    /// must be discarded.
    #[error("session batch checksum mismatch")]
    ChecksumMismatch,
    /// The target shard ID was not registered with the transport factory.
    #[error("unknown peer shard: {0}")]
    UnknownPeer(ShardId),
    /// The session transport channel is full or disconnected.
    #[error("session transport channel closed")]
    Closed,
 }
 /// In-process session transport factory.
 ///
 /// Creates one sender+receiver pair per shard for session payloads,
 /// separate from the WAL segment transport channels.
 pub struct InProcessSessionTransportFactory {
    shards: Vec<ShardId>,
    capacity: usize,
 }
 impl InProcessSessionTransportFactory {
    /// Create a factory for the given shard IDs with default capacity (64).
    #[must_use]
    pub fn new(shards: &[ShardId]) -> Self {
        Self {
            shards: shards.to_vec(),
            capacity: 64,
        }
    }
    /// Create a factory with explicit channel capacity.
    #[must_use]
    pub fn with_capacity(shards: &[ShardId], capacity: usize) -> Self {
        Self {
            shards: shards.to_vec(),
            capacity,
        }
    }
    /// Build one [`SessionShardTransport`] per shard.
    ///
    /// Each transport owns all senders (one per shard) and its own unique
    /// receiver. This mirrors the pattern in
    /// [`super::in_process::InProcessTransportFactory`].
    ///
    /// # Panics
    ///
    /// Panics if a receiver is missing for a shard -- unreachable because
    /// the receiver map is populated from the same shard list.
    #[must_use]
    pub fn build(self) -> HashMap<ShardId, SessionShardTransport> {
        let mut senders: HashMap<ShardId, Sender<SessionPayload>> = HashMap::new();
        let mut receivers: HashMap<ShardId, Receiver<SessionPayload>> = HashMap::new();
        for &shard_id in &self.shards {
            let (tx, rx) = bounded(self.capacity);
            senders.insert(shard_id, tx);
            receivers.insert(shard_id, rx);
        }
        let mut result = HashMap::new();
        for &local_shard in &self.shards {
            let receiver = receivers
                .remove(&local_shard)
                .expect("receiver missing for shard that was just created");
            result.insert(
                local_shard,
                SessionShardTransport {
                    local_shard,
                    senders: senders.clone(),
                    receiver,
                },
            );
        }
        result
    }
 }
 /// One end of an in-process session transport.
 ///
 /// Holds all senders (for shipping to any peer) and its own receiver
 /// (for receiving from any peer). Non-blocking send, blocking receive.
 pub struct SessionShardTransport {
    local_shard: ShardId,
    senders: HashMap<ShardId, Sender<SessionPayload>>,
    receiver: Receiver<SessionPayload>,
 }
 impl SessionShardTransport {
    /// Send a session payload to the `to` shard (non-blocking, best-effort).
    ///
    /// # Errors
    ///
    /// - `SessionBridgeError::UnknownPeer` if `to` is not a registered shard.
    /// - `SessionBridgeError::Closed` if the channel is full or disconnected.
    pub fn send(&self, to: ShardId, payload: SessionPayload) -> Result<(), SessionBridgeError> {
        let tx = self
            .senders
            .get(&to)
            .ok_or(SessionBridgeError::UnknownPeer(to))?;
        tx.try_send(payload).map_err(|e| match e {
            TrySendError::Full(_) | TrySendError::Disconnected(_) => SessionBridgeError::Closed,
        })
    }
    /// Block until a session payload arrives, or return `None` on shutdown.
    ///
    /// Returns `None` when all senders have been dropped (transport closed).
    /// The receiver thread should exit when this returns `None`.
    #[must_use]
    pub fn recv(&self) -> Option<SessionPayload> {
        self.receiver.recv().ok()
    }
    /// The shard identity of this transport endpoint.
    #[must_use]
    pub const fn local_shard(&self) -> ShardId {
        self.local_shard
    }
 }
 /// Session replication bridge.
 ///
 /// Manages shipping un-shipped session journal entries to peer shards and
 /// applying incoming session payloads with idempotency guarantees.
 ///
 /// # Ship side
 ///
 /// `ship()` tracks a per-`(session_id, target_shard)` high-water mark and
 /// only ships events with `seqno > hwm`. After a successful send the HWM
 /// is advanced, so the same events are never re-shipped unless the bridge
 /// is reconstructed (e.g., after crash recovery).
 ///
 /// # Receive side
 ///
 /// `recv_and_apply()` validates the checksum, runs two layers of dedup
 /// (`SeqNo` HWM + idempotency key), then applies each surviving event via
 /// a caller-provided callback.
 pub struct SessionReplicationBridge {
    local_shard: ShardId,
    transport: SessionShardTransport,
    idempotency_store: Arc<IdempotencyStore>,
    seqno_tracker: Arc<SessionSeqNoTracker>,
    /// `(session_id_raw, target_shard)` -> highest seqno shipped.
    ship_hwm: DashMap<(u64, ShardId), SessionSeqNo>,
 }
 impl SessionReplicationBridge {
    /// Create a new bridge with the given transport and dedup state.
    pub fn new(
        transport: SessionShardTransport,
        idempotency_store: Arc<IdempotencyStore>,
        seqno_tracker: Arc<SessionSeqNoTracker>,
    ) -> Self {
        let local_shard = transport.local_shard();
        Self {
            local_shard,
            transport,
            idempotency_store,
            seqno_tracker,
            ship_hwm: DashMap::new(),
        }
    }
    /// Ship a batch of session events to `target`.
    ///
    /// Only events with `session_seqno > current ship HWM` are included.
    /// Updates ship HWM on success. Does nothing if the filtered batch is
    /// empty (returns `Ok(0)`).
    ///
    /// Legacy events (with `session_seqno == None`) are always shipped
    /// because they predate the seqno mechanism.
    ///
    /// # Errors
    ///
    /// Returns `SessionBridgeError::UnknownPeer` or `SessionBridgeError::Closed`
    /// if the transport send fails.
    pub fn ship(
        &self,
        target: ShardId,
        session_id_raw: u64,
        events: &[SessionWalEvent],
    ) -> Result<usize, SessionBridgeError> {
        let hwm_key = (session_id_raw, target);
        let current_hwm = self
            .ship_hwm
            .get(&hwm_key)
            .map_or(SessionSeqNo::ZERO, |v| *v);
        // Filter: only events with seqno > current HWM (or legacy with no seqno).
        let to_ship: Vec<SessionWalEvent> = events
            .iter()
            .filter(|e| {
                let seqno = extract_seqno(e);
                seqno.is_none_or(|s| s > current_hwm)
            })
            .cloned()
            .collect();
        if to_ship.is_empty() {
            return Ok(0);
        }
        let highest = to_ship
            .iter()
            .filter_map(extract_seqno)
            .max()
            .unwrap_or(current_hwm);
        let count = to_ship.len();
        let payload = SessionPayload::build(self.local_shard, &to_ship);
        self.transport.send(target, payload)?;
        self.ship_hwm.insert(hwm_key, highest);
        Ok(count)
    }
    /// Receive and apply one incoming session payload.
    ///
    /// Validates checksum, applies idempotency checks (seqno HWM + key),
    /// then applies each new event via the provided callback.
    ///
    /// Returns the number of events actually applied (after deduplication).
    /// Returns `None` if the transport is shut down.
    pub fn recv_and_apply(&self, mut apply_fn: impl FnMut(&SessionWalEvent)) -> Option<usize> {
        let payload = self.transport.recv()?;
        let events = match payload.decode_and_verify() {
            Ok(e) => e,
            Err(e) => {
                // Log and skip corrupted payloads.
                tracing::warn!(
                    "session_bridge: corrupted payload from {}: {e}",
                    payload.source_shard
                );
                return Some(0);
            }
        };
        let mut applied = 0;
        for event in &events {
            // Layer 1: SeqNo HWM check.
            if let SessionWalEvent::Signal {
                session_id,
                session_seqno: Some(seqno),
                ..
            } = event
                && !self.seqno_tracker.should_apply(*session_id, *seqno)
            {
                continue;
            }
            // Layer 2: Idempotency key check.
            if let SessionWalEvent::Signal {
                session_id,
                session_seqno: Some(seqno),
                idempotency_key: Some(key_int),
                ..
            } = event
            {
                let key = IdempotencyKey::derive(*session_id, *seqno, &key_int.to_le_bytes());
                if !self.idempotency_store.check_and_record(key) {
                    continue;
                }
            }
            apply_fn(event);
            applied += 1;
        }
        Some(applied)
    }
    /// The local shard identity of this bridge.
    #[must_use]
    pub const fn local_shard(&self) -> ShardId {
        self.local_shard
    }
    /// Access the seqno tracker (for test assertions).
    #[must_use]
    pub fn seqno_tracker(&self) -> &SessionSeqNoTracker {
        &self.seqno_tracker
    }
 }
 /// Extract the session seqno from a `SessionWalEvent`, if present.
 const fn extract_seqno(event: &SessionWalEvent) -> Option<SessionSeqNo> {
    match event {
        SessionWalEvent::Signal { session_seqno, .. } => *session_seqno,
        SessionWalEvent::Start { .. } | SessionWalEvent::Close { .. } => None,
    }
 }
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
    use super::*;
    fn two_shards() -> (SessionReplicationBridge, SessionReplicationBridge) {
        let shards = [ShardId(0), ShardId(1)];
        let mut transports = InProcessSessionTransportFactory::new(&shards).build();
        let t0 = transports.remove(&ShardId(0)).unwrap();
        let t1 = transports.remove(&ShardId(1)).unwrap();
        let store0 = Arc::new(IdempotencyStore::new(1000));
        let store1 = Arc::new(IdempotencyStore::new(1000));
        let tracker0 = Arc::new(SessionSeqNoTracker::new());
        let tracker1 = Arc::new(SessionSeqNoTracker::new());
        let b0 = SessionReplicationBridge::new(t0, store0, tracker0);
        let b1 = SessionReplicationBridge::new(t1, store1, tracker1);
        (b0, b1)
    }
    fn signal_event(session_id: u64, seqno: u64) -> SessionWalEvent {
        SessionWalEvent::Signal {
            session_id,
            entity_id: 42,
            weight: 1.0,
            ts_ns: 1_000_000,
            signal_name: "view".to_string(),
            annotation: None,
            session_seqno: Some(SessionSeqNo(seqno)),
            idempotency_key: Some(seqno as u128 * 1000 + session_id as u128),
        }
    }
    #[test]
    fn ship_and_receive_10_events() {
        let (b0, b1) = two_shards();
        let events: Vec<_> = (1..=10).map(|i| signal_event(1, i)).collect();
        let shipped = b0.ship(ShardId(1), 1, &events).unwrap();
        assert_eq!(shipped, 10);
        let mut received = Vec::new();
        let applied = b1.recv_and_apply(|e| received.push(e.clone())).unwrap();
        assert_eq!(applied, 10);
        assert_eq!(received.len(), 10);
    }
    #[test]
    fn duplicate_payload_produces_zero_shipped() {
        let (b0, _b1) = two_shards();
        let events = vec![signal_event(2, 1)];
        b0.ship(ShardId(1), 2, &events).unwrap();
        let applied_first = _b1.recv_and_apply(|_| {}).unwrap();
        assert_eq!(applied_first, 1);
        // Ship the same events again.
        // Since ship_hwm is advanced, b0 skips them.
        let shipped_again = b0.ship(ShardId(1), 2, &events).unwrap();
        assert_eq!(shipped_again, 0, "ship HWM prevents re-shipping");
    }
    #[test]
    fn checksum_mismatch_returns_zero() {
        let shards = [ShardId(0), ShardId(1)];
        let mut transports = InProcessSessionTransportFactory::new(&shards).build();
        let t0 = transports.remove(&ShardId(0)).unwrap();
        let t1 = transports.remove(&ShardId(1)).unwrap();
        let store1 = Arc::new(IdempotencyStore::new(100));
        let tracker1 = Arc::new(SessionSeqNoTracker::new());
        // Send a corrupted payload directly to t1's channel.
        let mut payload = SessionPayload::build(ShardId(0), &[signal_event(3, 1)]);
        payload.checksum[0] ^= 0xFF; // corrupt the checksum
        t0.send(ShardId(1), payload).unwrap();
        let b1 = SessionReplicationBridge::new(t1, store1, tracker1);
        let applied = b1.recv_and_apply(|_| {}).unwrap();
        assert_eq!(applied, 0, "corrupted payload skipped");
    }
    #[test]
    fn seqno_hwm_prevents_duplicate_events_on_receiver() {
        let (b0, b1) = two_shards();
        // Ship seqno 1, 2, 3.
        let events = vec![signal_event(4, 1), signal_event(4, 2), signal_event(4, 3)];
        b0.ship(ShardId(1), 4, &events).unwrap();
        let applied = b1.recv_and_apply(|_| {}).unwrap();
        assert_eq!(applied, 3);
        // Verify HWM advanced to seqno 3 on receiver.
        assert_eq!(b1.seqno_tracker().hwm(4), SessionSeqNo(3));
    }
    #[test]
    fn session_payload_empty_batch_is_noop() {
        let (b0, _b1) = two_shards();
        let shipped = b0.ship(ShardId(1), 5, &[]).unwrap();
        assert_eq!(shipped, 0);
    }
    #[test]
    fn ship_incremental_only_sends_new_events() {
        let (b0, b1) = two_shards();
        // Ship first batch: seqno 1-3.
        let batch1: Vec<_> = (1..=3).map(|i| signal_event(10, i)).collect();
        let shipped1 = b0.ship(ShardId(1), 10, &batch1).unwrap();
        assert_eq!(shipped1, 3);
        let applied1 = b1.recv_and_apply(|_| {}).unwrap();
        assert_eq!(applied1, 3);
        // Ship second batch: seqno 1-5. Only 4 and 5 should be shipped.
        let batch2: Vec<_> = (1..=5).map(|i| signal_event(10, i)).collect();
        let shipped2 = b0.ship(ShardId(1), 10, &batch2).unwrap();
        assert_eq!(shipped2, 2, "only seqno 4 and 5 should be shipped");
        let mut received = Vec::new();
        let applied2 = b1.recv_and_apply(|e| received.push(e.clone())).unwrap();
        assert_eq!(applied2, 2);
    }
    #[test]
    fn payload_build_and_decode_roundtrip() {
        let events: Vec<_> = (1..=5).map(|i| signal_event(1, i)).collect();
        let payload = SessionPayload::build(ShardId(0), &events);
        assert_eq!(payload.event_count, 5);
        assert_eq!(payload.source_shard, ShardId(0));
        let decoded = payload.decode_and_verify().unwrap();
        assert_eq!(decoded.len(), 5);
        for (orig, dec) in events.iter().zip(decoded.iter()) {
            assert_eq!(orig, dec);
        }
    }
    #[test]
    fn payload_decode_corrupt_bytes_fails() {
        let mut payload = SessionPayload::build(ShardId(0), &[signal_event(1, 1)]);
        // Flip a bit in the payload bytes.
        if !payload.bytes.is_empty() {
            payload.bytes[0] ^= 0x01;
        }
        let result = payload.decode_and_verify();
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            SessionBridgeError::ChecksumMismatch
        ));
    }
    #[test]
    fn send_to_unknown_peer_fails() {
        let (b0, _b1) = two_shards();
        let events = vec![signal_event(1, 1)];
        let result = b0.ship(ShardId(99), 1, &events);
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            SessionBridgeError::UnknownPeer(ShardId(99))
        ));
    }
    #[test]
    fn factory_with_capacity_builds_correctly() {
        let shards = [ShardId(0), ShardId(1)];
        let transports = InProcessSessionTransportFactory::with_capacity(&shards, 8).build();
        assert_eq!(transports.len(), 2);
        assert!(transports.contains_key(&ShardId(0)));
        assert!(transports.contains_key(&ShardId(1)));
    }
    #[test]
    fn start_and_close_events_always_applied() {
        let (b0, b1) = two_shards();
        let events = vec![
            SessionWalEvent::Start {
                session_id: 20,
                user_id: 100,
                started_at_ns: 1_000,
                agent_id: "agent".to_string(),
                policy_name: "default".to_string(),
            },
            signal_event(20, 1),
            SessionWalEvent::Close { session_id: 20 },
        ];
        // Manually build and send (Start/Close have no seqno, always shipped).
        let payload = SessionPayload::build(b0.local_shard, &events);
        b0.transport.send(ShardId(1), payload).unwrap();
        let mut received = Vec::new();
        let applied = b1.recv_and_apply(|e| received.push(e.clone())).unwrap();
        assert_eq!(applied, 3, "Start + Signal + Close all applied");
    }
 }
--- a/tidal/src/replication/shard.rs
+++ b/tidal/src/replication/shard.rs
@ -0,0 +1,273 @@
 use std::fmt;
 use crate::schema::EntityId;
 /// Uniquely identifies a shard within the cluster.
 ///
 /// A shard owns a contiguous range of `EntityId`s for a given `EntityKind`.
 /// `ShardId(0)` is the default single-node shard.
 #[derive(
    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, serde::Serialize, serde::Deserialize,
 )]
 pub struct ShardId(pub u16);
 impl ShardId {
    /// The default single-node shard.
    pub const SINGLE: Self = Self(0);
 }
 impl fmt::Display for ShardId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "s{}", self.0)
    }
 }
 /// Uniquely identifies a region in the cluster.
 ///
 /// `RegionId(0)` is the default single-node region.
 #[derive(
    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, serde::Serialize, serde::Deserialize,
 )]
 pub struct RegionId(pub u16);
 impl RegionId {
    /// The default single-node region.
    pub const SINGLE: Self = Self(0);
 }
 impl fmt::Display for RegionId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "r{}", self.0)
    }
 }
 // ── EntityIdRange ──────────────────────────────────────────────────────────
 /// A contiguous, half-open range of `EntityId`s: `[start, end)`.
 ///
 /// The sentinel value `end = u64::MAX` means the range extends to include
 /// *all* entity IDs >= `start`, including `u64::MAX` itself. This is necessary
 /// because a true half-open range cannot represent "up to and including the
 /// maximum u64 value" without overflow.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct EntityIdRange {
    /// Inclusive lower bound.
    pub start: u64,
    /// Exclusive upper bound. `u64::MAX` is a sentinel meaning "includes the last entity".
    pub end: u64,
 }
 impl EntityIdRange {
    /// Returns `true` if the given raw entity ID falls within this range.
    ///
    /// When `end == u64::MAX`, the range covers `[start, u64::MAX]` (inclusive
    /// on both sides) to handle the sentinel convention.
    #[must_use]
    pub const fn contains(&self, id: u64) -> bool {
        id >= self.start && (self.end == u64::MAX || id < self.end)
    }
    /// The full `u64` space -- the default for single-shard deployments.
    #[must_use]
    pub const fn full() -> Self {
        Self {
            start: 0,
            end: u64::MAX,
        }
    }
 }
 // ── RoutingStrategy ────────────────────────────────────────────────────────
 /// Strategy for mapping an `EntityId` to a `ShardId`.
 ///
 /// - `Single`: all entities live on `ShardId(0)`. This is the default for
 ///   single-node tidalDB and introduces zero overhead.
 /// - `Hash`: deterministic `FNV-1a(entity_id) % num_shards` routing. Good for
 ///   uniform distribution but makes range scans across shards expensive.
 /// - `Range`: each shard owns a contiguous `[start, end)` of entity IDs. Good
 ///   for ordered scans but requires careful split-point selection.
 #[derive(Debug, Clone)]
 pub enum RoutingStrategy {
    /// All entities route to the default single shard.
    Single,
    /// Hash-based routing: `FNV-1a(entity_id) % num_shards`.
    Hash { num_shards: u16 },
    /// Range-based routing: each shard owns a contiguous range of `EntityId`s.
    /// Ranges must be sorted, non-overlapping, and cover the full `u64` space.
    Range(Vec<(ShardId, EntityIdRange)>),
 }
 // ── RouterError ────────────────────────────────────────────────────────────
 /// Errors that can occur when constructing a `ShardRouter`.
 ///
 /// All variants are construction-time errors -- once a router is built,
 /// `route()` is infallible.
 #[derive(Debug, thiserror::Error)]
 pub enum RouterError {
    #[error("shard count must be > 0")]
    ZeroShards,
    #[error("range list is empty")]
    EmptyRanges,
    #[error("gap in range coverage: expected start {expected}, found {found}")]
    Gap { expected: u64, found: u64 },
    #[error("empty range starting at {start}")]
    EmptyRange { start: u64 },
    #[error("ranges don't cover full u64 space: ends at {ends_at}")]
    IncompleteCoverage { ends_at: u64 },
 }
 // ── ShardRouter ────────────────────────────────────────────────────────────
 /// Routes `EntityId`s to `ShardId`s according to a configured strategy.
 ///
 /// Construction validates all invariants (no gaps, full coverage, non-zero
 /// shard count) so that `route()` is infallible and branch-free on the hot
 /// path for `Single` and `Hash` strategies.
 #[derive(Debug, Clone)]
 pub struct ShardRouter {
    strategy: RoutingStrategy,
 }
 impl ShardRouter {
    /// Single-shard router. All entities map to `ShardId(0)`.
    #[must_use]
    pub const fn single() -> Self {
        Self {
            strategy: RoutingStrategy::Single,
        }
    }
    /// Hash-based router. `num_shards` must be > 0.
    ///
    /// # Errors
    ///
    /// Returns `RouterError::ZeroShards` if `num_shards == 0`.
    pub const fn hash(num_shards: u16) -> Result<Self, RouterError> {
        if num_shards == 0 {
            return Err(RouterError::ZeroShards);
        }
        Ok(Self {
            strategy: RoutingStrategy::Hash { num_shards },
        })
    }
    /// Range-based router. Ranges must be sorted by start, contiguous,
    /// non-empty, and cover the full `u64` space (last range must end at
    /// `u64::MAX`).
    ///
    /// # Errors
    ///
    /// Returns a `RouterError` variant describing the first validation failure.
    pub fn range(mut ranges: Vec<(ShardId, EntityIdRange)>) -> Result<Self, RouterError> {
        Self::validate_ranges(&mut ranges)?;
        Ok(Self {
            strategy: RoutingStrategy::Range(ranges),
        })
    }
    /// Map an entity to its owning shard. Infallible after construction.
    #[must_use]
    pub fn route(&self, entity_id: EntityId) -> ShardId {
        let raw = entity_id.as_u64();
        match &self.strategy {
            RoutingStrategy::Single => ShardId::SINGLE,
            RoutingStrategy::Hash { num_shards } => {
                let h = fnv1a_hash(raw);
                // SAFETY (arithmetic): num_shards > 0 is validated at construction.
                #[allow(clippy::cast_possible_truncation)]
                let idx = (h % u64::from(*num_shards)) as u16;
                ShardId(idx)
            }
            RoutingStrategy::Range(ranges) => {
                // Binary search for the range containing `raw`.
                // Ranges are sorted by start and validated to be contiguous,
                // so exactly one range will match.
                let idx = ranges.partition_point(|(_, r)| r.start <= raw);
                // partition_point returns the first index where `r.start > raw`,
                // so the containing range is at idx - 1.
                // idx is always >= 1 because ranges[0].start == 0 <= raw.
                let (shard, _) = &ranges[idx - 1];
                *shard
            }
        }
    }
    /// Returns all shard IDs managed by this router, in sorted order.
    #[must_use]
    pub fn all_shards(&self) -> Vec<ShardId> {
        match &self.strategy {
            RoutingStrategy::Single => vec![ShardId::SINGLE],
            RoutingStrategy::Hash { num_shards } => (0..*num_shards).map(ShardId).collect(),
            RoutingStrategy::Range(ranges) => {
                let mut shards: Vec<ShardId> = ranges.iter().map(|(s, _)| *s).collect();
                shards.sort();
                shards.dedup();
                shards
            }
        }
    }
    /// Validate that ranges are sorted, contiguous, non-empty, and cover the
    /// full `u64` space. Sorts the input by `start` before checking.
    fn validate_ranges(ranges: &mut [(ShardId, EntityIdRange)]) -> Result<(), RouterError> {
        if ranges.is_empty() {
            return Err(RouterError::EmptyRanges);
        }
        // Sort by range start so callers don't need to pre-sort.
        ranges.sort_by_key(|(_, r)| r.start);
        let mut expected_start: u64 = 0;
        for (_, range) in ranges.iter() {
            // Check for gaps.
            if range.start != expected_start {
                return Err(RouterError::Gap {
                    expected: expected_start,
                    found: range.start,
                });
            }
            // Check for empty ranges (start == end, and end != u64::MAX which is sentinel).
            if range.start == range.end {
                return Err(RouterError::EmptyRange { start: range.start });
            }
            // An empty range can also occur if start > end, but with u64 and
            // the sentinel convention, start < end OR end == u64::MAX is
            // the only valid case. start > end is unreachable after the gap
            // check because expected_start only advances forward.
            expected_start = range.end;
        }
        // The last range must end at u64::MAX (the sentinel for "covers to the end").
        if expected_start != u64::MAX {
            return Err(RouterError::IncompleteCoverage {
                ends_at: expected_start,
            });
        }
        Ok(())
    }
 }
 /// FNV-1a hash of a `u64` value, used for hash-based shard routing.
 ///
 /// FNV-1a is chosen for its simplicity, speed, and excellent distribution
 /// on integer keys. It is *not* cryptographic -- we only need uniform
 /// distribution across shard buckets.
 fn fnv1a_hash(value: u64) -> u64 {
    const FNV_OFFSET: u64 = 14_695_981_039_346_656_037;
    const FNV_PRIME: u64 = 1_099_511_628_211;
    let mut hash = FNV_OFFSET;
    for byte in &value.to_le_bytes() {
        hash ^= u64::from(*byte);
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    hash
 }
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 #[path = "shard_tests.rs"]
 mod tests;
--- a/tidal/src/replication/shard_tests.rs
+++ b/tidal/src/replication/shard_tests.rs
@ -0,0 +1,395 @@
 use super::*;
 // ── ShardId / RegionId tests (existing) ────────────────────────────────
 #[test]
 fn shard_id_single_is_zero() {
    assert_eq!(ShardId::SINGLE, ShardId(0));
 }
 #[test]
 fn region_id_single_is_zero() {
    assert_eq!(RegionId::SINGLE, RegionId(0));
 }
 #[test]
 fn shard_id_display() {
    assert_eq!(ShardId(3).to_string(), "s3");
    assert_eq!(ShardId(0).to_string(), "s0");
 }
 #[test]
 fn region_id_display() {
    assert_eq!(RegionId(2).to_string(), "r2");
    assert_eq!(RegionId(0).to_string(), "r0");
 }
 #[test]
 fn shard_id_ordering() {
    assert!(ShardId(0) < ShardId(1));
    assert!(ShardId(1) < ShardId(100));
 }
 #[test]
 fn region_id_ordering() {
    assert!(RegionId(0) < RegionId(1));
 }
 // ── EntityIdRange tests ────────────────────────────────────────────────
 #[test]
 fn entity_id_range_contains_basic() {
    let range = EntityIdRange { start: 10, end: 20 };
    assert!(!range.contains(9));
    assert!(range.contains(10));
    assert!(range.contains(15));
    assert!(range.contains(19));
    assert!(!range.contains(20));
 }
 #[test]
 fn entity_id_range_contains_sentinel_end() {
    // end = u64::MAX means "includes everything from start onward"
    let range = EntityIdRange {
        start: 100,
        end: u64::MAX,
    };
    assert!(!range.contains(99));
    assert!(range.contains(100));
    assert!(range.contains(u64::MAX - 1));
    assert!(range.contains(u64::MAX)); // sentinel: inclusive of MAX
 }
 #[test]
 fn entity_id_range_full_covers_everything() {
    let full = EntityIdRange::full();
    assert!(full.contains(0));
    assert!(full.contains(u64::MAX / 2));
    assert!(full.contains(u64::MAX));
 }
 // ── ShardRouter: Single ────────────────────────────────────────────────
 #[test]
 fn single_router_always_returns_shard_zero() {
    let router = ShardRouter::single();
    assert_eq!(router.route(EntityId::new(0)), ShardId::SINGLE);
    assert_eq!(router.route(EntityId::new(42)), ShardId::SINGLE);
    assert_eq!(router.route(EntityId::new(u64::MAX)), ShardId::SINGLE);
 }
 #[test]
 fn single_router_all_shards() {
    let router = ShardRouter::single();
    assert_eq!(router.all_shards(), vec![ShardId::SINGLE]);
 }
 // ── ShardRouter: Hash ──────────────────────────────────────────────────
 #[test]
 fn hash_router_zero_shards_is_error() {
    let err = ShardRouter::hash(0).unwrap_err();
    assert!(err.to_string().contains("must be > 0"));
 }
 #[test]
 fn hash_routing_is_deterministic_spot_check() {
    let router = ShardRouter::hash(5).unwrap();
    for id in [0u64, 1, 42, 1000, u64::MAX] {
        let entity = EntityId::new(id);
        assert_eq!(router.route(entity), router.route(entity));
    }
 }
 #[test]
 fn hash_routing_stays_in_range_spot_check() {
    let router = ShardRouter::hash(7).unwrap();
    for id in [0u64, 1, 100, 10_000, u64::MAX - 1, u64::MAX] {
        let shard = router.route(EntityId::new(id));
        assert!(shard.0 < 7, "shard {} out of range for id {}", shard, id);
    }
 }
 #[test]
 fn hash_router_all_shards() {
    let router = ShardRouter::hash(4).unwrap();
    let shards = router.all_shards();
    assert_eq!(shards, vec![ShardId(0), ShardId(1), ShardId(2), ShardId(3)]);
 }
 #[test]
 fn hash_router_single_shard_always_zero() {
    let router = ShardRouter::hash(1).unwrap();
    for id in [0u64, 42, u64::MAX] {
        assert_eq!(router.route(EntityId::new(id)), ShardId(0));
    }
 }
 // ── ShardRouter: Range ─────────────────────────────────────────────────
 #[test]
 fn range_router_correct_routing() {
    let ranges = vec![
        (ShardId(0), EntityIdRange { start: 0, end: 100 }),
        (
            ShardId(1),
            EntityIdRange {
                start: 100,
                end: u64::MAX,
            },
        ),
    ];
    let router = ShardRouter::range(ranges).unwrap();
    assert_eq!(router.route(EntityId::new(0)), ShardId(0));
    assert_eq!(router.route(EntityId::new(50)), ShardId(0));
    assert_eq!(router.route(EntityId::new(99)), ShardId(0));
    assert_eq!(router.route(EntityId::new(100)), ShardId(1));
    assert_eq!(router.route(EntityId::new(u64::MAX)), ShardId(1));
 }
 #[test]
 fn range_router_three_shards() {
    let ranges = vec![
        (
            ShardId(0),
            EntityIdRange {
                start: 0,
                end: 1000,
            },
        ),
        (
            ShardId(1),
            EntityIdRange {
                start: 1000,
                end: 2000,
            },
        ),
        (
            ShardId(2),
            EntityIdRange {
                start: 2000,
                end: u64::MAX,
            },
        ),
    ];
    let router = ShardRouter::range(ranges).unwrap();
    assert_eq!(router.route(EntityId::new(0)), ShardId(0));
    assert_eq!(router.route(EntityId::new(999)), ShardId(0));
    assert_eq!(router.route(EntityId::new(1000)), ShardId(1));
    assert_eq!(router.route(EntityId::new(1999)), ShardId(1));
    assert_eq!(router.route(EntityId::new(2000)), ShardId(2));
    assert_eq!(router.route(EntityId::new(u64::MAX)), ShardId(2));
 }
 #[test]
 fn range_router_unsorted_input_is_sorted() {
    // Pass ranges out of order -- validate_ranges sorts them.
    let ranges = vec![
        (
            ShardId(1),
            EntityIdRange {
                start: 500,
                end: u64::MAX,
            },
        ),
        (ShardId(0), EntityIdRange { start: 0, end: 500 }),
    ];
    let router = ShardRouter::range(ranges).unwrap();
    assert_eq!(router.route(EntityId::new(0)), ShardId(0));
    assert_eq!(router.route(EntityId::new(500)), ShardId(1));
 }
 #[test]
 fn range_router_all_shards() {
    let ranges = vec![
        (ShardId(2), EntityIdRange { start: 0, end: 100 }),
        (
            ShardId(0),
            EntityIdRange {
                start: 100,
                end: u64::MAX,
            },
        ),
    ];
    let router = ShardRouter::range(ranges).unwrap();
    let mut shards = router.all_shards();
    shards.sort();
    assert_eq!(shards, vec![ShardId(0), ShardId(2)]);
 }
 // ── Range validation errors ────────────────────────────────────────────
 #[test]
 fn range_router_validates_empty_ranges() {
    let err = ShardRouter::range(vec![]).unwrap_err();
    assert!(err.to_string().contains("empty"));
 }
 #[test]
 fn range_router_validates_gap() {
    let ranges = vec![
        (ShardId(0), EntityIdRange { start: 0, end: 100 }),
        // gap: 100..200 is missing
        (
            ShardId(1),
            EntityIdRange {
                start: 200,
                end: u64::MAX,
            },
        ),
    ];
    let err = ShardRouter::range(ranges).unwrap_err();
    match err {
        RouterError::Gap {
            expected: 100,
            found: 200,
        } => {} // correct
        other => panic!("expected Gap, got: {other}"),
    }
 }
 #[test]
 fn range_router_validates_empty_range() {
    let ranges = vec![
        (ShardId(0), EntityIdRange { start: 0, end: 0 }), // empty: start == end
        (
            ShardId(1),
            EntityIdRange {
                start: 0,
                end: u64::MAX,
            },
        ),
    ];
    let err = ShardRouter::range(ranges).unwrap_err();
    match err {
        RouterError::EmptyRange { start: 0 } => {} // correct
        other => panic!("expected EmptyRange, got: {other}"),
    }
 }
 #[test]
 fn range_router_validates_incomplete_coverage() {
    let ranges = vec![
        (ShardId(0), EntityIdRange { start: 0, end: 100 }),
        (
            ShardId(1),
            EntityIdRange {
                start: 100,
                end: 200,
            },
        ),
        // missing: 200..u64::MAX
    ];
    let err = ShardRouter::range(ranges).unwrap_err();
    match err {
        RouterError::IncompleteCoverage { ends_at: 200 } => {} // correct
        other => panic!("expected IncompleteCoverage, got: {other}"),
    }
 }
 // ── FNV-1a hash ────────────────────────────────────────────────────────
 #[test]
 fn fnv1a_hash_deterministic() {
    assert_eq!(fnv1a_hash(42), fnv1a_hash(42));
    assert_eq!(fnv1a_hash(0), fnv1a_hash(0));
 }
 #[test]
 fn fnv1a_hash_different_inputs_differ() {
    // Not a guarantee for all inputs, but should hold for small integers.
    assert_ne!(fnv1a_hash(0), fnv1a_hash(1));
    assert_ne!(fnv1a_hash(1), fnv1a_hash(2));
 }
 // ── Distribution tests ─────────────────────────────────────────────────
 #[test]
 fn hash_routing_uniform_distribution() {
    // 10K IDs across 5 shards should produce buckets within 15% of expected (2000).
    let router = ShardRouter::hash(5).unwrap();
    let mut counts = [0u64; 5];
    for id in 0u64..10_000 {
        let shard = router.route(EntityId::new(id));
        counts[shard.0 as usize] += 1;
    }
    let expected = 10_000u64 / 5; // 2000
    let max_deviation = expected * 15 / 100; // 15%
    for (i, &count) in counts.iter().enumerate() {
        assert!(
            count >= expected - max_deviation && count <= expected + max_deviation,
            "shard {i} has {count} items, expected {expected} ± {max_deviation}"
        );
    }
 }
 // ── Property tests ─────────────────────────────────────────────────────
 mod proptests {
    use super::*;
    use proptest::prelude::*;
    proptest! {
        #[test]
        fn hash_routing_is_deterministic(id in 0u64..u64::MAX) {
            let router = ShardRouter::hash(5).unwrap();
            let entity = EntityId::new(id);
            prop_assert_eq!(router.route(entity), router.route(entity));
        }
        #[test]
        fn hash_routing_stays_in_range(id in 0u64..u64::MAX) {
            let router = ShardRouter::hash(5).unwrap();
            let shard = router.route(EntityId::new(id));
            prop_assert!(shard.0 < 5, "shard {} >= 5", shard);
        }
        #[test]
        fn range_routing_full_range_always_shard_zero(id in 0u64..u64::MAX) {
            let ranges = vec![
                (ShardId(0), EntityIdRange::full()),
            ];
            let router = ShardRouter::range(ranges).unwrap();
            prop_assert_eq!(router.route(EntityId::new(id)), ShardId(0));
        }
        #[test]
        fn entity_id_range_full_contains_everything(id in 0u64..=u64::MAX) {
            let full = EntityIdRange::full();
            prop_assert!(full.contains(id));
        }
    }
 }
 mod proptest_extended {
    use super::*;
    use proptest::prelude::*;
    proptest! {
        /// For any random u64 ID, routing through a 3-shard range router
        /// always returns the shard whose EntityIdRange contains the ID.
        #[test]
        fn range_routing_routes_to_correct_shard(id in 0u64..u64::MAX) {
            // Split [0, u64::MAX) into 3 shards.
            let boundary1 = u64::MAX / 3;
            let boundary2 = 2 * (u64::MAX / 3);
            let ranges = vec![
                (ShardId(0), EntityIdRange { start: 0, end: boundary1 }),
                (ShardId(1), EntityIdRange { start: boundary1, end: boundary2 }),
                (ShardId(2), EntityIdRange { start: boundary2, end: u64::MAX }),
            ];
            let router = ShardRouter::range(ranges.clone()).unwrap();
            let entity = EntityId::new(id);
            let shard = router.route(entity);
            // Verify the routed shard's range actually contains `id`.
            let (_, range) = ranges.iter().find(|(s, _)| *s == shard).expect("shard in ranges");
            prop_assert!(
                range.contains(id),
                "id={} routed to shard {:?} but range {:?} does not contain it",
                id, shard, range
            );
        }
    }
 }
--- a/Show More
+++ b/Show More