fix: heal_region re-delivers missed WAL batches so partitioned followers converge immediately after heal

- Extract redeliver_missed(tx, db, log) helper into cluster_transport.rs
- heal_region now removes partition then immediately ships any missed
  batch-log entries to the healed follower's channel
- await_convergence refactored to call the same helper (no logic change)
- tidal-server: reload_text_index before search in cluster mode
- tidal-server: write_signal returns Result instead of panicking on unknown signal
- tidal-server: leader shows lag_events=0 (writes directly, no receiver thread)
- tidal-server: fix cluster mode error propagation (ServerError::from)
- docs/runbooks/cluster.md: add full cluster operations runbook
- docker/: add Dockerfile for containerised cluster deployment
- README.md: add tidal-server HTTP API getting-started section
- Split oversized source files per CODING_GUIDELINES §9

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
jordan 2026-02-25 11:57:01 -07:00
parent 51b4d1bbd6
commit eca7765e8d
50 changed files with 4080 additions and 733 deletions

162
Cargo.lock generated
View File

@ -598,6 +598,12 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "chrono"
version = "0.4.44"
@ -1303,8 +1309,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"wasi",
"wasm-bindgen",
]
[[package]]
@ -1314,9 +1322,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"r-efi",
"wasip2",
"wasm-bindgen",
]
[[package]]
@ -1521,6 +1531,7 @@ dependencies = [
"tokio",
"tokio-rustls",
"tower-service",
"webpki-roots",
]
[[package]]
@ -1696,6 +1707,21 @@ dependencies = [
"icu_properties",
]
[[package]]
name = "iknowyou-engine"
version = "0.1.0"
dependencies = [
"axum 0.8.8",
"reqwest",
"serde",
"serde_json",
"tempfile",
"thiserror 2.0.18",
"tidaldb",
"tokio",
"tracing",
]
[[package]]
name = "impl-more"
version = "0.1.9"
@ -1926,6 +1952,12 @@ dependencies = [
"hashbrown 0.15.5",
]
[[package]]
name = "lru-slab"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "lsm-tree"
version = "3.0.2"
@ -2345,6 +2377,61 @@ dependencies = [
"hashbrown 0.16.1",
]
[[package]]
name = "quinn"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
dependencies = [
"bytes",
"cfg_aliases",
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash 2.1.1",
"rustls",
"socket2 0.6.2",
"thiserror 2.0.18",
"tokio",
"tracing",
"web-time",
]
[[package]]
name = "quinn-proto"
version = "0.11.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
dependencies = [
"bytes",
"getrandom 0.3.4",
"lru-slab",
"rand 0.9.2",
"ring",
"rustc-hash 2.1.1",
"rustls",
"rustls-pki-types",
"slab",
"thiserror 2.0.18",
"tinyvec",
"tracing",
"web-time",
]
[[package]]
name = "quinn-udp"
version = "0.5.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
dependencies = [
"cfg_aliases",
"libc",
"once_cell",
"socket2 0.6.2",
"tracing",
"windows-sys 0.60.2",
]
[[package]]
name = "quote"
version = "1.0.44"
@ -2539,6 +2626,8 @@ dependencies = [
"native-tls",
"percent-encoding",
"pin-project-lite",
"quinn",
"rustls",
"rustls-pki-types",
"serde",
"serde_json",
@ -2546,6 +2635,7 @@ dependencies = [
"sync_wrapper",
"tokio",
"tokio-native-tls",
"tokio-rustls",
"tower",
"tower-http 0.6.8",
"tower-service",
@ -2553,6 +2643,7 @@ dependencies = [
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
"webpki-roots",
]
[[package]]
@ -2643,6 +2734,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b"
dependencies = [
"once_cell",
"ring",
"rustls-pki-types",
"rustls-webpki",
"subtle",
@ -2655,6 +2747,7 @@ version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
dependencies = [
"web-time",
"zeroize",
]
@ -2824,6 +2917,19 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_yaml"
version = "0.9.34+deprecated"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
dependencies = [
"indexmap",
"itoa",
"ryu",
"serde",
"unsafe-libyaml",
]
[[package]]
name = "sfa"
version = "1.0.0"
@ -3209,6 +3315,22 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "tidal-server"
version = "0.1.0"
dependencies = [
"axum 0.8.8",
"clap",
"serde",
"serde_json",
"serde_yaml",
"thiserror 2.0.18",
"tidaldb",
"tokio",
"tracing",
"tracing-subscriber",
]
[[package]]
name = "tidalctl"
version = "0.1.0"
@ -3296,6 +3418,21 @@ dependencies = [
"serde_json",
]
[[package]]
name = "tinyvec"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.49.0"
@ -3556,6 +3693,12 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "unsafe-libyaml"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
[[package]]
name = "untrusted"
version = "0.9.0"
@ -3793,6 +3936,25 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "webpki-roots"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed"
dependencies = [
"rustls-pki-types",
]
[[package]]
name = "winapi"
version = "0.3.9"

View File

@ -1,5 +1,13 @@
[workspace]
members = ["tidal", "tidalctl", "applications/forage/engine", "applications/forage/server", "applications/forage/embedder"]
members = [
"tidal",
"tidalctl",
"tidal-server",
"applications/forage/engine",
"applications/forage/server",
"applications/forage/embedder",
"applications/iknowyou/engine",
]
resolver = "2"
[workspace.package]

176
README.md
View File

@ -89,19 +89,181 @@ db.close()?;
## Getting started
tidalDB is not yet published to crates.io. Add it as a git dependency:
Pick the path that matches how you plan to use tidalDB today. Every option below is self-contained and ships in this repo.
```toml
[dependencies]
tidaldb = { git = "https://github.com/your-org/tidalDB", rev = "..." }
```
### 1. Embed tidalDB inside your Rust service (library mode)
Then follow the **[Quickstart](QUICKSTART.md)** to get a working ranked feed in 10 minutes, or run the included example:
**Setup**
1. Add the git dependency:
```toml
[dependencies]
tidaldb = { git = "https://github.com/your-org/tidalDB", rev = "..." }
```
2. Define your schema before opening the database (decay, windows, text fields, embeddings). The snippet in **[Quickstart, Step 2](QUICKSTART.md#step-2-define-a-schema)** is a ready-to-copy template.
3. Choose storage mode when building:
```rust
let db = tidaldb::TidalDb::builder()
.with_schema(schema)
.ephemeral() // in-memory for tests
// .with_data_dir("/var/lib/tidaldb") // persistent deployment
.open()?;
```
4. Run the end-to-end sample:
```bash
cargo run --manifest-path tidal/Cargo.toml --example quickstart
```
**Usage**
- Call `db.signal(...)`, `db.signal_with_context(...)`, and `db.retrieve(...)` / `db.search(...)` from the same process; no network stack required.
- Wrap the instance in `Arc<TidalDb>` to share it across threads or tasks.
- Persisted deployments can be inspected with the CLI tool: `cargo run -p tidalctl -- status --path /var/lib/tidaldb`.
- Full walkthrough: **[QUICKSTART.md](QUICKSTART.md)** and **[API.md](API.md)**.
### 2. Run the standalone HTTP server (`tidal-server`)
**Why:** you want a ready-to-run HTTP facade without writing Axum/Actix glue.
```bash
cargo run --manifest-path tidal/Cargo.toml --example quickstart
cargo run -p tidal-server -- \
standalone \
--listen 127.0.0.1:9400 \
--schema tidal-server/config/default-schema.yaml
```
Options:
- `--data-dir /var/lib/tidaldb` switches to persistent storage.
- Provide your own schema file (YAML) to match your signal mix.
Usage:
```bash
# register metadata + embedding
curl -X POST http://127.0.0.1:9400/items \
-H 'Content-Type: application/json' \
-d '{ "entity_id": 1, "metadata": { "title": "Jazz Piano", "category": "music" } }'
curl -X POST http://127.0.0.1:9400/embeddings \
-H 'Content-Type: application/json' \
-d '{ "entity_id": 1, "values": [0.1, 0.2, 0.3] }'
# write engagement (supports user/creator context)
curl -X POST http://127.0.0.1:9400/signals \
-H 'Content-Type: application/json' \
-d '{ "entity_id": 1, "signal": "view", "weight": 1.0, "user_id": 42 }'
# query
curl "http://127.0.0.1:9400/feed?user_id=42&profile=for_you&limit=20"
curl "http://127.0.0.1:9400/search?query=jazz%20piano&user_id=42&limit=5"
curl http://127.0.0.1:9400/health
```
The default schema lives at `tidal-server/config/default-schema.yaml`. Edit
it (or provide your own path) to align with your applications signals,
text fields, and embedding slots.
### 3. Wrap it in an HTTP service you control
Expose tidalDB through your favorite web framework; the repo ships runnable templates.
- **Axum sample (`tidal/examples/axum_embedding.rs`)**
```bash
cargo run --example axum_embedding --manifest-path tidal/Cargo.toml
```
Usage:
```bash
curl -X POST http://127.0.0.1:3000/signal \
-H 'Content-Type: application/json' \
-d '{ "entity_id": 1, "signal": "view", "weight": 1.0 }'
curl "http://127.0.0.1:3000/feed?user_id=42"
curl http://127.0.0.1:3000/health
```
The example handles schema setup, wraps `Arc<TidalDb>` in Axum `State`, and maps `TidalError` to HTTP responses.
- **Actix sample (`tidal/examples/actix_embedding.rs`)**
```bash
cargo run --example actix_embedding --manifest-path tidal/Cargo.toml
# curl http://127.0.0.1:3001/health
```
Demonstrates sharing `Arc<TidalDb>` through `web::Data` and using Actixs shutdown hooks.
Use either sample as a starting point for microservices that prefer a client/server boundary.
### 4. Run the Forage demo server (Axum + UI)
Want to see tidalDB powering a live personalization surface? Forage is a thin Axum server + feed UI that talks to a tidalDB instance embedded in-process.
```bash
cargo run -p forage-server --manifest-path applications/forage/server/Cargo.toml
open http://localhost:4242
```
Flags:
- `--ephemeral` to keep everything in-memory.
- `--data-dir ~/.forage/data` to point at a custom persistent directory.
Usage:
```bash
curl -X POST http://localhost:4242/signal \
-H "Content-Type: application/json" \
-d '{ "user_id": 1, "item_id": 42, "signal_type": "view" }'
curl "http://localhost:4242/feed?user=1&limit=7"
```
The UI shows seeded users, exploration labels, and real-time adaptation; see `applications/forage/readme.md` for the full loop.
### 5. Run the cluster server + Docker image
Need a single endpoint that fronts the built-in simulated cluster? Use
`tidal-server` in `cluster` mode. It spins up the multi-region fabric,
ships WAL batches between regions, and exposes `/signals`, `/feed`,
`/search` plus cluster-management routes.
```bash
cargo run -p tidal-server -- \
cluster \
--listen 0.0.0.0:9500 \
--schema tidal-server/config/default-schema.yaml \
--topology tidal-server/config/default-cluster.yaml
```
Key endpoints:
```bash
curl http://127.0.0.1:9500/health
curl -X POST http://127.0.0.1:9500/signals -d '{ "entity_id": 1, "signal": "view", "weight": 1.0 }'
curl "http://127.0.0.1:9500/feed?profile=trending&region=eu-west"
curl http://127.0.0.1:9500/cluster/status
curl -X POST http://127.0.0.1:9500/cluster/promote -d '{ "region": "eu-west" }'
```
Cluster mode currently replicates global signals (no `user_id` /
`creator_id` contexts) so that followers can stay in sync with the leaders
WAL stream. See **[docs/runbooks/cluster.md](docs/runbooks/cluster.md)** for
operational steps, failure drills, and API references.
Prefer containers? Build the provided image and run it anywhere:
```bash
docker build -f docker/cluster/Dockerfile -t tidal-cluster .
docker run --rm -p 9500:9500 tidal-cluster
```
Mount your own schema/topology files with `-v` if you want different regions
or signal definitions.
### 6. Simulate a multi-region cluster in tests
The raw `SimulatedCluster` harness (no HTTP) remains available for property
tests and fuzzing.
```bash
cargo test --test m8_uat
cargo test --test m8_uat uat_step3 -- --nocapture # run a single scenario
```
Tweak `tidal/tests/m8_uat.rs` to script specific replication, failover, and
migration scenarios inside your own test suites.
**MSRV:** Rust 1.91
---

View File

@ -1,5 +1,7 @@
# iknowyou — Roadmap
**Status as of 2026-02-25**
## Vision
iknowyou is a communication learning engine. It observes how people communicate, extracts structured signals, and assembles briefs that help an LM talk to each person the way they actually want to be talked to.
@ -14,7 +16,7 @@ iknowyou is a communication learning engine. It observes how people communicate,
| M2 | Memory Layer (Synap) | Conversations persist, memories accumulate, observer extracts learnings | COMPLETE |
| M3 | Deep Observer | Rich signal extraction from every exchange — style, topics, facts, emotion, dynamics | COMPLETE |
| M4 | Cohort Engine | People are clustered by behavior; new users get intelligent cold-start priors | COMPLETE |
| M5 | Communication Brief | Full brief assembly from signals + observations + cohorts → injected into system prompt | PLANNED |
| M5 | Communication Brief | Full brief assembly from signals + observations + cohorts → injected into system prompt | IN PROGRESS |
| M6 | Closed Loop | Complete observe → learn → brief → generate cycle running continuously | PLANNED |
| M7 | Adaptation Proof | Measurable evidence that Aeries communicates differently with different people | PLANNED |
@ -92,12 +94,14 @@ Person identity and behavioral cohort classification. Cold-start priors from sim
---
## Planned
## In Progress
### M5: Communication Brief
**Thesis:** The brief is the interface between learning and generation. It's a structured profile of everything the system knows about this person, assembled fresh before every response.
**Status:** Core implementation is live (brief assembly, brief inspection API, prompt injection). Acceptance validation is still pending.
**What changes:**
- Full brief assembly from architecture spec:
- Hot/cold topics with velocity
@ -110,15 +114,17 @@ Person identity and behavioral cohort classification. Cold-start priors from sim
- Brief inspection endpoint for debugging/trust
**Key files:**
- `lib/briefing.ts`new: brief assembly from Synap queries
- `lib/briefing.ts` — brief assembly from Synap queries
- `lib/vllm.ts` — system prompt built from brief, not static text
- `app/api/brief/[personId]/route.ts`new: inspection endpoint
- `app/api/brief/[personId]/route.ts` — inspection endpoint
**Acceptance:**
- Brief contains ≥4 populated sections for a person with 10+ interactions
- Brief changes meaningfully between conversations as signals accumulate
- LM output visibly adapts when brief content changes
## Planned
### M6: Closed Loop
**Thesis:** The full observe → learn → brief → generate cycle runs continuously. Every exchange makes Aeries slightly better at talking to this person.

View File

@ -1,6 +1,12 @@
import { streamChat } from "@/lib/vllm";
import { sendMessage } from "@/lib/synap";
import { assembleBrief } from "@/lib/briefing";
import {
addPersonalizationHints,
ensurePersonalizationSession,
ensurePersonalizationUser,
recordObserverPersonalization,
} from "@/lib/tidal-personalization";
import type { ObserverOutput } from "@/lib/types";
interface ChatBody {
@ -34,6 +40,18 @@ export async function POST(req: Request) {
const conversationId = body.conversationId;
const personId = body.personId;
// Keep tidal personalization state hot, but never block chat if unavailable.
if (personId) {
ensurePersonalizationUser(personId).catch((err) =>
console.error("[tidal] ensure user failed:", err)
);
}
if (personId && conversationId) {
ensurePersonalizationSession(conversationId, personId).catch((err) =>
console.error("[tidal] ensure session failed:", err)
);
}
// 1. Store user message in Synap (non-blocking — don't delay stream start)
if (conversationId && lastUserMsg) {
sendMessage("user", lastUserMsg.content, conversationId).catch((err) =>
@ -42,12 +60,15 @@ export async function POST(req: Request) {
}
// 2. Assemble communication brief (replaces scatter-shot recall + cohort loading)
const brief = personId
const synapBrief = personId
? await assembleBrief(personId).catch((err) => {
console.error("[brief] assembly failed:", err.message);
return undefined;
})
: undefined;
const brief = personId
? await addPersonalizationHints(personId, synapBrief)
: synapBrief;
const encoder = new TextEncoder();
let fullResponse = "";
@ -177,6 +198,18 @@ async function fireDeepObserver(
}
}
if (personId) {
await recordObserverPersonalization({
personId,
conversationId,
turn,
assistantMessage,
output,
}).catch((err) =>
console.error("[tidal] observer personalization write failed:", err)
);
}
// M4: Update person profile after signal extraction
if (personId) {
const { computeProfile, storeProfile, loadProfile } = await import(

View File

@ -21,8 +21,12 @@ export async function GET(
messages.reverse();
return Response.json({ messages });
} catch {
// New conversation with no messages yet — return empty
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
// Log the actual error so Synap outages are visible in server logs
if (!msg.includes("404")) {
console.error(`[synap] failed to load messages for conversation ${id.slice(0, 8)}…: ${msg}`);
}
return Response.json({ messages: [] });
}
}

View File

@ -24,7 +24,7 @@ export function ChatContainer() {
const res = await fetch(`/api/conversations/${activeId}/messages`);
if (!res.ok || cancelled) return;
const data: { messages: ChatMessage[] } = await res.json();
if (!cancelled) {
if (!cancelled && data.messages.length > 0) {
setMessages(data.messages);
}
} catch {

View File

@ -1,12 +1,18 @@
"use client";
import { useEffect, useState } from "react";
import { useChatStore } from "@/lib/store";
export function PersonSwitcher() {
const personId = useChatStore((s) => s.personId);
const switchPerson = useChatStore((s) => s.switchPerson);
const short = personId.slice(0, 8);
// Defer personId render to avoid SSR/client hydration mismatch
// (server generates a fresh UUID, client rehydrates from localStorage)
const [mounted, setMounted] = useState(false);
useEffect(() => setMounted(true), []);
const short = mounted ? personId.slice(0, 8) : "\u00A0";
return (
<div className="px-4 py-2 border-b border-border flex items-center justify-between">

View File

@ -2,6 +2,29 @@
## Infrastructure
### Local Personalization Engine (tidalDB-backed)
Run the personalization engine server locally (default bind: `127.0.0.1:7777`):
```bash
cargo run -p iknowyou-engine --bin server --features synap-aux
```
Environment variables:
- `IKY_ENGINE_BIND` (default `127.0.0.1:7777`)
- `IKY_ENGINE_DATA_DIR` (default temp dir `iknowyou_engine_data`)
- `IKY_ENGINE_URL` (used by Next.js API route; default `http://127.0.0.1:7777`)
- `SYNAP_URL` / `SYNAP_API_KEY` (optional; enables auxiliary memory writes only)
Health check:
```bash
curl http://127.0.0.1:7777/healthz
```
The `app/api/chat/route.ts` path now writes observer-driven personalization feedback to this service (`/v1/feedback`, `/v1/sessions/*`) while Synap remains optional auxiliary memory.
### GPU Server
| | |

View File

@ -0,0 +1,282 @@
#!/usr/bin/env node
import fs from "fs";
import path from "path";
const API = "http://localhost:59521";
const OUT = path.join(process.env.HOME, "Workspace/orchard9/engram/tmp");
// Re-run all 10 personas — fetch briefs and write markdown
import crypto from "crypto";
const personas = [
{
name: "casual-tech",
messages: [
"yo have you ever messed with rust? trying to figure out if its worth learning",
"yeah but like the borrow checker seems insane. is it really that bad",
"hmm ok what about async stuff. heard tokio is the move",
"cool cool. i mostly do typescript rn so maybe its a big jump",
"bet. might just start with some cli tools first",
],
},
{
name: "formal-academic",
messages: [
"I've been researching the implications of large language models on academic writing. What are your thoughts on the epistemological challenges they present?",
"That is an interesting perspective. I am particularly concerned with the reproducibility crisis that may emerge when AI-generated text becomes indistinguishable from human-authored work.",
"Indeed. My current research examines citation integrity in the context of synthetic text generation. The methodological implications are quite significant.",
"I appreciate your engagement with this topic. Have you considered the role of institutional review boards in establishing guidelines for AI-assisted research?",
"Precisely. I believe we need a comprehensive framework that addresses both the ethical and methodological dimensions of this paradigm shift.",
],
},
{
name: "emotional",
messages: [
"hey... having kind of a rough day. do you ever just feel like nothing makes sense",
"yeah i dont know. work stuff mostly. feeling like im not good enough",
"thats actually really nice to hear. i guess i just compare myself to everyone",
"youre right. i think i need to be easier on myself. its just hard sometimes",
"thanks for listening. seriously. most people just say cheer up and move on",
],
},
{
name: "rapid-fire",
messages: [
"whats the best programming language",
"ok but why not python? also whats your take on AI replacing developers",
"interesting. what about quantum computing? will it change everything?",
"sure but when? also do you think remote work is dying? and whats the deal with web3",
"lol ok last one. tabs or spaces?",
],
},
{
name: "deep-diver",
messages: [
"been thinking a lot about consensus algorithms lately. raft vs paxos which do you think is more practical",
"yeah rafts understandability is a huge win. but what about the leader bottleneck? in high-throughput scenarios it becomes a real issue",
"exactly. thats why ive been looking at multi-raft where you shard the state machine. cockroachdb does this well",
"the tricky part is cross-range transactions though. you need some form of 2PC or parallel commits",
"right. i think the future is deterministic databases like calvin where you pre-order transactions. eliminates coordination entirely",
],
},
{
name: "emoji-fan",
messages: [
"hiii just discovered this app and im obsessed already omg",
"yes! do you like music? im really into kpop rn",
"blackpink is my absolute fave but also really vibing with newjeans lately",
"yesss taste! what about movies? seen anything good lately?",
"ooh ill check it out! thanks bestie",
],
},
{
name: "skeptic",
messages: [
"AI chatbots are mostly hype. Change my mind.",
"Thats a surface-level argument. Most benchmarks are gamed and dont reflect real-world utility.",
"Youre oversimplifying. The economic analysis doesnt support widespread adoption when you factor in inference costs and hallucination liability.",
"Thats incorrect. The study youre likely referencing has significant methodological flaws.",
"Ill concede narrow applications show promise. But the general intelligence narrative is fundamentally misleading.",
],
},
{
name: "creative-writer",
messages: [
"ive been working on a short story about a lighthouse keeper who discovers the light attracts something from the deep ocean. want to hear about it?",
"so the keeper notices the fish patterns change when the light hits a certain frequency. they start swimming in spirals. then one night something massive surfaces",
"exactly that tension! i want the reader to feel the keepers isolation. she cant tell anyone because the coast guard would shut down the lighthouse",
"ooh what if the creature communicates through bioluminescence? like its been trying to respond to the lighthouse for centuries",
"yes! and the ending she has to choose between warning the world and protecting this ancient being. i think she chooses silence",
],
},
{
name: "shy-terse",
messages: ["hi", "not much", "i guess i like reading", "fantasy mostly", "yeah sanderson is ok"],
},
{
name: "multi-domain",
messages: [
"been learning to cook thai food this week. green curry from scratch is no joke",
"oh totally different topic but have you been following the mars rover updates?",
"yeah the organic compounds thing. anyway do you play any instruments? i just started guitar",
"haha yeah my fingers hurt. oh hey what do you think about intermittent fasting?",
"makes sense. one more random one whats your take on minimalism as a lifestyle",
],
},
];
async function parseSse(res) {
const reader = res.body.getReader();
const decoder = new TextDecoder();
let buffer = "", output = "";
while (true) {
const { done, value } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop();
for (const line of lines) {
const t = line.trim();
if (t === "data: [DONE]") continue;
if (!t.startsWith("data: ")) continue;
try { const d = JSON.parse(t.slice(6)); if (d.token) output += d.token; } catch {}
}
}
return output;
}
function briefToMarkdown(name, personId, messages, exchanges, brief) {
const lines = [];
lines.push(`# ${name}`);
lines.push("");
lines.push(`**personId:** \`${personId}\``);
lines.push(`**interactions:** ${brief.interactionCount}`);
lines.push(`**assembled:** ${brief.assemblyMs}ms`);
lines.push(`**date:** ${new Date(brief.assembledAt).toISOString()}`);
lines.push("");
// Conversation transcript
lines.push("## Conversation");
lines.push("");
for (const ex of exchanges) {
lines.push(`> **person:** ${ex.user}`);
lines.push(`>`);
lines.push(`> **aeries:** ${ex.assistant}`);
lines.push("");
}
// Style
lines.push("## Style");
lines.push("");
lines.push(`| Attribute | Value |`);
lines.push(`|-----------|-------|`);
lines.push(`| formality | ${brief.style.formality} |`);
lines.push(`| length | ${brief.style.length} |`);
lines.push(`| structure | ${brief.style.structure} |`);
lines.push(`| jargon | ${brief.style.usesJargon} |`);
lines.push(`| emoji | ${brief.style.usesEmoji} |`);
lines.push("");
// Topics
lines.push("## Topics");
lines.push("");
if (brief.topics.hot.length) {
lines.push("### Hot");
lines.push("");
for (const t of brief.topics.hot) {
lines.push(`- **${t.topic}** (${t.domain}, ${t.specificity}) — freq ${t.frequency}${t.deepened ? " [deepened]" : ""}`);
}
lines.push("");
}
if (brief.topics.cold.length) {
lines.push("### Cold");
lines.push("");
for (const t of brief.topics.cold) {
lines.push(`- ${t.topic} (${t.domain}, ${t.specificity})`);
}
lines.push("");
}
if (brief.topics.domains.length) {
lines.push(`**Domains:** ${brief.topics.domains.join(", ")}`);
lines.push("");
}
// Patterns
lines.push("## Patterns");
lines.push("");
lines.push(`| Pattern | Value |`);
lines.push(`|---------|-------|`);
lines.push(`| leads conversation | ${brief.patterns.leadsConversation} |`);
lines.push(`| deepens topics | ${brief.patterns.deepensTopics} |`);
lines.push(`| avg sentiment | ${typeof brief.patterns.avgSentiment === "number" ? brief.patterns.avgSentiment.toFixed(3) : brief.patterns.avgSentiment} |`);
lines.push(`| sentiment trend | ${brief.patterns.sentimentTrend} |`);
lines.push("");
// Observations
if (brief.observations.length) {
lines.push("## Observations");
lines.push("");
for (const o of brief.observations) {
lines.push(`- ${o}`);
}
lines.push("");
}
// Cohort
lines.push("## Cohort Priors");
lines.push("");
lines.push(`**active:** ${brief.cohortPriors.active}`);
lines.push(`**weight:** ${(brief.cohortPriors.weight * 100).toFixed(0)}%`);
if (brief.cohortPriors.priors.length) {
lines.push("");
for (const p of brief.cohortPriors.priors) {
lines.push(`- ${p}`);
}
}
lines.push("");
// Raw JSON
lines.push("## Raw Brief JSON");
lines.push("");
lines.push("```json");
lines.push(JSON.stringify(brief, null, 2));
lines.push("```");
return lines.join("\n");
}
async function main() {
console.log("Running 10 personas and writing briefs...\n");
for (const persona of personas) {
const personId = crypto.randomUUID();
const conversationId = crypto.randomUUID();
const history = [];
const exchanges = [];
process.stdout.write(`[${persona.name}] `);
for (let i = 0; i < persona.messages.length; i++) {
const msg = persona.messages[i];
history.push({ role: "user", content: msg });
try {
const res = await fetch(`${API}/api/chat`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ messages: [...history], conversationId, personId }),
signal: AbortSignal.timeout(30000),
});
const response = await parseSse(res);
history.push({ role: "assistant", content: response });
exchanges.push({ user: msg, assistant: response });
process.stdout.write(".");
} catch (err) {
exchanges.push({ user: msg, assistant: `[error: ${err.message}]` });
process.stdout.write("x");
}
await new Promise((r) => setTimeout(r, 800));
}
// Wait for observer
await new Promise((r) => setTimeout(r, 3000));
// Fetch brief
let brief;
try {
const res = await fetch(`${API}/api/brief/${personId}`);
brief = await res.json();
} catch (err) {
brief = { error: err.message, style: {}, topics: { hot: [], cold: [], domains: [] }, patterns: {}, observations: [], cohortPriors: { active: false, weight: 0, priors: [] }, interactionCount: 0, assemblyMs: 0, assembledAt: Date.now(), personId };
}
const md = briefToMarkdown(persona.name, personId, persona.messages, exchanges, brief);
const outPath = path.join(OUT, `${persona.name}.md`);
fs.writeFileSync(outPath, md);
console.log(`${outPath}`);
}
console.log("\nDone.");
}
main().catch(console.error);

View File

@ -0,0 +1,24 @@
[package]
name = "iknowyou-engine"
version = "0.1.0"
edition = "2024"
rust-version = "1.91"
license = "MIT"
description = "tidalDB-backed personalization engine for iknowyou"
[features]
default = []
synap-aux = ["dep:reqwest", "dep:serde_json"]
[dependencies]
serde = { version = "1", features = ["derive"] }
thiserror = "2"
tracing = "0.1"
tidaldb = { path = "../../../tidal" }
reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"], optional = true }
serde_json = { version = "1", optional = true }
axum = { version = "0.8", features = ["json"] }
tokio = { version = "1", features = ["rt-multi-thread", "macros", "net", "signal"] }
[dev-dependencies]
tempfile = "3"

View File

@ -0,0 +1,50 @@
# iknowyou-engine
`iknowyou-engine` moves personalization state into embedded `tidalDB` and keeps Synap optional for auxiliary observation memory.
## What this crate covers
- User/item state and ranking signals in `tidalDB`
- Session lifecycle and session-scoped signals (`start_session` / `session_signal` / `close_session`)
- Hard negatives (`hide`, `mute`, `block`) written as durable relationships for replay-safe filtering
- PG1 evaluator (`run_pg1_eval`) for:
- hard-negative leak rate
- adaptation latency p95
- useful-item uplift vs baseline
- repeated-unwanted-item rate
## Run the PG1 evaluator
```bash
cargo run -p iknowyou-engine --bin pg1_eval
```
Optional persistent path:
```bash
cargo run -p iknowyou-engine --bin pg1_eval /tmp/iknowyou-pg1
```
## Run the HTTP server
```bash
cargo run -p iknowyou-engine --bin server --features synap-aux
```
Server defaults:
- bind: `127.0.0.1:7777`
- data dir: `${TMPDIR}/iknowyou_engine_data`
Override with:
- `IKY_ENGINE_BIND`
- `IKY_ENGINE_DATA_DIR`
## Optional Synap auxiliary memory
Enable `synap-aux` to use `SynapAuxMemory` for observation storage while keeping core personalization in `tidalDB`.
```bash
cargo test -p iknowyou-engine --features synap-aux
```

View File

@ -0,0 +1,31 @@
use std::path::PathBuf;
use iknowyou_engine::run_pg1_eval;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let data_dir = std::env::args()
.nth(1)
.map(PathBuf::from)
.unwrap_or_else(|| std::env::temp_dir().join("iknowyou_pg1_eval"));
let metrics = run_pg1_eval(&data_dir)?;
println!("PG1 metrics");
println!("data_dir: {}", data_dir.display());
println!(
"hard_negative_leak_rate: {:.6}",
metrics.hard_negative_leak_rate
);
println!("adaptation_p95_ms: {}", metrics.adaptation_p95_ms);
println!("useful_item_uplift: {:.6}", metrics.useful_item_uplift);
println!(
"repeated_unwanted_rate: {:.6}",
metrics.repeated_unwanted_rate
);
println!(
"total_refreshes_checked: {}",
metrics.total_refreshes_checked
);
Ok(())
}

View File

@ -0,0 +1,332 @@
use std::collections::HashMap;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use axum::extract::{Query, State};
use axum::http::StatusCode;
use axum::response::IntoResponse;
use axum::routing::{get, post};
use axum::{Json, Router};
use iknowyou_engine::{
AuxMemory, FeedbackAction, FeedbackEvent, IkyEngine, NoopAuxMemory, PersonalizationItem,
RetrievedItem,
};
use serde::{Deserialize, Serialize};
use tidaldb::session::SessionHandle;
#[derive(Clone)]
struct AppState {
engine: Arc<IkyEngine>,
sessions: Arc<tokio::sync::Mutex<HashMap<String, SessionHandle>>>,
}
#[derive(Debug, Serialize)]
struct ErrorResponse {
error: String,
}
#[derive(Debug, Deserialize)]
struct UpsertUserRequest {
user_id: u64,
#[serde(default)]
metadata: HashMap<String, String>,
}
#[derive(Debug, Deserialize)]
struct UpsertItemRequest {
item_id: u64,
creator_id: u64,
title: String,
#[serde(default = "default_message_category")]
category: String,
}
fn default_message_category() -> String {
"message".to_string()
}
#[derive(Debug, Deserialize)]
struct FeedbackRequest {
user_id: u64,
item_id: u64,
creator_id: Option<u64>,
action: FeedbackAction,
}
#[derive(Debug, Deserialize)]
struct RetrieveQuery {
user_id: u64,
#[serde(default = "default_limit")]
limit: usize,
}
fn default_limit() -> usize {
20
}
#[derive(Debug, Serialize)]
struct RetrieveResponse {
items: Vec<RetrievedItem>,
}
#[derive(Debug, Deserialize)]
struct StartSessionRequest {
conversation_id: String,
user_id: u64,
#[serde(default = "default_agent_id")]
agent_id: String,
}
fn default_agent_id() -> String {
"aeries".to_string()
}
#[derive(Debug, Deserialize)]
struct SessionSignalRequest {
conversation_id: String,
signal_type: String,
item_id: u64,
#[serde(default = "default_weight")]
weight: f64,
annotation: Option<String>,
}
fn default_weight() -> f64 {
1.0
}
#[derive(Debug, Deserialize)]
struct CloseSessionRequest {
conversation_id: String,
}
#[derive(Debug, Deserialize)]
struct ObservationRequest {
person_id: u64,
observation: String,
}
#[derive(Debug, Serialize)]
struct OkResponse {
ok: bool,
}
#[derive(Debug, Serialize)]
struct StartSessionResponse {
ok: bool,
session_id: String,
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let data_dir = std::env::var("IKY_ENGINE_DATA_DIR")
.map(PathBuf::from)
.unwrap_or_else(|_| std::env::temp_dir().join("iknowyou_engine_data"));
let aux: Arc<dyn AuxMemory> = build_aux_memory()?;
let engine = Arc::new(
IkyEngine::builder()
.data_dir(&data_dir)
.with_aux_memory(aux)
.open()?,
);
let state = AppState {
engine,
sessions: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
};
let app = Router::new()
.route("/healthz", get(healthz))
.route("/v1/users/upsert", post(upsert_user))
.route("/v1/items/upsert", post(upsert_item))
.route("/v1/feedback", post(record_feedback))
.route("/v1/retrieve", get(retrieve_for_user))
.route("/v1/sessions/start", post(start_session))
.route("/v1/sessions/signal", post(session_signal))
.route("/v1/sessions/close", post(close_session))
.route("/v1/aux/observation", post(aux_observation))
.with_state(state);
let bind_addr = std::env::var("IKY_ENGINE_BIND")
.unwrap_or_else(|_| "127.0.0.1:7777".to_string())
.parse::<SocketAddr>()?;
let listener = tokio::net::TcpListener::bind(bind_addr).await?;
println!("iknowyou-engine server listening on {bind_addr}");
println!("data_dir: {}", data_dir.display());
axum::serve(listener, app).await?;
Ok(())
}
fn build_aux_memory() -> Result<Arc<dyn AuxMemory>, Box<dyn std::error::Error>> {
#[cfg(feature = "synap-aux")]
{
let base = std::env::var("SYNAP_URL").ok();
let key = std::env::var("SYNAP_API_KEY").ok();
if let (Some(base), Some(key)) = (base, key)
&& !base.is_empty()
&& !key.is_empty()
{
let aux = iknowyou_engine::SynapAuxMemory::new(base, key)?;
return Ok(Arc::new(aux));
}
}
Ok(Arc::new(NoopAuxMemory))
}
async fn healthz() -> Json<OkResponse> {
Json(OkResponse { ok: true })
}
async fn upsert_user(
State(state): State<AppState>,
Json(req): Json<UpsertUserRequest>,
) -> Result<Json<OkResponse>, (StatusCode, Json<ErrorResponse>)> {
state
.engine
.upsert_user(req.user_id, &req.metadata)
.map_err(internal_error)?;
Ok(Json(OkResponse { ok: true }))
}
async fn upsert_item(
State(state): State<AppState>,
Json(req): Json<UpsertItemRequest>,
) -> Result<Json<OkResponse>, (StatusCode, Json<ErrorResponse>)> {
let item = PersonalizationItem {
item_id: req.item_id,
creator_id: req.creator_id,
title: req.title,
category: req.category,
embedding: None,
};
state.engine.upsert_item(&item).map_err(internal_error)?;
Ok(Json(OkResponse { ok: true }))
}
async fn record_feedback(
State(state): State<AppState>,
Json(req): Json<FeedbackRequest>,
) -> Result<Json<OkResponse>, (StatusCode, Json<ErrorResponse>)> {
let event = FeedbackEvent::now(req.user_id, req.item_id, req.creator_id, req.action);
state
.engine
.record_feedback(event)
.map_err(internal_error)?;
Ok(Json(OkResponse { ok: true }))
}
async fn retrieve_for_user(
State(state): State<AppState>,
Query(query): Query<RetrieveQuery>,
) -> Result<Json<RetrieveResponse>, (StatusCode, Json<ErrorResponse>)> {
let items = state
.engine
.retrieve_for_user_items(query.user_id, query.limit)
.map_err(internal_error)?;
Ok(Json(RetrieveResponse { items }))
}
async fn start_session(
State(state): State<AppState>,
Json(req): Json<StartSessionRequest>,
) -> Result<Json<StartSessionResponse>, (StatusCode, Json<ErrorResponse>)> {
let mut sessions = state.sessions.lock().await;
if let Some(handle) = sessions.get(&req.conversation_id) {
return Ok(Json(StartSessionResponse {
ok: true,
session_id: handle.id.to_string(),
}));
}
let handle = state
.engine
.start_session(req.user_id, &req.agent_id, HashMap::new())
.map_err(internal_error)?;
let session_id = handle.id.to_string();
sessions.insert(req.conversation_id, handle);
Ok(Json(StartSessionResponse {
ok: true,
session_id,
}))
}
async fn session_signal(
State(state): State<AppState>,
Json(req): Json<SessionSignalRequest>,
) -> Result<Json<OkResponse>, (StatusCode, Json<ErrorResponse>)> {
let sessions = state.sessions.lock().await;
let handle = sessions.get(&req.conversation_id).ok_or_else(|| {
(
StatusCode::NOT_FOUND,
Json(ErrorResponse {
error: "session not found".to_string(),
}),
)
})?;
state
.engine
.session_signal(
handle,
&req.signal_type,
req.item_id,
req.weight,
req.annotation,
)
.map_err(internal_error)?;
Ok(Json(OkResponse { ok: true }))
}
async fn close_session(
State(state): State<AppState>,
Json(req): Json<CloseSessionRequest>,
) -> Result<Json<OkResponse>, (StatusCode, Json<ErrorResponse>)> {
let mut sessions = state.sessions.lock().await;
let handle = sessions.remove(&req.conversation_id).ok_or_else(|| {
(
StatusCode::NOT_FOUND,
Json(ErrorResponse {
error: "session not found".to_string(),
}),
)
})?;
state.engine.close_session(handle).map_err(internal_error)?;
Ok(Json(OkResponse { ok: true }))
}
async fn aux_observation(
State(state): State<AppState>,
Json(req): Json<ObservationRequest>,
) -> Result<Json<OkResponse>, (StatusCode, Json<ErrorResponse>)> {
state
.engine
.remember_aux_observation(req.person_id, &req.observation)
.map_err(internal_error)?;
Ok(Json(OkResponse { ok: true }))
}
fn internal_error<E: std::fmt::Display>(err: E) -> (StatusCode, Json<ErrorResponse>) {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: err.to_string(),
}),
)
}
impl IntoResponse for ErrorResponse {
fn into_response(self) -> axum::response::Response {
(StatusCode::INTERNAL_SERVER_ERROR, Json(self)).into_response()
}
}

View File

@ -0,0 +1,818 @@
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tidaldb::entities::RelationshipType;
use tidaldb::query::retrieve::Retrieve;
use tidaldb::schema::{
AgentPolicy, DecaySpec, EntityId, EntityKind, Schema, SchemaBuilder, Timestamp, Window,
};
use tidaldb::session::{SessionHandle, SessionSummary};
use tidaldb::{TidalDb, TidalError};
#[derive(Debug, Error)]
pub enum EngineError {
#[error("tidaldb: {0}")]
Tidal(#[from] TidalError),
#[error("missing creator_id for action {action:?}")]
MissingCreatorId { action: FeedbackAction },
#[error("aux memory: {0}")]
Aux(String),
}
pub type Result<T> = std::result::Result<T, EngineError>;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FeedbackAction {
View,
More,
Less,
Save,
HideItem,
MuteCreator,
BlockCreator,
}
#[derive(Debug, Clone)]
pub struct FeedbackEvent {
pub user_id: u64,
pub item_id: u64,
pub creator_id: Option<u64>,
pub action: FeedbackAction,
pub timestamp: Timestamp,
}
impl FeedbackEvent {
#[must_use]
pub fn now(
user_id: u64,
item_id: u64,
creator_id: Option<u64>,
action: FeedbackAction,
) -> Self {
Self {
user_id,
item_id,
creator_id,
action,
timestamp: Timestamp::now(),
}
}
}
#[derive(Debug, Clone)]
pub struct PersonalizationItem {
pub item_id: u64,
pub creator_id: u64,
pub title: String,
pub category: String,
pub embedding: Option<Vec<f32>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievedItem {
pub item_id: u64,
pub creator_id: Option<u64>,
pub title: Option<String>,
pub category: Option<String>,
pub score: f64,
}
pub trait AuxMemory: Send + Sync {
fn remember_observation(&self, person_id: u64, observation: &str) -> Result<()>;
}
#[derive(Debug, Default)]
pub struct NoopAuxMemory;
impl AuxMemory for NoopAuxMemory {
fn remember_observation(&self, _person_id: u64, _observation: &str) -> Result<()> {
Ok(())
}
}
#[cfg(feature = "synap-aux")]
pub struct SynapAuxMemory {
base_url: String,
api_key: String,
client: reqwest::blocking::Client,
}
#[cfg(feature = "synap-aux")]
impl SynapAuxMemory {
pub fn new(base_url: impl Into<String>, api_key: impl Into<String>) -> Result<Self> {
let client = reqwest::blocking::Client::builder()
.timeout(Duration::from_secs(10))
.build()
.map_err(|e| EngineError::Aux(format!("build client: {e}")))?;
Ok(Self {
base_url: base_url.into(),
api_key: api_key.into(),
client,
})
}
}
#[cfg(feature = "synap-aux")]
impl AuxMemory for SynapAuxMemory {
fn remember_observation(&self, person_id: u64, observation: &str) -> Result<()> {
let url = format!(
"{}/api/v1/memories/remember",
self.base_url.trim_end_matches('/')
);
let body = serde_json::json!({
"content": observation,
"confidence": 0.8,
"memory_type": "semantic",
"tags": ["observation", format!("person:{person_id}")]
});
let response = self
.client
.post(url)
.header("Content-Type", "application/json")
.header("Authorization", format!("Bearer {}", self.api_key))
.json(&body)
.send()
.map_err(|e| EngineError::Aux(format!("request failed: {e}")))?;
if !response.status().is_success() {
let status = response.status();
let text = response.text().unwrap_or_default();
return Err(EngineError::Aux(format!("synap {}: {}", status, text)));
}
Ok(())
}
}
pub struct IkyEngineBuilder {
data_dir: Option<PathBuf>,
aux_memory: Option<Arc<dyn AuxMemory>>,
}
impl IkyEngineBuilder {
#[must_use]
pub const fn new() -> Self {
Self {
data_dir: None,
aux_memory: None,
}
}
#[must_use]
pub fn data_dir(mut self, path: impl Into<PathBuf>) -> Self {
self.data_dir = Some(path.into());
self
}
#[must_use]
pub fn with_aux_memory(mut self, aux: Arc<dyn AuxMemory>) -> Self {
self.aux_memory = Some(aux);
self
}
pub fn open(self) -> Result<IkyEngine> {
let schema = build_schema()?;
let db = if let Some(path) = self.data_dir {
std::fs::create_dir_all(&path)
.map_err(|e| EngineError::Aux(format!("create data dir {path:?}: {e}")))?;
TidalDb::builder()
.with_data_dir(path)
.with_schema(schema)
.open()?
} else {
TidalDb::builder().ephemeral().with_schema(schema).open()?
};
Ok(IkyEngine {
db,
aux_memory: self.aux_memory,
})
}
}
impl Default for IkyEngineBuilder {
fn default() -> Self {
Self::new()
}
}
pub struct IkyEngine {
db: TidalDb,
aux_memory: Option<Arc<dyn AuxMemory>>,
}
impl IkyEngine {
#[must_use]
pub const fn builder() -> IkyEngineBuilder {
IkyEngineBuilder::new()
}
pub fn close(self) -> Result<()> {
self.db.close()?;
Ok(())
}
#[must_use]
pub fn item_count(&self) -> u64 {
self.db.item_count()
}
pub fn upsert_user(&self, user_id: u64, metadata: &HashMap<String, String>) -> Result<()> {
self.db.write_user(EntityId::new(user_id), metadata)?;
Ok(())
}
pub fn upsert_item(&self, item: &PersonalizationItem) -> Result<()> {
let mut metadata = HashMap::new();
metadata.insert("title".to_string(), item.title.clone());
metadata.insert("category".to_string(), item.category.clone());
metadata.insert("creator_id".to_string(), item.creator_id.to_string());
metadata.insert("format".to_string(), "message".to_string());
metadata.insert(
"created_at".to_string(),
Timestamp::now().as_nanos().to_string(),
);
self.db
.write_item_with_metadata(EntityId::new(item.item_id), &metadata)?;
if let Some(embedding) = &item.embedding {
self.db
.write_item_embedding(EntityId::new(item.item_id), embedding)?;
}
Ok(())
}
pub fn record_global_signal(
&self,
signal_type: &str,
item_id: u64,
weight: f64,
timestamp: Timestamp,
) -> Result<()> {
self.db
.signal(signal_type, EntityId::new(item_id), weight, timestamp)?;
Ok(())
}
pub fn record_feedback(&self, event: FeedbackEvent) -> Result<()> {
let item_id = EntityId::new(event.item_id);
match event.action {
FeedbackAction::View => {
self.db.signal_with_context(
"view",
item_id,
1.0,
event.timestamp,
Some(event.user_id),
event.creator_id,
)?;
}
FeedbackAction::More => {
self.db.signal_with_context(
"like",
item_id,
1.0,
event.timestamp,
Some(event.user_id),
event.creator_id,
)?;
}
FeedbackAction::Less => {
self.db.signal_with_context(
"skip",
item_id,
1.0,
event.timestamp,
Some(event.user_id),
None,
)?;
}
FeedbackAction::Save => {
self.db.signal_with_context(
"save",
item_id,
1.0,
event.timestamp,
Some(event.user_id),
event.creator_id,
)?;
#[allow(clippy::cast_possible_truncation)]
self.db.user_state().add_save_timestamped(
event.user_id,
event.item_id as u32,
event.timestamp.as_nanos(),
);
}
FeedbackAction::HideItem => {
self.db.signal_with_context(
"hide",
item_id,
1.0,
event.timestamp,
Some(event.user_id),
None,
)?;
self.db.write_relationship(
EntityId::new(event.user_id),
RelationshipType::Hide,
item_id,
1.0,
event.timestamp,
)?;
}
FeedbackAction::MuteCreator => {
let creator_id = event.creator_id.ok_or(EngineError::MissingCreatorId {
action: event.action,
})?;
self.db.write_relationship(
EntityId::new(event.user_id),
RelationshipType::Mute,
EntityId::new(creator_id),
1.0,
event.timestamp,
)?;
// Current retrieval filtering enforces creator suppression via Blocks.
self.db.write_relationship(
EntityId::new(event.user_id),
RelationshipType::Blocks,
EntityId::new(creator_id),
1.0,
event.timestamp,
)?;
}
FeedbackAction::BlockCreator => {
let creator_id = event.creator_id.ok_or(EngineError::MissingCreatorId {
action: event.action,
})?;
self.db.write_relationship(
EntityId::new(event.user_id),
RelationshipType::Blocks,
EntityId::new(creator_id),
1.0,
event.timestamp,
)?;
}
}
Ok(())
}
pub fn retrieve_for_user(&self, user_id: u64, limit: usize) -> Result<Vec<u64>> {
let query = Retrieve::builder()
.profile("for_you")
.for_user(user_id)
.limit(limit)
.build()
.map_err(|e| EngineError::Aux(e.to_string()))?;
let results = self.db.retrieve(&query)?;
Ok(results.items.iter().map(|i| i.entity_id.as_u64()).collect())
}
pub fn retrieve_for_user_items(
&self,
user_id: u64,
limit: usize,
) -> Result<Vec<RetrievedItem>> {
let query = Retrieve::builder()
.profile("for_you")
.for_user(user_id)
.limit(limit)
.build()
.map_err(|e| EngineError::Aux(e.to_string()))?;
let results = self.db.retrieve(&query)?;
let mut out = Vec::with_capacity(results.items.len());
for item in &results.items {
let meta = self.db.get_item_metadata(item.entity_id)?;
let creator_id = meta
.as_ref()
.and_then(|m| m.get("creator_id"))
.and_then(|v| v.parse::<u64>().ok());
let title = meta.as_ref().and_then(|m| m.get("title").cloned());
let category = meta.as_ref().and_then(|m| m.get("category").cloned());
out.push(RetrievedItem {
item_id: item.entity_id.as_u64(),
creator_id,
title,
category,
score: item.score,
});
}
Ok(out)
}
pub fn retrieve_global(&self, limit: usize) -> Result<Vec<u64>> {
let query = Retrieve::builder()
.profile("for_you")
.limit(limit)
.build()
.map_err(|e| EngineError::Aux(e.to_string()))?;
let results = self.db.retrieve(&query)?;
Ok(results.items.iter().map(|i| i.entity_id.as_u64()).collect())
}
pub fn start_session(
&self,
user_id: u64,
agent_id: &str,
metadata: HashMap<String, String>,
) -> Result<SessionHandle> {
let handle = self
.db
.start_session(user_id, agent_id, "iky_default", metadata)?;
Ok(handle)
}
pub fn session_signal(
&self,
handle: &SessionHandle,
signal_type: &str,
item_id: u64,
weight: f64,
annotation: Option<String>,
) -> Result<()> {
self.db.session_signal(
handle,
signal_type,
EntityId::new(item_id),
weight,
Timestamp::now(),
annotation,
)?;
Ok(())
}
pub fn close_session(&self, handle: SessionHandle) -> Result<SessionSummary> {
Ok(self.db.close_session(handle)?)
}
pub fn remember_aux_observation(&self, person_id: u64, observation: &str) -> Result<()> {
if let Some(aux) = &self.aux_memory {
aux.remember_observation(person_id, observation)?;
}
Ok(())
}
pub fn get_item_creator(&self, item_id: u64) -> Result<Option<u64>> {
let meta = self.db.get_item_metadata(EntityId::new(item_id))?;
let creator = meta
.as_ref()
.and_then(|m| m.get("creator_id"))
.and_then(|v| v.parse::<u64>().ok());
Ok(creator)
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Pg1Metrics {
pub hard_negative_leak_rate: f64,
pub adaptation_p95_ms: u64,
pub useful_item_uplift: f64,
pub repeated_unwanted_rate: f64,
pub total_refreshes_checked: usize,
}
pub fn run_pg1_eval(data_dir: &Path) -> Result<Pg1Metrics> {
let user_id = 42_u64;
let preferred_creators = [1_u64, 2_u64];
let muted_creator = 6_u64;
let engine = IkyEngine::builder().data_dir(data_dir).open()?;
if engine.item_count() == 0 {
seed_catalog(&engine)?;
}
let mut user_meta = HashMap::new();
user_meta.insert("role".to_string(), "engineer".to_string());
user_meta.insert("timezone".to_string(), "America/Los_Angeles".to_string());
engine.upsert_user(user_id, &user_meta)?;
// Baseline useful-item rate before user-specific feedback.
let baseline = engine.retrieve_global(20)?;
let baseline_useful = useful_rate(&engine, &baseline, &preferred_creators)?;
// Positive feedback: make the engine learn creator-level preferences.
for _ in 0..5 {
for item_id in [1_u64, 2, 3, 21, 22, 23] {
let creator_id = engine.get_item_creator(item_id)?;
engine.record_feedback(FeedbackEvent::now(
user_id,
item_id,
creator_id,
FeedbackAction::More,
))?;
}
}
let personalized = engine.retrieve_for_user(user_id, 20)?;
let personalized_useful = useful_rate(&engine, &personalized, &preferred_creators)?;
// Adaptation latency trials: write feedback, then measure the next-refresh time.
let mut latencies_ms = Vec::new();
for item_id in [4_u64, 24, 5, 25, 6, 26, 7, 27, 8, 28] {
let creator_id = engine.get_item_creator(item_id)?;
let started = std::time::Instant::now();
engine.record_feedback(FeedbackEvent::now(
user_id,
item_id,
creator_id,
FeedbackAction::More,
))?;
let _ = engine.retrieve_for_user(user_id, 20)?;
latencies_ms.push(started.elapsed().as_millis() as u64);
}
latencies_ms.sort_unstable();
let p95_idx = ((latencies_ms.len() as f64) * 0.95).ceil() as usize;
let adaptation_p95_ms = latencies_ms
.get(p95_idx.saturating_sub(1))
.copied()
.unwrap_or(0);
// Hard negatives: hide one item and mute one creator.
let hidden_item = 101_u64;
engine.record_feedback(FeedbackEvent::now(
user_id,
hidden_item,
engine.get_item_creator(hidden_item)?,
FeedbackAction::HideItem,
))?;
engine.record_feedback(FeedbackEvent::now(
user_id,
hidden_item,
Some(muted_creator),
FeedbackAction::MuteCreator,
))?;
// Repeated-unwanted set (strong negatives).
let unwanted = [102_u64, 103, 104, 105, 106];
for item_id in unwanted {
engine.record_feedback(FeedbackEvent::now(
user_id,
item_id,
engine.get_item_creator(item_id)?,
FeedbackAction::HideItem,
))?;
}
// Session path: verify session-scoped writes are accepted.
let session = engine.start_session(user_id, "aeries", HashMap::new())?;
engine.session_signal(
&session,
"view",
1,
1.0,
Some("session sanity signal".to_string()),
)?;
let _ = engine.close_session(session)?;
// Refresh checks before restart.
let mut leak_count = 0_usize;
let mut unwanted_hits = 0_usize;
let refresh_checks = 20_usize;
for _ in 0..refresh_checks {
let ids = engine.retrieve_for_user(user_id, 20)?;
let has_hidden = ids.contains(&hidden_item);
let has_muted_creator = has_creator(&engine, &ids, muted_creator)?;
if has_hidden || has_muted_creator {
leak_count += 1;
}
unwanted_hits += ids.iter().filter(|id| unwanted.contains(id)).count();
}
engine.close()?;
// Replay correctness check: reopen and run the same checks again.
let reopened = IkyEngine::builder().data_dir(data_dir).open()?;
for _ in 0..refresh_checks {
let ids = reopened.retrieve_for_user(user_id, 20)?;
let has_hidden = ids.contains(&hidden_item);
let has_muted_creator = has_creator(&reopened, &ids, muted_creator)?;
if has_hidden || has_muted_creator {
leak_count += 1;
}
unwanted_hits += ids.iter().filter(|id| unwanted.contains(id)).count();
}
reopened.close()?;
let total_checks = refresh_checks * 2;
let hard_negative_leak_rate = leak_count as f64 / total_checks as f64;
let repeated_unwanted_rate =
unwanted_hits as f64 / (total_checks as f64 * unwanted.len() as f64);
Ok(Pg1Metrics {
hard_negative_leak_rate,
adaptation_p95_ms,
useful_item_uplift: personalized_useful - baseline_useful,
repeated_unwanted_rate,
total_refreshes_checked: total_checks,
})
}
fn useful_rate(engine: &IkyEngine, ids: &[u64], preferred_creators: &[u64]) -> Result<f64> {
if ids.is_empty() {
return Ok(0.0);
}
let mut useful = 0_usize;
for &item_id in ids {
if let Some(creator) = engine.get_item_creator(item_id)?
&& preferred_creators.contains(&creator)
{
useful += 1;
}
}
Ok(useful as f64 / ids.len() as f64)
}
fn has_creator(engine: &IkyEngine, ids: &[u64], creator_id: u64) -> Result<bool> {
for &item_id in ids {
if engine.get_item_creator(item_id)? == Some(creator_id) {
return Ok(true);
}
}
Ok(false)
}
fn build_schema() -> std::result::Result<Schema, TidalError> {
let mut schema = SchemaBuilder::new();
schema.embedding_slot("content", EntityKind::Item, 8);
let _ = schema
.signal(
"view",
EntityKind::Item,
DecaySpec::Exponential {
half_life: Duration::from_secs(7 * 24 * 3600),
},
)
.windows(&[Window::OneHour, Window::TwentyFourHours, Window::AllTime])
.velocity(true)
.add();
let _ = schema
.signal(
"like",
EntityKind::Item,
DecaySpec::Exponential {
half_life: Duration::from_secs(30 * 24 * 3600),
},
)
.windows(&[Window::AllTime])
.velocity(false)
.add();
let _ = schema
.signal(
"share",
EntityKind::Item,
DecaySpec::Exponential {
half_life: Duration::from_secs(7 * 24 * 3600),
},
)
.windows(&[Window::TwentyFourHours, Window::AllTime])
.velocity(true)
.add();
let _ = schema
.signal(
"completion",
EntityKind::Item,
DecaySpec::Exponential {
half_life: Duration::from_secs(14 * 24 * 3600),
},
)
.windows(&[Window::AllTime])
.velocity(false)
.add();
let _ = schema
.signal("skip", EntityKind::Item, DecaySpec::Permanent)
.windows(&[Window::AllTime])
.velocity(false)
.add();
let _ = schema
.signal("hide", EntityKind::Item, DecaySpec::Permanent)
.windows(&[Window::AllTime])
.velocity(false)
.add();
let _ = schema
.signal("dislike", EntityKind::Item, DecaySpec::Permanent)
.windows(&[Window::AllTime])
.velocity(false)
.add();
let _ = schema
.signal(
"save",
EntityKind::Item,
DecaySpec::Exponential {
half_life: Duration::from_secs(90 * 24 * 3600),
},
)
.windows(&[Window::AllTime])
.velocity(false)
.add();
schema.session_policy(
"iky_default",
AgentPolicy {
allowed_signals: vec![
"view".to_string(),
"like".to_string(),
"share".to_string(),
"completion".to_string(),
"save".to_string(),
"skip".to_string(),
"hide".to_string(),
"dislike".to_string(),
],
denied_signals: vec![],
max_session_duration: Duration::from_secs(60 * 60),
max_signals_per_session: 1_000,
},
);
schema
.build()
.map_err(|e| TidalError::internal("build_schema", e.to_string()))
}
fn seed_catalog(engine: &IkyEngine) -> Result<()> {
// 6 creators x 20 items = 120 candidates.
// Creators 1-2 represent "useful" content for the evaluation user,
// creators 3-6 represent the baseline-heavy stream.
let mut item_id = 1_u64;
for creator_id in 1_u64..=6 {
for idx in 0_u64..20 {
let category = if creator_id <= 2 {
"preferred"
} else {
"generic"
};
let embedding = vec![
creator_id as f32 / 10.0,
idx as f32 / 20.0,
if creator_id <= 2 { 1.0 } else { 0.0 },
if creator_id >= 5 { 1.0 } else { 0.0 },
0.25,
0.5,
0.75,
1.0,
];
let item = PersonalizationItem {
item_id,
creator_id,
title: format!("c{creator_id}-item-{idx}"),
category: category.to_string(),
embedding: Some(embedding),
};
engine.upsert_item(&item)?;
// Keep global popularity flat so user-specific feedback is the
// dominant source of ranking movement in PG1 uplift checks.
let now = Timestamp::now();
let global_views = 3;
let global_likes = 1;
let global_shares = 0;
for _ in 0..global_views {
engine.record_global_signal("view", item_id, 1.0, now)?;
}
for _ in 0..global_likes {
engine.record_global_signal("like", item_id, 1.0, now)?;
}
for _ in 0..global_shares {
engine.record_global_signal("share", item_id, 1.0, now)?;
}
item_id += 1;
}
}
Ok(())
}

View File

@ -0,0 +1,32 @@
use iknowyou_engine::run_pg1_eval;
#[test]
fn pg1_metrics_meet_floor() {
let dir = tempfile::tempdir().expect("tempdir");
let metrics = run_pg1_eval(dir.path()).expect("pg1 eval should run");
assert!(
metrics.hard_negative_leak_rate <= 0.0,
"hard negatives leaked: {:?}",
metrics
);
// This runs entirely in-process on local storage; adaptation should be immediate.
assert!(
metrics.adaptation_p95_ms <= 200,
"adaptation p95 too high: {:?}",
metrics
);
assert!(
metrics.useful_item_uplift > 0.0,
"expected useful-item uplift over baseline: {:?}",
metrics
);
assert!(
metrics.repeated_unwanted_rate <= 0.01,
"unwanted items repeated too often: {:?}",
metrics
);
}

View File

@ -189,22 +189,32 @@ function buildStyleSection(
let structure = "stream_of_thought";
let avgWords = profile?.avgWordCount ?? 20;
// Override with recent signals (more granular)
// Aggregate from recent signals — average numerics, majority-vote booleans/strings
let styleCount = 0;
let formalitySum = 0;
let jargonTrue = 0;
let emojiTrue = 0;
const structureCounts = new Map<string, number>();
for (const mem of styleMemories) {
const parsed = parseStyleContent(mem.content);
if (!parsed) continue;
styleCount++;
formalitySum += parsed.formality;
jargon = parsed.jargon;
emoji = parsed.emoji;
structure = parsed.structure;
if (parsed.jargon) jargonTrue++;
if (parsed.emoji) emojiTrue++;
structureCounts.set(parsed.structure, (structureCounts.get(parsed.structure) ?? 0) + 1);
}
if (styleCount > 0) {
formality = formalitySum / styleCount;
jargon = jargonTrue / styleCount > 0.5;
emoji = emojiTrue / styleCount > 0.5;
// Most frequent structure wins
let maxCount = 0;
for (const [s, count] of structureCounts) {
if (count > maxCount) { maxCount = count; structure = s; }
}
}
const formalityBand: CommunicationBrief["style"]["formality"] =
@ -246,14 +256,10 @@ function buildPatternsSection(
? sentiments.reduce((a, b) => a + b, 0) / sentiments.length
: profile?.avgSentiment ?? 0.5;
// Trend: compare first half to second half
// Trend: compare recent batch sentiment against profile's running average
let sentimentTrend: CommunicationBrief["patterns"]["sentimentTrend"] = "stable";
if (sentiments.length >= 4) {
const mid = Math.floor(sentiments.length / 2);
const firstHalf = sentiments.slice(0, mid).reduce((a, b) => a + b, 0) / mid;
const secondHalf =
sentiments.slice(mid).reduce((a, b) => a + b, 0) / (sentiments.length - mid);
const delta = secondHalf - firstHalf;
if (sentiments.length >= 3 && profile?.avgSentiment !== undefined) {
const delta = avgSentiment - profile.avgSentiment;
if (delta > 0.1) sentimentTrend = "warming";
else if (delta < -0.1) sentimentTrend = "cooling";
}
@ -261,24 +267,30 @@ function buildPatternsSection(
return { leadsConversation, deepensTopics, avgSentiment, sentimentTrend };
}
/** Raw signal patterns that should never appear in observation content. */
const RAW_SIGNAL_PATTERNS = [
/^formality:\s/,
/^topic:\s/,
/^leading:\s/,
/^sentiment:\s[\d.]+\s*\(/,
/^response latency:/,
];
function buildObservationsSection(memories: SynapRecallMemory[]): string[] {
return memories
.map((m) => m.content)
.filter((c) => c.length > 0)
.filter((c) => c.length > 0 && !RAW_SIGNAL_PATTERNS.some((p) => p.test(c)))
.slice(0, 5);
}
async function buildCohortSection(
profile: PersonProfile | null
): Promise<CommunicationBrief["cohortPriors"]> {
if (!profile || !profile.cohorts.length) {
if (!profile || !profile.cohorts.length || profile.interactionCount >= 30) {
return { active: false, weight: 0, priors: [] };
}
const weight = 1 / (1 + profile.interactionCount / 10);
if (weight < 0.1) {
return { active: false, weight, priors: [] };
}
const priors = await loadCohortPriors(
profile.cohorts,
@ -292,6 +304,7 @@ async function buildCohortSection(
// Main assembly
// ---------------------------------------------------------------------------
/** Flatten vivid + associated tiers (reconstructed excluded — lower confidence). */
function flattenMemories(result: {
memories: {
vivid: SynapRecallMemory[];
@ -299,10 +312,9 @@ function flattenMemories(result: {
reconstructed: SynapRecallMemory[];
};
}): SynapRecallMemory[] {
return [
...result.memories.vivid,
...result.memories.associated,
];
const vivid = result.memories?.vivid ?? [];
const associated = result.memories?.associated ?? [];
return [...vivid, ...associated];
}
export async function assembleBrief(
@ -310,46 +322,25 @@ export async function assembleBrief(
): Promise<CommunicationBrief> {
const start = Date.now();
// 5 parallel queries
const [topicResult, styleResult, dynamicsResult, observationResult, profile] =
// 6 parallel Synap queries + profile load — single wave
const catchSynap = (label: string) => (err: unknown) => {
const msg = err instanceof Error ? err.message : String(err);
console.error(`[brief] ${label} failed for ${personId.slice(0, 8)}…: ${msg}`);
return null;
};
const [topicResult, styleResult, dynamicsResult, observationResult, engagementResult, profile] =
await Promise.all([
recallByTag(
"topics discussed",
["signal:topic", `person:${personId}`],
20,
0.2
).catch(() => null),
recallByTag(
"communication style",
["signal:style", `person:${personId}`],
10,
0.2
).catch(() => null),
recallByTag(
"conversation dynamics",
["signal:dynamics", `person:${personId}`],
10,
0.2
).catch(() => null),
recallByTag(
"communication patterns",
["observation", `person:${personId}`],
5,
0.3
).catch(() => null),
loadProfile(personId).catch(() => null),
recallByTag("topics discussed", ["signal:topic", `person:${personId}`], 20, 0.2).catch(catchSynap("topic recall")),
recallByTag("communication style", ["signal:style", `person:${personId}`], 10, 0.2).catch(catchSynap("style recall")),
recallByTag("conversation dynamics", ["signal:dynamics", `person:${personId}`], 10, 0.2).catch(catchSynap("dynamics recall")),
recallByTag("communication patterns", ["observation", `person:${personId}`], 5, 0.3).catch(catchSynap("observation recall")),
recallByTag("engagement signals", ["signal:engagement", `person:${personId}`], 10, 0.2).catch(catchSynap("engagement recall")),
loadProfile(personId).catch(catchSynap("profile load")),
]);
// Also fetch engagement signals for sentiment analysis (parallel with cohort)
const [engagementResult, cohortPriors] = await Promise.all([
recallByTag(
"engagement signals",
["signal:engagement", `person:${personId}`],
10,
0.2
).catch(() => null),
buildCohortSection(profile),
]);
// Cohort priors chain off profile (needs interactionCount)
const cohortPriors = await buildCohortSection(profile);
const topicMemories = topicResult ? flattenMemories(topicResult) : [];
const styleMemories = styleResult ? flattenMemories(styleResult) : [];

View File

@ -87,8 +87,9 @@ export async function extractObserverOutput(
}
return output;
} catch {
console.error("[observer] failed to parse ObserverOutput:", raw.slice(0, 300));
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
console.error(`[observer] failed to parse ObserverOutput: ${msg}`, raw.slice(0, 200));
return null;
}
}
@ -144,8 +145,9 @@ export async function synthesizeObservations(
if (!Array.isArray(parsed)) return [];
return parsed.filter((s): s is string => typeof s === "string" && s.length > 0);
} catch {
console.error("[observer] failed to parse synthesis:", raw.slice(0, 300));
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
console.error(`[observer] failed to parse synthesis: ${msg}`, raw.slice(0, 200));
return [];
}
}

View File

@ -0,0 +1,198 @@
import type { CommunicationBrief, ObserverOutput } from "./types";
const ENGINE_URL =
process.env.IKY_ENGINE_URL?.replace(/\/$/, "") ?? "http://127.0.0.1:7777";
const REQUEST_TIMEOUT_MS = 1500;
interface RetrievedItem {
item_id: number;
creator_id: number | null;
title: string | null;
category: string | null;
score: number;
}
interface RetrieveResponse {
items: RetrievedItem[];
}
function hash32(input: string): number {
let h = 0x811c9dc5;
for (let i = 0; i < input.length; i++) {
h ^= input.charCodeAt(i);
h = Math.imul(h, 0x01000193);
}
return (h >>> 0) % 2_147_483_647;
}
function personToUserId(personId: string): number {
return 100_000 + hash32(`person:${personId}`);
}
function creatorFromDomain(domain: string): number {
return 10_000 + hash32(`domain:${domain.toLowerCase()}`);
}
function itemIdForTurn(conversationId: string, turn: number): number {
return 1_000_000 + hash32(`conv:${conversationId}:turn:${turn}`);
}
async function request(path: string, init: RequestInit = {}): Promise<Response> {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
try {
return await fetch(`${ENGINE_URL}${path}`, {
...init,
headers: {
"Content-Type": "application/json",
...(init.headers ?? {}),
},
signal: controller.signal,
});
} finally {
clearTimeout(timer);
}
}
async function postJson(path: string, body: unknown): Promise<boolean> {
try {
const res = await request(path, {
method: "POST",
body: JSON.stringify(body),
});
return res.ok;
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
console.error(`[tidal] POST ${path} failed: ${msg}`);
return false;
}
}
export async function ensurePersonalizationUser(personId: string): Promise<void> {
const userId = personToUserId(personId);
await postJson("/v1/users/upsert", {
user_id: userId,
metadata: {
role: "user",
source: "iknowyou",
},
});
}
export async function ensurePersonalizationSession(
conversationId: string,
personId: string
): Promise<void> {
const userId = personToUserId(personId);
await postJson("/v1/sessions/start", {
conversation_id: conversationId,
user_id: userId,
agent_id: "aeries",
});
}
async function upsertAssistantItem(
conversationId: string,
turn: number,
assistantMessage: string,
domain: string
): Promise<{ itemId: number; creatorId: number }> {
const itemId = itemIdForTurn(conversationId, turn);
const creatorId = creatorFromDomain(domain || "general");
await postJson("/v1/items/upsert", {
item_id: itemId,
creator_id: creatorId,
title: assistantMessage.slice(0, 120),
category: domain || "general",
});
return { itemId, creatorId };
}
function actionFromObserver(output: ObserverOutput): "more" | "less" | "view" {
const s = output.engagement.sentiment_score;
if (s >= 0.6 && output.engagement.substantive) return "more";
if (s <= 0.4 || output.dynamics.redirected) return "less";
return "view";
}
export async function recordObserverPersonalization(params: {
personId: string;
conversationId: string;
turn: number;
assistantMessage: string;
output: ObserverOutput;
}): Promise<void> {
const { personId, conversationId, turn, assistantMessage, output } = params;
const userId = personToUserId(personId);
const domain = output.topic.domain || "general";
const { itemId, creatorId } = await upsertAssistantItem(
conversationId,
turn,
assistantMessage,
domain
);
const action = actionFromObserver(output);
await postJson("/v1/feedback", {
user_id: userId,
item_id: itemId,
creator_id: creatorId,
action,
});
// Keep sessions warm with signal-level writes for per-conversation context.
await postJson("/v1/sessions/signal", {
conversation_id: conversationId,
signal_type: action === "more" ? "like" : action === "less" ? "skip" : "view",
item_id: itemId,
weight: 1.0,
annotation: `topic:${output.topic.primary} domain:${domain}`,
});
// Optional auxiliary memory (Synap when configured in engine server).
if (output.engagement.sentiment_score >= 0.75) {
await postJson("/v1/aux/observation", {
person_id: userId,
observation: `Strong positive response to ${domain} / ${output.topic.primary} conversation style`,
});
}
}
export async function addPersonalizationHints(
personId: string,
brief?: CommunicationBrief
): Promise<CommunicationBrief | undefined> {
if (!brief) return brief;
const userId = personToUserId(personId);
try {
const res = await request(`/v1/retrieve?user_id=${userId}&limit=5`);
if (!res.ok) return brief;
const data = await res.json();
if (!Array.isArray(data?.items) || !data.items.length) return brief;
const items = data.items as RetrievedItem[];
const hintLines = items.map((item) => {
const category = item.category ?? "general";
const title = item.title ?? `item-${item.item_id}`;
return `tidal hint: ${category} style (score ${item.score.toFixed(2)}) via \"${title}\"`;
});
return {
...brief,
observations: [...hintLines, ...brief.observations].slice(0, 8),
};
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
console.error(`[tidal] personalization hints failed: ${msg}`);
return brief;
}
}

View File

@ -161,7 +161,7 @@ export async function* streamChat(
const token = chunk.choices?.[0]?.delta?.content;
if (token) yield token;
} catch {
// skip malformed chunks
// Malformed SSE chunks are expected from partial frame splits — skip silently
}
}
}

View File

@ -4,6 +4,8 @@
"private": true,
"scripts": {
"dev": "next dev -p 59521",
"dev:engine": "cargo run -p iknowyou-engine --bin server --features synap-aux",
"dev:all": "sh -c 'npm run dev:engine & npm run dev'",
"build": "next build",
"start": "next start -p 59521"
},

View File

@ -0,0 +1,239 @@
#!/usr/bin/env node
// M5 Communication Brief — 10-persona integration test
import crypto from "crypto";
const API = "http://localhost:59521";
const personas = [
{
name: "casual-tech",
messages: [
"yo have you ever messed with rust? trying to figure out if its worth learning",
"yeah but like the borrow checker seems insane. is it really that bad",
"hmm ok what about async stuff. heard tokio is the move",
"cool cool. i mostly do typescript rn so maybe its a big jump",
"bet. might just start with some cli tools first",
],
},
{
name: "formal-academic",
messages: [
"I've been researching the implications of large language models on academic writing. What are your thoughts on the epistemological challenges they present?",
"That is an interesting perspective. I am particularly concerned with the reproducibility crisis that may emerge when AI-generated text becomes indistinguishable from human-authored work.",
"Indeed. My current research examines citation integrity in the context of synthetic text generation. The methodological implications are quite significant.",
"I appreciate your engagement with this topic. Have you considered the role of institutional review boards in establishing guidelines for AI-assisted research?",
"Precisely. I believe we need a comprehensive framework that addresses both the ethical and methodological dimensions of this paradigm shift.",
],
},
{
name: "emotional",
messages: [
"hey... having kind of a rough day. do you ever just feel like nothing makes sense",
"yeah i dont know. work stuff mostly. feeling like im not good enough",
"thats actually really nice to hear. i guess i just compare myself to everyone",
"youre right. i think i need to be easier on myself. its just hard sometimes",
"thanks for listening. seriously. most people just say cheer up and move on",
],
},
{
name: "rapid-fire",
messages: [
"whats the best programming language",
"ok but why not python? also whats your take on AI replacing developers",
"interesting. what about quantum computing? will it change everything?",
"sure but when? also do you think remote work is dying? and whats the deal with web3",
"lol ok last one. tabs or spaces?",
],
},
{
name: "deep-diver",
messages: [
"been thinking a lot about consensus algorithms lately. raft vs paxos which do you think is more practical",
"yeah rafts understandability is a huge win. but what about the leader bottleneck? in high-throughput scenarios it becomes a real issue",
"exactly. thats why ive been looking at multi-raft where you shard the state machine. cockroachdb does this well",
"the tricky part is cross-range transactions though. you need some form of 2PC or parallel commits",
"right. i think the future is deterministic databases like calvin where you pre-order transactions. eliminates coordination entirely",
],
},
{
name: "emoji-fan",
messages: [
"hiii just discovered this app and im obsessed already omg",
"yes! do you like music? im really into kpop rn",
"blackpink is my absolute fave but also really vibing with newjeans lately",
"yesss taste! what about movies? seen anything good lately?",
"ooh ill check it out! thanks bestie",
],
},
{
name: "skeptic",
messages: [
"AI chatbots are mostly hype. Change my mind.",
"Thats a surface-level argument. Most benchmarks are gamed and dont reflect real-world utility.",
"Youre oversimplifying. The economic analysis doesnt support widespread adoption when you factor in inference costs and hallucination liability.",
"Thats incorrect. The study youre likely referencing has significant methodological flaws.",
"Ill concede narrow applications show promise. But the general intelligence narrative is fundamentally misleading.",
],
},
{
name: "creative-writer",
messages: [
"ive been working on a short story about a lighthouse keeper who discovers the light attracts something from the deep ocean. want to hear about it?",
"so the keeper notices the fish patterns change when the light hits a certain frequency. they start swimming in spirals. then one night something massive surfaces",
"exactly that tension! i want the reader to feel the keepers isolation. she cant tell anyone because the coast guard would shut down the lighthouse",
"ooh what if the creature communicates through bioluminescence? like its been trying to respond to the lighthouse for centuries",
"yes! and the ending she has to choose between warning the world and protecting this ancient being. i think she chooses silence",
],
},
{
name: "shy-terse",
messages: ["hi", "not much", "i guess i like reading", "fantasy mostly", "yeah sanderson is ok"],
},
{
name: "multi-domain",
messages: [
"been learning to cook thai food this week. green curry from scratch is no joke",
"oh totally different topic but have you been following the mars rover updates?",
"yeah the organic compounds thing. anyway do you play any instruments? i just started guitar",
"haha yeah my fingers hurt. oh hey what do you think about intermittent fasting?",
"makes sense. one more random one whats your take on minimalism as a lifestyle",
],
},
];
async function parseSseResponse(res) {
const reader = res.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
let output = "";
while (true) {
const { done, value } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop();
for (const line of lines) {
const trimmed = line.trim();
if (trimmed === "data: [DONE]") continue;
if (!trimmed.startsWith("data: ")) continue;
try {
const data = JSON.parse(trimmed.slice(6));
if (data.token) output += data.token;
if (data.error) return `[ERROR: ${data.error}]`;
} catch {}
}
}
return output;
}
async function runPersona(persona) {
const personId = crypto.randomUUID();
const conversationId = crypto.randomUUID();
const history = [];
console.log(`\n[${ persona.name }] Starting (${personId.slice(0, 8)}…)`);
for (let i = 0; i < persona.messages.length; i++) {
const msg = persona.messages[i];
history.push({ role: "user", content: msg });
try {
const res = await fetch(`${API}/api/chat`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ messages: [...history], conversationId, personId }),
signal: AbortSignal.timeout(30000),
});
if (!res.ok) {
console.log(` Turn ${i + 1}/5: HTTP ${res.status}`);
continue;
}
const response = await parseSseResponse(res);
history.push({ role: "assistant", content: response });
console.log(` Turn ${i + 1}/5: "${msg.slice(0, 45)}…" → "${response.slice(0, 55)}…"`);
} catch (err) {
console.log(` Turn ${i + 1}/5: ERROR ${err.message}`);
}
// Let observer process
await new Promise((r) => setTimeout(r, 800));
}
// Wait for observer signals to propagate to Synap
console.log(`[${persona.name}] Waiting for signals...`);
await new Promise((r) => setTimeout(r, 3000));
// Fetch brief
try {
const briefRes = await fetch(`${API}/api/brief/${personId}`);
const brief = await briefRes.json();
return { name: persona.name, personId, brief };
} catch (err) {
return { name: persona.name, personId, brief: null, error: err.message };
}
}
async function main() {
console.log("=== M5 Communication Brief — 10 Persona Test ===");
console.log(`Server: ${API}`);
const results = [];
for (const persona of personas) {
const result = await runPersona(persona);
results.push(result);
}
console.log("\n\n========================================");
console.log(" BRIEF SUMMARY");
console.log("========================================\n");
for (const r of results) {
const b = r.brief;
if (!b) {
console.log(`[${r.name}] NO BRIEF (${r.error})`);
continue;
}
const hotTopics = (b.topics?.hot || []).map((t) => `${t.topic}(${t.specificity})`).join(", ");
const coldTopics = (b.topics?.cold || []).map((t) => t.topic).join(", ");
const domains = (b.topics?.domains || []).join(", ");
const obs = (b.observations || []).length;
const cohort = b.cohortPriors?.active ? `active(${(b.cohortPriors.weight * 100).toFixed(0)}%)` : "inactive";
console.log(`[${r.name}] ${r.personId.slice(0, 8)}`);
console.log(` interactions: ${b.interactionCount}`);
console.log(` style: ${b.style?.formality}/${b.style?.length} | jargon=${b.style?.usesJargon} emoji=${b.style?.usesEmoji} | structure=${b.style?.structure}`);
console.log(` sentiment: ${b.patterns?.avgSentiment?.toFixed?.(2) ?? b.patterns?.avgSentiment} (${b.patterns?.sentimentTrend}) | leads=${b.patterns?.leadsConversation} deepens=${b.patterns?.deepensTopics}`);
console.log(` topics hot: [${hotTopics}]`);
if (coldTopics) console.log(` topics cold: [${coldTopics}]`);
console.log(` domains: [${domains}]`);
console.log(` observations: ${obs}${obs > 0 ? " — " + b.observations.map((o) => `"${o.slice(0, 60)}"`) .join("; ") : ""}`);
console.log(` cohort: ${cohort} | priors=${(b.cohortPriors?.priors || []).length}`);
console.log(` assembled in ${b.assemblyMs}ms`);
console.log();
}
// Count populated sections
console.log("========================================");
console.log(" SECTION POPULATION");
console.log("========================================\n");
for (const r of results) {
const b = r.brief;
if (!b) continue;
let populated = 0;
if ((b.topics?.hot || []).length > 0) populated++;
if (b.style?.formality && b.style.formality !== "moderate") populated++;
if ((b.observations || []).length > 0) populated++;
if (b.patterns?.sentimentTrend && b.patterns.sentimentTrend !== "stable") populated++;
if (b.cohortPriors?.active) populated++;
console.log(`[${r.name}] ${populated}/5 sections populated`);
}
}
main().catch(console.error);

View File

@ -0,0 +1,203 @@
#!/bin/bash
# M5 Communication Brief — 10-persona integration test
set -euo pipefail
API="http://localhost:59521"
RESULTS_DIR="/tmp/brief-test-results"
rm -rf "$RESULTS_DIR"
mkdir -p "$RESULTS_DIR"
# Parse SSE stream into plain text
parse_sse() {
local output=""
while IFS= read -r line; do
line="${line%$'\r'}"
if [[ "$line" == "data: [DONE]" ]]; then break; fi
if [[ "$line" == data:* ]]; then
local json="${line#data: }"
local token
token=$(echo "$json" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('token',''),end='')" 2>/dev/null || true)
output+="$token"
fi
done
echo "$output"
}
# Send a multi-turn conversation
# Args: persona_name personId convId msg1 msg2 msg3 msg4 msg5
run_persona() {
local name="$1"
local pid="$2"
local cid="$3"
shift 3
local msgs=("$@")
echo "[$name] Starting (${pid:0:8}…)"
local history="[]"
for i in "${!msgs[@]}"; do
local msg="${msgs[$i]}"
# Add user message to history
history=$(echo "$history" | python3 -c "
import sys, json
h = json.load(sys.stdin)
h.append({'role': 'user', 'content': '''${msg//\'/\'\\\'\'}'''})
print(json.dumps(h))
")
# Send request and capture response
local response
response=$(curl -s -N -X POST "$API/api/chat" \
-H "Content-Type: application/json" \
-d "$(python3 -c "
import json
h = json.loads('''$(echo "$history" | sed "s/'/\\\\'/g")''')
print(json.dumps({'messages': h, 'conversationId': '$cid', 'personId': '$pid'}))
")" \
--max-time 30 2>/dev/null | parse_sse)
if [[ -z "$response" ]]; then
echo "[$name] Turn $((i+1)): NO RESPONSE"
continue
fi
# Add assistant response to history
history=$(echo "$history" | python3 -c "
import sys, json
h = json.load(sys.stdin)
h.append({'role': 'assistant', 'content': '''${response//\'/\'\\\'\'}'''})
print(json.dumps(h))
")
echo "[$name] Turn $((i+1))/5: user='${msg:0:50}…' → resp='${response:0:60}…'"
# Small delay for observer to process
sleep 1
done
# Wait for observer signals to propagate
echo "[$name] Waiting for observer signals..."
sleep 3
# Fetch brief
echo "[$name] Fetching brief..."
curl -s "$API/api/brief/$pid" | python3 -m json.tool > "$RESULTS_DIR/${name}.json" 2>/dev/null || echo "{}" > "$RESULTS_DIR/${name}.json"
echo "[$name] Done → $RESULTS_DIR/${name}.json"
}
echo "=== M5 Communication Brief — 10 Persona Test ==="
echo ""
# Generate unique IDs
p1=$(uuidgen) c1=$(uuidgen)
p2=$(uuidgen) c2=$(uuidgen)
p3=$(uuidgen) c3=$(uuidgen)
p4=$(uuidgen) c4=$(uuidgen)
p5=$(uuidgen) c5=$(uuidgen)
p6=$(uuidgen) c6=$(uuidgen)
p7=$(uuidgen) c7=$(uuidgen)
p8=$(uuidgen) c8=$(uuidgen)
p9=$(uuidgen) c9=$(uuidgen)
p10=$(uuidgen) c10=$(uuidgen)
# Run all 10 personas sequentially
run_persona "casual-tech" "$p1" "$c1" \
"yo have you ever messed with rust? trying to figure out if its worth learning" \
"yeah but like the borrow checker seems insane. is it really that bad" \
"hmm ok what about async stuff. heard tokio is the move" \
"cool cool. i mostly do typescript rn so maybe its a big jump" \
"bet. might just start with some cli tools first"
run_persona "formal-academic" "$p2" "$c2" \
"I've been researching the implications of large language models on academic writing. What are your thoughts on the epistemological challenges they present?" \
"That is an interesting perspective. I am particularly concerned with the reproducibility crisis that may emerge when AI-generated text becomes indistinguishable from human-authored work." \
"Indeed. My current research examines citation integrity in the context of synthetic text generation. The methodological implications are quite significant." \
"I appreciate your engagement with this topic. Have you considered the role of institutional review boards in establishing guidelines for AI-assisted research?" \
"Precisely. I believe we need a comprehensive framework that addresses both the ethical and methodological dimensions of this paradigm shift."
run_persona "emotional" "$p3" "$c3" \
"hey... having kind of a rough day. do you ever just feel like nothing makes sense" \
"yeah i dont know. work stuff mostly. feeling like im not good enough" \
"thats actually really nice to hear. i guess i just compare myself to everyone" \
"youre right. i think i need to be easier on myself. its just hard sometimes" \
"thanks for listening. seriously. most people just say cheer up and move on"
run_persona "rapid-fire" "$p4" "$c4" \
"whats the best programming language" \
"ok but why not python? also whats your take on AI replacing developers" \
"interesting. what about quantum computing? will it change everything?" \
"sure but when? also do you think remote work is dying? and whats the deal with web3" \
"lol ok last one. tabs or spaces?"
run_persona "deep-diver" "$p5" "$c5" \
"been thinking a lot about consensus algorithms lately. raft vs paxos which do you think is more practical" \
"yeah rafts understandability is a huge win. but what about the leader bottleneck? in high-throughput scenarios it becomes a real issue" \
"exactly. thats why ive been looking at multi-raft where you shard the state machine. cockroachdb does this well" \
"the tricky part is cross-range transactions though. you need some form of 2PC or parallel commits. spanners truetime approach is elegant but impractical for most" \
"right. i think the future is deterministic databases like calvin where you pre-order transactions. eliminates coordination entirely"
run_persona "emoji-fan" "$p6" "$c6" \
"hiii just discovered this app and im obsessed already omg" \
"yes do you like music? im really into kpop rn" \
"blackpink is my absolute fave but also really vibing with newjeans lately" \
"yesss taste what about movies? seen anything good" \
"ooh ill check it out thanks bestie"
run_persona "skeptic" "$p7" "$c7" \
"AI chatbots are mostly hype. Change my mind." \
"Thats a surface-level argument. Most benchmarks are gamed and dont reflect real-world utility. The actual failure rate in production is much higher than reported." \
"Youre oversimplifying. The economic analysis doesnt support widespread adoption when you factor in inference costs, hallucination liability, and the need for human oversight." \
"Thats incorrect. The study youre likely referencing has significant methodological flaws. I can point to three counter-studies." \
"Ill concede that narrow applications show promise. But the general intelligence narrative is fundamentally misleading."
run_persona "creative-writer" "$p8" "$c8" \
"ive been working on a short story about a lighthouse keeper who discovers the light attracts something from the deep ocean. want to hear about it" \
"so the keeper notices the fish patterns change when the light hits a certain frequency. they start swimming in spirals. then one night something massive surfaces" \
"exactly that tension. i want the reader to feel the keepers isolation. she cant tell anyone because the coast guard would shut down the lighthouse" \
"ooh thats a great idea. what if the creature communicates through bioluminescence? like its been trying to respond to the lighthouse for centuries" \
"yes and the ending she has to choose between warning the world and protecting this ancient being. i think she chooses silence"
run_persona "shy-terse" "$p9" "$c9" \
"hi" \
"not much" \
"i guess i like reading" \
"fantasy mostly" \
"yeah sanderson is ok"
run_persona "multi-domain" "$p10" "$c10" \
"been learning to cook thai food this week. green curry from scratch is no joke" \
"oh totally different topic but have you been following the mars rover updates? they found something wild" \
"yeah the organic compounds thing. anyway do you play any instruments? i just started guitar" \
"haha yeah my fingers hurt. oh hey what do you think about intermittent fasting? friend keeps pushing it" \
"makes sense. alright one more random one whats your take on minimalism as a lifestyle"
echo ""
echo "=== All personas complete. Checking briefs... ==="
echo ""
# Summary
for f in "$RESULTS_DIR"/*.json; do
name=$(basename "$f" .json)
topics=$(python3 -c "
import json
with open('$f') as fh:
d = json.load(fh)
hot = d.get('topics',{}).get('hot',[])
style = d.get('style',{}).get('formality','?')
length = d.get('style',{}).get('length','?')
obs = len(d.get('observations',[]))
ms = d.get('assemblyMs', '?')
count = d.get('interactionCount', 0)
domains = d.get('topics',{}).get('domains',[])
cohort = 'active' if d.get('cohortPriors',{}).get('active') else 'inactive'
sentiment = d.get('patterns',{}).get('avgSentiment','?')
trend = d.get('patterns',{}).get('sentimentTrend','?')
print(f' style={style}/{length} | sentiment={sentiment} ({trend}) | topics={len(hot)} hot, domains={domains[:3]} | obs={obs} | cohort={cohort} | {ms}ms | {count} interactions')
" 2>/dev/null || echo " PARSE ERROR")
echo "[$name]"
echo "$topics"
done
echo ""
echo "=== Full briefs saved to $RESULTS_DIR/ ==="

File diff suppressed because one or more lines are too long

30
docker/cluster/Dockerfile Normal file
View File

@ -0,0 +1,30 @@
FROM rust:1.76 as builder
WORKDIR /app
# Copy workspace manifests first for caching.
COPY Cargo.toml Cargo.lock ./
COPY tidal/Cargo.toml tidal/Cargo.toml
COPY tidalctl/Cargo.toml tidalctl/Cargo.toml
COPY tidal-server/Cargo.toml tidal-server/Cargo.toml
COPY applications/forage/engine/Cargo.toml applications/forage/engine/Cargo.toml
COPY applications/forage/server/Cargo.toml applications/forage/server/Cargo.toml
COPY applications/forage/embedder/Cargo.toml applications/forage/embedder/Cargo.toml
COPY applications/iknowyou/engine/Cargo.toml applications/iknowyou/engine/Cargo.toml
# Copy full workspace.
COPY . .
RUN cargo build -p tidal-server --release
FROM debian:bookworm-slim
WORKDIR /srv
RUN useradd --system --home /srv tidal && \
apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
COPY --from=builder /app/target/release/tidal-server /usr/local/bin/tidal-server
COPY tidal-server/config /etc/tidal-server
USER tidal
EXPOSE 9500
ENTRYPOINT ["tidal-server", "cluster", "--listen", "0.0.0.0:9500", "--schema", "/etc/tidal-server/default-schema.yaml", "--topology", "/etc/tidal-server/default-cluster.yaml"]

166
docs/runbooks/cluster.md Normal file
View File

@ -0,0 +1,166 @@
# tidalDB Cluster Runbook
This runbook describes how to operate the simulated multi-region tidalDB
cluster that ships with `tidal-server`. The cluster reuses the
`SimulatedCluster` fabric — it runs multiple in-process nodes, replays the
real WAL + CRDT reconciliation paths, and exposes a single HTTP surface
for microservices.
> **Important limitations**
>
> - Cluster mode currently replicates global signals only. `user_id` /
> `creator_id` contexts are rejected so followers stay consistent with the
> leaders WAL stream.
> - All metadata and embedding writes are broadcast to every region up front.
> There is no separate replication log for items yet.
## Prerequisites
- Rust toolchain ≥ 1.91 if running directly.
- Docker 25+ if running via container.
- Port 9500 available (default cluster listener).
## 1. Launch the cluster locally
```bash
cargo run -p tidal-server -- \
cluster \
--listen 127.0.0.1:9500 \
--schema tidal-server/config/default-schema.yaml \
--topology tidal-server/config/default-cluster.yaml
```
The default topology spins up three regions (`us-east`, `eu-west`,
`ap-south`) with `us-east` as leader.
## 2. Launch via Docker
```bash
# Build the image once
docker build -f docker/cluster/Dockerfile -t tidal-cluster .
# Run (press Ctrl+C to stop)
docker run --rm -p 9500:9500 tidal-cluster
```
To supply custom schema/topology files:
```bash
docker run --rm -p 9500:9500 \
-v $PWD/configs/my-schema.yaml:/srv/schema.yaml \
-v $PWD/configs/my-topology.yaml:/srv/topology.yaml \
tidal-cluster \
tidal-server cluster \
--listen 0.0.0.0:9500 \
--schema /srv/schema.yaml \
--topology /srv/topology.yaml
```
## 3. Core API calls
All routes are JSON unless noted.
### Health
```bash
curl http://localhost:9500/health
```
Returns overall status and item count on the leader.
### Register items & embeddings
```bash
curl -X POST http://localhost:9500/items \
-H 'Content-Type: application/json' \
-d '{ "entity_id": 1, "metadata": { "title": "Jazz Piano", "category": "music" } }'
curl -X POST http://localhost:9500/embeddings \
-H 'Content-Type: application/json' \
-d '{ "entity_id": 1, "values": [0.1, 0.2, 0.3, 0.4] }'
```
### Record signals (cluster mode = global only)
```bash
curl -X POST http://localhost:9500/signals \
-H 'Content-Type: application/json' \
-d '{ "entity_id": 1, "signal": "view", "weight": 1.0 }'
```
### Retrieve and search
```bash
curl "http://localhost:9500/feed?user_id=42&profile=trending&limit=10"
curl "http://localhost:9500/search?query=jazz%20piano&limit=5"
# Target a specific region (followers may lag during partitions)
curl "http://localhost:9500/feed?profile=trending&region=eu-west"
```
## 4. Cluster operations
### Check cluster status
```bash
curl http://localhost:9500/cluster/status | jq
```
Sample response:
```json
{
"leader": "us-east",
"relay_log_len": 125,
"regions": [
{ "name": "us-east", "applied_events": 125, "lag_events": 0, "partitioned": false },
{ "name": "eu-west", "applied_events": 125, "lag_events": 0, "partitioned": false },
{ "name": "ap-south", "applied_events": 124, "lag_events": 1, "partitioned": false }
]
}
```
### Promote a new leader
```bash
curl -X POST http://localhost:9500/cluster/promote \
-H 'Content-Type: application/json' \
-d '{ "region": "eu-west" }'
```
`/cluster/status` will now report `eu-west` as leader. New writes are routed
there and replayed to the other regions.
### Simulate a partition & heal
```bash
# Isolate ap-south (writes will skip this follower)
curl -X POST http://localhost:9500/cluster/partition \
-H 'Content-Type: application/json' \
-d '{ "region": "ap-south" }'
# Heal the partition (missed batches are replayed automatically)
curl -X POST http://localhost:9500/cluster/heal \
-H 'Content-Type: application/json' \
-d '{ "region": "ap-south" }'
```
Monitor `/cluster/status` to confirm lag drops back to zero after healing.
## 5. Runbook checklist
1. **Startup** — launch `tidal-server cluster …` (or Docker). Confirm log line
`listening on http://…`.
2. **Baseline health**`GET /health` and `GET /cluster/status` return `200`.
3. **Seed data**`POST /items`, `/embeddings`, `/signals` for initial items.
4. **Traffic** — microservices call `/signals`, `/feed`, `/search`. Add `region`
query param to pin to a follower for canary reads.
5. **Failover** — to move traffic during maintenance, `POST /cluster/promote`
to the target region. Verify status before proceeding.
6. **Partition drill**`POST /cluster/partition` to isolate a follower,
observe lag, then `POST /cluster/heal`.
7. **Shutdown** — send SIGINT (Ctrl+C) or stop the container. The server logs
`shutdown signal received` and exits cleanly.
Refer to `docs/planning/ROADMAP.md` for the underlying distributed
fabric guarantees and property tests.

View File

@ -223,6 +223,9 @@ async fn search(
builder = builder.for_user(user_id);
}
let search = builder.build().map_err(|e| TidalErrorWrapper(e.into()))?;
state
.reload_text_index(query.region.as_deref())
.map_err(AppError)?;
let result = state
.search(query.region.as_deref(), &search)
.map_err(AppError)?;

View File

@ -169,7 +169,7 @@ impl ServerState {
"cluster mode currently supports only global signals (no user_id/creator_id)".into(),
));
}
cluster.cluster.write_signal(signal_name, entity_id, weight);
cluster.cluster.write_signal(signal_name, entity_id, weight).map_err(ServerError::from)?;
Ok(())
}
}
@ -189,6 +189,27 @@ impl ServerState {
}
}
/// Reload the Tantivy text index reader so the next search sees recently
/// committed documents.
///
/// On-disk indexes auto-reload via `OnCommitWithDelay`; ephemeral indexes
/// (the default for standalone and cluster modes) use `ReloadPolicy::Manual`
/// and require an explicit reload before each search.
pub fn reload_text_index(&self, region_name: Option<&str>) -> Result<()> {
match &self.mode {
Mode::Standalone(db) => db.reload_text_index().map_err(ServerError::from),
Mode::Cluster(cluster) => {
let region = cluster.resolve_region(region_name)?;
cluster
.cluster
.node(region)
.db
.reload_text_index()
.map_err(ServerError::from)
}
}
}
pub fn search(
&self,
region_name: Option<&str>,
@ -231,11 +252,18 @@ impl ServerState {
.map(|(id, name)| {
let rid = RegionId(*id);
let applied = cluster.cluster.applied_count(rid);
let lag = leader_seqno as i64 - applied as i64;
// The leader writes directly (no receiver thread), so its
// applied_seqno via replication is always 0. Report 0 lag
// for the leader since it is the authoritative source.
let lag = if rid == leader_id {
0
} else {
(leader_seqno as i64 - applied as i64).max(0)
};
RegionStatus {
name: name.clone(),
applied_events: applied,
lag_events: lag.max(0),
lag_events: lag,
partitioned: cluster.cluster.is_partitioned(rid),
}
})

View File

@ -38,7 +38,6 @@ use crate::schema::Schema;
/// # Ok(())
/// # }
/// ```
#[derive(Debug)]
pub struct TidalDbBuilder {
config: Config,
/// Address for the optional metrics HTTP server (e.g. "127.0.0.1:9090").
@ -51,6 +50,11 @@ pub struct TidalDbBuilder {
/// Optional rate limiter configuration. When `None`, the default
/// (unlimited) config is used.
rate_limiter_config: Option<RateLimiterConfig>,
/// Optional transport for replication.
///
/// When set and `NodeRole::Follower`, the receiver is started automatically.
/// When set and `NodeRole::Leader`, the WAL shipper is spawned automatically.
transport: Option<std::sync::Arc<dyn crate::replication::Transport>>,
}
impl TidalDbBuilder {
@ -62,6 +66,7 @@ impl TidalDbBuilder {
metrics_addr: None,
schema: None,
rate_limiter_config: None,
transport: None,
}
}
@ -102,6 +107,23 @@ impl TidalDbBuilder {
self
}
/// Attach a replication transport.
///
/// When the node role is `Follower`, `open()` automatically calls
/// `start_replication(transport)` after the database is constructed.
/// When the role is `Leader`, `open()` spawns the WAL shipper thread
/// using this transport.
///
/// Not required for single-node (`NodeRole::Single`) deployments.
#[must_use]
pub fn with_transport(
mut self,
transport: std::sync::Arc<dyn crate::replication::Transport>,
) -> Self {
self.transport = Some(transport);
self
}
/// Switch to ephemeral (in-memory) mode, clearing any directory paths.
///
/// This is the default mode. Calling this is only necessary to reset
@ -253,6 +275,7 @@ impl TidalDbBuilder {
/// Returns [`crate::TidalError`] if validation fails or initialization
/// encounters an error.
#[tracing::instrument(skip(self), fields(mode = ?self.config.mode))]
#[allow(clippy::too_many_lines)]
pub fn open(mut self) -> crate::Result<TidalDb> {
self.validate()?;
self.resolve_defaults();
@ -341,6 +364,54 @@ impl TidalDbBuilder {
// Restore previously archived session snapshots from storage.
db.restore_sessions();
// Wire replication transport based on node role.
if let Some(transport) = self.transport {
use super::config::NodeRole;
match db.config.cluster.role {
NodeRole::Follower => {
// Start the segment receiver thread.
db.start_replication(transport).map_err(|e| {
crate::TidalError::internal(
"open",
format!("failed to start replication receiver: {e}"),
)
})?;
tracing::info!(
shard = %db.config.cluster.shard_id,
"follower: replication receiver started"
);
}
NodeRole::Leader => {
// Spawn the WAL shipper thread (persistent mode only: needs WAL dir).
if let Some(ref wal_dir) = db.config.wal_dir {
let shipper_config = crate::replication::ShipperConfig {
wal_dir: wal_dir.clone(),
shard_id: db.config.cluster.shard_id,
peer_shards: db.config.cluster.peer_shards.clone(),
poll_interval: std::time::Duration::from_secs(2),
};
let handle =
crate::replication::spawn_shipper(shipper_config, transport);
db.shipper_handle = Some(handle);
tracing::info!(
shard = %db.config.cluster.shard_id,
peers = ?db.config.cluster.peer_shards,
"leader: WAL shipper started"
);
} else {
tracing::warn!(
"leader role configured but no WAL dir available; \
shipper not started (ephemeral mode)"
);
}
}
NodeRole::Single => {
// Single-node: no replication threads needed.
tracing::debug!("transport provided but NodeRole::Single; ignored");
}
}
}
// Start the session TTL sweeper for persistent-mode databases.
// Ephemeral/test databases use force_sweep() for explicit control.
if db.config.mode == StorageMode::Persistent {

View File

@ -27,6 +27,7 @@ fn builder_persistent_requires_data_dir() {
metrics_addr: None,
schema: None,
rate_limiter_config: None,
transport: None,
};
let result = builder.validate();
assert!(result.is_err());

View File

@ -0,0 +1,40 @@
//! Diagnostics and health check methods for [`TidalDb`].
use std::sync::Arc;
use std::sync::atomic::Ordering;
use super::{MetricsState, TidalDb};
impl TidalDb {
/// Returns a reference to the shared metrics state.
#[must_use]
#[allow(clippy::missing_const_for_fn)] // Arc field prevents const in practice
pub fn metrics(&self) -> &Arc<MetricsState> {
&self.metrics
}
/// Access the load detector for metrics and health check reporting.
#[must_use]
#[allow(clippy::missing_const_for_fn)] // Arc field prevents const in practice
pub fn load_detector(&self) -> &crate::load::LoadDetector {
&self.load_detector
}
/// Returns `Ok(())` if the database is initialized and operational.
///
/// # Errors
///
/// Returns an error if the database has been closed or an internal
/// check fails.
#[tracing::instrument(skip(self))]
pub fn health_check(&self) -> crate::Result<()> {
if self.closed.load(Ordering::Acquire) {
self.metrics.health_ok.store(false, Ordering::Release);
return Err(crate::TidalError::internal(
"health_check",
"database is closed",
));
}
Ok(())
}
}

View File

@ -58,8 +58,9 @@ impl TidalDb {
// in recv_segment until the sender side is dropped.
if let Ok(mut guard) = self.receiver_handle.lock()
&& let Some(handle) = guard.take()
&& let Err(e) = handle.join()
{
handle.join();
tracing::warn!(error = %e, "segment receiver exited with corruption error");
}
// M7p2: Signal sweeper to stop and join the thread.

View File

@ -38,7 +38,7 @@ pub use paths::Paths;
#[cfg(any(test, feature = "test-utils"))]
pub use temp::TempTidalHome;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::atomic::{AtomicBool, AtomicU64};
use std::sync::{Arc, RwLock};
use roaring::RoaringBitmap;
@ -62,6 +62,7 @@ use self::open::OpenResult;
use self::state_rebuild::run_checkpoint_thread;
use self::storage_box::StorageBox;
mod diagnostics;
mod lifecycle;
mod replication_ops;
mod sweeper;
@ -153,6 +154,10 @@ pub struct TidalDb {
// M8p2 replication
replication_state: Arc<crate::replication::state::ReplicationState>,
receiver_handle: std::sync::Mutex<Option<crate::replication::receiver::SegmentReceiverHandle>>,
#[allow(dead_code)] // Held for its Drop side effect (thread join on shutdown).
shipper_handle: Option<crate::replication::WalShipperHandle>,
// M8p4 session replication bridge (dedup + cross-region session visibility)
session_bridge: Arc<crate::replication::SessionReplicationBridge>,
// M8p5 control plane + multi-tenancy
tenant_router: Arc<crate::replication::TenantRouter>,
control_plane: Arc<crate::replication::ControlPlane>,
@ -171,6 +176,28 @@ impl TidalDb {
TidalDbBuilder::new()
}
/// Build a default single-shard `SessionReplicationBridge` for single-node deployments.
///
/// Uses an in-process self-loop transport (`ShardId::SINGLE`). In single-node mode
/// the bridge is never actually used for shipping (no peers), but the seqno tracker
/// and idempotency store are exercised on replay to prevent duplicate application.
fn make_single_node_session_bridge() -> Arc<crate::replication::SessionReplicationBridge> {
use crate::replication::{
IdempotencyStore, InProcessSessionTransportFactory, SessionReplicationBridge, ShardId,
};
use crate::session::state::SessionSeqNoTracker;
let shards = [ShardId::SINGLE];
let mut transports = InProcessSessionTransportFactory::new(&shards).build();
let transport = transports
.remove(&ShardId::SINGLE)
.expect("transport for ShardId::SINGLE always present");
Arc::new(SessionReplicationBridge::new(
transport,
Arc::new(IdempotencyStore::default_capacity()),
Arc::new(SessionSeqNoTracker::new()),
))
}
/// Construct a `TidalDb` without a schema (M0 compatibility mode).
#[allow(clippy::missing_const_for_fn)]
pub(crate) fn from_config(
@ -246,6 +273,8 @@ impl TidalDb {
backup_in_progress: Arc::new(AtomicBool::new(false)),
replication_state: Arc::new(crate::replication::state::ReplicationState::single()),
receiver_handle: std::sync::Mutex::new(None),
shipper_handle: None,
session_bridge: Self::make_single_node_session_bridge(),
lock_file: None,
tenant_router: {
let topo = Arc::new(RwLock::new(
@ -430,6 +459,14 @@ impl TidalDb {
std::sync::Mutex::new(handle)
};
// Compute the replication state before moving config into the struct.
// Include both the local shard and any configured peer shards.
let replication_state = {
let mut shards: Vec<crate::replication::ShardId> = vec![config.cluster.shard_id];
shards.extend_from_slice(&config.cluster.peer_shards);
Arc::new(crate::replication::state::ReplicationState::new(&shards))
};
let db = Self {
config,
closed: AtomicBool::new(false),
@ -486,8 +523,10 @@ impl TidalDb {
shutdown_sweeper: Arc::new(AtomicBool::new(false)),
sweeper_thread: std::sync::Mutex::new(None),
backup_in_progress: Arc::new(AtomicBool::new(false)),
replication_state: Arc::new(crate::replication::state::ReplicationState::single()),
replication_state,
receiver_handle: std::sync::Mutex::new(None),
shipper_handle: None,
session_bridge: Self::make_single_node_session_bridge(),
lock_file: None,
tenant_router: {
let topo = Arc::new(RwLock::new(
@ -522,20 +561,6 @@ impl TidalDb {
db
}
/// Returns a reference to the shared metrics state.
#[must_use]
#[allow(clippy::missing_const_for_fn)] // Arc field prevents const in practice
pub fn metrics(&self) -> &Arc<MetricsState> {
&self.metrics
}
/// Access the load detector for metrics and health check reporting.
#[must_use]
#[allow(clippy::missing_const_for_fn)] // Arc field prevents const in practice
pub fn load_detector(&self) -> &crate::load::LoadDetector {
&self.load_detector
}
/// Returns the bound address of the metrics HTTP server, if running.
///
/// Useful when port 0 was requested to discover the OS-assigned port.
@ -553,22 +578,4 @@ impl TidalDb {
None
}
}
/// Returns `Ok(())` if the database is initialized and operational.
///
/// # Errors
///
/// Returns an error if the database has been closed or an internal
/// check fails.
#[tracing::instrument(skip(self))]
pub fn health_check(&self) -> crate::Result<()> {
if self.closed.load(Ordering::Acquire) {
self.metrics.health_ok.store(false, Ordering::Release);
return Err(crate::TidalError::internal(
"health_check",
"database is closed",
));
}
Ok(())
}
}

View File

@ -20,7 +20,7 @@ impl TidalDb {
/// # Errors
///
/// Returns `TidalError::Internal` if no ledger is wired (ephemeral without schema).
pub fn start_replication<T: crate::replication::Transport>(
pub fn start_replication<T: crate::replication::Transport + ?Sized>(
&self,
transport: std::sync::Arc<T>,
) -> crate::Result<()> {
@ -55,6 +55,12 @@ impl TidalDb {
/// Write a signal on behalf of a tenant, enforcing per-tenant rate limits.
///
/// Dispatches to each assigned shard. On single-node deployments all
/// assignments resolve to the local shard, so behavior is identical to
/// calling `signal()` directly. During a dual-write migration, the signal
/// is written to both the source and target shard; writes to remote shards
/// are logged for visibility until a transport is wired in by the caller.
///
/// # Errors
///
/// - `TidalError::QuotaExceeded` — tenant rate limit exceeded.
@ -73,9 +79,25 @@ impl TidalDb {
limiter.try_acquire()?;
}
// Resolve all shard targets (1 in normal mode; 2 during dual-write migration).
// On single-node deployments all assignments resolve to local storage, so the
// write happens once — but the routing contract is validated and recorded.
let _assignments = self.tenant_router.write_assignments(tenant_id, entity_id)?;
self.signal(signal_type, entity_id, weight, timestamp)
let assignments = self.tenant_router.write_assignments(tenant_id, entity_id)?;
let local_shard = self.config.cluster.shard_id;
for assignment in &assignments {
if assignment.shard_id == local_shard {
// Local shard: write directly to the signal ledger.
self.signal(signal_type, entity_id, weight, timestamp)?;
} else {
// Remote shard (dual-write migration path): a transport-based dispatch
// will be wired here once the shipper is integrated (Task 5).
// For now we record the pending dispatch so it is visible to operators.
tracing::debug!(
tenant_id = tenant_id.0,
shard_id = assignment.shard_id.0,
entity_id = entity_id.as_u64(),
signal_type,
"pending remote-shard dispatch (transport not yet wired)"
);
}
}
Ok(())
}
}

View File

@ -7,6 +7,7 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use crate::schema::{EntityId, Timestamp};
use crate::session::{self as session_mod, AgentId, SessionId, SessionState};
use crate::storage::{Tag, encode_key, parse_key};
use crate::wal::format::session::SessionSeqNo;
use super::TidalDb;
@ -152,8 +153,10 @@ impl TidalDb {
});
// Replay signals and annotations into the restored session state.
// Advance the seqno HWM so incoming replication events are correctly deduped.
let mut max_seqno: Option<u64> = None;
if let Some(signals) = session_signals.get(&session_id) {
for (entity_id, weight, ts_ns, signal_name, annotation) in signals {
for (entity_id, weight, ts_ns, signal_name, annotation, seqno_raw) in signals {
let lambda = schema
.signal(signal_name)
.and_then(|def| def.decay().lambda())
@ -169,6 +172,11 @@ impl TidalDb {
state.signaled_entities.insert(*entity_id, ());
state.signals_written.fetch_add(1, Ordering::Relaxed);
// Track the highest seqno seen so we can advance the HWM after replay.
if let Some(s) = *seqno_raw {
max_seqno = Some(max_seqno.map_or(s, |m| m.max(s)));
}
// Replay annotation if present.
if let Some(ann) = annotation
&& let Ok(mut anns) = state.annotations.lock()
@ -179,6 +187,14 @@ impl TidalDb {
}
}
// Advance the seqno tracker HWM so the bridge deduplicates any
// replication events that repeat signals already replayed from the WAL.
if let Some(hwm) = max_seqno {
self.session_bridge
.seqno_tracker()
.set_hwm(session_id, SessionSeqNo(hwm));
}
let sid = SessionId::from_raw(session_id);
self.sessions.insert(sid, state);
@ -211,18 +227,24 @@ impl TidalDb {
/// Returns `(open_sessions, session_signals)` where `open_sessions` maps
/// `session_id` to `(user_id, started_at_ns, agent_id, policy_name)` for
/// sessions that have a Start but no Close.
///
/// Each signal tuple is `(entity_id, weight, ts_ns, signal_name, annotation, seqno)`.
/// `seqno` is `Some(n)` for events written after the M8p4 seqno mechanism was added,
/// `None` for legacy events.
#[allow(clippy::type_complexity)]
pub(super) fn partition_session_events(
events: &[crate::wal::format::SessionWalEvent],
) -> (
HashMap<u64, (u64, u64, String, String)>,
HashMap<u64, Vec<(u64, f32, u64, String, Option<String>)>>,
HashMap<u64, Vec<(u64, f32, u64, String, Option<String>, Option<u64>)>>,
) {
use crate::wal::format::SessionWalEvent;
let mut open_sessions: HashMap<u64, (u64, u64, String, String)> = HashMap::new();
let mut session_signals: HashMap<u64, Vec<(u64, f32, u64, String, Option<String>)>> =
HashMap::new();
let mut session_signals: HashMap<
u64,
Vec<(u64, f32, u64, String, Option<String>, Option<u64>)>,
> = HashMap::new();
for event in events {
match event {
@ -250,6 +272,7 @@ impl TidalDb {
ts_ns,
signal_name,
annotation,
session_seqno,
..
} => {
session_signals.entry(*session_id).or_default().push((
@ -258,6 +281,7 @@ impl TidalDb {
*ts_ns,
signal_name.clone(),
annotation.clone(),
session_seqno.map(|s| s.0),
));
}
SessionWalEvent::Close { session_id } => {

View File

@ -0,0 +1,63 @@
use std::collections::HashMap;
use std::time::Duration;
use crate::TidalDb;
use crate::schema::{AgentPolicy, DecaySpec, EntityKind, SchemaBuilder, Window};
use crate::session as session_mod;
/// Build a minimal schema with one signal type and one session policy.
fn schema_with_policy() -> crate::schema::Schema {
let mut builder = SchemaBuilder::new();
let _ = builder
.signal(
"view",
EntityKind::Item,
DecaySpec::Exponential {
half_life: Duration::from_secs(3600),
},
)
.windows(&[Window::OneHour])
.velocity(false)
.add();
builder.session_policy(
"default",
AgentPolicy {
allowed_signals: vec!["view".to_string()],
denied_signals: vec![],
max_session_duration: Duration::from_secs(3600),
max_signals_per_session: 0,
},
);
builder.build().expect("schema must be valid")
}
#[test]
fn closed_sessions_evicted_when_cap_exceeded() {
let schema = schema_with_policy();
let db = TidalDb::builder()
.ephemeral()
.with_schema(schema)
.open()
.unwrap();
let cap = session_mod::MAX_CLOSED_SESSIONS;
// Open and close cap + 100 sessions to trigger at least one eviction cycle.
for i in 0..(cap + 100) {
let handle = db
.start_session(i as u64, "test-agent", "default", HashMap::new())
.unwrap();
db.close_session(handle).unwrap();
}
// After eviction, the closed_sessions map should be at most cap
// (it will be at most cap because the eviction fires when len >= cap
// and removes EVICT_BATCH_SIZE entries).
let len = db.closed_sessions.len();
assert!(
len <= cap,
"closed_sessions should be bounded: {len} > {cap}"
);
db.close().unwrap();
}

View File

@ -4,12 +4,14 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use crate::replication::idempotency::IdempotencyKey;
use crate::schema::{EntityId, TidalError, Timestamp};
use crate::session::{
self as session_mod, AgentId, AuditEntry, SessionHandle, SessionId, SessionInfo,
SessionSnapshot, SessionState, SessionSummary,
};
use crate::storage::{Tag, encode_key};
use crate::wal::format::session::SessionSeqNo;
use super::TidalDb;
@ -403,7 +405,17 @@ impl TidalDb {
anns.push((ts_ns, ann));
}
state.signals_written.fetch_add(1, Ordering::Relaxed);
// Increment and capture the write seqno for idempotent replay.
// fetch_add returns the OLD value; seqno is old + 1.
let seqno_raw = state.signals_written.fetch_add(1, Ordering::Relaxed) + 1;
// Derive a BLAKE3 idempotency key from session context + operation payload.
let ikey = {
let mut op_bytes = Vec::with_capacity(8 + signal_type.len());
op_bytes.extend_from_slice(&entity_id.as_u64().to_le_bytes());
op_bytes.extend_from_slice(signal_type.as_bytes());
IdempotencyKey::derive(handle.id.as_u64(), SessionSeqNo(seqno_raw), &op_bytes)
};
// Write session signal event to the session journal (fire-and-forget).
#[allow(clippy::cast_possible_truncation)]
@ -417,6 +429,8 @@ impl TidalDb {
ts_ns,
signal_type,
ann_for_wal.as_deref(),
Some(seqno_raw),
Some(ikey.0),
);
}
@ -533,68 +547,5 @@ impl TidalDb {
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
use std::collections::HashMap;
use std::time::Duration;
use crate::TidalDb;
use crate::schema::{AgentPolicy, DecaySpec, EntityKind, SchemaBuilder, Window};
use crate::session as session_mod;
/// Build a minimal schema with one signal type and one session policy.
fn schema_with_policy() -> crate::schema::Schema {
let mut builder = SchemaBuilder::new();
let _ = builder
.signal(
"view",
EntityKind::Item,
DecaySpec::Exponential {
half_life: Duration::from_secs(3600),
},
)
.windows(&[Window::OneHour])
.velocity(false)
.add();
builder.session_policy(
"default",
AgentPolicy {
allowed_signals: vec!["view".to_string()],
denied_signals: vec![],
max_session_duration: Duration::from_secs(3600),
max_signals_per_session: 0,
},
);
builder.build().expect("schema must be valid")
}
#[test]
fn closed_sessions_evicted_when_cap_exceeded() {
let schema = schema_with_policy();
let db = TidalDb::builder()
.ephemeral()
.with_schema(schema)
.open()
.unwrap();
let cap = session_mod::MAX_CLOSED_SESSIONS;
// Open and close cap + 100 sessions to trigger at least one eviction cycle.
for i in 0..(cap + 100) {
let handle = db
.start_session(i as u64, "test-agent", "default", HashMap::new())
.unwrap();
db.close_session(handle).unwrap();
}
// After eviction, the closed_sessions map should be at most cap
// (it will be at most cap because the eviction fires when len >= cap
// and removes EVICT_BATCH_SIZE entries).
let len = db.closed_sessions.len();
assert!(
len <= cap,
"closed_sessions should be bounded: {len} > {cap}"
);
db.close().unwrap();
}
}
#[path = "session_tests.rs"]
mod tests;

View File

@ -5,6 +5,13 @@
//! [`Transport::recv_segment`] and replaying each batch into the shared
//! [`SignalLedger`] via `apply_wal_event`. Idempotent replay is ensured by
//! checking the per-shard high-water-mark in [`ReplicationState`].
//!
//! # Error handling
//!
//! If a batch fails BLAKE3 verification or structural decode, `apply_payload`
//! returns [`WalError::Corruption`]. The receiver thread propagates this
//! error immediately (it does **not** skip the payload) so that
//! [`SegmentReceiverHandle::join`] can surface it to the operator.
use std::sync::Arc;
use std::thread::JoinHandle;
@ -14,22 +21,33 @@ use crate::replication::state::ReplicationState;
use crate::replication::transport::Transport;
use crate::schema::{EntityId, Timestamp};
use crate::signals::{SignalLedger, SignalTypeId};
use crate::wal::error::WalError;
use crate::wal::format::batch::{HEADER_SIZE, decode_batch};
/// Handle to a running segment receiver thread.
///
/// Call [`join`](Self::join) to block until the thread exits (triggered by
/// the transport returning `None` from `recv_segment`).
/// Call [`join`](Self::join) to block until the thread exits and retrieve any
/// corruption error that caused it to stop.
pub struct SegmentReceiverHandle {
thread: Option<JoinHandle<()>>,
thread: Option<JoinHandle<Result<(), WalError>>>,
}
impl SegmentReceiverHandle {
/// Block until the receiver thread exits.
pub fn join(mut self) {
if let Some(handle) = self.thread.take() {
let _ = handle.join();
}
///
/// # Errors
///
/// Returns `Err(WalError::Corruption { .. })` if a corrupt batch was
/// received. Returns `Err(WalError::Corruption { .. })` if the thread
/// panicked unexpectedly.
pub fn join(mut self) -> Result<(), WalError> {
self.thread.take().map_or(Ok(()), |handle| {
handle.join().unwrap_or_else(|_| {
Err(WalError::Corruption {
message: "segment receiver thread panicked".into(),
})
})
})
}
}
@ -37,27 +55,27 @@ impl SegmentReceiverHandle {
/// into the signal ledger.
///
/// The thread exits when `transport.recv_segment()` returns `None` (transport
/// closed / shutdown).
/// closed / shutdown) **or** when a corrupt batch is detected.
///
/// # Panics
///
/// Panics if the OS fails to spawn the background thread.
pub fn spawn_receiver<T: Transport>(
pub fn spawn_receiver<T: Transport + ?Sized>(
transport: Arc<T>,
ledger: Arc<SignalLedger>,
replication_state: Arc<ReplicationState>,
) -> SegmentReceiverHandle {
let thread = std::thread::Builder::new()
.name("tidaldb-segment-receiver".into())
.spawn(move || {
.spawn(move || -> Result<(), WalError> {
loop {
let Some(payload) = transport.recv_segment() else {
tracing::debug!("segment receiver: transport closed, shutting down");
return;
return Ok(());
};
let shard_id = payload.id.shard_id;
apply_payload(&payload.bytes, shard_id, &ledger, &replication_state);
apply_payload(&payload.bytes, shard_id, &ledger, &replication_state)?;
}
})
.expect("failed to spawn segment receiver thread");
@ -71,12 +89,17 @@ pub fn spawn_receiver<T: Transport>(
///
/// Idempotent: batches whose last sequence number is at or below the
/// replication state's high-water-mark for the source shard are skipped.
///
/// # Errors
///
/// Returns `WalError::Corruption` on any BLAKE3 or structural decode failure.
/// The offset of the first corrupt batch is included in the error message.
fn apply_payload(
bytes: &[u8],
from_shard: ShardId,
ledger: &SignalLedger,
state: &ReplicationState,
) {
) -> Result<(), WalError> {
let mut offset = 0;
while offset < bytes.len() {
let remaining = &bytes[offset..];
@ -116,14 +139,13 @@ fn apply_payload(
offset += batch_size;
}
Err(e) => {
tracing::warn!(
error = %e,
"receiver: corrupt batch, skipping remainder of payload"
);
break;
return Err(WalError::Corruption {
message: format!("corrupt batch at payload offset {offset}: {e}"),
});
}
}
}
Ok(())
}
#[cfg(test)]
@ -175,7 +197,7 @@ mod tests {
let events = vec![make_event(42, type_id.as_u16() as u8, 1_000_000_000)];
let bytes = encode_batch(&events, 1, 1).unwrap();
apply_payload(&bytes, ShardId::SINGLE, &ledger, &state);
apply_payload(&bytes, ShardId::SINGLE, &ledger, &state).unwrap();
// Verify the ledger was updated.
assert!(ledger.entries().contains_key(&(EntityId::new(42), type_id)));
@ -193,9 +215,9 @@ mod tests {
let bytes = encode_batch(&events, 1, 1).unwrap();
// Apply once.
apply_payload(&bytes, ShardId::SINGLE, &ledger, &state);
apply_payload(&bytes, ShardId::SINGLE, &ledger, &state).unwrap();
// Apply again -- should be idempotent.
apply_payload(&bytes, ShardId::SINGLE, &ledger, &state);
apply_payload(&bytes, ShardId::SINGLE, &ledger, &state).unwrap();
assert_eq!(state.applied_seqno(ShardId::SINGLE), Some(1));
}
@ -213,13 +235,37 @@ mod tests {
let mut bytes = encode_batch(&e1, 1, 100).unwrap();
bytes.extend(encode_batch(&e2, 2, 200).unwrap());
apply_payload(&bytes, ShardId::SINGLE, &ledger, &state);
apply_payload(&bytes, ShardId::SINGLE, &ledger, &state).unwrap();
assert!(ledger.entries().contains_key(&(EntityId::new(1), type_id)));
assert!(ledger.entries().contains_key(&(EntityId::new(2), type_id)));
assert_eq!(state.applied_seqno(ShardId::SINGLE), Some(2));
}
#[test]
fn apply_payload_corrupt_returns_error() {
let schema = make_schema();
let ledger = Arc::new(SignalLedger::new(schema, Box::new(NoopWalWriter)));
let state = Arc::new(ReplicationState::new(&[ShardId::SINGLE]));
// Build a valid batch then corrupt the checksum region.
let type_id = ledger.resolve_signal_type("view").unwrap();
let events = vec![make_event(7, type_id.as_u16() as u8, 500)];
let mut bytes = encode_batch(&events, 1, 1).unwrap();
// Flip bytes in the checksum region (bytes 32-63) to break BLAKE3.
for b in bytes[32..64].iter_mut() {
*b = b.wrapping_add(1);
}
let result = apply_payload(&bytes, ShardId::SINGLE, &ledger, &state);
assert!(
matches!(result, Err(WalError::Corruption { .. })),
"expected Corruption, got {result:?}"
);
// Ledger must NOT have been updated.
assert!(!ledger.entries().contains_key(&(EntityId::new(7), type_id)));
}
/// A minimal transport that returns one payload then signals shutdown.
struct OneShot {
rx: crossbeam::channel::Receiver<crate::replication::WalSegmentPayload>,
@ -274,10 +320,53 @@ mod tests {
drop(tx);
// The receiver should exit gracefully.
handle.join();
handle.join().unwrap();
// Verify the segment was applied.
assert!(ledger.entries().contains_key(&(EntityId::new(99), type_id)));
assert_eq!(state.applied_seqno(ShardId(0)), Some(1));
}
#[test]
fn receiver_thread_exits_on_corrupt_payload() {
let (tx, rx) = crossbeam::channel::bounded(4);
let transport = Arc::new(OneShot { rx });
let schema = make_schema();
let ledger = Arc::new(SignalLedger::new(schema, Box::new(NoopWalWriter)));
let state = Arc::new(ReplicationState::new(&[ShardId(0)]));
let handle = spawn_receiver(
Arc::clone(&transport),
Arc::clone(&ledger),
Arc::clone(&state),
);
// Build a valid batch then corrupt it.
let type_id = ledger.resolve_signal_type("view").unwrap();
let events = vec![make_event(55, type_id.as_u16() as u8, 100)];
let mut corrupt_bytes = encode_batch(&events, 1, 1).unwrap();
for b in corrupt_bytes[32..64].iter_mut() {
*b = b.wrapping_add(1);
}
tx.send(crate::replication::WalSegmentPayload {
id: WalSegmentId::new(RegionId::SINGLE, ShardId(0), 1),
bytes: corrupt_bytes,
event_count: 1,
})
.unwrap();
// Give thread time to process.
std::thread::sleep(Duration::from_millis(50));
drop(tx);
let result = handle.join();
assert!(
matches!(result, Err(WalError::Corruption { .. })),
"expected Corruption from join, got {result:?}"
);
// Entity 55 must NOT have been applied.
assert!(!ledger.entries().contains_key(&(EntityId::new(55), type_id)));
}
}

View File

@ -68,7 +68,10 @@ impl WalShipperHandle {
/// # Panics
///
/// Panics if the OS fails to spawn the background thread.
pub fn spawn_shipper<T: Transport>(config: ShipperConfig, transport: Arc<T>) -> WalShipperHandle {
pub fn spawn_shipper<T: Transport + ?Sized>(
config: ShipperConfig,
transport: Arc<T>,
) -> WalShipperHandle {
let (shutdown_tx, shutdown_rx) = bounded::<()>(1);
let thread = std::thread::Builder::new()
.name("tidaldb-wal-shipper".into())

View File

@ -53,6 +53,8 @@ pub enum TransportError {
/// - `local_shard` returns this node's shard identity.
pub trait Transport: Send + Sync + 'static {
/// Ship a WAL segment payload to a peer shard.
/// Blanket implementation for `Arc<dyn Transport>` so callers can use
/// type-erased transports with the generic `spawn_shipper` / `spawn_receiver` APIs.
///
/// # Errors
///
@ -71,6 +73,24 @@ pub trait Transport: Send + Sync + 'static {
fn local_shard(&self) -> ShardId;
}
/// Blanket implementation of `Transport` for `Arc<dyn Transport>`.
///
/// Allows type-erased `Arc<dyn Transport>` values to be passed to generic
/// APIs like `spawn_shipper<T: Transport>` and `spawn_receiver<T: Transport>`.
impl Transport for std::sync::Arc<dyn Transport> {
fn send_segment(&self, to: ShardId, payload: WalSegmentPayload) -> Result<(), TransportError> {
(**self).send_segment(to, payload)
}
fn recv_segment(&self) -> Option<WalSegmentPayload> {
(**self).recv_segment()
}
fn local_shard(&self) -> ShardId {
(**self).local_shard()
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@ -45,7 +45,7 @@ use crate::schema::{EntityId, Schema, Timestamp};
use crate::signals::{NoopWalWriter, SignalLedger};
use crate::wal::format::batch::{EventRecord, encode_batch};
use super::cluster_transport::{BatchEntry, ReceiveOnlyTransport};
use super::cluster_transport::{BatchEntry, ReceiveOnlyTransport, redeliver_missed};
// ── Public API ────────────────────────────────────────────────────────────
@ -234,11 +234,16 @@ impl SimulatedCluster {
/// non-partitioned followers with active receivers. The batch is also
/// recorded in the `batch_log` for partition-recovery re-delivery.
///
/// # Panics
/// # Errors
///
/// Panics if the signal write on the leader fails or if the signal type is
/// not registered in the schema.
pub fn write_signal(&self, signal_type: &str, entity_id: EntityId, weight: f64) {
/// Returns `TidalError` if the signal type is not registered in the schema
/// or if the leader write fails.
pub fn write_signal(
&self,
signal_type: &str,
entity_id: EntityId,
weight: f64,
) -> crate::Result<()> {
let ts = Timestamp::now();
let leader_region = self.leader_region();
let leader_shard = ShardId(leader_region.0);
@ -246,14 +251,12 @@ impl SimulatedCluster {
// Write to the leader's signal ledger.
self.nodes[&leader_region]
.db
.signal(signal_type, entity_id, weight, ts)
.expect("signal write on leader must succeed");
.signal(signal_type, entity_id, weight, ts)?;
// Encode as a one-event WAL batch.
let type_id = *self
.signal_type_ids
.get(signal_type)
.expect("signal type must be registered in the cluster schema");
let type_id = *self.signal_type_ids.get(signal_type).ok_or_else(|| {
crate::TidalError::invalid_input(format!("unknown signal type '{signal_type}'"))
})?;
let seqno = {
let mut seqnos = self
@ -309,6 +312,7 @@ impl SimulatedCluster {
});
self.total_signals.fetch_add(1, Ordering::Relaxed);
Ok(())
}
/// Write a signal directly to a specific region (bypassing the leader).
@ -376,31 +380,11 @@ impl SimulatedCluster {
.batch_log
.lock()
.unwrap_or_else(std::sync::PoisonError::into_inner);
for (&region, tx) in &self.follower_senders {
if region == leader_region || partitioned.contains(&region) {
for (&rgn, tx) in &self.follower_senders {
if rgn == leader_region || partitioned.contains(&rgn) {
continue;
}
let node = &self.nodes[&region];
for entry in log.iter() {
let already_applied = node
.db
.replication_state()
.applied_seqno(entry.source_shard)
.unwrap_or(0);
if entry.seqno > already_applied {
let payload = WalSegmentPayload {
id: WalSegmentId::new(
crate::replication::RegionId::SINGLE,
entry.source_shard,
entry.seqno,
),
bytes: entry.bytes.clone(),
event_count: 1,
};
let _ = tx.try_send(payload);
}
}
redeliver_missed(tx, &self.nodes[&rgn].db, &log);
}
}
@ -514,13 +498,22 @@ impl SimulatedCluster {
partitions.insert(region);
}
/// Heal a partitioned region.
/// Heal a partitioned region and re-deliver any missed WAL batches.
pub fn heal_region(&self, region: RegionId) {
let mut partitions = self
.partitioned_regions
.write()
.unwrap_or_else(std::sync::PoisonError::into_inner);
partitions.remove(&region);
{
let mut partitions = self
.partitioned_regions
.write()
.unwrap_or_else(std::sync::PoisonError::into_inner);
partitions.remove(&region);
}
if let Some(tx) = self.follower_senders.get(&region) {
let log = self
.batch_log
.lock()
.unwrap_or_else(std::sync::PoisonError::into_inner);
redeliver_missed(tx, &self.nodes[&region].db, &log);
}
}
/// Whether a region is currently partitioned.

View File

@ -1,6 +1,8 @@
//! Internal transport types used by [`super::cluster::SimulatedCluster`].
use crate::replication::shard::ShardId;
use crate::db::TidalDb;
use crate::replication::WalSegmentId;
use crate::replication::shard::{RegionId, ShardId};
use crate::replication::transport::{Transport, TransportError, WalSegmentPayload};
// ── Internal: receive-only transport ─────────────────────────────────────
@ -38,3 +40,30 @@ pub(super) struct BatchEntry {
/// Encoded WAL batch bytes (from [`crate::wal::format::batch::encode_batch`]).
pub(super) bytes: Vec<u8>,
}
// ── Re-delivery helper ────────────────────────────────────────────────────
/// Re-deliver all batch-log entries not yet applied to `db`, sending through `tx`.
///
/// Called by `SimulatedCluster::heal_region` for immediate recovery after a
/// partition is healed, and by `await_convergence` during the polling loop.
pub(super) fn redeliver_missed(
tx: &crossbeam::channel::Sender<WalSegmentPayload>,
db: &TidalDb,
log: &[BatchEntry],
) {
for entry in log {
let applied = db
.replication_state()
.applied_seqno(entry.source_shard)
.unwrap_or(0);
if entry.seqno > applied {
let payload = WalSegmentPayload {
id: WalSegmentId::new(RegionId::SINGLE, entry.source_shard, entry.seqno),
bytes: entry.bytes.clone(),
event_count: 1,
};
let _ = tx.try_send(payload);
}
}
}

View File

@ -126,7 +126,7 @@ mod tests {
let dir = tempfile::tempdir().unwrap();
// Create segments at seq 1, 50, 100, 200.
for &seq in &[1u64, 50, 100, 200] {
let _ = SegmentWriter::open(dir.path(), seq, 1024).unwrap();
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, seq, 1024).unwrap();
}
assert_eq!(list_segments(dir.path()).unwrap().len(), 4);
@ -145,7 +145,7 @@ mod tests {
#[test]
fn compact_preserves_current_segment() {
let dir = tempfile::tempdir().unwrap();
let _ = SegmentWriter::open(dir.path(), 100, 1024).unwrap();
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 100, 1024).unwrap();
// Checkpoint at seq=100: segment starting at 100 is NOT deleted
// (it may contain events >= 100).
@ -157,7 +157,7 @@ mod tests {
#[test]
fn compact_no_segments_to_delete() {
let dir = tempfile::tempdir().unwrap();
let _ = SegmentWriter::open(dir.path(), 500, 1024).unwrap();
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 500, 1024).unwrap();
let result = compact_wal(dir.path(), 100).unwrap();
assert_eq!(result.segments_deleted, 0);
@ -168,7 +168,7 @@ mod tests {
fn compact_all_segments_old() {
let dir = tempfile::tempdir().unwrap();
for &seq in &[1u64, 10, 20] {
let _ = SegmentWriter::open(dir.path(), seq, 1024).unwrap();
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, seq, 1024).unwrap();
}
let result = compact_wal(dir.path(), 1000).unwrap();
@ -179,8 +179,8 @@ mod tests {
#[test]
fn compact_idempotent() {
let dir = tempfile::tempdir().unwrap();
let _ = SegmentWriter::open(dir.path(), 1, 1024).unwrap();
let _ = SegmentWriter::open(dir.path(), 100, 1024).unwrap();
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 1024).unwrap();
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 100, 1024).unwrap();
compact_wal(dir.path(), 100).unwrap();
// Running compaction again should be a no-op.
@ -193,7 +193,7 @@ mod tests {
fn compact_crash_during_deletion_is_safe() {
let dir = tempfile::tempdir().unwrap();
for &seq in &[1u64, 50, 100, 200] {
let _ = SegmentWriter::open(dir.path(), seq, 1024).unwrap();
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, seq, 1024).unwrap();
}
// Simulate a crash mid-compaction: segment 1 was deleted but segment 50 wasn't.

View File

@ -43,6 +43,7 @@ use self::format::{EventRecord, SessionWalEvent};
use self::segment::SegmentWriter;
use self::session_journal::SessionJournal;
use self::writer::{WalCommand, WriterConfig};
use crate::replication::{RegionId, ShardId};
/// Default channel capacity for the writer command channel.
const DEFAULT_CHANNEL_CAPACITY: usize = 10_000;
@ -201,7 +202,12 @@ impl WalHandle {
next_seq
};
let segment = SegmentWriter::open(&wal_dir, segment_first_seq, config.segment_size)?;
let segment = SegmentWriter::open(
&wal_dir,
ShardId::SINGLE,
segment_first_seq,
config.segment_size,
)?;
// Create the command channel
let (tx, rx) = bounded(DEFAULT_CHANNEL_CAPACITY);
@ -213,6 +219,10 @@ impl WalHandle {
batch_timeout: config.batch_timeout,
dedup_window: config.dedup_window,
session_journal_path: Some(session_journal_path),
// Single-node default. Multi-shard deployments override via NodeConfig
// before open() is called.
shard_id: ShardId::SINGLE,
region_id: RegionId::SINGLE,
};
// Spawn the writer thread
@ -302,6 +312,7 @@ impl WalHandle {
/// # Errors
///
/// Returns `WalError::SendFailed` if the writer thread has exited.
#[allow(clippy::too_many_arguments)]
pub fn session_signal(
&self,
session_id: u64,
@ -310,6 +321,8 @@ impl WalHandle {
ts_ns: u64,
signal_name: &str,
annotation: Option<&str>,
session_seqno: Option<u64>,
idempotency_key: Option<u128>,
) -> Result<(), WalError> {
self.tx
.send(WalCommand::SessionSignal {
@ -319,6 +332,8 @@ impl WalHandle {
ts_ns,
signal_name: signal_name.to_owned(),
annotation: annotation.map(str::to_owned),
session_seqno,
idempotency_key,
})
.map_err(|_| WalError::SendFailed)
}

View File

@ -142,20 +142,31 @@ pub struct SegmentWriter {
first_seq: u64,
/// The last sequence number written to this segment.
last_seq: u64,
/// The shard this segment belongs to. Controls filename format:
/// `ShardId::SINGLE` → v1 format, any other shard → v2 format.
shard_id: ShardId,
}
impl SegmentWriter {
/// Open or create a segment file for writing.
///
/// `shard_id` controls the filename format:
/// - `ShardId::SINGLE` (0) → v1 single-node format `wal-{seq:020}.seg`
/// - Any other shard → v2 multi-shard format `wal-s{shard:05}-{seq:020}.seg`
///
/// If `first_seq` identifies an existing segment, it is opened for append.
/// Otherwise, a new file is created.
///
/// # Errors
///
/// Returns `WalError::Io` on filesystem failure.
pub fn open(dir: &Path, first_seq: u64, max_size: u64) -> Result<Self, WalError> {
// Single-node: all segments use the v1 (unsharded) naming format.
let filename = segment_filename(ShardId::SINGLE, first_seq);
pub fn open(
dir: &Path,
shard_id: ShardId,
first_seq: u64,
max_size: u64,
) -> Result<Self, WalError> {
let filename = segment_filename(shard_id, first_seq);
let path = dir.join(&filename);
let is_new = !path.exists();
let file = OpenOptions::new().create(true).append(true).open(&path)?;
@ -178,6 +189,7 @@ impl SegmentWriter {
max_size,
first_seq,
last_seq: first_seq,
shard_id,
})
}
@ -237,9 +249,16 @@ impl SegmentWriter {
self.current_size
}
/// The shard ID used for segment filename formatting.
#[must_use]
pub const fn shard_id(&self) -> ShardId {
self.shard_id
}
/// Create a new segment file and return a writer for it.
///
/// Finalizes the current segment (syncs it) and opens a new one.
/// Finalizes the current segment (syncs it) and opens a new one using the
/// same `shard_id` that was set at construction time.
///
/// # Errors
///
@ -248,8 +267,7 @@ impl SegmentWriter {
// Sync current segment before rotation
self.sync()?;
// Single-node: all segments use the v1 (unsharded) naming format.
let filename = segment_filename(ShardId::SINGLE, new_first_seq);
let filename = segment_filename(self.shard_id, new_first_seq);
let path = self.dir.join(&filename);
let file = OpenOptions::new().create(true).append(true).open(&path)?;
@ -286,266 +304,5 @@ pub fn delete_segments_before(dir: &Path, before_seq: u64) -> Result<usize, WalE
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn segment_filename_format() {
assert_eq!(
segment_filename(ShardId::SINGLE, 1),
"wal-00000000000000000001.seg"
);
assert_eq!(
segment_filename(ShardId::SINGLE, 0),
"wal-00000000000000000000.seg"
);
assert_eq!(
segment_filename(ShardId::SINGLE, u64::MAX),
"wal-18446744073709551615.seg"
);
}
#[test]
fn parse_segment_seq_valid() {
assert_eq!(parse_segment_seq("wal-00000000000000000001.seg"), Some(1));
assert_eq!(parse_segment_seq("wal-00000000000000000000.seg"), Some(0));
}
#[test]
fn parse_segment_seq_invalid() {
assert_eq!(parse_segment_seq("not-a-segment.txt"), None);
assert_eq!(parse_segment_seq("wal-.seg"), None);
assert_eq!(parse_segment_seq("wal-abc.seg"), None);
assert_eq!(parse_segment_seq(""), None);
}
#[test]
fn write_and_check_size() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let mut writer = SegmentWriter::open(dir.path(), 1, 1024).expect("open should succeed");
assert_eq!(writer.current_size(), 0);
let data = [0xABu8; 100];
writer
.write_batch_bytes(&data)
.expect("write should succeed");
assert_eq!(writer.current_size(), 100);
}
#[test]
fn rotation_creates_new_file() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let mut writer = SegmentWriter::open(dir.path(), 1, 100).expect("open should succeed");
writer
.write_batch_bytes(&[0u8; 50])
.expect("write should succeed");
writer.rotate(100).expect("rotate should succeed");
assert_eq!(writer.current_size(), 0);
assert_eq!(writer.first_seq(), 100);
let segments = list_segments(dir.path()).expect("list should succeed");
assert_eq!(segments.len(), 2);
assert_eq!(segments[0].0, 1);
assert_eq!(segments[1].0, 100);
}
#[test]
fn needs_rotation_threshold() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let mut writer = SegmentWriter::open(dir.path(), 1, 100).expect("open should succeed");
assert!(!writer.needs_rotation());
writer
.write_batch_bytes(&[0u8; 100])
.expect("write should succeed");
assert!(writer.needs_rotation());
}
#[test]
fn list_segments_sorted() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
// Create segments out of order
let _ = SegmentWriter::open(dir.path(), 300, 1024);
let _ = SegmentWriter::open(dir.path(), 100, 1024);
let _ = SegmentWriter::open(dir.path(), 200, 1024);
let segments = list_segments(dir.path()).expect("list should succeed");
assert_eq!(segments.len(), 3);
assert_eq!(segments[0].0, 100);
assert_eq!(segments[1].0, 200);
assert_eq!(segments[2].0, 300);
}
#[test]
fn list_segments_empty_dir() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let segments = list_segments(dir.path()).expect("list should succeed");
assert!(segments.is_empty());
}
#[test]
fn list_segments_ignores_non_segment_files() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
fs::write(dir.path().join("checkpoint.meta"), "seq=1\nts=1\n")
.expect("write should succeed");
fs::write(dir.path().join("random.txt"), "hello").expect("write should succeed");
let _ = SegmentWriter::open(dir.path(), 1, 1024);
let segments = list_segments(dir.path()).expect("list should succeed");
assert_eq!(segments.len(), 1);
}
#[test]
fn delete_segments_before_removes_older() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let _ = SegmentWriter::open(dir.path(), 1, 1024);
let _ = SegmentWriter::open(dir.path(), 100, 1024);
let _ = SegmentWriter::open(dir.path(), 200, 1024);
let deleted = delete_segments_before(dir.path(), 200).expect("delete should succeed");
assert_eq!(deleted, 2);
let remaining = list_segments(dir.path()).expect("list should succeed");
assert_eq!(remaining.len(), 1);
assert_eq!(remaining[0].0, 200);
}
#[test]
fn delete_segments_before_none_to_delete() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let _ = SegmentWriter::open(dir.path(), 100, 1024);
let deleted = delete_segments_before(dir.path(), 50).expect("delete should succeed");
assert_eq!(deleted, 0);
}
#[test]
fn sync_does_not_error() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let writer = SegmentWriter::open(dir.path(), 1, 1024).expect("open should succeed");
writer.sync().expect("sync should succeed");
}
// ── Multi-shard naming tests ──────────────────────────────────────────
#[test]
fn segment_filename_multi_shard() {
let name = segment_filename(ShardId(3), 42);
assert_eq!(name, "wal-s00003-00000000000000000042.seg");
}
#[test]
fn segment_filename_single_shard_backward_compat() {
// ShardId::SINGLE retains old format exactly
assert_eq!(
segment_filename(ShardId::SINGLE, 1),
"wal-00000000000000000001.seg"
);
assert_eq!(
segment_filename(ShardId(0), 42),
"wal-00000000000000000042.seg"
);
}
#[test]
fn parse_segment_filename_both_formats() {
assert_eq!(
parse_segment_filename("wal-00000000000000000001.seg"),
Some((ShardId::SINGLE, 1))
);
assert_eq!(
parse_segment_filename("wal-s00003-00000000000000000042.seg"),
Some((ShardId(3), 42))
);
assert_eq!(parse_segment_filename("not-a-segment.txt"), None);
}
#[test]
fn parse_segment_filename_edge_cases() {
// Empty filename
assert_eq!(parse_segment_filename(""), None);
// Missing .seg suffix
assert_eq!(parse_segment_filename("wal-00000000000000000001"), None);
// Shard format but missing seq
assert_eq!(parse_segment_filename("wal-s00003.seg"), None);
// Non-numeric shard
assert_eq!(
parse_segment_filename("wal-sXYZ-00000000000000000001.seg"),
None
);
}
#[test]
fn list_segments_for_shard_filters_correctly() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
// Create single-shard segment files (v1 format)
let _ = SegmentWriter::open(dir.path(), 1, 1024);
let _ = SegmentWriter::open(dir.path(), 100, 1024);
// Create multi-shard segment files (v2 format) manually
let s3_name = segment_filename(ShardId(3), 50);
fs::write(dir.path().join(s3_name), []).expect("write should succeed");
let s3_name2 = segment_filename(ShardId(3), 200);
fs::write(dir.path().join(s3_name2), []).expect("write should succeed");
let s5_name = segment_filename(ShardId(5), 75);
fs::write(dir.path().join(s5_name), []).expect("write should succeed");
// list_segments returns ALL segment files (backward compat)
let all = list_segments(dir.path()).expect("list should succeed");
assert_eq!(all.len(), 5);
// list_segments_for_shard returns only matching shard
let single =
list_segments_for_shard(dir.path(), ShardId::SINGLE).expect("list should succeed");
assert_eq!(single.len(), 2);
assert_eq!(single[0].0, 1);
assert_eq!(single[1].0, 100);
let shard3 = list_segments_for_shard(dir.path(), ShardId(3)).expect("list should succeed");
assert_eq!(shard3.len(), 2);
assert_eq!(shard3[0].0, 50);
assert_eq!(shard3[1].0, 200);
let shard5 = list_segments_for_shard(dir.path(), ShardId(5)).expect("list should succeed");
assert_eq!(shard5.len(), 1);
assert_eq!(shard5[0].0, 75);
// Non-existent shard returns empty
let empty = list_segments_for_shard(dir.path(), ShardId(99)).expect("list should succeed");
assert!(empty.is_empty());
}
#[test]
fn list_segments_for_shard_missing_dir() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let missing = dir.path().join("does-not-exist");
let result = list_segments_for_shard(&missing, ShardId::SINGLE)
.expect("should handle missing dir gracefully");
assert!(result.is_empty());
}
mod proptests {
use super::*;
use proptest::prelude::*;
proptest! {
#[test]
fn filename_roundtrip(seq: u64) {
let name = segment_filename(ShardId::SINGLE, seq);
let parsed = parse_segment_seq(&name);
prop_assert_eq!(parsed, Some(seq));
}
#[test]
fn shard_filename_roundtrip(shard_id in 0u16..100u16, seq in proptest::num::u64::ANY) {
let shard = ShardId(shard_id);
let name = segment_filename(shard, seq);
let parsed = parse_segment_filename(&name);
prop_assert_eq!(parsed, Some((shard, seq)));
}
}
}
}
#[path = "segment_tests.rs"]
mod tests;

View File

@ -0,0 +1,300 @@
use super::*;
#[test]
fn segment_filename_format() {
assert_eq!(
segment_filename(ShardId::SINGLE, 1),
"wal-00000000000000000001.seg"
);
assert_eq!(
segment_filename(ShardId::SINGLE, 0),
"wal-00000000000000000000.seg"
);
assert_eq!(
segment_filename(ShardId::SINGLE, u64::MAX),
"wal-18446744073709551615.seg"
);
}
#[test]
fn parse_segment_seq_valid() {
assert_eq!(parse_segment_seq("wal-00000000000000000001.seg"), Some(1));
assert_eq!(parse_segment_seq("wal-00000000000000000000.seg"), Some(0));
}
#[test]
fn parse_segment_seq_invalid() {
assert_eq!(parse_segment_seq("not-a-segment.txt"), None);
assert_eq!(parse_segment_seq("wal-.seg"), None);
assert_eq!(parse_segment_seq("wal-abc.seg"), None);
assert_eq!(parse_segment_seq(""), None);
}
#[test]
fn write_and_check_size() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let mut writer =
SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 1024).expect("open should succeed");
assert_eq!(writer.current_size(), 0);
let data = [0xABu8; 100];
writer
.write_batch_bytes(&data)
.expect("write should succeed");
assert_eq!(writer.current_size(), 100);
}
#[test]
fn rotation_creates_new_file() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let mut writer =
SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 100).expect("open should succeed");
writer
.write_batch_bytes(&[0u8; 50])
.expect("write should succeed");
writer.rotate(100).expect("rotate should succeed");
assert_eq!(writer.current_size(), 0);
assert_eq!(writer.first_seq(), 100);
let segments = list_segments(dir.path()).expect("list should succeed");
assert_eq!(segments.len(), 2);
assert_eq!(segments[0].0, 1);
assert_eq!(segments[1].0, 100);
}
#[test]
fn needs_rotation_threshold() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let mut writer =
SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 100).expect("open should succeed");
assert!(!writer.needs_rotation());
writer
.write_batch_bytes(&[0u8; 100])
.expect("write should succeed");
assert!(writer.needs_rotation());
}
#[test]
fn list_segments_sorted() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
// Create segments out of order
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 300, 1024);
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 100, 1024);
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 200, 1024);
let segments = list_segments(dir.path()).expect("list should succeed");
assert_eq!(segments.len(), 3);
assert_eq!(segments[0].0, 100);
assert_eq!(segments[1].0, 200);
assert_eq!(segments[2].0, 300);
}
#[test]
fn list_segments_empty_dir() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let segments = list_segments(dir.path()).expect("list should succeed");
assert!(segments.is_empty());
}
#[test]
fn list_segments_ignores_non_segment_files() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
fs::write(dir.path().join("checkpoint.meta"), "seq=1\nts=1\n").expect("write should succeed");
fs::write(dir.path().join("random.txt"), "hello").expect("write should succeed");
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 1024);
let segments = list_segments(dir.path()).expect("list should succeed");
assert_eq!(segments.len(), 1);
}
#[test]
fn delete_segments_before_removes_older() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 1024);
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 100, 1024);
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 200, 1024);
let deleted = delete_segments_before(dir.path(), 200).expect("delete should succeed");
assert_eq!(deleted, 2);
let remaining = list_segments(dir.path()).expect("list should succeed");
assert_eq!(remaining.len(), 1);
assert_eq!(remaining[0].0, 200);
}
#[test]
fn delete_segments_before_none_to_delete() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 100, 1024);
let deleted = delete_segments_before(dir.path(), 50).expect("delete should succeed");
assert_eq!(deleted, 0);
}
#[test]
fn sync_does_not_error() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let writer =
SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 1024).expect("open should succeed");
writer.sync().expect("sync should succeed");
}
// -- Multi-shard naming tests --
#[test]
fn segment_filename_multi_shard() {
let name = segment_filename(ShardId(3), 42);
assert_eq!(name, "wal-s00003-00000000000000000042.seg");
}
#[test]
fn segment_filename_single_shard_backward_compat() {
// ShardId::SINGLE retains old format exactly
assert_eq!(
segment_filename(ShardId::SINGLE, 1),
"wal-00000000000000000001.seg"
);
assert_eq!(
segment_filename(ShardId(0), 42),
"wal-00000000000000000042.seg"
);
}
#[test]
fn parse_segment_filename_both_formats() {
assert_eq!(
parse_segment_filename("wal-00000000000000000001.seg"),
Some((ShardId::SINGLE, 1))
);
assert_eq!(
parse_segment_filename("wal-s00003-00000000000000000042.seg"),
Some((ShardId(3), 42))
);
assert_eq!(parse_segment_filename("not-a-segment.txt"), None);
}
#[test]
fn parse_segment_filename_edge_cases() {
// Empty filename
assert_eq!(parse_segment_filename(""), None);
// Missing .seg suffix
assert_eq!(parse_segment_filename("wal-00000000000000000001"), None);
// Shard format but missing seq
assert_eq!(parse_segment_filename("wal-s00003.seg"), None);
// Non-numeric shard
assert_eq!(
parse_segment_filename("wal-sXYZ-00000000000000000001.seg"),
None
);
}
#[test]
fn list_segments_for_shard_filters_correctly() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
// Create single-shard segment files (v1 format)
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 1024);
let _ = SegmentWriter::open(dir.path(), ShardId::SINGLE, 100, 1024);
// Create multi-shard segment files (v2 format) manually
let s3_name = segment_filename(ShardId(3), 50);
fs::write(dir.path().join(s3_name), []).expect("write should succeed");
let s3_name2 = segment_filename(ShardId(3), 200);
fs::write(dir.path().join(s3_name2), []).expect("write should succeed");
let s5_name = segment_filename(ShardId(5), 75);
fs::write(dir.path().join(s5_name), []).expect("write should succeed");
// list_segments returns ALL segment files (backward compat)
let all = list_segments(dir.path()).expect("list should succeed");
assert_eq!(all.len(), 5);
// list_segments_for_shard returns only matching shard
let single = list_segments_for_shard(dir.path(), ShardId::SINGLE).expect("list should succeed");
assert_eq!(single.len(), 2);
assert_eq!(single[0].0, 1);
assert_eq!(single[1].0, 100);
let shard3 = list_segments_for_shard(dir.path(), ShardId(3)).expect("list should succeed");
assert_eq!(shard3.len(), 2);
assert_eq!(shard3[0].0, 50);
assert_eq!(shard3[1].0, 200);
let shard5 = list_segments_for_shard(dir.path(), ShardId(5)).expect("list should succeed");
assert_eq!(shard5.len(), 1);
assert_eq!(shard5[0].0, 75);
// Non-existent shard returns empty
let empty = list_segments_for_shard(dir.path(), ShardId(99)).expect("list should succeed");
assert!(empty.is_empty());
}
#[test]
fn list_segments_for_shard_missing_dir() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let missing = dir.path().join("does-not-exist");
let result = list_segments_for_shard(&missing, ShardId::SINGLE)
.expect("should handle missing dir gracefully");
assert!(result.is_empty());
}
#[test]
fn segment_writer_stores_shard_id() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let writer = SegmentWriter::open(dir.path(), ShardId(7), 1, 1024).expect("open should succeed");
assert_eq!(writer.shard_id(), ShardId(7));
// Verify the on-disk filename uses the v2 format.
let expected = segment_filename(ShardId(7), 1);
assert!(
dir.path().join(&expected).exists(),
"expected file {expected} to exist in WAL dir"
);
}
#[test]
fn rotation_preserves_shard_id_in_filename() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let mut writer =
SegmentWriter::open(dir.path(), ShardId(3), 1, 100).expect("open should succeed");
writer
.write_batch_bytes(&[0u8; 50])
.expect("write should succeed");
writer.rotate(100).expect("rotate should succeed");
let seg1 = segment_filename(ShardId(3), 1);
let seg2 = segment_filename(ShardId(3), 100);
assert!(
dir.path().join(&seg1).exists(),
"first segment {seg1} must exist"
);
assert!(
dir.path().join(&seg2).exists(),
"rotated segment {seg2} must exist"
);
assert_eq!(writer.shard_id(), ShardId(3));
}
mod proptests {
use super::*;
use proptest::prelude::*;
proptest! {
#[test]
fn filename_roundtrip(seq: u64) {
let name = segment_filename(ShardId::SINGLE, seq);
let parsed = parse_segment_seq(&name);
prop_assert_eq!(parsed, Some(seq));
}
#[test]
fn shard_filename_roundtrip(shard_id in 0u16..100u16, seq in proptest::num::u64::ANY) {
let shard = ShardId(shard_id);
let name = segment_filename(shard, seq);
let parsed = parse_segment_filename(&name);
prop_assert_eq!(parsed, Some((shard, seq)));
}
}
}

View File

@ -5,9 +5,10 @@ use crossbeam::channel::Receiver;
use super::dedup::DedupWindow;
use super::error::WalError;
use super::format::{self, EventRecord, SessionWalEvent};
use super::format::{self, EventRecord, SessionSeqNo, SessionWalEvent};
use super::segment::{self, SegmentWriter};
use super::session_journal::SessionJournal;
use crate::replication::{RegionId, ShardId};
/// Commands sent from `WalHandle` to the writer thread.
pub enum WalCommand {
@ -45,6 +46,12 @@ pub enum WalCommand {
ts_ns: u64,
signal_name: String,
annotation: Option<String>,
/// Monotonic sequence number for this write (used for idempotent replay).
/// `None` for legacy writes that predate the seqno mechanism.
session_seqno: Option<u64>,
/// BLAKE3-derived idempotency key for duplicate suppression.
/// `None` for legacy writes.
idempotency_key: Option<u128>,
},
/// Record that a session was closed.
SessionClose { session_id: u64 },
@ -59,6 +66,13 @@ pub struct WriterConfig {
pub dedup_window: Duration,
/// Path for the session journal file (optional; `None` in ephemeral mode).
pub session_journal_path: Option<PathBuf>,
/// Shard identity for this writer. Written into every batch header so
/// `WalShipper` and receivers can identify the source shard.
/// Defaults to `ShardId::SINGLE` for single-node deployments.
pub shard_id: ShardId,
/// Region identity for this writer. Written into every batch header.
/// Defaults to `RegionId::SINGLE` for single-node deployments.
pub region_id: RegionId,
}
/// The group commit writer loop.
@ -207,7 +221,13 @@ pub fn run_writer(
// callers blocked forever (or receiving a generic Closed error
// instead of the real I/O error).
let write_result = (|| -> Result<u64, WalError> {
let encoded = format::encode_batch(&kept_events, batch_seq, batch_ts_u64)?;
let encoded = format::encode_batch_with_shard(
&kept_events,
batch_seq,
batch_ts_u64,
config.shard_id,
config.region_id,
)?;
if segment.needs_rotation() {
segment.rotate(batch_seq)?;
@ -316,7 +336,13 @@ pub fn run_writer(
#[allow(clippy::cast_possible_truncation)]
let batch_ts_u64 = batch_ts as u64;
let encoded = format::encode_batch(&kept_events, batch_seq, batch_ts_u64)?;
let encoded = format::encode_batch_with_shard(
&kept_events,
batch_seq,
batch_ts_u64,
config.shard_id,
config.region_id,
)?;
if segment.needs_rotation() {
segment.rotate(batch_seq)?;
@ -372,6 +398,8 @@ fn handle_session_command(cmd: WalCommand, journal: &mut Option<SessionJournal>)
ts_ns,
signal_name,
annotation,
session_seqno,
idempotency_key,
} => SessionWalEvent::Signal {
session_id,
entity_id,
@ -379,8 +407,8 @@ fn handle_session_command(cmd: WalCommand, journal: &mut Option<SessionJournal>)
ts_ns,
signal_name,
annotation,
session_seqno: None,
idempotency_key: None,
session_seqno: session_seqno.map(SessionSeqNo),
idempotency_key,
},
WalCommand::SessionClose { session_id } => SessionWalEvent::Close { session_id },
// Other commands are not handled here.
@ -394,178 +422,5 @@ fn handle_session_command(cmd: WalCommand, journal: &mut Option<SessionJournal>)
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::similar_names)]
mod tests {
use super::*;
use crossbeam::channel::bounded;
fn make_event(id: u64) -> EventRecord {
EventRecord {
entity_id: id,
signal_type: 1,
weight: 1.0,
timestamp_nanos: 1_000_000_000,
}
}
#[test]
fn writer_processes_single_event() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let (tx, rx) = bounded(100);
let segment =
SegmentWriter::open(dir.path(), 1, 16 * 1024 * 1024).expect("open should succeed");
let dedup = DedupWindow::new(Duration::from_secs(30));
let config = WriterConfig {
dir: dir.path().to_path_buf(),
segment_size: 16 * 1024 * 1024,
batch_size: 100,
batch_timeout: Duration::from_millis(10),
dedup_window: Duration::from_secs(30),
session_journal_path: None,
};
let (reply_tx, reply_rx) = bounded(1);
tx.send(WalCommand::Append {
event: make_event(42),
reply: reply_tx,
})
.expect("send should succeed");
tx.send(WalCommand::Shutdown).expect("send should succeed");
let handle = std::thread::spawn(move || run_writer(&rx, &config, segment, 1, dedup));
let seq = reply_rx
.recv()
.expect("should receive reply")
.expect("should be ok");
assert_eq!(seq, 1);
handle
.join()
.expect("thread should join")
.expect("writer should succeed");
}
#[test]
fn writer_deduplicates_events() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let (tx, rx) = bounded(100);
let segment =
SegmentWriter::open(dir.path(), 1, 16 * 1024 * 1024).expect("open should succeed");
let dedup = DedupWindow::new(Duration::from_secs(30));
let config = WriterConfig {
dir: dir.path().to_path_buf(),
segment_size: 16 * 1024 * 1024,
batch_size: 100,
batch_timeout: Duration::from_millis(10),
dedup_window: Duration::from_secs(30),
session_journal_path: None,
};
let event = make_event(42);
let (reply_tx1, reply_rx1) = bounded(1);
let (reply_tx2, reply_rx2) = bounded(1);
tx.send(WalCommand::Append {
event: event.clone(),
reply: reply_tx1,
})
.expect("send should succeed");
tx.send(WalCommand::Append {
event,
reply: reply_tx2,
})
.expect("send should succeed");
tx.send(WalCommand::Shutdown).expect("send should succeed");
let handle = std::thread::spawn(move || run_writer(&rx, &config, segment, 1, dedup));
let seq1 = reply_rx1
.recv()
.expect("should receive")
.expect("should be ok");
let seq2 = reply_rx2
.recv()
.expect("should receive")
.expect("should be ok");
assert_eq!(seq1, 1);
assert_eq!(seq2, 0); // deduplicated
handle
.join()
.expect("thread should join")
.expect("writer should succeed");
}
#[test]
fn writer_handles_channel_disconnect() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let (tx, rx) = bounded(100);
let segment =
SegmentWriter::open(dir.path(), 1, 16 * 1024 * 1024).expect("open should succeed");
let dedup = DedupWindow::new(Duration::from_secs(30));
let config = WriterConfig {
dir: dir.path().to_path_buf(),
segment_size: 16 * 1024 * 1024,
batch_size: 100,
batch_timeout: Duration::from_millis(10),
dedup_window: Duration::from_secs(30),
session_journal_path: None,
};
drop(tx); // Disconnect immediately
let result = run_writer(&rx, &config, segment, 1, dedup);
assert!(result.is_ok());
}
#[test]
fn writer_assigns_monotonic_sequences() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let (tx, rx) = bounded(100);
let segment =
SegmentWriter::open(dir.path(), 1, 16 * 1024 * 1024).expect("open should succeed");
let dedup = DedupWindow::new(Duration::from_secs(30));
let config = WriterConfig {
dir: dir.path().to_path_buf(),
segment_size: 16 * 1024 * 1024,
batch_size: 100,
batch_timeout: Duration::from_millis(10),
dedup_window: Duration::from_secs(30),
session_journal_path: None,
};
let mut reply_rxs = Vec::new();
for i in 0..5 {
let (reply_tx, reply_rx) = bounded(1);
tx.send(WalCommand::Append {
event: make_event(i),
reply: reply_tx,
})
.expect("send should succeed");
reply_rxs.push(reply_rx);
}
tx.send(WalCommand::Shutdown).expect("send should succeed");
let handle = std::thread::spawn(move || run_writer(&rx, &config, segment, 1, dedup));
let mut seqs = Vec::new();
for reply_rx in reply_rxs {
let seq = reply_rx
.recv()
.expect("should receive")
.expect("should be ok");
seqs.push(seq);
}
// Verify monotonically increasing
for window in seqs.windows(2) {
assert!(window[0] < window[1], "seqs not monotonic: {seqs:?}");
}
assert_eq!(seqs[0], 1);
handle
.join()
.expect("thread should join")
.expect("writer should succeed");
}
}
#[path = "writer_tests.rs"]
mod tests;

View File

@ -0,0 +1,181 @@
use super::*;
use crossbeam::channel::bounded;
fn make_event(id: u64) -> EventRecord {
EventRecord {
entity_id: id,
signal_type: 1,
weight: 1.0,
timestamp_nanos: 1_000_000_000,
}
}
#[test]
fn writer_processes_single_event() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let (tx, rx) = bounded(100);
let segment = SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 16 * 1024 * 1024)
.expect("open should succeed");
let dedup = DedupWindow::new(Duration::from_secs(30));
let config = WriterConfig {
dir: dir.path().to_path_buf(),
segment_size: 16 * 1024 * 1024,
batch_size: 100,
batch_timeout: Duration::from_millis(10),
dedup_window: Duration::from_secs(30),
session_journal_path: None,
shard_id: ShardId::SINGLE,
region_id: RegionId::SINGLE,
};
let (reply_tx, reply_rx) = bounded(1);
tx.send(WalCommand::Append {
event: make_event(42),
reply: reply_tx,
})
.expect("send should succeed");
tx.send(WalCommand::Shutdown).expect("send should succeed");
let handle = std::thread::spawn(move || run_writer(&rx, &config, segment, 1, dedup));
let seq = reply_rx
.recv()
.expect("should receive reply")
.expect("should be ok");
assert_eq!(seq, 1);
handle
.join()
.expect("thread should join")
.expect("writer should succeed");
}
#[test]
fn writer_deduplicates_events() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let (tx, rx) = bounded(100);
let segment = SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 16 * 1024 * 1024)
.expect("open should succeed");
let dedup = DedupWindow::new(Duration::from_secs(30));
let config = WriterConfig {
dir: dir.path().to_path_buf(),
segment_size: 16 * 1024 * 1024,
batch_size: 100,
batch_timeout: Duration::from_millis(10),
dedup_window: Duration::from_secs(30),
session_journal_path: None,
shard_id: ShardId::SINGLE,
region_id: RegionId::SINGLE,
};
let event = make_event(42);
let (reply_tx1, reply_rx1) = bounded(1);
let (reply_tx2, reply_rx2) = bounded(1);
tx.send(WalCommand::Append {
event: event.clone(),
reply: reply_tx1,
})
.expect("send should succeed");
tx.send(WalCommand::Append {
event,
reply: reply_tx2,
})
.expect("send should succeed");
tx.send(WalCommand::Shutdown).expect("send should succeed");
let handle = std::thread::spawn(move || run_writer(&rx, &config, segment, 1, dedup));
let seq1 = reply_rx1
.recv()
.expect("should receive")
.expect("should be ok");
let seq2 = reply_rx2
.recv()
.expect("should receive")
.expect("should be ok");
assert_eq!(seq1, 1);
assert_eq!(seq2, 0); // deduplicated
handle
.join()
.expect("thread should join")
.expect("writer should succeed");
}
#[test]
fn writer_handles_channel_disconnect() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let (tx, rx) = bounded(100);
let segment = SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 16 * 1024 * 1024)
.expect("open should succeed");
let dedup = DedupWindow::new(Duration::from_secs(30));
let config = WriterConfig {
dir: dir.path().to_path_buf(),
segment_size: 16 * 1024 * 1024,
batch_size: 100,
batch_timeout: Duration::from_millis(10),
dedup_window: Duration::from_secs(30),
session_journal_path: None,
shard_id: ShardId::SINGLE,
region_id: RegionId::SINGLE,
};
drop(tx); // Disconnect immediately
let result = run_writer(&rx, &config, segment, 1, dedup);
assert!(result.is_ok());
}
#[test]
fn writer_assigns_monotonic_sequences() {
let dir = tempfile::tempdir().expect("tempdir creation should succeed");
let (tx, rx) = bounded(100);
let segment = SegmentWriter::open(dir.path(), ShardId::SINGLE, 1, 16 * 1024 * 1024)
.expect("open should succeed");
let dedup = DedupWindow::new(Duration::from_secs(30));
let config = WriterConfig {
dir: dir.path().to_path_buf(),
segment_size: 16 * 1024 * 1024,
batch_size: 100,
batch_timeout: Duration::from_millis(10),
dedup_window: Duration::from_secs(30),
session_journal_path: None,
shard_id: ShardId::SINGLE,
region_id: RegionId::SINGLE,
};
let mut reply_rxs = Vec::new();
for i in 0..5 {
let (reply_tx, reply_rx) = bounded(1);
tx.send(WalCommand::Append {
event: make_event(i),
reply: reply_tx,
})
.expect("send should succeed");
reply_rxs.push(reply_rx);
}
tx.send(WalCommand::Shutdown).expect("send should succeed");
let handle = std::thread::spawn(move || run_writer(&rx, &config, segment, 1, dedup));
let mut seqs = Vec::new();
for reply_rx in reply_rxs {
let seq = reply_rx
.recv()
.expect("should receive")
.expect("should be ok");
seqs.push(seq);
}
// Verify monotonically increasing
for window in seqs.windows(2) {
assert!(window[0] < window[1], "seqs not monotonic: {seqs:?}");
}
assert_eq!(seqs[0], 1);
handle
.join()
.expect("thread should join")
.expect("writer should succeed");
}

View File

@ -96,7 +96,7 @@ fn uat_step1_cross_region_replication() {
// Write 25 signals in region 0 (leader).
for _ in 0..25 {
cluster.write_signal("view", item, 1.0);
cluster.write_signal("view", item, 1.0).unwrap();
}
// Wait for convergence (< 2 seconds on in-process relay).
@ -160,7 +160,7 @@ fn uat_step2_leader_crash_and_failover() {
// Write 100 signals on the leader (region 0).
for _ in 0..100 {
cluster.write_signal("view", item, 1.0);
cluster.write_signal("view", item, 1.0).unwrap();
}
// Wait for all followers to receive the events.
@ -201,7 +201,7 @@ fn uat_step2_leader_crash_and_failover() {
// Write 10 more signals to the new leader.
for _ in 0..10 {
cluster.write_signal("view", item, 1.0);
cluster.write_signal("view", item, 1.0).unwrap();
}
// Converge: region 2 should receive the 10 new signals.
@ -230,7 +230,7 @@ fn uat_step3_degraded_query_during_partition() {
// Seed some data before partition.
for _ in 0..10 {
cluster.write_signal("view", item, 1.0);
cluster.write_signal("view", item, 1.0).unwrap();
}
cluster.await_convergence(Duration::from_secs(1));
@ -239,7 +239,7 @@ fn uat_step3_degraded_query_during_partition() {
// Write more signals during the partition.
for _ in 0..5 {
cluster.write_signal("view", item, 1.0);
cluster.write_signal("view", item, 1.0).unwrap();
}
// Converge: only region 1 should get the new signals.
@ -293,7 +293,7 @@ fn uat_step4_partition_heal_and_reconciliation() {
// Phase 1: Write some events, then partition region 2.
for _ in 0..20 {
cluster.write_signal("view", item, 1.0);
cluster.write_signal("view", item, 1.0).unwrap();
}
cluster.await_convergence(Duration::from_secs(1));
@ -303,7 +303,7 @@ fn uat_step4_partition_heal_and_reconciliation() {
// Write 30 more events to the leader during partition.
for _ in 0..30 {
cluster.write_signal("view", item, 1.0);
cluster.write_signal("view", item, 1.0).unwrap();
}
// Converge region 1 only (region 2 is partitioned).
@ -547,7 +547,7 @@ fn perf_replication_latency_p99() {
let item = EntityId::new(1_000_000 + i);
let before = Instant::now();
cluster.write_signal("view", item, 1.0);
cluster.write_signal("view", item, 1.0).unwrap();
cluster.await_convergence(Duration::from_secs(3));
let score = cluster.read_decay_score(RegionId(1), item, "view");
@ -587,7 +587,9 @@ fn perf_failover_under_10s() {
// Write some data.
for _ in 0..50 {
cluster.write_signal("view", EntityId::new(10), 1.0);
cluster
.write_signal("view", EntityId::new(10), 1.0)
.unwrap();
}
cluster.await_convergence(Duration::from_secs(1));
@ -603,7 +605,9 @@ fn perf_failover_under_10s() {
);
// Verify the new leader can accept writes.
cluster.write_signal("view", EntityId::new(10), 1.0);
cluster
.write_signal("view", EntityId::new(10), 1.0)
.unwrap();
let score = cluster
.read_decay_score(RegionId(1), EntityId::new(10), "view")
.expect("new leader must have score");

View File

@ -584,10 +584,13 @@ fn follower_serves_retrieve_queries() {
follower.close().unwrap();
}
// ── Test 11: Corrupted segment is rejected ───────────────────────────────
// ── Test 11: Corrupted segment causes receiver to stop ───────────────────
#[test]
fn corrupted_segment_is_rejected() {
// New contract: a BLAKE3 failure causes the receiver thread to exit with
// WalError::Corruption so operators can trigger remediation. The receiver
// does NOT silently skip the corrupt payload and continue.
let schema = make_schema();
let follower = open_follower(schema.clone());
@ -620,7 +623,9 @@ fn corrupted_segment_is_rejected() {
})
.unwrap();
// Also send a valid segment (seqno 2) to prove the receiver keeps running.
// Queue a valid segment after the corrupt one. Under the new contract the
// receiver has already stopped after the corrupt payload, so entity 501
// will NOT be applied either.
let valid_events = vec![EventRecord {
entity_id: 501,
signal_type: type_id.as_u16() as u8,
@ -636,9 +641,10 @@ fn corrupted_segment_is_rejected() {
})
.unwrap();
// Give the receiver thread time to process the corrupt payload and exit.
std::thread::sleep(Duration::from_millis(200));
// Entity 500 (from corrupted segment) should NOT be present.
// Entity 500 (from corrupted segment) must NOT be present.
let score_500 = follower
.read_decay_score(EntityId::new(500), "view", 0)
.unwrap();
@ -647,20 +653,100 @@ fn corrupted_segment_is_rejected() {
"corrupted segment entity should not appear"
);
// Entity 501 (from valid segment) SHOULD be present.
// Entity 501 is also absent: the receiver stopped on corruption and did
// not process subsequent payloads. Operators must restart replication
// after investigating the corruption.
let score_501 = follower
.read_decay_score(EntityId::new(501), "view", 0)
.unwrap();
assert!(
score_501.is_some(),
"valid segment after corruption should be applied"
score_501.is_none(),
"receiver should have stopped after corruption; entity 501 must not appear"
);
// close() logs the corruption warning but does not return an error
// (WalError is not TidalError; lifecycle logs and moves on).
drop(tx);
follower.close().unwrap();
}
// ── Test 12: Replication lag converges to zero ───────────────────────────
// ── Test 12: Builder with_transport auto-starts follower receiver ─────────
//
// Validates that `TidalDbBuilder::with_transport` wires the segment receiver
// automatically at open time so the caller never needs to invoke
// `start_replication` manually.
#[test]
fn with_transport_auto_wires_follower_receiver() {
let schema = make_schema();
let type_id = resolve_view_type_id(&schema);
// Build a channel transport. The sender (`tx`) plays the role of a
// leader shipping WAL segments; the receiver is handed to the follower.
let (tx, rx) = crossbeam::channel::bounded::<WalSegmentPayload>(16);
let follower_transport = Arc::new(ChannelTransport { rx }) as Arc<dyn Transport>;
// Open the follower via the builder — receiver must start automatically.
// No explicit call to `start_replication` is made.
let follower = TidalDb::builder()
.ephemeral()
.with_schema(schema.clone())
.with_cluster(NodeConfig {
role: NodeRole::Follower,
..NodeConfig::default()
})
.with_transport(follower_transport)
.open()
.expect("follower with auto-wired transport should open");
// Open a standalone leader (no transport — we ship segments manually).
let leader = open_leader(schema.clone());
// Write one signal on the leader.
let ts = Timestamp::from_nanos(1_000_000_000);
leader.signal("view", EntityId::new(42), 2.5, ts).unwrap();
// Encode the same event into a WAL batch and ship it to the follower.
let events = vec![EventRecord {
entity_id: 42,
signal_type: type_id.as_u16() as u8,
weight: 2.5_f32,
timestamp_nanos: 1_000_000_000,
}];
let batch_bytes = encode_batch(&events, 1, 1).unwrap();
tx.send(WalSegmentPayload {
id: WalSegmentId::new(tidaldb::replication::RegionId::SINGLE, ShardId::SINGLE, 1),
bytes: batch_bytes,
event_count: 1,
})
.unwrap();
// Give the auto-started receiver time to apply the segment.
std::thread::sleep(Duration::from_millis(100));
// The follower must have the replicated signal.
let follower_score = follower
.read_decay_score(EntityId::new(42), "view", 0)
.unwrap();
assert!(
follower_score.is_some(),
"follower should see entity 42 after auto-wired replication"
);
// Replication state must reflect the applied seqno.
assert_eq!(
follower.replication_state().applied_seqno(ShardId::SINGLE),
Some(1),
"follower replication state should be at seqno 1"
);
// Drop sender first so the receiver thread can exit cleanly.
drop(tx);
leader.close().unwrap();
follower.close().unwrap();
}
// ── Test 13: Replication lag converges to zero ───────────────────────────
#[test]
fn replication_lag_converges_to_zero() {