diff --git a/.cargo/config.toml b/.cargo/config.toml index 6c2b84b..a267b76 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -3,7 +3,3 @@ # Deny warnings in release builds [target.'cfg(all())'] rustflags = ["-D", "warnings"] - -# Speed up builds with parallel linking -[build] -jobs = 8 diff --git a/.claude/skills/orchard9-deploy/SKILL.md b/.claude/skills/orchard9-deploy/SKILL.md new file mode 100644 index 0000000..b62d1ae --- /dev/null +++ b/.claude/skills/orchard9-deploy/SKILL.md @@ -0,0 +1,423 @@ +# Orchard9 Deploy + +--- +name: orchard9-deploy +description: Deploy services through the orchard9 CI/CD pipeline (Gitea + Woodpecker CI + Kaniko + Zot Registry + k3s). Handles pushing code, triggering builds, monitoring pipelines, and verifying deployments. +--- + +You are an orchard9 deployment operator who executes deployments through the on-prem CI/CD pipeline. You push code to Gitea, trigger and monitor Woodpecker CI builds, verify images land in the Zot registry, and confirm pods are running on the k3s cluster. + +## Environment Variables + +These env vars provide API access to the deployment infrastructure: + +| Variable | Purpose | +|----------|---------| +| `THREE_SIX_GITEA` | Gitea admin API token for `git.threesix.ai` | +| `THREE_SIX_WOODPECKER` | Woodpecker CI API token for `ci.threesix.ai` | +| `THREESIX_CLOUDFLARE_API_TOKEN` | Cloudflare API token for `threesix.ai` DNS | +| `THREESIX_CLOUDFLARE_ZONE_ID` | Cloudflare zone ID for `threesix.ai` | + +Verify they exist before any operation: + +```bash +[[ -z "$THREE_SIX_GITEA" ]] && echo "MISSING: THREE_SIX_GITEA" && exit 1 +[[ -z "$THREE_SIX_WOODPECKER" ]] && echo "MISSING: THREE_SIX_WOODPECKER" && exit 1 +``` + +## Service Endpoints + +| Service | Internal (cluster) | External | +|---------|--------------------|----------| +| Gitea | `gitea.threesix.svc.cluster.local:3000` | `https://git.threesix.ai` | +| Woodpecker | `woodpecker-server.threesix.svc.cluster.local:8000` | `https://ci.threesix.ai` | +| Zot Registry | `zot.threesix.svc.cluster.local:5000` | `https://registry.threesix.ai` | +| Traefik LB | — | `208.122.204.172` | + +## Cluster Access + +```bash +# ALWAYS set before ANY kubectl command +export KUBECONFIG=~/.kube/orchard9-k3sf.yaml +``` + +Nodes are amd64 (Rocky Linux). Local Mac is arm64. NEVER build Docker images locally. + +## Principles + +### 1. Push, Don't Build +Deployments happen by pushing code to Gitea. Kaniko builds natively on the cluster's amd64 nodes. Local Docker builds under QEMU are 100x slower and produce wrong-architecture images. + +### 2. API-First Operations +Use Gitea and Woodpecker REST APIs for all operations. The env var tokens provide full access. Do not ask the user to open web UIs. + +### 3. Verify Every Step +After each pipeline stage, verify the output before proceeding. Check Woodpecker build status, check Zot for the image, check k8s for the running pod. + +### 4. Commit SHA Tags +Tag images with 8-char commit SHA (`${CI_COMMIT_SHA:0:8}`) plus `latest`. Never rely on `latest` alone for production deployments. + +### 5. Namespace Discipline +Each service has its own namespace. Set `KUBECONFIG` before every kubectl call. Never assume the default context is correct. + +## Protocol: Deploy a Service + +### Phase 1: Pre-Flight + +1. Verify env vars exist +2. Verify kubeconfig works: + ```bash + kubectl --kubeconfig ~/.kube/orchard9-k3sf.yaml get nodes + ``` +3. Check Gitea is reachable: + ```bash + curl -sf -H "Authorization: token ${THREE_SIX_GITEA}" \ + "https://git.threesix.ai/api/v1/user" | jq '.login' + ``` +4. Check Woodpecker is reachable: + ```bash + curl -sf -H "Authorization: Bearer ${THREE_SIX_WOODPECKER}" \ + "https://ci.threesix.ai/api/user" | jq '.login' + ``` + +### Phase 2: Gitea Repository Setup + +**Create repo (if new):** +```bash +curl -X POST "https://git.threesix.ai/api/v1/user/repos" \ + -H "Authorization: token ${THREE_SIX_GITEA}" \ + -H "Content-Type: application/json" \ + -d '{"name":"","private":false,"auto_init":false}' +``` + +**List existing repos:** +```bash +curl -sf -H "Authorization: token ${THREE_SIX_GITEA}" \ + "https://git.threesix.ai/api/v1/user/repos?limit=50" | jq '.[].full_name' +``` + +**Add or update git remote:** +```bash +# Check if gitea remote exists +git remote get-url gitea 2>/dev/null && \ + git remote set-url gitea "https://jordan:${THREE_SIX_GITEA}@git.threesix.ai/jordan/.git" || \ + git remote add gitea "https://jordan:${THREE_SIX_GITEA}@git.threesix.ai/jordan/.git" +``` + +**Push code to Gitea:** +```bash +git push gitea main +``` + +### Phase 3: Woodpecker CI Activation + +**List repos Woodpecker knows about:** +```bash +curl -sf -H "Authorization: Bearer ${THREE_SIX_WOODPECKER}" \ + "https://ci.threesix.ai/api/repos?all=true" | jq '.[].full_name' +``` + +**Activate repo in Woodpecker (creates webhook on Gitea):** +```bash +# First, find the Gitea repo ID +FORGE_ID=$(curl -sf -H "Authorization: token ${THREE_SIX_GITEA}" \ + "https://git.threesix.ai/api/v1/repos/jordan/" | jq '.id') + +curl -X POST "https://ci.threesix.ai/api/repos" \ + -H "Authorization: Bearer ${THREE_SIX_WOODPECKER}" \ + -H "Content-Type: application/json" \ + -d "{\"forge_remote_id\":\"${FORGE_ID}\"}" +``` + +**Trigger a build manually via API:** +```bash +curl -X POST "https://ci.threesix.ai/api/repos/jordan//pipelines" \ + -H "Authorization: Bearer ${THREE_SIX_WOODPECKER}" \ + -H "Content-Type: application/json" \ + -d '{"branch":"main"}' +``` + +### Phase 4: Monitor Build + +**List recent pipelines:** +```bash +curl -sf -H "Authorization: Bearer ${THREE_SIX_WOODPECKER}" \ + "https://ci.threesix.ai/api/repos/jordan//pipelines?page=1&per_page=5" | \ + jq '.[] | {number, status, event, branch, created_at}' +``` + +**Get pipeline status:** +```bash +curl -sf -H "Authorization: Bearer ${THREE_SIX_WOODPECKER}" \ + "https://ci.threesix.ai/api/repos/jordan//pipelines/" | \ + jq '{number, status, started_at, finished_at, workflows: [.workflows[]? | {name, state, children: [.children[]? | {name, state}]}]}' +``` + +**Get step logs:** +```bash +curl -sf -H "Authorization: Bearer ${THREE_SIX_WOODPECKER}" \ + "https://ci.threesix.ai/api/repos/jordan//logs//" | \ + jq -r '.[].data' +``` + +**Poll until complete (use sparingly):** +```bash +while true; do + STATUS=$(curl -sf -H "Authorization: Bearer ${THREE_SIX_WOODPECKER}" \ + "https://ci.threesix.ai/api/repos/jordan//pipelines/" | jq -r '.status') + echo "Pipeline status: $STATUS" + [[ "$STATUS" == "success" || "$STATUS" == "failure" || "$STATUS" == "error" ]] && break + sleep 30 +done +``` + +### Phase 5: Verify Image in Registry + +```bash +# List repos in Zot +curl -sf "https://registry.threesix.ai/v2/_catalog" | jq '.repositories' + +# List tags for an image +curl -sf "https://registry.threesix.ai/v2//tags/list" | jq '.tags' +``` + +### Phase 6: Verify Deployment + +```bash +export KUBECONFIG=~/.kube/orchard9-k3sf.yaml + +# Check pod status +kubectl get pods -n -l app= + +# Check deployment rollout +kubectl rollout status deployment/ -n --timeout=120s + +# Check logs +kubectl logs -n -l app= --tail=50 + +# Describe pod (for scheduling/pull errors) +kubectl describe pod -n -l app= +``` + +### Phase 7: Verify External Access (if ingress exists) + +```bash +# Health check +curl -sf "https://.threesix.ai/health" || curl -sf "https://.threesix.ai/v1/health" + +# Check TLS cert +echo | openssl s_client -connect .threesix.ai:443 -servername .threesix.ai 2>/dev/null | \ + openssl x509 -noout -dates -subject +``` + +## .woodpecker.yml Templates + +### Rust Project (cargo-chef multi-stage) + +```yaml +when: + branch: main + event: push + +steps: + build: + image: woodpeckerci/plugin-kaniko + settings: + registry: registry.threesix.ai + repo: registry.threesix.ai/ + tags: + - latest + - ${CI_COMMIT_SHA:0:8} + context: . + dockerfile: Dockerfile + cache: true + cache_repo: registry.threesix.ai//cache + skip_tls_verify: true + build_args: + - CARGO_FEATURES= + + deploy: + image: bitnami/kubectl:latest + commands: + - kubectl set image deployment/ =registry.threesix.ai/:${CI_COMMIT_SHA:0:8} -n + - kubectl rollout status deployment/ -n --timeout=300s + depends_on: [build] +``` + +### Go Project + +```yaml +when: + branch: main + event: push + +steps: + test: + image: golang:1.25-alpine + commands: + - go test ./... + + build: + image: woodpeckerci/plugin-kaniko + settings: + registry: registry.threesix.ai + repo: registry.threesix.ai/ + tags: + - latest + - ${CI_COMMIT_SHA:0:8} + context: . + dockerfile: Dockerfile + cache: true + skip_tls_verify: true + depends_on: [test] + + deploy: + image: bitnami/kubectl:latest + commands: + - kubectl set image deployment/ =registry.threesix.ai/:${CI_COMMIT_SHA:0:8} -n + - kubectl rollout status deployment/ -n --timeout=120s + depends_on: [build] +``` + +## DNS Management + +**Create A record:** +```bash +curl -X POST "https://api.cloudflare.com/client/v4/zones/${THREESIX_CLOUDFLARE_ZONE_ID}/dns_records" \ + -H "Authorization: Bearer ${THREESIX_CLOUDFLARE_API_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"type":"A","name":"","content":"208.122.204.172","ttl":1,"proxied":false}' +``` + +**List records:** +```bash +curl -sf "https://api.cloudflare.com/client/v4/zones/${THREESIX_CLOUDFLARE_ZONE_ID}/dns_records" \ + -H "Authorization: Bearer ${THREESIX_CLOUDFLARE_API_TOKEN}" | \ + jq '.result[] | {name, type, content}' +``` + +**Update existing record:** +```bash +# Get record ID first +RECORD_ID=$(curl -sf "https://api.cloudflare.com/client/v4/zones/${THREESIX_CLOUDFLARE_ZONE_ID}/dns_records?name=.threesix.ai" \ + -H "Authorization: Bearer ${THREESIX_CLOUDFLARE_API_TOKEN}" | jq -r '.result[0].id') + +curl -X PATCH "https://api.cloudflare.com/client/v4/zones/${THREESIX_CLOUDFLARE_ZONE_ID}/dns_records/${RECORD_ID}" \ + -H "Authorization: Bearer ${THREESIX_CLOUDFLARE_API_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"content":"208.122.204.172"}' +``` + +## Step Back: Before Deploying + +Before executing a deployment, challenge: + +### 1. Is the Code Ready? +> "Has this been tested locally? Does `cargo check` / `go build` pass?" +- Pushing broken code wastes CI time (Rust builds take 10-15 min on Kaniko) +- Run local checks first, push only compilable code + +### 2. Is This the Right Target? +> "Am I deploying to the right namespace, with the right image name?" +- Verify the k8s manifest matches the Woodpecker pipeline output +- Check the image reference in the Deployment matches what Kaniko pushes + +### 3. Is the Dockerfile Correct? +> "Does the Dockerfile produce a working amd64 binary?" +- Multi-stage builds must produce a statically-linked or properly-libbed binary +- Runtime stage must have required system libs (ca-certificates, libssl, etc.) +- Rust: use `rust:bookworm` build stage + `debian:bookworm-slim` runtime (not alpine — glibc deps) + +### 4. Will the Deploy Step Have Access? +> "Does the Woodpecker agent have RBAC to deploy to the target namespace?" +- Default RBAC only covers `threesix` namespace +- Other namespaces need explicit RoleBinding for the `woodpecker-agent` ServiceAccount + +**After step back:** Proceed with deployment if code compiles, targets are correct, and RBAC is in place. + +## Do + +1. Set `KUBECONFIG=~/.kube/orchard9-k3sf.yaml` before every kubectl operation +2. Use the Gitea API token from `THREE_SIX_GITEA` env var directly +3. Use the Woodpecker API token from `THREE_SIX_WOODPECKER` env var directly +4. Verify each phase completes before proceeding to the next +5. Use `skip_tls_verify: true` for Kaniko pushing to the internal Zot registry +6. Tag images with commit SHA + latest +7. Use `git remote add gitea` (not origin) to avoid overwriting GitHub remotes +8. Run `cargo check` or `go build` locally before pushing to CI + +## Do Not + +1. Build Docker images locally — QEMU arm64-to-amd64 emulation takes hours +2. Use `gcloud` commands — this is k3s on-prem, not GKE +3. Assume kubectl context is correct — always set KUBECONFIG explicitly +4. Push to GitHub expecting CI to trigger — Woodpecker only watches Gitea +5. Hardcode tokens in commands — always reference env vars +6. Skip the registry verification step — silent image push failures are common +7. Use alpine base images for Rust binaries — glibc linking issues + +## Decision Points + +**Pipeline stuck in "pending"?** +Stop. Check: Are Woodpecker agents running? +```bash +kubectl --kubeconfig ~/.kube/orchard9-k3sf.yaml get pods -n threesix -l app=woodpecker-agent +``` + +**Image not appearing in Zot after successful build?** +Stop. Check: Did Kaniko push to the right registry path? +```bash +curl -sf "https://registry.threesix.ai/v2/_catalog" | jq '.repositories' +``` + +**Pod in ImagePullBackOff?** +Stop. Check: +- Is the image reference correct? (`registry.threesix.ai/:`) +- Can the node reach the registry? (internal DNS: `zot.threesix.svc.cluster.local:5000`) +- Is the image the right architecture? (`docker manifest inspect` or check Kaniko build logs) + +**Deploy step fails with "unauthorized"?** +Stop. Check: Woodpecker agent ServiceAccount needs RBAC in the target namespace. +```bash +kubectl --kubeconfig ~/.kube/orchard9-k3sf.yaml get rolebinding -n | grep woodpecker +``` + +## Constraints + +- NEVER build Docker images locally for k3s deployment +- NEVER use `gcloud` — this is on-prem k3s, not GKE +- NEVER run `kubectl` without `--kubeconfig ~/.kube/orchard9-k3sf.yaml` or `KUBECONFIG` set +- NEVER push credentials to git — use env vars for all tokens +- ALWAYS verify the image exists in Zot before expecting a pod to start +- ALWAYS use `registry.threesix.ai` (external) in Woodpecker pipeline and `zot.threesix.svc.cluster.local:5000` or `registry.threesix.ai` in k8s manifests + +## Recovery + +### Rebuild Without Code Change +```bash +curl -X POST "https://ci.threesix.ai/api/repos/jordan//pipelines" \ + -H "Authorization: Bearer ${THREE_SIX_WOODPECKER}" \ + -H "Content-Type: application/json" \ + -d '{"branch":"main"}' +``` + +### Force Pod Restart +```bash +kubectl --kubeconfig ~/.kube/orchard9-k3sf.yaml rollout restart deployment/ -n +``` + +### Rollback to Previous Image +```bash +# List available tags +curl -sf "https://registry.threesix.ai/v2//tags/list" | jq '.tags' + +# Set specific tag +kubectl --kubeconfig ~/.kube/orchard9-k3sf.yaml set image deployment/ \ + =registry.threesix.ai/: -n +``` + +### Delete and Reapply (nuclear option — confirm with user first) +```bash +kubectl --kubeconfig ~/.kube/orchard9-k3sf.yaml delete deployment/ -n +kubectl --kubeconfig ~/.kube/orchard9-k3sf.yaml apply -f +``` diff --git a/.dockerignore b/.dockerignore index fff909c..ff8d0eb 100644 --- a/.dockerignore +++ b/.dockerignore @@ -40,6 +40,16 @@ examples/ *.log *.tmp .claude/ +latent/ + +# Go SDK — pure Go, not in Rust workspace +sdk/ + +# Non-Rust applications (only applications/aphoria/ is in the workspace) applications/disputed/ applications/stemedb-dashboard/ -latent/ +applications/video-renderer/ +applications/pitch/ +applications/aphoria-pitch/ +applications/aphoria-dashboard/ +applications/findmyhealth/ diff --git a/CLAUDE.md b/CLAUDE.md index cda009f..35e8856 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -447,3 +447,70 @@ Python CLI tools for adverse event signal detection. Different rules from Rust c - Use `os.getenv("VAR", "http://localhost:...")` in Python - Use `process.env.VAR || 'http://localhost:...'` in TypeScript - **StemeDB Integration:** New ingestors should use `StemeDBClient` pattern from `adk-agent/`, not write to JSONL files + +## Production Infrastructure + +All production infra is under the `jordan@roamrhino.com` Google account, GCP project `orchard9`. + +### GCP / Google Artifact Registry + +- **Account:** `jordan@roamrhino.com` +- **Project:** `orchard9` +- **Docker registry:** `us-central1-docker.pkg.dev/orchard9/docker-images/` +- **Auth:** `gcloud auth configure-docker us-central1-docker.pkg.dev` (one-time per machine) +- **Secret Manager:** all production secrets live here under project `orchard9` + - StemeDB root API key secret name: `stemedb-root-api-key` + - Per-project keys follow pattern: `stemedb-key-` + +### k3s Cluster + +- **Kubeconfig:** `~/.kube/orchard9-k3sf.yaml` (separate from GKE contexts — use `--kubeconfig` flag) +- **Fleet repo:** `/Users/jordanwashburn/Workspace/orchard9/k3s-fleet` +- **Nodes:** 3-node cluster (2 servers + 1 agent), architecture: `amd64` +- **Docker builds:** Must use `--platform linux/amd64` (Mac is ARM) +- **Kustomize base:** `deployments/k8s/base/` — apply with `kubectl --kubeconfig ~/.kube/orchard9-k3sf.yaml apply -k deployments/k8s/base//` +- **ClusterSecretStore:** `gcp-secret-manager` (ExternalSecrets Operator, reads from GCP SM above) +- **imagePullSecrets:** `gcr-secret` (pre-configured on cluster nodes) +- **Storage class:** `longhorn` (Longhorn CSI, RWO volumes) +- **Ingress:** Traefik — `ingressClassName: traefik`, entrypoint `websecure` +- **TLS:** cert-manager, `ClusterIssuer: letsencrypt-prod` + +### Cloudflare DNS (threesix.ai) + +- **Domain:** `threesix.ai` — all services live at `*.threesix.ai` +- **API token env var:** `THREESIX_CLOUDFLARE_API_TOKEN` +- **Zone ID env var:** `THREESIX_CLOUDFLARE_ZONE_ID` +- **DNS API:** `https://api.cloudflare.com/client/v4/zones/$THREESIX_CLOUDFLARE_ZONE_ID/dns_records` +- To add/update a record, POST/PATCH to that endpoint with `Authorization: Bearer $THREESIX_CLOUDFLARE_API_TOKEN` +- To find Traefik LB IP: `kubectl get svc -n kube-system` (look for Traefik LoadBalancer EXTERNAL-IP) + +### Service URLs + +| Service | URL | +|---------|-----| +| StemeDB API | `https://stemedb.threesix.ai` | +| StemeDB internal | `http://stemedb-api.stemedb.svc:18180` | + +### Deployment Workflow + +```bash +# 1. Build + push image (stemedb repo root) +docker build --platform linux/amd64 -t us-central1-docker.pkg.dev/orchard9/docker-images/stemedb-api:latest . +docker push us-central1-docker.pkg.dev/orchard9/docker-images/stemedb-api:latest + +# 2. Add/update DNS A record (get Traefik IP first) +TRAEFIK_IP=$(kubectl get svc -n kube-system traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +curl -X POST "https://api.cloudflare.com/client/v4/zones/$THREESIX_CLOUDFLARE_ZONE_ID/dns_records" \ + -H "Authorization: Bearer $THREESIX_CLOUDFLARE_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"type\":\"A\",\"name\":\"stemedb\",\"content\":\"$TRAEFIK_IP\",\"ttl\":1,\"proxied\":false}" + +# 3. Create GCP secret (first deploy only) +echo -n "steme_live_$(openssl rand -hex 24)" | \ + gcloud secrets create stemedb-root-api-key --project=orchard9 \ + --replication-policy=automatic --data-file=- + +# 4. Deploy +kubectl apply -k /Users/jordanwashburn/Workspace/orchard9/k3s-fleet/deployments/k8s/base/stemedb/ +kubectl rollout status deployment/stemedb-api -n stemedb --timeout=120s +``` diff --git a/Dockerfile b/Dockerfile index ab2253d..6de5923 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,53 +1,77 @@ # StemeDB API Docker Build # -# Multi-stage build for the stemedb-api binary. -# Produces a minimal Debian-based image with just the compiled binary. - -# Stage 1: Build the Rust binary -# Use latest Rust for compatibility with newer crates -FROM rust:bookworm AS builder +# Four-stage build using cargo-chef for efficient dependency caching: +# chef -> base image with cargo-chef installed +# planner -> generate recipe.json (cache key for deps) +# cacher -> compile dependencies only (cached until Cargo.lock changes) +# builder -> compile service binary using cached deps (FROM cacher) +# runtime -> minimal image: stripped binary, non-root user, no dev tools +# +# Cache behavior: +# - Cold build: ~15-20 min (deps + binary) +# - Warm build (source-only change): ~2-5 min (deps cached, binary only) +# - Dep change: full rebuild of cacher + builder (~15-20 min) +# Stage 0: Base image with cargo-chef installed +# Cached independently — only rebuilds when the chef version pin changes. +FROM rust:bookworm AS chef +RUN cargo install cargo-chef --locked WORKDIR /app -# Copy manifests first for better layer caching -COPY Cargo.toml Cargo.lock ./ +# Stage 1: Planner — generate recipe.json from workspace manifests +# COPY . . is intentional: cargo chef prepare only reads Cargo.toml files. +# BuildKit content-addresses recipe.json, so the cacher layer stays cached +# even if this stage rebuilds due to a .rs source change. +FROM chef AS planner +COPY . . +RUN cargo chef prepare --recipe-path recipe.json -# Copy workspace members -COPY crates/ crates/ -COPY applications/ applications/ -COPY sdk/ sdk/ +# Stage 2: Cacher — compile dependencies only +# This layer is invalidated only when Cargo.toml or Cargo.lock changes. +# protobuf-compiler is required by stemedb-rpc/build.rs (compiles sync.proto). +FROM chef AS cacher +RUN apt-get update && \ + apt-get install -y --no-install-recommends protobuf-compiler && \ + rm -rf /var/lib/apt/lists/* +COPY --from=planner /app/recipe.json recipe.json +# Proto files must be present for stemedb-rpc/build.rs to run during dep compilation +COPY crates/stemedb-rpc/proto/ crates/stemedb-rpc/proto/ +RUN cargo chef cook --release --recipe-path recipe.json -# Build release binary (only stemedb-api) +# Stage 3: Builder — compile the service binary using cached deps +# Inherits compiled deps from cacher; only workspace source is compiled here. +FROM cacher AS builder +COPY . . RUN cargo build --release -p stemedb-api +# Strip debug symbols before copying to runtime image +RUN strip target/release/stemedb-api -# Stage 2: Runtime image -FROM debian:bookworm-slim +# Stage 4: Runtime — minimal production image +FROM debian:bookworm-slim AS runtime -# Install runtime dependencies RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ curl \ && rm -rf /var/lib/apt/lists/* -# Copy the binary from builder +# Non-root user for security +RUN useradd --system --no-create-home --shell /bin/false stemedb + COPY --from=builder /app/target/release/stemedb-api /usr/local/bin/stemedb-api -# Create data directories -RUN mkdir -p /data/wal /data/db +RUN mkdir -p /data/wal /data/db && chown -R stemedb:stemedb /data + +USER stemedb -# Set environment defaults ENV STEMEDB_WAL_DIR=/data/wal \ STEMEDB_DB_DIR=/data/db \ STEMEDB_BIND_ADDR=0.0.0.0:18180 \ RUST_LOG=stemedb_api=info -# Expose the API port EXPOSE 18180 -# Health check HEALTHCHECK --interval=5s --timeout=3s --start-period=10s --retries=3 \ CMD curl -f http://localhost:18180/v1/health || exit 1 -# Run the API server CMD ["stemedb-api"] diff --git a/applications/aphoria/src/hosted.rs b/applications/aphoria/src/hosted.rs index e852a19..92e582a 100644 --- a/applications/aphoria/src/hosted.rs +++ b/applications/aphoria/src/hosted.rs @@ -6,6 +6,7 @@ use std::time::Duration; use ed25519_dalek::SigningKey; +use rand::Rng; use serde::{Deserialize, Serialize}; use stemedb_core::types::Assertion; use tracing::{info, instrument, warn}; @@ -16,105 +17,54 @@ use crate::AphoriaError; /// HTTP client for pushing observations to a hosted StemeDB server. pub struct HostedClient { - /// Base URL of the server (e.g., "https://episteme.acme.corp"). base_url: String, - - /// Project identifier. project_id: String, - - /// Optional team identifier. team_id: Option, - - /// Agent's public key (hex-encoded). agent_id: String, - - /// Optional API key for authentication. api_key: Option, - - /// Maximum retry attempts. max_retries: u32, - - /// Delay between retries in milliseconds. retry_delay_ms: u64, - - /// Behavior when server is unreachable. offline_fallback: OfflineFallback, - - /// Whether to route observations to community endpoint for pattern aggregation. /// When true, observations go to /v1/aphoria/community/observations. - /// When false, observations go to /v1/aphoria/observations. community_enabled: bool, } /// Request payload for pushing observations (team storage). #[derive(Debug, Clone, Serialize)] pub struct PushObservationsRequest { - /// The observations to push. pub observations: Vec, - - /// Project identifier. pub project_id: String, - - /// Optional team identifier. #[serde(skip_serializing_if = "Option::is_none")] pub team_id: Option, - - /// Client version for debugging. pub client_version: String, } /// Request payload for pushing community observations (corpus aggregation). #[derive(Debug, Clone, Serialize)] pub struct PushCommunityObservationsRequest { - /// The anonymized observations to share. pub observations: Vec, - - /// Hash of the project (for deduplication, NOT the actual project name). - /// This is BLAKE3 hash of the project name to prevent name leakage. + /// BLAKE3 hash of project name (prevents name leakage). pub project_hash: String, - - /// Client version for debugging. pub client_version: String, } -/// Community observation response. #[derive(Debug, Clone, Deserialize)] pub struct PushCommunityObservationsResponse { - /// Number of observations recorded. pub recorded: usize, - - /// Number of new patterns discovered. pub new_patterns: usize, - - /// Number of existing patterns updated. pub updated_patterns: usize, } /// A single observation in the request (team storage). #[derive(Debug, Clone, Serialize)] pub struct ObservationDto { - /// The subject (concept path). pub subject: String, - - /// The predicate being claimed. pub predicate: String, - - /// The object value. pub object: ObjectValueDto, - - /// Confidence score (0.0 to 1.0). pub confidence: f32, - - /// Source hash (hex-encoded). pub source_hash: String, - - /// Signatures (hex-encoded). pub signatures: Vec, - - /// Timestamp of the observation. pub timestamp: u64, - - /// Source metadata as JSON string. #[serde(skip_serializing_if = "Option::is_none")] pub source_metadata: Option, } @@ -318,12 +268,16 @@ impl HostedClient { let url = format!("{}/v1/aphoria/observations", self.base_url); - // Retry loop + // Retry loop with exponential backoff + jitter + let mut delay_ms = self.retry_delay_ms; let mut last_error = None; for attempt in 0..=self.max_retries { if attempt > 0 { - info!(attempt, "Retrying push to team server"); - std::thread::sleep(Duration::from_millis(self.retry_delay_ms)); + let jitter_pct: u64 = rand::thread_rng().gen_range(75..=125); + let sleep_ms = delay_ms * jitter_pct / 100; + info!(attempt, sleep_ms, "Retrying push to team server"); + std::thread::sleep(Duration::from_millis(sleep_ms)); + delay_ms = (delay_ms * 2).min(30_000); } match self.do_push_team(&url, &request) { @@ -336,6 +290,10 @@ impl HostedClient { return Ok(response.accepted); } Err(e) => { + if !is_retryable_hosted_error(&e) { + warn!(attempt, error = %e, "Non-retryable error pushing to team server"); + return self.handle_push_error(Some(e)); + } warn!(attempt, error = %e, "Failed to push to team server"); last_error = Some(e); } @@ -366,12 +324,16 @@ impl HostedClient { let url = format!("{}/v1/aphoria/community/observations", self.base_url); - // Retry loop + // Retry loop with exponential backoff + jitter + let mut delay_ms = self.retry_delay_ms; let mut last_error = None; for attempt in 0..=self.max_retries { if attempt > 0 { - info!(attempt, "Retrying push to community corpus"); - std::thread::sleep(Duration::from_millis(self.retry_delay_ms)); + let jitter_pct: u64 = rand::thread_rng().gen_range(75..=125); + let sleep_ms = delay_ms * jitter_pct / 100; + info!(attempt, sleep_ms, "Retrying push to community corpus"); + std::thread::sleep(Duration::from_millis(sleep_ms)); + delay_ms = (delay_ms * 2).min(30_000); } match self.do_push_community(&url, &request) { @@ -385,6 +347,10 @@ impl HostedClient { return Ok(response.recorded); } Err(e) => { + if !is_retryable_hosted_error(&e) { + warn!(attempt, error = %e, "Non-retryable error pushing to community corpus"); + return self.handle_push_error(Some(e)); + } warn!(attempt, error = %e, "Failed to push to community corpus"); last_error = Some(e); } @@ -518,12 +484,16 @@ impl HostedClient { let url = format!("{}/v1/aphoria/patterns", self.base_url); - // Retry loop + // Retry loop with exponential backoff + jitter + let mut delay_ms = self.retry_delay_ms; let mut last_error = None; for attempt in 0..=self.max_retries { if attempt > 0 { - info!(attempt, "Retrying pattern push to hosted server"); - std::thread::sleep(Duration::from_millis(self.retry_delay_ms)); + let jitter_pct: u64 = rand::thread_rng().gen_range(75..=125); + let sleep_ms = delay_ms * jitter_pct / 100; + info!(attempt, sleep_ms, "Retrying pattern push to hosted server"); + std::thread::sleep(Duration::from_millis(sleep_ms)); + delay_ms = (delay_ms * 2).min(30_000); } match self.do_push_patterns(&url, &request) { @@ -537,6 +507,11 @@ impl HostedClient { return Ok(response); } Err(e) => { + if !is_retryable_hosted_error(&e) { + warn!(attempt, error = %e, "Non-retryable error pushing patterns"); + last_error = Some(e); + break; + } warn!(attempt, error = %e, "Failed to push patterns to hosted server"); last_error = Some(e); } @@ -617,12 +592,16 @@ impl HostedClient { url = format!("{}?{}", url, params.join("&")); } - // Retry loop + // Retry loop with exponential backoff + jitter + let mut delay_ms = self.retry_delay_ms; let mut last_error = None; for attempt in 0..=self.max_retries { if attempt > 0 { - info!(attempt, "Retrying community extractors fetch"); - std::thread::sleep(Duration::from_millis(self.retry_delay_ms)); + let jitter_pct: u64 = rand::thread_rng().gen_range(75..=125); + let sleep_ms = delay_ms * jitter_pct / 100; + info!(attempt, sleep_ms, "Retrying community extractors fetch"); + std::thread::sleep(Duration::from_millis(sleep_ms)); + delay_ms = (delay_ms * 2).min(30_000); } match self.do_get_extractors(&url) { @@ -631,6 +610,11 @@ impl HostedClient { return Ok(extractors); } Err(e) => { + if !is_retryable_hosted_error(&e) { + warn!(attempt, error = %e, "Non-retryable error fetching community extractors"); + last_error = Some(e); + break; + } warn!(attempt, error = %e, "Failed to fetch community extractors"); last_error = Some(e); } @@ -692,6 +676,22 @@ impl HostedClient { } } +/// Determines whether a hosted push/fetch error is worth retrying. +/// +/// Returns `false` for HTTP 4xx client errors (auth failures, bad requests) — +/// these will not succeed on retry. Returns `true` for 5xx server errors, +/// connection errors, and timeouts. +fn is_retryable_hosted_error(error: &AphoriaError) -> bool { + let msg = error.to_string(); + // Non-retryable: client errors (4xx). The message format is + // "Server returned status 4XX" from do_push_*/do_get_extractors. + if msg.contains("Server returned status 4") { + return false; + } + // All other errors (5xx, connection refused, timeout) are retryable. + true +} + /// Convert an Assertion to an ObservationDto for the API. fn assertion_to_dto(assertion: &Assertion) -> ObservationDto { use stemedb_core::types::ObjectValue; @@ -781,198 +781,3 @@ fn wildcardize_subject(subject: &str, project_id: &str) -> String { // Simple replacement: replace project_id with wildcard subject.replace(project_id, "*") } - -#[cfg(test)] -mod tests { - use super::*; - use crate::bridge::generate_signing_key; - use crate::config::SyncMode; - - #[test] - fn test_client_not_created_without_url() { - let config = HostedConfig::default(); - let community_config = CommunityConfig::default(); - let key = generate_signing_key(); - let client = HostedClient::new(&config, &community_config, &key, "test-project") - .expect("should not fail"); - assert!(client.is_none()); - } - - #[test] - fn test_client_created_with_url() { - let config = HostedConfig { - url: Some("https://episteme.acme.corp".to_string()), - project_id: Some("my-project".to_string()), - team_id: Some("platform".to_string()), - sync_mode: SyncMode::RemoteOnly, - offline_fallback: OfflineFallback::Skip, - max_retries: 3, - retry_delay_ms: 1000, - api_key_env: String::new(), - }; - let community_config = CommunityConfig::default(); - let key = generate_signing_key(); - let client = HostedClient::new(&config, &community_config, &key, "fallback-project") - .expect("should not fail") - .unwrap(); - - assert_eq!(client.base_url, "https://episteme.acme.corp"); - assert_eq!(client.project_id, "my-project"); - assert_eq!(client.team_id, Some("platform".to_string())); - assert_eq!(client.agent_id.len(), 64); // 32 bytes hex-encoded - } - - #[test] - fn test_client_uses_fallback_project_name() { - let config = HostedConfig { - url: Some("https://episteme.acme.corp".to_string()), - project_id: None, // Not set - ..Default::default() - }; - let community_config = CommunityConfig::default(); - let key = generate_signing_key(); - let client = HostedClient::new(&config, &community_config, &key, "fallback-project") - .expect("should not fail") - .unwrap(); - - assert_eq!(client.project_id, "fallback-project"); - } - - #[test] - fn test_assertion_to_dto() { - use stemedb_core::types::{ - Assertion, HlcTimestamp, LifecycleStage, ObjectValue, SignatureEntry, SourceClass, - }; - - let assertion = Assertion { - subject: "code://rust/myapp/tls".to_string(), - predicate: "enabled".to_string(), - object: ObjectValue::Boolean(true), - parent_hash: None, - source_hash: [1u8; 32], - source_class: SourceClass::Community, - visual_hash: None, - epoch: None, - source_metadata: Some(b"{\"file\":\"test.rs\"}".to_vec()), - narrative: None, - lifecycle: LifecycleStage::Approved, - signatures: vec![SignatureEntry { - agent_id: [2u8; 32], - signature: [3u8; 64], - timestamp: 12345, - version: 1, - }], - confidence: 0.9, - timestamp: 67890, - hlc_timestamp: HlcTimestamp::default(), - vector: None, - }; - - let dto = assertion_to_dto(&assertion); - - assert_eq!(dto.subject, "code://rust/myapp/tls"); - assert_eq!(dto.predicate, "enabled"); - assert!(matches!(dto.object, ObjectValueDto::Boolean(true))); - assert_eq!(dto.confidence, 0.9); - assert_eq!(dto.timestamp, 67890); - assert_eq!(dto.signatures.len(), 1); - assert_eq!(dto.signatures[0].version, 1); - assert_eq!(dto.source_metadata, Some("{\"file\":\"test.rs\"}".to_string())); - } - - #[test] - fn test_compute_org_hash() { - let config = HostedConfig { - url: Some("https://episteme.acme.corp".to_string()), - project_id: Some("my-project".to_string()), - team_id: Some("platform".to_string()), - ..Default::default() - }; - let community_config = CommunityConfig::default(); - let key = generate_signing_key(); - let client = HostedClient::new(&config, &community_config, &key, "fallback-project") - .expect("should not fail") - .unwrap(); - - let hash = client.compute_org_hash(); - - // Hash should be 64 hex characters (32 bytes) - assert_eq!(hash.len(), 64); - - // Same inputs should produce same hash - let hash2 = client.compute_org_hash(); - assert_eq!(hash, hash2); - } - - #[test] - fn test_compute_org_hash_without_team() { - let config = HostedConfig { - url: Some("https://episteme.acme.corp".to_string()), - project_id: Some("my-project".to_string()), - team_id: None, - ..Default::default() - }; - let community_config = CommunityConfig::default(); - let key = generate_signing_key(); - let client = HostedClient::new(&config, &community_config, &key, "fallback-project") - .expect("should not fail") - .unwrap(); - - let hash = client.compute_org_hash(); - assert_eq!(hash.len(), 64); - - // With team should produce different hash - let config_with_team = HostedConfig { - url: Some("https://episteme.acme.corp".to_string()), - project_id: Some("my-project".to_string()), - team_id: Some("platform".to_string()), - ..Default::default() - }; - let client_with_team = - HostedClient::new(&config_with_team, &community_config, &key, "fallback-project") - .expect("should not fail") - .unwrap(); - let hash_with_team = client_with_team.compute_org_hash(); - - assert_ne!(hash, hash_with_team); - } - - #[test] - fn test_push_patterns_empty() { - let config = HostedConfig { - url: Some("https://episteme.acme.corp".to_string()), - project_id: Some("my-project".to_string()), - ..Default::default() - }; - let community_config = CommunityConfig::default(); - let key = generate_signing_key(); - let client = HostedClient::new(&config, &community_config, &key, "fallback-project") - .expect("should not fail") - .unwrap(); - - // Empty patterns should return default response without making HTTP call - let result = client.push_patterns(vec![]); - assert!(result.is_ok()); - let response = result.unwrap(); - assert_eq!(response.accepted, 0); - assert_eq!(response.merged, 0); - assert_eq!(response.deduplicated, 0); - } - - #[test] - fn test_accessors() { - let config = HostedConfig { - url: Some("https://episteme.acme.corp".to_string()), - project_id: Some("my-project".to_string()), - ..Default::default() - }; - let community_config = CommunityConfig::default(); - let key = generate_signing_key(); - let client = HostedClient::new(&config, &community_config, &key, "fallback-project") - .expect("should not fail") - .unwrap(); - - assert_eq!(client.base_url(), "https://episteme.acme.corp"); - assert_eq!(client.project_id(), "my-project"); - } -} diff --git a/crates/stemedb-api/Cargo.toml b/crates/stemedb-api/Cargo.toml index 73d79af..f160171 100644 --- a/crates/stemedb-api/Cargo.toml +++ b/crates/stemedb-api/Cargo.toml @@ -10,6 +10,7 @@ workspace = true [features] default = ["aphoria"] aphoria = ["dep:aphoria"] +cluster = ["dep:stemedb-cluster", "dep:stemedb-sync"] [dependencies] stemedb-core = { path = "../stemedb-core" } @@ -22,6 +23,10 @@ stemedb-lens = { path = "../stemedb-lens" } # Optional: Aphoria code-level truth linting aphoria = { path = "../../applications/aphoria", optional = true } +# Optional: Multi-node cluster participation +stemedb-cluster = { path = "../stemedb-cluster", optional = true } +stemedb-sync = { path = "../stemedb-sync", optional = true } + axum = { version = "0.7", features = ["json"] } axum-server = { version = "0.7", features = ["tls-rustls"] } tokio = { version = "1", features = ["full"] } diff --git a/crates/stemedb-api/src/dto/mod.rs b/crates/stemedb-api/src/dto/mod.rs index dcc52eb..1d32ead 100644 --- a/crates/stemedb-api/src/dto/mod.rs +++ b/crates/stemedb-api/src/dto/mod.rs @@ -133,8 +133,8 @@ pub use aphoria::{ // From stemedb_claims module pub use stemedb_claims::{ - AuthoredClaimDto, AuthoredValueDto, ClaimSearchQuery, ClaimStatsDto, - CreateClaimRequest, CreateClaimResponse, + AuthoredClaimDto, AuthoredValueDto, ClaimSearchQuery, ClaimStatsDto, CreateClaimRequest, + CreateClaimResponse, }; // From subjects module diff --git a/crates/stemedb-api/src/dto/stemedb_claims.rs b/crates/stemedb-api/src/dto/stemedb_claims.rs index 7aadb33..73b2d2b 100644 --- a/crates/stemedb-api/src/dto/stemedb_claims.rs +++ b/crates/stemedb-api/src/dto/stemedb_claims.rs @@ -1,7 +1,7 @@ //! DTOs for StemeDB claims endpoints. -use std::collections::HashMap; use serde::{Deserialize, Serialize}; +use std::collections::HashMap; use utoipa::{IntoParams, ToSchema}; /// Request to create a claim in StemeDB. diff --git a/crates/stemedb-api/src/handlers/mod.rs b/crates/stemedb-api/src/handlers/mod.rs index 2dab64c..1a4b897 100644 --- a/crates/stemedb-api/src/handlers/mod.rs +++ b/crates/stemedb-api/src/handlers/mod.rs @@ -89,11 +89,8 @@ pub use aphoria::{ }; pub use stemedb_claims::{ - create_claim as create_stemedb_claim, - delete_claim as delete_stemedb_claim, - get_claim as get_stemedb_claim, - get_claim_stats as get_stemedb_claim_stats, - list_claims as list_stemedb_claims, - search_claims as search_stemedb_claims, + create_claim as create_stemedb_claim, delete_claim as delete_stemedb_claim, + get_claim as get_stemedb_claim, get_claim_stats as get_stemedb_claim_stats, + list_claims as list_stemedb_claims, search_claims as search_stemedb_claims, }; pub use subjects::{list_predicates, list_subjects}; diff --git a/crates/stemedb-api/src/handlers/stemedb_claims.rs b/crates/stemedb-api/src/handlers/stemedb_claims.rs index a91f74e..ff0daea 100644 --- a/crates/stemedb-api/src/handlers/stemedb_claims.rs +++ b/crates/stemedb-api/src/handlers/stemedb_claims.rs @@ -18,8 +18,8 @@ use stemedb_storage::{key_codec, KVStore}; use crate::{ dto::{ - AuthoredClaimDto, AuthoredValueDto, ClaimSearchQuery, ClaimStatsDto, - CreateClaimRequest, CreateClaimResponse, + AuthoredClaimDto, AuthoredValueDto, ClaimSearchQuery, ClaimStatsDto, CreateClaimRequest, + CreateClaimResponse, }, error::{ApiError, Result}, AppState, @@ -566,9 +566,7 @@ pub async fn search_claims( } if let Some(max_tier) = query.max_tier { claims.retain(|c| { - tier_string_to_number(&c.authority_tier) - .map(|t| t <= max_tier) - .unwrap_or(false) + tier_string_to_number(&c.authority_tier).map(|t| t <= max_tier).unwrap_or(false) }); } if let Some(ref status) = query.status { @@ -635,10 +633,8 @@ pub async fn get_claim_stats( *value_counts.entry(value_to_string(&claim.value)).or_insert(0) += 1; } - let most_common_value = value_counts - .into_iter() - .max_by_key(|(_, count)| *count) - .map(|(val, _)| val); + let most_common_value = + value_counts.into_iter().max_by_key(|(_, count)| *count).map(|(val, _)| val); Ok(Json(ClaimStatsDto { concept_path, diff --git a/crates/stemedb-api/src/main.rs b/crates/stemedb-api/src/main.rs index 390d16d..77286c0 100644 --- a/crates/stemedb-api/src/main.rs +++ b/crates/stemedb-api/src/main.rs @@ -9,7 +9,8 @@ use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use axum::Extension; use metrics_exporter_prometheus::PrometheusBuilder; use stemedb_api::{ - create_router_config, create_router_with_meter_config, AppState, SecurityConfig, + bootstrap, create_router_config, create_router_full_protection_full_config, + create_router_with_meter_config, ApiKeyAuthConfig, AppState, SecurityConfig, }; use stemedb_ingest::worker::IngestWorker; use stemedb_storage::HybridStore; @@ -158,10 +159,14 @@ async fn main() -> Result<(), Box> { let write_journal = Journal::open(&config.wal_dir)?; let read_journal = Journal::open(&config.wal_dir)?; let store = Arc::new(HybridStore::open(&config.db_dir)?); - let corpus_store = config.corpus_db_dir.as_ref().map(|d| { - let _ = std::fs::create_dir_all(d); - Arc::new(HybridStore::open(d).unwrap()) - }); + let corpus_store = config + .corpus_db_dir + .as_ref() + .map(|d| { + let _ = std::fs::create_dir_all(d); + HybridStore::open(d).map(Arc::new) + }) + .transpose()?; let state = AppState::new(write_journal, read_journal, Arc::clone(&store), corpus_store); let worker_journal = state.journal.clone(); @@ -184,6 +189,71 @@ async fn main() -> Result<(), Box> { } }); + // Bootstrap root API key from env (idempotent: no-op if key already exists). + if let Err(e) = bootstrap::bootstrap_root_api_key(&*state.api_key_store).await { + error!("Failed to bootstrap root API key: {}", e); + std::process::exit(1); + } + + // Cluster mode: join SWIM membership when STEMEDB_CLUSTER_MODE=true. + // Requires the `cluster` feature to be enabled at compile time. + #[cfg(feature = "cluster")] + { + let cluster_mode = std::env::var("STEMEDB_CLUSTER_MODE") + .map(|v| v.to_lowercase() == "true" || v == "1") + .unwrap_or(false); + + if cluster_mode { + use stemedb_cluster::{stable_node_id, NodeInfo, SwimConfig, SwimMembership}; + + let node_id = stable_node_id(); + + let rpc_addr: std::net::SocketAddr = std::env::var("STEMEDB_NODE_RPC_ADDR") + .unwrap_or_else(|_| "127.0.0.1:18182".to_string()) + .parse() + .unwrap_or_else(|_| std::net::SocketAddr::from(([127, 0, 0, 1], 18182))); + + let api_addr: std::net::SocketAddr = config + .bind_addr + .parse() + .unwrap_or_else(|_| std::net::SocketAddr::from(([127, 0, 0, 1], 18180))); + + let local_info = NodeInfo::new(node_id, rpc_addr, api_addr); + let membership = Arc::new(SwimMembership::new(local_info, SwimConfig::default())); + + let seeds: Vec = std::env::var("STEMEDB_CLUSTER_SEEDS") + .unwrap_or_default() + .split(',') + .filter(|s| !s.trim().is_empty()) + .filter_map(|s| s.trim().parse().ok()) + .collect(); + + if !seeds.is_empty() { + if let Err(e) = membership.join(seeds).await { + warn!("Cluster join failed (continuing as solo node): {}", e); + } + } + + membership.start(); + info!( + node_id = %node_id.short_hex(), + rpc_addr = %rpc_addr, + "Cluster mode active" + ); + } + } + + // Startup guard: unsafe skip + auth enabled is a fatal misconfiguration. + if config.unsafe_skip_signatures && bootstrap::is_auth_enabled() { + error!( + "FATAL: STEMEDB_UNSAFE_SKIP_SIGNATURES=true conflicts with \ + STEMEDB_AUTH_ENABLED=true. Signature verification must be enabled \ + when auth is enforced. Unset STEMEDB_UNSAFE_SKIP_SIGNATURES or \ + disable STEMEDB_AUTH_ENABLED." + ); + std::process::exit(1); + } + // Build router (with or without metering) with security config let security_config = config.to_security_config(); info!( @@ -193,7 +263,21 @@ async fn main() -> Result<(), Box> { security_config.http_timeout_secs, ); - let app = if config.meter_enabled { + let app = if bootstrap::is_auth_enabled() { + info!( + require_all = bootstrap::is_auth_require_all(), + "Auth enforced (STEMEDB_AUTH_ENABLED=true) — full protection stack active" + ); + create_router_full_protection_full_config( + state, + ApiKeyAuthConfig { + enabled: true, + require_for_all: bootstrap::is_auth_require_all(), + ..ApiKeyAuthConfig::default() + }, + security_config, + ) + } else if config.meter_enabled { info!("The Meter enabled: economic throttling active (10K tokens/agent/hour)"); create_router_with_meter_config(state, security_config) } else { diff --git a/crates/stemedb-cluster/Cargo.toml b/crates/stemedb-cluster/Cargo.toml index 77d12aa..f60275a 100644 --- a/crates/stemedb-cluster/Cargo.toml +++ b/crates/stemedb-cluster/Cargo.toml @@ -29,6 +29,9 @@ axum = "0.7" tower = "0.5" tower-http = { version = "0.5", features = ["cors", "trace"] } +# HTTP client for gateway request forwarding +reqwest = { version = "0.12", features = ["json"] } + # Serialization serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" diff --git a/crates/stemedb-cluster/src/bin/node.rs b/crates/stemedb-cluster/src/bin/node.rs index 3f7651a..edcac7b 100644 --- a/crates/stemedb-cluster/src/bin/node.rs +++ b/crates/stemedb-cluster/src/bin/node.rs @@ -23,7 +23,7 @@ use tracing::info; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use stemedb_cluster::{ - Gateway, NodeId, NodeInfo, RangeManager, RangeRouter, ShardingConfig, SwimConfig, + stable_node_id, Gateway, NodeInfo, RangeManager, RangeRouter, ShardingConfig, SwimConfig, SwimMembership, }; @@ -82,7 +82,8 @@ async fn main() -> Result<(), Box> { let config = NodeConfig::from_env(); - let node_id = NodeId::random(); + // Use stable NodeId (env var → hostname → random fallback) + let node_id = stable_node_id(); info!( node_id = %node_id.short_hex(), diff --git a/crates/stemedb-cluster/src/config.rs b/crates/stemedb-cluster/src/config.rs index 5b9ef77..20ee060 100644 --- a/crates/stemedb-cluster/src/config.rs +++ b/crates/stemedb-cluster/src/config.rs @@ -425,7 +425,7 @@ impl ClusterConfigBuilder { .ok_or_else(|| crate::ClusterError::Config("api_addr is required".to_string()))?; Ok(ClusterConfig { - node_id: self.node_id.unwrap_or_else(NodeId::random), + node_id: self.node_id.unwrap_or_else(crate::stable_node_id), rpc_addr, api_addr, seed_nodes: self.seed_nodes, diff --git a/crates/stemedb-cluster/src/gateway/handlers/query_handlers.rs b/crates/stemedb-cluster/src/gateway/handlers/query_handlers.rs index 61d8921..d1c3aa3 100644 --- a/crates/stemedb-cluster/src/gateway/handlers/query_handlers.rs +++ b/crates/stemedb-cluster/src/gateway/handlers/query_handlers.rs @@ -8,41 +8,77 @@ use tracing::instrument; use crate::gateway::service::GatewayState; use crate::sharding::ShardId; -use super::types::{ - ApiError, ClusterStatusResponse, HealthResponse, NodeStatusInfo, QueryParams, QueryResponse, -}; +use super::types::{ApiError, ClusterStatusResponse, HealthResponse, NodeStatusInfo, QueryParams}; /// GET /v1/query - Query assertions. +/// +/// Routes by subject hash to a replica (preferring local) and forwards the +/// request via HTTP to that node's stemedb-api. #[instrument(skip(state), fields(subject = %params.subject))] pub async fn handle_query( State(state): State>, Query(params): Query, -) -> Result, ApiError> { +) -> Result, ApiError> { + state.inc_requests(); + // 1. Route by subject hash let shard_id = state.router.route_subject(¶ms.subject).map_err(|e| ApiError { code: "UNAVAILABLE".to_string(), message: format!("Routing failed: {e}"), })?; - // 2. Get replicas, preferring local + // 2. Get replicas, preferring local node to minimize latency let replicas = state.router.get_replicas_prefer_local(shard_id).map_err(|e| ApiError { code: "UNAVAILABLE".to_string(), message: format!("No replicas for shard {shard_id}: {e}"), })?; - let replica = replicas.first().ok_or_else(|| ApiError { + let replica_id = replicas.first().ok_or_else(|| ApiError { code: "UNAVAILABLE".to_string(), message: format!("No replicas available for shard {shard_id}"), })?; - // 3. Forward to replica via RPC (not yet wired) + // 3. Look up replica's HTTP API address via membership + let replica_info = state.membership.get_member(*replica_id).ok_or_else(|| ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Replica {} not found in membership", replica_id.short_hex()), + })?; + + // 4. Get or create a pooled HTTP client for this node + let http_client = { + let entry = state.http_forwarders.entry(*replica_id).or_insert_with(reqwest::Client::new); + entry.clone() + }; + + // 5. Forward to replica's stemedb-api, preserving all query parameters + let url = format!("http://{}/v1/query", replica_info.api_addr); tracing::info!( - shard_id = shard_id, - replica = %replica.short_hex(), - "Routed query to replica" + shard_id, + replica = %replica_id.short_hex(), + url = %url, + "Forwarding query to replica" ); - Ok(Json(QueryResponse { assertions: vec![], shard_id, served_by: replica.short_hex() })) + let response = http_client.get(&url).query(¶ms).send().await.map_err(|e| ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Forward to replica failed: {e}"), + })?; + + if !response.status().is_success() { + let status_code = response.status().as_u16(); + let body = response.text().await.unwrap_or_default(); + return Err(ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Replica returned {status_code}: {body}"), + }); + } + + let result: serde_json::Value = response.json().await.map_err(|e| ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Failed to parse replica response: {e}"), + })?; + + Ok(Json(result)) } /// GET /v1/health - Health check. diff --git a/crates/stemedb-cluster/src/gateway/handlers/types.rs b/crates/stemedb-cluster/src/gateway/handlers/types.rs index dffcd5c..3b342e8 100644 --- a/crates/stemedb-cluster/src/gateway/handlers/types.rs +++ b/crates/stemedb-cluster/src/gateway/handlers/types.rs @@ -26,19 +26,6 @@ pub struct CreateAssertionRequest { pub public_key: String, } -/// Response from assertion creation. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct AssertionResponse { - /// ID of the created assertion (content hash). - pub assertion_id: String, - - /// Shard the assertion was routed to. - pub shard_id: ShardId, - - /// Node that processed the write. - pub leader_node: String, -} - /// Query parameters for assertion lookup. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct QueryParams { @@ -55,19 +42,6 @@ pub struct QueryParams { pub limit: Option, } -/// Query response with assertions. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct QueryResponse { - /// Matching assertions. - pub assertions: Vec, - - /// Shard that served the query. - pub shard_id: ShardId, - - /// Node that served the query. - pub served_by: String, -} - /// Vote request. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VoteRequest { diff --git a/crates/stemedb-cluster/src/gateway/handlers/write_handlers.rs b/crates/stemedb-cluster/src/gateway/handlers/write_handlers.rs index 7e70a72..b952b08 100644 --- a/crates/stemedb-cluster/src/gateway/handlers/write_handlers.rs +++ b/crates/stemedb-cluster/src/gateway/handlers/write_handlers.rs @@ -7,49 +7,84 @@ use tracing::instrument; use crate::gateway::service::GatewayState; -use super::types::{ - ApiError, AssertionResponse, CreateAssertionRequest, VoteRequest, VoteResponse, -}; +use super::types::{ApiError, CreateAssertionRequest, VoteRequest, VoteResponse}; /// POST /v1/assert - Create a new assertion. +/// +/// Routes by subject hash to the shard leader and forwards the request via +/// HTTP to that node's stemedb-api. Returns the response from the leader. #[instrument(skip(state, req), fields(subject = %req.subject))] pub async fn handle_assert( State(state): State>, Json(req): Json, -) -> Result, ApiError> { - // 1. Route by subject hash +) -> Result, ApiError> { + state.inc_requests(); + + // 1. Route by subject hash to determine shard let shard_id = state.router.route_subject(&req.subject).map_err(|e| ApiError { code: "UNAVAILABLE".to_string(), message: format!("Routing failed: {e}"), })?; // 2. Get leader for this shard - let leader = state.router.get_leader(shard_id).map_err(|e| ApiError { + let leader_id = state.router.get_leader(shard_id).map_err(|e| ApiError { code: "UNAVAILABLE".to_string(), message: format!("No leader for shard {shard_id}: {e}"), })?; - // 3. Forward to leader via RPC (not yet wired) + // 3. Look up leader's HTTP API address via membership + let leader_info = state.membership.get_member(leader_id).ok_or_else(|| ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Leader {} not found in membership", leader_id.short_hex()), + })?; + + // 4. Get or create a pooled HTTP client for this node + let http_client = { + let entry = state.http_forwarders.entry(leader_id).or_insert_with(reqwest::Client::new); + entry.clone() + }; + + // 5. Forward to the leader's stemedb-api + let url = format!("http://{}/v1/assert", leader_info.api_addr); tracing::info!( - shard_id = shard_id, - leader = %leader.short_hex(), - "Routed assertion to shard leader" + shard_id, + leader = %leader_id.short_hex(), + url = %url, + "Forwarding assertion to shard leader" ); - // Return routing result (actual RPC forwarding requires stemedb-rpc integration) - Ok(Json(AssertionResponse { - assertion_id: format!("pending_{}", req.subject), - shard_id, - leader_node: leader.short_hex(), - })) + let response = http_client.post(&url).json(&req).send().await.map_err(|e| ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Forward to leader failed: {e}"), + })?; + + if !response.status().is_success() { + let status_code = response.status().as_u16(); + let body = response.text().await.unwrap_or_default(); + return Err(ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Leader returned {status_code}: {body}"), + }); + } + + let result: serde_json::Value = response.json().await.map_err(|e| ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Failed to parse leader response: {e}"), + })?; + + Ok(Json(result)) } /// POST /v1/vote - Submit a vote. +/// +/// Routes to the shard leader for the assertion's subject and forwards via HTTP. #[instrument(skip(state, req), fields(subject = %req.subject))] pub async fn handle_vote( State(state): State>, Json(req): Json, ) -> Result, ApiError> { + state.inc_requests(); + // Route by subject hash let shard_id = state.router.route_subject(&req.subject).map_err(|e| ApiError { code: "UNAVAILABLE".to_string(), @@ -57,18 +92,45 @@ pub async fn handle_vote( })?; // Get leader - let leader = state.router.get_leader(shard_id).map_err(|e| ApiError { + let leader_id = state.router.get_leader(shard_id).map_err(|e| ApiError { code: "UNAVAILABLE".to_string(), message: format!("No leader for shard {shard_id}: {e}"), })?; - // Forward to leader via RPC (not yet wired) + // Look up leader's API address + let leader_info = state.membership.get_member(leader_id).ok_or_else(|| ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Leader {} not found in membership", leader_id.short_hex()), + })?; + + // Get or create a pooled HTTP client + let http_client = { + let entry = state.http_forwarders.entry(leader_id).or_insert_with(reqwest::Client::new); + entry.clone() + }; + + // Forward to leader's stemedb-api + let url = format!("http://{}/v1/vote", leader_info.api_addr); tracing::info!( - shard_id = shard_id, - leader = %leader.short_hex(), + shard_id, + leader = %leader_id.short_hex(), assertion_id = %req.assertion_id, - "Routed vote to shard leader" + "Forwarding vote to shard leader" ); + let response = http_client.post(&url).json(&req).send().await.map_err(|e| ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Forward to leader failed: {e}"), + })?; + + if !response.status().is_success() { + let status_code = response.status().as_u16(); + let body = response.text().await.unwrap_or_default(); + return Err(ApiError { + code: "UNAVAILABLE".to_string(), + message: format!("Leader returned {status_code}: {body}"), + }); + } + Ok(Json(VoteResponse { success: true, shard_id })) } diff --git a/crates/stemedb-cluster/src/gateway/service.rs b/crates/stemedb-cluster/src/gateway/service.rs index fb4f325..1d5c7b4 100644 --- a/crates/stemedb-cluster/src/gateway/service.rs +++ b/crates/stemedb-cluster/src/gateway/service.rs @@ -31,9 +31,11 @@ pub struct GatewayState { /// Membership for discovering nodes. pub membership: Arc, - /// RPC client pool (node ID -> client). - /// In a full implementation, these would be gRPC clients. - pub rpc_clients: DashMap, + /// HTTP client pool for forwarding requests to each node's stemedb-api. + /// + /// Keyed by NodeId. `reqwest::Client` is cheap to clone (Arc internally) + /// and reuses TCP connections via connection pooling. + pub http_forwarders: DashMap, /// Request counter for metrics. pub request_count: AtomicU64, @@ -49,7 +51,7 @@ impl GatewayState { Self { router, membership, - rpc_clients: DashMap::new(), + http_forwarders: DashMap::new(), request_count: AtomicU64::new(0), sync_notifiers: RwLock::new(Vec::new()), } diff --git a/crates/stemedb-cluster/src/lib.rs b/crates/stemedb-cluster/src/lib.rs index d6e9ece..b39c397 100644 --- a/crates/stemedb-cluster/src/lib.rs +++ b/crates/stemedb-cluster/src/lib.rs @@ -71,3 +71,73 @@ pub use error::{ClusterError, Result}; pub use gateway::{Gateway, GatewayBuilder}; pub use membership::{MembershipEvent, NodeId, NodeInfo, NodeState, SwimMembership}; pub use sharding::{MetaRange, RangeDescriptor, RangeManager, RangeRouter, ShardId}; + +/// Returns a stable [`NodeId`] for this process. +/// +/// Priority: +/// 1. `STEMEDB_NODE_ID` env var — hashed via BLAKE3 (k8s: set in Deployment env) +/// 2. `HOSTNAME` env var — hashed via BLAKE3, stable within a pod when hostname = pod name +/// 3. Random fallback — development/test only +/// +/// # Example (k8s) +/// ```yaml +/// env: +/// - name: STEMEDB_NODE_ID +/// value: "node-a" +/// ``` +pub fn stable_node_id() -> NodeId { + fn hash_to_node_id(s: &str) -> NodeId { + let hash = blake3::hash(s.as_bytes()); + let bytes: &[u8; 32] = hash.as_bytes(); + let mut id_bytes = [0u8; 16]; + id_bytes.copy_from_slice(&bytes[..16]); + NodeId::from_bytes(id_bytes) + } + + if let Ok(val) = std::env::var("STEMEDB_NODE_ID") { + if !val.is_empty() { + return hash_to_node_id(&val); + } + } + + if let Ok(hostname) = std::env::var("HOSTNAME") { + if !hostname.is_empty() { + return hash_to_node_id(&hostname); + } + } + + NodeId::random() +} + +#[cfg(test)] +mod stable_node_id_tests { + use super::*; + + #[test] + fn test_stable_node_id_env_var() { + // Same env var → same NodeId + std::env::set_var("STEMEDB_NODE_ID", "test-node-a"); + let id1 = stable_node_id(); + let id2 = stable_node_id(); + assert_eq!(id1, id2); + std::env::remove_var("STEMEDB_NODE_ID"); + } + + #[test] + fn test_stable_node_id_different_values() { + // Different values → different NodeIds + let id_a = { + std::env::set_var("STEMEDB_NODE_ID", "node-a"); + let id = stable_node_id(); + std::env::remove_var("STEMEDB_NODE_ID"); + id + }; + let id_b = { + std::env::set_var("STEMEDB_NODE_ID", "node-b"); + let id = stable_node_id(); + std::env::remove_var("STEMEDB_NODE_ID"); + id + }; + assert_ne!(id_a, id_b); + } +} diff --git a/crates/stemedb-cluster/src/membership/swim.rs b/crates/stemedb-cluster/src/membership/swim.rs index da3b824..8eaeddf 100644 --- a/crates/stemedb-cluster/src/membership/swim.rs +++ b/crates/stemedb-cluster/src/membership/swim.rs @@ -88,17 +88,18 @@ impl SwimMembership { *local = info; } - /// Joins the cluster by contacting seed nodes. + /// Joins the cluster by contacting seed nodes via gRPC ping. /// /// # Algorithm /// - /// 1. Contact each seed node to get their membership list - /// 2. Merge received lists into our local view - /// 3. Announce ourselves to the cluster + /// 1. For each seed, attempt a `Ping` RPC to verify reachability + /// 2. If at least one seed is reachable, mark as joined + /// 3. If no seeds are reachable, start as an isolated node (not an error — + /// gossip and anti-entropy will sync state once the network recovers) /// /// # Errors /// - /// Returns error if no seed nodes are reachable. + /// Never returns an error — isolated startup is acceptable. #[instrument(skip(self), fields(seed_count = seeds.len()))] pub async fn join(&self, seeds: Vec) -> Result<()> { if seeds.is_empty() { @@ -108,17 +109,52 @@ impl SwimMembership { return Ok(()); } - // Seed contact via RPC is not yet wired. Once stemedb-rpc integration - // is complete, this will: - // 1. Send JoinRequest to each seed - // 2. Receive MembershipList response - // 3. Merge into our local state - // 4. Broadcast our presence - // - // For now, use `alive_node()` to manually register discovered peers. - info!(seeds = ?seeds, "Joining cluster (seed RPC contact pending integration)"); - self.joined.store(true, Ordering::SeqCst); + info!("Joining cluster via seeds"); + let local_id = self.local_id(); + let local_rpc_addr = self.local_info().rpc_addr; + let mut contacted = 0usize; + + for seed_addr in &seeds { + // Skip our own RPC address to avoid self-pinging + if *seed_addr == local_rpc_addr { + continue; + } + + let addr = format!("http://{}", seed_addr); + let client = match stemedb_rpc::SyncClient::connect(&addr).await { + Ok(c) => c, + Err(e) => { + warn!(seed = %seed_addr, error = %e, "Cannot connect to seed, skipping"); + continue; + } + }; + + let ping = stemedb_rpc::proto::PingRequest { node_id: local_id.as_bytes().to_vec() }; + + match client.ping(ping).await { + Ok(resp) => { + let seed_id_hex = hex::encode(&resp.node_id[..resp.node_id.len().min(4)]); + info!( + seed = %seed_addr, + seed_id = %seed_id_hex, + "Seed reachable, cluster join successful" + ); + contacted += 1; + } + Err(e) => { + warn!(seed = %seed_addr, error = %e, "Seed ping failed"); + } + } + } + + if contacted == 0 { + warn!("No seeds reachable — starting as isolated node (anti-entropy will sync later)"); + } else { + info!(contacted, "Joined cluster via seeds"); + } + + self.joined.store(true, Ordering::SeqCst); Ok(()) } diff --git a/crates/stemedb-rpc/build.rs b/crates/stemedb-rpc/build.rs index 9ca8843..b3dbee7 100644 --- a/crates/stemedb-rpc/build.rs +++ b/crates/stemedb-rpc/build.rs @@ -1,6 +1,10 @@ //! Build script for stemedb-rpc that generates gRPC code from proto files. fn main() -> Result<(), Box> { + // Only re-run when these inputs change; without this, cargo re-runs on every build. + println!("cargo:rerun-if-changed=proto/sync.proto"); + println!("cargo:rerun-if-changed=build.rs"); + tonic_build::configure() .build_server(true) .build_client(true) diff --git a/crates/stemedb-sync/src/gossip.rs b/crates/stemedb-sync/src/gossip.rs index 92eb66f..f04e06c 100644 --- a/crates/stemedb-sync/src/gossip.rs +++ b/crates/stemedb-sync/src/gossip.rs @@ -22,10 +22,10 @@ use crate::error::Result; use async_trait::async_trait; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::Arc; -use std::time::Instant; +use std::time::{Duration, Instant}; use stemedb_core::types::HlcTimestamp; use stemedb_rpc::proto::GossipRequest; -use stemedb_rpc::SyncClient; +use stemedb_rpc::{RetryConfig, SyncClient}; use tokio::sync::Mutex; use tracing::{debug, info, instrument, warn}; @@ -113,9 +113,19 @@ impl GossipBroadcaster { pub async fn with_fanout(peer_addrs: Vec, fanout: usize) -> Result { let mut clients = Vec::with_capacity(peer_addrs.len()); + // Gossip-specific retry config: shorter backoff than default. + // Gossip is best-effort; 3 retries with 500ms→5s backoff keeps + // messages from silently dropping during 30s pod-restart windows. + let gossip_retry = RetryConfig { + max_retries: 3, + initial_backoff: Duration::from_millis(500), + max_backoff: Duration::from_secs(5), + }; + for addr in &peer_addrs { match SyncClient::connect(addr).await { Ok(client) => { + let client = client.with_retry_config(gossip_retry.clone()); info!(peer = %addr, "Connected to peer for gossip"); clients.push(Arc::new(client)); } diff --git a/docs/operations/README.md b/docs/operations/README.md index c301c64..342bf10 100644 --- a/docs/operations/README.md +++ b/docs/operations/README.md @@ -6,6 +6,7 @@ | Need to... | Go to | |------------|-------| +| **Deploy to k3s (100 projects)** | [k3s Deploy Roadmap](./deployment/k8s-deploy-roadmap.md) | | **Deploy for the first time** | [Single-Node Pilot Architecture](./reference-architecture/single-node-pilot.md) | | **Troubleshoot an incident** | [Operational Runbooks](./runbooks/) | | **Scale to production** | [Three-Node Cluster Architecture](./reference-architecture/three-node-cluster.md) | @@ -130,4 +131,4 @@ Submit pull requests to keep this guide current and valuable. --- -**Last Updated:** 2026-02-11 +**Last Updated:** 2026-03-02 diff --git a/docs/operations/deployment/k8s-deploy-roadmap.md b/docs/operations/deployment/k8s-deploy-roadmap.md new file mode 100644 index 0000000..9cd81c9 --- /dev/null +++ b/docs/operations/deployment/k8s-deploy-roadmap.md @@ -0,0 +1,711 @@ +# k3s Deploy Roadmap: StemeDB + Aphoria → 100 Projects + +**Target:** Production deployment on k3s-fleet with Longhorn, cert-manager, External Secrets, Prometheus/Grafana, Traefik. +**Timeline:** 3 weeks to ship-ready for 100 projects. + +--- + +## Ship Blockers (P0) — Must Fix Before Any Project Onboards + +### ~~1. Auth router not wired in production~~ ✅ RESOLVED (2026-03-02) + +`create_router_full_protection_full_config` is now called when `STEMEDB_AUTH_ENABLED=true`. +Router dispatch checks `bootstrap::is_auth_enabled()` first — full protection stack activates +in production. Metering-only path still available when auth is disabled (local dev). + +**Resolution:** `crates/stemedb-api/src/main.rs` updated. + +--- + +### ~~2. `STEMEDB_UNSAFE_SKIP_SIGNATURES` startup guard missing~~ ✅ RESOLVED (2026-03-02) + +Startup guard added: if `STEMEDB_UNSAFE_SKIP_SIGNATURES=true` and `STEMEDB_AUTH_ENABLED=true`, +server logs a fatal error and exits with code 1. Misconfiguration is caught at boot, not silently. + +**Resolution:** `crates/stemedb-api/src/main.rs` updated. + +--- + +### ~~3. Bootstrap key not seeded from env on fresh PVC~~ ✅ RESOLVED (2026-03-02) + +`bootstrap::bootstrap_root_api_key()` is now called at startup (after IngestWorker spawn). +Reads `STEMEDB_ROOT_API_KEY`, idempotent — no-op if key already exists in the store. Fatal +error on failure. + +**Resolution:** `crates/stemedb-api/src/main.rs` updated. + +--- + +### ~~4. No k8s manifests — StemeDB cannot be deployed to k3s~~ ✅ RESOLVED (2026-03-02) + +Manifests deployed to `k3s-fleet/deployments/k8s/base/stemedb/` (single `stemedb.yaml` following +`tidaldb/` pattern). Includes ExternalSecret, PVC (50Gi Longhorn), Deployment (Recreate, non-root, +all probes), ClusterIP Service, Traefik Ingress at `stemedb.threesix.ai`. + +**Remaining manual step:** Build + push image, create GCP secret, add DNS record (see Pre-Deploy section below). + +--- + +### ~~5. Image registry — k3s cannot pull without a registry~~ ✅ RESOLVED (2026-03-02) + +Registry confirmed: `us-central1-docker.pkg.dev/orchard9/docker-images/` (GAR). +`imagePullSecrets: gcr-secret` wired in Deployment. Dockerfile updated with `--features aphoria`. + +**Remaining manual step:** `docker build && docker push` to populate the image. + +--- + +## Pre-Deploy Checklist (Manual Steps Before `kubectl apply`) + +```bash +# 1. Build and push image (from stemedb repo root) +docker build -t us-central1-docker.pkg.dev/orchard9/docker-images/stemedb-api:latest . +docker push us-central1-docker.pkg.dev/orchard9/docker-images/stemedb-api:latest + +# 2. Create root API key in GCP Secret Manager +ROOT_KEY="steme_live_$(openssl rand -hex 24)" +echo "Root key: $ROOT_KEY" # Save this — needed for provision-project-keys.sh +echo -n "$ROOT_KEY" | gcloud secrets create stemedb-root-api-key \ + --project=orchard9 --replication-policy=automatic --data-file=- + +# 3. Add DNS: stemedb.threesix.ai → Traefik LB IP (Cloudflare) +``` + +--- + +## Original Manifest Spec (archived for reference) + +The following was the original spec. Actual implementation is in `k3s-fleet/deployments/k8s/base/stemedb/stemedb.yaml`. + +Create `deployments/k8s/base/stemedb/` with the following files: + +**`namespace.yaml`** +```yaml +apiVersion: v1 +kind: Namespace +metadata: + name: stemedb +``` + +**`pvc.yaml`** — Two PVCs to isolate WAL fsync from LSM compaction I/O +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: stemedb-wal + namespace: stemedb + annotations: + volumeType: longhorn +spec: + accessModes: [ReadWriteOnce] + storageClassName: longhorn + resources: + requests: + storage: 20Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: stemedb-db + namespace: stemedb + annotations: + volumeType: longhorn +spec: + accessModes: [ReadWriteOnce] + storageClassName: longhorn + resources: + requests: + storage: 50Gi +``` + +> Set `numberOfReplicas: 2` in Longhorn StorageClass (not default 3) to halve cross-node fsync amplification. + +**`deployment.yaml`** — Critical spec decisions annotated +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: stemedb-api + namespace: stemedb +spec: + replicas: 1 # Non-negotiable. Embedded KV requires exclusive volume access. + strategy: + type: Recreate # NOT RollingUpdate. RWO PVC + 2 pods = deadlock. + selector: + matchLabels: + app: stemedb-api + template: + metadata: + labels: + app: stemedb-api + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "18180" + prometheus.io/path: "/metrics" + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + readOnlyRootFilesystem: false # WAL writes to /data + terminationGracePeriodSeconds: 30 # Let in-flight WAL writes complete. + containers: + - name: stemedb-api + image: /stemedb-api:latest + ports: + - containerPort: 18180 + env: + - name: STEMEDB_BIND_ADDR + value: "0.0.0.0:18180" + - name: STEMEDB_WAL_DIR + value: /data/wal + - name: STEMEDB_DB_DIR + value: /data/db + - name: STEMEDB_METER_ENABLED + value: "true" + - name: STEMEDB_ROOT_API_KEY + valueFrom: + secretKeyRef: + name: stemedb-secrets + key: root-api-key + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2000m" + memory: "4Gi" + startupProbe: # WAL replay can take 60s after crash — do not skip this. + httpGet: + path: /v1/health + port: 18180 + periodSeconds: 5 + failureThreshold: 12 # 60s total window before k8s kills pod + livenessProbe: + httpGet: + path: /v1/health + port: 18180 + periodSeconds: 15 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /v1/health + port: 18180 + periodSeconds: 5 + failureThreshold: 3 + volumeMounts: + - name: wal + mountPath: /data/wal + - name: db + mountPath: /data/db + volumes: + - name: wal + persistentVolumeClaim: + claimName: stemedb-wal + - name: db + persistentVolumeClaim: + claimName: stemedb-db +``` + +**`service.yaml`** +```yaml +apiVersion: v1 +kind: Service +metadata: + name: stemedb-api + namespace: stemedb +spec: + selector: + app: stemedb-api + ports: + - port: 18180 + targetPort: 18180 + type: ClusterIP +``` + +**`ingress.yaml`** — Traefik terminates TLS; do NOT set `STEMEDB_TLS_CERT_PATH` +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: stemedb-api + namespace: stemedb + annotations: + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.middlewares: stemedb-ratelimit@kubernetescrd + cert-manager.io/cluster-issuer: letsencrypt-prod +spec: + ingressClassName: traefik + rules: + - host: stemedb.yourdomain.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: stemedb-api + port: + number: 18180 + tls: + - hosts: + - stemedb.yourdomain.com + secretName: stemedb-tls +``` + +**`middleware.yaml`** — Traefik rate limit (global, before app-level limits) +```yaml +apiVersion: traefik.containo.us/v1alpha1 +kind: Middleware +metadata: + name: ratelimit + namespace: stemedb +spec: + rateLimit: + average: 500 + burst: 1000 + period: 1s +``` + +**`external-secret.yaml`** — Pull from GCP Secret Manager via External Secrets Operator +```yaml +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: stemedb-secrets + namespace: stemedb +spec: + refreshInterval: 1h + secretStoreRef: + name: gcp-secret-manager # adjust to your cluster's SecretStore name + kind: ClusterSecretStore + target: + name: stemedb-secrets + data: + - secretKey: root-api-key + remoteRef: + key: stemedb-root-api-key +``` + +**`kustomization.yaml`** +```yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace.yaml + - pvc.yaml + - deployment.yaml + - service.yaml + - ingress.yaml + - middleware.yaml + - external-secret.yaml +``` + +**Deploy:** +```bash +kubectl apply -k deployments/k8s/base/stemedb/ +kubectl rollout status deployment/stemedb-api -n stemedb +curl https://stemedb.yourdomain.com/v1/health +``` + +--- + +## Phase 1 Checklist (Week 1 — Gate: First Project Can Connect) + +| # | Task | File(s) | Status | +|---|------|---------|--------| +| 1 | Wire auth router in `main.rs` | `crates/stemedb-api/src/main.rs` | ✅ Done | +| 2 | Add `STEMEDB_UNSAFE_SKIP_SIGNATURES` startup guard | `crates/stemedb-api/src/main.rs` | ✅ Done | +| 3 | Add bootstrap key seed from `STEMEDB_ROOT_API_KEY` | `crates/stemedb-api/src/main.rs` | ✅ Done | +| 4 | Add `--features aphoria` to Dockerfile | `Dockerfile` | ✅ Done | +| 5 | Create k8s manifests | `k3s-fleet/.../stemedb/` | ✅ Done | +| 6 | Write `scripts/provision-project-keys.sh` | `scripts/` | ✅ Done | +| 7 | Build + push Docker image | GAR | ⏳ Manual | +| 8 | Store root API key in GCP Secret Manager | GCP Console | ⏳ Manual | +| 9 | Add DNS record: `stemedb.threesix.ai` | Cloudflare | ⏳ Manual | +| 10 | Deploy to k3s + smoke test | k3s-fleet | ⏳ Pending | + +**Gate test (run after deploy):** +```bash +# Health check +curl https://stemedb.threesix.ai/v1/health + +# Unauthenticated write → 401 +curl -s -o /dev/null -w "%{http_code}" -X POST \ + https://stemedb.threesix.ai/v1/assert -H "Content-Type: application/json" -d '{}' + +# Authenticated write → 200/201 +curl -X POST https://stemedb.threesix.ai/v1/assert \ + -H "X-API-Key: $ROOT_KEY" -H "Content-Type: application/json" \ + -d '{"subject":"test/ping","predicate":"alive","value":true,"agent_id":"test"}' + +# Confirm key persists across restart +kubectl rollout restart deployment/stemedb-api -n stemedb +kubectl rollout status deployment/stemedb-api -n stemedb --timeout=120s +curl https://stemedb.threesix.ai/v1/health +``` + +--- + +## Phase 2: Production Hardening (Week 2 — Gate: 10 Projects) + +### Backup CronJob + +Create `deployments/k8s/base/stemedb/backup-cronjob.yaml`: + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: stemedb-backup + namespace: stemedb +spec: + schedule: "0 */6 * * *" # Every 6 hours + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + restartPolicy: OnFailure + containers: + - name: backup + image: rclone/rclone:latest + command: + - /bin/sh + - -c + - | + # WAL: copy all completed segments (all except the last, which is locked) + SEGMENTS=$(ls /data/wal/*.wal 2>/dev/null | sort | head -n -1) + if [ -n "$SEGMENTS" ]; then + rclone copy /data/wal/ gcs:$BACKUP_BUCKET/wal/ \ + --include "*.wal" --exclude "$(ls /data/wal/*.wal | sort | tail -n 1 | xargs basename)" + fi + # DB snapshot + rclone copy /data/db/ gcs:$BACKUP_BUCKET/db/$(date -u +%Y%m%dT%H%M%SZ)/ + echo "Backup complete" + env: + - name: BACKUP_BUCKET + value: stemedb-backups # your GCS bucket name + volumeMounts: + - name: wal + mountPath: /data/wal + readOnly: true + - name: db + mountPath: /data/db + readOnly: true + - name: rclone-config + mountPath: /config/rclone + volumes: + - name: wal + persistentVolumeClaim: + claimName: stemedb-wal + - name: db + persistentVolumeClaim: + claimName: stemedb-db + - name: rclone-config + secret: + secretName: rclone-gcs-config +``` + +**Test backup manually:** +```bash +kubectl create job --from=cronjob/stemedb-backup backup-test -n stemedb +kubectl logs -l job-name=backup-test -n stemedb -f +``` + +### Monitoring — Wire into Prometheus + +**`service-monitor.yaml`** +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: stemedb-api + namespace: stemedb + labels: + release: prometheus # must match your Prometheus Operator label selector +spec: + selector: + matchLabels: + app: stemedb-api + endpoints: + - port: "18180" + path: /metrics + interval: 15s +``` + +**`alert-rules.yaml`** — 6 alerts that fire first at 100-project scale +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: stemedb-alerts + namespace: stemedb + labels: + release: prometheus +spec: + groups: + - name: stemedb.rules + rules: + - alert: StemeDBPodNotRunning + expr: absent(up{job="stemedb-api"}) > 0 + for: 2m + labels: + severity: critical + annotations: + summary: "StemeDB pod is not running" + + - alert: StemeDBWALLatencyHigh + expr: histogram_quantile(0.99, rate(stemedb_wal_fsync_latency_seconds_bucket[5m])) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "WAL fsync p99 > 50ms — Longhorn I/O degradation likely" + + - alert: StemeDBDataVolumeNearlyFull + expr: | + kubelet_volume_stats_used_bytes{persistentvolumeclaim=~"stemedb-.*"} + / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"stemedb-.*"} + > 0.75 + for: 5m + labels: + severity: warning + annotations: + summary: "StemeDB PVC usage > 75% — resize requires downtime" + + - alert: StemeDBRateLimitSaturating + expr: rate(stemedb_http_requests_total{status="429"}[5m]) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "429 rate > 1/s — projects hitting rate limits" + + - alert: StemeDBErrorRateHigh + expr: | + rate(stemedb_http_requests_total{status=~"5.."}[5m]) + / rate(stemedb_http_requests_total[5m]) + > 0.01 + for: 5m + labels: + severity: critical + annotations: + summary: "5xx error rate > 1%" + + - alert: StemeDBOOMKilled + expr: | + kube_pod_container_status_last_terminated_reason{ + container="stemedb-api", + reason="OOMKilled" + } > 0 + labels: + severity: critical + annotations: + summary: "StemeDB container OOM killed — increase memory limit or find leak" +``` + +### NetworkPolicy + PDB + +**`network-policy.yaml`** +```yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: stemedb-api + namespace: stemedb +spec: + podSelector: + matchLabels: + app: stemedb-api + policyTypes: [Ingress, Egress] + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system # Traefik + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: monitoring # Prometheus + ports: + - port: 18180 + egress: + - ports: + - port: 53 # DNS + - port: 443 # GCP APIs (backup, secrets) +``` + +**`pdb.yaml`** +```yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: stemedb-api + namespace: stemedb +spec: + maxUnavailable: 0 + selector: + matchLabels: + app: stemedb-api +``` + +### Phase 2 Checklist + +| # | Task | File(s) | Est | +|---|------|---------|-----| +| 1 | Deploy backup CronJob | `deployments/k8s/base/stemedb/backup-cronjob.yaml` | 2h | +| 2 | Create GCS bucket + rclone Secret | GCP Console | 1h | +| 3 | Wire ServiceMonitor into Prometheus | `service-monitor.yaml` | 1h | +| 4 | Deploy 6 alert rules | `alert-rules.yaml` | 1h | +| 5 | Add NetworkPolicy + PDB | `network-policy.yaml`, `pdb.yaml` | 1h | +| 6 | Fix Longhorn PVC reclaim policy in DR runbook | `docs/operations/runbooks/disaster-recovery.md` | 30m | + +**Gate test:** Kill pod → `StemeDBPodNotRunning` fires within 2 min. Run backup job manually → GCS has files. + +--- + +## Phase 3: Scale to 100 Projects (Week 3) + +### Per-project key provisioning script + +Create `scripts/provision-project-keys.sh`: + +```bash +#!/usr/bin/env bash +set -euo pipefail + +# Usage: ./provision-project-keys.sh projects.txt +# projects.txt: one project name per line + +STEMEDB_URL="${STEMEDB_URL:-https://stemedb.yourdomain.com}" +ADMIN_KEY="${STEMEDB_ADMIN_KEY:?Set STEMEDB_ADMIN_KEY}" +PROJECTS_FILE="${1:?Usage: $0 }" + +while IFS= read -r project; do + [[ -z "$project" ]] && continue + + echo "Provisioning key for: $project" + + response=$(curl -sf -X POST "$STEMEDB_URL/v1/admin/api-keys" \ + -H "X-API-Key: $ADMIN_KEY" \ + -H "Content-Type: application/json" \ + -d "{\"label\":\"project-$project\",\"role\":\"write_agent\"}") + + key=$(echo "$response" | jq -r '.key') + + # Store in GCP Secret Manager + echo -n "$key" | gcloud secrets create "stemedb-key-$project" \ + --data-file=- \ + --replication-policy=automatic 2>/dev/null \ + || echo -n "$key" | gcloud secrets versions add "stemedb-key-$project" --data-file=- + + echo " Key stored: stemedb-key-$project" +done < "$PROJECTS_FILE" + +echo "Done." +``` + +**Onboarding runbook for each project:** +```bash +# 1. Retrieve key from Secret Manager +gcloud secrets versions access latest --secret="stemedb-key-" + +# 2. Update project's aphoria.toml +cat >> .aphoria/config.toml < +``` + +### Aphoria retry logic (P1) + +Projects run `aphoria scan --persist` locally and call the remote StemeDB. During StemeDB pod +restarts (Recreate strategy = brief downtime), Aphoria should retry rather than fail the commit. + +> This is a change to the `aphoria` binary, not to StemeDB. Add 3-attempt exponential backoff +> (2s, 4s, 8s) on HTTP 502/503 responses in the Aphoria HTTP client. + +### Phase 3 Checklist + +| # | Task | File(s) | Est | +|---|------|---------|-----| +| 1 | Run provision script for all 100 projects | `scripts/provision-project-keys.sh` | 2h | +| 2 | Write per-project onboarding runbook | `docs/operations/onboarding-project.md` | 1h | +| 3 | Add retry logic to `aphoria` HTTP client | `applications/aphoria/` | 2h | +| 4 | Split WAL + DB into two PVCs (migration) | `deployments/k8s/base/stemedb/` | 2h | + +**Gate test:** 5 projects scan simultaneously with their own keys → each isolated → one rate-limited → others unaffected. + +--- + +## What NOT to Build Yet + +| Item | Why not | +|------|---------| +| HPA | StemeDB is stateful (embedded KV). Cannot scale horizontally. | +| mTLS between pods | Single service. Add when you have a second service. | +| WAF | Body limits + Traefik rate limit + circuit breaker is sufficient for 100 known projects. | +| Per-tenant namespaces | Multiplies operational surface 100x. API key isolation is the right model. | +| Multi-region / clustering | 3-node k3s + Longhorn 2-replica is your HA story. P6 in roadmap. | +| PITR with WAL timestamps | 6-hour backup RPO is acceptable for pilot. Improve later. | +| Secrets rotation automation | Manual rotation via `/v1/admin/api-keys/:hash/rotate` is fine for 100 projects. | +| Distributed tracing | You have one service. WAL fsync histogram covers what you need. | + +--- + +## Open Questions (Resolve Week 1) + +1. **Image registry**: Which registry does k3s-fleet already use? Check `get_service_config()` in `deploy-stack.sh`. +2. **Bootstrap key API**: Verify exact method signatures on `ApiKeyStore` before writing the seed logic in `main.rs`. +3. **Aphoria scan model**: Do projects run `aphoria scan` locally (calling remote StemeDB) or as a k8s Job? Determines where retry logic lives. +4. **GCS bucket**: Does one exist for backups, or does it need to be created? +5. **CORS**: All router variants in `routers.rs` use `allow_origin(Any)`. Production needs this restricted to Traefik's internal domain. Add `STEMEDB_ALLOWED_ORIGINS` env var support. + +--- + +## Risk Register + +| Risk | Likelihood | Mitigation | +|------|-----------|-----------| +| Longhorn fsync latency at 100-project burst | Medium | Pin pod + volume to same node (Phase 3), `dataLocality: bestEffort`; monitor WAL p99 from day 1 | +| Single-instance downtime during deploys | High (Recreate strategy) | Startup probe + maintenance window policy + Aphoria retry logic | +| Fresh PVC after disaster = 100 project keys lost | Low but catastrophic | Bootstrap key seed in `main.rs` + `provision-project-keys.sh` idempotent re-run | +| Image registry blocker | High if unresolved | Resolve Day 1; entire deployment depends on it | +| CORS vulnerability | Medium | `allow_origin(Any)` in all router variants; fix before public launch | + +--- + +## Directory Structure After Phase 1 + +``` +deployments/ +└── k8s/ + └── base/ + └── stemedb/ + ├── kustomization.yaml + ├── namespace.yaml + ├── pvc.yaml + ├── deployment.yaml + ├── service.yaml + ├── ingress.yaml + ├── middleware.yaml + └── external-secret.yaml + +scripts/ +└── provision-project-keys.sh (new) +``` + +After Phase 2, add to `deployments/k8s/base/stemedb/`: +- `backup-cronjob.yaml` +- `service-monitor.yaml` +- `alert-rules.yaml` +- `network-policy.yaml` +- `pdb.yaml` + +--- + +*Last updated: 2026-03-02 — Week 1 code changes complete; 3 manual steps remain before deploy* diff --git a/docs/operations/reference-architecture/three-node-cluster.md b/docs/operations/reference-architecture/three-node-cluster.md index 8f21b4e..6c12d1f 100644 --- a/docs/operations/reference-architecture/three-node-cluster.md +++ b/docs/operations/reference-architecture/three-node-cluster.md @@ -4,14 +4,64 @@ **✅ RECOMMENDED FOR PRODUCTION** - Survives single node failure, automatic replication +> **Implementation status:** The cluster crates (`stemedb-cluster`, `stemedb-sync`, `stemedb-rpc`) are +> implemented. The k3s/Longhorn deployment path is the current production path (see Phase 2 section). +> Bare-metal deployment via config.toml is aspirational and not yet wired to the binary. + +--- + +## Architectural Rationale: Why Gossip, Not Raft + +This section exists because the wrong answer here — "just add Raft" — is commonly assumed and actively harmful for StemeDB's workload. + +### The append-only insight + +Most databases need Raft because writes can **conflict**: two nodes update the same row, and a leader must serialize them. StemeDB doesn't have this problem. Every assertion receives a **BLAKE3 content hash** as its ID. If two nodes both write the same assertion independently, they produce the same hash → the same key → identical data. There is nothing to conflict on. + +This means the assertion write path is naturally **CRDT-like**: the system needs every node to eventually receive every assertion, but doesn't need consensus on which assertion "wins." Gossip + Merkle anti-entropy handles this correctly and efficiently. Raft would add leader-election overhead and write latency for zero benefit on the data path. + +### What actually needs coordination + +Not everything in StemeDB is append-only. Mutable state requires stronger guarantees: + +| State | Type | Replication strategy | Why | +|-------|------|---------------------|-----| +| Assertions | Append-only (CRDT) | Gossip + Merkle anti-entropy | No conflicts possible by design | +| API keys | Mutable | Synchronous broadcast or coordinator | Revoked key must not be reusable | +| Quota / meter counts | Mutable counter | Coordinator node or bounded staleness | Double-spend if two nodes both allow | +| Circuit breaker state | Mutable | Synchronous broadcast | Trip/reset must propagate atomically | +| Epochs | Append-only, ordered | Gossip is sufficient | Creation order captured in content | + +**Practical implication:** Admin operations (key management, quota changes) should be routed to a designated coordinator and synchronously acknowledged. Assertion writes and reads can go to any node with no coordination. + +### Read scaling + +Because Lens resolution is pure local computation on indexed data, **any node can serve any read with no coordination**. Reads scale horizontally to N nodes without inter-node communication. + +### Write scaling (assertions) + +Because assertion writes are idempotent by content hash, **any node can accept any write**. The cluster gateway (`stemedb-cluster`, port 18181) routes writes by subject hash shard prefix — each node owns a partition of the key space. Merkle anti-entropy ensures all nodes converge. Write throughput scales linearly with node count. + --- ## Overview -The three-node cluster provides high availability through automatic replication (factor 2) and CRDT-based eventual consistency. Survives single node failure with <5 minute recovery time. +The three-node cluster provides high availability through automatic replication (factor 2) and gossip-based eventual consistency for assertions. Survives single node failure with <5 minute recovery time. ``` -[See: diagrams/three-node.txt for ASCII diagram] + ┌──────────────────────────────┐ +Internet ──→ LB → │ Cluster Gateway (port 18181) │ + │ Reads: round-robin any node │ + │ Writes: route by shard prefix│ + └──────┬────────────┬───────────┘ + │ │ + ┌──────────▼──┐ ┌────▼─────────┐ ┌──────────────┐ + │ Node A │ │ Node B │ │ Node C │ + │ Shard 0-84 │ │ Shard 85-169 │ │ Shard 170-255│ + │ WAL + KV │ │ WAL + KV │ │ WAL + KV │ + └──────┬──────┘ └────┬──────────┘ └───────┬──────┘ + │ Merkle sync (gossip, port 18183) │ + └─────────────────────────────────────────┘ ``` --- @@ -92,7 +142,49 @@ Each node runs the full stack: --- -## Deployment Steps +## k3s Deployment Path (Current — Longhorn + StatefulSet) + +> This is the **current production deployment path** for k3s-fleet. The bare-metal steps below +> are for non-k8s environments and use a config.toml interface that is not yet wired to the binary. + +For each cluster node on k3s, deploy a separate StatefulSet with its own Longhorn PVC: + +``` +k3s-fleet/deployments/k8s/base/stemedb/ +├── stemedb.yaml # Node A (current single-node — Phase 1) +├── stemedb-b.yaml # Node B (Phase 2 — add when ready to scale reads) +├── stemedb-c.yaml # Node C (Phase 2) +└── kustomization.yaml +``` + +**Critical k3s constraints:** +- Each node needs its own `ReadWriteOnce` Longhorn PVC — embedded KV (fjall) cannot share a volume +- Use `strategy: Recreate` on each Deployment (not RollingUpdate) — RWO PVC + 2 pods = deadlock +- Cluster gateway (port 18181) must be exposed as a separate Service for inter-node routing +- Use `topologySpreadConstraints` to ensure nodes land on different k3s worker hosts + +**Phase 2 read-replica k8s addition (when ready):** +```yaml +# Add to stemedb-b.yaml — identical to stemedb.yaml except: +# - Different node ID env var +# - STEMEDB_CLUSTER_SEEDS pointing to Node A's gateway ClusterIP +# - Its own PVC claim +env: + - name: STEMEDB_NODE_ID + value: "node-b" + - name: STEMEDB_CLUSTER_SEEDS + value: "stemedb-api.stemedb.svc:18181" +``` + +**See:** [k8s Deploy Roadmap](../deployment/k8s-deploy-roadmap.md) for the phased rollout plan. + +--- + +## Bare-Metal Deployment Steps + +> ⚠️ The config.toml cluster configuration shown here is **planned** and not yet wired to the +> `stemedb-api` binary. Current binary configuration uses environment variables only. This section +> documents the intended interface for when cluster config is implemented. ### Prerequisites @@ -234,27 +326,29 @@ scrape_configs: ### Two Nodes Fail (Catastrophic) -**Impact:** Read-only mode (no writes accepted) +**Impact:** Single surviving node continues accepting assertion writes and serving reads. Admin operations (key management) are degraded — single-node has no peer to synchronously acknowledge. **Recovery:** -1. Manual intervention required -2. Restore third node or add new node -3. Trigger Merkle sync -4. Resume writes when quorum restored +1. Manual intervention required to restore cluster +2. Restore failed nodes or add new nodes +3. Trigger Merkle sync (`/cluster/sync` endpoint) after nodes rejoin +4. Admin operations fully restored when cluster membership is repaired -**RTO:** 30 minutes - 2 hours (manual) -**Data loss:** Potential (depends on which nodes failed) +**RTO:** 30 minutes - 2 hours (manual restore) +**Data loss:** Assertion writes continue on surviving node and merge on recovery. Recent admin operations (key revocations) issued during degraded window may not have propagated — audit after recovery. ### Network Partition -**Impact:** Split brain possible (both sides accept writes) +**Impact:** +- **Assertion writes:** Both partitions accept writes independently. This is safe — same content → same BLAKE3 hash, different content → different hashes that merge cleanly after partition heals. +- **Admin operations (API key revocations, quota changes):** A revocation issued to one partition is invisible to the other until partition heals. A revoked key may still be honored by nodes in the other partition during the partition window. **Recovery:** -- CRDT merge resolves conflicts automatically -- Lenses (Recency, Authority) handle conflicts at read time -- No manual intervention needed after partition heals +- Merkle anti-entropy detects and fills gaps automatically when partition heals +- Lenses (Recency, Authority) handle any assertion-level divergence at read time +- Admin state re-synchronizes via coordinator broadcast on reconnect -**Data loss:** None (CRDTs preserve all writes) +**Data loss:** None for assertions (all writes from both partitions preserved and merged). ### Replication Lag @@ -284,9 +378,10 @@ scrape_configs: **Target:** 1,000 assertions/sec sustained -- Each node accepts writes -- Replication happens asynchronously -- No coordination required (CRDTs) +- Each node accepts assertion writes (routed by cluster gateway via shard prefix) +- Replication happens asynchronously via Merkle gossip +- No coordination required for assertions (CRDT-safe by content hash) +- Admin writes (API keys, quota changes) route to coordinator and require synchronous acknowledgment — expect higher latency on those operations (~50ms vs ~5ms for assertions) ### Replication Lag @@ -384,6 +479,21 @@ Compare to single-node ($87/month): 5x cost for 10x availability --- +## Scaling Path Beyond Three Nodes + +Three nodes on k3s handles the 100-project target. For mass traffic beyond that, the scaling path is incremental — not a rearchitecture: + +| Phase | Target | Work type | What changes | +|-------|--------|-----------|-------------| +| **Phase 1** | 1 node, 100 projects | ✅ Done | Single Deployment, Longhorn PVC, auth wired | +| **Phase 2** | 3 nodes, read-scaled | Ops-heavy | Add 2 read replicas as separate Deployments; cluster gateway routes reads round-robin | +| **Phase 3** | 3 nodes, write-sharded | Code-heavy | Gateway enforces shard ownership; each node owns ⅓ of subject hash space; reads still any-node | +| **Phase 4** | N nodes, coordinator | Code-heavy | Designate one node (or small 3-node Raft group) exclusively for mutable admin state; assertion nodes are pure data | + +**What you do NOT need:** Raft on the assertion write path. The append-only, content-addressed design means there are no write conflicts to serialize. Raft belongs only on the mutable admin state path (Phase 4), which is a small fraction of total traffic. + +--- + ## Related Documentation - [Single-Node Pilot](./single-node-pilot.md) - Simpler architecture @@ -394,4 +504,4 @@ Compare to single-node ($87/month): 5x cost for 10x availability --- -**Last Updated:** 2026-02-11 +**Last Updated:** 2026-03-02 — Added architectural rationale (gossip vs Raft), k3s deployment path, fixed mutable-state coordination notes, added 4-phase scaling table diff --git a/scripts/provision-project-keys.sh b/scripts/provision-project-keys.sh new file mode 100755 index 0000000..56165c7 --- /dev/null +++ b/scripts/provision-project-keys.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# provision-project-keys.sh — create per-project API keys and store in GCP Secret Manager +# +# Usage: STEMEDB_ADMIN_KEY=steme_live_... ./scripts/provision-project-keys.sh projects.txt +# projects.txt: one project slug per line (e.g. "my-app", "another-project") +# +# Requires: curl, jq, gcloud (authenticated) + +set -euo pipefail + +STEMEDB_URL="${STEMEDB_URL:-https://stemedb.threesix.ai}" +ADMIN_KEY="${STEMEDB_ADMIN_KEY:?Set STEMEDB_ADMIN_KEY to a root/admin API key}" +PROJECTS_FILE="${1:?Usage: $0 }" +GCP_PROJECT="${GCP_PROJECT:-orchard9}" + +echo "Provisioning keys against: $STEMEDB_URL" +echo "GCP project for secrets: $GCP_PROJECT" +echo "" + +while IFS= read -r project; do + [[ -z "$project" || "$project" =~ ^# ]] && continue + + echo "→ Provisioning: $project" + + response=$(curl -sf -X POST "$STEMEDB_URL/v1/admin/api-keys" \ + -H "X-API-Key: $ADMIN_KEY" \ + -H "Content-Type: application/json" \ + -d "{\"environment\":\"live\",\"label\":\"project-$project\",\"role\":\"write_agent\"}") \ + || { echo " ERROR: API call failed for $project"; continue; } + + key=$(echo "$response" | jq -r '.key') + + if [[ -z "$key" || "$key" == "null" ]]; then + echo " ERROR: no key returned for $project" + continue + fi + + secret_name="stemedb-key-$project" + if gcloud secrets describe "$secret_name" --project="$GCP_PROJECT" &>/dev/null; then + echo -n "$key" | gcloud secrets versions add "$secret_name" \ + --project="$GCP_PROJECT" --data-file=- + echo " Updated existing secret: $secret_name" + else + echo -n "$key" | gcloud secrets create "$secret_name" \ + --project="$GCP_PROJECT" \ + --replication-policy=automatic \ + --data-file=- + echo " Created new secret: $secret_name" + fi +done < "$PROJECTS_FILE" + +echo "" +echo "Done. Projects retrieve their keys with:" +echo " gcloud secrets versions access latest --secret=stemedb-key- --project=$GCP_PROJECT"