// stemedb/community/scripts/seed-claims.ts

/**
 * Seed script for populating StemeDB with demo claims.
 *
 * This script:
 * 1. Waits for the API to be healthy
 * 2. Registers source documents with human-readable labels
 * 3. Creates assertions that match the mock data in page.tsx
 * 4. Verifies the data is queryable via SkepticLens
 *
 * Usage:
 *   npx tsx scripts/seed-claims.ts
 *
 * Environment:
 *   STEMEDB_API_URL - API base URL (default: http://127.0.0.1:18180)
 */

import * as ed from "@noble/ed25519";
import { sha512 } from "@noble/hashes/sha512";

// Install a synchronous SHA-512 implementation on the ed25519 library's `etc` hooks.
ed.etc.sha512Sync = (...chunks) => {
  const joined = ed.etc.concatBytes(...chunks);
  return sha512(joined);
};

/**
 * Base URL of the StemeDB API, without a trailing slash.
 *
 * Read from STEMEDB_API_URL; an unset or empty variable falls back to the
 * local default (hence `||` rather than `??`). Trailing slashes are removed
 * because every request appends a path beginning with "/", so a value such as
 * "http://host:18180/" would otherwise produce "//v1/health".
 */
const API_URL = (process.env.STEMEDB_API_URL || "http://127.0.0.1:18180").replace(/\/+$/, "");

// ============================================================================
// Types
// ============================================================================

/** A signing identity used to author seeded assertions. */
interface Agent {
  /** Label for the agent; printed in the script's log output. */
  name: string;
  /** Seed string mixed into the private-key derivation in createAgent. */
  seed: string;
  /** 32-byte Ed25519 private key derived deterministically from seed and name. */
  privateKey: Uint8Array;
  /** Ed25519 public key for privateKey; its hex form is sent as the assertion's agent_id. */
  publicKey: Uint8Array;
}

/** A source document that seeded assertions cite. */
interface Source {
  /** Hex identifier sent as `hash` on registration and as `source_hash` on assertions; derived from the label at startup. */
  hash: string;
  /** Human-readable title of the source. */
  label: string;
  /** Numeric tier; used as the key into SOURCE_CLASS_MAP to choose the assertion's source_class. */
  tier: number;
  /** Optional link to the source document. */
  url?: string;
}

/** All seeded claims about one subject/predicate pair. */
interface ClaimSet {
  subject: string;
  predicate: string;
  /** One entry per assertion to create for this subject/predicate. */
  claims: {
    /** Text sent as the assertion's object value. */
    value: string;
    /** Confidence value sent with the assertion. */
    confidence: number;
    /** Index into SOURCES of the cited source. */
    sourceIndex: number;
    /** Index into the agents array built in main() of the signing agent. */
    agentIndex: number;
  }[];
}

// ============================================================================
// Helpers
// ============================================================================

/**
 * Encodes bytes as a lowercase hexadecimal string, two digits per byte.
 *
 * @param bytes - Bytes to encode.
 * @returns Hex string of length `2 * bytes.length`.
 */
function toHex(bytes: Uint8Array): string {
  let hex = "";
  for (const byte of bytes) {
    // Values below 0x10 render as a single digit, so left-pad them.
    hex += (byte < 0x10 ? "0" : "") + byte.toString(16);
  }
  return hex;
}

/**
 * Folds a string into 32 bytes using a simple XOR/add mix over its UTF-8 bytes.
 *
 * Despite the name, this is NOT SHA-256 and is not cryptographically secure.
 * It only has to be deterministic, so that re-running the seed script yields
 * the same agent keys and source hashes.
 *
 * @param data - Text to fold.
 * @returns A 32-byte digest (all zeros for the empty string).
 */
function sha256(data: string): Uint8Array {
  const WIDTH = 32;
  const input = new TextEncoder().encode(data);
  const digest = new Uint8Array(WIDTH);

  input.forEach((byte, position) => {
    const slot = position % WIDTH;
    const next = (position + 1) % WIDTH;
    // XOR the byte into its own slot, then add it (mod 256) into the following slot.
    digest[slot] ^= byte;
    digest[next] = (digest[next] + byte) % 256;
  });

  return digest;
}

/**
 * Polls the API health endpoint until it responds successfully.
 *
 * @param maxRetries - Maximum number of health checks to attempt.
 * @param delayMs - Pause between attempts, in milliseconds.
 * @throws Error if no attempt succeeds. The message includes the most recent
 *   failure (HTTP status, or the network/parse error) so the cause is visible.
 */
async function waitForHealth(maxRetries = 30, delayMs = 2000): Promise<void> {
  console.log(`Waiting for API at ${API_URL}...`);

  // Remember why the latest attempt failed so the final error is actionable.
  let lastFailure = "no attempts made";

  for (let i = 0; i < maxRetries; i++) {
    try {
      const response = await fetch(`${API_URL}/v1/health`);
      if (response.ok) {
        const data = await response.json();
        console.log(`API is healthy: v${data.version}, ${data.assertions_count} assertions`);
        return;
      }
      lastFailure = `HTTP ${response.status}`;
    } catch (error) {
      // Connection errors and unparsable bodies are treated as "not ready yet": keep polling.
      if (error instanceof Error) {
        // The underlying reason (when present) is carried on `cause`; include it if it is an Error.
        const cause: unknown = (error as Error & { cause?: unknown }).cause;
        lastFailure = cause instanceof Error ? `${error.message}: ${cause.message}` : error.message;
      } else {
        lastFailure = String(error);
      }
    }

    // No point sleeping after the final attempt.
    if (i < maxRetries - 1) {
      console.log(`  Retry ${i + 1}/${maxRetries}...`);
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  throw new Error(`API not healthy after ${maxRetries} retries (last failure: ${lastFailure})`);
}

/**
 * Builds an agent whose Ed25519 key pair is derived deterministically from
 * the given name and seed, so repeated runs produce the same keys.
 *
 * @param name - Label for the agent.
 * @param seed - Seed string mixed into the key derivation.
 * @returns The agent with its private and public keys populated.
 */
async function createAgent(name: string, seed: string): Promise<Agent> {
  // The 32-byte digest of the seed material is used directly as the private key.
  const privateKey = sha256(`agent-seed-${seed}-${name}`);

  return {
    name,
    seed,
    privateKey,
    publicKey: await ed.getPublicKeyAsync(privateKey),
  };
}

/**
 * Signs an assertion's subject/predicate pair with the agent's private key.
 *
 * The signed bytes are the UTF-8 encoding of `subject:predicate`. The returned
 * timestamp (Unix seconds at call time) is NOT part of the signed bytes.
 * NOTE(review): confirm this payload matches what the API verifies.
 *
 * @param agent - Agent whose private key signs the message.
 * @param subject - Assertion subject.
 * @param predicate - Assertion predicate.
 * @returns Hex-encoded signature and the timestamp captured before signing.
 */
async function signAssertion(
  agent: Agent,
  subject: string,
  predicate: string
): Promise<{ signature: string; timestamp: number }> {
  const timestamp = Math.floor(Date.now() / 1000);
  const payload = new TextEncoder().encode(`${subject}:${predicate}`);
  const rawSignature = await ed.signAsync(payload, agent.privateKey);

  return { signature: toHex(rawSignature), timestamp };
}

/**
 * Derives the hex identifier for a source from its label.
 *
 * @param label - Human-readable source label.
 * @returns 64-character hex digest of `source-<label>`.
 */
function generateSourceHash(label: string): string {
  return toHex(sha256("source-" + label));
}

/**
 * Registers a source document with the API.
 *
 * Failures are logged as warnings rather than thrown. A 409 response means
 * the source already exists and is treated as success.
 *
 * @param source - Source to register; its hash must already be populated.
 */
async function registerSource(source: Source): Promise<void> {
  const { hash, label, tier, url } = source;

  const response = await fetch(`${API_URL}/v1/sources`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ hash, label, tier, url }),
  });

  // Success, or the source was registered by an earlier run.
  if (response.ok || response.status === 409) {
    return;
  }

  const detail = await response.text();
  console.warn(`  Warning: Failed to register source ${label}: ${detail}`);
}

/**
 * Signs and submits a single text assertion to the API.
 *
 * @param agent - Agent that signs the assertion.
 * @param subject - Assertion subject.
 * @param predicate - Assertion predicate.
 * @param value - Text value sent as the assertion's object.
 * @param confidence - Confidence sent with the assertion.
 * @param sourceHash - Hash of the cited source.
 * @param sourceClass - Source class label sent as `source_class`.
 * @returns The `hash` field of the API response, or null if the request was
 *   rejected (a warning is logged in that case).
 */
async function createAssertion(
  agent: Agent,
  subject: string,
  predicate: string,
  value: string,
  confidence: number,
  sourceHash: string,
  sourceClass: string
): Promise<string | null> {
  const signed = await signAssertion(agent, subject, predicate);

  const payload = JSON.stringify({
    subject,
    predicate,
    object: { type: "Text", value },
    confidence,
    source_hash: sourceHash,
    source_class: sourceClass,
    signatures: [
      {
        agent_id: toHex(agent.publicKey),
        signature: signed.signature,
        timestamp: signed.timestamp,
        version: 1,
      },
    ],
  });

  const response = await fetch(`${API_URL}/v1/assert`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
  });

  if (response.ok) {
    const created = await response.json();
    return created.hash;
  }

  // Best-effort: report the rejection and let the caller continue with the next claim.
  const detail = await response.text();
  console.warn(`  Warning: Failed to create assertion: ${detail}`);
  return null;
}

/**
 * Queries the skeptic endpoint for a subject/predicate pair and logs a summary.
 *
 * This is a best-effort check: a failed request, an unparsable body, or a body
 * without the expected `claims` array and numeric `conflict_score` is logged
 * as a warning instead of aborting the script.
 *
 * @param subject - Subject to query.
 * @param predicate - Predicate to query.
 */
async function verifySkeptic(subject: string, predicate: string): Promise<void> {
  const url = `${API_URL}/v1/skeptic?subject=${encodeURIComponent(subject)}&predicate=${encodeURIComponent(predicate)}&include_source_metadata=true`;
  const response = await fetch(url);

  if (!response.ok) {
    console.warn(`  Warning: Skeptic query failed for ${subject}/${predicate}`);
    return;
  }

  // Treat the body as untrusted: parse defensively and validate before dereferencing.
  let body: unknown;
  try {
    body = await response.json();
  } catch {
    body = undefined;
  }

  const fields = (typeof body === "object" && body !== null ? body : {}) as {
    status?: unknown;
    claims?: unknown;
    conflict_score?: unknown;
  };
  const { status, claims, conflict_score: conflictScore } = fields;

  if (!Array.isArray(claims) || typeof conflictScore !== "number") {
    console.warn(`  Warning: Unexpected skeptic response for ${subject}/${predicate}`);
    return;
  }

  console.log(
    `  Verified: ${subject}/${predicate} -> ${String(status)} (${claims.length} claims, conflict=${conflictScore.toFixed(2)})`
  );
}

// ============================================================================
// Data Definitions (matching page.tsx mock data)
// ============================================================================

/**
 * Source documents cited by the seeded claims.
 *
 * Order matters: claims refer to entries by their position in this array
 * (`sourceIndex`). Each `hash` starts empty and is filled in from the label
 * immediately after this definition.
 */
const SOURCES: Source[] = [
  {
    hash: "", // Will be generated
    label: "PostgreSQL 16 Documentation - DDL Constraints",
    tier: 0,
    url: "https://www.postgresql.org/docs/current/ddl-constraints.html",
  },
  {
    hash: "",
    label: "Snodgrass - Developing Time-Oriented Database Applications",
    tier: 1,
    url: "https://www2.cs.arizona.edu/~rts/tdbbook.pdf",
  },
  {
    hash: "",
    label: "Shapiro et al. - Conflict-free Replicated Data Types (SSS 2011)",
    tier: 1,
    url: "https://hal.inria.fr/inria-00609399/document",
  },
  {
    hash: "",
    label: "Almeida et al. - Delta State Replicated Data Types (2018)",
    tier: 1,
    url: "https://arxiv.org/abs/1603.01529",
  },
  {
    hash: "",
    label: "StemeDB Design Notes - Why Not CRDTs",
    tier: 3,
  },
  {
    hash: "",
    label: "BLAKE3 Specification - One Function, Fast Everywhere",
    tier: 0,
    url: "https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf",
  },
  {
    hash: "",
    label: "StemeDB Source Code - lens/recency.rs",
    tier: 0,
    url: "https://github.com/orchard9/stemedb/blob/main/crates/stemedb-lens/src/recency.rs",
  },
  {
    hash: "",
    label: "GitHub Issue #142 - Optimize RecencyLens",
    tier: 4,
    url: "https://github.com/orchard9/stemedb/issues/142",
  },
  {
    hash: "",
    label: "Kleppmann - Designing Data-Intensive Applications",
    tier: 1,
    url: "https://dataintensive.net/",
  },
  {
    hash: "",
    label: "StemeDB Implementation Notes",
    tier: 3,
  },
  {
    hash: "",
    label: "Kamvar et al. - The EigenTrust Algorithm (WWW 2003)",
    tier: 1,
    url: "https://nlp.stanford.edu/pubs/eigentrust.pdf",
  },
  {
    hash: "",
    label: "Medical AI Safety Working Group - Trust Calibration Report",
    tier: 2,
  },
];

// Fill in each source's hash, derived deterministically from its label.
SOURCES.forEach((entry) => {
  entry.hash = generateSourceHash(entry.label);
});

/**
 * Maps a source's numeric tier to the `source_class` label sent with each
 * assertion that cites it.
 */
const SOURCE_CLASS_MAP: Record<number, string> = Object.fromEntries([
  [0, "Regulatory"],
  [1, "Clinical"],
  [2, "Observational"],
  [3, "Expert"],
  [4, "Community"],
  [5, "Anecdotal"],
]);

/**
 * Claims to seed, grouped by subject/predicate.
 *
 * `sourceIndex` is a position in SOURCES and `agentIndex` is a position in the
 * agents array created in main() (0 = regulatory_authority,
 * 1 = clinical_researcher, 2 = expert_opinion, 3 = community_voice).
 */
const CLAIM_SETS: ClaimSet[] = [
  // Single value claim (agreed)
  {
    subject: "Episteme",
    predicate: "storage_model",
    claims: [
      {
        value: "Single value per key is the dominant paradigm",
        confidence: 0.92,
        sourceIndex: 0, // PostgreSQL docs
        agentIndex: 0,
      },
      {
        value: "Bitemporal and event stores are exceptions",
        confidence: 0.78,
        sourceIndex: 1, // Snodgrass
        agentIndex: 1,
      },
    ],
  },

  // CRDT claim (contested)
  {
    subject: "CRDT",
    predicate: "replica_assumption",
    claims: [
      {
        value: "CRDTs assume replicas are authoritative copies of same data",
        confidence: 0.94,
        sourceIndex: 2, // Shapiro
        agentIndex: 0,
      },
      {
        value: "CRDTs can model multi-source disagreement with delta states",
        confidence: 0.76,
        sourceIndex: 3, // Almeida
        agentIndex: 1,
      },
      {
        value: "CRDTs don't preserve provenance of conflicting sources",
        confidence: 0.65,
        sourceIndex: 4, // StemeDB notes
        agentIndex: 2,
      },
    ],
  },

  // Content addressing (unanimous)
  {
    subject: "Episteme",
    predicate: "content_addressing",
    claims: [
      {
        value: "Content-addressing provides deduplication, integrity, and efficient comparison",
        confidence: 0.96,
        sourceIndex: 5, // BLAKE3 spec
        agentIndex: 0,
      },
    ],
  },

  // Recency complexity (agreed)
  {
    subject: "RecencyLens",
    predicate: "complexity",
    claims: [
      {
        value: "RecencyLens is O(n) where n = candidates",
        confidence: 0.88,
        sourceIndex: 6, // Source code
        agentIndex: 0,
      },
      {
        value: "Could be O(log n) with a heap-based implementation",
        confidence: 0.52,
        sourceIndex: 7, // GitHub issue
        agentIndex: 3,
      },
    ],
  },

  // Storage growth (unanimous)
  {
    subject: "Episteme",
    predicate: "storage_growth",
    claims: [
      {
        value: "Append-only storage grows without bound",
        confidence: 0.97,
        sourceIndex: 8, // Kleppmann
        agentIndex: 0,
      },
    ],
  },

  // Trust parameters (contested)
  {
    subject: "EigenTrust",
    predicate: "parameters",
    claims: [
      {
        value: "Trust parameters (0.5 start, +0.05/-0.1) are reasonable heuristics",
        confidence: 0.72,
        sourceIndex: 9, // Implementation notes
        agentIndex: 2,
      },
      {
        value: "EigenTrust provides theoretical foundation for trust propagation",
        confidence: 0.89,
        sourceIndex: 10, // Kamvar paper
        agentIndex: 0,
      },
      {
        value: "Heuristics without formal verification are dangerous for high-stakes domains",
        confidence: 0.61,
        sourceIndex: 11, // Medical AI Safety
        agentIndex: 1,
      },
    ],
  },
];

// ============================================================================
// Main
// ============================================================================

/**
 * Runs the seeding workflow end to end:
 * wait for the API, create the signing agents, register every source,
 * submit every claim as an assertion, pause briefly, then query each
 * subject/predicate through the skeptic endpoint.
 *
 * Individual registration/assertion failures are logged as warnings by the
 * helpers and do not stop the run; a shortfall is summarised after the loop.
 *
 * @throws Error if the API never becomes healthy, or if a claim refers to a
 *   source or agent index that does not exist in the seed data.
 */
async function main(): Promise<void> {
  console.log("StemeDB Seed Script");
  console.log("===================\n");

  // Wait for API
  await waitForHealth();
  console.log();

  // Create agents with deterministic keys
  console.log("Creating agents...");
  const agents: Agent[] = await Promise.all([
    createAgent("regulatory_authority", "fda-agent-seed-001"),
    createAgent("clinical_researcher", "clinical-agent-seed-002"),
    createAgent("expert_opinion", "expert-agent-seed-003"),
    createAgent("community_voice", "community-agent-seed-004"),
  ]);

  for (const agent of agents) {
    console.log(`  ${agent.name}: ${toHex(agent.publicKey).slice(0, 16)}...`);
  }
  console.log();

  // Register sources
  console.log("Registering sources...");
  for (const source of SOURCES) {
    await registerSource(source);
    console.log(`  ${source.label.slice(0, 50)}...`);
  }
  console.log();

  // Create assertions
  console.log("Creating assertions...");
  let totalAssertions = 0;
  let attemptedAssertions = 0;

  for (const claimSet of CLAIM_SETS) {
    console.log(`\n  ${claimSet.subject}/${claimSet.predicate}:`);

    for (const claim of claimSet.claims) {
      const source = SOURCES[claim.sourceIndex];
      const agent = agents[claim.agentIndex];

      // A bad index is a seed-data mistake; name the offending entry instead of
      // letting it surface later as an opaque "cannot read properties of undefined".
      if (source === undefined || agent === undefined) {
        throw new Error(
          `Invalid seed data for ${claimSet.subject}/${claimSet.predicate}: ` +
            `sourceIndex=${claim.sourceIndex}, agentIndex=${claim.agentIndex}`
        );
      }

      const sourceClass = SOURCE_CLASS_MAP[source.tier];

      attemptedAssertions++;
      const hash = await createAssertion(
        agent,
        claimSet.subject,
        claimSet.predicate,
        claim.value,
        claim.confidence,
        source.hash,
        sourceClass
      );

      if (hash) {
        totalAssertions++;
        console.log(`    + "${claim.value.slice(0, 40)}..." (${sourceClass})`);
      }
    }
  }

  console.log(`\nCreated ${totalAssertions} assertions.`);
  if (totalAssertions < attemptedAssertions) {
    // Make partial failure visible in the summary, not only in the per-claim warnings above.
    console.warn(
      `  Warning: ${attemptedAssertions - totalAssertions} of ${attemptedAssertions} assertions were not created.`
    );
  }
  console.log();

  // Wait a moment for materialization
  console.log("Waiting for materialization...");
  await new Promise((resolve) => setTimeout(resolve, 2000));
  console.log();

  // Verify with Skeptic
  console.log("Verifying via SkepticLens...");
  for (const claimSet of CLAIM_SETS) {
    await verifySkeptic(claimSet.subject, claimSet.predicate);
  }

  console.log("\nDone! Seeded claims are ready for the Community app.");
}

// Entry point: run the seed workflow; on any unhandled failure, log it and exit with status 1.
main().catch((failure) => {
  console.error("Seed failed:", failure);
  process.exit(1);
});