stemedb/community/scripts/seed-whitepaper.ts

#!/usr/bin/env npx tsx
/**
 * Seed whitepaper claims to StemeDB.
 *
 * This script:
 * 1. Uses the hand-curated claims hardcoded in WHITEPAPER_CLAIMS below
 *    (data/whitepaper-sections.json is not read by this script at present)
 * 2. Creates a single agent with a deterministic key
 * 3. Registers each distinct source, then submits one assertion per claim to StemeDB
 *
 * Usage:
 *   npx tsx scripts/seed-whitepaper.ts
 *   npx tsx scripts/seed-whitepaper.ts --dry-run
 *
 * Environment:
 *   STEMEDB_API_URL - API base URL (default: http://127.0.0.1:18180)
 */

import * as ed from "@noble/ed25519";
import { sha512 } from "@noble/hashes/sha512";
import { createHash } from "crypto";
import { readFileSync } from "fs";
import { join } from "path";

// Configure ed25519 to use sha512
ed.etc.sha512Sync = (...m) => sha512(ed.etc.concatBytes(...m));

const API_URL = process.env.STEMEDB_API_URL || "http://127.0.0.1:18180";

// ============================================================================
// Types
// ============================================================================

/** A signing identity used to author assertions. */
interface Agent {
  /** Agent name; createAgent also derives the key material from it. */
  name: string;
  /** Private key bytes handed to ed.signAsync when signing. */
  privateKey: Uint8Array;
  /** Public key from ed.getPublicKeyAsync; its hex form is sent as `agent_id`. */
  publicKey: Uint8Array;
}

/** Source classification; main() maps these to tier numbers 0 (Regulatory) through 5 (Anecdotal). */
type SourceClass = "Regulatory" | "Clinical" | "Observational" | "Expert" | "Community" | "Anecdotal";
/** Tag naming the kind of value carried by an ObjectValue. */
type ObjectType = "Text" | "Number" | "Boolean" | "Reference";

/** The object of a claim: a type tag plus the value itself. */
interface ObjectValue {
  /** Declared kind of `value`. NOTE(review): nothing here enforces that `value` matches `type`. */
  type: ObjectType;
  /** Payload; forwarded unchanged as part of the assertion request body. */
  value: string | number | boolean;
}

/** One hand-written claim to be submitted as an assertion. */
interface CuratedClaim {
  /** Entity the claim is about. */
  subject: string;
  /** Attribute of the subject being asserted. */
  predicate: string;
  /** Asserted value for the subject/predicate pair. */
  object: ObjectValue;
  /** Confidence sent with the assertion (the curated entries use values between 0 and 1). */
  confidence: number;
  /** Classification of the source; also determines the tier used when registering the source. */
  sourceClass: SourceClass;
  /** Human-readable source name; hashed to produce the source identifier. */
  sourceLabel: string;
  /** Optional link, passed along when the source is registered. */
  sourceUrl?: string;
  /** Free-form remark for readers of this file; not included in the assertion request. */
  note?: string;
}

// ============================================================================
// Helpers
// ============================================================================

/**
 * Encode bytes as a lowercase hexadecimal string, two digits per byte.
 *
 * @param bytes - Bytes to encode.
 * @returns The hex string; empty when `bytes` is empty.
 */
function toHex(bytes: Uint8Array): string {
  let hex = "";
  for (const byte of bytes) {
    // Left-pad so values below 0x10 still occupy two characters.
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}

/**
 * Compute the SHA-256 digest of a string's UTF-8 encoding.
 *
 * Uses Node's built-in crypto implementation rather than a hand-rolled byte
 * mix, so the result is a real SHA-256 digest.
 *
 * Note: digests differ from those of the earlier placeholder mix, so agent
 * keys and source hashes derived through this function change accordingly.
 *
 * @param data - Text to hash.
 * @returns The 32-byte digest as a plain Uint8Array.
 */
function sha256(data: string): Uint8Array {
  const digest = createHash("sha256").update(data, "utf8").digest();
  // Copy out of the Buffer so callers receive a plain Uint8Array.
  return new Uint8Array(digest);
}

/**
 * Derive the hex source identifier for a source label.
 *
 * The label is namespaced with a fixed prefix before hashing, so equal labels
 * always yield the same identifier.
 *
 * @param label - Human-readable source label.
 * @returns Hex encoding of the hashed, prefixed label.
 */
function generateSourceHash(label: string): string {
  const digest = sha256("source-whitepaper-" + label);
  return toHex(digest);
}

/**
 * Build an agent whose key pair is derived deterministically from its name.
 *
 * The private key is the hash of a fixed prefix plus the name, so the same
 * name always produces the same key pair.
 *
 * @param name - Agent name.
 * @returns The agent with its private and public keys.
 */
async function createAgent(name: string): Promise<Agent> {
  const privateKey = sha256("whitepaper-seed-agent-" + name);
  return {
    name,
    privateKey,
    publicKey: await ed.getPublicKeyAsync(privateKey),
  };
}

/**
 * Sign the `subject:predicate` pair with the agent's private key.
 *
 * The returned timestamp (whole seconds since the Unix epoch) is captured
 * alongside the signature but is not part of the signed bytes.
 * NOTE(review): the claim's object is not signed either — confirm this matches
 * what the server verifies.
 *
 * @param agent - Signing identity.
 * @param subject - Claim subject.
 * @param predicate - Claim predicate.
 * @returns Hex signature and the capture timestamp.
 */
async function signAssertion(
  agent: Agent,
  subject: string,
  predicate: string
): Promise<{ signature: string; timestamp: number }> {
  const timestamp = Math.floor(Date.now() / 1000);
  const payload = new TextEncoder().encode([subject, predicate].join(":"));
  const rawSignature = await ed.signAsync(payload, agent.privateKey);
  return {
    signature: toHex(rawSignature),
    timestamp,
  };
}

// ============================================================================
// Curated Claims from Whitepaper
// These are hand-curated to ensure quality and relevance
// ============================================================================

/**
 * The curated claim table submitted by this script.
 *
 * - Each entry is turned into one assertion request.
 * - `sourceLabel` is hashed to derive the source identifier, so entries that
 *   share a label share one registered source.
 * - Every entry uses `sourceClass: "Expert"` and none sets `sourceUrl`.
 * - `note` is commentary for readers of this file; it is not part of the
 *   assertion request.
 */
const WHITEPAPER_CLAIMS: CuratedClaim[] = [
  // Storage & Architecture
  {
    subject: "StemeDB",
    predicate: "storage_model",
    object: { type: "Text", value: "append-only Merkle DAG" },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.1",
    note: "Core architectural claim"
  },
  {
    subject: "StemeDB",
    predicate: "hash_algorithm",
    object: { type: "Text", value: "BLAKE3" },
    confidence: 0.99,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.3",
    note: "Content-addressing algorithm"
  },
  {
    subject: "StemeDB",
    predicate: "signature_algorithm",
    object: { type: "Text", value: "Ed25519" },
    confidence: 0.99,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.4",
    note: "Cryptographic signature algorithm"
  },
  {
    subject: "StemeDB",
    predicate: "serialization_format",
    object: { type: "Text", value: "rkyv (zero-copy)" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.5"
  },
  {
    subject: "StemeDB",
    predicate: "data_model",
    object: { type: "Text", value: "subject-predicate-object triples with provenance" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.1"
  },

  // Lens Complexity Claims
  {
    subject: "RecencyLens",
    predicate: "time_complexity",
    object: { type: "Text", value: "O(n)" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.1",
    note: "Where n = number of candidates"
  },
  {
    subject: "RecencyLens",
    predicate: "space_complexity",
    object: { type: "Text", value: "O(1)" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.1"
  },
  {
    subject: "ConsensusLens",
    predicate: "time_complexity",
    object: { type: "Text", value: "O(n)" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.2"
  },
  {
    subject: "ConsensusLens",
    predicate: "space_complexity",
    object: { type: "Text", value: "O(k)" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.2",
    note: "Where k = distinct object values"
  },
  {
    subject: "AuthorityLens",
    predicate: "time_complexity",
    object: { type: "Text", value: "O(n)" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.3"
  },
  {
    subject: "SkepticLens",
    predicate: "resolution_type",
    object: { type: "Text", value: "conflict analysis without winner selection" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },
  {
    subject: "SkepticLens",
    predicate: "conflict_metric",
    object: { type: "Text", value: "normalized Shannon entropy" },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },

  // Lens Properties
  {
    subject: "Lens",
    predicate: "property_stateless",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4"
  },
  {
    subject: "Lens",
    predicate: "property_deterministic",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4"
  },
  {
    subject: "Lens",
    predicate: "property_composable",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4"
  },

  // Trust Parameters (contested - honest limitation)
  {
    subject: "EigenTrust",
    predicate: "initial_trust_score",
    object: { type: "Number", value: 0.5 },
    confidence: 0.72,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 7.1",
    note: "Heuristic without theoretical foundation"
  },
  {
    subject: "EigenTrust",
    predicate: "reward_delta",
    object: { type: "Number", value: 0.05 },
    confidence: 0.72,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 7.1",
    note: "Heuristic for correct assertions"
  },
  {
    subject: "EigenTrust",
    predicate: "penalty_delta",
    object: { type: "Number", value: 0.1 },
    confidence: 0.72,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 7.1",
    note: "Heuristic for incorrect assertions"
  },

  // Source Tier Weights
  {
    subject: "SourceClass",
    predicate: "tier_0_weight",
    object: { type: "Number", value: 1.0 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Regulatory tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_1_weight",
    object: { type: "Number", value: 0.9 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Clinical tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_2_weight",
    object: { type: "Number", value: 0.7 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Observational tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_3_weight",
    object: { type: "Number", value: 0.5 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Expert tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_4_weight",
    object: { type: "Number", value: 0.2 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Community tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_5_weight",
    object: { type: "Number", value: 0.1 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Anecdotal tier"
  },

  // MaterializedView
  {
    subject: "MaterializedView",
    predicate: "read_complexity",
    object: { type: "Text", value: "O(1)" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.4"
  },
  {
    subject: "MaterializedView",
    predicate: "consistency_model",
    object: { type: "Text", value: "eventual consistency" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 6.3"
  },

  // Content Addressing Properties
  {
    subject: "content_addressing",
    predicate: "provides_deduplication",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.3"
  },
  {
    subject: "content_addressing",
    predicate: "provides_integrity",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.3"
  },
  {
    subject: "content_addressing",
    predicate: "enables_efficient_comparison",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.3"
  },

  // Tradeoffs
  {
    subject: "StemeDB",
    predicate: "storage_tradeoff",
    object: { type: "Text", value: "append-only storage grows without bound" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 6.1"
  },
  {
    subject: "StemeDB",
    predicate: "not_suitable_for",
    object: { type: "Text", value: "ACID transactions" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 6.4"
  },
  {
    subject: "StemeDB",
    predicate: "not_suitable_for",
    object: { type: "Text", value: "high-frequency CRUD" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 6.4"
  },

  // Write/Read Paths
  {
    subject: "StemeDB",
    predicate: "write_path_includes",
    object: { type: "Text", value: "WAL with fsync" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.2"
  },
  {
    subject: "StemeDB",
    predicate: "fast_read_path",
    object: { type: "Text", value: "O(1) via materialized views" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.3"
  },
  {
    subject: "StemeDB",
    predicate: "full_resolution_path",
    object: { type: "Text", value: "O(n) for custom lenses" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.3"
  },

  // Conflict Status Thresholds
  {
    subject: "SkepticLens",
    predicate: "unanimous_threshold",
    object: { type: "Text", value: "conflict_score < 0.1" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },
  {
    subject: "SkepticLens",
    predicate: "agreed_threshold",
    object: { type: "Text", value: "conflict_score < 0.4" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },
  {
    subject: "SkepticLens",
    predicate: "contested_threshold",
    object: { type: "Text", value: "conflict_score >= 0.4" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },
];

// ============================================================================
// API Functions
// ============================================================================

/**
 * Register a source with the API via POST /v1/sources.
 *
 * A 409 response is treated the same as success. Any other non-OK response is
 * logged as a warning and otherwise ignored; a failed request itself (a
 * rejected fetch) still propagates to the caller.
 *
 * @param hash - Source identifier.
 * @param label - Human-readable source label.
 * @param tier - Numeric tier; 0 through 5 also map to a tier label.
 * @param url - Optional source link.
 */
async function registerSource(hash: string, label: string, tier: number, url?: string): Promise<void> {
  // Indexed by tier number; tiers outside 0-5 yield no label.
  const tierLabels: readonly string[] = [
    "Regulatory",
    "Clinical",
    "Observational",
    "Expert",
    "Community",
    "Anecdotal",
  ];

  const payload = {
    hash,
    label,
    tier,
    tier_label: tierLabels[tier],
    url,
  };

  const response = await fetch(`${API_URL}/v1/sources`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });

  if (response.ok || response.status === 409) {
    return;
  }

  const text = await response.text();
  console.warn(`  Warning: Failed to register source ${label}: ${text}`);
}

/**
 * Sign a claim and submit it via POST /v1/assert.
 *
 * Failures are reported with a warning and a `null` result rather than an
 * exception: a non-OK response, a body that is not valid JSON, or a body
 * without a non-empty string `hash`. A failed request itself (a rejected
 * fetch) still propagates to the caller.
 *
 * @param agent - Identity that signs the assertion.
 * @param claim - Claim to submit.
 * @param sourceHash - Identifier of the claim's source.
 * @returns The assertion hash reported by the API, or `null` on failure.
 */
async function createAssertion(
  agent: Agent,
  claim: CuratedClaim,
  sourceHash: string
): Promise<string | null> {
  const { signature, timestamp } = await signAssertion(agent, claim.subject, claim.predicate);

  const request = {
    subject: claim.subject,
    predicate: claim.predicate,
    object: claim.object,
    confidence: claim.confidence,
    source_hash: sourceHash,
    source_class: claim.sourceClass,
    signatures: [
      {
        agent_id: toHex(agent.publicKey),
        signature,
        timestamp,
        version: 1,
      },
    ],
  };

  const response = await fetch(`${API_URL}/v1/assert`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(request),
  });

  if (!response.ok) {
    const text = await response.text();
    console.warn(`  Warning: Failed to create assertion: ${text}`);
    return null;
  }

  // Treat the body as untrusted: it must be JSON carrying a string `hash`.
  let data: unknown;
  try {
    data = await response.json();
  } catch {
    console.warn("  Warning: Failed to create assertion: response body was not valid JSON");
    return null;
  }

  const hash =
    typeof data === "object" && data !== null
      ? (data as { hash?: unknown }).hash
      : undefined;

  if (typeof hash !== "string" || hash.length === 0) {
    console.warn("  Warning: Failed to create assertion: response did not include a hash");
    return null;
  }

  return hash;
}

// ============================================================================
// Main
// ============================================================================

/**
 * Script entry point.
 *
 * Creates the signing agent, registers each distinct source referenced by
 * WHITEPAPER_CLAIMS, then submits one assertion per claim and prints a
 * summary. With `--dry-run` (or `-d`) nothing is sent to the API; the script
 * only logs what it would do.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const dryRun = args.includes("--dry-run") || args.includes("-d");

  // Shorten long values for log output, adding "..." only when text was cut.
  const truncate = (text: string, max: number): string =>
    text.length > max ? `${text.slice(0, max)}...` : text;

  console.log("StemeDB Whitepaper Seed Script");
  console.log("==============================\n");

  if (dryRun) {
    console.log("DRY RUN MODE - No data will be submitted\n");
  }

  // Create agent
  console.log("Creating agent...");
  const agent = await createAgent("whitepaper-author");
  console.log(`  Agent: ${truncate(toHex(agent.publicKey), 16)}`);
  console.log();

  // Collect the distinct sources, keyed by source hash. The first claim seen
  // for a given hash decides that source's tier and URL.
  const sourceMap = new Map<string, { label: string; tier: number; url?: string }>();
  const SOURCE_CLASS_TO_TIER: Record<SourceClass, number> = {
    Regulatory: 0,
    Clinical: 1,
    Observational: 2,
    Expert: 3,
    Community: 4,
    Anecdotal: 5,
  };

  for (const claim of WHITEPAPER_CLAIMS) {
    const hash = generateSourceHash(claim.sourceLabel);
    if (!sourceMap.has(hash)) {
      sourceMap.set(hash, {
        label: claim.sourceLabel,
        tier: SOURCE_CLASS_TO_TIER[claim.sourceClass],
        url: claim.sourceUrl,
      });
    }
  }

  // Register sources
  console.log(`Registering ${sourceMap.size} sources...`);
  for (const [hash, source] of sourceMap) {
    const shownLabel = truncate(source.label, 50);
    if (dryRun) {
      console.log(`  [DRY] Would register: ${shownLabel}`);
    } else {
      await registerSource(hash, source.label, source.tier, source.url);
      console.log(`  + ${shownLabel}`);
    }
  }
  console.log();

  // Create assertions
  console.log(`Creating ${WHITEPAPER_CLAIMS.length} assertions...`);
  let created = 0;
  let failed = 0;

  for (const claim of WHITEPAPER_CLAIMS) {
    if (dryRun) {
      const shownValue = truncate(String(claim.object.value), 30);
      console.log(`  [DRY] ${claim.subject}/${claim.predicate} = "${shownValue}"`);
      created++;
      continue;
    }

    const sourceHash = generateSourceHash(claim.sourceLabel);
    const hash = await createAssertion(agent, claim, sourceHash);
    if (hash) {
      created++;
      console.log(`  + ${claim.subject}/${claim.predicate} -> ${truncate(hash, 16)}`);
    } else {
      failed++;
    }
  }

  // In a dry run nothing was submitted, so do not report it as created.
  const verb = dryRun ? "Would create" : "Created";
  console.log(`\n${verb} ${created} assertions${failed > 0 ? ` (${failed} failed)` : ""}`);

  if (!dryRun) {
    // Fixed pause before exiting, as a grace period for server-side materialization.
    console.log("\nWaiting for materialization...");
    await new Promise((resolve) => setTimeout(resolve, 2000));
    console.log("Done!");
  }
}

// Run the script; any unhandled failure is reported and exits with status 1.
main().catch((error: unknown) => {
  // The rejection reason is not guaranteed to be an Error instance.
  const message = error instanceof Error ? error.message : String(error);
  console.error("Error:", message);
  process.exit(1);
});