#!/usr/bin/env -S npx tsx
/**
 * Seed whitepaper claims to StemeDB.
 *
 * This script:
 * 1. Takes the hand-curated claims in WHITEPAPER_CLAIMS (nothing is read from
 *    disk; the `fs`/`path` imports below are currently unused)
 * 2. Creates an agent with a deterministic key
 * 3. Registers sources and submits assertions to StemeDB
 *
 * Usage:
 *   npx tsx scripts/seed-whitepaper.ts
 *   npx tsx scripts/seed-whitepaper.ts --dry-run
 *
 * Environment:
 *   STEMEDB_API_URL - API base URL (default: http://127.0.0.1:18180)
 *
 * The shebang uses `env -S` so that `npx tsx` is split into two arguments on
 * Linux as well as on BSD/macOS.
 */

import * as ed from "@noble/ed25519";
import { sha512 } from "@noble/hashes/sha512";
import { readFileSync } from "fs";
import { join } from "path";

// Configure ed25519 to use sha512
ed.etc.sha512Sync = (...m) => sha512(ed.etc.concatBytes(...m));

// `||` (not `??`) on purpose: an empty STEMEDB_API_URL also falls back to the default.
const API_URL = process.env.STEMEDB_API_URL || "http://127.0.0.1:18180";

// ============================================================================
// Types
// ============================================================================

/** A signing identity: display name plus its Ed25519 key pair. */
interface Agent {
  name: string;
  privateKey: Uint8Array;
  publicKey: Uint8Array;
}

type SourceClass = "Regulatory" | "Clinical" | "Observational" | "Expert" | "Community" | "Anecdotal";

type ObjectType = "Text" | "Number" | "Boolean" | "Reference";

/** Tagged object value of a claim; `type` names the kind of `value`. */
interface ObjectValue {
  type: ObjectType;
  value: string | number | boolean;
}

/** One hand-curated subject/predicate/object claim with its provenance. */
interface CuratedClaim {
  subject: string;
  predicate: string;
  object: ObjectValue;
  confidence: number;
  sourceClass: SourceClass;
  /** Human-readable source name; also the input to generateSourceHash(). */
  sourceLabel: string;
  sourceUrl?: string;
  note?: string;
}

// ============================================================================
// Helpers
// ============================================================================

/** Encode bytes as a lowercase hex string, two digits per byte. */
function toHex(bytes: Uint8Array): string {
  return Array.from(bytes)
    .map((b) => b.toString(16).padStart(2, "0"))
    .join("");
}

/**
 * Deterministically fold a string into 32 bytes.
 *
 * NOTE: despite the name this is NOT SHA-256 and is not cryptographically
 * secure. Each UTF-8 byte is XOR-ed into slot `i % 32` and added (mod 256)
 * into the following slot. The algorithm is kept exactly as-is because
 * createAgent() and generateSourceHash() derive the agent key and the source
 * hashes from it; changing it would change every derived identifier.
 *
 * @param data - Text to digest (encoded as UTF-8).
 * @returns A 32-byte digest; all zeros for the empty string.
 */
function sha256(data: string): Uint8Array {
  const bytes = new TextEncoder().encode(data);
  const hash = new Uint8Array(32);
  for (let i = 0; i < bytes.length; i++) {
    hash[i % 32] ^= bytes[i];
    hash[(i + 1) % 32] = (hash[(i + 1) % 32] + bytes[i]) % 256;
  }
  return hash;
}

/** Derive the hex source hash for a source label (stable across runs). */
function generateSourceHash(label: string): string {
  return toHex(sha256(`source-whitepaper-${label}`));
}

/**
 * Build an agent whose key pair is derived from its name, so repeated runs
 * produce the same public key.
 *
 * @param name - Agent name; mixed into the key seed.
 * @returns The agent with its 32-byte private key and derived public key.
 */
async function createAgent(name: string): Promise<Agent> {
  // The 32-byte digest is used directly as the Ed25519 private key.
  const privateKey = sha256(`whitepaper-seed-agent-${name}`);
  const publicKey = await ed.getPublicKeyAsync(privateKey);
  return { name, privateKey, publicKey };
}

/**
 * Sign the `subject:predicate` pair of an assertion with the agent's key.
 *
 * NOTE(review): the returned timestamp is generated here but is not part of
 * the signed message - confirm this matches what the server verifies.
 *
 * @returns Hex-encoded signature and the current Unix time in seconds.
 */
async function signAssertion(
  agent: Agent,
  subject: string,
  predicate: string
): Promise<{ signature: string; timestamp: number }> {
  const timestamp = Math.floor(Date.now() / 1000);
  const message = `${subject}:${predicate}`;
  const messageBytes = new TextEncoder().encode(message);
  const signature = await ed.signAsync(messageBytes, agent.privateKey);
  return { signature: toHex(signature), timestamp };
}

// ============================================================================
// Curated Claims from Whitepaper
// These are hand-curated to ensure quality and relevance
// ============================================================================

const WHITEPAPER_CLAIMS: CuratedClaim[] = [
  // ===========================================================================
  // Introduction Section - Claims about COMPETING databases mentioned in text
  // "Every mainstream database, from PostgreSQL to MongoDB to Neo4j, enforces
  // a fundamental assumption: at any given time, a key maps to exactly one value"
  // ===========================================================================

  // PostgreSQL claims from Introduction
  {
    subject: "PostgreSQL",
    predicate: "conflict_resolution",
    object: { type: "Text", value: "overwrite or reject" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "Explicit claim about PostgreSQL's approach to conflicting writes"
  },
  {
    subject: "PostgreSQL",
    predicate: "storage_assumption",
    object: { type: "Text", value: "single value per key" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "Core assumption of PostgreSQL's data model"
  },
  {
    subject: "PostgreSQL",
    predicate: "is_mainstream",
    object: { type: "Boolean", value: true },
    confidence: 0.85,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "Implicit: listed as example of mainstream database"
  },

  // MongoDB claims from Introduction
  {
    subject: "MongoDB",
    predicate: "conflict_resolution",
    object: { type: "Text", value: "overwrite or reject" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "Explicit claim about MongoDB's approach to conflicting writes"
  },
  {
    subject: "MongoDB",
    predicate: "storage_assumption",
    object: { type: "Text", value: "single value per key" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "Core assumption of MongoDB's data model"
  },
  {
    subject: "MongoDB",
    predicate: "is_mainstream",
    object: { type: "Boolean", value: true },
    confidence: 0.85,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "Implicit: listed as example of mainstream database"
  },

  // Neo4j claims from Introduction
  {
    subject: "Neo4j",
    predicate: "conflict_resolution",
    object: { type: "Text", value: "overwrite or reject" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "Explicit claim about Neo4j's approach to conflicting writes"
  },
  {
    subject: "Neo4j",
    predicate: "storage_assumption",
    object: { type: "Text", value: "single value per key" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "Core assumption of Neo4j's data model"
  },
  {
    subject: "Neo4j",
    predicate: "is_mainstream",
    object: { type: "Boolean", value: true },
    confidence: 0.85,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "Implicit: listed as example of mainstream database"
  },

  // Category-level claims from Introduction
  {
    subject: "mainstream_databases",
    predicate: "storage_assumption",
    object: { type: "Text", value: "single value per key" },
    confidence: 0.90,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "General claim about mainstream database category"
  },
  {
    subject: "mainstream_databases",
    predicate: "conflict_resolution",
    object: { type: "Text", value: "overwrite or reject" },
    confidence: 0.90,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Introduction",
    note: "How mainstream databases handle conflicting values"
  },

  // ===========================================================================
  // Storage & Architecture (use "Episteme" to match page.tsx queries)
  // NOTE: All claim values should be COMPLETE SENTENCES that can stand alone
  // ===========================================================================
  {
    subject: "Episteme",
    predicate: "storage_model",
    object: { type: "Text", value: "Episteme stores assertions in an append-only Merkle DAG" },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.1",
    note: "Core architectural claim"
  },
  // NOTE(review): this entry uses subject "StemeDB" while its neighbours use
  // "Episteme" - confirm whether that is intentional before changing the data.
  {
    subject: "StemeDB",
    predicate: "hash_algorithm",
    object: { type: "Text", value: "StemeDB uses BLAKE3 for content-addressing" },
    confidence: 0.99,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.3",
    note: "Content-addressing algorithm"
  },
  {
    subject: "Episteme",
    predicate: "signature_algorithm",
    object: { type: "Text", value: "Episteme uses Ed25519 signatures for agent attribution" },
    confidence: 0.99,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.4",
    note: "Cryptographic signature algorithm"
  },
  {
    subject: "Episteme",
    predicate: "serialization_format",
    object: { type: "Text", value: "Episteme uses rkyv for zero-copy deserialization" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.5"
  },
  {
    subject: "Episteme",
    predicate: "data_model",
    object: { type: "Text", value: "Episteme stores subject-predicate-object triples with full provenance metadata" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.1"
  },
  {
    subject: "Episteme",
    predicate: "content_addressing",
    object: { type: "Text", value: "Episteme uses BLAKE3 content-addressing which provides deduplication, integrity verification, and efficient comparison" },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.3",
    note: "Content-addressing provides deduplication, integrity, and efficient comparison"
  },
  {
    subject: "Episteme",
    predicate: "storage_growth",
    object: { type: "Text", value: "Episteme's append-only storage grows without bound, mitigated by semantic decay" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 6.1",
    note: "Fundamental tradeoff mitigated by semantic decay"
  },

  // ===========================================================================
  // Background Section - CRDT claims
  // ===========================================================================
  {
    subject: "CRDT",
    predicate: "replica_assumption",
    object: { type: "Text", value: "CRDTs assume all replicas are authoritative copies of the same logical data" },
    confidence: 0.90,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 2.3",
    note: "StemeDB's claims are genuinely different assertions from different sources"
  },
  {
    subject: "CRDT",
    predicate: "merge_semantics",
    object: { type: "Text", value: "automatic merge via mathematical properties" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 2.3",
    note: "CRDTs use commutative, associative, idempotent operations"
  },
  {
    subject: "CRDT",
    predicate: "consistency_model",
    object: { type: "Text", value: "eventual consistency" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 2.3",
    note: "All replicas converge to same state eventually"
  },
  {
    subject: "CRDT",
    predicate: "conflict_handling",
    object: { type: "Text", value: "conflicts are resolved automatically via merge function" },
    confidence: 0.90,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 2.3",
    note: "No human intervention needed for conflict resolution"
  },

  // Lens Complexity Claims (use "complexity" to match page.tsx)
  {
    subject: "RecencyLens",
    predicate: "complexity",
    object: { type: "Text", value: "RecencyLens has O(n) time complexity and O(1) space complexity" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.1",
    note: "Where n = number of candidates"
  },
  {
    subject: "RecencyLens",
    predicate: "time_complexity",
    object: { type: "Text", value: "RecencyLens runs in O(n) time where n is the number of candidate assertions" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.1",
    note: "Where n = number of candidates"
  },
  {
    subject: "RecencyLens",
    predicate: "space_complexity",
    object: { type: "Text", value: "RecencyLens uses O(1) space" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.1"
  },
  {
    subject: "ConsensusLens",
    predicate: "time_complexity",
    object: { type: "Text", value: "ConsensusLens runs in O(n) time for grouping and finding the majority" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.2"
  },
  {
    subject: "ConsensusLens",
    predicate: "space_complexity",
    object: { type: "Text", value: "ConsensusLens uses O(k) space complexity where k is the number of distinct object values" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.2",
    note: "Where k = distinct object values"
  },
  {
    subject: "AuthorityLens",
    predicate: "time_complexity",
    object: { type: "Text", value: "AuthorityLens runs in O(n) time complexity where n is the number of candidates" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.3"
  },
  {
    subject: "SkepticLens",
    predicate: "resolution_type",
    object: { type: "Text", value: "SkepticLens performs conflict analysis without selecting a winner, preserving all competing claims" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },
  {
    subject: "SkepticLens",
    predicate: "conflict_metric",
    object: { type: "Text", value: "SkepticLens uses normalized Shannon entropy to measure conflict between competing claims" },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },

  // Lens Properties
  {
    subject: "Lens",
    predicate: "property_stateless",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4"
  },
  {
    subject: "Lens",
    predicate: "property_deterministic",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4"
  },
  {
    subject: "Lens",
    predicate: "property_composable",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4"
  },

  // Trust Parameters (contested - honest limitation)
  // Page queries EigenTrust/parameters
  {
    subject: "EigenTrust",
    predicate: "parameters",
    object: { type: "Text", value: "EigenTrust uses 0.5 initial trust with +0.05 reward and -0.1 penalty deltas" },
    confidence: 0.72,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 7.1",
    note: "Heuristic without theoretical foundation - needs domain-specific calibration"
  },
  {
    subject: "EigenTrust",
    predicate: "initial_trust_score",
    object: { type: "Number", value: 0.5 },
    confidence: 0.72,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 7.1",
    note: "Heuristic without theoretical foundation"
  },
  {
    subject: "EigenTrust",
    predicate: "reward_delta",
    object: { type: "Number", value: 0.05 },
    confidence: 0.72,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 7.1",
    note: "Heuristic for correct assertions"
  },
  {
    subject: "EigenTrust",
    predicate: "penalty_delta",
    object: { type: "Number", value: 0.1 },
    confidence: 0.72,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 7.1",
    note: "Heuristic for incorrect assertions"
  },

  // Source Tier Weights
  {
    subject: "SourceClass",
    predicate: "tier_0_weight",
    object: { type: "Number", value: 1.0 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Regulatory tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_1_weight",
    object: { type: "Number", value: 0.9 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Clinical tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_2_weight",
    object: { type: "Number", value: 0.7 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Observational tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_3_weight",
    object: { type: "Number", value: 0.5 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Expert tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_4_weight",
    object: { type: "Number", value: 0.2 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Community tier"
  },
  {
    subject: "SourceClass",
    predicate: "tier_5_weight",
    object: { type: "Number", value: 0.1 },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.2",
    note: "Anecdotal tier"
  },

  // MaterializedView
  {
    subject: "MaterializedView",
    predicate: "read_complexity",
    object: { type: "Text", value: "MaterializedViews provide O(1) read complexity for pre-computed lens results" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.4"
  },
  {
    subject: "MaterializedView",
    predicate: "consistency_model",
    object: { type: "Text", value: "MaterializedViews use eventual consistency, updating asynchronously after writes" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 6.3"
  },

  // Content Addressing Properties
  {
    subject: "content_addressing",
    predicate: "provides_deduplication",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.3"
  },
  {
    subject: "content_addressing",
    predicate: "provides_integrity",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.3"
  },
  {
    subject: "content_addressing",
    predicate: "enables_efficient_comparison",
    object: { type: "Boolean", value: true },
    confidence: 0.98,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 3.3"
  },

  // Tradeoffs (use Episteme to match page queries)
  {
    subject: "Episteme",
    predicate: "storage_tradeoff",
    object: { type: "Text", value: "Episteme's append-only storage grows without bound, requiring semantic decay for long-term management" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 6.1"
  },
  {
    subject: "Episteme",
    predicate: "not_suitable_for",
    object: { type: "Text", value: "Episteme is not suitable for ACID transactions requiring strict consistency guarantees" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 6.4"
  },
  {
    subject: "Episteme",
    predicate: "not_suitable_for",
    object: { type: "Text", value: "Episteme is not designed for high-frequency CRUD workloads" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 6.4"
  },

  // Write/Read Paths
  {
    subject: "Episteme",
    predicate: "write_path_includes",
    object: { type: "Text", value: "Episteme's write path uses a Write-Ahead Log (WAL) with fsync for durability" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.2"
  },
  {
    subject: "Episteme",
    predicate: "fast_read_path",
    object: { type: "Text", value: "Episteme provides O(1) reads via pre-computed materialized views" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.3"
  },
  {
    subject: "Episteme",
    predicate: "full_resolution_path",
    object: { type: "Text", value: "Episteme's full resolution path runs in O(n) when using custom lenses" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 5.3"
  },

  // Conflict Status Thresholds
  {
    subject: "SkepticLens",
    predicate: "unanimous_threshold",
    object: { type: "Text", value: "SkepticLens marks claims as Unanimous when the conflict score is below 0.1" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },
  {
    subject: "SkepticLens",
    predicate: "agreed_threshold",
    object: { type: "Text", value: "SkepticLens marks claims as Agreed when the conflict score is between 0.1 and 0.4" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },
  {
    subject: "SkepticLens",
    predicate: "contested_threshold",
    object: { type: "Text", value: "SkepticLens marks claims as Contested when the conflict score is 0.4 or higher" },
    confidence: 0.95,
    sourceClass: "Expert",
    sourceLabel: "StemeDB Whitepaper - Section 4.2.4"
  },
];

// ============================================================================
// API Functions
// ============================================================================

/**
 * Register a source with the API via `POST /v1/sources`.
 *
 * Failures are logged as warnings rather than thrown. HTTP 409 is not treated
 * as a failure (presumably "source already registered" - confirm against the
 * API). Network-level errors from `fetch` still reject the returned promise.
 *
 * @param hash - Hex source hash, see generateSourceHash().
 * @param label - Human-readable source label.
 * @param tier - Numeric tier, 0 (Regulatory) to 5 (Anecdotal).
 * @param url - Optional source URL; omitted from the JSON body when undefined.
 */
async function registerSource(hash: string, label: string, tier: number, url?: string): Promise<void> {
  const SOURCE_CLASS_MAP: Record<number, SourceClass> = {
    0: "Regulatory",
    1: "Clinical",
    2: "Observational",
    3: "Expert",
    4: "Community",
    5: "Anecdotal",
  };

  const response = await fetch(`${API_URL}/v1/sources`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      hash,
      label,
      tier,
      // undefined for an unknown tier, in which case JSON.stringify drops the key
      tier_label: SOURCE_CLASS_MAP[tier],
      url,
    }),
  });

  if (!response.ok && response.status !== 409) {
    const text = await response.text();
    console.warn(` Warning: Failed to register source ${label}: ${text}`);
  }
}
/**
 * Sign a curated claim and submit it via `POST /v1/assert`.
 *
 * @param agent - Signing agent; its hex public key is sent as `agent_id`.
 * @param claim - The claim to submit.
 * @param sourceHash - Hex hash of the claim's source.
 * @returns The assertion hash from the response, or `null` (after logging a
 *   warning) when the request fails or the response has no string `hash`.
 */
async function createAssertion(
  agent: Agent,
  claim: CuratedClaim,
  sourceHash: string
): Promise<string | null> {
  const { signature, timestamp } = await signAssertion(agent, claim.subject, claim.predicate);

  const request = {
    subject: claim.subject,
    predicate: claim.predicate,
    object: claim.object,
    confidence: claim.confidence,
    source_hash: sourceHash,
    source_class: claim.sourceClass,
    signatures: [
      {
        agent_id: toHex(agent.publicKey),
        signature,
        timestamp,
        version: 1,
      },
    ],
  };

  const response = await fetch(`${API_URL}/v1/assert`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(request),
  });

  if (!response.ok) {
    const text = await response.text();
    console.warn(` Warning: Failed to create assertion: ${text}`);
    return null;
  }

  // The body is untrusted JSON: only hand a real string back to the caller,
  // which goes on to call `.slice()` on it.
  const data: unknown = await response.json();
  if (typeof data === "object" && data !== null && "hash" in data && typeof data.hash === "string") {
    return data.hash;
  }
  console.warn(` Warning: Failed to create assertion: response did not include a hash`);
  return null;
}

// ============================================================================
// Main
// ============================================================================

/**
 * Entry point: create the agent, register every distinct source, then submit
 * one assertion per curated claim.
 *
 * With `--dry-run` (or `-d`) nothing is sent to the API; the planned
 * registrations and assertions are only printed.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const dryRun = args.includes("--dry-run") || args.includes("-d");

  console.log("StemeDB Whitepaper Seed Script");
  console.log("==============================\n");

  if (dryRun) {
    console.log("DRY RUN MODE - No data will be submitted\n");
  }

  // Create agent
  console.log("Creating agent...");
  const agent = await createAgent("whitepaper-author");
  console.log(` Agent: ${toHex(agent.publicKey).slice(0, 16)}...`);
  console.log();

  // Group claims by source for registration
  const sourceMap = new Map<string, { label: string; tier: number; url?: string }>();
  const SOURCE_CLASS_TO_TIER: Record<SourceClass, number> = {
    Regulatory: 0,
    Clinical: 1,
    Observational: 2,
    Expert: 3,
    Community: 4,
    Anecdotal: 5,
  };

  for (const claim of WHITEPAPER_CLAIMS) {
    const hash = generateSourceHash(claim.sourceLabel);
    // First claim seen for a source decides its tier and URL.
    if (!sourceMap.has(hash)) {
      sourceMap.set(hash, {
        label: claim.sourceLabel,
        tier: SOURCE_CLASS_TO_TIER[claim.sourceClass],
        url: claim.sourceUrl,
      });
    }
  }

  // Register sources (one request at a time, in first-seen order)
  console.log(`Registering ${sourceMap.size} sources...`);
  for (const [hash, source] of sourceMap) {
    if (dryRun) {
      console.log(` [DRY] Would register: ${source.label.slice(0, 50)}...`);
    } else {
      await registerSource(hash, source.label, source.tier, source.url);
      console.log(` + ${source.label.slice(0, 50)}...`);
    }
  }
  console.log();

  // Create assertions
  console.log(`Creating ${WHITEPAPER_CLAIMS.length} assertions...`);
  let created = 0;
  let failed = 0;

  for (const claim of WHITEPAPER_CLAIMS) {
    const sourceHash = generateSourceHash(claim.sourceLabel);

    if (dryRun) {
      console.log(` [DRY] ${claim.subject}/${claim.predicate} = "${String(claim.object.value).slice(0, 30)}..."`);
      created++;
      continue;
    }

    const hash = await createAssertion(agent, claim, sourceHash);
    if (hash) {
      created++;
      console.log(` + ${claim.subject}/${claim.predicate} -> ${hash.slice(0, 16)}...`);
    } else {
      failed++;
    }
  }

  console.log(`\nCreated ${created} assertions${failed > 0 ? ` (${failed} failed)` : ""}`);

  if (!dryRun) {
    // Wait for materialization (fixed 2 s pause; nothing is polled)
    console.log("\nWaiting for materialization...");
    await new Promise((resolve) => setTimeout(resolve, 2000));
    console.log("Done!");
  }
}

main().catch((error: unknown) => {
  // Anything can be thrown, so only read `.message` from real Error objects.
  console.error("Error:", error instanceof Error ? error.message : error);
  process.exit(1);
});