#!/usr/bin/env npx tsx
/**
 * Entity-Level Claim Extraction CLI Tool
 *
 * Extracts atomic claims from prose text and optionally submits them to StemeDB.
 *
 * Usage:
 *   npx tsx scripts/extract-claims.ts --text "Your text here" --source-class Expert
 *   npx tsx scripts/extract-claims.ts --file article.txt --source-class Clinical --submit
 *   cat paper.txt | npx tsx scripts/extract-claims.ts --stdin --dry-run
 *
 * Environment:
 *   STEMEDB_API_URL - API base URL (default: http://127.0.0.1:18180)
 */

import * as ed from "@noble/ed25519";
import { sha512 } from "@noble/hashes/sha512";
import { createHash } from "crypto";
import { readFileSync } from "fs";
import { execSync } from "child_process";

// Configure ed25519 to use sha512
ed.etc.sha512Sync = (...m) => sha512(ed.etc.concatBytes(...m));

const API_URL = process.env.STEMEDB_API_URL || "http://127.0.0.1:18180";

// ============================================================================
// Types
// ============================================================================

// The literal lists double as runtime validation tables for CLI arguments and
// for fields coming back from the model, so the types are derived from them.
const SOURCE_CLASSES = [
  "Regulatory",
  "Clinical",
  "Observational",
  "Expert",
  "Community",
  "Anecdotal",
] as const;
type SourceClass = (typeof SOURCE_CLASSES)[number];

const OBJECT_TYPES = ["Text", "Number", "Boolean", "Reference"] as const;
type ObjectType = (typeof OBJECT_TYPES)[number];

const CLAIM_TYPES = [
  "direct_assertion",
  "cited_claim",
  "definition",
  "measurement",
] as const;
type ClaimType = (typeof CLAIM_TYPES)[number];

const DOCUMENT_TYPES = [
  "technical_paper",
  "news",
  "regulatory",
  "documentation",
  "blog",
  "forum",
] as const;
type DocumentType = (typeof DOCUMENT_TYPES)[number];

interface ObjectValue {
  type: ObjectType;
  value: string | number | boolean;
}

interface SourceSpan {
  start: number;
  end: number;
  text: string;
}

interface DocumentContext {
  documentTitle?: string;
  sectionTitle?: string;
  documentType?: DocumentType;
}

interface ExtractedClaim {
  subject: string;
  predicate: string;
  object: ObjectValue;
  confidence: number;
  extraction_rationale: string;
  entity_aliases: string[];
  source_span?: SourceSpan;
  claim_type?: ClaimType;
}

interface ExtractionOutput {
  claims: ExtractedClaim[];
  source: {
    url?: string;
    source_class: SourceClass;
    content_hash?: string;
  };
  meta: {
    total_claims: number;
    unique_subjects: number;
    extraction_notes?: string;
  };
}

interface Agent {
  name: string;
  privateKey: Uint8Array;
  publicKey: Uint8Array;
}

interface CLIArgs {
  text?: string;
  file?: string;
  stdin?: boolean;
  sourceUrl?: string;
  sourceClass: SourceClass;
  documentTitle?: string;
  documentType?: DocumentType;
  submit: boolean;
  dryRun: boolean;
  verbose: boolean;
}

// ============================================================================
// Helpers
// ============================================================================

/** Narrows an unknown value to a plain (non-array, non-null) object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Returns true when `value` is one of the string literals in `allowed`. */
function isOneOf<T extends string>(
  allowed: readonly T[],
  value: unknown
): value is T {
  return (
    typeof value === "string" && (allowed as readonly string[]).includes(value)
  );
}

/** Encodes bytes as a lowercase hexadecimal string. */
function toHex(bytes: Uint8Array): string {
  return Array.from(bytes, (b) => b.toString(16).padStart(2, "0")).join("");
}

/**
 * Folds a string into 32 bytes with a simple XOR/add mix.
 *
 * This is NOT a cryptographic hash. It is kept exactly as it was so that the
 * agent key pair derived from it stays the same across versions; do not use
 * it for anything that needs collision resistance.
 */
function deriveLegacySeed(data: string): Uint8Array {
  const bytes = new TextEncoder().encode(data);
  const seed = new Uint8Array(32);
  bytes.forEach((byte, i) => {
    const here = i % 32;
    const next = (i + 1) % 32;
    seed[here] = (seed[here] ?? 0) ^ byte;
    seed[next] = ((seed[next] ?? 0) + byte) % 256;
  });
  return seed;
}

/**
 * Returns the hex-encoded SHA-256 digest of the UTF-8 encoding of `content`.
 */
function generateContentHash(content: string): string {
  return createHash("sha256").update(content, "utf8").digest("hex");
}

/**
 * Builds the deterministic signing agent for this tool.
 *
 * NOTE(review): the private key is derived from a fixed, public string, so
 * anyone with this script can reproduce it. Fine for a local tool identity,
 * not for anything that must be unforgeable.
 */
async function createAgent(name: string): Promise<Agent> {
  const privateKey = deriveLegacySeed(`extract-claims-agent-${name}`);
  const publicKey = await ed.getPublicKeyAsync(privateKey);
  return { name, privateKey, publicKey };
}

/**
 * Signs `subject:predicate` with the agent's key.
 *
 * @returns the hex signature and the Unix timestamp (seconds) taken at signing.
 *
 * NOTE(review): the timestamp is returned alongside the signature but is not
 * part of the signed message — confirm that this matches what the API verifies.
 */
async function signAssertion(
  agent: Agent,
  subject: string,
  predicate: string
): Promise<{ signature: string; timestamp: number }> {
  const timestamp = Math.floor(Date.now() / 1000);
  const messageBytes = new TextEncoder().encode(`${subject}:${predicate}`);
  const signature = await ed.signAsync(messageBytes, agent.privateKey);
  return { signature: toHex(signature), timestamp };
}

// ============================================================================
// Claude CLI
// ============================================================================

const EXTRACTION_PROMPT = `You are a precise claim extraction engine for StemeDB. Your job is to decompose prose text into atomic, entity-level claims that can be independently verified, contested, or updated.

## CRITICAL: ENTITY ENUMERATION PRINCIPLE

When a statement mentions multiple entities (explicitly or via category), extract a SEPARATE claim for EACH entity. Never collapse "all X" into a single claim.

A single sentence like "Every mainstream database, from PostgreSQL to MongoDB to Neo4j, enforces single value per key" contains **7 implicit claims**, not 1:
- PostgreSQL/storage_model -> "single value per key"
- MongoDB/storage_model -> "single value per key"
- Neo4j/storage_model -> "single value per key"
- mainstream_databases/storage_model -> "single value per key"
- PostgreSQL/is_mainstream -> true
- MongoDB/is_mainstream -> true
- Neo4j/is_mainstream -> true

**NEVER produce claims only about the document's main topic while ignoring other entities mentioned.**

## IMPLICIT CLAIMS

Extract implied relationships that the text assumes to be true:
- Category membership ("mainstream databases" implies each listed DB is mainstream)
- Temporal relationships ("before X, we did Y" implies Y predates X)
- Causal relationships ("X causes Y" implies correlation between X and Y)

## REJECTION PATTERNS (DO NOT extract claims from):
- Hypotheticals: "Consider...", "Suppose...", "Imagine...", "For example...", "What if..."
- Illustrative scenarios used to explain concepts
- Unspecified subjects: "a drug", "the system", "this database", "an agent"
- Generic truisms: "databases store data", "systems have users"
- Rhetorical questions or problems being described (not asserted)
- Future possibilities or proposals not yet implemented

## REQUIREMENTS for every claim:
- Subject: MUST be a proper noun or specific technical term (PostgreSQL, Semaglutide, RecencyLens)
  - NOT acceptable: "a drug", "the database", "this", "it", "the system"
- Predicate: MUST be a specific measurable/verifiable relationship
  - NOT acceptable: "is_related_to", "involves", "has_something"
- Object: MUST be a concrete value, number, or named entity
  - NOT acceptable: "good", "various", "some", "many"

## CLAIM TYPES (include for each claim):
- "direct_assertion": Author states as fact ("StemeDB uses BLAKE3")
- "cited_claim": Author cites another source ("Shapiro et al. showed...")
- "definition": Defining a term ("A Lens is a function that...")
- "measurement": Empirical/quantitative result ("RecencyLens is O(n)")

## CONFIDENCE SCORING:
| Factor | Base Confidence |
|--------|-----------------|
| Explicit statement | 0.95 |
| Strong implication | 0.85 |
| Weak implication | 0.70 |
| Speculation | 0.50 |

Modifiers:
- Hedge words ("may", "might", "could") -> multiply by 0.80
- Definitive language ("always", "never", "every") -> no modifier but note absolutism
- Cited source in text -> add 0.05 (max 1.0)

## DOCUMENT CONTEXT:
- Title: DOCUMENT_TITLE
- Document type: DOCUMENT_TYPE

## CANONICAL NAMING:
- Use consistent names (PostgreSQL not Postgres, MongoDB not Mongo)
- Use underscores for multi-word entities (RecencyLens, EigenTrust, mainstream_databases)

## FEW-SHOT EXAMPLE

**Input:** "Every mainstream database, from PostgreSQL to MongoDB to Neo4j, enforces single value per key."

**Output:**
{
  "claims": [
    {
      "subject": "PostgreSQL",
      "predicate": "storage_model",
      "object": { "type": "Text", "value": "single value per key" },
      "confidence": 0.95,
      "claim_type": "direct_assertion",
      "extraction_rationale": "Explicit statement about PostgreSQL",
      "entity_aliases": ["Postgres", "PG"]
    },
    {
      "subject": "MongoDB",
      "predicate": "storage_model",
      "object": { "type": "Text", "value": "single value per key" },
      "confidence": 0.95,
      "claim_type": "direct_assertion",
      "extraction_rationale": "Explicit statement about MongoDB",
      "entity_aliases": ["Mongo"]
    },
    {
      "subject": "Neo4j",
      "predicate": "storage_model",
      "object": { "type": "Text", "value": "single value per key" },
      "confidence": 0.95,
      "claim_type": "direct_assertion",
      "extraction_rationale": "Explicit statement about Neo4j",
      "entity_aliases": []
    },
    {
      "subject": "mainstream_databases",
      "predicate": "storage_model",
      "object": { "type": "Text", "value": "single value per key" },
      "confidence": 0.90,
      "claim_type": "direct_assertion",
      "extraction_rationale": "General claim about category",
      "entity_aliases": []
    },
    {
      "subject": "PostgreSQL",
      "predicate": "is_mainstream",
      "object": { "type": "Boolean", "value": true },
      "confidence": 0.85,
      "claim_type": "direct_assertion",
      "extraction_rationale": "Implicit: listed as mainstream example",
      "entity_aliases": ["Postgres", "PG"]
    },
    {
      "subject": "MongoDB",
      "predicate": "is_mainstream",
      "object": { "type": "Boolean", "value": true },
      "confidence": 0.85,
      "claim_type": "direct_assertion",
      "extraction_rationale": "Implicit: listed as mainstream example",
      "entity_aliases": ["Mongo"]
    },
    {
      "subject": "Neo4j",
      "predicate": "is_mainstream",
      "object": { "type": "Boolean", "value": true },
      "confidence": 0.85,
      "claim_type": "direct_assertion",
      "extraction_rationale": "Implicit: listed as mainstream example",
      "entity_aliases": []
    }
  ],
  "meta": { "total_claims": 7, "unique_subjects": 4 }
}

## OUTPUT FORMAT:
Return ONLY valid JSON matching this schema. No markdown, no explanation, just JSON.

{
  "claims": [{
    "subject": "SpecificEntityName",
    "predicate": "specific_relationship",
    "object": { "type": "Text|Number|Boolean|Reference", "value": "concrete_value" },
    "confidence": 0.0-1.0,
    "claim_type": "direct_assertion|cited_claim|definition|measurement",
    "extraction_rationale": "Why this claim was extracted (cite specific text)",
    "entity_aliases": ["other", "names"],
    "source_span": { "start": 0, "end": 10, "text": "exact quote" }
  }],
  "source": { "source_class": "SOURCE_CLASS" },
  "meta": {
    "total_claims": N,
    "unique_subjects": M,
    "extraction_notes": "Note if text was mostly hypothetical/illustrative"
  }
}

## TEXT TO ANALYZE:
Source class: SOURCE_CLASS

INPUT_TEXT

Return ONLY valid JSON. Extract ALL entities mentioned - not just the document's main topic. If text is entirely hypothetical/illustrative, return empty claims array with extraction_notes explaining why.`;

/**
 * Fills the placeholders of EXTRACTION_PROMPT.
 *
 * All placeholders are substituted in a single pass with a replacer function:
 * - a replacer function inserts values verbatim, whereas a replacement string
 *   would interpret `$&`, `$1`, `$'` ... inside the user's text;
 * - a single pass never rescans inserted text, so a title or document that
 *   happens to contain "INPUT_TEXT" cannot hijack a later substitution.
 */
function buildPrompt(
  text: string,
  sourceClass: SourceClass,
  context?: DocumentContext
): string {
  const substitutions: Record<string, string> = {
    SOURCE_CLASS: sourceClass,
    DOCUMENT_TITLE: context?.documentTitle || "(not provided)",
    DOCUMENT_TYPE: context?.documentType || "(not provided)",
    INPUT_TEXT: text,
  };
  return EXTRACTION_PROMPT.replace(
    /SOURCE_CLASS|DOCUMENT_TITLE|DOCUMENT_TYPE|INPUT_TEXT/g,
    (placeholder) => substitutions[placeholder] ?? placeholder
  );
}

/**
 * Reduces the raw stdout of `claude -p --output-format json` to the JSON text
 * the model produced: unwraps the CLI envelope, strips a markdown code fence,
 * and as a last resort slices out the outermost `{ ... }`.
 *
 * @throws Error when the CLI envelope reports an error.
 */
function extractJsonPayload(raw: string): string {
  let payload = raw.trim();

  // The output might be wrapped in a JSON response object ({ result: "..." })
  // or be a JSON-encoded string.
  try {
    const wrapped: unknown = JSON.parse(payload);
    if (typeof wrapped === "string") {
      payload = wrapped;
    } else if (isRecord(wrapped)) {
      // NOTE(review): `is_error` is assumed from the Claude CLI's JSON result
      // format — confirm against the CLI version in use.
      if (wrapped.is_error === true) {
        throw new Error(
          `Claude CLI reported an error: ${String(wrapped.result ?? "unknown")}`
        );
      }
      if (typeof wrapped.result === "string") {
        payload = wrapped.result;
      }
    }
  } catch (error: unknown) {
    // A SyntaxError only means the output was not wrapped; continue with it raw.
    if (!(error instanceof SyntaxError)) {
      throw error;
    }
  }
  payload = payload.trim();

  // Handle potential markdown code blocks in the response.
  if (payload.startsWith("```")) {
    const fenced = payload.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
    if (fenced?.[1] !== undefined) {
      payload = fenced[1];
    }
  }

  // Tolerate a sentence of preamble/epilogue around the JSON object.
  if (!payload.startsWith("{")) {
    const open = payload.indexOf("{");
    const close = payload.lastIndexOf("}");
    if (open !== -1 && close > open) {
      payload = payload.slice(open, close + 1);
    }
  }
  return payload;
}

/**
 * Validates one claim from the model output.
 *
 * @returns a well-formed claim, or null when a required field is missing or
 * has the wrong type. Confidence is clamped to [0, 1]; optional fields that
 * are malformed are dropped rather than failing the claim.
 */
function normalizeClaim(raw: unknown): ExtractedClaim | null {
  if (!isRecord(raw)) return null;

  const { subject, predicate, object, confidence } = raw;
  if (typeof subject !== "string" || subject.trim() === "") return null;
  if (typeof predicate !== "string" || predicate.trim() === "") return null;
  if (typeof confidence !== "number" || !Number.isFinite(confidence)) return null;
  if (!isRecord(object)) return null;

  const objectType = object.type;
  const objectValue = object.value;
  if (!isOneOf(OBJECT_TYPES, objectType)) return null;
  if (
    typeof objectValue !== "string" &&
    typeof objectValue !== "number" &&
    typeof objectValue !== "boolean"
  ) {
    return null;
  }

  const rationale = raw.extraction_rationale;
  const aliases = raw.entity_aliases;
  const claim: ExtractedClaim = {
    subject,
    predicate,
    object: { type: objectType, value: objectValue },
    confidence: Math.min(1, Math.max(0, confidence)),
    extraction_rationale: typeof rationale === "string" ? rationale : "",
    entity_aliases: Array.isArray(aliases)
      ? aliases.filter((alias): alias is string => typeof alias === "string")
      : [],
  };

  const claimType = raw.claim_type;
  if (isOneOf(CLAIM_TYPES, claimType)) {
    claim.claim_type = claimType;
  }

  const span = raw.source_span;
  if (isRecord(span)) {
    const { start, end, text } = span;
    if (
      typeof start === "number" &&
      typeof end === "number" &&
      typeof text === "string"
    ) {
      claim.source_span = { start, end, text };
    }
  }
  return claim;
}

/**
 * Turns parsed model output into a trustworthy ExtractionOutput.
 *
 * Malformed claims are skipped (with a warning on stderr), `meta` counts are
 * recomputed from the claims actually kept, and `source.source_class` is
 * always the caller's value, never the model's.
 *
 * @throws Error when the output is not an object or has no `claims` array.
 */
function normalizeExtraction(
  parsed: unknown,
  sourceClass: SourceClass
): ExtractionOutput {
  if (!isRecord(parsed)) {
    throw new Error("Claude output is not a JSON object");
  }
  const rawClaims = parsed.claims;
  if (!Array.isArray(rawClaims)) {
    throw new Error('Claude output is missing a "claims" array');
  }

  const claims = rawClaims
    .map(normalizeClaim)
    .filter((claim): claim is ExtractedClaim => claim !== null);
  const skipped = rawClaims.length - claims.length;
  if (skipped > 0) {
    console.error(`  Warning: skipped ${skipped} malformed claim(s) in Claude output`);
  }

  const output: ExtractionOutput = {
    claims,
    source: { source_class: sourceClass },
    meta: {
      total_claims: claims.length,
      unique_subjects: new Set(claims.map((claim) => claim.subject)).size,
    },
  };

  const rawSource = parsed.source;
  if (isRecord(rawSource) && typeof rawSource.url === "string") {
    output.source.url = rawSource.url;
  }
  const rawMeta = parsed.meta;
  if (isRecord(rawMeta) && typeof rawMeta.extraction_notes === "string") {
    output.meta.extraction_notes = rawMeta.extraction_notes;
  }
  return output;
}

/**
 * Runs the `claude` CLI on the extraction prompt and returns validated claims.
 *
 * @param text        prose to extract claims from
 * @param sourceClass source tier recorded on the result
 * @param context     optional document title/type inserted into the prompt
 * @throws Error when the CLI fails, or its output is not usable JSON.
 */
function callClaude(
  text: string,
  sourceClass: SourceClass,
  context?: DocumentContext
): ExtractionOutput {
  const prompt = buildPrompt(text, sourceClass, context);

  // Call claude CLI with -p (print mode) and --allowedTools none for safety.
  // The prompt goes through stdin, so no user text reaches the shell.
  const stdout = execSync(
    `claude -p --output-format json --allowedTools "" --model sonnet`,
    {
      input: prompt,
      encoding: "utf-8",
      maxBuffer: 10 * 1024 * 1024, // 10MB buffer
    }
  );

  const payload = extractJsonPayload(stdout);
  let parsed: unknown;
  try {
    parsed = JSON.parse(payload);
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Claude output is not valid JSON: ${reason}`);
  }
  return normalizeExtraction(parsed, sourceClass);
}

// ============================================================================
// StemeDB API
// ============================================================================

/**
 * Signs one claim and POSTs it to `/v1/assert`.
 *
 * @returns the assertion hash from the API, or null when the API rejected the
 * request or answered without a string `hash` (the reason is logged).
 */
async function submitAssertion(
  agent: Agent,
  claim: ExtractedClaim,
  sourceHash: string,
  sourceClass: SourceClass
): Promise<string | null> {
  const { signature, timestamp } = await signAssertion(
    agent,
    claim.subject,
    claim.predicate
  );

  const request = {
    subject: claim.subject,
    predicate: claim.predicate,
    object: claim.object,
    confidence: claim.confidence,
    source_hash: sourceHash,
    source_class: sourceClass,
    signatures: [
      {
        agent_id: toHex(agent.publicKey),
        signature,
        timestamp,
        version: 1,
      },
    ],
  };

  const response = await fetch(`${API_URL}/v1/assert`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(request),
  });

  if (!response.ok) {
    const text = await response.text();
    console.error(`  Failed to submit assertion: ${text}`);
    return null;
  }

  const data: unknown = await response.json();
  if (!isRecord(data) || typeof data.hash !== "string") {
    console.error("  Failed to submit assertion: response did not contain a hash");
    return null;
  }
  return data.hash;
}

/**
 * Uploads the source document (base64-encoded) to `/v1/sources/store`.
 *
 * @returns the source hash assigned by the API.
 * @throws Error when the request fails or the response has no string `hash`.
 */
async function storeSource(content: string): Promise<string> {
  const base64Content = Buffer.from(content).toString("base64");

  const response = await fetch(`${API_URL}/v1/sources/store`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      content: base64Content,
      content_type: "text/plain",
    }),
  });

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`Failed to store source: ${text}`);
  }

  const data: unknown = await response.json();
  if (!isRecord(data) || typeof data.hash !== "string") {
    throw new Error("Failed to store source: response did not contain a hash");
  }
  return data.hash;
}

// ============================================================================
// CLI
// ============================================================================

/**
 * Parses `process.argv` into CLIArgs.
 *
 * `--help` prints usage and exits. Unrecognized arguments are ignored with a
 * warning on stderr.
 *
 * @throws Error when an option is missing its value, or when `--source-class`
 * / `--document-type` is not one of the documented values.
 */
function parseArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = {
    sourceClass: "Expert",
    submit: false,
    dryRun: false,
    verbose: false,
  };

  let index = 0;
  // Consumes the argument following the current flag.
  const takeValue = (flag: string): string => {
    index += 1;
    const value = argv[index];
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  for (; index < argv.length; index++) {
    const arg = argv[index];
    switch (arg) {
      case "--text":
      case "-t":
        result.text = takeValue(arg);
        break;
      case "--file":
      case "-f":
        result.file = takeValue(arg);
        break;
      case "--stdin":
        result.stdin = true;
        break;
      case "--source-url":
      case "-u":
        result.sourceUrl = takeValue(arg);
        break;
      case "--source-class":
      case "-c": {
        const value = takeValue(arg);
        if (!isOneOf(SOURCE_CLASSES, value)) {
          throw new Error(
            `Invalid source class "${value}". Expected one of: ${SOURCE_CLASSES.join(", ")}`
          );
        }
        result.sourceClass = value;
        break;
      }
      case "--document-title":
        result.documentTitle = takeValue(arg);
        break;
      case "--document-type": {
        const value = takeValue(arg);
        if (!isOneOf(DOCUMENT_TYPES, value)) {
          throw new Error(
            `Invalid document type "${value}". Expected one of: ${DOCUMENT_TYPES.join(", ")}`
          );
        }
        result.documentType = value;
        break;
      }
      case "--submit":
      case "-s":
        result.submit = true;
        break;
      case "--dry-run":
      case "-d":
        result.dryRun = true;
        break;
      case "--verbose":
      case "-v":
        result.verbose = true;
        break;
      case "--help":
      case "-h":
        printHelp();
        process.exit(0);
        break;
      default:
        console.error(`Warning: ignoring unrecognized argument: ${arg}`);
    }
  }

  return result;
}

/** Prints CLI usage to stdout. */
function printHelp(): void {
  console.log(`
Entity-Level Claim Extraction CLI

USAGE:
    npx tsx scripts/extract-claims.ts [OPTIONS]

OPTIONS:
    -t, --text <text>           Text to extract claims from
    -f, --file <path>           File to read text from
        --stdin                 Read text from stdin
    -u, --source-url <url>      Source URL for provenance
    -c, --source-class <class>  Source tier (default: Expert)
                                One of: Regulatory, Clinical, Observational,
                                Expert, Community, Anecdotal
        --document-title <title>
                                Document title for context (helps reject hypotheticals)
        --document-type <type>  Document type for context
                                One of: technical_paper, news, regulatory,
                                documentation, blog, forum
    -s, --submit                Submit extracted claims to StemeDB API
    -d, --dry-run               Show what would be submitted without submitting
    -v, --verbose               Show detailed extraction output
    -h, --help                  Show this help message

EXAMPLES:
    # Extract from text and show claims
    npx tsx scripts/extract-claims.ts --text "PostgreSQL uses MVCC for concurrency" -v

    # Extract from a technical paper with context
    npx tsx scripts/extract-claims.ts --file paper.txt \\
        --document-title "StemeDB: A Claim-Oriented Database" \\
        --document-type technical_paper --source-class Expert

    # Dry run from stdin
    cat article.txt | npx tsx scripts/extract-claims.ts --stdin --dry-run

CLAIM QUALITY:
    The extractor rejects:
    - Hypotheticals ("Consider...", "Suppose...", "For example...")
    - Unspecified subjects ("a drug", "the system")
    - Generic truisms ("databases store data")
    - Illustrative scenarios

    Only claims with named entities and specific predicates are extracted.

ENVIRONMENT:
    STEMEDB_API_URL    API base URL (default: http://127.0.0.1:18180)

REQUIRES:
    claude CLI installed and authenticated (uses 'claude -p' for extraction)
`);
}

/**
 * Resolves the input text; precedence is --text, then --file, then --stdin.
 *
 * @throws Error when none of the three input options was given (an empty
 * `--text ""` counts as not given).
 */
async function readInput(args: CLIArgs): Promise<string> {
  if (args.text) {
    return args.text;
  }

  if (args.file) {
    return readFileSync(args.file, "utf-8");
  }

  if (args.stdin) {
    const chunks: Buffer[] = [];
    for await (const chunk of process.stdin) {
      // stdin yields strings instead of Buffers if an encoding has been set.
      chunks.push(typeof chunk === "string" ? Buffer.from(chunk) : chunk);
    }
    return Buffer.concat(chunks).toString("utf-8");
  }

  throw new Error("No input provided. Use --text, --file, or --stdin");
}

// ============================================================================
// Main
// ============================================================================

/**
 * CLI entry point: read input, extract claims via Claude, then either print
 * the extraction (default and --dry-run) or submit it to StemeDB (--submit).
 * --dry-run takes precedence over --submit.
 */
async function main(): Promise<void> {
  const args = parseArgs();

  // Read input text
  console.log("Reading input...");
  const inputText = await readInput(args);
  console.log(`  Input length: ${inputText.length} characters`);
  if (inputText.trim() === "") {
    throw new Error("Input is empty; nothing to extract");
  }

  // Build document context
  const context: DocumentContext | undefined =
    args.documentTitle || args.documentType
      ? {
          documentTitle: args.documentTitle,
          documentType: args.documentType,
        }
      : undefined;

  // Extract claims via Claude
  console.log("\nExtracting claims via Claude CLI...");
  if (context) {
    console.log(
      `  Document context: ${context.documentTitle || "(no title)"} [${context.documentType || "unknown"}]`
    );
  }
  const extraction = callClaude(inputText, args.sourceClass, context);

  console.log(
    `\nExtracted ${extraction.meta.total_claims} claims from ${extraction.meta.unique_subjects} unique subjects`
  );

  if (args.verbose) {
    console.log("\n--- Claims ---");
    for (const claim of extraction.claims) {
      console.log(`\n  ${claim.subject}/${claim.predicate}:`);
      console.log(`    Value: ${JSON.stringify(claim.object.value)}`);
      console.log(`    Type: ${claim.claim_type || "unspecified"}`);
      console.log(`    Confidence: ${claim.confidence.toFixed(2)}`);
      console.log(`    Rationale: ${claim.extraction_rationale}`);
      if (claim.entity_aliases.length > 0) {
        console.log(`    Aliases: ${claim.entity_aliases.join(", ")}`);
      }
    }
    console.log("\n--- End Claims ---");

    // Show extraction notes if any
    if (extraction.meta.extraction_notes) {
      console.log(`\nNotes: ${extraction.meta.extraction_notes}`);
    }
  }

  // Generate content hash
  extraction.source.content_hash = generateContentHash(inputText);
  if (args.sourceUrl) {
    extraction.source.url = args.sourceUrl;
  }

  // Dry run - just show JSON
  if (args.dryRun) {
    console.log("\n--- Dry Run Output ---");
    console.log(JSON.stringify(extraction, null, 2));
    return;
  }

  // Submit to API
  if (args.submit) {
    console.log("\nSubmitting to StemeDB API...");

    // Store source document first
    console.log("  Storing source document...");
    const sourceHash = await storeSource(inputText);
    console.log(`  Source hash: ${sourceHash}`);

    // Create agent
    const agent = await createAgent("extract-claims");
    console.log(`  Agent: ${toHex(agent.publicKey).slice(0, 16)}...`);

    // Submit each claim sequentially so log output stays in claim order.
    let submitted = 0;
    let failed = 0;
    for (const claim of extraction.claims) {
      const hash = await submitAssertion(
        agent,
        claim,
        sourceHash,
        args.sourceClass
      );
      if (hash) {
        submitted++;
        if (args.verbose) {
          console.log(
            `  + ${claim.subject}/${claim.predicate} -> ${hash.slice(0, 16)}...`
          );
        }
      } else {
        failed++;
      }
    }

    console.log(`\nSubmitted ${submitted} assertions (${failed} failed)`);
    if (failed > 0) {
      // Let callers (scripts, CI) detect partial failure.
      process.exitCode = 1;
    }
  } else {
    // Just output the extraction
    console.log("\n--- Extraction Output ---");
    console.log(JSON.stringify(extraction, null, 2));
    console.log(
      "\nUse --submit to send these claims to StemeDB, or --dry-run to preview."
    );
  }
}

main().catch((error: unknown) => {
  console.error("Error:", error instanceof Error ? error.message : error);
  process.exit(1);
});