stemedb/community/scripts/extract-claims.ts

#!/usr/bin/env npx tsx
/**
 * Entity-Level Claim Extraction CLI Tool
 *
 * Extracts atomic claims from prose text and optionally submits them to StemeDB.
 *
 * Usage:
 *   npx tsx scripts/extract-claims.ts --text "Your text here" --source-class Expert
 *   npx tsx scripts/extract-claims.ts --file article.txt --source-class Clinical --submit
 *   cat paper.txt | npx tsx scripts/extract-claims.ts --stdin --dry-run
 *
 * Environment:
 *   STEMEDB_API_URL - API base URL (default: http://127.0.0.1:18180)
 */

import * as ed from "@noble/ed25519";
import { sha512 } from "@noble/hashes/sha512";
import { readFileSync } from "fs";
import { execSync } from "child_process";

// Install the SHA-512 implementation from @noble/hashes as the synchronous
// hash hook of @noble/ed25519. The hook receives any number of byte chunks,
// which are concatenated before hashing.
ed.etc.sha512Sync = (...messages) => {
  const joined = ed.etc.concatBytes(...messages);
  return sha512(joined);
};

/** Base URL used when STEMEDB_API_URL is unset or empty. */
const DEFAULT_API_URL = "http://127.0.0.1:18180";

/** Base URL of the StemeDB HTTP API that the request helpers in this script call. */
const API_URL = process.env.STEMEDB_API_URL || DEFAULT_API_URL;

// ============================================================================
// Types
// ============================================================================

/**
 * Source tier attached to an extraction and to every submitted assertion.
 * Chosen on the command line with `--source-class`; parseArgs defaults it to
 * "Expert".
 */
type SourceClass =
  | "Regulatory"
  | "Clinical"
  | "Observational"
  | "Expert"
  | "Community"
  | "Anecdotal";

/** Tag naming the kind of value carried in a claim's object (see ObjectValue). */
type ObjectType = "Text" | "Number" | "Boolean" | "Reference";

/**
 * The object side of a claim: a type tag plus the value itself.
 * NOTE(review): nothing in this file checks that `value` actually matches
 * `type`; the pair is passed through as the model produced it.
 */
interface ObjectValue {
  /** Kind of value stored in `value`. */
  type: ObjectType;
  /** The concrete value asserted by the claim. */
  value: string | number | boolean;
}

/**
 * Location and quote of the text that backs a claim, as requested from the
 * model in the extraction prompt's output schema.
 * NOTE(review): `start`/`end` look like offsets into the input text — confirm
 * the unit (characters vs. bytes); nothing in this file reads them.
 */
interface SourceSpan {
  start: number;
  end: number;
  /** The quoted text ("exact quote" in the prompt schema). */
  text: string;
}

/**
 * How the source text presents a claim, using the categories defined in the
 * extraction prompt: stated as fact, cited from another source, a definition
 * of a term, or an empirical/quantitative result.
 */
type ClaimType = "direct_assertion" | "cited_claim" | "definition" | "measurement";

/** Kind of document being analysed; supplied with `--document-type` and inserted into the prompt as context. */
type DocumentType = "technical_paper" | "news" | "regulatory" | "documentation" | "blog" | "forum";

/**
 * Optional context about the document the text came from, passed to
 * callClaude so it can be written into the prompt.
 */
interface DocumentContext {
  /** Title of the document (`--document-title`). */
  documentTitle?: string;
  /** Declared for completeness; this file neither sets nor reads it. */
  sectionTitle?: string;
  /** Kind of document (`--document-type`). */
  documentType?: DocumentType;
}

/**
 * One atomic claim as returned by the model. The shape mirrors the per-claim
 * schema spelled out in the extraction prompt; the values are not validated
 * after parsing.
 */
interface ExtractedClaim {
  /** Entity the claim is about. */
  subject: string;
  /** Relationship being asserted about the subject. */
  predicate: string;
  /** Typed value of the claim. */
  object: ObjectValue;
  /** Model-assigned confidence; the prompt asks for a value in 0.0-1.0. */
  confidence: number;
  /** Model's explanation of why the claim was extracted. */
  extraction_rationale: string;
  /** Other names for the subject. */
  entity_aliases: string[];
  /** Quote backing the claim, when the model provides one. */
  source_span?: SourceSpan;
  /** How the text presents the claim, when the model provides it. */
  claim_type?: ClaimType;
}

/**
 * Complete result of one extraction run: the claims, provenance of the input,
 * and summary counts. This is the object printed as JSON by main.
 */
interface ExtractionOutput {
  claims: ExtractedClaim[];
  source: {
    /** Filled in by main from `--source-url` when given. */
    url?: string;
    /** Overwritten by callClaude with the caller-supplied source class. */
    source_class: SourceClass;
    /** Filled in by main with generateContentHash(inputText). */
    content_hash?: string;
  };
  meta: {
    /** Claim count as reported by the model (not recomputed from `claims`). */
    total_claims: number;
    /** Distinct-subject count as reported by the model. */
    unique_subjects: number;
    /** Free-form note from the model, e.g. when the text was mostly hypothetical. */
    extraction_notes?: string;
  };
}

/** Signing identity used when submitting assertions (built by createAgent). */
interface Agent {
  /** Label the key pair was derived from. */
  name: string;
  /** 32-byte ed25519 private key used by signAssertion. */
  privateKey: Uint8Array;
  /** Matching public key; sent hex-encoded as `agent_id`. */
  publicKey: Uint8Array;
}

/** Parsed command-line options, as produced by parseArgs. */
interface CLIArgs {
  /** Inline input text (`--text`, `-t`). */
  text?: string;
  /** Path of a file to read the input from (`--file`, `-f`). */
  file?: string;
  /** Read the input from standard input (`--stdin`). */
  stdin?: boolean;
  /** Source URL recorded for provenance (`--source-url`, `-u`). */
  sourceUrl?: string;
  /** Source tier (`--source-class`, `-c`); defaults to "Expert". */
  sourceClass: SourceClass;
  /** Document title passed to the prompt as context (`--document-title`). */
  documentTitle?: string;
  /** Document type passed to the prompt as context (`--document-type`). */
  documentType?: DocumentType;
  /** Submit extracted claims to the API (`--submit`, `-s`). */
  submit: boolean;
  /** Print the extraction JSON and stop; checked before `submit` (`--dry-run`, `-d`). */
  dryRun: boolean;
  /** Print per-claim details and per-submission results (`--verbose`, `-v`). */
  verbose: boolean;
}

// ============================================================================
// Helpers
// ============================================================================

/**
 * Encodes bytes as a lowercase hexadecimal string, two digits per byte.
 *
 * @param bytes - Bytes to encode.
 * @returns Hex string of length `2 * bytes.length` (empty for empty input).
 */
function toHex(bytes: Uint8Array): string {
  let hex = "";
  for (const byte of bytes) {
    // Pad so values below 0x10 still occupy two characters.
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}

/**
 * Folds a string into a deterministic 32-byte digest.
 *
 * Despite its name this is NOT the SHA-256 algorithm and offers no
 * cryptographic strength: each UTF-8 byte is XOR-ed into one slot and added
 * (mod 256) into the following slot, wrapping around the 32-byte buffer.
 * It is used in this file for seed derivation and content fingerprints.
 *
 * @param data - Text to digest (encoded as UTF-8).
 * @returns A 32-byte digest; all zeros for the empty string.
 */
function sha256(data: string): Uint8Array {
  const input = new TextEncoder().encode(data);
  const digest = new Uint8Array(32);

  input.forEach((byte, index) => {
    const slot = index % 32;
    const neighbour = (index + 1) % 32;
    digest[slot] ^= byte;
    digest[neighbour] = (digest[neighbour] + byte) % 256;
  });

  return digest;
}

/**
 * Produces the hex fingerprint recorded as `content_hash` for the input text.
 *
 * @param content - The text to fingerprint.
 * @returns 64 hex characters: the hex encoding of this file's `sha256` digest.
 */
function generateContentHash(content: string): string {
  const digest = sha256(content);
  return toHex(digest);
}

/**
 * Builds a deterministic signing identity for the given name.
 *
 * The private key is the 32-byte digest of a fixed prefix plus `name`, so the
 * same name always yields the same key pair. Because that digest comes from
 * this file's non-cryptographic `sha256` helper, the key is derivable by
 * anyone who knows the name.
 *
 * @param name - Label identifying the agent.
 * @returns The agent with its private and public ed25519 keys.
 */
async function createAgent(name: string): Promise<Agent> {
  const privateKey = sha256(`extract-claims-agent-${name}`);
  return {
    name,
    privateKey,
    publicKey: await ed.getPublicKeyAsync(privateKey),
  };
}

/**
 * Signs the `subject:predicate` pair of a claim with the agent's private key.
 *
 * The returned timestamp is the current Unix time in whole seconds, captured
 * before signing. It is returned alongside the signature but is not part of
 * the signed message.
 *
 * @param agent - Identity whose private key signs the message.
 * @param subject - Claim subject.
 * @param predicate - Claim predicate.
 * @returns Hex-encoded signature and the capture timestamp.
 */
async function signAssertion(
  agent: Agent,
  subject: string,
  predicate: string
): Promise<{ signature: string; timestamp: number }> {
  const issuedAt = Math.floor(Date.now() / 1000);
  const payload = new TextEncoder().encode(`${subject}:${predicate}`);
  const rawSignature = await ed.signAsync(payload, agent.privateKey);

  return {
    signature: toHex(rawSignature),
    timestamp: issuedAt,
  };
}

// ============================================================================
// Claude CLI
// ============================================================================

/**
 * Prompt template sent to the Claude CLI for claim extraction.
 *
 * The template contains four placeholder tokens that callClaude substitutes
 * before sending:
 *   - SOURCE_CLASS   (appears twice: in the output schema and above the text)
 *   - DOCUMENT_TITLE
 *   - DOCUMENT_TYPE
 *   - INPUT_TEXT
 *
 * The text is runtime data: editing any wording changes what the model is
 * asked to do.
 */
const EXTRACTION_PROMPT = `You are a precise claim extraction engine for StemeDB. Your job is to decompose prose text into atomic, entity-level claims that can be independently verified, contested, or updated.

## CRITICAL: ENTITY ENUMERATION PRINCIPLE

When a statement mentions multiple entities (explicitly or via category), extract a SEPARATE claim for EACH entity. Never collapse "all X" into a single claim.

A single sentence like "Every mainstream database, from PostgreSQL to MongoDB to Neo4j, enforces single value per key" contains **7 implicit claims**, not 1:
- PostgreSQL/storage_model -> "single value per key"
- MongoDB/storage_model -> "single value per key"
- Neo4j/storage_model -> "single value per key"
- mainstream_databases/storage_model -> "single value per key"
- PostgreSQL/is_mainstream -> true
- MongoDB/is_mainstream -> true
- Neo4j/is_mainstream -> true

**NEVER produce claims only about the document's main topic while ignoring other entities mentioned.**

## IMPLICIT CLAIMS

Extract implied relationships that the text assumes to be true:
- Category membership ("mainstream databases" implies each listed DB is mainstream)
- Temporal relationships ("before X, we did Y" implies Y predates X)
- Causal relationships ("X causes Y" implies correlation between X and Y)

## REJECTION PATTERNS (DO NOT extract claims from):
- Hypotheticals: "Consider...", "Suppose...", "Imagine...", "For example...", "What if..."
- Illustrative scenarios used to explain concepts
- Unspecified subjects: "a drug", "the system", "this database", "an agent"
- Generic truisms: "databases store data", "systems have users"
- Rhetorical questions or problems being described (not asserted)
- Future possibilities or proposals not yet implemented

## REQUIREMENTS for every claim:
- Subject: MUST be a proper noun or specific technical term (PostgreSQL, Semaglutide, RecencyLens)
  - NOT acceptable: "a drug", "the database", "this", "it", "the system"
- Predicate: MUST be a specific measurable/verifiable relationship
  - NOT acceptable: "is_related_to", "involves", "has_something"
- Object: MUST be a concrete value, number, or named entity
  - NOT acceptable: "good", "various", "some", "many"

## CLAIM TYPES (include for each claim):
- "direct_assertion": Author states as fact ("StemeDB uses BLAKE3")
- "cited_claim": Author cites another source ("Shapiro et al. showed...")
- "definition": Defining a term ("A Lens is a function that...")
- "measurement": Empirical/quantitative result ("RecencyLens is O(n)")

## CONFIDENCE SCORING:
| Factor | Base Confidence |
|--------|-----------------|
| Explicit statement | 0.95 |
| Strong implication | 0.85 |
| Weak implication | 0.70 |
| Speculation | 0.50 |

Modifiers:
- Hedge words ("may", "might", "could") -> multiply by 0.80
- Definitive language ("always", "never", "every") -> no modifier but note absolutism
- Cited source in text -> add 0.05 (max 1.0)

## DOCUMENT CONTEXT:
- Title: DOCUMENT_TITLE
- Document type: DOCUMENT_TYPE

## CANONICAL NAMING:
- Use consistent names (PostgreSQL not Postgres, MongoDB not Mongo)
- Use underscores for multi-word entities (RecencyLens, EigenTrust, mainstream_databases)

## FEW-SHOT EXAMPLE

**Input:** "Every mainstream database, from PostgreSQL to MongoDB to Neo4j, enforces single value per key."

**Output:**
{
  "claims": [
    { "subject": "PostgreSQL", "predicate": "storage_model", "object": { "type": "Text", "value": "single value per key" }, "confidence": 0.95, "claim_type": "direct_assertion", "extraction_rationale": "Explicit statement about PostgreSQL", "entity_aliases": ["Postgres", "PG"] },
    { "subject": "MongoDB", "predicate": "storage_model", "object": { "type": "Text", "value": "single value per key" }, "confidence": 0.95, "claim_type": "direct_assertion", "extraction_rationale": "Explicit statement about MongoDB", "entity_aliases": ["Mongo"] },
    { "subject": "Neo4j", "predicate": "storage_model", "object": { "type": "Text", "value": "single value per key" }, "confidence": 0.95, "claim_type": "direct_assertion", "extraction_rationale": "Explicit statement about Neo4j", "entity_aliases": [] },
    { "subject": "mainstream_databases", "predicate": "storage_model", "object": { "type": "Text", "value": "single value per key" }, "confidence": 0.90, "claim_type": "direct_assertion", "extraction_rationale": "General claim about category", "entity_aliases": [] },
    { "subject": "PostgreSQL", "predicate": "is_mainstream", "object": { "type": "Boolean", "value": true }, "confidence": 0.85, "claim_type": "direct_assertion", "extraction_rationale": "Implicit: listed as mainstream example", "entity_aliases": ["Postgres", "PG"] },
    { "subject": "MongoDB", "predicate": "is_mainstream", "object": { "type": "Boolean", "value": true }, "confidence": 0.85, "claim_type": "direct_assertion", "extraction_rationale": "Implicit: listed as mainstream example", "entity_aliases": ["Mongo"] },
    { "subject": "Neo4j", "predicate": "is_mainstream", "object": { "type": "Boolean", "value": true }, "confidence": 0.85, "claim_type": "direct_assertion", "extraction_rationale": "Implicit: listed as mainstream example", "entity_aliases": [] }
  ],
  "meta": { "total_claims": 7, "unique_subjects": 4 }
}

## OUTPUT FORMAT:
Return ONLY valid JSON matching this schema. No markdown, no explanation, just JSON.

{
  "claims": [{
    "subject": "SpecificEntityName",
    "predicate": "specific_relationship",
    "object": { "type": "Text|Number|Boolean|Reference", "value": "concrete_value" },
    "confidence": 0.0-1.0,
    "claim_type": "direct_assertion|cited_claim|definition|measurement",
    "extraction_rationale": "Why this claim was extracted (cite specific text)",
    "entity_aliases": ["other", "names"],
    "source_span": { "start": 0, "end": 10, "text": "exact quote" }
  }],
  "source": { "source_class": "SOURCE_CLASS" },
  "meta": {
    "total_claims": N,
    "unique_subjects": M,
    "extraction_notes": "Note if text was mostly hypothetical/illustrative"
  }
}

## TEXT TO ANALYZE:
Source class: SOURCE_CLASS

INPUT_TEXT

Return ONLY valid JSON. Extract ALL entities mentioned - not just the document's main topic. If text is entirely hypothetical/illustrative, return empty claims array with extraction_notes explaining why.`;

/**
 * Fills the placeholders of EXTRACTION_PROMPT in one pass.
 *
 * A replacer function is used so that `$&`, `$$`, `` $` `` and `$'` inside the
 * input text or title are inserted literally instead of being interpreted as
 * replacement patterns. Substituting in a single pass also means inserted
 * values are never re-scanned, so a title or text that happens to contain a
 * placeholder token cannot hijack a later substitution.
 */
function buildExtractionPrompt(
  text: string,
  sourceClass: SourceClass,
  context?: DocumentContext
): string {
  const substitutions: Record<string, string> = {
    SOURCE_CLASS: sourceClass,
    DOCUMENT_TITLE: context?.documentTitle || "(not provided)",
    DOCUMENT_TYPE: context?.documentType || "(not provided)",
    INPUT_TEXT: text,
  };

  return EXTRACTION_PROMPT.replace(
    /SOURCE_CLASS|DOCUMENT_TITLE|DOCUMENT_TYPE|INPUT_TEXT/g,
    (placeholder) => substitutions[placeholder] ?? placeholder
  );
}

/**
 * Recovers the extraction JSON from raw `claude -p --output-format json`
 * output.
 *
 * Handles, in order: an outer JSON envelope whose `result` field carries the
 * model's answer (as a string or an already-parsed object), a bare JSON
 * document, a fenced code block anywhere in the answer, and JSON surrounded
 * by prose (first `{` to last `}`).
 *
 * @throws SyntaxError when no candidate parses as JSON.
 */
function parseClaudeJson(raw: string): unknown {
  let payload = raw.trim();

  try {
    const envelope: unknown = JSON.parse(payload);
    if (typeof envelope === "string") {
      payload = envelope;
    } else if (typeof envelope === "object" && envelope !== null) {
      const inner = (envelope as { result?: unknown }).result;
      if (typeof inner === "string" && inner.length > 0) {
        payload = inner;
      } else if (typeof inner === "object" && inner !== null) {
        return inner;
      } else {
        // No usable `result`: the output already is the document itself.
        return envelope;
      }
    }
  } catch {
    // Not JSON at the top level; treat the raw output as the model's answer.
  }

  payload = payload.trim();

  // Try the whole payload first so valid JSON that merely contains "```"
  // inside a string value is never cut apart by the fence matcher.
  const candidates: string[] = [payload];
  const fenced = payload.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
  if (fenced?.[1] !== undefined) {
    candidates.push(fenced[1]);
  }
  const firstBrace = payload.indexOf("{");
  const lastBrace = payload.lastIndexOf("}");
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    candidates.push(payload.slice(firstBrace, lastBrace + 1));
  }

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch {
      // Fall through to the next, more lenient candidate.
    }
  }

  throw new SyntaxError(
    `Claude response did not contain parseable JSON (starts with: ${JSON.stringify(payload.slice(0, 200))})`
  );
}

/**
 * Runs the Claude CLI on the extraction prompt and returns the parsed result.
 *
 * The call is synchronous and blocks until the CLI exits. The returned
 * object's `source.source_class` is always set to `sourceClass`, regardless
 * of what the model echoed back. A missing `source` or `meta` object is
 * created; missing counts are derived from the returned claims.
 *
 * @param text - Prose to extract claims from.
 * @param sourceClass - Source tier written into the prompt and the result.
 * @param context - Optional document title/type written into the prompt.
 * @returns The extraction result.
 * @throws Error when the CLI fails, the response holds no parseable JSON
 *   (SyntaxError), or the JSON has no `claims` array.
 */
function callClaude(
  text: string,
  sourceClass: SourceClass,
  context?: DocumentContext
): ExtractionOutput {
  const prompt = buildExtractionPrompt(text, sourceClass, context);

  // Call claude CLI with -p (print mode) and --allowedTools none for safety
  const result = execSync(
    `claude -p --output-format json --allowedTools "" --model sonnet`,
    {
      input: prompt,
      encoding: "utf-8",
      maxBuffer: 10 * 1024 * 1024, // 10MB buffer
    }
  );

  const parsed = parseClaudeJson(result);
  if (
    typeof parsed !== "object" ||
    parsed === null ||
    !Array.isArray((parsed as { claims?: unknown }).claims)
  ) {
    throw new Error('Claude response JSON has no "claims" array');
  }

  // Shape was checked above; individual claim fields remain unvalidated.
  const output = parsed as ExtractionOutput;

  // The model may omit `source`/`meta`; build them instead of crashing later.
  output.source = { ...output.source, source_class: sourceClass };
  output.meta = {
    ...output.meta,
    total_claims: output.meta?.total_claims ?? output.claims.length,
    unique_subjects:
      output.meta?.unique_subjects ??
      new Set(output.claims.map((claim) => claim.subject)).size,
  };

  return output;
}

// ============================================================================
// StemeDB API
// ============================================================================

/**
 * Signs one claim and POSTs it to `${API_URL}/v1/assert`.
 *
 * @param agent - Identity used to sign; its public key is sent as `agent_id`.
 * @param claim - The claim to submit.
 * @param sourceHash - Hash of the stored source document.
 * @param sourceClass - Source tier recorded with the assertion.
 * @returns The `hash` field of the JSON response, or `null` when the API
 *   answers with a non-OK status (the response body is logged to stderr).
 */
async function submitAssertion(
  agent: Agent,
  claim: ExtractedClaim,
  sourceHash: string,
  sourceClass: SourceClass
): Promise<string | null> {
  const signed = await signAssertion(agent, claim.subject, claim.predicate);

  const payload = {
    subject: claim.subject,
    predicate: claim.predicate,
    object: claim.object,
    confidence: claim.confidence,
    source_hash: sourceHash,
    source_class: sourceClass,
    signatures: [
      {
        agent_id: toHex(agent.publicKey),
        signature: signed.signature,
        timestamp: signed.timestamp,
        version: 1,
      },
    ],
  };

  const response = await fetch(`${API_URL}/v1/assert`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });

  if (response.ok) {
    const body = await response.json();
    return body.hash;
  }

  // Rejected by the API: report the reason and let the caller count a failure.
  const reason = await response.text();
  console.error(`  Failed to submit assertion: ${reason}`);
  return null;
}

/**
 * Uploads the source text to `${API_URL}/v1/sources/store`.
 *
 * The content is sent base64-encoded with content type `text/plain`.
 *
 * @param content - The source document text.
 * @returns The `hash` field of the JSON response.
 * @throws Error carrying the response body when the API answers with a
 *   non-OK status.
 */
async function storeSource(content: string): Promise<string> {
  const payload = {
    content: Buffer.from(content).toString("base64"),
    content_type: "text/plain",
  };

  const response = await fetch(`${API_URL}/v1/sources/store`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });

  if (response.ok) {
    const body = await response.json();
    return body.hash;
  }

  const reason = await response.text();
  throw new Error(`Failed to store source: ${reason}`);
}

// ============================================================================
// CLI
// ============================================================================

/** Values accepted by `--source-class`; mirrors the SourceClass union. */
const VALID_SOURCE_CLASSES: readonly SourceClass[] = [
  "Regulatory",
  "Clinical",
  "Observational",
  "Expert",
  "Community",
  "Anecdotal",
];

/** Values accepted by `--document-type`; mirrors the DocumentType union. */
const VALID_DOCUMENT_TYPES: readonly DocumentType[] = [
  "technical_paper",
  "news",
  "regulatory",
  "documentation",
  "blog",
  "forum",
];

/**
 * Parses `process.argv` into CLIArgs.
 *
 * Value-taking flags must be followed by a value. `--source-class` and
 * `--document-type` are checked against their allowed values
 * (case-insensitively) and normalised to the canonical spelling, because a
 * type assertion alone performs no runtime check. Unrecognised arguments are
 * reported on stderr and otherwise ignored. `--help` prints the usage text
 * and exits with status 0.
 *
 * @returns The parsed options, with defaults for anything not given.
 * @throws Error when a flag is missing its value or a choice is not one of
 *   the allowed values.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    sourceClass: "Expert",
    submit: false,
    dryRun: false,
    verbose: false,
  };

  // Returns the value following a flag, or fails if the flag came last.
  function requireValue(flag: string, value: string | undefined): string {
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  }

  // Like requireValue, but the value must be one of `allowed`.
  function requireChoice<T extends string>(
    flag: string,
    value: string | undefined,
    allowed: readonly T[]
  ): T {
    const given = requireValue(flag, value);
    const match = allowed.find(
      (choice) => choice.toLowerCase() === given.toLowerCase()
    );
    if (match === undefined) {
      throw new Error(
        `Invalid value "${given}" for ${flag}. Expected one of: ${allowed.join(", ")}`
      );
    }
    return match;
  }

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case "--text":
      case "-t":
        result.text = requireValue(arg, args[++i]);
        break;
      case "--file":
      case "-f":
        result.file = requireValue(arg, args[++i]);
        break;
      case "--stdin":
        result.stdin = true;
        break;
      case "--source-url":
      case "-u":
        result.sourceUrl = requireValue(arg, args[++i]);
        break;
      case "--source-class":
      case "-c":
        result.sourceClass = requireChoice(arg, args[++i], VALID_SOURCE_CLASSES);
        break;
      case "--document-title":
        result.documentTitle = requireValue(arg, args[++i]);
        break;
      case "--document-type":
        result.documentType = requireChoice(arg, args[++i], VALID_DOCUMENT_TYPES);
        break;
      case "--submit":
      case "-s":
        result.submit = true;
        break;
      case "--dry-run":
      case "-d":
        result.dryRun = true;
        break;
      case "--verbose":
      case "-v":
        result.verbose = true;
        break;
      case "--help":
      case "-h":
        printHelp();
        process.exit(0);
        break;
      default:
        // A mistyped flag (e.g. "--sumbit") would otherwise be dropped silently.
        console.error(`Warning: ignoring unrecognized argument "${arg}"`);
    }
  }

  return result;
}

/**
 * Writes the usage text (options, examples, claim-quality notes, environment
 * and requirements) to stdout in a single console.log call.
 */
function printHelp(): void {
  const helpText = `
Entity-Level Claim Extraction CLI

USAGE:
  npx tsx scripts/extract-claims.ts [OPTIONS]

OPTIONS:
  -t, --text <text>       Text to extract claims from
  -f, --file <path>       File to read text from
      --stdin             Read text from stdin
  -u, --source-url <url>  Source URL for provenance
  -c, --source-class      Source tier (default: Expert)
                          One of: Regulatory, Clinical, Observational,
                                  Expert, Community, Anecdotal
      --document-title    Document title for context (helps reject hypotheticals)
      --document-type     Document type for context
                          One of: technical_paper, news, regulatory,
                                  documentation, blog, forum
  -s, --submit            Submit extracted claims to StemeDB API
  -d, --dry-run           Show what would be submitted without submitting
  -v, --verbose           Show detailed extraction output
  -h, --help              Show this help message

EXAMPLES:
  # Extract from text and show claims
  npx tsx scripts/extract-claims.ts --text "PostgreSQL uses MVCC for concurrency" -v

  # Extract from a technical paper with context
  npx tsx scripts/extract-claims.ts --file paper.txt \\
    --document-title "StemeDB: A Claim-Oriented Database" \\
    --document-type technical_paper --source-class Expert

  # Dry run from stdin
  cat article.txt | npx tsx scripts/extract-claims.ts --stdin --dry-run

CLAIM QUALITY:
  The extractor rejects:
  - Hypotheticals ("Consider...", "Suppose...", "For example...")
  - Unspecified subjects ("a drug", "the system")
  - Generic truisms ("databases store data")
  - Illustrative scenarios

  Only claims with named entities and specific predicates are extracted.

ENVIRONMENT:
  STEMEDB_API_URL      API base URL (default: http://127.0.0.1:18180)

REQUIRES:
  claude CLI installed and authenticated (uses 'claude -p' for extraction)
`;

  console.log(helpText);
}

/**
 * Resolves the text to analyse from the first source that is set, in the
 * order: inline `--text`, `--file` (read as UTF-8), then `--stdin` (read to
 * end and decoded as UTF-8).
 *
 * @param args - Parsed command-line options.
 * @returns The input text.
 * @throws Error when none of the three sources is set; file-read errors from
 *   readFileSync propagate unchanged.
 */
async function readInput(args: CLIArgs): Promise<string> {
  const { text, file, stdin } = args;

  if (text) {
    return text;
  }
  if (file) {
    return readFileSync(file, "utf-8");
  }
  if (!stdin) {
    throw new Error("No input provided. Use --text, --file, or --stdin");
  }

  // Drain stdin fully before decoding so multi-byte characters that straddle
  // chunk boundaries are decoded correctly.
  const pieces: Buffer[] = [];
  for await (const piece of process.stdin) {
    pieces.push(piece);
  }
  return Buffer.concat(pieces).toString("utf-8");
}

// ============================================================================
// Main
// ============================================================================

/**
 * Prints one readable entry per extracted claim, then any extraction notes.
 * This is the extra output enabled by `--verbose`.
 */
function printClaimDetails(extraction: ExtractionOutput): void {
  console.log("\n--- Claims ---");
  for (const claim of extraction.claims) {
    console.log(`\n  ${claim.subject}/${claim.predicate}:`);
    console.log(`    Value: ${JSON.stringify(claim.object.value)}`);
    console.log(`    Type: ${claim.claim_type || "unspecified"}`);
    console.log(`    Confidence: ${claim.confidence.toFixed(2)}`);
    console.log(`    Rationale: ${claim.extraction_rationale}`);
    if (claim.entity_aliases.length > 0) {
      console.log(`    Aliases: ${claim.entity_aliases.join(", ")}`);
    }
  }
  console.log("\n--- End Claims ---");

  // Show extraction notes if any
  const notes = extraction.meta.extraction_notes;
  if (notes) {
    console.log(`\nNotes: ${notes}`);
  }
}

/**
 * Stores the source document, then submits every claim one after another and
 * prints a submitted/failed summary.
 */
async function submitClaims(
  extraction: ExtractionOutput,
  inputText: string,
  args: CLIArgs
): Promise<void> {
  console.log("\nSubmitting to StemeDB API...");

  // The source document must exist first: each assertion references its hash.
  console.log("  Storing source document...");
  const sourceHash = await storeSource(inputText);
  console.log(`  Source hash: ${sourceHash}`);

  const agent = await createAgent("extract-claims");
  console.log(`  Agent: ${toHex(agent.publicKey).slice(0, 16)}...`);

  const tally = { submitted: 0, failed: 0 };
  for (const claim of extraction.claims) {
    const hash = await submitAssertion(agent, claim, sourceHash, args.sourceClass);
    if (!hash) {
      tally.failed++;
      continue;
    }

    tally.submitted++;
    if (args.verbose) {
      console.log(`  + ${claim.subject}/${claim.predicate} -> ${hash.slice(0, 16)}...`);
    }
  }

  console.log(`\nSubmitted ${tally.submitted} assertions (${tally.failed} failed)`);
}

/**
 * CLI entry point: reads the input, extracts claims through the Claude CLI,
 * then either prints the result (default and `--dry-run`) or submits it to
 * the StemeDB API (`--submit`). `--dry-run` takes precedence over `--submit`.
 */
async function main(): Promise<void> {
  const args = parseArgs();

  // Read input text
  console.log("Reading input...");
  const inputText = await readInput(args);
  console.log(`  Input length: ${inputText.length} characters`);

  // Only build a context object when at least one context option was given.
  let context: DocumentContext | undefined;
  if (args.documentTitle || args.documentType) {
    context = {
      documentTitle: args.documentTitle,
      documentType: args.documentType,
    };
  }

  // Extract claims via Claude
  console.log("\nExtracting claims via Claude CLI...");
  if (context) {
    const title = context.documentTitle || "(no title)";
    const kind = context.documentType || "unknown";
    console.log(`  Document context: ${title} [${kind}]`);
  }
  const extraction = callClaude(inputText, args.sourceClass, context);

  const { total_claims, unique_subjects } = extraction.meta;
  console.log(`\nExtracted ${total_claims} claims from ${unique_subjects} unique subjects`);

  if (args.verbose) {
    printClaimDetails(extraction);
  }

  // Attach provenance before the result is printed or submitted.
  extraction.source.content_hash = generateContentHash(inputText);
  if (args.sourceUrl) {
    extraction.source.url = args.sourceUrl;
  }

  // Dry run - just show JSON
  if (args.dryRun) {
    console.log("\n--- Dry Run Output ---");
    console.log(JSON.stringify(extraction, null, 2));
    return;
  }

  // Without --submit, print the extraction and a hint, then stop.
  if (!args.submit) {
    console.log("\n--- Extraction Output ---");
    console.log(JSON.stringify(extraction, null, 2));
    console.log("\nUse --submit to send these claims to StemeDB, or --dry-run to preview.");
    return;
  }

  await submitClaims(extraction, inputText, args);
}

// Top-level failure handler: report the reason on stderr and exit non-zero.
// The rejection value is narrowed from `unknown` because anything can be
// thrown; reading `.message` on a non-Error would print "Error: undefined".
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error("Error:", message);
  process.exit(1);
});