stemedb/community/scripts/extract-claims.ts
jordan 1cc453c97b feat: Aphoria policy source tracking + claim extraction pipeline
- Add PolicySourceStore for tracking where policies come from
- Implement claim extraction skill and API endpoints
- Add community UI text selection extractor component
- Create Go SDK aphoria client for policy operations
- Document patent specifications and legal disclosures
- Add guides: golden path loop, policy audit trails, pre-flight checks
- Expand Unreal Engine config extractor with source tracking
- Add UAT reports for policy source tracking validation
- Refactor tests.rs into modular test files

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 02:35:02 -07:00

590 lines
17 KiB
TypeScript

#!/usr/bin/env npx tsx
/**
* Entity-Level Claim Extraction CLI Tool
*
* Extracts atomic claims from prose text and optionally submits them to StemeDB.
*
* Usage:
* npx tsx scripts/extract-claims.ts --text "Your text here" --source-class Expert
* npx tsx scripts/extract-claims.ts --file article.txt --source-class Clinical --submit
* cat paper.txt | npx tsx scripts/extract-claims.ts --stdin --dry-run
*
* Environment:
* STEMEDB_API_URL - API base URL (default: http://127.0.0.1:18180)
*/
import * as ed from "@noble/ed25519";
import { sha512 } from "@noble/hashes/sha512";
import { readFileSync } from "fs";
import { execSync } from "child_process";
// Configure ed25519 to use sha512: install the SHA-512 implementation from
// @noble/hashes as the library's `etc.sha512Sync` hook. The hook receives
// several byte chunks, which are concatenated before hashing.
ed.etc.sha512Sync = (...m) => sha512(ed.etc.concatBytes(...m));
// Base URL used for every StemeDB API request in this script. The
// STEMEDB_API_URL environment variable overrides the local default; because
// `||` is used, an empty value also falls back to the default.
const API_URL = process.env.STEMEDB_API_URL || "http://127.0.0.1:18180";
// ============================================================================
// Types
// ============================================================================
/** Source tier of the analyzed text; chosen on the command line with `--source-class`. */
type SourceClass =
| "Regulatory"
| "Clinical"
| "Observational"
| "Expert"
| "Community"
| "Anecdotal";
/** Kind tag for a claim's object value, as requested in the extraction prompt's schema. */
type ObjectType = "Text" | "Number" | "Boolean" | "Reference";
/** A claim's object: a kind tag plus the value itself. */
interface ObjectValue {
type: ObjectType;
value: string | number | boolean;
}
/**
 * Location of the text a claim was extracted from, as reported by the model.
 * NOTE(review): the unit and base of `start`/`end` are not established in this
 * file; the prompt only shows an example ({ start: 0, end: 10 }) — confirm.
 */
interface SourceSpan {
start: number;
end: number;
text: string;
}
/** How the author presents a claim; the categories are described in the extraction prompt. */
type ClaimType = "direct_assertion" | "cited_claim" | "definition" | "measurement";
/** Kind of document being analyzed; chosen on the command line with `--document-type`. */
type DocumentType = "technical_paper" | "news" | "regulatory" | "documentation" | "blog" | "forum";
/**
 * Optional hints about the document that are inserted into the prompt.
 * `sectionTitle` is declared here but main() only populates the title and type.
 */
interface DocumentContext {
documentTitle?: string;
sectionTitle?: string;
documentType?: DocumentType;
}
/** One claim in the shape the extraction prompt asks the model to return. */
interface ExtractedClaim {
subject: string;
predicate: string;
object: ObjectValue;
// The prompt asks for a value between 0.0 and 1.0.
confidence: number;
extraction_rationale: string;
entity_aliases: string[];
source_span?: SourceSpan;
claim_type?: ClaimType;
}
/** Complete extraction result: the claims plus source and summary metadata. */
interface ExtractionOutput {
claims: ExtractedClaim[];
source: {
// Set by main() from --source-url when provided.
url?: string;
// Overwritten by callClaude with the class requested by the caller.
source_class: SourceClass;
// Set by main() from generateContentHash(inputText).
content_hash?: string;
};
// Counts and notes as reported by the model in its JSON response.
meta: {
total_claims: number;
unique_subjects: number;
extraction_notes?: string;
};
}
/** Signing identity: a display name and an ed25519 key pair. */
interface Agent {
name: string;
privateKey: Uint8Array;
publicKey: Uint8Array;
}
/** Parsed command-line options; see printHelp() for the corresponding flags. */
interface CLIArgs {
// Input sources; readInput() checks them in the order text, file, stdin.
text?: string;
file?: string;
stdin?: boolean;
sourceUrl?: string;
sourceClass: SourceClass;
documentTitle?: string;
documentType?: DocumentType;
submit: boolean;
dryRun: boolean;
verbose: boolean;
}
// ============================================================================
// Helpers
// ============================================================================
/**
 * Encodes bytes as a lowercase hexadecimal string, two digits per byte.
 *
 * @param bytes - Bytes to encode.
 * @returns Hex string of length `2 * bytes.length` (empty for empty input).
 */
function toHex(bytes: Uint8Array): string {
  let hex = "";
  for (const byte of bytes) {
    // padStart keeps the leading zero for values below 0x10.
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
/**
 * Folds the UTF-8 bytes of a string into a deterministic 32-byte digest.
 *
 * NOTE: despite its name this is NOT the SHA-256 algorithm. It is a simple
 * XOR/add mixing loop and offers no collision or preimage resistance; it only
 * guarantees that the same input always yields the same 32 bytes.
 *
 * @param data - Text to digest.
 * @returns A 32-byte array (all zeros for the empty string).
 */
function sha256(data: string): Uint8Array {
  const input = new TextEncoder().encode(data);
  const digest = new Uint8Array(32);
  input.forEach((byte, index) => {
    const slot = index % 32;
    const neighbour = (index + 1) % 32;
    // XOR the byte into its own slot, then add it (mod 256) to the next slot.
    digest[slot] ^= byte;
    digest[neighbour] = (digest[neighbour] + byte) % 256;
  });
  return digest;
}
/**
 * Produces the hex-encoded digest of a document's text using this file's
 * `sha256` helper.
 *
 * @param content - Full text of the source document.
 * @returns 64-character lowercase hex string.
 */
function generateContentHash(content: string): string {
  const digest = sha256(content);
  return toHex(digest);
}
/**
 * Builds a signing agent whose ed25519 key pair is derived from its name.
 *
 * The private key is the 32-byte digest of `extract-claims-agent-<name>`, so
 * the same name always yields the same key pair — and anyone who knows the
 * name can re-derive the private key.
 *
 * @param name - Agent name used both as label and as key seed.
 * @returns The agent with its private and public key.
 */
async function createAgent(name: string): Promise<Agent> {
  const privateKey = sha256(`extract-claims-agent-${name}`);
  return {
    name,
    privateKey,
    publicKey: await ed.getPublicKeyAsync(privateKey),
  };
}
/**
 * Signs the `subject:predicate` pair of a claim with the agent's private key.
 *
 * Only the subject and predicate are covered by the signature; the returned
 * timestamp (Unix seconds at call time) is not part of the signed message.
 *
 * @param agent - Agent whose private key signs the message.
 * @param subject - Claim subject.
 * @param predicate - Claim predicate.
 * @returns Hex-encoded signature and the timestamp taken before signing.
 */
async function signAssertion(
  agent: Agent,
  subject: string,
  predicate: string
): Promise<{ signature: string; timestamp: number }> {
  const issuedAt = Math.floor(Date.now() / 1000);
  const payload = new TextEncoder().encode(`${subject}:${predicate}`);
  const rawSignature = await ed.signAsync(payload, agent.privateKey);
  return { signature: toHex(rawSignature), timestamp: issuedAt };
}
// ============================================================================
// Claude CLI
// ============================================================================
// Prompt template sent to the Claude CLI. The upper-case tokens SOURCE_CLASS,
// DOCUMENT_TITLE, DOCUMENT_TYPE and INPUT_TEXT are placeholders that
// callClaude() fills in before the prompt is sent. Everything else is passed
// to the model verbatim, so edits to this text change extraction behavior.
const EXTRACTION_PROMPT = `You are a precise claim extraction engine for StemeDB. Extract ONLY direct factual assertions.
## REJECTION PATTERNS (DO NOT extract claims from):
- Hypotheticals: "Consider...", "Suppose...", "Imagine...", "For example...", "What if..."
- Illustrative scenarios used to explain concepts
- Unspecified subjects: "a drug", "the system", "this database", "an agent"
- Generic truisms: "databases store data", "systems have users"
- Rhetorical questions or problems being described (not asserted)
- Future possibilities or proposals not yet implemented
## REQUIREMENTS for every claim:
- Subject: MUST be a proper noun or specific technical term (PostgreSQL, Semaglutide, RecencyLens)
- NOT acceptable: "a drug", "the database", "this", "it", "the system"
- Predicate: MUST be a specific measurable/verifiable relationship
- NOT acceptable: "is_related_to", "involves", "has_something"
- Object: MUST be a concrete value, number, or named entity
- NOT acceptable: "good", "various", "some", "many"
## CLAIM TYPES (include for each claim):
- "direct_assertion": Author states as fact ("StemeDB uses BLAKE3")
- "cited_claim": Author cites another source ("Shapiro et al. showed...")
- "definition": Defining a term ("A Lens is a function that...")
- "measurement": Empirical/quantitative result ("RecencyLens is O(n)")
## CONFIDENCE SCORING:
- Direct assertion with specific named entities: 0.90-0.95
- Implied from technical description: 0.80-0.85
- Hedged statement (may, might, could): 0.60-0.70
- Hypothetical example: DO NOT EXTRACT (confidence = 0)
## DOCUMENT CONTEXT:
- Title: DOCUMENT_TITLE
- Document type: DOCUMENT_TYPE
## CANONICAL NAMING:
- Use consistent names (PostgreSQL not Postgres, MongoDB not Mongo)
- Use underscores for multi-word entities (RecencyLens, EigenTrust)
## OUTPUT FORMAT:
Return ONLY valid JSON matching this schema. No markdown, no explanation, just JSON.
{
"claims": [{
"subject": "SpecificEntityName",
"predicate": "specific_relationship",
"object": { "type": "Text|Number|Boolean|Reference", "value": "concrete_value" },
"confidence": 0.0-1.0,
"claim_type": "direct_assertion|cited_claim|definition|measurement",
"extraction_rationale": "Why this claim was extracted (cite specific text)",
"entity_aliases": ["other", "names"],
"source_span": { "start": 0, "end": 10, "text": "exact quote" }
}],
"source": { "source_class": "SOURCE_CLASS" },
"meta": {
"total_claims": N,
"unique_subjects": M,
"extraction_notes": "Note if text was mostly hypothetical/illustrative"
}
}
## TEXT TO ANALYZE:
Source class: SOURCE_CLASS
INPUT_TEXT
Return ONLY valid JSON. If text is entirely hypothetical/illustrative, return empty claims array with extraction_notes explaining why.`;
/**
 * Substitutes placeholder tokens in a template in a single pass.
 *
 * A replacer function is used so that `$`-sequences in the substituted values
 * (`$$`, `$&`, `$'`, ...) are inserted literally, and because the template is
 * scanned only once, text inserted for one placeholder is never re-scanned
 * for another placeholder's name.
 *
 * @param template - Text containing the placeholder tokens.
 * @param values - Map from placeholder token to its replacement text.
 * @returns The template with every occurrence of every token replaced.
 */
function fillPromptPlaceholders(
  template: string,
  values: Record<string, string>
): string {
  const tokens = Object.keys(values);
  if (tokens.length === 0) {
    return template;
  }
  const escaped = tokens.map((token) =>
    token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
  );
  const pattern = new RegExp(escaped.join("|"), "g");
  return template.replace(pattern, (token) => values[token] ?? token);
}

/**
 * Strips the wrappers that may surround the model's JSON answer: the CLI's
 * JSON envelope (an object with a string `result`, or a bare JSON string) and
 * a leading Markdown code fence.
 *
 * @param raw - Raw stdout of the CLI.
 * @returns The innermost text, expected to be the extraction JSON.
 */
function unwrapClaudeOutput(raw: string): string {
  let payload = raw.trim();
  try {
    const envelope: unknown = JSON.parse(payload);
    if (typeof envelope === "string") {
      payload = envelope;
    } else if (typeof envelope === "object" && envelope !== null) {
      const inner = (envelope as { result?: unknown }).result;
      // Only a non-empty string result is treated as the wrapped answer.
      if (typeof inner === "string" && inner !== "") {
        payload = inner;
      }
    }
  } catch {
    // Not wrapped, continue with raw output
  }
  // Trim again: the unwrapped answer may carry its own leading whitespace.
  payload = payload.trim();
  const fenced = payload.match(/^```(?:json)?\s*([\s\S]*?)\s*```/);
  return fenced?.[1] ?? payload;
}

/**
 * Validates the parsed model answer and fills in fields later code relies on.
 *
 * @param parsed - Result of `JSON.parse` on the model's answer.
 * @param sourceClass - Source class requested by the caller; always wins over
 *   whatever the model echoed back.
 * @returns The answer as an ExtractionOutput with `source`, `meta` and every
 *   claim's `entity_aliases` present.
 * @throws Error if the answer is not an object with a `claims` array.
 */
function normalizeExtraction(
  parsed: unknown,
  sourceClass: SourceClass
): ExtractionOutput {
  if (
    typeof parsed !== "object" ||
    parsed === null ||
    !Array.isArray((parsed as { claims?: unknown }).claims)
  ) {
    throw new Error('Claude response did not contain a "claims" array');
  }
  const output = parsed as ExtractionOutput;
  for (const claim of output.claims) {
    if (typeof claim === "object" && claim !== null && !Array.isArray(claim.entity_aliases)) {
      claim.entity_aliases = [];
    }
  }
  output.source = { ...output.source, source_class: sourceClass };
  const reported: Partial<ExtractionOutput["meta"]> = output.meta ?? {};
  output.meta = {
    ...reported,
    // Keep the model's own counts when present; otherwise derive them.
    total_claims:
      typeof reported.total_claims === "number"
        ? reported.total_claims
        : output.claims.length,
    unique_subjects:
      typeof reported.unique_subjects === "number"
        ? reported.unique_subjects
        : new Set(output.claims.map((claim) => claim?.subject)).size,
  };
  return output;
}

/**
 * Runs the extraction prompt through the `claude` CLI and returns the parsed
 * result.
 *
 * The call is synchronous (`execSync`); the prompt is passed on stdin.
 *
 * @param text - Text to extract claims from.
 * @param sourceClass - Source class inserted into the prompt and stamped on
 *   the returned `source`.
 * @param context - Optional document title/type inserted into the prompt;
 *   missing values are rendered as "(not provided)".
 * @returns The extraction output produced by the model.
 * @throws Error if the CLI fails, or if its answer is not valid JSON or lacks
 *   a `claims` array.
 */
function callClaude(
  text: string,
  sourceClass: SourceClass,
  context?: DocumentContext
): ExtractionOutput {
  // Build the prompt with context
  const prompt = fillPromptPlaceholders(EXTRACTION_PROMPT, {
    SOURCE_CLASS: sourceClass,
    DOCUMENT_TITLE: context?.documentTitle || "(not provided)",
    DOCUMENT_TYPE: context?.documentType || "(not provided)",
    INPUT_TEXT: text,
  });
  // Call claude CLI with -p (print mode) and --allowedTools none for safety
  const result = execSync(
    `claude -p --output-format json --allowedTools "" --model sonnet`,
    {
      input: prompt,
      encoding: "utf-8",
      maxBuffer: 10 * 1024 * 1024, // 10MB buffer
    }
  );
  const payload = unwrapClaudeOutput(result);
  let parsed: unknown;
  try {
    parsed = JSON.parse(payload);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(
      `Claude returned output that is not valid JSON (${reason}): ${payload.slice(0, 200)}`
    );
  }
  return normalizeExtraction(parsed, sourceClass);
}
// ============================================================================
// StemeDB API
// ============================================================================
/**
 * Signs one extracted claim and POSTs it to `${API_URL}/v1/assert`.
 *
 * A non-OK HTTP response is logged to stderr and reported as `null`; errors
 * thrown by `fetch` itself are not caught here.
 *
 * @param agent - Agent whose key signs the claim and whose public key is sent
 *   as `agent_id`.
 * @param claim - Claim to submit.
 * @param sourceHash - Hash of the stored source document.
 * @param sourceClass - Source class sent with the assertion.
 * @returns The `hash` field of the API's JSON response, or `null` on a non-OK
 *   response.
 */
async function submitAssertion(
  agent: Agent,
  claim: ExtractedClaim,
  sourceHash: string,
  sourceClass: SourceClass
): Promise<string | null> {
  const signed = await signAssertion(agent, claim.subject, claim.predicate);
  const body = JSON.stringify({
    subject: claim.subject,
    predicate: claim.predicate,
    object: claim.object,
    confidence: claim.confidence,
    source_hash: sourceHash,
    source_class: sourceClass,
    signatures: [
      {
        agent_id: toHex(agent.publicKey),
        signature: signed.signature,
        timestamp: signed.timestamp,
        version: 1,
      },
    ],
  });
  const response = await fetch(`${API_URL}/v1/assert`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body,
  });
  if (response.ok) {
    const data = await response.json();
    return data.hash;
  }
  const detail = await response.text();
  console.error(` Failed to submit assertion: ${detail}`);
  return null;
}
/**
 * Uploads the source document to `${API_URL}/v1/sources/store`.
 *
 * The text is sent base64-encoded with content type `text/plain`.
 *
 * @param content - Full text of the source document.
 * @returns The `hash` field of the API's JSON response.
 * @throws Error carrying the response body when the API answers non-OK.
 */
async function storeSource(content: string): Promise<string> {
  const body = JSON.stringify({
    content: Buffer.from(content).toString("base64"),
    content_type: "text/plain",
  });
  const response = await fetch(`${API_URL}/v1/sources/store`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body,
  });
  if (response.ok) {
    const data = await response.json();
    return data.hash;
  }
  const detail = await response.text();
  throw new Error(`Failed to store source: ${detail}`);
}
// ============================================================================
// CLI
// ============================================================================
/**
 * Parses `process.argv` into the script's options.
 *
 * `--help`/`-h` prints the help text and exits the process with status 0.
 * Unrecognized arguments are ignored, with a warning on stderr.
 *
 * @returns The parsed options; `sourceClass` defaults to "Expert" and the
 *   boolean switches default to `false`.
 * @throws Error when a flag that takes a value is the last argument, or when
 *   `--source-class` / `--document-type` is given a value outside its
 *   allowed set.
 */
function parseArgs(): CLIArgs {
  // Runtime mirrors of the SourceClass and DocumentType unions, used to
  // validate user input that the type system cannot check.
  const sourceClasses: readonly string[] = [
    "Regulatory",
    "Clinical",
    "Observational",
    "Expert",
    "Community",
    "Anecdotal",
  ];
  const documentTypes: readonly string[] = [
    "technical_paper",
    "news",
    "regulatory",
    "documentation",
    "blog",
    "forum",
  ];
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    sourceClass: "Expert",
    submit: false,
    dryRun: false,
    verbose: false,
  };

  // Returns the argument at `index`, failing loudly when the flag was last.
  const takeValue = (flag: string, index: number): string => {
    const value = args[index];
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };
  // Like takeValue, but additionally restricts the value to `allowed`.
  const takeChoice = (
    flag: string,
    index: number,
    allowed: readonly string[]
  ): string => {
    const value = takeValue(flag, index);
    if (!allowed.includes(value)) {
      throw new Error(
        `Invalid value "${value}" for ${flag} (expected one of: ${allowed.join(", ")})`
      );
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case "--text":
      case "-t":
        result.text = takeValue(arg, ++i);
        break;
      case "--file":
      case "-f":
        result.file = takeValue(arg, ++i);
        break;
      case "--stdin":
        result.stdin = true;
        break;
      case "--source-url":
      case "-u":
        result.sourceUrl = takeValue(arg, ++i);
        break;
      case "--source-class":
      case "-c":
        result.sourceClass = takeChoice(arg, ++i, sourceClasses) as SourceClass;
        break;
      case "--document-title":
        result.documentTitle = takeValue(arg, ++i);
        break;
      case "--document-type":
        result.documentType = takeChoice(arg, ++i, documentTypes) as DocumentType;
        break;
      case "--submit":
      case "-s":
        result.submit = true;
        break;
      case "--dry-run":
      case "-d":
        result.dryRun = true;
        break;
      case "--verbose":
      case "-v":
        result.verbose = true;
        break;
      case "--help":
      case "-h":
        printHelp();
        process.exit(0);
        break;
      default:
        // Still ignored, but a typo such as "--sumbit" no longer goes unnoticed.
        console.warn(`Ignoring unrecognized argument: ${arg}`);
    }
  }
  return result;
}
/**
 * Writes the command-line usage text to stdout with a single console.log
 * call. The text begins and ends with an empty line.
 */
function printHelp(): void {
  const helpLines = [
    ``,
    `Entity-Level Claim Extraction CLI`,
    `USAGE:`,
    `npx tsx scripts/extract-claims.ts [OPTIONS]`,
    `OPTIONS:`,
    `-t, --text <text> Text to extract claims from`,
    `-f, --file <path> File to read text from`,
    `--stdin Read text from stdin`,
    `-u, --source-url <url> Source URL for provenance`,
    `-c, --source-class Source tier (default: Expert)`,
    `One of: Regulatory, Clinical, Observational,`,
    `Expert, Community, Anecdotal`,
    `--document-title Document title for context (helps reject hypotheticals)`,
    `--document-type Document type for context`,
    `One of: technical_paper, news, regulatory,`,
    `documentation, blog, forum`,
    `-s, --submit Submit extracted claims to StemeDB API`,
    `-d, --dry-run Show what would be submitted without submitting`,
    `-v, --verbose Show detailed extraction output`,
    `-h, --help Show this help message`,
    `EXAMPLES:`,
    `# Extract from text and show claims`,
    `npx tsx scripts/extract-claims.ts --text "PostgreSQL uses MVCC for concurrency" -v`,
    `# Extract from a technical paper with context`,
    `npx tsx scripts/extract-claims.ts --file paper.txt \\`,
    `--document-title "StemeDB: A Claim-Oriented Database" \\`,
    `--document-type technical_paper --source-class Expert`,
    `# Dry run from stdin`,
    `cat article.txt | npx tsx scripts/extract-claims.ts --stdin --dry-run`,
    `CLAIM QUALITY:`,
    `The extractor rejects:`,
    `- Hypotheticals ("Consider...", "Suppose...", "For example...")`,
    `- Unspecified subjects ("a drug", "the system")`,
    `- Generic truisms ("databases store data")`,
    `- Illustrative scenarios`,
    `Only claims with named entities and specific predicates are extracted.`,
    `ENVIRONMENT:`,
    `STEMEDB_API_URL API base URL (default: http://127.0.0.1:18180)`,
    `REQUIRES:`,
    `claude CLI installed and authenticated (uses 'claude -p' for extraction)`,
    ``,
  ];
  console.log(helpLines.join("\n"));
}
/**
 * Resolves the text to analyze from the parsed options.
 *
 * Sources are tried in a fixed order: inline `text`, then `file` (read
 * synchronously as UTF-8), then standard input (read to the end). An empty
 * `text` or `file` value counts as not provided.
 *
 * @param args - Parsed command-line options.
 * @returns The input text.
 * @throws Error when none of the three sources is selected.
 */
async function readInput(args: CLIArgs): Promise<string> {
  const { text, file, stdin } = args;
  if (text) {
    return text;
  }
  if (file) {
    return readFileSync(file, "utf-8");
  }
  if (!stdin) {
    throw new Error("No input provided. Use --text, --file, or --stdin");
  }
  const pieces: Buffer[] = [];
  for await (const piece of process.stdin) {
    pieces.push(piece);
  }
  return Buffer.concat(pieces).toString("utf-8");
}
// ============================================================================
// Main
// ============================================================================
/**
 * Prints every extracted claim, then the extraction notes if any (used for
 * --verbose). Fields the model may have omitted are rendered defensively
 * instead of throwing.
 */
function logExtractedClaims(extraction: ExtractionOutput): void {
  console.log("\n--- Claims ---");
  for (const claim of extraction.claims) {
    const confidence =
      typeof claim.confidence === "number" ? claim.confidence.toFixed(2) : "n/a";
    console.log(`\n ${claim.subject}/${claim.predicate}:`);
    console.log(` Value: ${JSON.stringify(claim.object?.value)}`);
    console.log(` Type: ${claim.claim_type || "unspecified"}`);
    console.log(` Confidence: ${confidence}`);
    console.log(` Rationale: ${claim.extraction_rationale}`);
    if ((claim.entity_aliases?.length ?? 0) > 0) {
      console.log(` Aliases: ${claim.entity_aliases.join(", ")}`);
    }
  }
  console.log("\n--- End Claims ---");
  // Show extraction notes if any
  if (extraction.meta?.extraction_notes) {
    console.log(`\nNotes: ${extraction.meta.extraction_notes}`);
  }
}

/**
 * Submits the claims one after another and tallies the outcomes.
 *
 * @returns How many assertions were accepted and how many were rejected.
 */
async function submitClaims(
  claims: ExtractedClaim[],
  agent: Agent,
  sourceHash: string,
  sourceClass: SourceClass,
  verbose: boolean
): Promise<{ submitted: number; failed: number }> {
  let submitted = 0;
  let failed = 0;
  for (const claim of claims) {
    const hash = await submitAssertion(agent, claim, sourceHash, sourceClass);
    if (hash) {
      submitted++;
      if (verbose) {
        console.log(` + ${claim.subject}/${claim.predicate} -> ${hash.slice(0, 16)}...`);
      }
    } else {
      failed++;
    }
  }
  return { submitted, failed };
}

/**
 * CLI entry point: reads the input, extracts claims through the Claude CLI,
 * then either prints the result (default and --dry-run) or stores the source
 * and submits each claim (--submit). --dry-run takes precedence over
 * --submit.
 *
 * When at least one assertion fails to submit, `process.exitCode` is set to 1
 * so that callers can detect the partial failure.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  // Read input text
  console.log("Reading input...");
  const inputText = await readInput(args);
  console.log(` Input length: ${inputText.length} characters`);
  // Build document context
  const context: DocumentContext | undefined =
    args.documentTitle || args.documentType
      ? {
          documentTitle: args.documentTitle,
          documentType: args.documentType,
        }
      : undefined;
  // Extract claims via Claude
  console.log("\nExtracting claims via Claude CLI...");
  if (context) {
    console.log(` Document context: ${context.documentTitle || "(no title)"} [${context.documentType || "unknown"}]`);
  }
  const extraction = callClaude(inputText, args.sourceClass, context);
  // Prefer the counts reported by the model; derive them if it omitted them.
  const totalClaims = extraction.meta?.total_claims ?? extraction.claims.length;
  const uniqueSubjects =
    extraction.meta?.unique_subjects ??
    new Set(extraction.claims.map((claim) => claim.subject)).size;
  console.log(`\nExtracted ${totalClaims} claims from ${uniqueSubjects} unique subjects`);
  if (args.verbose) {
    logExtractedClaims(extraction);
  }
  // Generate content hash
  extraction.source.content_hash = generateContentHash(inputText);
  if (args.sourceUrl) {
    extraction.source.url = args.sourceUrl;
  }
  // Dry run - just show JSON
  if (args.dryRun) {
    console.log("\n--- Dry Run Output ---");
    console.log(JSON.stringify(extraction, null, 2));
    return;
  }
  if (!args.submit) {
    // Just output the extraction
    console.log("\n--- Extraction Output ---");
    console.log(JSON.stringify(extraction, null, 2));
    console.log("\nUse --submit to send these claims to StemeDB, or --dry-run to preview.");
    return;
  }
  // Submit to API
  console.log("\nSubmitting to StemeDB API...");
  // Store source document first
  console.log(" Storing source document...");
  const sourceHash = await storeSource(inputText);
  console.log(` Source hash: ${sourceHash}`);
  // Create agent
  const agent = await createAgent("extract-claims");
  console.log(` Agent: ${toHex(agent.publicKey).slice(0, 16)}...`);
  // Submit each claim
  const { submitted, failed } = await submitClaims(
    extraction.claims,
    agent,
    sourceHash,
    args.sourceClass,
    args.verbose
  );
  console.log(`\nSubmitted ${submitted} assertions (${failed} failed)`);
  if (failed > 0) {
    // Signal the partial failure without cutting off pending output.
    process.exitCode = 1;
  }
}
// Run the CLI. Any rejection is reported on stderr and turned into exit
// status 1. The rejection value is narrowed first because a promise can
// reject with something that is not an Error (and so has no `.message`).
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error("Error:", message);
  process.exit(1);
});