stemedb/applications/aphoria/src/llm/extractor.rs
jordan 157dbbb9eb feat: Complete Aphoria Phase 8-9 + UAT suite (90/90 tests passing)
## Phase 8: Enterprise Extractor Improvements 
- 14 security extractors (TLS, JWT, SQL injection, XSS, etc.)
- 10 framework-specific extractors (Spring, Django, Rails, etc.)
- Config file security detection (YAML, TOML)

## Phase 9: Autonomous Extractor Generation 
- Shadow mode executor with TP/FP tracking
- Graduation pipeline with confidence thresholds
- Auto-rollback on regression detection
- Cross-project pattern syncing

## UAT Suite Complete (14 scripts, 90 tests)
- test-core-detection.sh (6 tests)
- test-declarative-extractors.sh (5 tests)
- test-domain-frameworks.sh (5 tests)
- test-domain-unreal.sh (3 tests)
- test-llm-extraction.sh (6 tests)
- test-eval-harness.sh (5 tests)
- test-cross-language.sh (3 tests)
- test-precommit-performance.sh (4 tests)
- test-output-formats.sh (8 tests)
- test-drift-detection.sh (6 tests)
- test-exit-codes.sh (12 tests)
+ 3 more scripts

## Other Changes
- Updated roadmap to mark Phase 8-9 complete
- Added .gitignore entries for build artifacts
- Updated pre-commit: 800 line limit, exclude tests/data/cmd

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-06 22:50:55 -07:00

533 lines
18 KiB
Rust

//! LLM-based claim extractor with selective triggering and ontology awareness.
//!
//! The LLM extractor only runs on high-value files where regex extractors
//! found nothing. It sends the code to an LLM (through `GeminiClient`) for
//! semantic analysis and extracts security-relevant claims.
//!
//! ## Ontology-Aware Extraction
//!
//! The extractor is initialized with an `OntologyVocabulary` that constrains
//! the LLM output to use concept paths from the authority corpus. This ensures
//! claims match authority subjects for proper conflict detection.
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use stemedb_core::types::ObjectValue;
use tracing::{debug, info, instrument, warn};
use crate::config::LlmConfig;
use crate::llm::cache::{CachedResponse, LlmCache};
use crate::llm::client::GeminiClient;
use crate::llm::ontology::OntologyVocabulary;
use crate::llm::prompt::build_system_prompt;
use crate::llm::prompts::{
extract_json, language_to_extension, language_to_name, language_to_prefix,
DEFAULT_SYSTEM_PROMPT,
};
use crate::llm::types::{LlmClaim, LlmClaimsResponse};
use crate::types::{ExtractedClaim, Language};
/// LLM-based claim extractor with ontology awareness.
///
/// Built through [`LlmExtractor::new`], [`LlmExtractor::with_vocabulary`] or
/// [`LlmExtractor::with_vocabulary_cached`]; the constructors decide whether a
/// client and a vocabulary are present.
pub struct LlmExtractor {
/// LLM API client (`GeminiClient`); `None` when the extractor is cache-only.
client: Option<GeminiClient>,
/// Response cache, consulted before any API call is attempted.
cache: LlmCache,
/// Configuration (model, token budget, confidence threshold, caching flags).
config: LlmConfig,
/// Token budget tracking (thread-safe for parallel file processing).
/// Incremented for both fresh API responses and cache hits.
tokens_used: Arc<AtomicUsize>,
/// Ontology vocabulary for constraining output (optional for backwards compatibility).
/// When `None`, claims are accepted without ontology validation.
vocabulary: Option<Arc<OntologyVocabulary>>,
/// Pre-built system prompt: the default prompt, or one built from the vocabulary.
system_prompt: String,
/// Cache-only mode (no API calls, return empty on cache miss).
cache_only: bool,
}
impl LlmExtractor {
/// Build an extractor that has an API client but no ontology vocabulary.
///
/// This constructor exists for backwards compatibility: the default system
/// prompt is used and extracted claims are not checked against any
/// authority vocabulary. The token counter starts at zero.
pub fn new(client: GeminiClient, cache: LlmCache, config: LlmConfig) -> Self {
    let system_prompt = DEFAULT_SYSTEM_PROMPT.to_string();
    let tokens_used = Arc::new(AtomicUsize::new(0));

    Self {
        client: Some(client),
        cache,
        config,
        tokens_used,
        vocabulary: None,
        system_prompt,
        cache_only: false,
    }
}
/// Build an extractor that has both an API client and an ontology vocabulary.
///
/// The system prompt is generated from `vocabulary`, so the LLM is steered
/// towards concept paths from the authority corpus, and returned claims are
/// validated against that vocabulary. The token counter starts at zero.
pub fn with_vocabulary(
    client: GeminiClient,
    cache: LlmCache,
    config: LlmConfig,
    vocabulary: OntologyVocabulary,
) -> Self {
    // The prompt must be built before the vocabulary is moved into the Arc.
    let system_prompt = build_system_prompt(&vocabulary);
    info!(concept_count = vocabulary.concepts.len(), "Built ontology-aware system prompt");

    let vocabulary = Some(Arc::new(vocabulary));
    let tokens_used = Arc::new(AtomicUsize::new(0));

    Self {
        client: Some(client),
        cache,
        config,
        tokens_used,
        vocabulary,
        system_prompt,
        cache_only: false,
    }
}
/// Build a cache-only extractor with an ontology vocabulary.
///
/// No API client is stored, so this extractor can only replay responses
/// that are already in `cache`; a cache miss produces no claims. Intended
/// for deterministic evaluation runs against previously cached LLM
/// responses.
pub fn with_vocabulary_cached(
    cache: LlmCache,
    config: LlmConfig,
    vocabulary: OntologyVocabulary,
) -> Self {
    // The prompt is still needed even without a client: it is part of the cache key.
    let system_prompt = build_system_prompt(&vocabulary);
    info!(
        concept_count = vocabulary.concepts.len(),
        "Built cache-only ontology-aware extractor"
    );

    let vocabulary = Some(Arc::new(vocabulary));
    let tokens_used = Arc::new(AtomicUsize::new(0));

    Self {
        client: None,
        cache,
        config,
        tokens_used,
        vocabulary,
        system_prompt,
        cache_only: true,
    }
}
/// Get total tokens used so far.
pub fn tokens_used(&self) -> usize {
self.tokens_used.load(Ordering::Relaxed)
}
/// Whether the tokens recorded so far are still strictly below the
/// configured per-scan budget.
fn within_budget(&self) -> bool {
    let spent = self.tokens_used.load(Ordering::Relaxed);
    let budget = self.config.max_tokens_per_scan;
    spent < budget
}
/// Extract claims from file content using LLM.
///
/// Returns an empty vector if:
/// - Token budget is exhausted
/// - File is not high-value (when high_value_only is set)
/// - Content is too short (<50 chars)
/// - LLM returns no claims or errors
#[instrument(skip(self, content), fields(file = %file_path, language = ?language, content_len = content.len()))]
pub fn extract(
&self,
path_segments: &[String],
content: &str,
language: Language,
file_path: &str,
) -> Vec<ExtractedClaim> {
// Check token budget
if !self.within_budget() {
debug!("Token budget exhausted, skipping LLM extraction");
return vec![];
}
// Check high-value filter
if self.config.high_value_only && !is_high_value_file(file_path) {
debug!("File not high-value, skipping LLM extraction");
return vec![];
}
// Skip very short content
if content.len() < 50 {
debug!("Content too short, skipping LLM extraction");
return vec![];
}
// Build concept path prefix from path segments
let concept_prefix = if path_segments.is_empty() {
format!("code://{}", language_to_prefix(language))
} else {
format!("code://{}/{}", language_to_prefix(language), path_segments.join("/"))
};
// Check cache first (now includes prompt hash for automatic invalidation)
let cache_key = LlmCache::cache_key(content, &self.config.model, &self.system_prompt);
if let Some(cached) = self.cache.get(&cache_key) {
debug!("Using cached LLM response");
// Update token count from cache (for budget tracking across files)
self.tokens_used
.fetch_add(cached.input_tokens + cached.output_tokens, Ordering::Relaxed);
return self.parse_claims(&cached.claims_json, &concept_prefix, file_path);
}
// In cache-only mode, return empty on cache miss
if self.cache_only {
debug!("Cache miss in cache-only mode, returning empty");
return vec![];
}
// Check if we have a client for API calls
let client = match &self.client {
Some(c) => c,
None => {
debug!("No API client available, returning empty");
return vec![];
}
};
// Call Claude API with ontology-aware prompt
let user_message = format!(
"Analyze this {} code for security-relevant claims:\n\n```{}\n{}\n```",
language_to_name(language),
language_to_extension(language),
content
);
match client.complete(&self.system_prompt, &user_message) {
Ok(result) => {
// Update token budget
let tokens = result.input_tokens + result.output_tokens;
self.tokens_used.fetch_add(tokens, Ordering::Relaxed);
info!(
input_tokens = result.input_tokens,
output_tokens = result.output_tokens,
total_used = self.tokens_used.load(Ordering::Relaxed),
budget = self.config.max_tokens_per_scan,
"LLM extraction complete"
);
// Cache the response
if self.config.cache_responses {
let timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0);
let cached_response = CachedResponse {
claims_json: result.response_text.clone(),
cached_at: timestamp,
input_tokens: result.input_tokens,
output_tokens: result.output_tokens,
};
self.cache.put(&cache_key, &cached_response);
}
self.parse_claims(&result.response_text, &concept_prefix, file_path)
}
Err(e) => {
warn!(error = %e, "LLM extraction failed");
vec![]
}
}
}
/// Parse LLM JSON response into ExtractedClaim structs.
///
/// When vocabulary is available, validates claims against the ontology
/// and uses fuzzy matching to correct near-misses.
fn parse_claims(
&self,
json: &str,
concept_prefix: &str,
file_path: &str,
) -> Vec<ExtractedClaim> {
// Try to extract JSON from response (may have markdown code blocks)
let json_str = extract_json(json);
let response: LlmClaimsResponse = match serde_json::from_str(json_str) {
Ok(r) => r,
Err(e) => {
debug!(error = %e, json = %json, "Failed to parse LLM response");
return vec![];
}
};
response
.claims
.into_iter()
.filter(|c| c.confidence >= self.config.min_confidence)
.filter_map(|claim| self.validate_and_transform_claim(claim, concept_prefix, file_path))
.collect()
}
/// Check a claim against the ontology and convert it to an `ExtractedClaim`.
///
/// Resolution order:
/// 1. No vocabulary configured: the claim is accepted unchanged.
/// 2. Subject and predicate both match a concept: that concept's leaf path
///    and predicate are used.
/// 3. The subject fuzzy-matches a concept (threshold 0.6): that concept's
///    leaf path and predicate are used and confidence is scaled by 0.9.
/// 4. Otherwise the claim is rejected and `None` is returned.
///
/// The claim value is converted according to `value_type`; a value that does
/// not have the declared boolean/number shape falls back to its textual form.
fn validate_and_transform_claim(
    &self,
    claim: LlmClaim,
    concept_prefix: &str,
    file_path: &str,
) -> Option<ExtractedClaim> {
    let value = match claim.value_type.as_str() {
        "boolean" => match claim.value.as_bool() {
            Some(flag) => ObjectValue::Boolean(flag),
            None => ObjectValue::Text(claim.value.to_string()),
        },
        "number" => match claim.value.as_f64() {
            Some(number) => ObjectValue::Number(number),
            None => ObjectValue::Text(claim.value.to_string()),
        },
        _ => match claim.value.as_str() {
            Some(text) => ObjectValue::Text(text.to_string()),
            None => ObjectValue::Text(claim.value.to_string()),
        },
    };

    // Work out the three fields that depend on how the claim was matched;
    // everything else is copied from the claim in a single place below.
    let (concept_path, predicate, confidence) = match &self.vocabulary {
        // No vocabulary: accept every claim (backwards compatibility).
        None => (
            format!("{}/{}", concept_prefix, claim.subject),
            claim.predicate,
            claim.confidence,
        ),
        Some(vocab) => {
            if let Some(concept) =
                vocab.find_by_leaf_and_predicate(&claim.subject, &claim.predicate)
            {
                // Exact hit on both subject and predicate.
                debug!(
                    subject = %claim.subject,
                    predicate = %claim.predicate,
                    "Claim matched ontology concept"
                );
                (
                    format!("{}/{}", concept_prefix, concept.leaf_path),
                    concept.predicate.clone(),
                    claim.confidence,
                )
            } else {
                // Diagnostic only: the subject is known but not with this predicate.
                if vocab.find_by_leaf(&claim.subject).is_some() {
                    debug!(
                        subject = %claim.subject,
                        claim_predicate = %claim.predicate,
                        "Claim subject exists but predicate not in vocabulary"
                    );
                }

                match vocab.fuzzy_match(&claim.subject, 0.6) {
                    Some(concept) => {
                        warn!(
                            original = %claim.subject,
                            matched = %concept.leaf_path,
                            "Fuzzy matched claim to authority concept"
                        );
                        (
                            format!("{}/{}", concept_prefix, concept.leaf_path),
                            concept.predicate.clone(),
                            // Near-misses are trusted slightly less than exact matches.
                            claim.confidence * 0.9,
                        )
                    }
                    None => {
                        debug!(
                            subject = %claim.subject,
                            "Rejecting claim - no matching ontology concept"
                        );
                        return None;
                    }
                }
            }
        }
    };

    Some(ExtractedClaim {
        concept_path,
        predicate,
        value,
        file: file_path.to_string(),
        line: claim.line,
        matched_text: claim.matched_text,
        confidence,
        description: claim.description,
    })
}
}
/// Decide whether a path looks security-relevant enough for LLM analysis.
///
/// The path is lower-cased and then searched for plain substrings:
/// - directory fragments such as `auth/`, `config/`, `crypto/`, `certs/`;
/// - name fragments such as `password`, `token`, `jwt`, `key`, `oauth`.
///
/// Name fragments are matched anywhere in the path, not only in the final
/// file name, and not on word boundaries.
pub fn is_high_value_file(path: &str) -> bool {
    /// Directory fragments that mark a security-sensitive location.
    const SENSITIVE_DIRS: &[&str] = &[
        "auth/",
        "authentication/",
        "config/",
        "configuration/",
        "crypto/",
        "cryptography/",
        "security/",
        "secrets/",
        "certs/",
        "certificates/",
        "ssl/",
        "tls/",
        "keys/",
        "credentials/",
    ];
    /// Name fragments that mark security-related content.
    const SENSITIVE_FRAGMENTS: &[&str] = &[
        "secret",
        "password",
        "credential",
        "token",
        "auth",
        "login",
        "session",
        "jwt",
        "tls",
        "ssl",
        "cert",
        "key",
        "config",
        "settings",
        "security",
        "crypto",
        "encrypt",
        "decrypt",
        "oauth",
        "saml",
        "ldap",
        "api_key",
        "apikey",
        "access_key",
        "private",
    ];

    let normalized = path.to_lowercase();
    SENSITIVE_DIRS
        .iter()
        .chain(SENSITIVE_FRAGMENTS.iter())
        .any(|needle| normalized.contains(*needle))
}
#[cfg(test)]
mod tests {
    use super::*;
    use stemedb_core::types::{Assertion, HlcTimestamp, LifecycleStage, SourceClass};

    /// Build a minimal approved `Assertion` (confidence 1.0, no signatures)
    /// carrying the given subject, predicate and value.
    fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion {
        let metadata_bytes = serde_json::to_vec(&serde_json::json!({
            "description": "Test description",
            "source": "test",
        }))
        .ok();

        Assertion {
            subject: subject.to_string(),
            predicate: predicate.to_string(),
            object: value,
            parent_hash: None,
            source_hash: [0u8; 32],
            source_class: SourceClass::Clinical,
            visual_hash: None,
            epoch: None,
            source_metadata: metadata_bytes,
            lifecycle: LifecycleStage::Approved,
            signatures: vec![],
            confidence: 1.0,
            timestamp: 0,
            hlc_timestamp: HlcTimestamp::default(),
            vector: None,
        }
    }

    /// Paths inside security-sensitive directories are high-value.
    #[test]
    fn test_is_high_value_file_directories() {
        let sensitive_paths = [
            "src/auth/login.py",
            "config/database.yaml",
            "pkg/crypto/encrypt.go",
            "security/firewall.rs",
            "secrets/api_keys.env",
            "certs/server.pem",
        ];
        for path in sensitive_paths {
            assert!(is_high_value_file(path), "expected high-value: {}", path);
        }
    }

    /// Paths whose file name contains a security-related fragment are high-value.
    #[test]
    fn test_is_high_value_file_names() {
        let sensitive_paths = [
            "src/password_validator.py",
            "lib/jwt_handler.ts",
            "utils/token_generator.go",
            "services/oauth_client.rs",
        ];
        for path in sensitive_paths {
            assert!(is_high_value_file(path), "expected high-value: {}", path);
        }
    }

    /// Ordinary source paths are not high-value.
    #[test]
    fn test_is_high_value_file_not_high_value() {
        let ordinary_paths = [
            "src/main.rs",
            "lib/utils.py",
            "pkg/handler.go",
            "tests/test_api.rs",
        ];
        for path in ordinary_paths {
            assert!(!is_high_value_file(path), "expected not high-value: {}", path);
        }
    }

    /// A vocabulary built from assertions exposes one concept per assertion,
    /// each reachable through its leaf path.
    #[test]
    fn test_vocabulary_from_hardcoded_assertions() {
        let specs = vec![
            ("rfc://5246/tls/cert_verification", "enabled", ObjectValue::Boolean(true)),
            ("owasp://rate_limit/enabled", "enabled", ObjectValue::Boolean(true)),
            (
                "owasp://crypto/hashing/algorithm",
                "algorithm",
                ObjectValue::Text("secure".to_string()),
            ),
        ];
        let assertions: Vec<Assertion> = specs
            .into_iter()
            .map(|(subject, predicate, value)| make_test_assertion(subject, predicate, value))
            .collect();

        let vocab = OntologyVocabulary::from_assertions(&assertions);
        assert_eq!(vocab.concepts.len(), 3);

        // Leaf paths are looked up without the authority scheme prefix.
        for leaf in ["tls/cert_verification", "rate_limit/enabled", "hashing/algorithm"] {
            assert!(vocab.find_by_leaf(leaf).is_some(), "missing leaf path: {}", leaf);
        }
    }

    /// The prompt section contains the table headers and the concept row.
    #[test]
    fn test_prompt_section_format() {
        let assertions = vec![make_test_assertion(
            "owasp://rate_limit/enabled",
            "enabled",
            ObjectValue::Boolean(true),
        )];
        let vocab = OntologyVocabulary::from_assertions(&assertions);
        let section = vocab.to_prompt_section();

        let expected_fragments = [
            // Table headers
            "Concept Path",
            "Predicate",
            "Value Type",
            // The single concept supplied above
            "rate_limit/enabled",
            "enabled",
            "boolean",
        ];
        for fragment in expected_fragments {
            assert!(section.contains(fragment), "prompt section lacks: {}", fragment);
        }
    }
}