//! LLM-based claim extractor with selective triggering and ontology awareness. //! //! The LLM extractor only runs on high-value files where regex extractors //! found nothing. It uses Claude to semantically analyze code and extract //! security-relevant claims. //! //! ## Ontology-Aware Extraction //! //! The extractor is initialized with an `OntologyVocabulary` that constrains //! the LLM output to use concept paths from the authority corpus. This ensures //! claims match authority subjects for proper conflict detection. use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use stemedb_core::types::ObjectValue; use tracing::{debug, info, instrument, warn}; use crate::config::LlmConfig; use crate::llm::cache::{CachedResponse, LlmCache}; use crate::llm::client::GeminiClient; use crate::llm::ontology::OntologyVocabulary; use crate::llm::prompt::build_system_prompt; use crate::llm::prompts::{ extract_json, language_to_extension, language_to_name, language_to_prefix, DEFAULT_SYSTEM_PROMPT, }; use crate::llm::types::{LlmClaim, LlmClaimsResponse}; use crate::types::{ExtractedClaim, Language}; /// LLM-based claim extractor with ontology awareness. pub struct LlmExtractor { /// Claude API client (optional for cache-only mode). client: Option, /// Response cache. cache: LlmCache, /// Configuration. config: LlmConfig, /// Token budget tracking (thread-safe for parallel file processing). tokens_used: Arc, /// Ontology vocabulary for constraining output (optional for backwards compatibility). vocabulary: Option>, /// Pre-built system prompt with vocabulary. system_prompt: String, /// Cache-only mode (no API calls, return empty on cache miss). cache_only: bool, } impl LlmExtractor { /// Create a new LLM extractor without ontology vocabulary. /// /// This is the backwards-compatible constructor. Claims will not be /// validated against authority vocabulary. pub fn new(client: GeminiClient, cache: LlmCache, config: LlmConfig) -> Self { Self { client: Some(client), cache, config, tokens_used: Arc::new(AtomicUsize::new(0)), vocabulary: None, system_prompt: DEFAULT_SYSTEM_PROMPT.to_string(), cache_only: false, } } /// Create a new LLM extractor with ontology vocabulary. /// /// The vocabulary constrains LLM output to use concept paths from the /// authority corpus, ensuring proper conflict detection. pub fn with_vocabulary( client: GeminiClient, cache: LlmCache, config: LlmConfig, vocabulary: OntologyVocabulary, ) -> Self { let system_prompt = build_system_prompt(&vocabulary); info!(concept_count = vocabulary.concepts.len(), "Built ontology-aware system prompt"); Self { client: Some(client), cache, config, tokens_used: Arc::new(AtomicUsize::new(0)), vocabulary: Some(Arc::new(vocabulary)), system_prompt, cache_only: false, } } /// Create a cache-only LLM extractor with ontology vocabulary. /// /// This extractor only returns cached responses; it never makes API calls. /// Use this for deterministic evaluation runs against previously-cached /// LLM responses. pub fn with_vocabulary_cached( cache: LlmCache, config: LlmConfig, vocabulary: OntologyVocabulary, ) -> Self { let system_prompt = build_system_prompt(&vocabulary); info!( concept_count = vocabulary.concepts.len(), "Built cache-only ontology-aware extractor" ); Self { client: None, cache, config, tokens_used: Arc::new(AtomicUsize::new(0)), vocabulary: Some(Arc::new(vocabulary)), system_prompt, cache_only: true, } } /// Get total tokens used so far. pub fn tokens_used(&self) -> usize { self.tokens_used.load(Ordering::Relaxed) } /// Check if we're within the token budget. fn within_budget(&self) -> bool { self.tokens_used.load(Ordering::Relaxed) < self.config.max_tokens_per_scan } /// Extract claims from file content using LLM. /// /// Returns an empty vector if: /// - Token budget is exhausted /// - File is not high-value (when high_value_only is set) /// - Content is too short (<50 chars) /// - LLM returns no claims or errors #[instrument(skip(self, content), fields(file = %file_path, language = ?language, content_len = content.len()))] pub fn extract( &self, path_segments: &[String], content: &str, language: Language, file_path: &str, ) -> Vec { // Check token budget if !self.within_budget() { debug!("Token budget exhausted, skipping LLM extraction"); return vec![]; } // Check high-value filter if self.config.high_value_only && !is_high_value_file(file_path) { debug!("File not high-value, skipping LLM extraction"); return vec![]; } // Skip very short content if content.len() < 50 { debug!("Content too short, skipping LLM extraction"); return vec![]; } // Build concept path prefix from path segments let concept_prefix = if path_segments.is_empty() { format!("code://{}", language_to_prefix(language)) } else { format!("code://{}/{}", language_to_prefix(language), path_segments.join("/")) }; // Check cache first (now includes prompt hash for automatic invalidation) let cache_key = LlmCache::cache_key(content, &self.config.model, &self.system_prompt); if let Some(cached) = self.cache.get(&cache_key) { debug!("Using cached LLM response"); // Update token count from cache (for budget tracking across files) self.tokens_used .fetch_add(cached.input_tokens + cached.output_tokens, Ordering::Relaxed); return self.parse_claims(&cached.claims_json, &concept_prefix, file_path); } // In cache-only mode, return empty on cache miss if self.cache_only { debug!("Cache miss in cache-only mode, returning empty"); return vec![]; } // Check if we have a client for API calls let client = match &self.client { Some(c) => c, None => { debug!("No API client available, returning empty"); return vec![]; } }; // Call Claude API with ontology-aware prompt let user_message = format!( "Analyze this {} code for security-relevant claims:\n\n```{}\n{}\n```", language_to_name(language), language_to_extension(language), content ); match client.complete(&self.system_prompt, &user_message) { Ok(result) => { // Update token budget let tokens = result.input_tokens + result.output_tokens; self.tokens_used.fetch_add(tokens, Ordering::Relaxed); info!( input_tokens = result.input_tokens, output_tokens = result.output_tokens, total_used = self.tokens_used.load(Ordering::Relaxed), budget = self.config.max_tokens_per_scan, "LLM extraction complete" ); // Cache the response if self.config.cache_responses { let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .map(|d| d.as_secs()) .unwrap_or(0); let cached_response = CachedResponse { claims_json: result.response_text.clone(), cached_at: timestamp, input_tokens: result.input_tokens, output_tokens: result.output_tokens, }; self.cache.put(&cache_key, &cached_response); } self.parse_claims(&result.response_text, &concept_prefix, file_path) } Err(e) => { warn!(error = %e, "LLM extraction failed"); vec![] } } } /// Parse LLM JSON response into ExtractedClaim structs. /// /// When vocabulary is available, validates claims against the ontology /// and uses fuzzy matching to correct near-misses. fn parse_claims( &self, json: &str, concept_prefix: &str, file_path: &str, ) -> Vec { // Try to extract JSON from response (may have markdown code blocks) let json_str = extract_json(json); let response: LlmClaimsResponse = match serde_json::from_str(json_str) { Ok(r) => r, Err(e) => { debug!(error = %e, json = %json, "Failed to parse LLM response"); return vec![]; } }; response .claims .into_iter() .filter(|c| c.confidence >= self.config.min_confidence) .filter_map(|claim| self.validate_and_transform_claim(claim, concept_prefix, file_path)) .collect() } /// Validate a claim against the ontology and transform it to an ExtractedClaim. /// /// Returns None if the claim doesn't match any known concept. fn validate_and_transform_claim( &self, claim: LlmClaim, concept_prefix: &str, file_path: &str, ) -> Option { let value = match claim.value_type.as_str() { "boolean" => claim .value .as_bool() .map(ObjectValue::Boolean) .unwrap_or_else(|| ObjectValue::Text(claim.value.to_string())), "number" => claim .value .as_f64() .map(ObjectValue::Number) .unwrap_or_else(|| ObjectValue::Text(claim.value.to_string())), _ => ObjectValue::Text( claim .value .as_str() .map(|s| s.to_string()) .unwrap_or_else(|| claim.value.to_string()), ), }; // If no vocabulary, accept all claims (backwards compatibility) let Some(vocab) = &self.vocabulary else { return Some(ExtractedClaim { concept_path: format!("{}/{}", concept_prefix, claim.subject), predicate: claim.predicate, value, file: file_path.to_string(), line: claim.line, matched_text: claim.matched_text, confidence: claim.confidence, description: claim.description, }); }; // Try exact match on both subject AND predicate first if let Some(concept) = vocab.find_by_leaf_and_predicate(&claim.subject, &claim.predicate) { debug!( subject = %claim.subject, predicate = %claim.predicate, "Claim matched ontology concept" ); return Some(ExtractedClaim { concept_path: format!("{}/{}", concept_prefix, concept.leaf_path), predicate: concept.predicate.clone(), value, file: file_path.to_string(), line: claim.line, matched_text: claim.matched_text, confidence: claim.confidence, description: claim.description, }); } // Subject exists but predicate doesn't match any known predicate for it if vocab.find_by_leaf(&claim.subject).is_some() { debug!( subject = %claim.subject, claim_predicate = %claim.predicate, "Claim subject exists but predicate not in vocabulary" ); } // Try fuzzy matching for near-misses if let Some(concept) = vocab.fuzzy_match(&claim.subject, 0.6) { warn!( original = %claim.subject, matched = %concept.leaf_path, "Fuzzy matched claim to authority concept" ); return Some(ExtractedClaim { concept_path: format!("{}/{}", concept_prefix, concept.leaf_path), predicate: concept.predicate.clone(), value, file: file_path.to_string(), line: claim.line, matched_text: claim.matched_text, confidence: claim.confidence * 0.9, // Reduce confidence for fuzzy matches description: claim.description, }); } // Claim doesn't match any known concept debug!( subject = %claim.subject, "Rejecting claim - no matching ontology concept" ); None } } /// Check if a file path indicates a high-value file for security analysis. /// /// High-value files include: /// - Files in security-sensitive directories (auth/, config/, crypto/, etc.) /// - Files with security-related names (password, secret, credential, etc.) pub fn is_high_value_file(path: &str) -> bool { let lower = path.to_lowercase(); // High-value directories let dirs = [ "auth/", "authentication/", "config/", "configuration/", "crypto/", "cryptography/", "security/", "secrets/", "certs/", "certificates/", "ssl/", "tls/", "keys/", "credentials/", ]; // High-value file name components let names = [ "secret", "password", "credential", "token", "auth", "login", "session", "jwt", "tls", "ssl", "cert", "key", "config", "settings", "security", "crypto", "encrypt", "decrypt", "oauth", "saml", "ldap", "api_key", "apikey", "access_key", "private", ]; dirs.iter().any(|d| lower.contains(d)) || names.iter().any(|n| lower.contains(n)) } #[cfg(test)] mod tests { use super::*; use stemedb_core::types::{Assertion, HlcTimestamp, LifecycleStage, SourceClass}; fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion { let source_metadata = serde_json::json!({ "description": "Test description", "source": "test", }); Assertion { subject: subject.to_string(), predicate: predicate.to_string(), object: value, parent_hash: None, source_hash: [0u8; 32], source_class: SourceClass::Clinical, visual_hash: None, epoch: None, source_metadata: serde_json::to_vec(&source_metadata).ok(), lifecycle: LifecycleStage::Approved, signatures: vec![], confidence: 1.0, timestamp: 0, hlc_timestamp: HlcTimestamp::default(), vector: None, } } #[test] fn test_is_high_value_file_directories() { assert!(is_high_value_file("src/auth/login.py")); assert!(is_high_value_file("config/database.yaml")); assert!(is_high_value_file("pkg/crypto/encrypt.go")); assert!(is_high_value_file("security/firewall.rs")); assert!(is_high_value_file("secrets/api_keys.env")); assert!(is_high_value_file("certs/server.pem")); } #[test] fn test_is_high_value_file_names() { assert!(is_high_value_file("src/password_validator.py")); assert!(is_high_value_file("lib/jwt_handler.ts")); assert!(is_high_value_file("utils/token_generator.go")); assert!(is_high_value_file("services/oauth_client.rs")); } #[test] fn test_is_high_value_file_not_high_value() { assert!(!is_high_value_file("src/main.rs")); assert!(!is_high_value_file("lib/utils.py")); assert!(!is_high_value_file("pkg/handler.go")); assert!(!is_high_value_file("tests/test_api.rs")); } #[test] fn test_vocabulary_from_hardcoded_assertions() { let assertions = vec![ make_test_assertion( "rfc://5246/tls/cert_verification", "enabled", ObjectValue::Boolean(true), ), make_test_assertion( "owasp://rate_limit/enabled", "enabled", ObjectValue::Boolean(true), ), make_test_assertion( "owasp://crypto/hashing/algorithm", "algorithm", ObjectValue::Text("secure".to_string()), ), ]; let vocab = OntologyVocabulary::from_assertions(&assertions); assert_eq!(vocab.concepts.len(), 3); // Check leaf path extraction assert!(vocab.find_by_leaf("tls/cert_verification").is_some()); assert!(vocab.find_by_leaf("rate_limit/enabled").is_some()); assert!(vocab.find_by_leaf("hashing/algorithm").is_some()); } #[test] fn test_prompt_section_format() { let assertions = vec![make_test_assertion( "owasp://rate_limit/enabled", "enabled", ObjectValue::Boolean(true), )]; let vocab = OntologyVocabulary::from_assertions(&assertions); let section = vocab.to_prompt_section(); // Should contain table headers assert!(section.contains("Concept Path")); assert!(section.contains("Predicate")); assert!(section.contains("Value Type")); // Should contain our concept assert!(section.contains("rate_limit/enabled")); assert!(section.contains("enabled")); assert!(section.contains("boolean")); } }