## Phase 8: Enterprise Extractor Improvements ✅ - 14 security extractors (TLS, JWT, SQL injection, XSS, etc.) - 10 framework-specific extractors (Spring, Django, Rails, etc.) - Config file security detection (YAML, TOML) ## Phase 9: Autonomous Extractor Generation ✅ - Shadow mode executor with TP/FP tracking - Graduation pipeline with confidence thresholds - Auto-rollback on regression detection - Cross-project pattern syncing ## UAT Suite Complete (14 scripts, 90 tests) - test-core-detection.sh (6 tests) - test-declarative-extractors.sh (5 tests) - test-domain-frameworks.sh (5 tests) - test-domain-unreal.sh (3 tests) - test-llm-extraction.sh (6 tests) - test-eval-harness.sh (5 tests) - test-cross-language.sh (3 tests) - test-precommit-performance.sh (4 tests) - test-output-formats.sh (8 tests) - test-drift-detection.sh (6 tests) - test-exit-codes.sh (12 tests) + 3 more scripts ## Other Changes - Updated roadmap to mark Phase 8-9 complete - Added .gitignore entries for build artifacts - Updated pre-commit: 800 line limit, exclude tests/data/cmd Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
533 lines
18 KiB
Rust
533 lines
18 KiB
Rust
//! LLM-based claim extractor with selective triggering and ontology awareness.
|
|
//!
|
|
//! The LLM extractor only runs on high-value files where regex extractors
//! found nothing. It sends the code to an LLM (through the configured API
//! client) for semantic analysis and extracts security-relevant claims.
|
|
//!
|
|
//! ## Ontology-Aware Extraction
|
|
//!
|
|
//! The extractor is initialized with an `OntologyVocabulary` that constrains
|
|
//! the LLM output to use concept paths from the authority corpus. This ensures
|
|
//! claims match authority subjects for proper conflict detection.
|
|
|
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
use std::sync::Arc;
|
|
|
|
use stemedb_core::types::ObjectValue;
|
|
use tracing::{debug, info, instrument, warn};
|
|
|
|
use crate::config::LlmConfig;
|
|
use crate::llm::cache::{CachedResponse, LlmCache};
|
|
use crate::llm::client::GeminiClient;
|
|
use crate::llm::ontology::OntologyVocabulary;
|
|
use crate::llm::prompt::build_system_prompt;
|
|
use crate::llm::prompts::{
|
|
extract_json, language_to_extension, language_to_name, language_to_prefix,
|
|
DEFAULT_SYSTEM_PROMPT,
|
|
};
|
|
use crate::llm::types::{LlmClaim, LlmClaimsResponse};
|
|
use crate::types::{ExtractedClaim, Language};
|
|
|
|
/// LLM-based claim extractor with ontology awareness.
|
|
pub struct LlmExtractor {
|
|
/// Claude API client (optional for cache-only mode).
|
|
client: Option<GeminiClient>,
|
|
/// Response cache.
|
|
cache: LlmCache,
|
|
/// Configuration.
|
|
config: LlmConfig,
|
|
/// Token budget tracking (thread-safe for parallel file processing).
|
|
tokens_used: Arc<AtomicUsize>,
|
|
/// Ontology vocabulary for constraining output (optional for backwards compatibility).
|
|
vocabulary: Option<Arc<OntologyVocabulary>>,
|
|
/// Pre-built system prompt with vocabulary.
|
|
system_prompt: String,
|
|
/// Cache-only mode (no API calls, return empty on cache miss).
|
|
cache_only: bool,
|
|
}
|
|
|
|
impl LlmExtractor {
|
|
/// Create a new LLM extractor without ontology vocabulary.
|
|
///
|
|
/// This is the backwards-compatible constructor. Claims will not be
|
|
/// validated against authority vocabulary.
|
|
pub fn new(client: GeminiClient, cache: LlmCache, config: LlmConfig) -> Self {
|
|
Self {
|
|
client: Some(client),
|
|
cache,
|
|
config,
|
|
tokens_used: Arc::new(AtomicUsize::new(0)),
|
|
vocabulary: None,
|
|
system_prompt: DEFAULT_SYSTEM_PROMPT.to_string(),
|
|
cache_only: false,
|
|
}
|
|
}
|
|
|
|
/// Create a new LLM extractor with ontology vocabulary.
|
|
///
|
|
/// The vocabulary constrains LLM output to use concept paths from the
|
|
/// authority corpus, ensuring proper conflict detection.
|
|
pub fn with_vocabulary(
|
|
client: GeminiClient,
|
|
cache: LlmCache,
|
|
config: LlmConfig,
|
|
vocabulary: OntologyVocabulary,
|
|
) -> Self {
|
|
let system_prompt = build_system_prompt(&vocabulary);
|
|
info!(concept_count = vocabulary.concepts.len(), "Built ontology-aware system prompt");
|
|
|
|
Self {
|
|
client: Some(client),
|
|
cache,
|
|
config,
|
|
tokens_used: Arc::new(AtomicUsize::new(0)),
|
|
vocabulary: Some(Arc::new(vocabulary)),
|
|
system_prompt,
|
|
cache_only: false,
|
|
}
|
|
}
|
|
|
|
/// Create a cache-only LLM extractor with ontology vocabulary.
|
|
///
|
|
/// This extractor only returns cached responses; it never makes API calls.
|
|
/// Use this for deterministic evaluation runs against previously-cached
|
|
/// LLM responses.
|
|
pub fn with_vocabulary_cached(
|
|
cache: LlmCache,
|
|
config: LlmConfig,
|
|
vocabulary: OntologyVocabulary,
|
|
) -> Self {
|
|
let system_prompt = build_system_prompt(&vocabulary);
|
|
info!(
|
|
concept_count = vocabulary.concepts.len(),
|
|
"Built cache-only ontology-aware extractor"
|
|
);
|
|
|
|
Self {
|
|
client: None,
|
|
cache,
|
|
config,
|
|
tokens_used: Arc::new(AtomicUsize::new(0)),
|
|
vocabulary: Some(Arc::new(vocabulary)),
|
|
system_prompt,
|
|
cache_only: true,
|
|
}
|
|
}
|
|
|
|
/// Get total tokens used so far.
|
|
pub fn tokens_used(&self) -> usize {
|
|
self.tokens_used.load(Ordering::Relaxed)
|
|
}
|
|
|
|
/// Check if we're within the token budget.
|
|
fn within_budget(&self) -> bool {
|
|
self.tokens_used.load(Ordering::Relaxed) < self.config.max_tokens_per_scan
|
|
}
|
|
|
|
/// Extract claims from file content using LLM.
|
|
///
|
|
/// Returns an empty vector if:
|
|
/// - Token budget is exhausted
|
|
/// - File is not high-value (when high_value_only is set)
|
|
/// - Content is too short (<50 chars)
|
|
/// - LLM returns no claims or errors
|
|
#[instrument(skip(self, content), fields(file = %file_path, language = ?language, content_len = content.len()))]
|
|
pub fn extract(
|
|
&self,
|
|
path_segments: &[String],
|
|
content: &str,
|
|
language: Language,
|
|
file_path: &str,
|
|
) -> Vec<ExtractedClaim> {
|
|
// Check token budget
|
|
if !self.within_budget() {
|
|
debug!("Token budget exhausted, skipping LLM extraction");
|
|
return vec![];
|
|
}
|
|
|
|
// Check high-value filter
|
|
if self.config.high_value_only && !is_high_value_file(file_path) {
|
|
debug!("File not high-value, skipping LLM extraction");
|
|
return vec![];
|
|
}
|
|
|
|
// Skip very short content
|
|
if content.len() < 50 {
|
|
debug!("Content too short, skipping LLM extraction");
|
|
return vec![];
|
|
}
|
|
|
|
// Build concept path prefix from path segments
|
|
let concept_prefix = if path_segments.is_empty() {
|
|
format!("code://{}", language_to_prefix(language))
|
|
} else {
|
|
format!("code://{}/{}", language_to_prefix(language), path_segments.join("/"))
|
|
};
|
|
|
|
// Check cache first (now includes prompt hash for automatic invalidation)
|
|
let cache_key = LlmCache::cache_key(content, &self.config.model, &self.system_prompt);
|
|
if let Some(cached) = self.cache.get(&cache_key) {
|
|
debug!("Using cached LLM response");
|
|
// Update token count from cache (for budget tracking across files)
|
|
self.tokens_used
|
|
.fetch_add(cached.input_tokens + cached.output_tokens, Ordering::Relaxed);
|
|
return self.parse_claims(&cached.claims_json, &concept_prefix, file_path);
|
|
}
|
|
|
|
// In cache-only mode, return empty on cache miss
|
|
if self.cache_only {
|
|
debug!("Cache miss in cache-only mode, returning empty");
|
|
return vec![];
|
|
}
|
|
|
|
// Check if we have a client for API calls
|
|
let client = match &self.client {
|
|
Some(c) => c,
|
|
None => {
|
|
debug!("No API client available, returning empty");
|
|
return vec![];
|
|
}
|
|
};
|
|
|
|
// Call Claude API with ontology-aware prompt
|
|
let user_message = format!(
|
|
"Analyze this {} code for security-relevant claims:\n\n```{}\n{}\n```",
|
|
language_to_name(language),
|
|
language_to_extension(language),
|
|
content
|
|
);
|
|
|
|
match client.complete(&self.system_prompt, &user_message) {
|
|
Ok(result) => {
|
|
// Update token budget
|
|
let tokens = result.input_tokens + result.output_tokens;
|
|
self.tokens_used.fetch_add(tokens, Ordering::Relaxed);
|
|
|
|
info!(
|
|
input_tokens = result.input_tokens,
|
|
output_tokens = result.output_tokens,
|
|
total_used = self.tokens_used.load(Ordering::Relaxed),
|
|
budget = self.config.max_tokens_per_scan,
|
|
"LLM extraction complete"
|
|
);
|
|
|
|
// Cache the response
|
|
if self.config.cache_responses {
|
|
let timestamp = std::time::SystemTime::now()
|
|
.duration_since(std::time::UNIX_EPOCH)
|
|
.map(|d| d.as_secs())
|
|
.unwrap_or(0);
|
|
|
|
let cached_response = CachedResponse {
|
|
claims_json: result.response_text.clone(),
|
|
cached_at: timestamp,
|
|
input_tokens: result.input_tokens,
|
|
output_tokens: result.output_tokens,
|
|
};
|
|
self.cache.put(&cache_key, &cached_response);
|
|
}
|
|
|
|
self.parse_claims(&result.response_text, &concept_prefix, file_path)
|
|
}
|
|
Err(e) => {
|
|
warn!(error = %e, "LLM extraction failed");
|
|
vec![]
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Parse LLM JSON response into ExtractedClaim structs.
|
|
///
|
|
/// When vocabulary is available, validates claims against the ontology
|
|
/// and uses fuzzy matching to correct near-misses.
|
|
fn parse_claims(
|
|
&self,
|
|
json: &str,
|
|
concept_prefix: &str,
|
|
file_path: &str,
|
|
) -> Vec<ExtractedClaim> {
|
|
// Try to extract JSON from response (may have markdown code blocks)
|
|
let json_str = extract_json(json);
|
|
|
|
let response: LlmClaimsResponse = match serde_json::from_str(json_str) {
|
|
Ok(r) => r,
|
|
Err(e) => {
|
|
debug!(error = %e, json = %json, "Failed to parse LLM response");
|
|
return vec![];
|
|
}
|
|
};
|
|
|
|
response
|
|
.claims
|
|
.into_iter()
|
|
.filter(|c| c.confidence >= self.config.min_confidence)
|
|
.filter_map(|claim| self.validate_and_transform_claim(claim, concept_prefix, file_path))
|
|
.collect()
|
|
}
|
|
|
|
/// Validate a claim against the ontology and transform it to an ExtractedClaim.
|
|
///
|
|
/// Returns None if the claim doesn't match any known concept.
|
|
fn validate_and_transform_claim(
|
|
&self,
|
|
claim: LlmClaim,
|
|
concept_prefix: &str,
|
|
file_path: &str,
|
|
) -> Option<ExtractedClaim> {
|
|
let value = match claim.value_type.as_str() {
|
|
"boolean" => claim
|
|
.value
|
|
.as_bool()
|
|
.map(ObjectValue::Boolean)
|
|
.unwrap_or_else(|| ObjectValue::Text(claim.value.to_string())),
|
|
"number" => claim
|
|
.value
|
|
.as_f64()
|
|
.map(ObjectValue::Number)
|
|
.unwrap_or_else(|| ObjectValue::Text(claim.value.to_string())),
|
|
_ => ObjectValue::Text(
|
|
claim
|
|
.value
|
|
.as_str()
|
|
.map(|s| s.to_string())
|
|
.unwrap_or_else(|| claim.value.to_string()),
|
|
),
|
|
};
|
|
|
|
// If no vocabulary, accept all claims (backwards compatibility)
|
|
let Some(vocab) = &self.vocabulary else {
|
|
return Some(ExtractedClaim {
|
|
concept_path: format!("{}/{}", concept_prefix, claim.subject),
|
|
predicate: claim.predicate,
|
|
value,
|
|
file: file_path.to_string(),
|
|
line: claim.line,
|
|
matched_text: claim.matched_text,
|
|
confidence: claim.confidence,
|
|
description: claim.description,
|
|
});
|
|
};
|
|
|
|
// Try exact match on both subject AND predicate first
|
|
if let Some(concept) = vocab.find_by_leaf_and_predicate(&claim.subject, &claim.predicate) {
|
|
debug!(
|
|
subject = %claim.subject,
|
|
predicate = %claim.predicate,
|
|
"Claim matched ontology concept"
|
|
);
|
|
return Some(ExtractedClaim {
|
|
concept_path: format!("{}/{}", concept_prefix, concept.leaf_path),
|
|
predicate: concept.predicate.clone(),
|
|
value,
|
|
file: file_path.to_string(),
|
|
line: claim.line,
|
|
matched_text: claim.matched_text,
|
|
confidence: claim.confidence,
|
|
description: claim.description,
|
|
});
|
|
}
|
|
|
|
// Subject exists but predicate doesn't match any known predicate for it
|
|
if vocab.find_by_leaf(&claim.subject).is_some() {
|
|
debug!(
|
|
subject = %claim.subject,
|
|
claim_predicate = %claim.predicate,
|
|
"Claim subject exists but predicate not in vocabulary"
|
|
);
|
|
}
|
|
|
|
// Try fuzzy matching for near-misses
|
|
if let Some(concept) = vocab.fuzzy_match(&claim.subject, 0.6) {
|
|
warn!(
|
|
original = %claim.subject,
|
|
matched = %concept.leaf_path,
|
|
"Fuzzy matched claim to authority concept"
|
|
);
|
|
return Some(ExtractedClaim {
|
|
concept_path: format!("{}/{}", concept_prefix, concept.leaf_path),
|
|
predicate: concept.predicate.clone(),
|
|
value,
|
|
file: file_path.to_string(),
|
|
line: claim.line,
|
|
matched_text: claim.matched_text,
|
|
confidence: claim.confidence * 0.9, // Reduce confidence for fuzzy matches
|
|
description: claim.description,
|
|
});
|
|
}
|
|
|
|
// Claim doesn't match any known concept
|
|
debug!(
|
|
subject = %claim.subject,
|
|
"Rejecting claim - no matching ontology concept"
|
|
);
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Decide whether a path looks like a high-value file for security analysis.
///
/// A path qualifies when its lowercased form contains either
/// - a security-sensitive directory fragment (`auth/`, `config/`, `crypto/`, ...), or
/// - a security-related name fragment (`password`, `secret`, `credential`, ...).
///
/// Both checks are plain substring tests over the whole path, so a name
/// fragment also matches inside directory components and inside longer words.
pub fn is_high_value_file(path: &str) -> bool {
    // Directory fragments that mark a security-sensitive location.
    const SENSITIVE_DIRS: &[&str] = &[
        "auth/",
        "authentication/",
        "config/",
        "configuration/",
        "crypto/",
        "cryptography/",
        "security/",
        "secrets/",
        "certs/",
        "certificates/",
        "ssl/",
        "tls/",
        "keys/",
        "credentials/",
    ];

    // Name fragments that mark security-related files.
    const SENSITIVE_NAMES: &[&str] = &[
        "secret",
        "password",
        "credential",
        "token",
        "auth",
        "login",
        "session",
        "jwt",
        "tls",
        "ssl",
        "cert",
        "key",
        "config",
        "settings",
        "security",
        "crypto",
        "encrypt",
        "decrypt",
        "oauth",
        "saml",
        "ldap",
        "api_key",
        "apikey",
        "access_key",
        "private",
    ];

    let normalized = path.to_lowercase();

    SENSITIVE_DIRS
        .iter()
        .chain(SENSITIVE_NAMES)
        .any(|fragment| normalized.contains(*fragment))
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use stemedb_core::types::{Assertion, HlcTimestamp, LifecycleStage, SourceClass};
|
|
|
|
fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion {
|
|
let source_metadata = serde_json::json!({
|
|
"description": "Test description",
|
|
"source": "test",
|
|
});
|
|
|
|
Assertion {
|
|
subject: subject.to_string(),
|
|
predicate: predicate.to_string(),
|
|
object: value,
|
|
parent_hash: None,
|
|
source_hash: [0u8; 32],
|
|
source_class: SourceClass::Clinical,
|
|
visual_hash: None,
|
|
epoch: None,
|
|
source_metadata: serde_json::to_vec(&source_metadata).ok(),
|
|
lifecycle: LifecycleStage::Approved,
|
|
signatures: vec![],
|
|
confidence: 1.0,
|
|
timestamp: 0,
|
|
hlc_timestamp: HlcTimestamp::default(),
|
|
vector: None,
|
|
}
|
|
}
|
|
|
|
/// Paths located in security-sensitive directories are high-value.
#[test]
fn test_is_high_value_file_directories() {
    let sensitive_dir_paths = [
        "src/auth/login.py",
        "config/database.yaml",
        "pkg/crypto/encrypt.go",
        "security/firewall.rs",
        "secrets/api_keys.env",
        "certs/server.pem",
    ];

    for path in &sensitive_dir_paths {
        assert!(is_high_value_file(path), "expected high-value: {}", path);
    }
}
|
|
|
|
/// Files with security-related names are high-value.
#[test]
fn test_is_high_value_file_names() {
    let sensitive_name_paths = [
        "src/password_validator.py",
        "lib/jwt_handler.ts",
        "utils/token_generator.go",
        "services/oauth_client.rs",
    ];

    for path in &sensitive_name_paths {
        assert!(is_high_value_file(path), "expected high-value: {}", path);
    }
}
|
|
|
|
/// Ordinary source paths are not high-value.
#[test]
fn test_is_high_value_file_not_high_value() {
    let ordinary_paths = [
        "src/main.rs",
        "lib/utils.py",
        "pkg/handler.go",
        "tests/test_api.rs",
    ];

    for path in &ordinary_paths {
        assert!(!is_high_value_file(path), "expected not high-value: {}", path);
    }
}
|
|
|
|
/// A vocabulary built from three assertions exposes three concepts, each
/// reachable through its leaf path.
#[test]
fn test_vocabulary_from_hardcoded_assertions() {
    let fixtures = vec![
        (
            "rfc://5246/tls/cert_verification",
            "enabled",
            ObjectValue::Boolean(true),
        ),
        (
            "owasp://rate_limit/enabled",
            "enabled",
            ObjectValue::Boolean(true),
        ),
        (
            "owasp://crypto/hashing/algorithm",
            "algorithm",
            ObjectValue::Text("secure".to_string()),
        ),
    ];

    let assertions: Vec<Assertion> = fixtures
        .into_iter()
        .map(|(subject, predicate, value)| make_test_assertion(subject, predicate, value))
        .collect();

    let vocab = OntologyVocabulary::from_assertions(&assertions);

    assert_eq!(vocab.concepts.len(), 3);

    // Check leaf path extraction.
    let expected_leaves = [
        "tls/cert_verification",
        "rate_limit/enabled",
        "hashing/algorithm",
    ];
    for leaf in &expected_leaves {
        assert!(vocab.find_by_leaf(*leaf).is_some(), "missing leaf: {}", leaf);
    }
}
|
|
|
|
/// The prompt section rendered from a vocabulary contains the table headers
/// and the concept built from the fixture assertion.
#[test]
fn test_prompt_section_format() {
    let assertions = vec![make_test_assertion(
        "owasp://rate_limit/enabled",
        "enabled",
        ObjectValue::Boolean(true),
    )];

    let section = OntologyVocabulary::from_assertions(&assertions).to_prompt_section();

    // Table headers first, then the concept's path, predicate and value type.
    let expected_fragments = [
        "Concept Path",
        "Predicate",
        "Value Type",
        "rate_limit/enabled",
        "enabled",
        "boolean",
    ];
    for fragment in &expected_fragments {
        assert!(
            section.contains(*fragment),
            "prompt section is missing: {}",
            fragment
        );
    }
}
|
|
}
|