stemedb/applications/aphoria/src/llm/extractor.rs

//! LLM-based claim extractor with selective triggering and ontology awareness.
//!
//! The LLM extractor only runs on high-value files where regex extractors
//! found nothing. It uses the configured LLM API client to semantically
//! analyze code and extract security-relevant claims.
//!
//! ## Ontology-Aware Extraction
//!
//! The extractor is initialized with an `OntologyVocabulary` that constrains
//! the LLM output to use concept paths from the authority corpus. This ensures
//! claims match authority subjects for proper conflict detection.

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

use stemedb_core::types::ObjectValue;
use tracing::{debug, info, instrument, warn};

use crate::config::LlmConfig;
use crate::llm::cache::{CachedResponse, LlmCache};
use crate::llm::client::GeminiClient;
use crate::llm::ontology::OntologyVocabulary;
use crate::llm::prompt::build_system_prompt;
use crate::llm::prompts::{
    extract_json, language_to_extension, language_to_name, language_to_prefix,
    DEFAULT_SYSTEM_PROMPT,
};
use crate::llm::types::{LlmClaim, LlmClaimsResponse};
use crate::types::{ExtractedClaim, Language};

/// LLM-based claim extractor with ontology awareness.
///
/// Bundles the optional API client, the response cache, the scan
/// configuration, a shared token counter, and the system prompt that is
/// fixed at construction time.
pub struct LlmExtractor {
    /// LLM API client; `None` when the extractor is built in cache-only mode.
    client: Option<GeminiClient>,
    /// Response cache.
    cache: LlmCache,
    /// Configuration.
    config: LlmConfig,
    /// Token budget tracking (thread-safe for parallel file processing).
    tokens_used: Arc<AtomicUsize>,
    /// Ontology vocabulary for constraining output (optional for backwards compatibility).
    vocabulary: Option<Arc<OntologyVocabulary>>,
    /// System prompt sent with each request: either the default prompt or
    /// one built from the vocabulary.
    system_prompt: String,
    /// Cache-only mode (no API calls, return empty on cache miss).
    cache_only: bool,
}

impl LlmExtractor {
    /// Build an extractor that has no ontology vocabulary attached.
    ///
    /// Kept as the backwards-compatible constructor: the default system
    /// prompt is used and extracted claims are not validated against
    /// authority vocabulary.
    pub fn new(client: GeminiClient, cache: LlmCache, config: LlmConfig) -> Self {
        let system_prompt = DEFAULT_SYSTEM_PROMPT.to_string();
        // Token accounting starts from zero for every new extractor.
        let tokens_used = Arc::new(AtomicUsize::new(0));

        Self {
            client: Some(client),
            cache,
            config,
            tokens_used,
            vocabulary: None,
            system_prompt,
            cache_only: false,
        }
    }

    /// Build an extractor whose prompt is derived from an ontology vocabulary.
    ///
    /// The system prompt is built from `vocabulary` so that LLM output is
    /// constrained to concept paths from the authority corpus, which is what
    /// allows extracted claims to line up with authority subjects during
    /// conflict detection.
    pub fn with_vocabulary(
        client: GeminiClient,
        cache: LlmCache,
        config: LlmConfig,
        vocabulary: OntologyVocabulary,
    ) -> Self {
        // The prompt is built once here and reused for every request.
        let system_prompt = build_system_prompt(&vocabulary);
        let concept_count = vocabulary.concepts.len();
        info!(concept_count = concept_count, "Built ontology-aware system prompt");

        let vocabulary = Some(Arc::new(vocabulary));
        let tokens_used = Arc::new(AtomicUsize::new(0));

        Self {
            client: Some(client),
            cache,
            config,
            tokens_used,
            vocabulary,
            system_prompt,
            cache_only: false,
        }
    }

    /// Build a cache-only extractor that uses an ontology vocabulary.
    ///
    /// No API client is stored, so this extractor can only serve responses
    /// that are already in `cache`; it never makes API calls. Intended for
    /// deterministic evaluation runs against previously-cached LLM responses.
    pub fn with_vocabulary_cached(
        cache: LlmCache,
        config: LlmConfig,
        vocabulary: OntologyVocabulary,
    ) -> Self {
        // The prompt still has to be built: it is part of the cache key.
        let system_prompt = build_system_prompt(&vocabulary);
        let concept_count = vocabulary.concepts.len();
        info!(
            concept_count = concept_count,
            "Built cache-only ontology-aware extractor"
        );

        let vocabulary = Some(Arc::new(vocabulary));
        let tokens_used = Arc::new(AtomicUsize::new(0));

        Self {
            client: None,
            cache,
            config,
            tokens_used,
            vocabulary,
            system_prompt,
            cache_only: true,
        }
    }

    /// Get total tokens used so far.
    pub fn tokens_used(&self) -> usize {
        self.tokens_used.load(Ordering::Relaxed)
    }

    /// Tell whether the tokens counted so far are still strictly below the
    /// configured `max_tokens_per_scan`.
    fn within_budget(&self) -> bool {
        let spent = self.tokens_used.load(Ordering::Relaxed);
        let limit = self.config.max_tokens_per_scan;
        spent < limit
    }

    /// Run LLM-based claim extraction over the content of one file.
    ///
    /// The steps, in order: budget check, high-value filter, minimum-length
    /// check, cache lookup, and finally an API call whose response is stored
    /// in the cache when `cache_responses` is enabled.
    ///
    /// An empty vector is returned when:
    /// - the token budget is exhausted
    /// - `high_value_only` is set and the file is not high-value
    /// - the content is shorter than 50 bytes
    /// - the lookup misses in cache-only mode, or no API client is available
    /// - the LLM call fails, or its response yields no usable claims
    #[instrument(skip(self, content), fields(file = %file_path, language = ?language, content_len = content.len()))]
    pub fn extract(
        &self,
        path_segments: &[String],
        content: &str,
        language: Language,
        file_path: &str,
    ) -> Vec<ExtractedClaim> {
        // Guard: nothing is attempted once the budget has been used up.
        if !self.within_budget() {
            debug!("Token budget exhausted, skipping LLM extraction");
            return Vec::new();
        }

        // Guard: optional restriction to high-value files.
        if self.config.high_value_only && !is_high_value_file(file_path) {
            debug!("File not high-value, skipping LLM extraction");
            return Vec::new();
        }

        // Guard: very short content is skipped (length measured in bytes).
        if content.len() < 50 {
            debug!("Content too short, skipping LLM extraction");
            return Vec::new();
        }

        // Concept paths are rooted at `code://<language prefix>[/<segments...>]`.
        let lang_prefix = language_to_prefix(language);
        let concept_prefix = match path_segments {
            [] => format!("code://{}", lang_prefix),
            segments => format!("code://{}/{}", lang_prefix, segments.join("/")),
        };

        // The key covers content, model and system prompt, so changing the
        // prompt results in a different key.
        let cache_key = LlmCache::cache_key(content, &self.config.model, &self.system_prompt);
        if let Some(hit) = self.cache.get(&cache_key) {
            debug!("Using cached LLM response");
            // Cached responses still count towards the budget so that
            // tracking stays consistent across files.
            let cached_tokens = hit.input_tokens + hit.output_tokens;
            self.tokens_used.fetch_add(cached_tokens, Ordering::Relaxed);
            return self.parse_claims(&hit.claims_json, &concept_prefix, file_path);
        }

        // A miss ends the attempt when API calls are not allowed.
        if self.cache_only {
            debug!("Cache miss in cache-only mode, returning empty");
            return Vec::new();
        }

        let Some(client) = self.client.as_ref() else {
            debug!("No API client available, returning empty");
            return Vec::new();
        };

        // Ask the model, sending the pre-built system prompt alongside the code.
        let user_message = format!(
            "Analyze this {} code for security-relevant claims:\n\n```{}\n{}\n```",
            language_to_name(language),
            language_to_extension(language),
            content
        );

        let result = match client.complete(&self.system_prompt, &user_message) {
            Ok(result) => result,
            Err(e) => {
                warn!(error = %e, "LLM extraction failed");
                return Vec::new();
            }
        };

        // Charge the tokens of this call to the budget.
        let spent_now = result.input_tokens + result.output_tokens;
        self.tokens_used.fetch_add(spent_now, Ordering::Relaxed);

        info!(
            input_tokens = result.input_tokens,
            output_tokens = result.output_tokens,
            total_used = self.tokens_used.load(Ordering::Relaxed),
            budget = self.config.max_tokens_per_scan,
            "LLM extraction complete"
        );

        // Store the raw response text; it is parsed again on a later cache hit.
        if self.config.cache_responses {
            // A clock set before the Unix epoch falls back to a timestamp of 0.
            let cached_at = std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .map_or(0, |elapsed| elapsed.as_secs());

            let entry = CachedResponse {
                claims_json: result.response_text.clone(),
                cached_at,
                input_tokens: result.input_tokens,
                output_tokens: result.output_tokens,
            };
            self.cache.put(&cache_key, &entry);
        }

        self.parse_claims(&result.response_text, &concept_prefix, file_path)
    }

    /// Turn the raw LLM response text into `ExtractedClaim` values.
    ///
    /// The JSON payload is first pulled out of the response with
    /// `extract_json` and deserialized. Claims whose confidence is below
    /// `min_confidence` are dropped; the rest go through
    /// `validate_and_transform_claim`, which may reject them as well.
    /// A response that cannot be deserialized yields an empty vector.
    fn parse_claims(
        &self,
        json: &str,
        concept_prefix: &str,
        file_path: &str,
    ) -> Vec<ExtractedClaim> {
        // The response may wrap the JSON (for example in a markdown code block).
        let parsed = serde_json::from_str::<LlmClaimsResponse>(extract_json(json));
        let response = match parsed {
            Ok(response) => response,
            Err(e) => {
                debug!(error = %e, json = %json, "Failed to parse LLM response");
                return Vec::new();
            }
        };

        let mut accepted = Vec::new();
        for claim in response.claims {
            // Written as `>=` so that a NaN confidence is dropped too.
            if claim.confidence >= self.config.min_confidence {
                if let Some(extracted) =
                    self.validate_and_transform_claim(claim, concept_prefix, file_path)
                {
                    accepted.push(extracted);
                }
            }
        }
        accepted
    }

    /// Check a claim against the ontology and convert it to an `ExtractedClaim`.
    ///
    /// Resolution order:
    /// 1. No vocabulary configured: the claim is accepted as-is.
    /// 2. Exact match on subject and predicate: the concept's leaf path and
    ///    predicate are used.
    /// 3. Fuzzy match on the subject (threshold 0.6): the matched concept's
    ///    leaf path and predicate are used and confidence is scaled by 0.9.
    ///
    /// Returns `None` when a vocabulary is configured and none of the
    /// matches succeed.
    fn validate_and_transform_claim(
        &self,
        claim: LlmClaim,
        concept_prefix: &str,
        file_path: &str,
    ) -> Option<ExtractedClaim> {
        // Convert the raw value according to its declared type; a value that
        // does not fit the declared type falls back to its textual form.
        let raw = &claim.value;
        let value = match claim.value_type.as_str() {
            "boolean" => raw
                .as_bool()
                .map_or_else(|| ObjectValue::Text(raw.to_string()), ObjectValue::Boolean),
            "number" => raw
                .as_f64()
                .map_or_else(|| ObjectValue::Text(raw.to_string()), ObjectValue::Number),
            _ => ObjectValue::Text(match raw.as_str() {
                Some(text) => text.to_string(),
                None => raw.to_string(),
            }),
        };

        // Work out the parts that depend on how (or whether) the claim matched.
        let (concept_path, predicate, confidence) = match self.vocabulary.as_deref() {
            // Without a vocabulary every claim is accepted (backwards compatibility).
            None => (
                format!("{}/{}", concept_prefix, claim.subject),
                claim.predicate,
                claim.confidence,
            ),
            Some(vocab) => {
                if let Some(concept) =
                    vocab.find_by_leaf_and_predicate(&claim.subject, &claim.predicate)
                {
                    debug!(
                        subject = %claim.subject,
                        predicate = %claim.predicate,
                        "Claim matched ontology concept"
                    );
                    (
                        format!("{}/{}", concept_prefix, concept.leaf_path),
                        concept.predicate.clone(),
                        claim.confidence,
                    )
                } else {
                    // Diagnostic only: the subject is known, the predicate is not.
                    if vocab.find_by_leaf(&claim.subject).is_some() {
                        debug!(
                            subject = %claim.subject,
                            claim_predicate = %claim.predicate,
                            "Claim subject exists but predicate not in vocabulary"
                        );
                    }

                    match vocab.fuzzy_match(&claim.subject, 0.6) {
                        Some(concept) => {
                            warn!(
                                original = %claim.subject,
                                matched = %concept.leaf_path,
                                "Fuzzy matched claim to authority concept"
                            );
                            (
                                format!("{}/{}", concept_prefix, concept.leaf_path),
                                concept.predicate.clone(),
                                // Fuzzy matches are trusted a little less.
                                claim.confidence * 0.9,
                            )
                        }
                        None => {
                            debug!(
                                subject = %claim.subject,
                                "Rejecting claim - no matching ontology concept"
                            );
                            return None;
                        }
                    }
                }
            }
        };

        Some(ExtractedClaim {
            concept_path,
            predicate,
            value,
            file: file_path.to_string(),
            line: claim.line,
            matched_text: claim.matched_text,
            confidence,
            description: claim.description,
        })
    }
}

/// Decide whether a path looks like a high-value file for security analysis.
///
/// The path is lowercased and searched for any of a fixed set of fragments:
/// - security-sensitive directory fragments (`auth/`, `config/`, `crypto/`, ...)
/// - security-related name fragments (`password`, `secret`, `credential`, ...)
///
/// Matching is plain substring containment, so a fragment can match anywhere
/// in the path and not only at a path-component boundary.
pub fn is_high_value_file(path: &str) -> bool {
    // Directory fragments that mark a path as high-value.
    const DIR_FRAGMENTS: &[&str] = &[
        "auth/",
        "authentication/",
        "config/",
        "configuration/",
        "crypto/",
        "cryptography/",
        "security/",
        "secrets/",
        "certs/",
        "certificates/",
        "ssl/",
        "tls/",
        "keys/",
        "credentials/",
    ];

    // File-name fragments that mark a path as high-value.
    const NAME_FRAGMENTS: &[&str] = &[
        "secret",
        "password",
        "credential",
        "token",
        "auth",
        "login",
        "session",
        "jwt",
        "tls",
        "ssl",
        "cert",
        "key",
        "config",
        "settings",
        "security",
        "crypto",
        "encrypt",
        "decrypt",
        "oauth",
        "saml",
        "ldap",
        "api_key",
        "apikey",
        "access_key",
        "private",
    ];

    let haystack = path.to_lowercase();

    DIR_FRAGMENTS
        .iter()
        .chain(NAME_FRAGMENTS.iter())
        .any(|fragment| haystack.contains(*fragment))
}

#[cfg(test)]
mod tests {
    use super::*;
    use stemedb_core::types::{Assertion, HlcTimestamp, LifecycleStage, SourceClass};

    /// Build an assertion with fixed test metadata and the given
    /// subject, predicate and object value.
    fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion {
        let metadata_bytes = serde_json::to_vec(&serde_json::json!({
            "description": "Test description",
            "source": "test",
        }))
        .ok();

        Assertion {
            subject: subject.to_string(),
            predicate: predicate.to_string(),
            object: value,
            parent_hash: None,
            source_hash: [0u8; 32],
            source_class: SourceClass::Clinical,
            visual_hash: None,
            epoch: None,
            source_metadata: metadata_bytes,
            lifecycle: LifecycleStage::Approved,
            signatures: vec![],
            confidence: 1.0,
            timestamp: 0,
            hlc_timestamp: HlcTimestamp::default(),
            vector: None,
        }
    }

    /// Paths inside security-sensitive directories are high-value.
    #[test]
    fn test_is_high_value_file_directories() {
        let paths = [
            "src/auth/login.py",
            "config/database.yaml",
            "pkg/crypto/encrypt.go",
            "security/firewall.rs",
            "secrets/api_keys.env",
            "certs/server.pem",
        ];
        for path in paths {
            assert!(is_high_value_file(path), "expected high-value: {}", path);
        }
    }

    /// Paths with security-related file names are high-value.
    #[test]
    fn test_is_high_value_file_names() {
        let paths = [
            "src/password_validator.py",
            "lib/jwt_handler.ts",
            "utils/token_generator.go",
            "services/oauth_client.rs",
        ];
        for path in paths {
            assert!(is_high_value_file(path), "expected high-value: {}", path);
        }
    }

    /// Ordinary source paths are not high-value.
    #[test]
    fn test_is_high_value_file_not_high_value() {
        let paths = [
            "src/main.rs",
            "lib/utils.py",
            "pkg/handler.go",
            "tests/test_api.rs",
        ];
        for path in paths {
            assert!(!is_high_value_file(path), "expected not high-value: {}", path);
        }
    }

    /// A vocabulary built from assertions exposes one concept per assertion,
    /// each reachable by its leaf path.
    #[test]
    fn test_vocabulary_from_hardcoded_assertions() {
        let specs = vec![
            (
                "rfc://5246/tls/cert_verification",
                "enabled",
                ObjectValue::Boolean(true),
            ),
            (
                "owasp://rate_limit/enabled",
                "enabled",
                ObjectValue::Boolean(true),
            ),
            (
                "owasp://crypto/hashing/algorithm",
                "algorithm",
                ObjectValue::Text("secure".to_string()),
            ),
        ];
        let assertions: Vec<Assertion> = specs
            .into_iter()
            .map(|(subject, predicate, value)| make_test_assertion(subject, predicate, value))
            .collect();

        let vocab = OntologyVocabulary::from_assertions(&assertions);

        assert_eq!(vocab.concepts.len(), 3);

        // Check leaf path extraction
        for leaf in ["tls/cert_verification", "rate_limit/enabled", "hashing/algorithm"] {
            assert!(vocab.find_by_leaf(leaf).is_some(), "missing leaf: {}", leaf);
        }
    }

    /// The prompt section contains the table headers and the concept row.
    #[test]
    fn test_prompt_section_format() {
        let assertions = vec![make_test_assertion(
            "owasp://rate_limit/enabled",
            "enabled",
            ObjectValue::Boolean(true),
        )];

        let section = OntologyVocabulary::from_assertions(&assertions).to_prompt_section();

        // Should contain table headers
        for header in ["Concept Path", "Predicate", "Value Type"] {
            assert!(section.contains(header), "missing header: {}", header);
        }

        // Should contain our concept
        for fragment in ["rate_limit/enabled", "enabled", "boolean"] {
            assert!(section.contains(fragment), "missing fragment: {}", fragment);
        }
    }
}