//! Ontology vocabulary extraction from authority corpus. //! //! Extracts concept vocabulary from corpus assertions to constrain //! LLM output to paths that match authority subjects. use serde::Deserialize; use stemedb_core::types::{Assertion, ObjectValue}; /// A concept from the authority corpus. #[derive(Debug, Clone)] pub struct AuthorityConcept { /// Full subject path (e.g., "owasp://rate_limit/enabled") pub subject: String, /// Leaf key for matching (e.g., "rate_limit/enabled") pub leaf_path: String, /// Valid predicate (e.g., "enabled") pub predicate: String, /// Expected value type pub value_type: ValueType, /// Example value for LLM context pub example_value: String, /// Description for LLM context pub description: String, } /// Value type for a concept. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ValueType { /// Boolean value (true/false). Boolean, /// Text string value. Text, /// Numeric value. Number, } impl ValueType { /// Convert to string for prompt. pub fn as_str(&self) -> &'static str { match self { ValueType::Boolean => "boolean", ValueType::Text => "text", ValueType::Number => "number", } } } /// Helper to extract description from source_metadata JSON. #[derive(Debug, Deserialize)] struct SourceMetadata { description: Option, } /// Vocabulary extracted from authority corpus. pub struct OntologyVocabulary { /// List of authority concepts for constraining LLM output. pub concepts: Vec, } impl OntologyVocabulary { /// Build vocabulary from corpus assertions. pub fn from_assertions(assertions: &[Assertion]) -> Self { let concepts = assertions.iter().filter_map(Self::assertion_to_concept).collect(); Self { concepts } } /// Convert an assertion to an AuthorityConcept. fn assertion_to_concept(assertion: &Assertion) -> Option { let leaf_path = Self::extract_leaf_path(&assertion.subject)?; let (value_type, example_value) = match &assertion.object { ObjectValue::Boolean(b) => (ValueType::Boolean, b.to_string()), ObjectValue::Text(t) => (ValueType::Text, t.clone()), ObjectValue::Number(n) => (ValueType::Number, n.to_string()), ObjectValue::Reference(r) => (ValueType::Text, r.clone()), }; // Extract description from source_metadata if available let description = assertion .source_metadata .as_ref() .and_then(|meta| serde_json::from_slice::(meta).ok()) .and_then(|m| m.description) .unwrap_or_else(|| format!("{} {}", assertion.subject, assertion.predicate)); Some(AuthorityConcept { subject: assertion.subject.clone(), leaf_path, predicate: assertion.predicate.clone(), value_type, example_value, description, }) } /// Extract the leaf path from a subject. /// /// For `rfc://5246/tls/cert_verification`, returns `tls/cert_verification`. /// For `owasp://rate_limit/enabled`, returns `rate_limit/enabled`. fn extract_leaf_path(subject: &str) -> Option { // Split on "://" to separate scheme from path let path = subject.find("://").map(|i| &subject[i + 3..]).unwrap_or(subject); // Get last two non-empty segments let mut segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); if segments.len() < 2 { return None; } // Take last 2 segments let len = segments.len(); segments.drain(..len - 2); Some(segments.join("/")) } /// Format concepts as a markdown table for prompt injection. pub fn to_prompt_section(&self) -> String { let mut lines = Vec::with_capacity(self.concepts.len() + 3); lines.push("| Concept Path | Predicate | Value Type | Example | Description |".to_string()); lines.push("|--------------|-----------|------------|---------|-------------|".to_string()); for concept in &self.concepts { // Truncate description for table readability let desc = if concept.description.len() > 60 { format!("{}...", &concept.description[..57]) } else { concept.description.clone() }; lines.push(format!( "| {} | {} | {} | {} | {} |", concept.leaf_path, concept.predicate, concept.value_type.as_str(), concept.example_value, desc )); } lines.join("\n") } /// Find a concept by leaf path. pub fn find_by_leaf(&self, leaf_path: &str) -> Option<&AuthorityConcept> { self.concepts.iter().find(|c| c.leaf_path == leaf_path) } /// Find a concept by leaf path AND predicate. /// /// This is more precise than `find_by_leaf` when multiple predicates /// are defined for the same subject path (e.g., auth/bypass with /// debug_mode and header_based predicates). pub fn find_by_leaf_and_predicate( &self, leaf_path: &str, predicate: &str, ) -> Option<&AuthorityConcept> { self.concepts.iter().find(|c| c.leaf_path == leaf_path && c.predicate == predicate) } /// Find a concept by leaf path with fuzzy matching. /// /// Returns the best match if similarity is above the threshold. pub fn fuzzy_match(&self, leaf_path: &str, threshold: f32) -> Option<&AuthorityConcept> { let mut best_match: Option<(&AuthorityConcept, f32)> = None; for concept in &self.concepts { let similarity = Self::path_similarity(&concept.leaf_path, leaf_path); if similarity >= threshold { if let Some((_, best_score)) = best_match { if similarity > best_score { best_match = Some((concept, similarity)); } } else { best_match = Some((concept, similarity)); } } } best_match.map(|(c, _)| c) } /// Calculate similarity between two paths. /// /// Uses segment-based matching: /// - Exact match: 1.0 /// - Same final segment: 0.7 /// - Contains same words: 0.5 fn path_similarity(a: &str, b: &str) -> f32 { if a == b { return 1.0; } let a_lower = a.to_lowercase(); let b_lower = b.to_lowercase(); if a_lower == b_lower { return 0.95; } // Check final segment match let a_final = a_lower.rsplit('/').next().unwrap_or(&a_lower); let b_final = b_lower.rsplit('/').next().unwrap_or(&b_lower); if a_final == b_final { return 0.7; } // Check word overlap let a_words: Vec<&str> = a_lower.split(['/', '_']).collect(); let b_words: Vec<&str> = b_lower.split(['/', '_']).collect(); let mut matches = 0; for a_word in &a_words { if b_words.contains(a_word) { matches += 1; } } if matches > 0 { let max_words = a_words.len().max(b_words.len()) as f32; return (matches as f32) / max_words * 0.5; } 0.0 } /// Get all unique leaf paths as a simple list for the prompt. pub fn leaf_paths(&self) -> Vec<&str> { self.concepts.iter().map(|c| c.leaf_path.as_str()).collect() } } #[cfg(test)] mod tests { use super::*; use stemedb_core::types::{HlcTimestamp, LifecycleStage, SourceClass}; fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion { let source_metadata = serde_json::json!({ "description": "Test description", "source": "test", }); Assertion { subject: subject.to_string(), predicate: predicate.to_string(), object: value, parent_hash: None, source_hash: [0u8; 32], source_class: SourceClass::Clinical, visual_hash: None, epoch: None, source_metadata: serde_json::to_vec(&source_metadata).ok(), lifecycle: LifecycleStage::Approved, signatures: vec![], confidence: 1.0, timestamp: 0, hlc_timestamp: HlcTimestamp::default(), vector: None, } } #[test] fn test_extract_leaf_path() { assert_eq!( OntologyVocabulary::extract_leaf_path("rfc://5246/tls/cert_verification"), Some("tls/cert_verification".to_string()) ); assert_eq!( OntologyVocabulary::extract_leaf_path("owasp://rate_limit/enabled"), Some("rate_limit/enabled".to_string()) ); assert_eq!( OntologyVocabulary::extract_leaf_path("owasp://injection/db/query/construction"), Some("query/construction".to_string()) ); } #[test] fn test_from_assertions() { let assertions = vec![ make_test_assertion( "rfc://5246/tls/cert_verification", "enabled", ObjectValue::Boolean(true), ), make_test_assertion( "owasp://rate_limit/enabled", "enabled", ObjectValue::Boolean(true), ), ]; let vocab = OntologyVocabulary::from_assertions(&assertions); assert_eq!(vocab.concepts.len(), 2); assert!(vocab.find_by_leaf("tls/cert_verification").is_some()); assert!(vocab.find_by_leaf("rate_limit/enabled").is_some()); } #[test] fn test_fuzzy_match() { let assertions = vec![make_test_assertion( "owasp://rate_limit/enabled", "enabled", ObjectValue::Boolean(true), )]; let vocab = OntologyVocabulary::from_assertions(&assertions); // Exact match let exact = vocab.fuzzy_match("rate_limit/enabled", 0.5); assert!(exact.is_some()); assert_eq!(exact.map(|c| c.leaf_path.as_str()), Some("rate_limit/enabled")); // Similar match - same final segment should score 0.7 let fuzzy = vocab.fuzzy_match("api/enabled", 0.6); assert!(fuzzy.is_some()); assert_eq!(fuzzy.map(|c| c.leaf_path.as_str()), Some("rate_limit/enabled")); // No match let no_match = vocab.fuzzy_match("completely_different", 0.5); assert!(no_match.is_none()); } #[test] fn test_to_prompt_section() { let assertions = vec![make_test_assertion( "owasp://rate_limit/enabled", "enabled", ObjectValue::Boolean(true), )]; let vocab = OntologyVocabulary::from_assertions(&assertions); let section = vocab.to_prompt_section(); assert!(section.contains("rate_limit/enabled")); assert!(section.contains("enabled")); assert!(section.contains("boolean")); } #[test] fn test_path_similarity() { // Exact match assert_eq!(OntologyVocabulary::path_similarity("a/b", "a/b"), 1.0); // Case insensitive assert!(OntologyVocabulary::path_similarity("A/B", "a/b") > 0.9); // Same final segment assert!( OntologyVocabulary::path_similarity("x/cert_verification", "y/cert_verification") > 0.6 ); // No match assert_eq!(OntologyVocabulary::path_similarity("a/b", "x/y"), 0.0); } }