stemedb/applications/aphoria/src/llm/ontology.rs
jml 65065f3d8f feat(aphoria): implement community corpus with wiki import and pattern aggregation
Implements Phase 4 (A4) - Community corpus as first-class citizens:

- **Community Corpus Builder** - Queries StemeDB pattern aggregates
- **Wiki Import** - Bootstrap corpus from markdown docs (aphoria corpus import wiki)
- **Pattern Aggregation** - Automatic learning from local scans (--sync flag)
- **Storage Layer** - StemeDBPatternStore with content-addressed deduplication
- **Promotion Logic** - Multi-tier thresholds (95%/80%/50% adoption rates)
- **Corpus Build** - Unified registry for RFC/OWASP/Vendor/Community sources
- **Trust Packs** - Export corpus as signed, distributable artifacts
- **Documentation** - bootstrap-corpus.md guide + CLI reference updates

Technical details:
- Pattern aggregates stored as assertions with predicate "pattern_aggregate"
- Content-addressed subjects via BLAKE3(subject:predicate:value)
- PatternAggregator handles write path (observations → patterns)
- StemeDBPatternStore handles read path (pattern queries)
- Integration tests + fixtures in tests/wiki_import_test.rs

Deleted hardcoded.rs (368 lines) - corpus now fully emergent from StemeDB.
Deleted enriched-corpus-patterns.md (677 lines) - feature shipped.

Closes VG-026 (community corpus), part of A4 milestone.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-09 00:12:31 +00:00

365 lines
12 KiB
Rust

//! Ontology vocabulary extraction from authority corpus.
//!
//! Extracts concept vocabulary from corpus assertions to constrain
//! LLM output to paths that match authority subjects.
use serde::Deserialize;
use stemedb_core::types::{Assertion, ObjectValue};
/// A single concept derived from one authority-corpus assertion.
///
/// Each concept pairs the assertion's subject/predicate with a short
/// "leaf path" used for matching and with display data for LLM prompts.
#[derive(Debug, Clone)]
pub struct AuthorityConcept {
/// Full subject path, copied verbatim from the assertion
/// (e.g., "owasp://rate_limit/enabled").
pub subject: String,
/// Last two non-empty path segments of `subject`, used as the matching key
/// (e.g., "rate_limit/enabled").
pub leaf_path: String,
/// Predicate copied from the assertion (e.g., "enabled").
pub predicate: String,
/// Type of the assertion's object value.
pub value_type: ValueType,
/// String form of the assertion's object value, shown to the LLM as an example.
pub example_value: String,
/// Human-readable description for LLM context: the `description` field of the
/// assertion's source metadata when present, otherwise "<subject> <predicate>".
pub description: String,
}
/// Kind of value that a concept's object holds.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ValueType {
    /// A `true` / `false` flag.
    Boolean,
    /// A free-form string.
    Text,
    /// A numeric value.
    Number,
}

impl ValueType {
    /// Lower-case name of the type, as rendered in the prompt table.
    pub fn as_str(&self) -> &'static str {
        // `ValueType` is `Copy`, so matching on the dereferenced value is free.
        match *self {
            Self::Boolean => "boolean",
            Self::Text => "text",
            Self::Number => "number",
        }
    }
}
/// Deserialization target for an assertion's `source_metadata` JSON.
///
/// Only the `description` key is read; it is optional, so metadata without
/// it still deserializes (yielding `None`).
#[derive(Debug, Deserialize)]
struct SourceMetadata {
/// Optional human-readable description of the assertion.
description: Option<String>,
}
/// Vocabulary extracted from authority corpus.
///
/// Holds one [`AuthorityConcept`] per usable corpus assertion and offers
/// exact and fuzzy lookups plus a markdown rendering for LLM prompts.
pub struct OntologyVocabulary {
/// Authority concepts used to constrain LLM output, in the order the
/// source assertions were supplied.
pub concepts: Vec<AuthorityConcept>,
}
impl OntologyVocabulary {
/// Construct a vocabulary from a slice of corpus assertions.
///
/// Assertions that cannot be turned into a concept (their subject has fewer
/// than two non-empty path segments) are skipped; the relative order of the
/// remaining assertions is preserved.
pub fn from_assertions(assertions: &[Assertion]) -> Self {
    let mut concepts = Vec::new();
    for assertion in assertions {
        if let Some(concept) = Self::assertion_to_concept(assertion) {
            concepts.push(concept);
        }
    }
    Self { concepts }
}
/// Turn a single assertion into an [`AuthorityConcept`].
///
/// Returns `None` when no leaf path can be derived from the subject.
/// Reference objects are reported as text. The description comes from the
/// `description` key of the JSON source metadata; if the metadata is absent,
/// unparsable, or lacks that key, it falls back to `"<subject> <predicate>"`.
fn assertion_to_concept(assertion: &Assertion) -> Option<AuthorityConcept> {
    let leaf_path = Self::extract_leaf_path(&assertion.subject)?;

    // Best-effort metadata parse: any failure simply means "no description".
    let parsed_description = match assertion.source_metadata.as_ref() {
        Some(raw) => serde_json::from_slice::<SourceMetadata>(raw)
            .ok()
            .and_then(|parsed| parsed.description),
        None => None,
    };
    let description = match parsed_description {
        Some(text) => text,
        None => format!("{} {}", assertion.subject, assertion.predicate),
    };

    let (value_type, example_value) = match &assertion.object {
        ObjectValue::Boolean(flag) => (ValueType::Boolean, flag.to_string()),
        ObjectValue::Number(num) => (ValueType::Number, num.to_string()),
        // References have no dedicated value type; they are shown as text.
        ObjectValue::Text(text) | ObjectValue::Reference(text) => (ValueType::Text, text.clone()),
    };

    Some(AuthorityConcept {
        subject: assertion.subject.clone(),
        leaf_path,
        predicate: assertion.predicate.clone(),
        value_type,
        example_value,
        description,
    })
}
/// Derive the leaf path (the last two non-empty segments) of a subject.
///
/// Everything up to and including the first `://` is ignored when present.
/// Returns `None` if fewer than two non-empty segments remain.
///
/// For `rfc://5246/tls/cert_verification`, returns `tls/cert_verification`.
/// For `owasp://rate_limit/enabled`, returns `rate_limit/enabled`.
fn extract_leaf_path(subject: &str) -> Option<String> {
    // Drop the scheme, if any; otherwise treat the whole subject as the path.
    let path = match subject.split_once("://") {
        Some((_scheme, rest)) => rest,
        None => subject,
    };

    // Walk the segments from the end so only the final two are ever touched.
    let mut from_end = path.rsplit('/').filter(|segment| !segment.is_empty());
    let last = from_end.next()?;
    let parent = from_end.next()?;

    Some(format!("{}/{}", parent, last))
}
/// Format concepts as a markdown table for prompt injection.
///
/// The result is a header row, a separator row, and one row per concept
/// (leaf path, predicate, value type, example value, description), joined
/// with `\n` and without a trailing newline.
///
/// Descriptions longer than 60 characters are shortened to their first
/// 57 characters followed by `...`. Truncation counts characters rather than
/// bytes, so non-ASCII descriptions never split a UTF-8 sequence (byte
/// slicing would panic in that case).
pub fn to_prompt_section(&self) -> String {
    const MAX_DESCRIPTION_CHARS: usize = 60;
    const ELLIPSIS: &str = "...";
    // Characters kept in front of the ellipsis so the total stays at 60.
    const KEPT_CHARS: usize = MAX_DESCRIPTION_CHARS - ELLIPSIS.len();

    let mut lines = Vec::with_capacity(self.concepts.len() + 2);
    lines.push("| Concept Path | Predicate | Value Type | Example | Description |".to_string());
    lines.push("|--------------|-----------|------------|---------|-------------|".to_string());

    for concept in &self.concepts {
        // Truncate description for table readability.
        let desc = if concept.description.chars().count() > MAX_DESCRIPTION_CHARS {
            let head: String = concept.description.chars().take(KEPT_CHARS).collect();
            format!("{}{}", head, ELLIPSIS)
        } else {
            concept.description.clone()
        };
        lines.push(format!(
            "| {} | {} | {} | {} | {} |",
            concept.leaf_path,
            concept.predicate,
            concept.value_type.as_str(),
            concept.example_value,
            desc
        ));
    }

    lines.join("\n")
}
/// Look up the first concept whose leaf path equals `leaf_path` exactly.
///
/// Returns `None` when no concept has that leaf path.
pub fn find_by_leaf(&self, leaf_path: &str) -> Option<&AuthorityConcept> {
    for candidate in &self.concepts {
        if candidate.leaf_path == leaf_path {
            return Some(candidate);
        }
    }
    None
}
/// Look up the first concept matching both `leaf_path` and `predicate` exactly.
///
/// Prefer this over `find_by_leaf` when one subject path carries several
/// predicates (e.g., auth/bypass with debug_mode and header_based
/// predicates), since the leaf path alone is then ambiguous.
pub fn find_by_leaf_and_predicate(
    &self,
    leaf_path: &str,
    predicate: &str,
) -> Option<&AuthorityConcept> {
    for candidate in &self.concepts {
        let same_leaf = candidate.leaf_path == leaf_path;
        if same_leaf && candidate.predicate == predicate {
            return Some(candidate);
        }
    }
    None
}
/// Find the concept whose leaf path is most similar to `leaf_path`.
///
/// Only concepts whose similarity score is at least `threshold` are
/// considered. Among those, the highest score wins; when several concepts
/// tie, the one appearing first in `concepts` is returned. Returns `None`
/// if nothing reaches the threshold.
pub fn fuzzy_match(&self, leaf_path: &str, threshold: f32) -> Option<&AuthorityConcept> {
    self.concepts
        .iter()
        .map(|candidate| (candidate, Self::path_similarity(&candidate.leaf_path, leaf_path)))
        .filter(|(_, score)| *score >= threshold)
        // Strict `>` keeps the earlier candidate on equal scores.
        .reduce(|best, next| if next.1 > best.1 { next } else { best })
        .map(|(concept, _)| concept)
}
/// Calculate similarity between two paths, as a score in `0.0..=1.0`.
///
/// Tiers, checked in order:
/// - Exact match: 1.0
/// - Match ignoring case: 0.95
/// - Same (non-empty) final `/` segment, ignoring case: 0.7
/// - Shared words: `shared / max(words_a, words_b) * 0.5`, i.e. at most 0.5
/// - Otherwise: 0.0
///
/// Words are the distinct, non-empty, lower-cased tokens obtained by
/// splitting on `/` and `_`. Ignoring empty tokens keeps stray or trailing
/// separators from counting as a shared word, and counting distinct words
/// makes the score symmetric in its arguments.
fn path_similarity(a: &str, b: &str) -> f32 {
    // Distinct, non-empty words of an already lower-cased path.
    fn distinct_words(path: &str) -> Vec<&str> {
        let mut words: Vec<&str> =
            path.split(['/', '_']).filter(|word| !word.is_empty()).collect();
        words.sort_unstable();
        words.dedup();
        words
    }

    if a == b {
        return 1.0;
    }

    let a_lower = a.to_lowercase();
    let b_lower = b.to_lowercase();
    if a_lower == b_lower {
        return 0.95;
    }

    // Check final segment match. An empty final segment (trailing '/')
    // carries no information and must not count as a match.
    let a_final = a_lower.rsplit('/').next().unwrap_or(&a_lower);
    let b_final = b_lower.rsplit('/').next().unwrap_or(&b_lower);
    if !a_final.is_empty() && a_final == b_final {
        return 0.7;
    }

    // Check word overlap.
    let a_words = distinct_words(&a_lower);
    let b_words = distinct_words(&b_lower);
    let shared = a_words.iter().filter(|word| b_words.contains(*word)).count();
    if shared == 0 {
        return 0.0;
    }

    let max_words = a_words.len().max(b_words.len()) as f32;
    (shared as f32) / max_words * 0.5
}
/// Get all unique leaf paths as a simple list for the prompt.
///
/// Several concepts can share a leaf path (one per predicate), so repeated
/// paths are dropped. Each path appears once, in the order of its first
/// occurrence in `concepts`.
pub fn leaf_paths(&self) -> Vec<&str> {
    let mut seen = std::collections::HashSet::with_capacity(self.concepts.len());
    self.concepts
        .iter()
        .map(|concept| concept.leaf_path.as_str())
        // `insert` returns false for a path that was already emitted.
        .filter(|path| seen.insert(*path))
        .collect()
}
}
// Unit tests for vocabulary construction, leaf-path extraction,
// prompt rendering, and fuzzy path matching.
#[cfg(test)]
mod tests {
use super::*;
use stemedb_core::types::{HlcTimestamp, LifecycleStage, SourceClass};
/// Build an assertion with the given subject, predicate, and object value.
///
/// The source metadata is JSON with a `description` of "Test description";
/// every other field is a fixed placeholder.
fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion {
let source_metadata = serde_json::json!({
"description": "Test description",
"source": "test",
});
Assertion {
subject: subject.to_string(),
predicate: predicate.to_string(),
object: value,
parent_hash: None,
source_hash: [0u8; 32],
source_class: SourceClass::Clinical,
visual_hash: None,
epoch: None,
source_metadata: serde_json::to_vec(&source_metadata).ok(),
lifecycle: LifecycleStage::Approved,
signatures: vec![],
confidence: 1.0,
timestamp: 0,
hlc_timestamp: HlcTimestamp::default(),
vector: None,
}
}
/// The leaf path is always the last two segments, regardless of scheme
/// or how deep the subject path is.
#[test]
fn test_extract_leaf_path() {
assert_eq!(
OntologyVocabulary::extract_leaf_path("rfc://5246/tls/cert_verification"),
Some("tls/cert_verification".to_string())
);
assert_eq!(
OntologyVocabulary::extract_leaf_path("owasp://rate_limit/enabled"),
Some("rate_limit/enabled".to_string())
);
// Deeper paths keep only the final two segments.
assert_eq!(
OntologyVocabulary::extract_leaf_path("owasp://injection/db/query/construction"),
Some("query/construction".to_string())
);
}
/// Every valid assertion becomes a concept that can be found by its leaf path.
#[test]
fn test_from_assertions() {
let assertions = vec![
make_test_assertion(
"rfc://5246/tls/cert_verification",
"enabled",
ObjectValue::Boolean(true),
),
make_test_assertion(
"owasp://rate_limit/enabled",
"enabled",
ObjectValue::Boolean(true),
),
];
let vocab = OntologyVocabulary::from_assertions(&assertions);
assert_eq!(vocab.concepts.len(), 2);
assert!(vocab.find_by_leaf("tls/cert_verification").is_some());
assert!(vocab.find_by_leaf("rate_limit/enabled").is_some());
}
/// Fuzzy matching returns the concept for exact and same-final-segment
/// queries and nothing for an unrelated query.
#[test]
fn test_fuzzy_match() {
let assertions = vec![make_test_assertion(
"owasp://rate_limit/enabled",
"enabled",
ObjectValue::Boolean(true),
)];
let vocab = OntologyVocabulary::from_assertions(&assertions);
// Exact match
let exact = vocab.fuzzy_match("rate_limit/enabled", 0.5);
assert!(exact.is_some());
assert_eq!(exact.map(|c| c.leaf_path.as_str()), Some("rate_limit/enabled"));
// Similar match - same final segment should score 0.7
let fuzzy = vocab.fuzzy_match("api/enabled", 0.6);
assert!(fuzzy.is_some());
assert_eq!(fuzzy.map(|c| c.leaf_path.as_str()), Some("rate_limit/enabled"));
// No match
let no_match = vocab.fuzzy_match("completely_different", 0.5);
assert!(no_match.is_none());
}
/// The rendered prompt table mentions the leaf path, predicate, and value type.
#[test]
fn test_to_prompt_section() {
let assertions = vec![make_test_assertion(
"owasp://rate_limit/enabled",
"enabled",
ObjectValue::Boolean(true),
)];
let vocab = OntologyVocabulary::from_assertions(&assertions);
let section = vocab.to_prompt_section();
assert!(section.contains("rate_limit/enabled"));
assert!(section.contains("enabled"));
assert!(section.contains("boolean"));
}
/// Spot-check the scoring tiers of `path_similarity`.
#[test]
fn test_path_similarity() {
// Exact match
assert_eq!(OntologyVocabulary::path_similarity("a/b", "a/b"), 1.0);
// Case insensitive
assert!(OntologyVocabulary::path_similarity("A/B", "a/b") > 0.9);
// Same final segment
assert!(
OntologyVocabulary::path_similarity("x/cert_verification", "y/cert_verification") > 0.6
);
// No match
assert_eq!(OntologyVocabulary::path_similarity("a/b", "x/y"), 0.0);
}
}