Implements Phase 4 (A4) - Community corpus as first-class citizens: - **Community Corpus Builder** - Queries StemeDB pattern aggregates - **Wiki Import** - Bootstrap corpus from markdown docs (aphoria corpus import wiki) - **Pattern Aggregation** - Automatic learning from local scans (--sync flag) - **Storage Layer** - StemeDBPatternStore with content-addressed deduplication - **Promotion Logic** - Multi-tier thresholds (95%/80%/50% adoption rates) - **Corpus Build** - Unified registry for RFC/OWASP/Vendor/Community sources - **Trust Packs** - Export corpus as signed, distributable artifacts - **Documentation** - bootstrap-corpus.md guide + CLI reference updates Technical details: - Pattern aggregates stored as assertions with predicate "pattern_aggregate" - Content-addressed subjects via BLAKE3(subject:predicate:value) - PatternAggregator handles write path (observations → patterns) - StemeDBPatternStore handles read path (pattern queries) - Integration tests + fixtures in tests/wiki_import_test.rs Deleted hardcoded.rs (368 lines) - corpus now fully emergent from StemeDB. Deleted enriched-corpus-patterns.md (677 lines) - feature shipped. Closes VG-026 (community corpus), part of A4 milestone. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
365 lines
12 KiB
Rust
365 lines
12 KiB
Rust
//! Ontology vocabulary extraction from authority corpus.
//!
//! Extracts concept vocabulary from corpus assertions to constrain
//! LLM output to paths that match authority subjects.
|
|
|
|
use serde::Deserialize;
|
|
use stemedb_core::types::{Assertion, ObjectValue};
|
|
|
|
/// A concept from the authority corpus.
|
|
#[derive(Debug, Clone)]
|
|
pub struct AuthorityConcept {
|
|
/// Full subject path (e.g., "owasp://rate_limit/enabled")
|
|
pub subject: String,
|
|
/// Leaf key for matching (e.g., "rate_limit/enabled")
|
|
pub leaf_path: String,
|
|
/// Valid predicate (e.g., "enabled")
|
|
pub predicate: String,
|
|
/// Expected value type
|
|
pub value_type: ValueType,
|
|
/// Example value for LLM context
|
|
pub example_value: String,
|
|
/// Description for LLM context
|
|
pub description: String,
|
|
}
|
|
|
|
/// Kind of value a concept's predicate expects.
///
/// The type is a plain tag with no payload, so it is cheap to copy, compare
/// and use as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ValueType {
    /// Boolean value (true/false).
    Boolean,
    /// Text string value.
    Text,
    /// Numeric value.
    Number,
}
|
|
|
|
impl ValueType {
|
|
/// Convert to string for prompt.
|
|
pub fn as_str(&self) -> &'static str {
|
|
match self {
|
|
ValueType::Boolean => "boolean",
|
|
ValueType::Text => "text",
|
|
ValueType::Number => "number",
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Helper to extract description from source_metadata JSON.
|
|
#[derive(Debug, Deserialize)]
|
|
struct SourceMetadata {
|
|
description: Option<String>,
|
|
}
|
|
|
|
/// Vocabulary extracted from authority corpus.
|
|
pub struct OntologyVocabulary {
|
|
/// List of authority concepts for constraining LLM output.
|
|
pub concepts: Vec<AuthorityConcept>,
|
|
}
|
|
|
|
impl OntologyVocabulary {
|
|
/// Build vocabulary from corpus assertions.
|
|
pub fn from_assertions(assertions: &[Assertion]) -> Self {
|
|
let concepts = assertions.iter().filter_map(Self::assertion_to_concept).collect();
|
|
|
|
Self { concepts }
|
|
}
|
|
|
|
/// Convert an assertion to an AuthorityConcept.
|
|
fn assertion_to_concept(assertion: &Assertion) -> Option<AuthorityConcept> {
|
|
let leaf_path = Self::extract_leaf_path(&assertion.subject)?;
|
|
|
|
let (value_type, example_value) = match &assertion.object {
|
|
ObjectValue::Boolean(b) => (ValueType::Boolean, b.to_string()),
|
|
ObjectValue::Text(t) => (ValueType::Text, t.clone()),
|
|
ObjectValue::Number(n) => (ValueType::Number, n.to_string()),
|
|
ObjectValue::Reference(r) => (ValueType::Text, r.clone()),
|
|
};
|
|
|
|
// Extract description from source_metadata if available
|
|
let description = assertion
|
|
.source_metadata
|
|
.as_ref()
|
|
.and_then(|meta| serde_json::from_slice::<SourceMetadata>(meta).ok())
|
|
.and_then(|m| m.description)
|
|
.unwrap_or_else(|| format!("{} {}", assertion.subject, assertion.predicate));
|
|
|
|
Some(AuthorityConcept {
|
|
subject: assertion.subject.clone(),
|
|
leaf_path,
|
|
predicate: assertion.predicate.clone(),
|
|
value_type,
|
|
example_value,
|
|
description,
|
|
})
|
|
}
|
|
|
|
/// Extract the leaf path from a subject.
|
|
///
|
|
/// For `rfc://5246/tls/cert_verification`, returns `tls/cert_verification`.
|
|
/// For `owasp://rate_limit/enabled`, returns `rate_limit/enabled`.
|
|
fn extract_leaf_path(subject: &str) -> Option<String> {
|
|
// Split on "://" to separate scheme from path
|
|
let path = subject.find("://").map(|i| &subject[i + 3..]).unwrap_or(subject);
|
|
|
|
// Get last two non-empty segments
|
|
let mut segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
|
|
|
|
if segments.len() < 2 {
|
|
return None;
|
|
}
|
|
|
|
// Take last 2 segments
|
|
let len = segments.len();
|
|
segments.drain(..len - 2);
|
|
Some(segments.join("/"))
|
|
}
|
|
|
|
/// Format concepts as a markdown table for prompt injection.
|
|
pub fn to_prompt_section(&self) -> String {
|
|
let mut lines = Vec::with_capacity(self.concepts.len() + 3);
|
|
|
|
lines.push("| Concept Path | Predicate | Value Type | Example | Description |".to_string());
|
|
lines.push("|--------------|-----------|------------|---------|-------------|".to_string());
|
|
|
|
for concept in &self.concepts {
|
|
// Truncate description for table readability
|
|
let desc = if concept.description.len() > 60 {
|
|
format!("{}...", &concept.description[..57])
|
|
} else {
|
|
concept.description.clone()
|
|
};
|
|
|
|
lines.push(format!(
|
|
"| {} | {} | {} | {} | {} |",
|
|
concept.leaf_path,
|
|
concept.predicate,
|
|
concept.value_type.as_str(),
|
|
concept.example_value,
|
|
desc
|
|
));
|
|
}
|
|
|
|
lines.join("\n")
|
|
}
|
|
|
|
/// Find a concept by leaf path.
|
|
pub fn find_by_leaf(&self, leaf_path: &str) -> Option<&AuthorityConcept> {
|
|
self.concepts.iter().find(|c| c.leaf_path == leaf_path)
|
|
}
|
|
|
|
/// Find a concept by leaf path AND predicate.
|
|
///
|
|
/// This is more precise than `find_by_leaf` when multiple predicates
|
|
/// are defined for the same subject path (e.g., auth/bypass with
|
|
/// debug_mode and header_based predicates).
|
|
pub fn find_by_leaf_and_predicate(
|
|
&self,
|
|
leaf_path: &str,
|
|
predicate: &str,
|
|
) -> Option<&AuthorityConcept> {
|
|
self.concepts.iter().find(|c| c.leaf_path == leaf_path && c.predicate == predicate)
|
|
}
|
|
|
|
/// Find a concept by leaf path with fuzzy matching.
|
|
///
|
|
/// Returns the best match if similarity is above the threshold.
|
|
pub fn fuzzy_match(&self, leaf_path: &str, threshold: f32) -> Option<&AuthorityConcept> {
|
|
let mut best_match: Option<(&AuthorityConcept, f32)> = None;
|
|
|
|
for concept in &self.concepts {
|
|
let similarity = Self::path_similarity(&concept.leaf_path, leaf_path);
|
|
if similarity >= threshold {
|
|
if let Some((_, best_score)) = best_match {
|
|
if similarity > best_score {
|
|
best_match = Some((concept, similarity));
|
|
}
|
|
} else {
|
|
best_match = Some((concept, similarity));
|
|
}
|
|
}
|
|
}
|
|
|
|
best_match.map(|(c, _)| c)
|
|
}
|
|
|
|
/// Calculate similarity between two paths.
|
|
///
|
|
/// Uses segment-based matching:
|
|
/// - Exact match: 1.0
|
|
/// - Same final segment: 0.7
|
|
/// - Contains same words: 0.5
|
|
fn path_similarity(a: &str, b: &str) -> f32 {
|
|
if a == b {
|
|
return 1.0;
|
|
}
|
|
|
|
let a_lower = a.to_lowercase();
|
|
let b_lower = b.to_lowercase();
|
|
|
|
if a_lower == b_lower {
|
|
return 0.95;
|
|
}
|
|
|
|
// Check final segment match
|
|
let a_final = a_lower.rsplit('/').next().unwrap_or(&a_lower);
|
|
let b_final = b_lower.rsplit('/').next().unwrap_or(&b_lower);
|
|
|
|
if a_final == b_final {
|
|
return 0.7;
|
|
}
|
|
|
|
// Check word overlap
|
|
let a_words: Vec<&str> = a_lower.split(['/', '_']).collect();
|
|
let b_words: Vec<&str> = b_lower.split(['/', '_']).collect();
|
|
|
|
let mut matches = 0;
|
|
for a_word in &a_words {
|
|
if b_words.contains(a_word) {
|
|
matches += 1;
|
|
}
|
|
}
|
|
|
|
if matches > 0 {
|
|
let max_words = a_words.len().max(b_words.len()) as f32;
|
|
return (matches as f32) / max_words * 0.5;
|
|
}
|
|
|
|
0.0
|
|
}
|
|
|
|
/// Get all unique leaf paths as a simple list for the prompt.
|
|
pub fn leaf_paths(&self) -> Vec<&str> {
|
|
self.concepts.iter().map(|c| c.leaf_path.as_str()).collect()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use stemedb_core::types::{HlcTimestamp, LifecycleStage, SourceClass};

    /// Build an approved assertion whose source metadata carries the
    /// description "Test description" (plus an extra `source` key).
    fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion {
        let source_metadata = serde_json::json!({
            "description": "Test description",
            "source": "test",
        });

        Assertion {
            subject: subject.to_string(),
            predicate: predicate.to_string(),
            object: value,
            parent_hash: None,
            source_hash: [0u8; 32],
            source_class: SourceClass::Clinical,
            visual_hash: None,
            epoch: None,
            source_metadata: serde_json::to_vec(&source_metadata).ok(),
            lifecycle: LifecycleStage::Approved,
            signatures: vec![],
            confidence: 1.0,
            timestamp: 0,
            hlc_timestamp: HlcTimestamp::default(),
            vector: None,
        }
    }

    #[test]
    fn test_extract_leaf_path() {
        assert_eq!(
            OntologyVocabulary::extract_leaf_path("rfc://5246/tls/cert_verification"),
            Some("tls/cert_verification".to_string())
        );

        assert_eq!(
            OntologyVocabulary::extract_leaf_path("owasp://rate_limit/enabled"),
            Some("rate_limit/enabled".to_string())
        );

        // Deeper paths still collapse to their last two segments.
        assert_eq!(
            OntologyVocabulary::extract_leaf_path("owasp://injection/db/query/construction"),
            Some("query/construction".to_string())
        );
    }

    #[test]
    fn test_from_assertions() {
        let assertions = vec![
            make_test_assertion(
                "rfc://5246/tls/cert_verification",
                "enabled",
                ObjectValue::Boolean(true),
            ),
            make_test_assertion(
                "owasp://rate_limit/enabled",
                "enabled",
                ObjectValue::Boolean(true),
            ),
        ];

        let vocab = OntologyVocabulary::from_assertions(&assertions);

        assert_eq!(vocab.concepts.len(), 2);
        assert!(vocab.find_by_leaf("tls/cert_verification").is_some());
        assert!(vocab.find_by_leaf("rate_limit/enabled").is_some());

        // The description comes from source_metadata, not the fallback.
        assert_eq!(
            vocab.find_by_leaf("rate_limit/enabled").map(|c| c.description.as_str()),
            Some("Test description")
        );
    }

    #[test]
    fn test_fuzzy_match() {
        let assertions = vec![make_test_assertion(
            "owasp://rate_limit/enabled",
            "enabled",
            ObjectValue::Boolean(true),
        )];

        let vocab = OntologyVocabulary::from_assertions(&assertions);

        // Exact match
        let exact = vocab.fuzzy_match("rate_limit/enabled", 0.5);
        assert!(exact.is_some());
        assert_eq!(exact.map(|c| c.leaf_path.as_str()), Some("rate_limit/enabled"));

        // Similar match - same final segment should score 0.7
        let fuzzy = vocab.fuzzy_match("api/enabled", 0.6);
        assert!(fuzzy.is_some());
        assert_eq!(fuzzy.map(|c| c.leaf_path.as_str()), Some("rate_limit/enabled"));

        // No match
        let no_match = vocab.fuzzy_match("completely_different", 0.5);
        assert!(no_match.is_none());
    }

    #[test]
    fn test_to_prompt_section() {
        let assertions = vec![make_test_assertion(
            "owasp://rate_limit/enabled",
            "enabled",
            ObjectValue::Boolean(true),
        )];

        let vocab = OntologyVocabulary::from_assertions(&assertions);
        let section = vocab.to_prompt_section();

        assert!(section.contains("rate_limit/enabled"));
        assert!(section.contains("enabled"));
        assert!(section.contains("boolean"));
        // Short descriptions are emitted untruncated.
        assert!(section.contains("Test description"));
    }

    #[test]
    fn test_path_similarity() {
        // Exact match
        assert_eq!(OntologyVocabulary::path_similarity("a/b", "a/b"), 1.0);

        // Case insensitive
        assert!(OntologyVocabulary::path_similarity("A/B", "a/b") > 0.9);

        // Same final segment
        assert!(
            OntologyVocabulary::path_similarity("x/cert_verification", "y/cert_verification") > 0.6
        );

        // No match
        assert_eq!(OntologyVocabulary::path_similarity("a/b", "x/y"), 0.0);
    }
}
|