stemedb/applications/aphoria/tests/wiki_import_test.rs
jml 65065f3d8f feat(aphoria): implement community corpus with wiki import and pattern aggregation
Implements Phase 4 (A4) - Community corpus as first-class citizens:

- **Community Corpus Builder** - Queries StemeDB pattern aggregates
- **Wiki Import** - Bootstrap corpus from markdown docs (aphoria corpus import wiki)
- **Pattern Aggregation** - Automatic learning from local scans (--sync flag)
- **Storage Layer** - StemeDBPatternStore with content-addressed deduplication
- **Promotion Logic** - Multi-tier thresholds (95%/80%/50% adoption rates)
- **Corpus Build** - Unified registry for RFC/OWASP/Vendor/Community sources
- **Trust Packs** - Export corpus as signed, distributable artifacts
- **Documentation** - bootstrap-corpus.md guide + CLI reference updates

Technical details:
- Pattern aggregates stored as assertions with predicate "pattern_aggregate"
- Content-addressed subjects via BLAKE3(subject:predicate:value)
- PatternAggregator handles write path (observations → patterns)
- StemeDBPatternStore handles read path (pattern queries)
- Integration tests + fixtures in tests/wiki_import_test.rs

Deleted hardcoded.rs (368 lines) - corpus now fully emergent from StemeDB.
Deleted enriched-corpus-patterns.md (677 lines) - feature shipped.

Closes VG-026 (community corpus), part of A4 milestone.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-09 00:12:31 +00:00

261 lines
10 KiB
Rust

//! Integration tests for wiki corpus import.
use std::path::PathBuf;
use std::sync::Arc;
use aphoria::community::PatternAggregator;
use aphoria::corpus::{import_from_wiki, WikiParser};
use aphoria::{import_corpus_from_wiki, AphoriaConfig, PatternAggregate};
use stemedb_storage::{GenericPredicateIndexStore, HybridStore, PredicateIndexStore};
use tempfile::TempDir;
/// Imports the fixture wiki and checks the shape of every extracted pattern.
///
/// Each pattern must have a subject starting with `code://*/`, a non-empty
/// predicate, project and observation counts of exactly 1, and
/// `first_seen`/`last_seen` equal to the timestamp handed to `import_from_wiki`.
#[tokio::test]
async fn test_import_from_wiki_basic() {
    let imported_at = 1706832000;
    // Fixtures live under the crate's own `tests/fixtures/wiki` directory.
    let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/wiki");

    let extracted = import_from_wiki(&fixtures, imported_at).await.expect("import_from_wiki");

    // The fixture directory must yield at least one pattern.
    assert!(!extracted.is_empty(), "Expected patterns to be extracted from wiki files");

    // Validate the bootstrap shape of each pattern in turn.
    extracted.iter().for_each(|entry| {
        assert!(entry.subject.starts_with("code://*/"), "Subject should be wildcarded");
        assert!(!entry.predicate.is_empty(), "Predicate should not be empty");
        assert_eq!(entry.project_count, 1, "Bootstrap count should be 1");
        assert_eq!(entry.observation_count, 1, "Observation count should be 1");
        assert_eq!(entry.first_seen, imported_at);
        assert_eq!(entry.last_seen, imported_at);
    });
}
/// Stores the patterns imported from the fixture wiki and reads them back.
///
/// Checks that `add_patterns` returns one hash per pattern and that querying
/// the predicate index for `"pattern_aggregate"` returns the same number of
/// entries as were imported.
#[tokio::test]
async fn test_wiki_pattern_to_storage() {
    // Storage lives in a temporary directory owned by this test.
    let scratch = TempDir::new().expect("tempdir");
    let store_dir = scratch.path().join("store");
    std::fs::create_dir_all(&store_dir).expect("create store dir");
    let store = Arc::new(HybridStore::open(&store_dir).expect("open hybrid store"));
    let index = Arc::new(GenericPredicateIndexStore::new(store.clone()));

    // Pull patterns out of the fixture wiki.
    let imported_at = 1706832000;
    let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/wiki");
    let extracted = import_from_wiki(&fixtures, imported_at).await.expect("import_from_wiki");
    assert!(!extracted.is_empty(), "Should have patterns");

    // Write path: hand every pattern to the aggregator.
    let writer = PatternAggregator::new(store.clone(), index.clone());
    let stored_hashes = writer.add_patterns(&extracted).await.expect("add_patterns");
    assert_eq!(stored_hashes.len(), extracted.len(), "All patterns should be stored");

    // Read path: everything written must be visible through the predicate index.
    let indexed = index.get_by_predicate("pattern_aggregate").await.expect("get_by_predicate");
    assert_eq!(indexed.len(), extracted.len(), "Should retrieve all stored patterns");
}
/// Verifies that importing the fixture wiki yields a pattern whose subject
/// contains `"tls"`, whose predicate is `"enabled"`, and whose value is
/// `Boolean(true)`.
#[tokio::test]
async fn test_wiki_parser_extracts_tls_patterns() {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let wiki_path = manifest_dir.join("tests/fixtures/wiki");
    let timestamp = 1706832000;
    let patterns = import_from_wiki(&wiki_path, timestamp).await.expect("import_from_wiki");

    // Bind the pattern directly instead of asserting `is_some()` and then
    // re-matching with `if let`: a missing pattern fails the test right here,
    // and the checks below run unconditionally.
    // NOTE(review): the fixtures are expected to contain a sentence such as
    // "TLS certificate verification MUST be enabled" — confirm against
    // tests/fixtures/wiki.
    let pattern = patterns
        .iter()
        .find(|p| p.subject.contains("tls"))
        .expect("Should extract TLS pattern");

    assert_eq!(pattern.predicate, "enabled", "Predicate should be 'enabled'");

    // The value must be the Boolean variant and must be `true`.
    match &pattern.value {
        aphoria::community::CommunityObjectValue::Boolean(b) => {
            assert!(*b, "TLS should be enabled");
        }
        _ => panic!("Expected Boolean value"),
    }
}
/// Verifies that the fixture wiki yields at least one pattern whose subject
/// contains `"jwt"` and at least one whose subject contains `"password"`.
#[tokio::test]
async fn test_wiki_parser_extracts_authentication_patterns() {
    let imported_at = 1706832000;
    let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/wiki");
    let extracted = import_from_wiki(&fixtures, imported_at).await.expect("import_from_wiki");

    // JWT authentication pattern.
    assert!(
        extracted.iter().any(|entry| entry.subject.contains("jwt")),
        "Should extract JWT pattern"
    );

    // Password hashing pattern.
    assert!(
        extracted.iter().any(|entry| entry.subject.contains("password")),
        "Should extract password hashing pattern"
    );
}
/// Imports the fixture wiki twice into the same store and checks that the
/// predicate index still reports at least as many `"pattern_aggregate"`
/// entries as the first import produced.
#[tokio::test]
async fn test_wiki_import_deduplication() {
    // Storage lives in a temporary directory owned by this test.
    let scratch = TempDir::new().expect("tempdir");
    let store_dir = scratch.path().join("store");
    std::fs::create_dir_all(&store_dir).expect("create store dir");
    let store = Arc::new(HybridStore::open(&store_dir).expect("open hybrid store"));
    let index = Arc::new(GenericPredicateIndexStore::new(store.clone()));
    let writer = PatternAggregator::new(store.clone(), index.clone());

    let imported_at = 1706832000;
    let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/wiki");

    // Run the identical import twice; both runs use the same timestamp.
    let first_pass = import_from_wiki(&fixtures, imported_at).await.expect("import_from_wiki");
    writer.add_patterns(&first_pass).await.expect("add_patterns first");
    let second_pass = import_from_wiki(&fixtures, imported_at).await.expect("import_from_wiki");
    writer.add_patterns(&second_pass).await.expect("add_patterns second");

    let indexed = index.get_by_predicate("pattern_aggregate").await.expect("get_by_predicate");

    // This is a lower bound only: it passes whether the repeated import is
    // collapsed onto existing entries or stored as additional ones.
    // NOTE(review): to actually verify deduplication, pin the exact expected
    // count once the storage semantics are confirmed.
    assert!(
        indexed.len() >= first_pass.len(),
        "Should have at least as many patterns as first import"
    );
}
/// Checks that two aggregates sharing subject, predicate and value produce the
/// same BLAKE3 content hash even though their counts and `last_seen` differ.
///
/// NOTE(review): the hash is computed inline in this test rather than through
/// the storage layer, so this exercises the hashing scheme, not the
/// production implementation.
#[test]
fn test_wiki_pattern_content_addressed_subject() {
    use aphoria::community::CommunityObjectValue;

    /// Hex-encoded BLAKE3 digest of `subject ":" predicate ":"` followed by the
    /// single byte `1`, which stands in for `Boolean(true)`.
    fn identity_hash(aggregate: &PatternAggregate) -> String {
        let parts: [&[u8]; 5] = [
            aggregate.subject.as_bytes(),
            b":",
            aggregate.predicate.as_bytes(),
            b":",
            &[1u8], // Boolean(true)
        ];
        let mut hasher = blake3::Hasher::new();
        for part in parts {
            hasher.update(part);
        }
        hex::encode(hasher.finalize().as_bytes())
    }

    // A freshly bootstrapped aggregate.
    let bootstrap = PatternAggregate {
        subject: "code://*/tls/cert".to_string(),
        predicate: "enabled".to_string(),
        value: CommunityObjectValue::Boolean(true),
        project_count: 1,
        observation_count: 1,
        first_seen: 1000,
        last_seen: 2000,
    };
    // The same identity with different counts and a later `last_seen`.
    let accumulated = PatternAggregate {
        subject: "code://*/tls/cert".to_string(),
        predicate: "enabled".to_string(),
        value: CommunityObjectValue::Boolean(true),
        project_count: 5,
        observation_count: 10,
        first_seen: 1000,
        last_seen: 3000,
    };

    // Counts and timestamps are not part of the hashed bytes, so the digests match.
    assert_eq!(
        identity_hash(&bootstrap),
        identity_hash(&accumulated),
        "Same pattern should have same content hash"
    );
}
/// Exercises `WikiParser::parse` on boundary inputs: authority lookahead at
/// and just past five lines, empty input, text with no pattern, and a pattern
/// whose sentence continues on the next line.
#[tokio::test]
async fn test_wiki_parser_edge_cases() {
    let parser = WikiParser::new().expect("parser");

    // Authority on line 5 (pattern on line 0): still inside the lookahead window.
    let authority_in_range = "TLS MUST be enabled.\n\n\n\n\nAuthority: RFC 5246";
    let in_range = parser.parse(authority_in_range).expect("parse");
    assert_eq!(in_range.len(), 1);
    assert!(
        in_range[0].authority.is_some(),
        "Should find authority within 5 lines after pattern"
    );

    // Authority on line 6: one line past the lookahead window.
    let authority_out_of_range = "TLS MUST be enabled.\n\n\n\n\n\nAuthority: RFC 5246";
    let out_of_range = parser.parse(authority_out_of_range).expect("parse");
    assert_eq!(out_of_range.len(), 1);
    assert!(
        out_of_range[0].authority.is_none(),
        "Should NOT find authority beyond 5 lines after pattern"
    );

    // Empty input yields no patterns.
    let from_empty = parser.parse("").expect("parse");
    assert_eq!(from_empty.len(), 0);

    // Plain prose without a recognisable pattern yields no patterns.
    let plain_text = "This is just regular markdown text.";
    let from_plain = parser.parse(plain_text).expect("parse");
    assert_eq!(from_plain.len(), 0);

    // A sentence that wraps onto a second line is still a single pattern.
    let wrapped = "TLS certificate verification MUST be enabled\nacross all connections.";
    let from_wrapped = parser.parse(wrapped).expect("parse");
    assert_eq!(from_wrapped.len(), 1);
    assert!(from_wrapped[0].subject.contains("tls"));
}
/// Checks that `import_corpus_from_wiki` reports the number of patterns it
/// extracted when the same pattern appears in two separate wiki files.
#[tokio::test]
async fn test_wiki_import_duplicate_patterns() {
    use tempfile::TempDir;

    // Build a throwaway wiki directory for this test.
    let scratch = TempDir::new().expect("tempdir");
    let wiki_dir = scratch.path().join("wiki");
    std::fs::create_dir_all(&wiki_dir).expect("create wiki dir");

    // Two files with byte-identical content, so both carry the same pattern.
    let page = "## TLS\nTLS MUST be enabled.\nAuthority: RFC 5246";
    for (file_name, failure_label) in [("file1.md", "write file1"), ("file2.md", "write file2")] {
        std::fs::write(wiki_dir.join(file_name), page).expect(failure_label);
    }

    let config = AphoriaConfig::default();
    let extracted_count = import_corpus_from_wiki(&wiki_dir, &config).await.expect("import");

    // The return value counts extractions (one per file), not stored entries.
    // NOTE(review): the original note says storage-layer deduplication via the
    // content-addressed subject would collapse these to one stored entry; that
    // is not checked here.
    assert_eq!(extracted_count, 2, "Should extract 2 patterns (one from each file)");
}