//! Integration tests for wiki corpus import. use std::path::PathBuf; use std::sync::Arc; use aphoria::community::PatternAggregator; use aphoria::corpus::{import_from_wiki, WikiParser}; use aphoria::{import_corpus_from_wiki, AphoriaConfig, PatternAggregate}; use stemedb_storage::{GenericPredicateIndexStore, HybridStore, PredicateIndexStore}; use tempfile::TempDir; #[tokio::test] async fn test_import_from_wiki_basic() { // Get wiki fixtures path let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let wiki_path = manifest_dir.join("tests/fixtures/wiki"); let timestamp = 1706832000; let patterns = import_from_wiki(&wiki_path, timestamp).await.expect("import_from_wiki"); // Should extract patterns from markdown files assert!(!patterns.is_empty(), "Expected patterns to be extracted from wiki files"); // Check pattern structure for pattern in &patterns { assert!(pattern.subject.starts_with("code://*/"), "Subject should be wildcarded"); assert!(!pattern.predicate.is_empty(), "Predicate should not be empty"); assert_eq!(pattern.project_count, 1, "Bootstrap count should be 1"); assert_eq!(pattern.observation_count, 1, "Observation count should be 1"); assert_eq!(pattern.first_seen, timestamp); assert_eq!(pattern.last_seen, timestamp); } } #[tokio::test] async fn test_wiki_pattern_to_storage() { // Create temporary storage let temp_dir = TempDir::new().expect("tempdir"); let store_path = temp_dir.path().join("store"); std::fs::create_dir_all(&store_path).expect("create store dir"); let hybrid_store = Arc::new(HybridStore::open(&store_path).expect("open hybrid store")); let predicate_index = Arc::new(GenericPredicateIndexStore::new(hybrid_store.clone())); // Import patterns from wiki let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let wiki_path = manifest_dir.join("tests/fixtures/wiki"); let timestamp = 1706832000; let patterns = import_from_wiki(&wiki_path, timestamp).await.expect("import_from_wiki"); assert!(!patterns.is_empty(), "Should have patterns"); // Store patterns using PatternAggregator let aggregator = PatternAggregator::new(hybrid_store.clone(), predicate_index.clone()); let hashes = aggregator.add_patterns(&patterns).await.expect("add_patterns"); assert_eq!(hashes.len(), patterns.len(), "All patterns should be stored"); // Query patterns back from storage let query_result = predicate_index.get_by_predicate("pattern_aggregate").await.expect("get_by_predicate"); assert_eq!(query_result.len(), patterns.len(), "Should retrieve all stored patterns"); } #[tokio::test] async fn test_wiki_parser_extracts_tls_patterns() { let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let wiki_path = manifest_dir.join("tests/fixtures/wiki"); let timestamp = 1706832000; let patterns = import_from_wiki(&wiki_path, timestamp).await.expect("import_from_wiki"); // Find TLS pattern (parser extracts "tls" from "TLS certificate verification") let tls_pattern = patterns.iter().find(|p| p.subject.contains("tls")); assert!(tls_pattern.is_some(), "Should extract TLS pattern"); if let Some(pattern) = tls_pattern { assert_eq!(pattern.predicate, "enabled", "Predicate should be 'enabled'"); // Value should be Boolean(true) since "MUST be enabled" match &pattern.value { aphoria::community::CommunityObjectValue::Boolean(b) => { assert!(*b, "TLS should be enabled"); } _ => panic!("Expected Boolean value"), } } } #[tokio::test] async fn test_wiki_parser_extracts_authentication_patterns() { let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let wiki_path = manifest_dir.join("tests/fixtures/wiki"); let timestamp = 1706832000; let patterns = import_from_wiki(&wiki_path, timestamp).await.expect("import_from_wiki"); // Find JWT pattern (parser extracts "jwt" from "JWT authentication") let jwt_pattern = patterns.iter().find(|p| p.subject.contains("jwt")); assert!(jwt_pattern.is_some(), "Should extract JWT pattern"); // Find password hashing pattern let password_pattern = patterns.iter().find(|p| p.subject.contains("password")); assert!(password_pattern.is_some(), "Should extract password hashing pattern"); } #[tokio::test] async fn test_wiki_import_deduplication() { // Create temporary storage let temp_dir = TempDir::new().expect("tempdir"); let store_path = temp_dir.path().join("store"); std::fs::create_dir_all(&store_path).expect("create store dir"); let hybrid_store = Arc::new(HybridStore::open(&store_path).expect("open hybrid store")); let predicate_index = Arc::new(GenericPredicateIndexStore::new(hybrid_store.clone())); let aggregator = PatternAggregator::new(hybrid_store.clone(), predicate_index.clone()); // Import patterns twice let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let wiki_path = manifest_dir.join("tests/fixtures/wiki"); let timestamp = 1706832000; let patterns1 = import_from_wiki(&wiki_path, timestamp).await.expect("import_from_wiki"); aggregator.add_patterns(&patterns1).await.expect("add_patterns first"); let patterns2 = import_from_wiki(&wiki_path, timestamp).await.expect("import_from_wiki"); aggregator.add_patterns(&patterns2).await.expect("add_patterns second"); // Query patterns - should have entries for both imports // (deduplication happens via content-addressed subject) let query_result = predicate_index.get_by_predicate("pattern_aggregate").await.expect("get_by_predicate"); // Both imports should create distinct assertions since they have different timestamps // or same content-addressed hashes would overwrite assert!( query_result.len() >= patterns1.len(), "Should have at least as many patterns as first import" ); } #[test] fn test_wiki_pattern_content_addressed_subject() { use aphoria::community::CommunityObjectValue; let pattern1 = PatternAggregate { subject: "code://*/tls/cert".to_string(), predicate: "enabled".to_string(), value: CommunityObjectValue::Boolean(true), project_count: 1, observation_count: 1, first_seen: 1000, last_seen: 2000, }; let pattern2 = PatternAggregate { subject: "code://*/tls/cert".to_string(), predicate: "enabled".to_string(), value: CommunityObjectValue::Boolean(true), project_count: 5, observation_count: 10, first_seen: 1000, last_seen: 3000, }; // Same subject/predicate/value should produce same content-addressed hash // even if counts differ let hash1 = { let mut hasher = blake3::Hasher::new(); hasher.update(pattern1.subject.as_bytes()); hasher.update(b":"); hasher.update(pattern1.predicate.as_bytes()); hasher.update(b":"); hasher.update(&[1u8]); // Boolean(true) hex::encode(hasher.finalize().as_bytes()) }; let hash2 = { let mut hasher = blake3::Hasher::new(); hasher.update(pattern2.subject.as_bytes()); hasher.update(b":"); hasher.update(pattern2.predicate.as_bytes()); hasher.update(b":"); hasher.update(&[1u8]); // Boolean(true) hex::encode(hasher.finalize().as_bytes()) }; assert_eq!(hash1, hash2, "Same pattern should have same content hash"); } #[tokio::test] async fn test_wiki_parser_edge_cases() { let parser = WikiParser::new().expect("parser"); // Test: Authority within 5 lines after pattern (boundary condition) // Pattern at line 0, authority at line 5 (within range [0..6)) let content = "TLS MUST be enabled.\n\n\n\n\nAuthority: RFC 5246"; let patterns = parser.parse(content).expect("parse"); assert_eq!(patterns.len(), 1); assert!( patterns[0].authority.is_some(), "Should find authority within 5 lines after pattern" ); // Test: Authority beyond 5 lines after pattern // Pattern at line 0, authority at line 6 (beyond range [0..6)) let content = "TLS MUST be enabled.\n\n\n\n\n\nAuthority: RFC 5246"; let patterns = parser.parse(content).expect("parse"); assert_eq!(patterns.len(), 1); assert!( patterns[0].authority.is_none(), "Should NOT find authority beyond 5 lines after pattern" ); // Test: Empty file let patterns = parser.parse("").expect("parse"); assert_eq!(patterns.len(), 0); // Test: No patterns let content = "This is just regular markdown text."; let patterns = parser.parse(content).expect("parse"); assert_eq!(patterns.len(), 0); // Test: Multi-line pattern (continuation) let content = "TLS certificate verification MUST be enabled\nacross all connections."; let patterns = parser.parse(content).expect("parse"); assert_eq!(patterns.len(), 1); assert!(patterns[0].subject.contains("tls")); } #[tokio::test] async fn test_wiki_import_duplicate_patterns() { use tempfile::TempDir; // Test: Same pattern in multiple files - import_corpus_from_wiki returns extraction count let temp_dir = TempDir::new().expect("tempdir"); let wiki_dir = temp_dir.path().join("wiki"); std::fs::create_dir_all(&wiki_dir).expect("create wiki dir"); // Write two files with identical patterns std::fs::write( wiki_dir.join("file1.md"), "## TLS\nTLS MUST be enabled.\nAuthority: RFC 5246", ) .expect("write file1"); std::fs::write( wiki_dir.join("file2.md"), "## TLS\nTLS MUST be enabled.\nAuthority: RFC 5246", ) .expect("write file2"); let config = AphoriaConfig::default(); let count = import_corpus_from_wiki(&wiki_dir, &config).await.expect("import"); // Returns number of patterns extracted (2), not number stored (1 after deduplication) // Deduplication happens at storage layer via content-addressed subject assert_eq!(count, 2, "Should extract 2 patterns (one from each file)"); }