diff --git a/applications/aphoria/src/config/defaults.rs b/applications/aphoria/src/config/defaults.rs index a3131f1..2ecc59e 100644 --- a/applications/aphoria/src/config/defaults.rs +++ b/applications/aphoria/src/config/defaults.rs @@ -32,6 +32,7 @@ impl Default for ExtractorConfig { "timeout_config".to_string(), "dep_versions".to_string(), "cors_config".to_string(), + "durability_config".to_string(), "rate_limit".to_string(), // Phase 2 extractors "weak_crypto".to_string(), @@ -44,6 +45,12 @@ impl Default for ExtractorConfig { // Phase 8: Enterprise extractors (first batch) "high_entropy_secrets".to_string(), "auth_bypass".to_string(), + "api_key_security".to_string(), + "import_graph".to_string(), + "derive_pattern".to_string(), + "const_declarations".to_string(), + "unsafe_atomic".to_string(), + "circuit_breaker_config".to_string(), "insecure_cookies".to_string(), // Phase 8: Enterprise extractors (second batch) "path_traversal".to_string(), @@ -85,7 +92,10 @@ impl Default for TimeoutExtractorConfig { impl Default for DepVersionConfig { fn default() -> Self { - Self { advisory_db: dirs_default_advisory_db() } + Self { + enabled: false, // OPT-IN: Disabled by default to reduce noise + advisory_db: dirs_default_advisory_db(), + } } } diff --git a/applications/aphoria/src/config/types/extractors.rs b/applications/aphoria/src/config/types/extractors.rs index 6b99161..4aec367 100644 --- a/applications/aphoria/src/config/types/extractors.rs +++ b/applications/aphoria/src/config/types/extractors.rs @@ -63,6 +63,12 @@ pub struct TimeoutExtractorConfig { #[derive(Debug, Clone, Deserialize)] #[serde(default)] pub struct DepVersionConfig { + /// Enable dependency version extraction (opt-in). + /// + /// Default: false to reduce noise in output. + /// Enable this if you want dependency inventory for advisory lookup. + pub enabled: bool, + /// Path to advisory database. pub advisory_db: PathBuf, } diff --git a/applications/aphoria/src/extractors/api_key_security.rs b/applications/aphoria/src/extractors/api_key_security.rs new file mode 100644 index 0000000..29bf8ff --- /dev/null +++ b/applications/aphoria/src/extractors/api_key_security.rs @@ -0,0 +1,402 @@ +//! API key security configuration extractor. +//! +//! Detects potential API authentication misconfigurations: +//! - `require_for_all: false` - API key not required for all endpoints +//! - Excessive public paths (> 5 paths) - overly permissive access +//! - Using DEFAULT_API_KEY_RATE_LIMIT without customization + +use regex::Regex; +use stemedb_core::types::ObjectValue; + +use super::Extractor; +use crate::types::{ExtractedClaim, Language}; + +/// Extractor for API key security configuration. +/// +/// Focuses on authentication and rate limiting misconfigurations. +pub struct ApiKeySecurityExtractor { + /// Pattern: require_for_all: false + require_for_all_false: Regex, + /// Pattern: public_paths: vec![...] with more than 5 entries + public_paths_array: Regex, + /// Pattern: DEFAULT_API_KEY_RATE_LIMIT usage + default_rate_limit: Regex, +} + +impl Default for ApiKeySecurityExtractor { + fn default() -> Self { + Self::new() + } +} + +impl ApiKeySecurityExtractor { + /// Create a new API key security extractor. + /// + /// # Panics + /// Panics if any regex pattern is invalid (programmer error). + #[allow(clippy::expect_used)] + pub fn new() -> Self { + Self { + // Rust: require_for_all: false + // Go: RequireForAll: false + // YAML: require_for_all: false + require_for_all_false: Regex::new( + r#"(?i)require_?for_?all\s*[:=]\s*false"# + ) + .expect("valid regex"), + + // Look for public_paths arrays - we'll count entries manually + // Handles Rust vec![...], Go []string{...}, YAML lists + public_paths_array: Regex::new( + r#"(?i)public_?paths\s*[:=]\s*(?:vec!|[\[\{])"# + ) + .expect("valid regex"), + + // Using default rate limit constant + default_rate_limit: Regex::new( + r"DEFAULT_API_KEY_RATE_LIMIT" + ) + .expect("valid regex"), + } + } + + /// Determine confidence based on context. + fn confidence_for_file(&self, file: &str) -> f32 { + if file.contains("test") || file.contains("example") { + 0.5 + } else { + 1.0 + } + } + + /// Count public paths in a potential array definition. + fn count_public_paths(&self, content: &str, start_line: usize) -> usize { + let lines: Vec<&str> = content.lines().collect(); + let mut count = 0; + let mut depth = 0; + let mut in_array = false; + + for (idx, line) in lines.iter().enumerate().skip(start_line) { + if idx >= start_line + 20 { + // Don't search more than 20 lines ahead + break; + } + + for ch in line.chars() { + match ch { + '[' => { + depth += 1; + in_array = true; + } + ']' => { + depth -= 1; + if depth == 0 { + return count; + } + } + '"' | '\'' if in_array && depth > 0 => { + count += 1; + // Skip to end of string to avoid double-counting + break; + } + _ => {} + } + } + + if depth == 0 && in_array { + break; + } + } + + count + } +} + +impl Extractor for ApiKeySecurityExtractor { + fn name(&self) -> &str { + "api_key_security" + } + + fn languages(&self) -> &[Language] { + &[ + Language::Rust, + Language::Go, + Language::Python, + Language::TypeScript, + Language::JavaScript, + Language::Yaml, + Language::Toml, + Language::Json, + ] + } + + fn extract( + &self, + path_segments: &[String], + content: &str, + _language: Language, + file: &str, + ) -> Vec { + let mut claims = Vec::new(); + let confidence = self.confidence_for_file(file); + + for (line_idx, line) in content.lines().enumerate() { + let line_num = line_idx + 1; + + // Check for require_for_all: false + if self.require_for_all_false.is_match(line) { + let mut concept_path = path_segments.to_vec(); + concept_path.push("api".to_string()); + concept_path.push("auth".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "require_api_key".to_string(), + value: ObjectValue::Boolean(false), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence, + description: "API key not required for all endpoints (require_for_all: false)".to_string(), + }); + } + + // Check for public_paths arrays + if self.public_paths_array.is_match(line) { + let count = self.count_public_paths(content, line_idx); + + if count > 5 { + let mut concept_path = path_segments.to_vec(); + concept_path.push("api".to_string()); + concept_path.push("auth".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "public_paths_count".to_string(), + value: ObjectValue::Number(count as f64), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence: confidence * 0.9, // Slight reduction since we're inferring + description: format!("Overly permissive public paths ({} paths)", count), + }); + } + } + + // Check for DEFAULT_API_KEY_RATE_LIMIT usage + if self.default_rate_limit.is_match(line) { + // Only flag if it looks like it's being used directly without customization + if !line.contains("const") && !line.contains("pub const") && !line.contains("//") { + let mut concept_path = path_segments.to_vec(); + concept_path.push("api".to_string()); + concept_path.push("rate_limit".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "using_default".to_string(), + value: ObjectValue::Boolean(true), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence: confidence * 0.7, // Lower confidence - might be intentional + description: "Using default API key rate limit without customization".to_string(), + }); + } + } + } + + claims + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_require_for_all_false_rust() { + let extractor = ApiKeySecurityExtractor::new(); + let content = r#" + ApiKeyAuthConfig { + require_for_all: false, + public_paths: vec!["/health".to_string()], + } + "#; + + let claims = extractor.extract( + &["rust".to_string(), "myapi".to_string()], + content, + Language::Rust, + "src/config.rs", + ); + + assert!(!claims.is_empty()); + let require_claim = claims.iter().find(|c| c.predicate == "require_api_key"); + assert!(require_claim.is_some()); + if let Some(claim) = require_claim { + assert_eq!(claim.value, ObjectValue::Boolean(false)); + assert!(claim.concept_path.contains("api/auth")); + } + } + + #[test] + fn test_require_for_all_false_yaml() { + let extractor = ApiKeySecurityExtractor::new(); + let content = r#" +api: + auth: + require_for_all: false + public_paths: + - /health + - /metrics +"#; + + let claims = extractor.extract( + &["config".to_string()], + content, + Language::Yaml, + "config/api.yaml", + ); + + assert!(!claims.is_empty()); + let require_claim = claims.iter().find(|c| c.predicate == "require_api_key"); + assert!(require_claim.is_some()); + } + + #[test] + fn test_excessive_public_paths() { + let extractor = ApiKeySecurityExtractor::new(); + let content = r#" + public_paths: vec![ + "/health".to_string(), + "/metrics".to_string(), + "/swagger-ui".to_string(), + "/docs".to_string(), + "/status".to_string(), + "/ping".to_string(), + "/info".to_string(), + ] + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/middleware.rs", + ); + + assert!(!claims.is_empty()); + let paths_claim = claims.iter().find(|c| c.predicate == "public_paths_count"); + assert!(paths_claim.is_some()); + if let Some(claim) = paths_claim { + if let ObjectValue::Number(count) = claim.value { + assert!(count > 5.0); + } else { + panic!("Expected Number value"); + } + } + } + + #[test] + fn test_reasonable_public_paths_not_flagged() { + let extractor = ApiKeySecurityExtractor::new(); + let content = r#" + public_paths: vec![ + "/health".to_string(), + "/v1/health".to_string(), + "/swagger-ui".to_string(), + ] + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/middleware.rs", + ); + + // Should not flag this - only 3 paths + let paths_claim = claims.iter().find(|c| c.predicate == "public_paths_count"); + assert!(paths_claim.is_none()); + } + + #[test] + fn test_default_rate_limit_usage() { + let extractor = ApiKeySecurityExtractor::new(); + let content = r#" + let rate_limit = record.rate_limit.unwrap_or(DEFAULT_API_KEY_RATE_LIMIT); + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/handlers.rs", + ); + + assert!(!claims.is_empty()); + let rate_claim = claims.iter().find(|c| c.predicate == "using_default"); + assert!(rate_claim.is_some()); + } + + #[test] + fn test_default_rate_limit_const_definition_not_flagged() { + let extractor = ApiKeySecurityExtractor::new(); + let content = r#" + pub const DEFAULT_API_KEY_RATE_LIMIT: u64 = 10_000; + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/config.rs", + ); + + // Should not flag constant definition + let rate_claim = claims.iter().find(|c| c.predicate == "using_default"); + assert!(rate_claim.is_none()); + } + + #[test] + fn test_confidence_in_test_file() { + let extractor = ApiKeySecurityExtractor::new(); + let content = r#" + ApiKeyAuthConfig { + require_for_all: false, + } + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/middleware_test.rs", + ); + + assert!(!claims.is_empty()); + assert_eq!(claims[0].confidence, 0.5); + } + + #[test] + fn test_go_api_config() { + let extractor = ApiKeySecurityExtractor::new(); + let content = r#" +config := &AuthConfig{ + RequireForAll: false, + PublicPaths: []string{"/health", "/metrics"}, +} +"#; + + let claims = extractor.extract( + &["go".to_string()], + content, + Language::Go, + "config.go", + ); + + assert!(!claims.is_empty()); + let require_claim = claims.iter().find(|c| c.predicate == "require_api_key"); + assert!(require_claim.is_some()); + } +} diff --git a/applications/aphoria/src/extractors/circuit_breaker_config.rs b/applications/aphoria/src/extractors/circuit_breaker_config.rs new file mode 100644 index 0000000..d379ca2 --- /dev/null +++ b/applications/aphoria/src/extractors/circuit_breaker_config.rs @@ -0,0 +1,256 @@ +//! Circuit breaker configuration extractor. +//! +//! Detects missing or explicitly disabled circuit breaker middleware. +//! Circuit breakers are critical for resilience - they prevent cascading +//! failures by temporarily blocking requests to misbehaving agents. + +use regex::Regex; +use stemedb_core::types::ObjectValue; + +use super::Extractor; +use crate::types::{ExtractedClaim, Language}; + +/// Extractor for circuit breaker configuration. +/// +/// Detects: +/// - Explicitly disabled circuit breakers +/// - Router configurations missing circuit breaker middleware +pub struct CircuitBreakerConfigExtractor { + /// Pattern: circuit_breaker_enabled: false + disabled_pattern: Regex, + /// Pattern: CircuitBreakerConfig with enabled: false + config_disabled: Regex, +} + +impl Default for CircuitBreakerConfigExtractor { + fn default() -> Self { + Self::new() + } +} + +impl CircuitBreakerConfigExtractor { + /// Create a new circuit breaker config extractor. + /// + /// # Panics + /// Panics if any regex pattern is invalid (programmer error). + #[allow(clippy::expect_used)] + pub fn new() -> Self { + Self { + // YAML/TOML: circuit_breaker_enabled: false + disabled_pattern: Regex::new( + r#"(?i)circuit_?breaker_?enabled\s*[:=]\s*false"# + ) + .expect("valid regex"), + + // Look for lines with just "enabled: false" in circuit breaker context + // We'll rely on the first pattern for most cases + config_disabled: Regex::new( + r"(?i)^\s*enabled\s*:\s*false" + ) + .expect("valid regex"), + } + } + + /// Determine confidence based on context. + fn confidence_for_file(&self, file: &str) -> f32 { + if file.contains("test") || file.contains("example") { + 0.5 + } else { + 1.0 + } + } +} + +impl Extractor for CircuitBreakerConfigExtractor { + fn name(&self) -> &str { + "circuit_breaker_config" + } + + fn languages(&self) -> &[Language] { + &[ + Language::Rust, + Language::Go, + Language::Yaml, + Language::Toml, + Language::Json, + ] + } + + fn extract( + &self, + path_segments: &[String], + content: &str, + _language: Language, + file: &str, + ) -> Vec { + let mut claims = Vec::new(); + let confidence = self.confidence_for_file(file); + + for (line_idx, line) in content.lines().enumerate() { + let line_num = line_idx + 1; + + // Check for explicitly disabled circuit breaker + if self.disabled_pattern.is_match(line) { + let mut concept_path = path_segments.to_vec(); + concept_path.push("api".to_string()); + concept_path.push("circuit_breaker".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "enabled".to_string(), + value: ObjectValue::Boolean(false), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence, + description: "Circuit breaker explicitly disabled".to_string(), + }); + } + + // Check for config with enabled: false + if self.config_disabled.is_match(line) { + let mut concept_path = path_segments.to_vec(); + concept_path.push("api".to_string()); + concept_path.push("circuit_breaker".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "enabled".to_string(), + value: ObjectValue::Boolean(false), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence: confidence * 0.9, // Slightly lower for multiline pattern + description: "Circuit breaker configuration disabled".to_string(), + }); + } + } + + claims + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_disabled_yaml() { + let extractor = CircuitBreakerConfigExtractor::new(); + let content = r#" +api: + circuit_breaker_enabled: false + timeout: 30s +"#; + + let claims = extractor.extract( + &["config".to_string()], + content, + Language::Yaml, + "config/api.yaml", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].predicate, "enabled"); + assert_eq!(claims[0].value, ObjectValue::Boolean(false)); + assert!(claims[0].concept_path.contains("circuit_breaker")); + } + + #[test] + fn test_disabled_toml() { + let extractor = CircuitBreakerConfigExtractor::new(); + let content = r#" +[api] +circuit_breaker_enabled = false +timeout = 30 +"#; + + let claims = extractor.extract( + &["config".to_string()], + content, + Language::Toml, + "config.toml", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].value, ObjectValue::Boolean(false)); + } + + #[test] + fn test_rust_config_disabled() { + let extractor = CircuitBreakerConfigExtractor::new(); + let content = r#" + CircuitBreakerConfig { + enabled: false, + failure_threshold: 5, + } + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/config.rs", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].predicate, "enabled"); + } + + #[test] + fn test_enabled_not_flagged() { + let extractor = CircuitBreakerConfigExtractor::new(); + let content = r#" +api: + circuit_breaker_enabled: true + failure_threshold: 5 +"#; + + let claims = extractor.extract( + &["config".to_string()], + content, + Language::Yaml, + "config/api.yaml", + ); + + // Should not flag when enabled + assert_eq!(claims.len(), 0); + } + + #[test] + fn test_confidence_in_test_file() { + let extractor = CircuitBreakerConfigExtractor::new(); + let content = r#" + circuit_breaker_enabled: false + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/config_test.rs", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].confidence, 0.5); + } + + #[test] + fn test_go_snake_case() { + let extractor = CircuitBreakerConfigExtractor::new(); + let content = r#" +config := Config{ + CircuitBreakerEnabled: false, +} +"#; + + let claims = extractor.extract( + &["go".to_string()], + content, + Language::Go, + "config.go", + ); + + assert_eq!(claims.len(), 1); + } +} diff --git a/applications/aphoria/src/extractors/const_declarations.rs b/applications/aphoria/src/extractors/const_declarations.rs new file mode 100644 index 0000000..fe25750 --- /dev/null +++ b/applications/aphoria/src/extractors/const_declarations.rs @@ -0,0 +1,297 @@ +//! Constant declarations extractor for Rust. +//! +//! Tracks `const` and `static` declarations with their values for provenance tracking. +//! Enables learning loop to preserve knowledge of magic constants like: +//! - `const RAPL_POWER_UNIT: u32 = 0x606` (Intel SDM register) +//! - `const MAX_RETRIES: u8 = 3` (retry policy) +//! - `const BUFFER_SIZE: usize = 4096` (buffer sizing) + +use regex::Regex; +use stemedb_core::types::ObjectValue; + +use super::Extractor; +use crate::types::{ExtractedClaim, Language}; + +/// Extractor for Rust constant declarations. +/// +/// Detects `const` and `static` declarations to track magic constants +/// and preserve provenance information. +pub struct ConstDeclarationsExtractor { + /// Matches: const NAME: Type = value; + const_decl: Regex, + /// Matches: static NAME: Type = value; + static_decl: Regex, +} + +impl Default for ConstDeclarationsExtractor { + fn default() -> Self { + Self::new() + } +} + +impl ConstDeclarationsExtractor { + /// Create a new constant declarations extractor. + /// + /// # Panics + /// Panics if any regex pattern is invalid (programmer error). + #[allow(clippy::expect_used)] + pub fn new() -> Self { + Self { + // const RAPL_POWER_UNIT: u32 = 0x606; + const_decl: Regex::new( + r"^\s*(?:pub\s+)?const\s+([A-Z_][A-Z0-9_]*)\s*:\s*(\w+)\s*=\s*([^;]+);" + ) + .expect("valid regex"), + + // static MAX_CONNECTIONS: usize = 100; + static_decl: Regex::new( + r"^\s*(?:pub\s+)?static\s+([A-Z_][A-Z0-9_]*)\s*:\s*(\w+)\s*=\s*([^;]+);" + ) + .expect("valid regex"), + } + } + + /// Clean up the value string (remove comments, whitespace). + fn clean_value(&self, value: &str) -> String { + value + .split("//") + .next() + .unwrap_or(value) + .trim() + .to_string() + } + + /// Determine confidence based on context. + fn confidence_for_file(&self, file: &str) -> f32 { + if file.contains("test") || file.contains("example") || file.contains("bench") { + 0.5 + } else { + 1.0 + } + } +} + +impl Extractor for ConstDeclarationsExtractor { + fn name(&self) -> &str { + "const_declarations" + } + + fn languages(&self) -> &[Language] { + &[Language::Rust] + } + + fn extract( + &self, + path_segments: &[String], + content: &str, + _language: Language, + file: &str, + ) -> Vec { + let mut claims = Vec::new(); + let confidence = self.confidence_for_file(file); + + for (line_idx, line) in content.lines().enumerate() { + let line_num = line_idx + 1; + + // Check for const declarations + if let Some(cap) = self.const_decl.captures(line) { + let name = cap.get(1).map_or("", |m| m.as_str()); + let type_name = cap.get(2).map_or("", |m| m.as_str()); + let value = cap.get(3).map_or("", |m| m.as_str()); + + let cleaned_value = self.clean_value(value); + + let mut concept_path = path_segments.to_vec(); + concept_path.push("const".to_string()); + concept_path.push(name.to_lowercase()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "value".to_string(), + value: ObjectValue::Text(cleaned_value.clone()), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence, + description: format!("{}: {} = {}", name, type_name, cleaned_value), + }); + } + + // Check for static declarations + if let Some(cap) = self.static_decl.captures(line) { + let name = cap.get(1).map_or("", |m| m.as_str()); + let type_name = cap.get(2).map_or("", |m| m.as_str()); + let value = cap.get(3).map_or("", |m| m.as_str()); + + let cleaned_value = self.clean_value(value); + + let mut concept_path = path_segments.to_vec(); + concept_path.push("static".to_string()); + concept_path.push(name.to_lowercase()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "value".to_string(), + value: ObjectValue::Text(cleaned_value.clone()), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence, + description: format!("static {}: {} = {}", name, type_name, cleaned_value), + }); + } + } + + claims + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_const() { + let extractor = ConstDeclarationsExtractor::new(); + let content = r#" +const MAX_RETRIES: u8 = 3; +const BUFFER_SIZE: usize = 4096; + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string()], + content, + Language::Rust, + "src/config.rs", + ); + + assert_eq!(claims.len(), 2); + assert!(claims.iter().any(|c| c.concept_path.contains("max_retries"))); + assert!(claims.iter().any(|c| c.concept_path.contains("buffer_size"))); + + let retry_claim = claims.iter().find(|c| c.concept_path.contains("max_retries")).unwrap(); + assert_eq!(retry_claim.value, ObjectValue::Text("3".to_string())); + } + + #[test] + fn test_hex_constant() { + let extractor = ConstDeclarationsExtractor::new(); + let content = r#" +const RAPL_POWER_UNIT: u32 = 0x606; + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string(), "thermal".to_string()], + content, + Language::Rust, + "src/thermal/msr.rs", + ); + + assert_eq!(claims.len(), 1); + assert!(claims[0].concept_path.contains("thermal")); + assert!(claims[0].concept_path.contains("rapl_power_unit")); + assert_eq!(claims[0].value, ObjectValue::Text("0x606".to_string())); + } + + #[test] + fn test_pub_const() { + let extractor = ConstDeclarationsExtractor::new(); + let content = r#" +pub const DEFAULT_TIMEOUT: u64 = 30; + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/lib.rs", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].value, ObjectValue::Text("30".to_string())); + } + + #[test] + fn test_static_declaration() { + let extractor = ConstDeclarationsExtractor::new(); + let content = r#" +static MAX_CONNECTIONS: usize = 100; + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/server.rs", + ); + + assert_eq!(claims.len(), 1); + assert!(claims[0].concept_path.contains("static")); + assert!(claims[0].concept_path.contains("max_connections")); + } + + #[test] + fn test_value_with_comment() { + let extractor = ConstDeclarationsExtractor::new(); + let content = r#" +const TIMEOUT_MS: u64 = 5000; // 5 seconds + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/config.rs", + ); + + assert_eq!(claims.len(), 1); + // Comment should be stripped + assert_eq!(claims[0].value, ObjectValue::Text("5000".to_string())); + } + + #[test] + fn test_confidence_in_test_file() { + let extractor = ConstDeclarationsExtractor::new(); + let content = r#" +const TEST_VALUE: u32 = 42; + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/lib_test.rs", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].confidence, 0.5); + } + + #[test] + fn test_real_world_maxwell() { + let extractor = ConstDeclarationsExtractor::new(); + let content = r#" +//! MSR register definitions + +pub const RAPL_POWER_UNIT: u32 = 0x606; +pub const RAPL_PKG_POWER_LIMIT: u32 = 0x610; +pub const RAPL_PKG_ENERGY_STATUS: u32 = 0x611; + +const MAX_TEMP_CELSIUS: u8 = 85; + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string(), "thermal".to_string()], + content, + Language::Rust, + "src/thermal/msr.rs", + ); + + assert_eq!(claims.len(), 4); + + // All thermal constants should be tracked + assert!(claims.iter().any(|c| c.value == ObjectValue::Text("0x606".to_string()))); + assert!(claims.iter().any(|c| c.value == ObjectValue::Text("0x610".to_string()))); + assert!(claims.iter().any(|c| c.value == ObjectValue::Text("85".to_string()))); + } +} diff --git a/applications/aphoria/src/extractors/dep_versions.rs b/applications/aphoria/src/extractors/dep_versions.rs index 29925af..71b2fb1 100644 --- a/applications/aphoria/src/extractors/dep_versions.rs +++ b/applications/aphoria/src/extractors/dep_versions.rs @@ -61,29 +61,50 @@ impl DepVersionsExtractor { file: &str, ) -> Vec { let mut claims = Vec::new(); + let mut in_dependencies = false; for (line_idx, line) in content.lines().enumerate() { - if let Some(captures) = self.cargo_dep.captures(line) { - let package = captures.get(1).map(|m| m.as_str()).unwrap_or(""); - let version = captures.get(2).or(captures.get(3)).map(|m| m.as_str()).unwrap_or(""); + let trimmed = line.trim(); - if !package.is_empty() && !version.is_empty() && version != "*" { - // Record the dependency for potential advisory lookup - let mut concept_path = path_segments.to_vec(); - concept_path.push("dep".to_string()); - concept_path.push(package.to_string()); - concept_path.push("version".to_string()); + // Track dependency sections + if trimmed.starts_with("[dependencies") + || trimmed.starts_with("[dev-dependencies") + || trimmed.starts_with("[build-dependencies") + { + in_dependencies = true; + continue; + } - claims.push(ExtractedClaim { - concept_path: format!("code://{}", concept_path.join("/")), - predicate: "installed_version".to_string(), - value: ObjectValue::Text(version.to_string()), - file: file.to_string(), - line: line_idx + 1, - matched_text: line.trim().to_string(), - confidence: 1.0, - description: format!("Dependency {} at version {}", package, version), - }); + // Exit dependency section when we hit a new section + if trimmed.starts_with('[') { + in_dependencies = false; + continue; + } + + // Only extract if we're in a dependencies section + if in_dependencies { + if let Some(captures) = self.cargo_dep.captures(line) { + let package = captures.get(1).map(|m| m.as_str()).unwrap_or(""); + let version = captures.get(2).or(captures.get(3)).map(|m| m.as_str()).unwrap_or(""); + + if !package.is_empty() && !version.is_empty() && version != "*" { + // Record the dependency for potential advisory lookup + let mut concept_path = path_segments.to_vec(); + concept_path.push("dep".to_string()); + concept_path.push(package.to_string()); + concept_path.push("version".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "installed_version".to_string(), + value: ObjectValue::Text(version.to_string()), + file: file.to_string(), + line: line_idx + 1, + matched_text: line.trim().to_string(), + confidence: 1.0, + description: format!("Dependency {} at version {}", package, version), + }); + } } } } @@ -347,4 +368,59 @@ flask>=2.0.0 assert_eq!(claims.len(), 2); } + + #[test] + fn test_cargo_ignores_package_metadata() { + let extractor = DepVersionsExtractor::new(); + let content = r#" +[package] +name = "maxwell-daemon" +version = "0.1.0" +edition = "2021" + +[[bin]] +name = "maxwelld" +path = "src/main.rs" + +[dependencies] +tokio = "1.28" + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string()], + content, + Language::CargoManifest, + "Cargo.toml", + ); + + // Should only extract the dependency (tokio), not package metadata + assert_eq!(claims.len(), 1); + assert!(claims[0].concept_path.contains("tokio")); + assert!(!claims.iter().any(|c| c.concept_path.contains("name"))); + assert!(!claims.iter().any(|c| c.concept_path.contains("version") && c.value == ObjectValue::Text("0.1.0".to_string()))); + } + + #[test] + fn test_cargo_extracts_from_dev_dependencies() { + let extractor = DepVersionsExtractor::new(); + let content = r#" +[dependencies] +tokio = "1.28" + +[dev-dependencies] +criterion = "0.5" + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::CargoManifest, + "Cargo.toml", + ); + + // Should extract from both [dependencies] and [dev-dependencies] + assert_eq!(claims.len(), 2); + assert!(claims.iter().any(|c| c.concept_path.contains("tokio"))); + assert!(claims.iter().any(|c| c.concept_path.contains("criterion"))); + } } diff --git a/applications/aphoria/src/extractors/derive_pattern.rs b/applications/aphoria/src/extractors/derive_pattern.rs new file mode 100644 index 0000000..d496394 --- /dev/null +++ b/applications/aphoria/src/extractors/derive_pattern.rs @@ -0,0 +1,376 @@ +//! Derive pattern extractor for Rust. +//! +//! Tracks `#[derive(...)]` annotations to detect API consistency patterns. +//! Enables learning loop conventions like "all message types derive Serialize + Deserialize" +//! or "all errors derive Debug + Display + Error". + +use regex::Regex; +use stemedb_core::types::ObjectValue; + +use super::Extractor; +use crate::types::{ExtractedClaim, Language}; + +/// Extractor for Rust derive patterns. +/// +/// Detects `#[derive(...)]` annotations to track API consistency. +/// This enables the learning loop to establish patterns like: +/// - All message types: Serialize, Deserialize, Debug, Clone +/// - All error types: Debug, Display, Error +/// - All config types: Deserialize, Debug, Clone +pub struct DerivePatternExtractor { + /// Matches: #[derive(Debug, Clone, ...)] + derive_attr: Regex, + /// Matches: struct/enum name after derive + type_decl: Regex, +} + +impl Default for DerivePatternExtractor { + fn default() -> Self { + Self::new() + } +} + +impl DerivePatternExtractor { + /// Create a new derive pattern extractor. + /// + /// # Panics + /// Panics if any regex pattern is invalid (programmer error). + #[allow(clippy::expect_used)] + pub fn new() -> Self { + Self { + // Matches: #[derive(Debug, Clone, Serialize)] + derive_attr: Regex::new( + r#"#\[derive\s*\((.*?)\)\]"# + ) + .expect("valid regex"), + + // Matches struct/enum declarations + type_decl: Regex::new( + r"^\s*(?:pub\s+)?(?:struct|enum)\s+([A-Z][a-zA-Z0-9_]*)" + ) + .expect("valid regex"), + } + } + + /// Parse derive traits from the attribute string. + fn parse_derives(&self, derives_str: &str) -> Vec { + derives_str + .split(',') + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .collect() + } + + /// Determine confidence based on context. + fn confidence_for_file(&self, file: &str) -> f32 { + if file.contains("test") || file.contains("example") || file.contains("bench") { + 0.5 // Test/example types don't reflect production API patterns + } else { + 1.0 + } + } + + /// Infer type category from name or context. + fn infer_type_category(&self, type_name: &str, derives: &[String]) -> &'static str { + // Heuristics to categorize types + if type_name.ends_with("Error") || type_name.ends_with("Exception") { + "error" + } else if type_name.ends_with("Config") || type_name.ends_with("Settings") { + "config" + } else if type_name.ends_with("Request") || type_name.ends_with("Response") + || type_name.ends_with("Message") || type_name.ends_with("Event") { + "message" + } else if derives.iter().any(|d| d == "Serialize" || d == "Deserialize") { + "data" // Serializable types + } else { + "type" // Generic + } + } +} + +impl Extractor for DerivePatternExtractor { + fn name(&self) -> &str { + "derive_pattern" + } + + fn languages(&self) -> &[Language] { + &[Language::Rust] + } + + fn extract( + &self, + path_segments: &[String], + content: &str, + _language: Language, + file: &str, + ) -> Vec { + let mut claims = Vec::new(); + let confidence = self.confidence_for_file(file); + + let lines: Vec<&str> = content.lines().collect(); + + for i in 0..lines.len() { + let line = lines[i]; + + // Look for #[derive(...)] + if let Some(cap) = self.derive_attr.captures(line) { + let derives_str = cap.get(1).map_or("", |m| m.as_str()); + let derives = self.parse_derives(derives_str); + + // Look ahead for the type declaration (within next 3 lines) + let mut type_name = None; + for line in lines.iter().skip(i + 1).take(3) { + if let Some(type_cap) = self.type_decl.captures(line) { + type_name = type_cap.get(1).map(|m| m.as_str().to_string()); + break; + } + } + + if let Some(name) = type_name { + let category = self.infer_type_category(&name, &derives); + + // Create a concept path based on category + let mut concept_path = path_segments.to_vec(); + concept_path.push(category.to_string()); + concept_path.push(name.to_lowercase()); + concept_path.push("derives".to_string()); + + // Sort derives for consistency + let mut sorted_derives = derives.clone(); + sorted_derives.sort(); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "traits".to_string(), + value: ObjectValue::Text(sorted_derives.join(",")), + file: file.to_string(), + line: i + 1, + matched_text: line.trim().to_string(), + confidence, + description: format!("{} derives {}", name, sorted_derives.join(", ")), + }); + } + } + } + + claims + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_derive() { + let extractor = DerivePatternExtractor::new(); + let content = r#" +#[derive(Debug, Clone)] +pub struct Wallet { + balance: u64, +} + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string()], + content, + Language::Rust, + "src/wallet.rs", + ); + + assert_eq!(claims.len(), 1); + assert!(claims[0].concept_path.contains("wallet")); + assert!(claims[0].concept_path.contains("derives")); + if let ObjectValue::Text(ref val) = claims[0].value { + assert!(val.contains("Clone")); + assert!(val.contains("Debug")); + } else { + panic!("Expected Text value"); + } + } + + #[test] + fn test_message_type_pattern() { + let extractor = DerivePatternExtractor::new(); + let content = r#" +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct BidMessage { + amount: u64, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct AckMessage { + id: String, +} + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string(), "vsock".to_string()], + content, + Language::Rust, + "src/messages.rs", + ); + + assert_eq!(claims.len(), 2); + + // Both should be categorized as "message" + assert!(claims.iter().all(|c| c.concept_path.contains("message"))); + + // Both should have the same derives (sorted) + if let ObjectValue::Text(ref val1) = claims[0].value { + if let ObjectValue::Text(ref val2) = claims[1].value { + assert_eq!(val1, val2); // Same pattern! + assert!(val1.contains("Clone")); + assert!(val1.contains("Debug")); + assert!(val1.contains("Deserialize")); + assert!(val1.contains("Serialize")); + } + } + } + + #[test] + fn test_error_type_categorization() { + let extractor = DerivePatternExtractor::new(); + let content = r#" +#[derive(Debug, Display, Error)] +pub enum WalletError { + InsufficientFunds, +} + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/error.rs", + ); + + assert_eq!(claims.len(), 1); + assert!(claims[0].concept_path.contains("error")); + assert!(claims[0].concept_path.contains("walleterror")); + } + + #[test] + fn test_config_type_categorization() { + let extractor = DerivePatternExtractor::new(); + let content = r#" +#[derive(Deserialize, Debug, Clone)] +pub struct AppConfig { + port: u16, +} + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/config.rs", + ); + + assert_eq!(claims.len(), 1); + assert!(claims[0].concept_path.contains("config")); + } + + #[test] + fn test_multiline_struct() { + let extractor = DerivePatternExtractor::new(); + let content = r#" +#[derive(Debug, Clone)] +pub struct Wallet { + balance: u64, + owner: String, +} + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/lib.rs", + ); + + assert_eq!(claims.len(), 1); + } + + #[test] + fn test_confidence_in_test_file() { + let extractor = DerivePatternExtractor::new(); + let content = r#" +#[derive(Debug, Clone)] +struct TestHelper { + data: Vec, +} + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/wallet_test.rs", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].confidence, 0.5); + } + + #[test] + fn test_sorted_derives() { + let extractor = DerivePatternExtractor::new(); + let content = r#" +#[derive(Clone, Debug, Serialize, Deserialize)] +struct Foo {} + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/lib.rs", + ); + + assert_eq!(claims.len(), 1); + // Should be alphabetically sorted + if let ObjectValue::Text(ref val) = claims[0].value { + assert_eq!(val, "Clone,Debug,Deserialize,Serialize"); + } + } + + #[test] + fn test_real_world_example() { + let extractor = DerivePatternExtractor::new(); + let content = r#" +//! Message types for vsock communication + +use serde::{Serialize, Deserialize}; + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct BidMessage { + pub amount: u64, + pub timestamp: u64, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct AckMessage { + pub id: String, +} + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string(), "vsock".to_string(), "messages".to_string()], + content, + Language::Rust, + "src/vsock/messages.rs", + ); + + assert_eq!(claims.len(), 2); + // Both should have consistent derives + assert!(claims.iter().all(|c| { + if let ObjectValue::Text(ref v) = c.value { + v.contains("Clone") && v.contains("Debug") && v.contains("Serialize") + } else { + false + } + })); + } +} diff --git a/applications/aphoria/src/extractors/durability_config.rs b/applications/aphoria/src/extractors/durability_config.rs new file mode 100644 index 0000000..eadb42a --- /dev/null +++ b/applications/aphoria/src/extractors/durability_config.rs @@ -0,0 +1,399 @@ +//! Durability configuration extractor. +//! +//! Detects WAL durability settings that impact data integrity guarantees. +//! Critical for systems that must survive crashes or power failures. + +use regex::Regex; +use stemedb_core::types::ObjectValue; + +use super::Extractor; +use crate::types::{ExtractedClaim, Language}; + +/// Extractor for durability configuration. +/// +/// Detects: +/// - DurabilityLevel::Eventual (risky - no fsync) +/// - DurabilityLevel::Batched (balanced - periodic fsync) +/// - DurabilityLevel::Immediate (safe - fsync after every write) +/// - YAML/TOML config: `durability: "eventual"` or `fsync_strategy = "none"` +pub struct DurabilityConfigExtractor { + /// Rust enum patterns + durability_enum: Regex, + /// YAML/TOML patterns + yaml_durability: Regex, + toml_fsync: Regex, + /// Batched configuration + batched_pattern: Regex, +} + +impl Default for DurabilityConfigExtractor { + fn default() -> Self { + Self::new() + } +} + +impl DurabilityConfigExtractor { + /// Create a new durability config extractor. + /// + /// # Panics + /// Panics if any regex pattern is invalid (programmer error). + #[allow(clippy::expect_used)] + pub fn new() -> Self { + Self { + // Rust: DurabilityLevel::Eventual | ::Batched | ::Immediate + durability_enum: Regex::new( + r"DurabilityLevel::(Eventual|Batched|Immediate)" + ) + .expect("valid regex"), + + // YAML: durability: "eventual" | "batched" | "immediate" + yaml_durability: Regex::new( + r#"(?i)durability\s*:\s*["']?(eventual|batched|immediate)["']?"# + ) + .expect("valid regex"), + + // TOML: fsync_strategy = "none" | "batched" | "immediate" + toml_fsync: Regex::new( + r#"(?i)fsync_strategy\s*=\s*["']?(none|batched|immediate)["']?"# + ) + .expect("valid regex"), + + // Batched with parameters: DurabilityLevel::batched_with(max_writes, max_duration) + batched_pattern: Regex::new( + r"DurabilityLevel::batched(?:_with)?\(" + ) + .expect("valid regex"), + } + } + + /// Determine confidence based on context. + fn confidence_for_file(&self, file: &str) -> f32 { + if file.contains("test") || file.contains("example") || file.contains("bench") { + 0.5 // Test/example code doesn't reflect production config + } else { + 1.0 // Production code + } + } + + /// Extract strategy name and normalize it. + fn normalize_strategy(&self, strategy: &str) -> &'static str { + match strategy.to_lowercase().as_str() { + "eventual" | "none" => "eventual", + "batched" => "batched", + "immediate" => "immediate", + _ => "unknown", + } + } +} + +impl Extractor for DurabilityConfigExtractor { + fn name(&self) -> &str { + "durability_config" + } + + fn languages(&self) -> &[Language] { + &[ + Language::Rust, + Language::Go, + Language::Yaml, + Language::Toml, + Language::Json, + ] + } + + fn extract( + &self, + path_segments: &[String], + content: &str, + _language: Language, + file: &str, + ) -> Vec { + let mut claims = Vec::new(); + let confidence = self.confidence_for_file(file); + + for (line_idx, line) in content.lines().enumerate() { + let line_num = line_idx + 1; + + // Check Rust enum patterns + if let Some(cap) = self.durability_enum.captures(line) { + let level = cap.get(1).map_or("", |m| m.as_str()); + let normalized = self.normalize_strategy(level); + + let mut concept_path = path_segments.to_vec(); + concept_path.push("wal".to_string()); + concept_path.push("durability".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "strategy".to_string(), + value: ObjectValue::Text(normalized.to_string()), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence, + description: format!("WAL durability set to {}", normalized), + }); + } + + // Check YAML durability patterns + if let Some(cap) = self.yaml_durability.captures(line) { + let level = cap.get(1).map_or("", |m| m.as_str()); + let normalized = self.normalize_strategy(level); + + let mut concept_path = path_segments.to_vec(); + concept_path.push("wal".to_string()); + concept_path.push("durability".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "strategy".to_string(), + value: ObjectValue::Text(normalized.to_string()), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence, + description: format!("WAL durability configured as {}", normalized), + }); + } + + // Check TOML fsync_strategy patterns + if let Some(cap) = self.toml_fsync.captures(line) { + let strategy = cap.get(1).map_or("", |m| m.as_str()); + let normalized = self.normalize_strategy(strategy); + + let mut concept_path = path_segments.to_vec(); + concept_path.push("wal".to_string()); + concept_path.push("durability".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "strategy".to_string(), + value: ObjectValue::Text(normalized.to_string()), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence, + description: format!("Fsync strategy set to {}", normalized), + }); + } + + // Check for batched configuration with custom parameters + if self.batched_pattern.is_match(line) { + let mut concept_path = path_segments.to_vec(); + concept_path.push("wal".to_string()); + concept_path.push("durability".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "strategy".to_string(), + value: ObjectValue::Text("batched".to_string()), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence: confidence * 0.9, // Slightly lower since we're not parsing params + description: "WAL durability set to batched with custom parameters".to_string(), + }); + } + } + + claims + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_rust_eventual() { + let extractor = DurabilityConfigExtractor::new(); + let content = r#" + let journal = Journal::open(&wal_path) + .with_durability(DurabilityLevel::Eventual); + "#; + + let claims = extractor.extract( + &["rust".to_string(), "myproject".to_string()], + content, + Language::Rust, + "src/wal.rs", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].predicate, "strategy"); + if let ObjectValue::Text(ref value) = claims[0].value { + assert_eq!(value, "eventual"); + } else { + panic!("Expected Text value"); + } + assert!(claims[0].concept_path.contains("wal/durability")); + assert_eq!(claims[0].confidence, 1.0); + } + + #[test] + fn test_rust_batched() { + let extractor = DurabilityConfigExtractor::new(); + let content = r#" + DurabilityLevel::Batched { max_writes: 100, max_duration: Duration::from_secs(1) } + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/config.rs", + ); + + assert_eq!(claims.len(), 1); + if let ObjectValue::Text(ref value) = claims[0].value { + assert_eq!(value, "batched"); + } else { + panic!("Expected Text value"); + } + } + + #[test] + fn test_rust_immediate() { + let extractor = DurabilityConfigExtractor::new(); + let content = r#" + let guard = FsyncGuard::new(file, path, DurabilityLevel::Immediate); + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/guard.rs", + ); + + assert_eq!(claims.len(), 1); + if let ObjectValue::Text(ref value) = claims[0].value { + assert_eq!(value, "immediate"); + } else { + panic!("Expected Text value"); + } + } + + #[test] + fn test_yaml_config() { + let extractor = DurabilityConfigExtractor::new(); + let content = r#" +wal: + durability: "eventual" + max_size: 1GB +"#; + + let claims = extractor.extract( + &["config".to_string()], + content, + Language::Yaml, + "config/storage.yaml", + ); + + assert_eq!(claims.len(), 1); + if let ObjectValue::Text(ref value) = claims[0].value { + assert_eq!(value, "eventual"); + } else { + panic!("Expected Text value"); + } + } + + #[test] + fn test_toml_fsync_none() { + let extractor = DurabilityConfigExtractor::new(); + let content = r#" +[wal] +fsync_strategy = "none" +max_file_size = 104857600 +"#; + + let claims = extractor.extract( + &["config".to_string()], + content, + Language::Toml, + "config.toml", + ); + + assert_eq!(claims.len(), 1); + if let ObjectValue::Text(ref value) = claims[0].value { + assert_eq!(value, "eventual"); // Normalized from "none" to "eventual" + } else { + panic!("Expected Text value"); + } + } + + #[test] + fn test_batched_with_params() { + let extractor = DurabilityConfigExtractor::new(); + let content = r#" + let level = DurabilityLevel::batched_with(50, Duration::from_millis(100)); + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/journal.rs", + ); + + assert_eq!(claims.len(), 1); + if let ObjectValue::Text(ref value) = claims[0].value { + assert_eq!(value, "batched"); + } else { + panic!("Expected Text value"); + } + } + + #[test] + fn test_confidence_in_test_file() { + let extractor = DurabilityConfigExtractor::new(); + let content = r#" + let journal = Journal::open(&wal_path) + .with_durability(DurabilityLevel::Eventual); + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/wal_test.rs", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].confidence, 0.5); // Test file gets reduced confidence + } + + #[test] + fn test_multiple_durability_settings() { + let extractor = DurabilityConfigExtractor::new(); + let content = r#" + if testing { + journal.with_durability(DurabilityLevel::Eventual); + } else { + journal.with_durability(DurabilityLevel::Immediate); + } + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/config.rs", + ); + + assert_eq!(claims.len(), 2); + // Should detect both eventual and immediate + let values: Vec<_> = claims + .iter() + .filter_map(|c| { + if let ObjectValue::Text(ref v) = c.value { + Some(v.as_str()) + } else { + None + } + }) + .collect(); + assert!(values.contains(&"eventual")); + assert!(values.contains(&"immediate")); + } +} diff --git a/applications/aphoria/src/extractors/import_graph.rs b/applications/aphoria/src/extractors/import_graph.rs new file mode 100644 index 0000000..3d37e8f --- /dev/null +++ b/applications/aphoria/src/extractors/import_graph.rs @@ -0,0 +1,301 @@ +//! Import graph extractor for Rust. +//! +//! Tracks `use` statements to detect architecture boundaries and dependency patterns. +//! Enables learning loop conventions like "core never imports tokio" or +//! "all message types import serde". + +use regex::Regex; +use stemedb_core::types::ObjectValue; + +use super::Extractor; +use crate::types::{ExtractedClaim, Language}; + +/// Extractor for Rust import patterns. +/// +/// Detects `use` statements to track which modules import which crates. +/// This enables the learning loop to establish and enforce architecture boundaries. +pub struct ImportGraphExtractor { + /// Matches: use crate_name::...; + use_statement: Regex, + /// Matches: use crate::{A, B, C}; + use_group: Regex, +} + +impl Default for ImportGraphExtractor { + fn default() -> Self { + Self::new() + } +} + +impl ImportGraphExtractor { + /// Create a new import graph extractor. + /// + /// # Panics + /// Panics if any regex pattern is invalid (programmer error). + #[allow(clippy::expect_used)] + pub fn new() -> Self { + Self { + // Matches: use tokio::runtime::Runtime; + // Captures the root crate name + use_statement: Regex::new( + r"^\s*(?:pub\s+)?use\s+([a-zA-Z_][a-zA-Z0-9_]*)" + ) + .expect("valid regex"), + + // For grouped imports: use tokio::{...}; + use_group: Regex::new( + r"^\s*(?:pub\s+)?use\s+([a-zA-Z_][a-zA-Z0-9_]*)::\{" + ) + .expect("valid regex"), + } + } + + /// Extract the root crate name from a use statement. + fn extract_crate_name(&self, line: &str) -> Option { + // Try regular use statement first + if let Some(cap) = self.use_statement.captures(line) { + let crate_name = cap.get(1)?.as_str(); + + // Filter out relative imports and standard patterns + if crate_name == "crate" || crate_name == "self" || crate_name == "super" { + return None; + } + + return Some(crate_name.to_string()); + } + + // Try grouped import + if let Some(cap) = self.use_group.captures(line) { + let crate_name = cap.get(1)?.as_str(); + + if crate_name == "crate" || crate_name == "self" || crate_name == "super" { + return None; + } + + return Some(crate_name.to_string()); + } + + None + } + + /// Determine confidence based on context. + fn confidence_for_file(&self, file: &str) -> f32 { + if file.contains("test") || file.contains("example") || file.contains("bench") { + 0.5 // Test/example imports don't reflect production architecture + } else { + 1.0 + } + } +} + +impl Extractor for ImportGraphExtractor { + fn name(&self) -> &str { + "import_graph" + } + + fn languages(&self) -> &[Language] { + &[Language::Rust] + } + + fn extract( + &self, + path_segments: &[String], + content: &str, + _language: Language, + file: &str, + ) -> Vec { + let mut claims = Vec::new(); + let confidence = self.confidence_for_file(file); + + // Track unique imports to avoid duplicate claims + let mut seen_imports = std::collections::HashSet::new(); + + for (line_idx, line) in content.lines().enumerate() { + let line_num = line_idx + 1; + + if let Some(crate_name) = self.extract_crate_name(line) { + // Only create one claim per imported crate per file + if !seen_imports.contains(&crate_name) { + seen_imports.insert(crate_name.clone()); + + let mut concept_path = path_segments.to_vec(); + concept_path.push("imports".to_string()); + concept_path.push(crate_name.clone()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "imported".to_string(), + value: ObjectValue::Boolean(true), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence, + description: format!("Module imports {}", crate_name), + }); + } + } + } + + claims + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_use_statement() { + let extractor = ImportGraphExtractor::new(); + let content = r#" +use tokio::runtime::Runtime; +use serde::{Serialize, Deserialize}; +use std::sync::Arc; + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string(), "core".to_string()], + content, + Language::Rust, + "src/lib.rs", + ); + + assert_eq!(claims.len(), 3); + + // Check that we captured the right crates + let crate_names: Vec<_> = claims.iter() + .filter_map(|c| c.concept_path.split('/').last()) + .collect(); + + assert!(crate_names.contains(&"tokio")); + assert!(crate_names.contains(&"serde")); + assert!(crate_names.contains(&"std")); + } + + #[test] + fn test_pub_use() { + let extractor = ImportGraphExtractor::new(); + let content = r#" +pub use tokio::sync::Mutex; + "#; + + let claims = extractor.extract( + &["rust".to_string(), "myproject".to_string()], + content, + Language::Rust, + "src/lib.rs", + ); + + assert_eq!(claims.len(), 1); + assert!(claims[0].concept_path.contains("tokio")); + } + + #[test] + fn test_ignores_relative_imports() { + let extractor = ImportGraphExtractor::new(); + let content = r#" +use crate::wallet::Wallet; +use super::common; +use self::internal; + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/lib.rs", + ); + + // Should not create claims for crate/super/self + assert_eq!(claims.len(), 0); + } + + #[test] + fn test_deduplication() { + let extractor = ImportGraphExtractor::new(); + let content = r#" +use tokio::runtime::Runtime; +use tokio::sync::Mutex; +use tokio::time::sleep; + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/lib.rs", + ); + + // Should only create one claim for "tokio" even though it's imported 3 times + assert_eq!(claims.len(), 1); + assert!(claims[0].concept_path.contains("tokio")); + } + + #[test] + fn test_confidence_in_test_file() { + let extractor = ImportGraphExtractor::new(); + let content = r#" +use tokio::runtime::Runtime; + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/wallet_test.rs", + ); + + assert_eq!(claims.len(), 1); + assert_eq!(claims[0].confidence, 0.5); // Test file gets reduced confidence + } + + #[test] + fn test_real_world_example() { + let extractor = ImportGraphExtractor::new(); + let content = r#" +//! Wallet module for Maxwell. + +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use serde::{Serialize, Deserialize}; + +#[derive(Serialize, Deserialize)] +pub struct Wallet { + balance: AtomicU64, +} + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string(), "wallet".to_string()], + content, + Language::Rust, + "src/wallet.rs", + ); + + // Should capture std and serde, but deduplicate std + assert_eq!(claims.len(), 2); + assert!(claims.iter().any(|c| c.concept_path.contains("std"))); + assert!(claims.iter().any(|c| c.concept_path.contains("serde"))); + } + + #[test] + fn test_concept_path_structure() { + let extractor = ImportGraphExtractor::new(); + let content = r#" +use tokio::runtime::Runtime; + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string(), "core".to_string()], + content, + Language::Rust, + "src/core/mod.rs", + ); + + assert_eq!(claims.len(), 1); + // Should be: code://rust/maxwell/core/imports/tokio + assert_eq!(claims[0].concept_path, "code://rust/maxwell/core/imports/tokio"); + assert_eq!(claims[0].predicate, "imported"); + assert_eq!(claims[0].value, ObjectValue::Boolean(true)); + } +} diff --git a/applications/aphoria/src/extractors/mod.rs b/applications/aphoria/src/extractors/mod.rs index 3b79ea8..73d9447 100644 --- a/applications/aphoria/src/extractors/mod.rs +++ b/applications/aphoria/src/extractors/mod.rs @@ -8,6 +8,7 @@ //! - `timeout_config`: HTTP/DB/Redis timeout values //! - `dep_versions`: Dependency versions for advisory lookup //! - `cors_config`: CORS allow-origin settings +//! - `durability_config`: WAL durability/fsync strategy settings //! - `rate_limit`: Rate limiting configuration //! - `weak_crypto`: Weak cryptographic algorithms (MD5, SHA1, DES, RC4) //! - `sql_injection`: SQL query construction with string interpolation @@ -17,6 +18,12 @@ //! - `unreal_performance`: Unreal Engine performance pitfalls (Sync loading) //! - `high_entropy_secrets`: High-entropy strings likely to be leaked secrets //! - `auth_bypass`: Authentication bypass patterns (hardcoded creds, debug auth) +//! - `api_key_security`: API key authentication and rate limiting misconfigurations +//! - `import_graph`: Rust `use` statements for architecture boundary tracking +//! - `derive_pattern`: Rust `#[derive(...)]` annotations for API consistency +//! - `const_declarations`: Rust `const`/`static` declarations for provenance tracking +//! - `unsafe_atomic`: Rust `unsafe` blocks and `Ordering::*` patterns for safety conventions +//! - `circuit_breaker_config`: Circuit breaker disabled or missing //! - `insecure_cookies`: Cookies missing Secure/HttpOnly flags //! - `path_traversal`: File operations with user-controlled paths //! - `unvalidated_redirects`: HTTP redirects with user-controlled URLs @@ -46,21 +53,27 @@ //! Users can also define custom extractors via `aphoria.toml` without writing //! Rust code. See [`DeclarativeExtractor`] for details. +mod api_key_security; mod aspnet_security; mod auth_bypass; +mod circuit_breaker_config; mod command_injection; mod config_parser; mod config_security; +mod const_declarations; mod cors_config; mod declarative; mod dep_versions; +mod derive_pattern; mod django_security; +mod durability_config; mod express_security; mod fastapi_security; mod flask_security; mod hardcoded_secrets; mod high_entropy; mod ignore_comments; +mod import_graph; mod insecure_cookies; mod insecure_deserialization; mod jwt_config; @@ -81,6 +94,7 @@ mod tls_verify; mod tls_version; mod traits; mod unreal_config; +mod unsafe_atomic; mod unreal_cpp; mod unreal_performance; mod unvalidated_redirects; @@ -88,23 +102,29 @@ mod weak_crypto; mod weak_password; mod xxe; +pub use api_key_security::ApiKeySecurityExtractor; pub use aspnet_security::AspNetSecurityExtractor; pub use auth_bypass::AuthBypassExtractor; +pub use circuit_breaker_config::CircuitBreakerConfigExtractor; pub use command_injection::CommandInjectionExtractor; pub use config_parser::{parse_config, walk_config, ConfigParseError, ConfigValue}; pub use config_security::ConfigSecurityExtractor; +pub use const_declarations::ConstDeclarationsExtractor; pub use cors_config::CorsConfigExtractor; pub use declarative::{ DeclarativeClaimDef, DeclarativeExtractor, DeclarativeExtractorDef, DeclarativeValue, }; pub use dep_versions::DepVersionsExtractor; +pub use derive_pattern::DerivePatternExtractor; pub use django_security::DjangoSecurityExtractor; +pub use durability_config::DurabilityConfigExtractor; pub use express_security::ExpressSecurityExtractor; pub use fastapi_security::FastApiSecurityExtractor; pub use flask_security::FlaskSecurityExtractor; pub use hardcoded_secrets::HardcodedSecretsExtractor; pub use high_entropy::HighEntropySecretsExtractor; pub use ignore_comments::IgnoreCommentParser; +pub use import_graph::ImportGraphExtractor; pub use insecure_cookies::InsecureCookiesExtractor; pub use insecure_deserialization::InsecureDeserializationExtractor; pub use jwt_config::JwtConfigExtractor; @@ -127,6 +147,7 @@ pub use traits::{build_claim, is_test_file, Extractor}; pub use unreal_config::UnrealConfigExtractor; pub use unreal_cpp::UnrealCppExtractor; pub use unreal_performance::UnrealPerformanceExtractor; +pub use unsafe_atomic::UnsafeAtomicExtractor; pub use unvalidated_redirects::UnvalidatedRedirectsExtractor; pub use weak_crypto::WeakCryptoExtractor; pub use weak_password::WeakPasswordExtractor; diff --git a/applications/aphoria/src/extractors/registry.rs b/applications/aphoria/src/extractors/registry.rs index 0682b3a..892cfc2 100644 --- a/applications/aphoria/src/extractors/registry.rs +++ b/applications/aphoria/src/extractors/registry.rs @@ -5,20 +5,26 @@ use tracing::instrument; use crate::config::AphoriaConfig; use crate::types::{ExtractedClaim, Language}; +use super::api_key_security::ApiKeySecurityExtractor; use super::aspnet_security::AspNetSecurityExtractor; use super::auth_bypass::AuthBypassExtractor; +use super::circuit_breaker_config::CircuitBreakerConfigExtractor; use super::command_injection::CommandInjectionExtractor; use super::config_security::ConfigSecurityExtractor; +use super::const_declarations::ConstDeclarationsExtractor; use super::cors_config::CorsConfigExtractor; use super::declarative::{DeclarativeExtractor, DeclarativeExtractorDef}; use super::dep_versions::DepVersionsExtractor; +use super::derive_pattern::DerivePatternExtractor; use super::django_security::DjangoSecurityExtractor; +use super::durability_config::DurabilityConfigExtractor; use super::express_security::ExpressSecurityExtractor; use super::fastapi_security::FastApiSecurityExtractor; use super::flask_security::FlaskSecurityExtractor; use super::hardcoded_secrets::HardcodedSecretsExtractor; use super::high_entropy::HighEntropySecretsExtractor; use super::ignore_comments::IgnoreCommentParser; +use super::import_graph::ImportGraphExtractor; use super::insecure_cookies::InsecureCookiesExtractor; use super::insecure_deserialization::InsecureDeserializationExtractor; use super::jwt_config::JwtConfigExtractor; @@ -40,6 +46,7 @@ use super::traits::Extractor; use super::unreal_config::UnrealConfigExtractor; use super::unreal_cpp::UnrealCppExtractor; use super::unreal_performance::UnrealPerformanceExtractor; +use super::unsafe_atomic::UnsafeAtomicExtractor; use super::unvalidated_redirects::UnvalidatedRedirectsExtractor; use super::weak_crypto::WeakCryptoExtractor; use super::weak_password::WeakPasswordExtractor; @@ -97,12 +104,15 @@ impl ExtractorRegistry { }; extractors.push(Box::new(TimeoutConfigExtractor::new(thresholds))); } - if is_enabled("dep_versions") { + if is_enabled("dep_versions") && config.extractors.dep_versions.enabled { extractors.push(Box::new(DepVersionsExtractor::new())); } if is_enabled("cors_config") { extractors.push(Box::new(CorsConfigExtractor::new())); } + if is_enabled("durability_config") { + extractors.push(Box::new(DurabilityConfigExtractor::new())); + } if is_enabled("rate_limit") { extractors.push(Box::new(RateLimitExtractor::default())); } @@ -133,6 +143,24 @@ impl ExtractorRegistry { if is_enabled("auth_bypass") { extractors.push(Box::new(AuthBypassExtractor::new())); } + if is_enabled("api_key_security") { + extractors.push(Box::new(ApiKeySecurityExtractor::new())); + } + if is_enabled("circuit_breaker_config") { + extractors.push(Box::new(CircuitBreakerConfigExtractor::new())); + } + if is_enabled("import_graph") { + extractors.push(Box::new(ImportGraphExtractor::new())); + } + if is_enabled("derive_pattern") { + extractors.push(Box::new(DerivePatternExtractor::new())); + } + if is_enabled("const_declarations") { + extractors.push(Box::new(ConstDeclarationsExtractor::new())); + } + if is_enabled("unsafe_atomic") { + extractors.push(Box::new(UnsafeAtomicExtractor::new())); + } if is_enabled("insecure_cookies") { extractors.push(Box::new(InsecureCookiesExtractor::new())); } @@ -288,7 +316,15 @@ mod tests { /// Number of built-in extractors (not counting declarative). /// Phase 8.2 added 10 framework-specific extractors: 26 + 10 = 36 - const BUILTIN_EXTRACTOR_COUNT: usize = 36; + /// dep_versions is now opt-in (disabled by default): 36 - 1 = 35 + /// durability_config added: 35 + 1 = 36 + /// api_key_security added: 36 + 1 = 37 + /// circuit_breaker_config added: 37 + 1 = 38 + /// import_graph added: 38 + 1 = 39 + /// derive_pattern added: 39 + 1 = 40 + /// const_declarations added: 40 + 1 = 41 + /// unsafe_atomic added: 41 + 1 = 42 + const BUILTIN_EXTRACTOR_COUNT: usize = 42; #[test] fn test_registry_creation() { @@ -320,8 +356,15 @@ mod tests { assert!(!rust_extractors.is_empty()); let cargo_extractors = registry.for_language(Language::CargoManifest); - // Only dep_versions works on Cargo.toml - assert!(cargo_extractors.iter().any(|e| e.name() == "dep_versions")); + // dep_versions is disabled by default (opt-in only) + assert!(!cargo_extractors.iter().any(|e| e.name() == "dep_versions")); + + // Test with dep_versions explicitly enabled + let mut config_with_deps = AphoriaConfig::default(); + config_with_deps.extractors.dep_versions.enabled = true; + let registry_with_deps = ExtractorRegistry::new(&config_with_deps); + let cargo_extractors_enabled = registry_with_deps.for_language(Language::CargoManifest); + assert!(cargo_extractors_enabled.iter().any(|e| e.name() == "dep_versions")); } #[test] diff --git a/applications/aphoria/src/extractors/unsafe_atomic.rs b/applications/aphoria/src/extractors/unsafe_atomic.rs new file mode 100644 index 0000000..f5fd7f5 --- /dev/null +++ b/applications/aphoria/src/extractors/unsafe_atomic.rs @@ -0,0 +1,329 @@ +//! Unsafe and atomic patterns extractor for Rust. +//! +//! Tracks `unsafe` blocks and `Ordering::*` patterns for correctness conventions. +//! Enables learning loop to establish patterns like: +//! - "All wallet operations use Ordering::SeqCst" +//! - "Unsafe code requires documented safety invariants" + +use regex::Regex; +use stemedb_core::types::ObjectValue; + +use super::Extractor; +use crate::types::{ExtractedClaim, Language}; + +/// Extractor for unsafe blocks and atomic ordering patterns. +/// +/// Detects safety-critical patterns in Rust code to enable +/// correctness conventions. +pub struct UnsafeAtomicExtractor { + /// Matches: Ordering::SeqCst, Ordering::Relaxed, etc. + ordering_pattern: Regex, + /// Matches: unsafe { ... } or unsafe fn + unsafe_keyword: Regex, +} + +impl Default for UnsafeAtomicExtractor { + fn default() -> Self { + Self::new() + } +} + +impl UnsafeAtomicExtractor { + /// Create a new unsafe/atomic extractor. + /// + /// # Panics + /// Panics if any regex pattern is invalid (programmer error). + #[allow(clippy::expect_used)] + pub fn new() -> Self { + Self { + // Ordering::SeqCst, Ordering::Relaxed, etc. + ordering_pattern: Regex::new( + r"Ordering::(SeqCst|Acquire|Release|AcqRel|Relaxed)" + ) + .expect("valid regex"), + + // unsafe keyword (blocks or functions) + unsafe_keyword: Regex::new( + r"\b(unsafe)\s*(\{|fn)" + ) + .expect("valid regex"), + } + } + + /// Determine confidence based on context. + fn confidence_for_file(&self, file: &str) -> f32 { + if file.contains("test") || file.contains("example") || file.contains("bench") { + 0.5 + } else { + 1.0 + } + } +} + +impl Extractor for UnsafeAtomicExtractor { + fn name(&self) -> &str { + "unsafe_atomic" + } + + fn languages(&self) -> &[Language] { + &[Language::Rust] + } + + fn extract( + &self, + path_segments: &[String], + content: &str, + _language: Language, + file: &str, + ) -> Vec { + let mut claims = Vec::new(); + let confidence = self.confidence_for_file(file); + + // Track unique patterns to avoid excessive claims + let mut seen_orderings = std::collections::HashSet::new(); + let mut unsafe_count = 0; + + for (line_idx, line) in content.lines().enumerate() { + let line_num = line_idx + 1; + + // Check for atomic ordering patterns + if let Some(cap) = self.ordering_pattern.captures(line) { + let ordering = cap.get(1).map_or("", |m| m.as_str()); + + if !seen_orderings.contains(ordering) { + seen_orderings.insert(ordering.to_string()); + + let mut concept_path = path_segments.to_vec(); + concept_path.push("atomics".to_string()); + concept_path.push("ordering".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "pattern".to_string(), + value: ObjectValue::Text(ordering.to_string()), + file: file.to_string(), + line: line_num, + matched_text: line.trim().to_string(), + confidence, + description: format!("Atomic operation uses Ordering::{}", ordering), + }); + } + } + + // Check for unsafe blocks/functions + if self.unsafe_keyword.is_match(line) { + unsafe_count += 1; + } + } + + // Add a summary claim for unsafe usage if found + if unsafe_count > 0 { + let mut concept_path = path_segments.to_vec(); + concept_path.push("unsafe".to_string()); + concept_path.push("count".to_string()); + + claims.push(ExtractedClaim { + concept_path: format!("code://{}", concept_path.join("/")), + predicate: "occurrences".to_string(), + value: ObjectValue::Number(unsafe_count as f64), + file: file.to_string(), + line: 1, + matched_text: format!("{} unsafe blocks/functions", unsafe_count), + confidence: confidence * 0.9, // Slightly lower as this is a summary + description: format!("File contains {} unsafe block(s) or function(s)", unsafe_count), + }); + } + + claims + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_atomic_ordering() { + let extractor = UnsafeAtomicExtractor::new(); + let content = r#" +let balance = self.balance.load(Ordering::SeqCst); +self.balance.store(new_balance, Ordering::SeqCst); + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string(), "wallet".to_string()], + content, + Language::Rust, + "src/wallet.rs", + ); + + // Should have one claim for SeqCst (deduplicated) + assert!(claims.iter().any(|c| { + c.concept_path.contains("atomics/ordering") && + c.value == ObjectValue::Text("SeqCst".to_string()) + })); + } + + #[test] + fn test_multiple_orderings() { + let extractor = UnsafeAtomicExtractor::new(); + let content = r#" +let a = atomic.load(Ordering::Acquire); +let b = atomic.load(Ordering::Relaxed); +atomic.store(x, Ordering::Release); + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/sync.rs", + ); + + // Should have 3 distinct ordering claims (Acquire, Relaxed, Release) + let ordering_claims: Vec<_> = claims.iter() + .filter(|c| c.concept_path.contains("ordering")) + .collect(); + + assert_eq!(ordering_claims.len(), 3); + } + + #[test] + fn test_unsafe_block() { + let extractor = UnsafeAtomicExtractor::new(); + let content = r#" +unsafe { + let ptr = mem::transmute(addr); +} + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/lib.rs", + ); + + // Should have one unsafe count claim + let unsafe_claim = claims.iter().find(|c| c.concept_path.contains("unsafe/count")); + assert!(unsafe_claim.is_some()); + assert_eq!(unsafe_claim.unwrap().value, ObjectValue::Number(1.0)); + } + + #[test] + fn test_unsafe_fn() { + let extractor = UnsafeAtomicExtractor::new(); + let content = r#" +unsafe fn read_msr(reg: u32) -> u64 { + // ... +} + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/msr.rs", + ); + + let unsafe_claim = claims.iter().find(|c| c.concept_path.contains("unsafe")); + assert!(unsafe_claim.is_some()); + } + + #[test] + fn test_multiple_unsafe_blocks() { + let extractor = UnsafeAtomicExtractor::new(); + let content = r#" +unsafe fn foo() {} + +fn bar() { + unsafe { + // block 1 + } + + unsafe { + // block 2 + } +} + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/lib.rs", + ); + + let unsafe_claim = claims.iter().find(|c| c.concept_path.contains("unsafe/count")).unwrap(); + assert_eq!(unsafe_claim.value, ObjectValue::Number(3.0)); // 1 fn + 2 blocks + } + + #[test] + fn test_confidence_in_test_file() { + let extractor = UnsafeAtomicExtractor::new(); + let content = r#" +unsafe { test_something(); } + "#; + + let claims = extractor.extract( + &["rust".to_string()], + content, + Language::Rust, + "src/test.rs", + ); + + assert!(!claims.is_empty()); + // Confidence should be reduced for test files + assert!(claims.iter().all(|c| c.confidence <= 0.5)); + } + + #[test] + fn test_real_world_wallet() { + let extractor = UnsafeAtomicExtractor::new(); + let content = r#" +//! Wallet with atomic balance tracking + +use std::sync::atomic::{AtomicU64, Ordering}; + +pub struct Wallet { + balance: AtomicU64, +} + +impl Wallet { + pub fn deposit(&self, amount: u64) { + self.balance.fetch_add(amount, Ordering::SeqCst); + } + + pub fn withdraw(&self, amount: u64) -> bool { + let current = self.balance.load(Ordering::SeqCst); + if current >= amount { + self.balance.fetch_sub(amount, Ordering::SeqCst); + true + } else { + false + } + } + + pub fn balance(&self) -> u64 { + self.balance.load(Ordering::SeqCst); + } +} + "#; + + let claims = extractor.extract( + &["rust".to_string(), "maxwell".to_string(), "wallet".to_string()], + content, + Language::Rust, + "src/wallet.rs", + ); + + // Should detect SeqCst ordering (all wallet ops use it consistently) + assert!(claims.iter().any(|c| + c.concept_path.contains("ordering") && + c.value == ObjectValue::Text("SeqCst".to_string()) + )); + + // Should NOT have unsafe claims (no unsafe code) + assert!(!claims.iter().any(|c| c.concept_path.contains("unsafe"))); + } +}