//! Core extractor trait and helper functions. use stemedb_core::types::ObjectValue; use crate::types::{ExtractedClaim, Language}; // ============================================================================ // Shared Utilities for Extractors // ============================================================================ /// Check if a file path indicates a test file. /// /// Used by extractors to lower confidence for test fixtures since /// hardcoded values in tests are often intentional. pub fn is_test_file(file: &str) -> bool { let lower = file.to_lowercase(); lower.contains("test") || lower.contains("spec") || lower.contains("example") || lower.contains("fixture") || lower.contains("mock") || lower.contains("_test.") || lower.ends_with("_test.py") || lower.ends_with("_test.go") || lower.ends_with("_test.rs") } /// Build an extracted claim with consistent formatting. /// /// This is a helper for extractors to create claims with: /// - Consistent concept path format (`code://segment1/segment2/...`) /// - Automatic confidence reduction for test files /// - Standard claim structure #[allow(clippy::too_many_arguments)] pub fn build_claim( path_segments: &[String], leaf_segments: &[&str], predicate: &str, value: ObjectValue, file: &str, line: usize, matched_text: &str, base_confidence: f32, description: &str, ) -> ExtractedClaim { let mut concept_path = path_segments.to_vec(); for segment in leaf_segments { concept_path.push((*segment).to_string()); } let confidence = if is_test_file(file) { base_confidence * 0.5 } else { base_confidence }; ExtractedClaim { concept_path: format!("code://{}", concept_path.join("/")), predicate: predicate.to_string(), value, file: file.to_string(), line, matched_text: matched_text.to_string(), confidence, description: description.to_string(), } } /// Trait for claim extractors. /// /// Extractors scan file content and return claims about implicit decisions. pub trait Extractor: Send + Sync { /// Unique identifier for this extractor. fn name(&self) -> &str; /// File types this extractor operates on. fn languages(&self) -> &[Language]; /// Extract claims from a file's content. /// /// # Arguments /// /// * `path_segments` - ConceptPath segments derived from the file's location /// * `content` - The file content as a string /// * `language` - The detected language of the file /// * `file` - The relative file path /// /// # Returns /// /// Zero or more extracted claims. fn extract( &self, path_segments: &[String], content: &str, language: Language, file: &str, ) -> Vec; } #[cfg(test)] mod tests { use super::*; #[test] fn test_is_test_file() { // Should identify test files assert!(is_test_file("tests/test_auth.py")); assert!(is_test_file("src/__tests__/api.spec.js")); assert!(is_test_file("examples/demo.rs")); assert!(is_test_file("fixtures/data.json")); assert!(is_test_file("mocks/handler.ts")); assert!(is_test_file("auth_test.go")); assert!(is_test_file("auth_test.py")); assert!(is_test_file("auth_test.rs")); // Should NOT identify production files assert!(!is_test_file("src/auth.py")); assert!(!is_test_file("handler.go")); assert!(!is_test_file("config.yaml")); } }