//! Core extractor trait and helper functions. use stemedb_core::types::ObjectValue; use crate::types::{Language, Observation}; // ============================================================================ // Shared Utilities for Extractors // ============================================================================ /// Check if a file path indicates a test file. /// /// Used by extractors to lower confidence for test fixtures since /// hardcoded values in tests are often intentional. pub fn is_test_file(file: &str) -> bool { let lower = file.to_lowercase(); lower.contains("test") || lower.contains("spec") || lower.contains("example") || lower.contains("fixture") || lower.contains("mock") || lower.contains("_test.") || lower.ends_with("_test.py") || lower.ends_with("_test.go") || lower.ends_with("_test.rs") } /// Build an observation with consistent formatting. /// /// This is a helper for extractors to create observations with: /// - Consistent concept path format (`code://segment1/segment2/...`) /// - Automatic confidence reduction for test files /// - Standard observation structure #[allow(clippy::too_many_arguments)] pub fn build_claim( path_segments: &[String], leaf_segments: &[&str], predicate: &str, value: ObjectValue, file: &str, line: usize, matched_text: &str, base_confidence: f32, description: &str, ) -> Observation { let mut concept_path = path_segments.to_vec(); for segment in leaf_segments { concept_path.push((*segment).to_string()); } let confidence = if is_test_file(file) { base_confidence * 0.5 } else { base_confidence }; Observation { concept_path: format!("code://{}", concept_path.join("/")), predicate: predicate.to_string(), value, file: file.to_string(), line, matched_text: matched_text.to_string(), confidence, description: description.to_string(), } } /// Trait for observation extractors. /// /// Extractors scan file content and return observations about implicit decisions. pub trait Extractor: Send + Sync { /// Unique identifier for this extractor. fn name(&self) -> &str; /// File types this extractor operates on. fn languages(&self) -> &[Language]; /// Extract observations from a file's content. /// /// # Arguments /// /// * `path_segments` - ConceptPath segments derived from the file's location /// * `content` - The file content as a string /// * `language` - The detected language of the file /// * `file` - The relative file path /// /// # Returns /// /// Zero or more extracted observations. fn extract( &self, path_segments: &[String], content: &str, language: Language, file: &str, ) -> Vec; /// Declare which observation predicates this extractor can verify. /// /// Returns `(tail_path_suffix, predicate)` pairs describing the concept paths /// and predicates this extractor produces. Used by `verify map` to show /// extractor→claim coverage. /// /// Tail-path suffixes use the last 2 segments of the concept path. /// Wildcards are supported: `"imports/*"` matches `"imports/tokio"`, etc. /// /// Default: empty (backward compatible — observation-only extractor). fn verifiable_predicates(&self) -> Vec<(&str, &str)> { vec![] } /// Return lightweight string patterns for pre-screening file content. /// /// The registry compiles these into a `RegexSet` for one-pass DFA matching. /// If *any* pattern matches the file content, this extractor is selected to run. /// /// Return `vec![]` (the default) to **always run** this extractor on matching /// language files — use this for extractors that are cheap or hard to pre-screen. fn screening_patterns(&self) -> Vec<&str> { vec![] } } #[cfg(test)] mod tests { use super::*; #[test] fn test_is_test_file() { // Should identify test files assert!(is_test_file("tests/test_auth.py")); assert!(is_test_file("src/__tests__/api.spec.js")); assert!(is_test_file("examples/demo.rs")); assert!(is_test_file("fixtures/data.json")); assert!(is_test_file("mocks/handler.ts")); assert!(is_test_file("auth_test.go")); assert!(is_test_file("auth_test.py")); assert!(is_test_file("auth_test.rs")); // Should NOT identify production files assert!(!is_test_file("src/auth.py")); assert!(!is_test_file("handler.go")); assert!(!is_test_file("config.yaml")); } }