stemedb/applications/aphoria/src/extractors/traits.rs

//! Core extractor trait and helper functions.

use stemedb_core::types::ObjectValue;

use crate::types::{Language, Observation};

// ============================================================================
// Shared Utilities for Extractors
// ============================================================================

/// Check if a file path indicates a test file.
///
/// Used by extractors to lower confidence for test fixtures since
/// hardcoded values in tests are often intentional.
///
/// The check is a case-insensitive *substring* search over the whole path for
/// any of the markers `test`, `spec`, `example`, `fixture` or `mock`. Suffix
/// conventions such as `_test.go`, `_test.py` and `_test.rs` are already
/// covered by the `test` marker, so they need no dedicated check.
///
/// Because matching is substring-based, unrelated names that merely embed a
/// marker (for example `latest.rs` or `inspector.rs`) are reported as test
/// files too.
///
/// # Arguments
///
/// * `file` - The file path to classify.
///
/// # Returns
///
/// `true` if the lowercased path contains at least one marker.
pub fn is_test_file(file: &str) -> bool {
    // Lowercase markers; compared against the lowercased path below.
    const TEST_MARKERS: [&str; 5] = ["test", "spec", "example", "fixture", "mock"];

    let lower = file.to_lowercase();
    TEST_MARKERS.iter().any(|marker| lower.contains(*marker))
}

/// Assemble an [`Observation`] in the standard shape shared by extractors.
///
/// The helper takes care of three things:
/// - Rendering the concept path as `code://segment1/segment2/...`, with
///   `path_segments` first and `leaf_segments` appended after them
/// - Halving `base_confidence` when [`is_test_file`] reports `file` as a test file
/// - Copying the remaining arguments into owned fields of the observation
// The argument list mirrors the fields of `Observation` one-to-one, hence the
// lint exemption.
#[allow(clippy::too_many_arguments)]
pub fn build_claim(
    path_segments: &[String],
    leaf_segments: &[&str],
    predicate: &str,
    value: ObjectValue,
    file: &str,
    line: usize,
    matched_text: &str,
    base_confidence: f32,
    description: &str,
) -> Observation {
    // File-derived segments come first, extractor-specific leaves last.
    let joined_path = path_segments
        .iter()
        .map(String::as_str)
        .chain(leaf_segments.iter().copied())
        .collect::<Vec<&str>>()
        .join("/");

    // Values found in test files get half the confidence.
    let mut confidence = base_confidence;
    if is_test_file(file) {
        confidence *= 0.5;
    }

    Observation {
        concept_path: format!("code://{}", joined_path),
        predicate: predicate.to_string(),
        value,
        file: file.to_string(),
        line,
        matched_text: matched_text.to_string(),
        confidence,
        description: description.to_string(),
    }
}

/// Common interface implemented by every observation extractor.
///
/// An extractor scans the content of a file and reports observations about
/// the implicit decisions it finds there.
pub trait Extractor: Send + Sync {
    /// The unique identifier of this extractor.
    fn name(&self) -> &str;

    /// The file types this extractor operates on.
    fn languages(&self) -> &[Language];

    /// Extract observations from the content of a single file.
    ///
    /// # Arguments
    ///
    /// * `path_segments` - ConceptPath segments derived from the file's location
    /// * `content` - The file content as a string
    /// * `language` - The detected language of the file
    /// * `file` - The relative file path
    ///
    /// # Returns
    ///
    /// Zero or more extracted observations.
    fn extract(
        &self,
        path_segments: &[String],
        content: &str,
        language: Language,
        file: &str,
    ) -> Vec<Observation>;

    /// Declare which observation predicates this extractor can verify.
    ///
    /// Each entry is a `(tail_path_suffix, predicate)` pair describing a
    /// concept path and predicate this extractor produces. `verify map` uses
    /// these pairs to show extractor→claim coverage.
    ///
    /// A tail-path suffix is made of the last 2 segments of the concept path.
    /// Wildcards are supported: `"imports/*"` matches `"imports/tokio"`, etc.
    ///
    /// The default implementation returns an empty list, which keeps existing
    /// observation-only extractors backward compatible.
    fn verifiable_predicates(&self) -> Vec<(&str, &str)> {
        Vec::new()
    }

    /// Lightweight string patterns used to pre-screen file content.
    ///
    /// The registry compiles these into a `RegexSet` for one-pass DFA matching.
    /// If *any* pattern matches the file content, this extractor is selected to run.
    ///
    /// An empty list (the default) means the extractor **always runs** on
    /// files of a matching language — appropriate for extractors that are
    /// cheap or hard to pre-screen.
    fn screening_patterns(&self) -> Vec<&str> {
        Vec::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `is_test_file` against known test and
    /// production paths.
    #[test]
    fn test_is_test_file() {
        // Paths that must be classified as test files.
        let test_paths = [
            "tests/test_auth.py",
            "src/__tests__/api.spec.js",
            "examples/demo.rs",
            "fixtures/data.json",
            "mocks/handler.ts",
            "auth_test.go",
            "auth_test.py",
            "auth_test.rs",
        ];
        for &path in &test_paths {
            assert!(is_test_file(path), "expected `{}` to be classified as a test file", path);
        }

        // Paths that must be classified as production files.
        let production_paths = ["src/auth.py", "handler.go", "config.yaml"];
        for &path in &production_paths {
            assert!(!is_test_file(path), "expected `{}` to be classified as production", path);
        }
    }
}