// stemedb/applications/aphoria/src/extractors/traits.rs

//! Core extractor trait and helper functions.

use stemedb_core::types::ObjectValue;

use crate::types::{ExtractedClaim, Language};

// ============================================================================
// Shared Utilities for Extractors
// ============================================================================

/// Heuristically decide whether `file` looks like test or sample code.
///
/// Extractors use this to lower the confidence of claims found in such
/// files, because hardcoded values there are often intentional.
///
/// The path is lowercased and then searched for the markers `test`, `spec`,
/// `example`, `fixture` and `mock` anywhere in the string. Suffix
/// conventions such as `auth_test.go` or `auth_test.rs` are therefore
/// already covered by the `test` marker.
///
/// Matching is by plain substring, so names that merely contain a marker
/// (for example `latest.rs` or `inspector.rs`) are classified as test files
/// as well.
pub fn is_test_file(file: &str) -> bool {
    const MARKERS: [&str; 5] = ["test", "spec", "example", "fixture", "mock"];

    let normalized = file.to_lowercase();
    MARKERS.iter().any(|&marker| normalized.contains(marker))
}

/// Assemble an [`ExtractedClaim`] in the shape shared by all extractors.
///
/// * The concept path is `code://` followed by `path_segments` and then
///   `leaf_segments`, joined with `/`.
/// * `base_confidence` is halved when [`is_test_file`] reports that `file`
///   looks like test code; otherwise it is used unchanged.
/// * All remaining arguments are copied into the claim as owned values.
#[allow(clippy::too_many_arguments)]
pub fn build_claim(
    path_segments: &[String],
    leaf_segments: &[&str],
    predicate: &str,
    value: ObjectValue,
    file: &str,
    line: usize,
    matched_text: &str,
    base_confidence: f32,
    description: &str,
) -> ExtractedClaim {
    // Fraction of the base confidence kept for claims found in test-like files.
    const TEST_FILE_CONFIDENCE_FACTOR: f32 = 0.5;

    // Location-derived prefix first, extractor-specific leaf segments last.
    let joined_path = path_segments
        .iter()
        .map(String::as_str)
        .chain(leaf_segments.iter().copied())
        .collect::<Vec<&str>>()
        .join("/");

    let mut confidence = base_confidence;
    if is_test_file(file) {
        confidence *= TEST_FILE_CONFIDENCE_FACTOR;
    }

    ExtractedClaim {
        concept_path: format!("code://{}", joined_path),
        predicate: predicate.to_owned(),
        value,
        file: file.to_owned(),
        line,
        matched_text: matched_text.to_owned(),
        confidence,
        description: description.to_owned(),
    }
}

/// Trait for claim extractors.
///
/// Extractors scan file content and return claims about implicit decisions.
///
/// Implementors must be `Send + Sync` (required by the supertrait bounds).
pub trait Extractor: Send + Sync {
    /// Unique identifier for this extractor.
    fn name(&self) -> &str;

    /// File types this extractor operates on.
    ///
    /// NOTE(review): presumably the caller consults this list to decide which
    /// files are handed to [`Extractor::extract`] — confirm against the caller.
    fn languages(&self) -> &[Language];

    /// Extract claims from a file's content.
    ///
    /// # Arguments
    ///
    /// * `path_segments` - ConceptPath segments derived from the file's location
    /// * `content` - The file content as a string
    /// * `language` - The detected language of the file
    /// * `file` - The relative file path
    ///
    /// # Returns
    ///
    /// Zero or more extracted claims. The signature has no error channel, so
    /// an implementation with nothing to report returns an empty `Vec`.
    fn extract(
        &self,
        path_segments: &[String],
        content: &str,
        language: Language,
        file: &str,
    ) -> Vec<ExtractedClaim>;
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `is_test_file` against known test-like and production-like paths.
    #[test]
    fn test_is_test_file() {
        // Paths that must be recognised as test / sample code.
        let test_like = [
            "tests/test_auth.py",
            "src/__tests__/api.spec.js",
            "examples/demo.rs",
            "fixtures/data.json",
            "mocks/handler.ts",
            "auth_test.go",
            "auth_test.py",
            "auth_test.rs",
        ];
        for &path in &test_like {
            assert!(is_test_file(path), "expected `{}` to be classified as a test file", path);
        }

        // Paths that must be treated as production code.
        let production = ["src/auth.py", "handler.go", "config.yaml"];
        for &path in &production {
            assert!(!is_test_file(path), "expected `{}` to be classified as production", path);
        }
    }
}