Complete Aphoria claims system overhaul: - A1: Rename ExtractedClaim to Observation (extractors produce observations, not claims) - A2: Add AuthoredClaim with full provenance, invariants, and authority tiers - A3: Verify engine comparing observations against authored claims, CLI + formatters - A4: Corpus as first-class assertions with predicate indexing, authority lens, trust packs - A5: Coverage analysis, explain/docs generation, self-audit extractor, claim suggester skill Also includes: 42 extractors updated for Observation type, verifiable_predicates trait, conflict detection with comparison modes, claims TOML persistence, Grafana dashboard, backup/restore scripts, and comprehensive test coverage. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
143 lines
4.6 KiB
Rust
143 lines
4.6 KiB
Rust
//! Core extractor trait and helper functions.
|
|
|
|
use stemedb_core::types::ObjectValue;
|
|
|
|
use crate::types::{Language, Observation};
|
|
|
|
// ============================================================================
|
|
// Shared Utilities for Extractors
|
|
// ============================================================================
|
|
|
|
/// Report whether a file path looks like test, example, or fixture code.
///
/// Used by extractors to lower confidence for test fixtures since
/// hardcoded values in tests are often intentional.
///
/// The check is a case-insensitive substring match against a small list of
/// markers, so it is loose: paths such as `src/latest.rs` (contains `test`)
/// or `src/inspect.py` (contains `spec`) are also reported as test files.
pub fn is_test_file(file: &str) -> bool {
    // Suffix conventions such as `_test.go`, `_test.py`, `_test.rs` and the
    // generic `_test.` all contain `test`, so the single `test` marker
    // already covers them; no separate suffix checks are needed.
    const MARKERS: [&str; 5] = ["test", "spec", "example", "fixture", "mock"];

    let lower = file.to_lowercase();
    MARKERS.iter().any(|&marker| lower.contains(marker))
}
|
|
|
|
/// Build an observation with consistent formatting.
|
|
///
|
|
/// This is a helper for extractors to create observations with:
|
|
/// - Consistent concept path format (`code://segment1/segment2/...`)
|
|
/// - Automatic confidence reduction for test files
|
|
/// - Standard observation structure
|
|
#[allow(clippy::too_many_arguments)]
|
|
pub fn build_claim(
|
|
path_segments: &[String],
|
|
leaf_segments: &[&str],
|
|
predicate: &str,
|
|
value: ObjectValue,
|
|
file: &str,
|
|
line: usize,
|
|
matched_text: &str,
|
|
base_confidence: f32,
|
|
description: &str,
|
|
) -> Observation {
|
|
let mut concept_path = path_segments.to_vec();
|
|
for segment in leaf_segments {
|
|
concept_path.push((*segment).to_string());
|
|
}
|
|
|
|
let confidence = if is_test_file(file) { base_confidence * 0.5 } else { base_confidence };
|
|
|
|
Observation {
|
|
concept_path: format!("code://{}", concept_path.join("/")),
|
|
predicate: predicate.to_string(),
|
|
value,
|
|
file: file.to_string(),
|
|
line,
|
|
matched_text: matched_text.to_string(),
|
|
confidence,
|
|
description: description.to_string(),
|
|
}
|
|
}
|
|
|
|
/// Trait for observation extractors.
|
|
///
|
|
/// Extractors scan file content and return observations about implicit decisions.
|
|
pub trait Extractor: Send + Sync {
|
|
/// Unique identifier for this extractor.
|
|
fn name(&self) -> &str;
|
|
|
|
/// File types this extractor operates on.
|
|
fn languages(&self) -> &[Language];
|
|
|
|
/// Extract observations from a file's content.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `path_segments` - ConceptPath segments derived from the file's location
|
|
/// * `content` - The file content as a string
|
|
/// * `language` - The detected language of the file
|
|
/// * `file` - The relative file path
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// Zero or more extracted observations.
|
|
fn extract(
|
|
&self,
|
|
path_segments: &[String],
|
|
content: &str,
|
|
language: Language,
|
|
file: &str,
|
|
) -> Vec<Observation>;
|
|
|
|
/// Declare which observation predicates this extractor can verify.
|
|
///
|
|
/// Returns `(tail_path_suffix, predicate)` pairs describing the concept paths
|
|
/// and predicates this extractor produces. Used by `verify map` to show
|
|
/// extractor→claim coverage.
|
|
///
|
|
/// Tail-path suffixes use the last 2 segments of the concept path.
|
|
/// Wildcards are supported: `"imports/*"` matches `"imports/tokio"`, etc.
|
|
///
|
|
/// Default: empty (backward compatible — observation-only extractor).
|
|
fn verifiable_predicates(&self) -> Vec<(&str, &str)> {
|
|
vec![]
|
|
}
|
|
|
|
/// Return lightweight string patterns for pre-screening file content.
|
|
///
|
|
/// The registry compiles these into a `RegexSet` for one-pass DFA matching.
|
|
/// If *any* pattern matches the file content, this extractor is selected to run.
|
|
///
|
|
/// Return `vec![]` (the default) to **always run** this extractor on matching
|
|
/// language files — use this for extractors that are cheap or hard to pre-screen.
|
|
fn screening_patterns(&self) -> Vec<&str> {
|
|
vec![]
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `is_test_file` against known test-like and production paths.
    #[test]
    fn test_is_test_file() {
        // Paths that must be classified as test/example/fixture code.
        let test_like = [
            "tests/test_auth.py",
            "src/__tests__/api.spec.js",
            "examples/demo.rs",
            "fixtures/data.json",
            "mocks/handler.ts",
            "auth_test.go",
            "auth_test.py",
            "auth_test.rs",
        ];
        for path in test_like.iter() {
            assert!(is_test_file(path), "expected a test file: {}", path);
        }

        // Paths that must be classified as production code.
        let production = ["src/auth.py", "handler.go", "config.yaml"];
        for path in production.iter() {
            assert!(!is_test_file(path), "expected a production file: {}", path);
        }
    }
}
|