stemedb/applications/aphoria/src/scan/scanner.rs
jml 6430ff0fd6 fix(aphoria): move claims.toml to project root and fix verify integration
## Root Cause
Claims file was in applications/aphoria/.aphoria/ but all commands looked
for .aphoria/claims.toml relative to project root. Additionally, .aphoria/
was fully gitignored, preventing version control of claims.

## Changes

### Path Fixes
- Move claims.toml from applications/aphoria/.aphoria/ to .aphoria/ at project root
- Update .gitignore: .aphoria/ → .aphoria/* with !.aphoria/claims.toml exception
- Now claims can be version controlled while keys remain secret

### Verify Integration (Scanner)
- scanner.rs: Load claims from ClaimsFile and call verify_claims()
- ScanResult: Add verify field with VerifyReport
- Report formatters: Add claim verification sections showing PASS/CONFLICT/MISSING

### Clippy Fix
- report/json.rs: Replace filter().map().expect() with filter_map()

## Verification
- aphoria scan . → Shows claim verification with verdicts
- aphoria verify run → Per-claim verification results
- aphoria verify map → Extractor coverage mapping (7/10 claims = 70%)
- aphoria claims list → Reads from project root
- aphoria claims create → Writes to project root
- All tests pass (1120+ aphoria tests)
- clippy --workspace passes

## Impact
Both primary use cases now work:
1. Day-to-day (commit-time): Skills can read/create claims via CLI
2. Audit (scan-time): Scanner verifies code against authored claims

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-08 11:09:57 +00:00

374 lines
14 KiB
Rust

//! Core scanner logic for conflict detection and observation recording.
use std::collections::HashSet;
use std::path::Path;
use std::time::Instant;
use tracing::{info, instrument};
use crate::bridge::{self, observation_to_assertion};
use crate::claims_file::ClaimsFile;
use crate::config::{AphoriaConfig, SyncMode};
use crate::episteme::{
create_authoritative_corpus, current_timestamp_millis, ConceptIndex, EphemeralDetector,
LocalEpisteme,
};
use crate::error::AphoriaError;
use crate::hosted::HostedClient;
use crate::policy::PolicyManager;
use crate::types::{
ConflictResult, DriftResult, FileSource, Observation, ScanArgs, ScanMode, ScanResult,
ScanTiming,
};
use crate::verify;
use crate::walker::{walk_project, walk_staged_files};
use super::walker::extract_claims_from_files;
/// Result of conflict checking including observation count and drift detection.
///
/// Produced by the conflict-checking helpers in this module and unpacked by
/// `run_scan` when it assembles the final `ScanResult`.
pub(super) struct ConflictCheckResult {
/// Conflicts reported by the detector for the scanned claims.
pub conflicts: Vec<ConflictResult>,
/// Drift results from persistent mode; ephemeral mode always leaves this empty.
pub drifts: Vec<DriftResult>,
/// Number of observations recorded; `0` in ephemeral mode or when sync is off.
pub observations_recorded: usize,
}
/// Run a scan on the specified project.
///
/// This is the main entry point for scanning a codebase. It:
/// 1. Walks the project directory (or only the staged files)
/// 2. Extracts claims from config and code
/// 3. Checks for conflicts against authoritative sources
/// 4. (Optional) Persists claims to Episteme storage if `mode == Persistent`
/// 5. (Optional) Records observations for claims with no conflicts if `sync == true`
/// 6. Verifies authored claims from the project's claims file, when it has any
/// 7. Returns the collected results as a `ScanResult`
///
/// # Scan Modes
///
/// - **Ephemeral** (default): Fast in-memory scan. No disk I/O for Episteme.
///   Uses `EphemeralDetector` for conflict detection. Does not support
///   diff/baseline features or observation write-back.
///
/// - **Persistent**: Full scan with Episteme storage. Enables diff, baseline,
///   alias creation, and observation write-back (when `--sync` is enabled).
///
/// # Benchmark Timing
///
/// When `args.benchmark` is set, the result carries a `ScanTiming`. Its
/// `total_ms` is measured after claim verification so that it covers every
/// scan stage; counting lines of code is benchmark bookkeeping and is excluded.
///
/// # Errors
///
/// Propagates any error from walking the project, extracting claims, checking
/// conflicts, or loading the claims file.
#[instrument(skip(config), fields(path = %args.path.display(), format = %args.format, mode = ?args.mode, sync = args.sync, file_source = ?args.file_source, benchmark = args.benchmark))]
pub async fn run_scan(args: ScanArgs, config: &AphoriaConfig) -> Result<ScanResult, AphoriaError> {
    info!("Starting scan");
    let total_start = Instant::now();
    // Fall back to the path as given when it cannot be canonicalized
    let project_root = args.path.canonicalize().unwrap_or_else(|_| args.path.clone());

    // 1. Walk the project to find files (or just staged files)
    let walk_start = Instant::now();
    let files = match args.file_source {
        FileSource::All => walk_project(&project_root, config)?,
        FileSource::Staged => walk_staged_files(&project_root, config)?,
    };
    let walk_ms = walk_start.elapsed().as_millis() as u64;
    info!(files_found = files.len(), file_source = ?args.file_source, walk_ms, "Project walk complete");

    // 2. Extract claims from files (LLM extraction only in persistent mode)
    let extraction_start = Instant::now();
    let all_claims = extract_claims_from_files(&files, config, args.mode, &project_root)?;
    let extraction_ms = extraction_start.elapsed().as_millis() as u64;
    info!(claims_extracted = all_claims.len(), extraction_ms, "Extraction complete");

    // 3. Check for conflicts - mode determines which path
    let conflict_start = Instant::now();
    let result = check_conflicts(&args, &all_claims, &project_root, config).await?;
    let conflict_ms = conflict_start.elapsed().as_millis() as u64;

    // 4. Verify authored claims against observations
    let claims_path = ClaimsFile::default_path(&project_root);
    let claims_file = ClaimsFile::load(&claims_path)?;
    let verify_report = if claims_file.is_empty() {
        None
    } else {
        info!(claims = claims_file.len(), "Verifying authored claims");
        Some(verify::verify_claims(&claims_file.claims, &all_claims))
    };

    // Taken after verification so the reported total includes every scan stage
    let total_ms = total_start.elapsed().as_millis() as u64;

    // 5. Build timing info (including lines of code) only in benchmark mode
    let timing = args.benchmark.then(|| ScanTiming {
        walk_ms,
        extraction_ms,
        conflict_ms,
        total_ms,
        lines_of_code: Some(count_lines_of_code(&files)),
    });

    // 6. Populate claims if requested (clone and sort by file, then line)
    let claims = args.show_claims.then(|| {
        let mut sorted = all_claims.to_vec();
        sorted.sort_by(|a, b| a.file.cmp(&b.file).then(a.line.cmp(&b.line)));
        sorted
    });

    // 7. Build result; the project name is the last component of the root path
    let project_name =
        project_root.file_name().and_then(|s| s.to_str()).unwrap_or("unknown").to_string();
    Ok(ScanResult {
        project: project_name,
        scan_id: generate_scan_id(),
        files_scanned: files.len(),
        claims_extracted: all_claims.len(),
        conflicts: result.conflicts,
        drifts: result.drifts,
        format: args.format.clone(),
        debug: args.debug,
        strict: args.strict,
        observations_recorded: result.observations_recorded,
        timing,
        claims,
        deprecated_usages: vec![], // TODO: Populate from lifecycle store during scan
        verify: verify_report,
    })
}
/// Tally the non-blank lines across the scanned files.
///
/// A line counts when it still has content after trimming whitespace. A file
/// that cannot be read into a string contributes zero rather than failing the
/// scan. Used for benchmark reporting.
fn count_lines_of_code(files: &[crate::walker::WalkedFile]) -> usize {
    let mut total = 0;
    for walked in files {
        // Unreadable files are skipped on purpose: this is a best-effort metric
        if let Ok(text) = std::fs::read_to_string(&walked.path) {
            total += text.lines().filter(|line| !line.trim().is_empty()).count();
        }
    }
    total
}
/// Dispatch conflict checking to the implementation selected by `args.mode`.
///
/// Persistent mode forwards `args.sync`; ephemeral mode forwards `args.debug`
/// and wraps the detected conflicts in a result with no drifts and no
/// recorded observations.
async fn check_conflicts(
    args: &ScanArgs,
    all_claims: &[Observation],
    project_root: &Path,
    config: &AphoriaConfig,
) -> Result<ConflictCheckResult, AphoriaError> {
    match args.mode {
        ScanMode::Persistent => {
            check_conflicts_persistent(all_claims, project_root, config, args.sync).await
        }
        // Ephemeral mode never records observations or detects drift (intentionally stateless)
        ScanMode::Ephemeral => {
            check_conflicts_ephemeral(all_claims, project_root, config, args.debug).map(
                |conflicts| ConflictCheckResult {
                    conflicts,
                    drifts: vec![],
                    observations_recorded: 0,
                },
            )
        }
    }
}
/// In-memory conflict detection that persists nothing to Episteme.
///
/// Loads (or generates) the project signing key, loads the configured
/// policies, feeds them to an `EphemeralDetector`, and runs either the debug
/// or the regular conflict check depending on `debug`.
///
/// # Errors
///
/// Fails if the signing key cannot be loaded or generated, or if the
/// policies cannot be loaded.
fn check_conflicts_ephemeral(
    all_claims: &[Observation],
    project_root: &Path,
    config: &AphoriaConfig,
    debug: bool,
) -> Result<Vec<ConflictResult>, AphoriaError> {
    info!("Using ephemeral detector (no persistence)");
    let key = bridge::load_or_generate_key(project_root)?;

    // Policies come from the configured list and are resolved via the corpus cache dir
    let manager = PolicyManager::new(&config.corpus.cache_dir);
    let loaded_policies = manager.load_policies(&config.policies)?;

    // The detector needs the policies before any claim is checked
    let mut detector = EphemeralDetector::new(&key, &config.corpus);
    detector.ingest_policies(&loaded_policies);

    let found = if debug {
        detector.check_conflicts_debug(all_claims, config)
    } else {
        detector.check_conflicts(all_claims, config)
    };
    Ok(found)
}
/// Full conflict detection with Episteme persistence.
///
/// When `sync` is enabled, claims with no authority conflict are written back
/// as Tier 4 (Community) observations, creating "project memory".
///
/// Drift detection runs AFTER authority conflict detection: claims that have
/// no authority conflict are checked against prior observations to detect
/// value changes.
///
/// # Hosted Mode
///
/// When `[hosted]` is configured with a URL, sync is automatically enabled
/// and observations are pushed to the remote server. The `sync_mode` setting
/// controls whether local storage is also used:
///
/// - `remote-only`: Only push to remote (no local storage)
/// - `local-and-remote`: Store locally AND push to remote
async fn check_conflicts_persistent(
all_claims: &[Observation],
project_root: &Path,
config: &AphoriaConfig,
sync: bool,
) -> Result<ConflictCheckResult, AphoriaError> {
// Auto-enable sync when hosted mode is configured
let effective_sync = sync || config.hosted.is_enabled();
let hosted_enabled = config.hosted.is_enabled();
info!(
sync = effective_sync,
hosted = hosted_enabled,
"Using persistent mode (with Episteme storage)"
);
// Open local Episteme and ingest claims
let mut episteme = LocalEpisteme::open(config, project_root).await?;
if !all_claims.is_empty() {
episteme.ingest_claims(all_claims).await?;
}
// Build authoritative corpus from bundled sources AND imported Trust Packs
// This uses LocalEpisteme's check_conflicts which also creates aliases
let signing_key = bridge::load_or_generate_key(project_root)?;
let mut corpus = create_authoritative_corpus(&signing_key);
// Include assertions imported from Trust Packs
let imported_assertions = episteme.fetch_authoritative_assertions().await?;
if !imported_assertions.is_empty() {
info!(
count = imported_assertions.len(),
"Including imported Trust Pack assertions in conflict detection"
);
corpus.extend(imported_assertions);
}
// Merge predicate aliases from config AND from persisted/imported Trust Packs
// This ensures both config-defined and pack-imported aliases are used for
// semantic predicate matching (Phase 6.5.3)
let mut all_predicate_aliases = config.predicate_aliases.to_alias_sets();
all_predicate_aliases.extend(episteme.predicate_aliases().iter().cloned());
if !all_predicate_aliases.is_empty() {
info!(
config_count = config.predicate_aliases.to_alias_sets().len(),
stored_count = episteme.predicate_aliases().len(),
total_count = all_predicate_aliases.len(),
"Using predicate aliases for index normalization"
);
}
// Build index WITH predicate alias normalization so both authority and code
// predicates use canonical forms (e.g., "required" normalizes to "enabled")
let index = ConceptIndex::build_with_aliases(&corpus, &all_predicate_aliases);
let conflicts = episteme.check_conflicts(all_claims, config, &index).await?;
// Find claims that DO have an authority conflict
let conflicting_paths: HashSet<_> =
conflicts.iter().map(|c| c.claim.concept_path.as_str()).collect();
// Non-conflicting claims are candidates for drift detection and observation write-back
let non_conflicting_claims: Vec<_> = all_claims
.iter()
.filter(|c| !conflicting_paths.contains(c.concept_path.as_str()))
.cloned()
.collect();
// Drift detection: check non-conflicting claims against prior observations
let drifts = episteme.check_drift(&non_conflicting_claims).await?;
// Find claims that drifted (we don't want to overwrite them with new observations)
let drifting_paths: HashSet<_> = drifts.iter().map(|d| d.claim.concept_path.as_str()).collect();
// Write observations for novel claims (no conflict AND no drift) if sync enabled
let observations_recorded = if effective_sync {
// Novel claims are those with NO authority conflict AND NO drift
let novel_claims: Vec<_> = non_conflicting_claims
.iter()
.filter(|c| !drifting_paths.contains(c.concept_path.as_str()))
.cloned()
.collect();
let mut local_count = 0;
let mut remote_count = 0;
// Local persistence (unless hosted mode is remote-only without fallback)
let should_persist_locally =
!hosted_enabled || config.hosted.sync_mode == SyncMode::LocalAndRemote;
if should_persist_locally && !novel_claims.is_empty() {
local_count = episteme.ingest_observations(&novel_claims).await?;
info!(count = local_count, "Recorded observations locally");
}
// Remote push (if hosted mode is enabled)
if hosted_enabled && !novel_claims.is_empty() {
// Get project name for fallback
let project_name =
project_root.file_name().and_then(|s| s.to_str()).unwrap_or("unknown");
// Create hosted client
if let Some(client) = HostedClient::new(&config.hosted, &signing_key, project_name)? {
// Convert claims to observations
let timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0);
let observations: Vec<_> = novel_claims
.iter()
.map(|c| observation_to_assertion(c, &signing_key, timestamp))
.collect();
remote_count = client.push_observations(observations)?;
info!(count = remote_count, "Pushed observations to hosted server");
}
}
// Return the higher count (they should be the same for LocalAndRemote)
local_count.max(remote_count)
} else {
0
};
// Shut down Episteme
episteme.shutdown().await;
Ok(ConflictCheckResult { conflicts, drifts, observations_recorded })
}
/// Build the identifier for a scan run.
///
/// The id is the literal prefix `scan-` followed by the value returned by
/// `current_timestamp_millis()`.
pub fn generate_scan_id() -> String {
    let stamp = current_timestamp_millis();
    format!("scan-{stamp}")
}
/// Collect the claims of a project without checking them for conflicts.
///
/// Used by the community preview to show which observations would be shared.
/// Extraction always runs with `ScanMode::Ephemeral`, so no LLM extraction
/// takes place.
///
/// # Errors
///
/// Propagates any error from walking the project or extracting claims.
#[instrument(skip(config), fields(path = %args.path.display(), file_source = ?args.file_source))]
pub async fn extract_claims(
    args: &ScanArgs,
    config: &AphoriaConfig,
) -> Result<Vec<Observation>, AphoriaError> {
    info!("Extracting claims for preview");

    // Use the path as given when it cannot be canonicalized
    let project_root = match args.path.canonicalize() {
        Ok(resolved) => resolved,
        Err(_) => args.path.clone(),
    };

    // Gather either only the staged files or the whole project
    let walked = match args.file_source {
        FileSource::Staged => walk_staged_files(&project_root, config)?,
        FileSource::All => walk_project(&project_root, config)?,
    };
    info!(files_found = walked.len(), "Project walk complete");

    // Ephemeral mode keeps the preview free of LLM extraction
    let extracted = extract_claims_from_files(&walked, config, ScanMode::Ephemeral, &project_root)?;
    info!(claims_extracted = extracted.len(), "Extraction complete");
    Ok(extracted)
}