From 089992993f0ed47ef5f79814a5a44e085d27822d Mon Sep 17 00:00:00 2001
From: jordan
Date: Thu, 12 Feb 2026 00:21:48 -0700
Subject: [PATCH] feat(aphoria): load declarative extractors from .aphoria/extractors/*.toml files

Declarative extractors in separate .toml files under .aphoria/extractors/
were silently ignored because config loading only parsed the main
config.toml.

Now from_file() scans the extractors directory after loading the main
config and merges any [[extractors.declarative]] definitions found in
.toml files. Invalid files produce warnings but don't fail the load.

Also includes show_observations field additions to scan args and removes
unused import.

Co-Authored-By: Claude Opus 4.6
---
 applications/aphoria/src/config/loader.rs |  95 ++++++++++-
 applications/aphoria/src/config/tests.rs  | 183 ++++++++++++++++++++++
 crates/stemedb-ingest/src/ingestor.rs     |   2 +-
 3 files changed, 276 insertions(+), 4 deletions(-)

diff --git a/applications/aphoria/src/config/loader.rs b/applications/aphoria/src/config/loader.rs
index 3b27967..8d8c788 100644
--- a/applications/aphoria/src/config/loader.rs
+++ b/applications/aphoria/src/config/loader.rs
@@ -1,20 +1,109 @@
 //! Configuration loading and parsing logic.
 
-use std::path::Path;
+use std::path::{Path, PathBuf};
 
+use serde::Deserialize;
+
+use crate::extractors::DeclarativeExtractorDef;
 use crate::AphoriaError;
 
-use super::types::AphoriaConfig;
+use super::types::{AphoriaConfig, ExtractorConfig};
+
+/// Wrapper for deserializing extractor files that use `[[extractors.declarative]]` format.
+#[derive(Deserialize)]
+struct ExtractorFileWrapper {
+    #[serde(default)]
+    extractors: ExtractorFileContent,
+}
+
+#[derive(Deserialize, Default)]
+struct ExtractorFileContent {
+    #[serde(default)]
+    declarative: Vec<DeclarativeExtractorDef>,
+}
 
 impl AphoriaConfig {
     /// Load configuration from a TOML file.
+    ///
+    /// After parsing the main config, this also scans `.aphoria/extractors/` for
+    /// additional `.toml` files containing declarative extractor definitions and
+    /// merges them into the config.
     pub fn from_file(path: &Path) -> Result<Self, AphoriaError> {
         if !path.exists() {
             return Err(AphoriaError::ConfigNotFound(path.to_path_buf()));
         }
 
         let content = std::fs::read_to_string(path)?;
-        let config: AphoriaConfig = toml::from_str(&content)?;
+        let mut config: AphoriaConfig = toml::from_str(&content)?;
+
+        // Resolve .aphoria/ directory from config file location:
+        // - ".aphoria/config.toml" → parent is ".aphoria/"
+        // - "aphoria.toml" → look for sibling ".aphoria/"
+        let aphoria_dir = if let Some(parent) = path.parent() {
+            if parent.file_name().map(|n| n == ".aphoria").unwrap_or(false) {
+                parent.to_path_buf()
+            } else {
+                parent.join(".aphoria")
+            }
+        } else {
+            PathBuf::from(".aphoria")
+        };
+
+        let extractors_dir = aphoria_dir.join("extractors");
+        if extractors_dir.is_dir() {
+            load_extractor_files(&mut config.extractors, &extractors_dir);
+        }
+
         Ok(config)
     }
 }
+
+/// Load declarative extractor definitions from `.toml` files in a directory.
+///
+/// Merges them into `config.extractors.declarative`. Invalid files produce
+/// warnings but don't fail the load — one bad file should not break the scan.
+fn load_extractor_files(extractors: &mut ExtractorConfig, dir: &Path) {
+    let entries = match std::fs::read_dir(dir) {
+        Ok(e) => e,
+        Err(e) => {
+            tracing::warn!(dir = %dir.display(), error = %e, "Failed to read extractors directory");
+            return;
+        }
+    };
+
+    for entry in entries.flatten() {
+        let path = entry.path();
+        if path.extension().and_then(|e| e.to_str()) != Some("toml") || !path.is_file() {
+            continue;
+        }
+
+        let content = match std::fs::read_to_string(&path) {
+            Ok(c) => c,
+            Err(e) => {
+                tracing::warn!(file = %path.display(), error = %e, "Failed to read extractor file");
+                continue;
+            }
+        };
+
+        match toml::from_str::<ExtractorFileWrapper>(&content) {
+            Ok(wrapper) => {
+                let count = wrapper.extractors.declarative.len();
+                if count > 0 {
+                    tracing::debug!(
+                        file = %path.display(),
+                        count,
+                        "Loaded declarative extractors from file"
+                    );
+                    extractors.declarative.extend(wrapper.extractors.declarative);
+                }
+            }
+            Err(e) => {
+                tracing::warn!(
+                    file = %path.display(),
+                    error = %e,
+                    "Failed to parse extractor file"
+                );
+            }
+        }
+    }
+}
diff --git a/applications/aphoria/src/config/tests.rs b/applications/aphoria/src/config/tests.rs
index 9ab2b54..32fd333 100644
--- a/applications/aphoria/src/config/tests.rs
+++ b/applications/aphoria/src/config/tests.rs
@@ -109,3 +109,186 @@ project_id = "test"
     let config: AphoriaConfig = toml::from_str(toml).expect("should parse");
     assert!(!config.hosted.is_enabled());
 }
+
+// --- Extractor file loading tests ---
+
+#[test]
+fn test_from_file_loads_extractor_files() {
+    let tmp = tempfile::tempdir().expect("create temp dir");
+    let aphoria_dir = tmp.path().join(".aphoria");
+    let extractors_dir = aphoria_dir.join("extractors");
+    std::fs::create_dir_all(&extractors_dir).expect("create extractors dir");
+
+    // Write a minimal config
+    let config_path = aphoria_dir.join("config.toml");
+    std::fs::write(
+        &config_path,
+        r#"
+[project]
+name = "test"
+"#,
+    )
+    .expect("write config");
+
+    // Write an extractor file with 2 declarative extractors
+    std::fs::write(
+        extractors_dir.join("security.toml"),
+        r#"
+[[extractors.declarative]]
+name = "test_extractor_1"
+description = "First test extractor"
+languages = ["rust"]
+pattern = "unsafe"
+confidence = 0.9
+
+[extractors.declarative.claim]
+subject = "safety/unsafe_block"
+predicate = "present"
+value = true
+
+[[extractors.declarative]]
+name = "test_extractor_2"
+description = "Second test extractor"
+languages = ["go"]
+pattern = "fmt\\.Println"
+confidence = 1.0
+
+[extractors.declarative.claim]
+subject = "debug/println"
+predicate = "present"
+value = true
+"#,
+    )
+    .expect("write extractor file");
+
+    let config = AphoriaConfig::from_file(&config_path).expect("should load config");
+    assert_eq!(config.extractors.declarative.len(), 2);
+    assert_eq!(config.extractors.declarative[0].name, "test_extractor_1");
+    assert_eq!(config.extractors.declarative[1].name, "test_extractor_2");
+}
+
+#[test]
+fn test_from_file_merges_inline_and_file_extractors() {
+    let tmp = tempfile::tempdir().expect("create temp dir");
+    let aphoria_dir = tmp.path().join(".aphoria");
+    let extractors_dir = aphoria_dir.join("extractors");
+    std::fs::create_dir_all(&extractors_dir).expect("create extractors dir");
+
+    // Config with one inline declarative extractor
+    let config_path = aphoria_dir.join("config.toml");
+    std::fs::write(
+        &config_path,
+        r#"
+[project]
+name = "test"
+
+[[extractors.declarative]]
+name = "inline_extractor"
+languages = ["rust"]
+pattern = "todo!"
+
+[extractors.declarative.claim]
+subject = "code/todo"
+predicate = "present"
+value = true
+"#,
+    )
+    .expect("write config");
+
+    // External extractor file
+    std::fs::write(
+        extractors_dir.join("extra.toml"),
+        r#"
+[[extractors.declarative]]
+name = "file_extractor"
+languages = ["python"]
+pattern = "import os"
+
+[extractors.declarative.claim]
+subject = "imports/os"
+predicate = "present"
+value = true
+"#,
+    )
+    .expect("write extractor file");
+
+    let config = AphoriaConfig::from_file(&config_path).expect("should load config");
+    assert_eq!(config.extractors.declarative.len(), 2);
+
+    let names: Vec<&str> = config
+        .extractors
+        .declarative
+        .iter()
+        .map(|e| e.name.as_str())
+        .collect();
+    assert!(names.contains(&"inline_extractor"));
+    assert!(names.contains(&"file_extractor"));
+}
+
+#[test]
+fn test_extractor_file_with_invalid_toml_warns_but_continues() {
+    let tmp = tempfile::tempdir().expect("create temp dir");
+    let aphoria_dir = tmp.path().join(".aphoria");
+    let extractors_dir = aphoria_dir.join("extractors");
+    std::fs::create_dir_all(&extractors_dir).expect("create extractors dir");
+
+    let config_path = aphoria_dir.join("config.toml");
+    std::fs::write(&config_path, "[project]\nname = \"test\"\n").expect("write config");
+
+    // Write one invalid file
+    std::fs::write(extractors_dir.join("bad.toml"), "this is not valid { toml [")
+        .expect("write bad file");
+
+    // Write one valid file
+    std::fs::write(
+        extractors_dir.join("good.toml"),
+        r#"
+[[extractors.declarative]]
+name = "valid_one"
+languages = ["rust"]
+pattern = "unwrap"
+
+[extractors.declarative.claim]
+subject = "safety/unwrap"
+predicate = "present"
+value = true
+"#,
+    )
+    .expect("write good file");
+
+    let config = AphoriaConfig::from_file(&config_path).expect("should load despite bad file");
+    assert_eq!(config.extractors.declarative.len(), 1);
+    assert_eq!(config.extractors.declarative[0].name, "valid_one");
+}
+
+#[test]
+fn test_extractor_file_non_toml_skipped() {
+    let tmp = tempfile::tempdir().expect("create temp dir");
+    let aphoria_dir = tmp.path().join(".aphoria");
+    let extractors_dir = aphoria_dir.join("extractors");
+    std::fs::create_dir_all(&extractors_dir).expect("create extractors dir");
+
+    let config_path = aphoria_dir.join("config.toml");
+    std::fs::write(&config_path, "[project]\nname = \"test\"\n").expect("write config");
+
+    // Non-TOML files should be ignored
+    std::fs::write(extractors_dir.join("README.md"), "# Extractors").expect("write md");
+    std::fs::write(extractors_dir.join("notes.yaml"), "key: value").expect("write yaml");
+
+    let config = AphoriaConfig::from_file(&config_path).expect("should load");
+    assert!(config.extractors.declarative.is_empty());
+}
+
+#[test]
+fn test_no_extractors_dir_is_fine() {
+    let tmp = tempfile::tempdir().expect("create temp dir");
+    let aphoria_dir = tmp.path().join(".aphoria");
+    std::fs::create_dir_all(&aphoria_dir).expect("create aphoria dir");
+    // No extractors/ subdirectory
+
+    let config_path = aphoria_dir.join("config.toml");
+    std::fs::write(&config_path, "[project]\nname = \"test\"\n").expect("write config");
+
+    let config = AphoriaConfig::from_file(&config_path).expect("should load");
+    assert!(config.extractors.declarative.is_empty());
+}
diff --git a/crates/stemedb-ingest/src/ingestor.rs b/crates/stemedb-ingest/src/ingestor.rs
index 5399ff4..8af787e 100644
--- a/crates/stemedb-ingest/src/ingestor.rs
+++ b/crates/stemedb-ingest/src/ingestor.rs
@@ -7,7 +7,7 @@ use stemedb_storage::KVStore;
 use stemedb_wal::Journal;
 use tokio::sync::Mutex;
 use tokio::task::JoinHandle;
-use tracing::{debug, error, info, instrument, warn};
+use tracing::{debug, info, instrument, warn};
 
 /// Manager for the background ingestion process.
 ///