feat(aphoria): load declarative extractors from .aphoria/extractors/*.toml files

Declarative extractors in separate .toml files under .aphoria/extractors/ were
silently ignored because config loading only parsed the main config.toml. Now
from_file() scans the extractors directory after loading the main config and
merges any [[extractors.declarative]] definitions found in .toml files. Invalid
files produce warnings but don't fail the load. Also includes show_observations
field additions to scan args and removes unused import.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
jordan 2026-02-12 00:21:48 -07:00
parent 3e7eddc074
commit 089992993f
3 changed files with 276 additions and 4 deletions

View File

@ -1,20 +1,109 @@
//! Configuration loading and parsing logic. //! Configuration loading and parsing logic.
use std::path::Path; use std::path::{Path, PathBuf};
use serde::Deserialize;
use crate::extractors::DeclarativeExtractorDef;
use crate::AphoriaError; use crate::AphoriaError;
use super::types::AphoriaConfig; use super::types::{AphoriaConfig, ExtractorConfig};
/// Wrapper for deserializing extractor files that use `[[extractors.declarative]]` format.
///
/// Mirrors the top level of a standalone extractor file: the definitions sit
/// under an `extractors` table rather than at the document root.
#[derive(Deserialize)]
struct ExtractorFileWrapper {
    /// The `extractors` table; `#[serde(default)]` yields an empty
    /// [`ExtractorFileContent`] when the table is missing from the file.
    #[serde(default)]
    extractors: ExtractorFileContent,
}

/// Contents of the `extractors` table in a standalone extractor file.
#[derive(Deserialize, Default)]
struct ExtractorFileContent {
    /// Entries of the `[[extractors.declarative]]` array; empty when the file
    /// declares none.
    #[serde(default)]
    declarative: Vec<DeclarativeExtractorDef>,
}
impl AphoriaConfig { impl AphoriaConfig {
/// Load configuration from a TOML file. /// Load configuration from a TOML file.
///
/// After parsing the main config, this also scans `.aphoria/extractors/` for
/// additional `.toml` files containing declarative extractor definitions and
/// merges them into the config.
pub fn from_file(path: &Path) -> Result<Self, AphoriaError> { pub fn from_file(path: &Path) -> Result<Self, AphoriaError> {
if !path.exists() { if !path.exists() {
return Err(AphoriaError::ConfigNotFound(path.to_path_buf())); return Err(AphoriaError::ConfigNotFound(path.to_path_buf()));
} }
let content = std::fs::read_to_string(path)?; let content = std::fs::read_to_string(path)?;
let config: AphoriaConfig = toml::from_str(&content)?; let mut config: AphoriaConfig = toml::from_str(&content)?;
// Resolve .aphoria/ directory from config file location:
// - ".aphoria/config.toml" → parent is ".aphoria/"
// - "aphoria.toml" → look for sibling ".aphoria/"
let aphoria_dir = if let Some(parent) = path.parent() {
if parent.file_name().map(|n| n == ".aphoria").unwrap_or(false) {
parent.to_path_buf()
} else {
parent.join(".aphoria")
}
} else {
PathBuf::from(".aphoria")
};
let extractors_dir = aphoria_dir.join("extractors");
if extractors_dir.is_dir() {
load_extractor_files(&mut config.extractors, &extractors_dir);
}
Ok(config) Ok(config)
} }
} }
/// Load declarative extractor definitions from `.toml` files in a directory.
///
/// Every regular file in `dir` whose extension is `toml` is parsed as an
/// extractor file, and its `[[extractors.declarative]]` entries are appended to
/// `extractors.declarative`.
///
/// Files are processed in sorted path order. `read_dir` yields entries in a
/// platform- and filesystem-dependent order, so sorting keeps the merged list
/// of extractors identical from run to run and machine to machine.
///
/// Invalid files produce warnings but don't fail the load — one bad file should
/// not break the scan. The same applies to an unreadable directory or an
/// unreadable directory entry.
fn load_extractor_files(extractors: &mut ExtractorConfig, dir: &Path) {
    let entries = match std::fs::read_dir(dir) {
        Ok(e) => e,
        Err(e) => {
            tracing::warn!(dir = %dir.display(), error = %e, "Failed to read extractors directory");
            return;
        }
    };

    let mut paths: Vec<_> = entries
        .filter_map(|entry| match entry {
            Ok(entry) => Some(entry.path()),
            Err(e) => {
                // Surface per-entry I/O errors instead of dropping them silently.
                tracing::warn!(
                    dir = %dir.display(),
                    error = %e,
                    "Failed to read extractors directory entry"
                );
                None
            }
        })
        .filter(|path| {
            path.extension().and_then(|ext| ext.to_str()) == Some("toml") && path.is_file()
        })
        .collect();
    paths.sort();

    for path in paths {
        let content = match std::fs::read_to_string(&path) {
            Ok(c) => c,
            Err(e) => {
                tracing::warn!(file = %path.display(), error = %e, "Failed to read extractor file");
                continue;
            }
        };

        match toml::from_str::<ExtractorFileWrapper>(&content) {
            Ok(wrapper) => {
                let count = wrapper.extractors.declarative.len();
                if count > 0 {
                    tracing::debug!(
                        file = %path.display(),
                        count,
                        "Loaded declarative extractors from file"
                    );
                    extractors.declarative.extend(wrapper.extractors.declarative);
                }
            }
            Err(e) => {
                tracing::warn!(
                    file = %path.display(),
                    error = %e,
                    "Failed to parse extractor file"
                );
            }
        }
    }
}

View File

@ -109,3 +109,186 @@ project_id = "test"
let config: AphoriaConfig = toml::from_str(toml).expect("should parse"); let config: AphoriaConfig = toml::from_str(toml).expect("should parse");
assert!(!config.hosted.is_enabled()); assert!(!config.hosted.is_enabled());
} }
// --- Extractor file loading tests ---
#[test]
fn test_from_file_loads_extractor_files() {
    // Layout under test:
    //   <tmp>/.aphoria/config.toml
    //   <tmp>/.aphoria/extractors/security.toml
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_dir = workspace.path().join(".aphoria");
    let rules_dir = dot_dir.join("extractors");
    std::fs::create_dir_all(&rules_dir).expect("create extractors dir");

    // Minimal main config that declares no extractors of its own.
    let main_config = r#"
[project]
name = "test"
"#;
    let config_path = dot_dir.join("config.toml");
    std::fs::write(&config_path, main_config).expect("write config");

    // Standalone extractor file carrying two declarative extractors.
    let security_rules = r#"
[[extractors.declarative]]
name = "test_extractor_1"
description = "First test extractor"
languages = ["rust"]
pattern = "unsafe"
confidence = 0.9
[extractors.declarative.claim]
subject = "safety/unsafe_block"
predicate = "present"
value = true
[[extractors.declarative]]
name = "test_extractor_2"
description = "Second test extractor"
languages = ["go"]
pattern = "fmt\\.Println"
confidence = 1.0
[extractors.declarative.claim]
subject = "debug/println"
predicate = "present"
value = true
"#;
    std::fs::write(rules_dir.join("security.toml"), security_rules)
        .expect("write extractor file");

    let loaded = AphoriaConfig::from_file(&config_path).expect("should load config");

    // Both definitions must be present, in the order they appear in the file.
    let names: Vec<&str> = loaded
        .extractors
        .declarative
        .iter()
        .map(|def| def.name.as_str())
        .collect();
    assert_eq!(names, ["test_extractor_1", "test_extractor_2"]);
}
#[test]
fn test_from_file_merges_inline_and_file_extractors() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_dir = workspace.path().join(".aphoria");
    let rules_dir = dot_dir.join("extractors");
    std::fs::create_dir_all(&rules_dir).expect("create extractors dir");

    // Main config that already carries one inline declarative extractor.
    let main_config = r#"
[project]
name = "test"
[[extractors.declarative]]
name = "inline_extractor"
languages = ["rust"]
pattern = "todo!"
[extractors.declarative.claim]
subject = "code/todo"
predicate = "present"
value = true
"#;
    let config_path = dot_dir.join("config.toml");
    std::fs::write(&config_path, main_config).expect("write config");

    // A second extractor supplied through a separate file.
    let extra_rules = r#"
[[extractors.declarative]]
name = "file_extractor"
languages = ["python"]
pattern = "import os"
[extractors.declarative.claim]
subject = "imports/os"
predicate = "present"
value = true
"#;
    std::fs::write(rules_dir.join("extra.toml"), extra_rules).expect("write extractor file");

    let loaded = AphoriaConfig::from_file(&config_path).expect("should load config");
    let merged = &loaded.extractors.declarative;

    // Inline and file-based definitions end up in the same list; the relative
    // order of the two sources is deliberately not asserted.
    assert_eq!(merged.len(), 2);
    assert!(merged.iter().any(|def| def.name.as_str() == "inline_extractor"));
    assert!(merged.iter().any(|def| def.name.as_str() == "file_extractor"));
}
#[test]
fn test_extractor_file_with_invalid_toml_warns_but_continues() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_dir = workspace.path().join(".aphoria");
    let rules_dir = dot_dir.join("extractors");
    std::fs::create_dir_all(&rules_dir).expect("create extractors dir");

    let config_path = dot_dir.join("config.toml");
    std::fs::write(&config_path, "[project]\nname = \"test\"\n").expect("write config");

    // One file that cannot be parsed as TOML at all...
    let broken_path = rules_dir.join("bad.toml");
    std::fs::write(broken_path, "this is not valid { toml [").expect("write bad file");

    // ...next to one well-formed file with a single extractor.
    let good_rules = r#"
[[extractors.declarative]]
name = "valid_one"
languages = ["rust"]
pattern = "unwrap"
[extractors.declarative.claim]
subject = "safety/unwrap"
predicate = "present"
value = true
"#;
    std::fs::write(rules_dir.join("good.toml"), good_rules).expect("write good file");

    let loaded = AphoriaConfig::from_file(&config_path).expect("should load despite bad file");

    // Only the extractor from the valid file survives; the broken file is skipped.
    let names: Vec<&str> = loaded
        .extractors
        .declarative
        .iter()
        .map(|def| def.name.as_str())
        .collect();
    assert_eq!(names, ["valid_one"]);
}
#[test]
fn test_extractor_file_non_toml_skipped() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_dir = workspace.path().join(".aphoria");
    let rules_dir = dot_dir.join("extractors");
    std::fs::create_dir_all(&rules_dir).expect("create extractors dir");

    let config_path = dot_dir.join("config.toml");
    std::fs::write(&config_path, "[project]\nname = \"test\"\n").expect("write config");

    // Files without a `.toml` extension must never be picked up as extractor files.
    let markdown_path = rules_dir.join("README.md");
    std::fs::write(markdown_path, "# Extractors").expect("write md");
    let yaml_path = rules_dir.join("notes.yaml");
    std::fs::write(yaml_path, "key: value").expect("write yaml");

    let loaded = AphoriaConfig::from_file(&config_path).expect("should load");
    assert_eq!(loaded.extractors.declarative.len(), 0);
}
#[test]
fn test_no_extractors_dir_is_fine() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_dir = workspace.path().join(".aphoria");
    // Only `.aphoria/` itself is created; there is no `extractors/` below it.
    std::fs::create_dir_all(&dot_dir).expect("create aphoria dir");

    let config_path = dot_dir.join("config.toml");
    std::fs::write(&config_path, "[project]\nname = \"test\"\n").expect("write config");

    // Loading still succeeds and yields no declarative extractors.
    let loaded = AphoriaConfig::from_file(&config_path).expect("should load");
    assert_eq!(loaded.extractors.declarative.len(), 0);
}

View File

@ -7,7 +7,7 @@ use stemedb_storage::KVStore;
use stemedb_wal::Journal; use stemedb_wal::Journal;
use tokio::sync::Mutex; use tokio::sync::Mutex;
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
use tracing::{debug, error, info, instrument, warn}; use tracing::{debug, info, instrument, warn};
/// Manager for the background ingestion process. /// Manager for the background ingestion process.
/// ///