feat(aphoria): load declarative extractors from .aphoria/extractors/*.toml files
Declarative extractors defined in separate .toml files under .aphoria/extractors/ were silently ignored because config loading parsed only the main config.toml. Now from_file() scans the extractors directory after loading the main config and merges any [[extractors.declarative]] definitions found in the .toml files there. Invalid files produce warnings but do not fail the load. This commit also includes the show_observations field additions to the scan args and removes an unused import. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
3e7eddc074
commit
089992993f
@ -1,20 +1,109 @@
|
||||
//! Configuration loading and parsing logic.
|
||||
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::extractors::DeclarativeExtractorDef;
|
||||
use crate::AphoriaError;
|
||||
|
||||
use super::types::AphoriaConfig;
|
||||
use super::types::{AphoriaConfig, ExtractorConfig};
|
||||
|
||||
/// Wrapper for deserializing extractor files that use `[[extractors.declarative]]` format.
|
||||
#[derive(Deserialize)]
|
||||
struct ExtractorFileWrapper {
|
||||
#[serde(default)]
|
||||
extractors: ExtractorFileContent,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Default)]
|
||||
struct ExtractorFileContent {
|
||||
#[serde(default)]
|
||||
declarative: Vec<DeclarativeExtractorDef>,
|
||||
}
|
||||
|
||||
impl AphoriaConfig {
|
||||
/// Load configuration from a TOML file.
|
||||
///
|
||||
/// After parsing the main config, this also scans `.aphoria/extractors/` for
|
||||
/// additional `.toml` files containing declarative extractor definitions and
|
||||
/// merges them into the config.
|
||||
pub fn from_file(path: &Path) -> Result<Self, AphoriaError> {
|
||||
if !path.exists() {
|
||||
return Err(AphoriaError::ConfigNotFound(path.to_path_buf()));
|
||||
}
|
||||
|
||||
let content = std::fs::read_to_string(path)?;
|
||||
let config: AphoriaConfig = toml::from_str(&content)?;
|
||||
let mut config: AphoriaConfig = toml::from_str(&content)?;
|
||||
|
||||
// Resolve .aphoria/ directory from config file location:
|
||||
// - ".aphoria/config.toml" → parent is ".aphoria/"
|
||||
// - "aphoria.toml" → look for sibling ".aphoria/"
|
||||
let aphoria_dir = if let Some(parent) = path.parent() {
|
||||
if parent.file_name().map(|n| n == ".aphoria").unwrap_or(false) {
|
||||
parent.to_path_buf()
|
||||
} else {
|
||||
parent.join(".aphoria")
|
||||
}
|
||||
} else {
|
||||
PathBuf::from(".aphoria")
|
||||
};
|
||||
|
||||
let extractors_dir = aphoria_dir.join("extractors");
|
||||
if extractors_dir.is_dir() {
|
||||
load_extractor_files(&mut config.extractors, &extractors_dir);
|
||||
}
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
}
|
||||
|
||||
/// Load declarative extractor definitions from `.toml` files in a directory.
|
||||
///
|
||||
/// Merges them into `config.extractors.declarative`. Invalid files produce
|
||||
/// warnings but don't fail the load — one bad file should not break the scan.
|
||||
fn load_extractor_files(extractors: &mut ExtractorConfig, dir: &Path) {
|
||||
let entries = match std::fs::read_dir(dir) {
|
||||
Ok(e) => e,
|
||||
Err(e) => {
|
||||
tracing::warn!(dir = %dir.display(), error = %e, "Failed to read extractors directory");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
if path.extension().and_then(|e| e.to_str()) != Some("toml") || !path.is_file() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let content = match std::fs::read_to_string(&path) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
tracing::warn!(file = %path.display(), error = %e, "Failed to read extractor file");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match toml::from_str::<ExtractorFileWrapper>(&content) {
|
||||
Ok(wrapper) => {
|
||||
let count = wrapper.extractors.declarative.len();
|
||||
if count > 0 {
|
||||
tracing::debug!(
|
||||
file = %path.display(),
|
||||
count,
|
||||
"Loaded declarative extractors from file"
|
||||
);
|
||||
extractors.declarative.extend(wrapper.extractors.declarative);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
file = %path.display(),
|
||||
error = %e,
|
||||
"Failed to parse extractor file"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -109,3 +109,186 @@ project_id = "test"
|
||||
let config: AphoriaConfig = toml::from_str(toml).expect("should parse");
|
||||
assert!(!config.hosted.is_enabled());
|
||||
}
|
||||
|
||||
// --- Extractor file loading tests ---
|
||||
|
||||
/// Definitions placed in a file under `.aphoria/extractors/` are loaded by
/// `from_file`, in the order they appear within that file.
#[test]
fn test_from_file_loads_extractor_files() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    let extractor_root = dot_aphoria.join("extractors");
    std::fs::create_dir_all(&extractor_root).expect("create extractors dir");

    // Minimal main config with no extractors of its own.
    let main_toml = r#"
[project]
name = "test"
"#;
    let main_config = dot_aphoria.join("config.toml");
    std::fs::write(&main_config, main_toml).expect("write config");

    // One extractor file holding two declarative extractors.
    let security_toml = r#"
[[extractors.declarative]]
name = "test_extractor_1"
description = "First test extractor"
languages = ["rust"]
pattern = "unsafe"
confidence = 0.9

[extractors.declarative.claim]
subject = "safety/unsafe_block"
predicate = "present"
value = true

[[extractors.declarative]]
name = "test_extractor_2"
description = "Second test extractor"
languages = ["go"]
pattern = "fmt\\.Println"
confidence = 1.0

[extractors.declarative.claim]
subject = "debug/println"
predicate = "present"
value = true
"#;
    std::fs::write(extractor_root.join("security.toml"), security_toml)
        .expect("write extractor file");

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load config");
    let declarative = &loaded.extractors.declarative;
    assert_eq!(declarative.len(), 2);
    assert_eq!(declarative[0].name, "test_extractor_1");
    assert_eq!(declarative[1].name, "test_extractor_2");
}
|
||||
|
||||
/// Extractors declared inline in the main config and extractors declared in a
/// separate file both end up in the loaded config.
#[test]
fn test_from_file_merges_inline_and_file_extractors() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    let extractor_root = dot_aphoria.join("extractors");
    std::fs::create_dir_all(&extractor_root).expect("create extractors dir");

    // Main config carrying a single inline declarative extractor.
    let main_toml = r#"
[project]
name = "test"

[[extractors.declarative]]
name = "inline_extractor"
languages = ["rust"]
pattern = "todo!"

[extractors.declarative.claim]
subject = "code/todo"
predicate = "present"
value = true
"#;
    let main_config = dot_aphoria.join("config.toml");
    std::fs::write(&main_config, main_toml).expect("write config");

    // Separate extractor file contributing one more definition.
    let extra_toml = r#"
[[extractors.declarative]]
name = "file_extractor"
languages = ["python"]
pattern = "import os"

[extractors.declarative.claim]
subject = "imports/os"
predicate = "present"
value = true
"#;
    std::fs::write(extractor_root.join("extra.toml"), extra_toml).expect("write extractor file");

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load config");
    assert_eq!(loaded.extractors.declarative.len(), 2);

    // Order between inline and file-based entries is not asserted, only presence.
    let mut names: Vec<&str> = Vec::new();
    for def in &loaded.extractors.declarative {
        names.push(def.name.as_str());
    }
    assert!(names.contains(&"inline_extractor"));
    assert!(names.contains(&"file_extractor"));
}
|
||||
|
||||
/// A syntactically broken extractor file must not prevent the config, or the
/// valid extractor files next to it, from loading.
#[test]
fn test_extractor_file_with_invalid_toml_warns_but_continues() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    let extractor_root = dot_aphoria.join("extractors");
    std::fs::create_dir_all(&extractor_root).expect("create extractors dir");

    let main_config = dot_aphoria.join("config.toml");
    std::fs::write(&main_config, "[project]\nname = \"test\"\n").expect("write config");

    // Broken file: not parseable as TOML at all.
    let broken_file = extractor_root.join("bad.toml");
    std::fs::write(broken_file, "this is not valid { toml [").expect("write bad file");

    // Well-formed file with a single extractor.
    let good_toml = r#"
[[extractors.declarative]]
name = "valid_one"
languages = ["rust"]
pattern = "unwrap"

[extractors.declarative.claim]
subject = "safety/unwrap"
predicate = "present"
value = true
"#;
    std::fs::write(extractor_root.join("good.toml"), good_toml).expect("write good file");

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load despite bad file");
    let declarative = &loaded.extractors.declarative;
    assert_eq!(declarative.len(), 1);
    assert_eq!(declarative[0].name, "valid_one");
}
|
||||
|
||||
/// Files in the extractors directory that do not have a `.toml` extension are
/// ignored by the loader.
#[test]
fn test_extractor_file_non_toml_skipped() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    let extractor_root = dot_aphoria.join("extractors");
    std::fs::create_dir_all(&extractor_root).expect("create extractors dir");

    let main_config = dot_aphoria.join("config.toml");
    std::fs::write(&main_config, "[project]\nname = \"test\"\n").expect("write config");

    // Non-TOML files should be ignored: (file name, contents, failure message).
    let unrelated_files = [
        ("README.md", "# Extractors", "write md"),
        ("notes.yaml", "key: value", "write yaml"),
    ];
    for (file_name, body, failure) in unrelated_files {
        std::fs::write(extractor_root.join(file_name), body).expect(failure);
    }

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load");
    assert!(loaded.extractors.declarative.is_empty());
}
|
||||
|
||||
/// Loading succeeds, with no declarative extractors, when `.aphoria/` has no
/// `extractors/` subdirectory.
#[test]
fn test_no_extractors_dir_is_fine() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    // Only the .aphoria/ directory itself is created; extractors/ is deliberately absent.
    std::fs::create_dir_all(&dot_aphoria).expect("create aphoria dir");

    let main_config = dot_aphoria.join("config.toml");
    std::fs::write(&main_config, "[project]\nname = \"test\"\n").expect("write config");

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load");
    assert!(loaded.extractors.declarative.is_empty());
}
|
||||
|
||||
@ -7,7 +7,7 @@ use stemedb_storage::KVStore;
|
||||
use stemedb_wal::Journal;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::task::JoinHandle;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{debug, info, instrument, warn};
|
||||
|
||||
/// Manager for the background ingestion process.
|
||||
///
|
||||
|
||||
Loading…
Reference in New Issue
Block a user