feat(aphoria): load declarative extractors from .aphoria/extractors/*.toml files
Declarative extractors in separate .toml files under .aphoria/extractors/ were silently ignored because config loading only parsed the main config.toml. Now from_file() scans the extractors directory after loading the main config and merges any [[extractors.declarative]] definitions found in the .toml files there. Invalid files produce warnings but don't fail the load. Also includes show_observations field additions to scan args and removes an unused import. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
3e7eddc074
commit
089992993f
@ -1,20 +1,109 @@
|
|||||||
//! Configuration loading and parsing logic.
|
//! Configuration loading and parsing logic.
|
||||||
|
|
||||||
use std::path::Path;
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
use serde::Deserialize;
|
||||||
|
|
||||||
|
use crate::extractors::DeclarativeExtractorDef;
|
||||||
use crate::AphoriaError;
|
use crate::AphoriaError;
|
||||||
|
|
||||||
use super::types::AphoriaConfig;
|
use super::types::{AphoriaConfig, ExtractorConfig};
|
||||||
|
|
||||||
|
/// Wrapper for deserializing extractor files that use `[[extractors.declarative]]` format.
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct ExtractorFileWrapper {
|
||||||
|
#[serde(default)]
|
||||||
|
extractors: ExtractorFileContent,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Default)]
|
||||||
|
struct ExtractorFileContent {
|
||||||
|
#[serde(default)]
|
||||||
|
declarative: Vec<DeclarativeExtractorDef>,
|
||||||
|
}
|
||||||
|
|
||||||
impl AphoriaConfig {
|
impl AphoriaConfig {
|
||||||
/// Load configuration from a TOML file.
|
/// Load configuration from a TOML file.
|
||||||
|
///
|
||||||
|
/// After parsing the main config, this also scans `.aphoria/extractors/` for
|
||||||
|
/// additional `.toml` files containing declarative extractor definitions and
|
||||||
|
/// merges them into the config.
|
||||||
pub fn from_file(path: &Path) -> Result<Self, AphoriaError> {
|
pub fn from_file(path: &Path) -> Result<Self, AphoriaError> {
|
||||||
if !path.exists() {
|
if !path.exists() {
|
||||||
return Err(AphoriaError::ConfigNotFound(path.to_path_buf()));
|
return Err(AphoriaError::ConfigNotFound(path.to_path_buf()));
|
||||||
}
|
}
|
||||||
|
|
||||||
let content = std::fs::read_to_string(path)?;
|
let content = std::fs::read_to_string(path)?;
|
||||||
let config: AphoriaConfig = toml::from_str(&content)?;
|
let mut config: AphoriaConfig = toml::from_str(&content)?;
|
||||||
|
|
||||||
|
// Resolve .aphoria/ directory from config file location:
|
||||||
|
// - ".aphoria/config.toml" → parent is ".aphoria/"
|
||||||
|
// - "aphoria.toml" → look for sibling ".aphoria/"
|
||||||
|
let aphoria_dir = if let Some(parent) = path.parent() {
|
||||||
|
if parent.file_name().map(|n| n == ".aphoria").unwrap_or(false) {
|
||||||
|
parent.to_path_buf()
|
||||||
|
} else {
|
||||||
|
parent.join(".aphoria")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
PathBuf::from(".aphoria")
|
||||||
|
};
|
||||||
|
|
||||||
|
let extractors_dir = aphoria_dir.join("extractors");
|
||||||
|
if extractors_dir.is_dir() {
|
||||||
|
load_extractor_files(&mut config.extractors, &extractors_dir);
|
||||||
|
}
|
||||||
|
|
||||||
Ok(config)
|
Ok(config)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Load declarative extractor definitions from `.toml` files in a directory.
|
||||||
|
///
|
||||||
|
/// Merges them into `config.extractors.declarative`. Invalid files produce
|
||||||
|
/// warnings but don't fail the load — one bad file should not break the scan.
|
||||||
|
fn load_extractor_files(extractors: &mut ExtractorConfig, dir: &Path) {
|
||||||
|
let entries = match std::fs::read_dir(dir) {
|
||||||
|
Ok(e) => e,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(dir = %dir.display(), error = %e, "Failed to read extractors directory");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
for entry in entries.flatten() {
|
||||||
|
let path = entry.path();
|
||||||
|
if path.extension().and_then(|e| e.to_str()) != Some("toml") || !path.is_file() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let content = match std::fs::read_to_string(&path) {
|
||||||
|
Ok(c) => c,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(file = %path.display(), error = %e, "Failed to read extractor file");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
match toml::from_str::<ExtractorFileWrapper>(&content) {
|
||||||
|
Ok(wrapper) => {
|
||||||
|
let count = wrapper.extractors.declarative.len();
|
||||||
|
if count > 0 {
|
||||||
|
tracing::debug!(
|
||||||
|
file = %path.display(),
|
||||||
|
count,
|
||||||
|
"Loaded declarative extractors from file"
|
||||||
|
);
|
||||||
|
extractors.declarative.extend(wrapper.extractors.declarative);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
file = %path.display(),
|
||||||
|
error = %e,
|
||||||
|
"Failed to parse extractor file"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -109,3 +109,186 @@ project_id = "test"
|
|||||||
let config: AphoriaConfig = toml::from_str(toml).expect("should parse");
|
let config: AphoriaConfig = toml::from_str(toml).expect("should parse");
|
||||||
assert!(!config.hosted.is_enabled());
|
assert!(!config.hosted.is_enabled());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- Extractor file loading tests ---
|
||||||
|
|
||||||
|
/// A single file under `.aphoria/extractors/` contributes all of its
/// declarative extractors, in file order, to a config with none inline.
#[test]
fn test_from_file_loads_extractor_files() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    let extractor_root = dot_aphoria.join("extractors");
    std::fs::create_dir_all(&extractor_root).expect("create extractors dir");

    // Main config: minimal, no inline extractors.
    let main_config = dot_aphoria.join("config.toml");
    let main_toml = r#"
[project]
name = "test"
"#;
    std::fs::write(&main_config, main_toml).expect("write config");

    // Extractor file declaring two declarative extractors.
    let security_toml = r#"
[[extractors.declarative]]
name = "test_extractor_1"
description = "First test extractor"
languages = ["rust"]
pattern = "unsafe"
confidence = 0.9

[extractors.declarative.claim]
subject = "safety/unsafe_block"
predicate = "present"
value = true

[[extractors.declarative]]
name = "test_extractor_2"
description = "Second test extractor"
languages = ["go"]
pattern = "fmt\\.Println"
confidence = 1.0

[extractors.declarative.claim]
subject = "debug/println"
predicate = "present"
value = true
"#;
    std::fs::write(extractor_root.join("security.toml"), security_toml)
        .expect("write extractor file");

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load config");

    let names: Vec<&str> = loaded
        .extractors
        .declarative
        .iter()
        .map(|def| def.name.as_str())
        .collect();
    assert_eq!(names, ["test_extractor_1", "test_extractor_2"]);
}
|
||||||
|
|
||||||
|
/// Extractors declared inline in the main config and extractors declared in a
/// separate file both end up in the loaded config.
#[test]
fn test_from_file_merges_inline_and_file_extractors() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    let extractor_root = dot_aphoria.join("extractors");
    std::fs::create_dir_all(&extractor_root).expect("create extractors dir");

    // Main config carrying one inline declarative extractor.
    let main_config = dot_aphoria.join("config.toml");
    let main_toml = r#"
[project]
name = "test"

[[extractors.declarative]]
name = "inline_extractor"
languages = ["rust"]
pattern = "todo!"

[extractors.declarative.claim]
subject = "code/todo"
predicate = "present"
value = true
"#;
    std::fs::write(&main_config, main_toml).expect("write config");

    // Separate extractor file carrying one more.
    let extra_toml = r#"
[[extractors.declarative]]
name = "file_extractor"
languages = ["python"]
pattern = "import os"

[extractors.declarative.claim]
subject = "imports/os"
predicate = "present"
value = true
"#;
    std::fs::write(extractor_root.join("extra.toml"), extra_toml).expect("write extractor file");

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load config");
    let defs = &loaded.extractors.declarative;

    assert_eq!(defs.len(), 2);
    assert!(defs.iter().any(|def| def.name.as_str() == "inline_extractor"));
    assert!(defs.iter().any(|def| def.name.as_str() == "file_extractor"));
}
|
||||||
|
|
||||||
|
/// An unparsable extractor file is skipped; valid files in the same directory
/// are still loaded and the overall load succeeds.
#[test]
fn test_extractor_file_with_invalid_toml_warns_but_continues() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    let extractor_root = dot_aphoria.join("extractors");
    std::fs::create_dir_all(&extractor_root).expect("create extractors dir");

    let main_config = dot_aphoria.join("config.toml");
    std::fs::write(&main_config, "[project]\nname = \"test\"\n").expect("write config");

    // A file that is not valid TOML.
    let broken_file = extractor_root.join("bad.toml");
    std::fs::write(broken_file, "this is not valid { toml [").expect("write bad file");

    // A well-formed file with a single extractor.
    let good_toml = r#"
[[extractors.declarative]]
name = "valid_one"
languages = ["rust"]
pattern = "unwrap"

[extractors.declarative.claim]
subject = "safety/unwrap"
predicate = "present"
value = true
"#;
    std::fs::write(extractor_root.join("good.toml"), good_toml).expect("write good file");

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load despite bad file");

    let names: Vec<&str> = loaded
        .extractors
        .declarative
        .iter()
        .map(|def| def.name.as_str())
        .collect();
    assert_eq!(names, ["valid_one"]);
}
|
||||||
|
|
||||||
|
/// Files in the extractors directory that do not have a `.toml` extension are
/// ignored entirely.
#[test]
fn test_extractor_file_non_toml_skipped() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    let extractor_root = dot_aphoria.join("extractors");
    std::fs::create_dir_all(&extractor_root).expect("create extractors dir");

    let main_config = dot_aphoria.join("config.toml");
    std::fs::write(&main_config, "[project]\nname = \"test\"\n").expect("write config");

    // Populate the directory with files that must not be treated as extractors.
    let ignored_files = [
        ("README.md", "# Extractors", "write md"),
        ("notes.yaml", "key: value", "write yaml"),
    ];
    for &(file_name, body, failure) in ignored_files.iter() {
        std::fs::write(extractor_root.join(file_name), body).expect(failure);
    }

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load");
    assert_eq!(loaded.extractors.declarative.len(), 0);
}
|
||||||
|
|
||||||
|
/// Loading succeeds, with no declarative extractors, when `.aphoria/` has no
/// `extractors/` subdirectory at all.
#[test]
fn test_no_extractors_dir_is_fine() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let dot_aphoria = workspace.path().join(".aphoria");
    // Only `.aphoria/` itself is created; `extractors/` is deliberately absent.
    std::fs::create_dir_all(&dot_aphoria).expect("create aphoria dir");

    let main_config = dot_aphoria.join("config.toml");
    std::fs::write(&main_config, "[project]\nname = \"test\"\n").expect("write config");

    let loaded = AphoriaConfig::from_file(&main_config).expect("should load");
    assert_eq!(loaded.extractors.declarative.len(), 0);
}
|
||||||
|
|||||||
@ -7,7 +7,7 @@ use stemedb_storage::KVStore;
|
|||||||
use stemedb_wal::Journal;
|
use stemedb_wal::Journal;
|
||||||
use tokio::sync::Mutex;
|
use tokio::sync::Mutex;
|
||||||
use tokio::task::JoinHandle;
|
use tokio::task::JoinHandle;
|
||||||
use tracing::{debug, error, info, instrument, warn};
|
use tracing::{debug, info, instrument, warn};
|
||||||
|
|
||||||
/// Manager for the background ingestion process.
|
/// Manager for the background ingestion process.
|
||||||
///
|
///
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user