Enterprise Features: - Hosted mode with remote sync for team pattern aggregation - Community sharing with privacy-preserving anonymization - LLM-based semantic claim extraction with Gemini integration - Pattern learning with promotion to declarative extractors - High-entropy secrets extractor with configurable thresholds - Auth bypass and insecure cookies extractors Module Refactoring: - Split oversized files to comply with 500-line limit - Config split: types/core.rs, types/extractors.rs, types/hosted.rs, etc. - Handlers split: scan.rs, policy.rs, report.rs modules - Extractors split: declarative/, high_entropy_secrets/, insecure_cookies/ - Learning split: store modules with metrics and persistence SDK & Ontology: - stemedb-ontology SDK with fluent builders and StemeDB client - Pharma domain extractors for FDA Orange Book data - Consumer health UAT test infrastructure Code Quality: - Fixed clippy warnings (needless_borrows_for_generic_args) - Added KVStore trait imports where needed - Fixed utoipa path re-exports for OpenAPI docs Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
378 lines
13 KiB
Rust
378 lines
13 KiB
Rust
//! Promotion pipeline for converting learned patterns to declarative extractors.
|
|
//!
|
|
//! Orchestrates the full promotion flow: candidates → regex generation → validation → YAML output.
|
|
|
|
use std::path::PathBuf;
|
|
|
|
use tracing::{debug, info, warn};
|
|
use uuid::Uuid;
|
|
|
|
use super::regex_gen::RegexGenerator;
|
|
use super::types::{PromotionCandidate, PromotionStats, ValidationResult};
|
|
use super::validator::ExtractorValidator;
|
|
use super::writer::YamlWriter;
|
|
use crate::config::PromotionConfig;
|
|
use crate::learning::{LearnedPattern, PatternStore};
|
|
use crate::llm::GeminiClient;
|
|
use crate::AphoriaError;
|
|
|
|
/// The promotion pipeline orchestrates pattern-to-extractor conversion.
|
|
pub struct PromotionPipeline<'a, S: PatternStore> {
|
|
/// Pattern store for fetching candidates.
|
|
store: &'a S,
|
|
|
|
/// LLM client for regex generation.
|
|
client: Option<&'a GeminiClient>,
|
|
|
|
/// Configuration for promotion thresholds.
|
|
config: &'a PromotionConfig,
|
|
|
|
/// Validator for testing generated extractors.
|
|
validator: ExtractorValidator,
|
|
|
|
/// YAML writer for output.
|
|
writer: Option<YamlWriter>,
|
|
}
|
|
|
|
impl<'a, S: PatternStore> PromotionPipeline<'a, S> {
|
|
/// Create a new promotion pipeline.
|
|
///
|
|
/// If `output_dir` is None, uses the default `.aphoria/extractors/learned/`.
|
|
pub fn new(
|
|
store: &'a S,
|
|
client: Option<&'a GeminiClient>,
|
|
config: &'a PromotionConfig,
|
|
output_dir: Option<PathBuf>,
|
|
) -> Result<Self, AphoriaError> {
|
|
let writer = if let Some(dir) = output_dir { Some(YamlWriter::new(dir)?) } else { None };
|
|
|
|
Ok(Self { store, client, config, validator: ExtractorValidator::default(), writer })
|
|
}
|
|
|
|
/// Get patterns eligible for promotion.
|
|
///
|
|
/// Returns patterns that meet the configured thresholds for project count
|
|
/// and confidence.
|
|
pub fn get_candidates(&self) -> Vec<LearnedPattern> {
|
|
self.store.get_promotion_candidates(self.config.min_projects, self.config.min_confidence)
|
|
}
|
|
|
|
/// Generate a promotion candidate from a learned pattern.
|
|
///
|
|
/// Uses the LLM to generate a regex pattern and validates it.
|
|
pub fn generate_candidate(
|
|
&self,
|
|
pattern: &LearnedPattern,
|
|
) -> Result<PromotionCandidate, AphoriaError> {
|
|
let client = self.client.ok_or_else(|| {
|
|
AphoriaError::Promotion("LLM client not configured for regex generation".to_string())
|
|
})?;
|
|
|
|
// Generate extractor definition using LLM
|
|
let generator = RegexGenerator::new(client);
|
|
let extractor_def = generator.generate(pattern)?;
|
|
|
|
// Validate the generated extractor
|
|
let validation = self.validator.validate(&extractor_def, pattern)?;
|
|
|
|
Ok(PromotionCandidate::new(pattern.clone(), extractor_def, validation))
|
|
}
|
|
|
|
/// Promote a candidate by writing it to YAML and marking the pattern as promoted.
|
|
///
|
|
/// Returns the path to the written YAML file.
|
|
pub fn promote(&self, candidate: &PromotionCandidate) -> Result<PathBuf, AphoriaError> {
|
|
// Check if candidate is ready
|
|
if !candidate.is_ready() {
|
|
return Err(AphoriaError::Promotion(format!(
|
|
"Candidate {} is not ready for promotion: validation={}, performance={}",
|
|
candidate.pattern_id(),
|
|
candidate.validation.passed,
|
|
candidate.validation.performance_ok
|
|
)));
|
|
}
|
|
|
|
// Get or create writer
|
|
let writer = if let Some(ref w) = self.writer {
|
|
w
|
|
} else {
|
|
return Err(AphoriaError::Promotion("YAML writer not configured".to_string()));
|
|
};
|
|
|
|
// Check if already exists
|
|
if writer.exists(candidate.extractor_name()) {
|
|
return Err(AphoriaError::Promotion(format!(
|
|
"Extractor '{}' already exists",
|
|
candidate.extractor_name()
|
|
)));
|
|
}
|
|
|
|
// Write YAML file
|
|
let path = writer.write(&candidate.extractor_def, &candidate.pattern)?;
|
|
|
|
// Mark pattern as promoted
|
|
self.store.mark_promoted(&candidate.pattern_id(), candidate.extractor_name())?;
|
|
|
|
info!(
|
|
pattern_id = %candidate.pattern_id(),
|
|
extractor = %candidate.extractor_name(),
|
|
path = %path.display(),
|
|
"Pattern promoted to extractor"
|
|
);
|
|
|
|
Ok(path)
|
|
}
|
|
|
|
/// Process all eligible patterns and return promotion candidates.
|
|
///
|
|
/// Generates and validates extractors for each eligible pattern.
|
|
/// Does not actually promote (write YAML) - use `promote()` for that.
|
|
pub fn process_all(&self) -> Vec<Result<PromotionCandidate, AphoriaError>> {
|
|
let patterns = self.get_candidates();
|
|
|
|
debug!(count = patterns.len(), "Processing promotion candidates");
|
|
|
|
patterns.iter().map(|pattern| self.generate_candidate(pattern)).collect()
|
|
}
|
|
|
|
/// Auto-promote all ready candidates.
|
|
///
|
|
/// Only runs if `auto_promote` is enabled in config.
|
|
/// Returns the number of patterns promoted and any errors.
|
|
pub fn auto_promote_all(&self) -> (usize, Vec<AphoriaError>) {
|
|
if !self.config.auto_promote {
|
|
warn!("auto_promote is disabled in config");
|
|
return (0, vec![]);
|
|
}
|
|
|
|
let candidates = self.process_all();
|
|
let mut promoted = 0;
|
|
let mut errors = Vec::new();
|
|
|
|
for result in candidates {
|
|
match result {
|
|
Ok(candidate) if candidate.is_ready() => match self.promote(&candidate) {
|
|
Ok(_) => promoted += 1,
|
|
Err(e) => errors.push(e),
|
|
},
|
|
Ok(candidate) => {
|
|
debug!(
|
|
pattern_id = %candidate.pattern_id(),
|
|
"Candidate not ready for auto-promotion"
|
|
);
|
|
}
|
|
Err(e) => errors.push(e),
|
|
}
|
|
}
|
|
|
|
(promoted, errors)
|
|
}
|
|
|
|
/// Get statistics about the promotion pipeline.
|
|
pub fn stats(&self) -> PromotionStats {
|
|
let all_patterns: Vec<LearnedPattern> = self.store.get_promotion_candidates(0, 0.0); // Get all patterns
|
|
|
|
let eligible = self.get_candidates();
|
|
let promoted: Vec<_> = all_patterns.iter().filter(|p| p.promoted).collect();
|
|
|
|
let avg_confidence = if eligible.is_empty() {
|
|
0.0
|
|
} else {
|
|
eligible.iter().map(|p| p.avg_confidence).sum::<f32>() / eligible.len() as f32
|
|
};
|
|
|
|
let avg_projects = if eligible.is_empty() {
|
|
0.0
|
|
} else {
|
|
eligible.iter().map(|p| p.project_count() as f32).sum::<f32>() / eligible.len() as f32
|
|
};
|
|
|
|
PromotionStats {
|
|
total_patterns: all_patterns.len(),
|
|
eligible_patterns: eligible.len(),
|
|
promoted_patterns: promoted.len(),
|
|
pending_review: eligible.len().saturating_sub(promoted.len()),
|
|
avg_confidence,
|
|
avg_projects,
|
|
}
|
|
}
|
|
|
|
/// Promote a specific pattern by ID.
|
|
pub fn promote_by_id(&self, pattern_id: &Uuid) -> Result<PathBuf, AphoriaError> {
|
|
// Find the pattern
|
|
let candidates = self.get_candidates();
|
|
let pattern = candidates.iter().find(|p| &p.id == pattern_id).ok_or_else(|| {
|
|
AphoriaError::Promotion(format!("Pattern {} not found in candidates", pattern_id))
|
|
})?;
|
|
|
|
// Generate and validate
|
|
let candidate = self.generate_candidate(pattern)?;
|
|
|
|
// Promote
|
|
self.promote(&candidate)
|
|
}
|
|
|
|
/// Validate a pattern without promoting it.
|
|
///
|
|
/// Returns the validation result for inspection.
|
|
pub fn validate_pattern(
|
|
&self,
|
|
pattern: &LearnedPattern,
|
|
) -> Result<ValidationResult, AphoriaError> {
|
|
let client = self.client.ok_or_else(|| {
|
|
AphoriaError::Promotion("LLM client not configured for regex generation".to_string())
|
|
})?;
|
|
|
|
let generator = RegexGenerator::new(client);
|
|
let extractor_def = generator.generate(pattern)?;
|
|
|
|
self.validator.validate(&extractor_def, pattern)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::PromotionConfig;
    use crate::learning::{ClaimTemplate, LocalPatternStore, ValueType};
    use crate::types::Language;
    use chrono::Utc;
    use tempfile::TempDir;

    /// Build a pattern store inside a fresh temporary directory.
    ///
    /// The `TempDir` is returned alongside the store so the directory stays
    /// alive for as long as the caller keeps the guard.
    fn fresh_store() -> (TempDir, LocalPatternStore) {
        let dir = TempDir::new().expect("temp dir");
        let store = LocalPatternStore::new(dir.path()).expect("create store");
        (dir, store)
    }

    /// A pattern first seen in `project1` and then observed in `project2`..`project6`.
    fn create_eligible_pattern() -> LearnedPattern {
        let template =
            ClaimTemplate::new("ssl/verify", "enabled", ValueType::Boolean, "SSL verification");
        let mut learned = LearnedPattern::new(
            "verify_ssl = false",
            "verify_ssl = <boolean>",
            template,
            Language::Python,
            "project1",
            0.9,
        );

        // Add enough projects to meet threshold
        (2..=6).for_each(|n| {
            learned.record_observation(format!("project{}", n), 0.85, Utc::now());
        });

        learned
    }

    /// A pattern seen in a single project only, so it lacks the extra observations.
    fn create_single_project_pattern() -> LearnedPattern {
        LearnedPattern::new(
            "test = true",
            "test = <boolean>",
            ClaimTemplate::new("test", "value", ValueType::Boolean, "Test"),
            Language::Rust,
            "project1",
            0.9,
        )
    }

    #[test]
    fn test_pipeline_creation() {
        let (dir, store) = fresh_store();
        let cfg = PromotionConfig::default();

        let built = PromotionPipeline::new(&store, None, &cfg, Some(dir.path().to_path_buf()));

        assert!(built.is_ok());
    }

    #[test]
    fn test_get_candidates_empty() {
        let (_dir, store) = fresh_store();
        let cfg = PromotionConfig::default();
        let pipeline = PromotionPipeline::new(&store, None, &cfg, None).expect("create pipeline");

        assert!(pipeline.get_candidates().is_empty());
    }

    #[test]
    fn test_get_candidates_with_eligible() {
        let (_dir, store) = fresh_store();
        let cfg = PromotionConfig::default();

        // Add eligible pattern
        store.record_pattern(&create_eligible_pattern(), None).expect("record");

        let pipeline = PromotionPipeline::new(&store, None, &cfg, None).expect("create pipeline");

        assert_eq!(pipeline.get_candidates().len(), 1);
    }

    #[test]
    fn test_stats_empty_store() {
        let (_dir, store) = fresh_store();
        let cfg = PromotionConfig::default();
        let pipeline = PromotionPipeline::new(&store, None, &cfg, None).expect("create pipeline");

        let summary = pipeline.stats();

        assert_eq!(summary.total_patterns, 0);
        assert_eq!(summary.eligible_patterns, 0);
        assert_eq!(summary.promoted_patterns, 0);
    }

    #[test]
    fn test_stats_with_patterns() {
        let (_dir, store) = fresh_store();
        let cfg = PromotionConfig::default();

        // Add eligible pattern
        store.record_pattern(&create_eligible_pattern(), None).expect("record");

        // Add non-eligible pattern (not enough projects)
        store.record_pattern(&create_single_project_pattern(), None).expect("record");

        let pipeline = PromotionPipeline::new(&store, None, &cfg, None).expect("create pipeline");
        let summary = pipeline.stats();

        assert_eq!(summary.eligible_patterns, 1);
        assert_eq!(summary.pending_review, 1);
    }

    #[test]
    fn test_generate_candidate_requires_client() {
        let (_dir, store) = fresh_store();
        let cfg = PromotionConfig::default();
        let learned = create_eligible_pattern();
        let pipeline = PromotionPipeline::new(&store, None, &cfg, None).expect("create pipeline");

        let outcome = pipeline.generate_candidate(&learned);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("LLM client not configured"));
    }

    #[test]
    fn test_auto_promote_disabled() {
        let (_dir, store) = fresh_store();
        let cfg = PromotionConfig { auto_promote: false, ..Default::default() };
        let pipeline = PromotionPipeline::new(&store, None, &cfg, None).expect("create pipeline");

        let (count, failures) = pipeline.auto_promote_all();

        assert_eq!(count, 0);
        assert!(failures.is_empty());
    }
}
|