stemedb/applications/aphoria/src/promotion/pipeline.rs
jordan 41c676a78e feat: Aphoria enterprise features + ontology SDK + file length compliance
Enterprise Features:
- Hosted mode with remote sync for team pattern aggregation
- Community sharing with privacy-preserving anonymization
- LLM-based semantic claim extraction with Gemini integration
- Pattern learning with promotion to declarative extractors
- High-entropy secrets extractor with configurable thresholds
- Auth bypass and insecure cookies extractors

Module Refactoring:
- Split oversized files to comply with 500-line limit
- Config split: types/core.rs, types/extractors.rs, types/hosted.rs, etc.
- Handlers split: scan.rs, policy.rs, report.rs modules
- Extractors split: declarative/, high_entropy_secrets/, insecure_cookies/
- Learning split: store modules with metrics and persistence

SDK & Ontology:
- stemedb-ontology SDK with fluent builders and StemeDB client
- Pharma domain extractors for FDA Orange Book data
- Consumer health UAT test infrastructure

Code Quality:
- Fixed clippy warnings (needless_borrows_for_generic_args)
- Added KVStore trait imports where needed
- Fixed utoipa path re-exports for OpenAPI docs

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 12:55:29 -07:00

378 lines
13 KiB
Rust

//! Promotion pipeline for converting learned patterns to declarative extractors.
//!
//! Orchestrates the full promotion flow: candidates → regex generation → validation → YAML output.
use std::path::PathBuf;
use tracing::{debug, info, warn};
use uuid::Uuid;
use super::regex_gen::RegexGenerator;
use super::types::{PromotionCandidate, PromotionStats, ValidationResult};
use super::validator::ExtractorValidator;
use super::writer::YamlWriter;
use crate::config::PromotionConfig;
use crate::learning::{LearnedPattern, PatternStore};
use crate::llm::GeminiClient;
use crate::AphoriaError;
/// Drives the conversion of learned patterns into declarative extractors.
///
/// The store, the optional LLM client, and the configuration are borrowed for
/// the lifetime `'a`; the validator and the optional YAML writer are owned.
pub struct PromotionPipeline<'a, S>
where
    S: PatternStore,
{
    /// Source of learned patterns; also where promotions are recorded.
    store: &'a S,
    /// LLM client used for regex generation. `None` means generation is unavailable.
    client: Option<&'a GeminiClient>,
    /// Promotion thresholds and the `auto_promote` switch.
    config: &'a PromotionConfig,
    /// Checks a generated extractor definition against its source pattern.
    validator: ExtractorValidator,
    /// Writes promoted extractors as YAML. `None` when no output directory was supplied.
    writer: Option<YamlWriter>,
}
impl<'a, S: PatternStore> PromotionPipeline<'a, S> {
/// Create a new promotion pipeline.
///
/// `store`, `client`, and `config` are borrowed for the lifetime of the
/// pipeline. The validator is `ExtractorValidator::default()`.
///
/// When `output_dir` is `Some`, a [`YamlWriter`] is created for that
/// directory. When it is `None`, no writer is configured and no default
/// directory is substituted: candidates can still be listed, generated, and
/// validated, but [`promote`](Self::promote) returns an error.
///
/// # Errors
///
/// Returns an error if `YamlWriter::new` fails for the given directory.
pub fn new(
    store: &'a S,
    client: Option<&'a GeminiClient>,
    config: &'a PromotionConfig,
    output_dir: Option<PathBuf>,
) -> Result<Self, AphoriaError> {
    // Build the writer only when a directory was supplied. `transpose` turns
    // `Option<Result<_, _>>` into `Result<Option<_>, _>` so `?` can propagate.
    let writer = output_dir.map(YamlWriter::new).transpose()?;
    Ok(Self { store, client, config, validator: ExtractorValidator::default(), writer })
}
/// List the learned patterns that currently qualify for promotion.
///
/// Queries the store for promotion candidates using the `min_projects` and
/// `min_confidence` thresholds from the configuration.
pub fn get_candidates(&self) -> Vec<LearnedPattern> {
    let min_projects = self.config.min_projects;
    let min_confidence = self.config.min_confidence;
    self.store.get_promotion_candidates(min_projects, min_confidence)
}
/// Build a promotion candidate for `pattern`.
///
/// Asks the LLM-backed [`RegexGenerator`] for an extractor definition,
/// validates that definition against the pattern, and bundles the definition
/// and validation result together with a clone of the pattern.
///
/// # Errors
///
/// Fails if no LLM client is configured, or if generation or validation
/// returns an error.
pub fn generate_candidate(
    &self,
    pattern: &LearnedPattern,
) -> Result<PromotionCandidate, AphoriaError> {
    // Without a client there is nothing to generate the regex with.
    let llm = match self.client {
        Some(llm) => llm,
        None => {
            return Err(AphoriaError::Promotion(
                "LLM client not configured for regex generation".to_string(),
            ));
        }
    };

    // Step 1: have the LLM produce the extractor definition.
    let generator = RegexGenerator::new(llm);
    let definition = generator.generate(pattern)?;

    // Step 2: check the definition against the pattern it came from.
    let validation = self.validator.validate(&definition, pattern)?;

    Ok(PromotionCandidate::new(pattern.clone(), definition, validation))
}
/// Promote a candidate by writing it to YAML and marking the pattern as promoted.
///
/// Returns the path to the written YAML file.
pub fn promote(&self, candidate: &PromotionCandidate) -> Result<PathBuf, AphoriaError> {
// Check if candidate is ready
if !candidate.is_ready() {
return Err(AphoriaError::Promotion(format!(
"Candidate {} is not ready for promotion: validation={}, performance={}",
candidate.pattern_id(),
candidate.validation.passed,
candidate.validation.performance_ok
)));
}
// Get or create writer
let writer = if let Some(ref w) = self.writer {
w
} else {
return Err(AphoriaError::Promotion("YAML writer not configured".to_string()));
};
// Check if already exists
if writer.exists(candidate.extractor_name()) {
return Err(AphoriaError::Promotion(format!(
"Extractor '{}' already exists",
candidate.extractor_name()
)));
}
// Write YAML file
let path = writer.write(&candidate.extractor_def, &candidate.pattern)?;
// Mark pattern as promoted
self.store.mark_promoted(&candidate.pattern_id(), candidate.extractor_name())?;
info!(
pattern_id = %candidate.pattern_id(),
extractor = %candidate.extractor_name(),
path = %path.display(),
"Pattern promoted to extractor"
);
Ok(path)
}
/// Generate and validate a candidate for every eligible pattern.
///
/// The returned vector has one entry per eligible pattern, in the order the
/// store returned them; each entry is either the candidate or the error from
/// generating it. Nothing is written to disk — call `promote()` for that.
pub fn process_all(&self) -> Vec<Result<PromotionCandidate, AphoriaError>> {
    let eligible = self.get_candidates();
    debug!(count = eligible.len(), "Processing promotion candidates");

    let mut outcomes = Vec::with_capacity(eligible.len());
    for pattern in &eligible {
        outcomes.push(self.generate_candidate(pattern));
    }
    outcomes
}
/// Promote every candidate that is ready, if auto-promotion is enabled.
///
/// When `auto_promote` is off in the configuration, logs a warning and
/// returns `(0, [])` without touching the store.
///
/// Otherwise returns the number of patterns promoted together with every
/// error from candidate generation or promotion. Candidates that are
/// generated successfully but are not ready are skipped with a debug log and
/// do not count as errors.
pub fn auto_promote_all(&self) -> (usize, Vec<AphoriaError>) {
    if !self.config.auto_promote {
        warn!("auto_promote is disabled in config");
        return (0, Vec::new());
    }

    let mut promoted_count = 0;
    let mut failures = Vec::new();

    for outcome in self.process_all() {
        // Generation failures are reported to the caller, not dropped.
        let candidate = match outcome {
            Ok(candidate) => candidate,
            Err(err) => {
                failures.push(err);
                continue;
            }
        };

        if !candidate.is_ready() {
            debug!(
                pattern_id = %candidate.pattern_id(),
                "Candidate not ready for auto-promotion"
            );
            continue;
        }

        match self.promote(&candidate) {
            Ok(_) => promoted_count += 1,
            Err(err) => failures.push(err),
        }
    }

    (promoted_count, failures)
}
/// Summarize the current state of the promotion pipeline.
///
/// - `total_patterns`: patterns the store returns with zero thresholds.
/// - `eligible_patterns`: patterns meeting the configured thresholds.
/// - `promoted_patterns`: patterns in the zero-threshold set flagged `promoted`.
/// - `pending_review`: eligible patterns that are not flagged `promoted`.
/// - `avg_confidence` / `avg_projects`: means over the eligible patterns,
///   or `0.0` when there are none.
pub fn stats(&self) -> PromotionStats {
    // Zero thresholds: ask for every pattern the store is willing to report.
    // NOTE(review): whether this includes already-promoted patterns depends on
    // the store's `get_promotion_candidates` implementation — confirm.
    let all_patterns: Vec<LearnedPattern> = self.store.get_promotion_candidates(0, 0.0);
    let eligible = self.get_candidates();

    let promoted_patterns = all_patterns.iter().filter(|p| p.promoted).count();

    // Count pending patterns inside the eligible set itself. Subtracting the
    // global promoted count from the eligible count mixes two populations and
    // under-reports whenever a promoted pattern is not currently eligible.
    let pending_review = eligible.iter().filter(|p| !p.promoted).count();

    // Guard the division: an empty eligible set averages to zero.
    let (avg_confidence, avg_projects) = if eligible.is_empty() {
        (0.0, 0.0)
    } else {
        let n = eligible.len() as f32;
        let confidence_sum: f32 = eligible.iter().map(|p| p.avg_confidence).sum();
        let project_sum: f32 = eligible.iter().map(|p| p.project_count() as f32).sum();
        (confidence_sum / n, project_sum / n)
    };

    PromotionStats {
        total_patterns: all_patterns.len(),
        eligible_patterns: eligible.len(),
        promoted_patterns,
        pending_review,
        avg_confidence,
        avg_projects,
    }
}
/// Promote the eligible pattern whose id equals `pattern_id`.
///
/// Only patterns returned by `get_candidates()` are considered. The pattern
/// is run through candidate generation and then promoted.
///
/// # Errors
///
/// Returns an error if no eligible pattern has that id, or if generating or
/// promoting the candidate fails.
pub fn promote_by_id(&self, pattern_id: &Uuid) -> Result<PathBuf, AphoriaError> {
    let eligible = self.get_candidates();

    // Look the pattern up among the currently eligible ones only.
    let pattern = match eligible.iter().find(|p| p.id == *pattern_id) {
        Some(found) => found,
        None => {
            return Err(AphoriaError::Promotion(format!(
                "Pattern {} not found in candidates",
                pattern_id
            )));
        }
    };

    let candidate = self.generate_candidate(pattern)?;
    self.promote(&candidate)
}
/// Generate an extractor for `pattern` and return only its validation result.
///
/// Nothing is written and the store is not modified.
///
/// # Errors
///
/// Fails if no LLM client is configured, or if generation or validation
/// returns an error.
pub fn validate_pattern(
    &self,
    pattern: &LearnedPattern,
) -> Result<ValidationResult, AphoriaError> {
    if let Some(llm) = self.client {
        let generator = RegexGenerator::new(llm);
        let definition = generator.generate(pattern)?;
        self.validator.validate(&definition, pattern)
    } else {
        Err(AphoriaError::Promotion(
            "LLM client not configured for regex generation".to_string(),
        ))
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::PromotionConfig;
    use crate::learning::{ClaimTemplate, LocalPatternStore, ValueType};
    use crate::types::Language;
    use chrono::Utc;
    use tempfile::TempDir;

    /// Open a `LocalPatternStore` rooted in the given temporary directory.
    fn create_test_store(temp: &TempDir) -> LocalPatternStore {
        LocalPatternStore::new(temp.path()).expect("create store")
    }

    /// Build a pattern first seen in `project1` and then observed in
    /// `project2` through `project6`, which the default config treats as
    /// eligible (see `test_get_candidates_with_eligible`).
    fn create_eligible_pattern() -> LearnedPattern {
        let template =
            ClaimTemplate::new("ssl/verify", "enabled", ValueType::Boolean, "SSL verification");
        let mut pattern = LearnedPattern::new(
            "verify_ssl = false",
            "verify_ssl = <boolean>",
            template,
            Language::Python,
            "project1",
            0.9,
        );
        // Five more distinct projects on top of the initial one.
        (2..=6).for_each(|i| {
            pattern.record_observation(format!("project{}", i), 0.85, Utc::now());
        });
        pattern
    }

    /// Build a pipeline with neither an LLM client nor a YAML writer.
    fn pipeline_without_client<'a>(
        store: &'a LocalPatternStore,
        config: &'a PromotionConfig,
    ) -> PromotionPipeline<'a, LocalPatternStore> {
        PromotionPipeline::new(store, None, config, None).expect("create pipeline")
    }

    #[test]
    fn test_pipeline_creation() {
        let temp = TempDir::new().expect("temp dir");
        let store = create_test_store(&temp);
        let config = PromotionConfig::default();

        let outcome =
            PromotionPipeline::new(&store, None, &config, Some(temp.path().to_path_buf()));

        assert!(outcome.is_ok());
    }

    #[test]
    fn test_get_candidates_empty() {
        let temp = TempDir::new().expect("temp dir");
        let store = create_test_store(&temp);
        let config = PromotionConfig::default();
        let pipeline = pipeline_without_client(&store, &config);

        assert!(pipeline.get_candidates().is_empty());
    }

    #[test]
    fn test_get_candidates_with_eligible() {
        let temp = TempDir::new().expect("temp dir");
        let store = create_test_store(&temp);
        let config = PromotionConfig::default();

        // Seed the store with one pattern that meets the thresholds.
        let eligible = create_eligible_pattern();
        store.record_pattern(&eligible, None).expect("record");

        let pipeline = pipeline_without_client(&store, &config);

        assert_eq!(pipeline.get_candidates().len(), 1);
    }

    #[test]
    fn test_stats_empty_store() {
        let temp = TempDir::new().expect("temp dir");
        let store = create_test_store(&temp);
        let config = PromotionConfig::default();
        let pipeline = pipeline_without_client(&store, &config);

        let summary = pipeline.stats();

        assert_eq!(summary.total_patterns, 0);
        assert_eq!(summary.eligible_patterns, 0);
        assert_eq!(summary.promoted_patterns, 0);
    }

    #[test]
    fn test_stats_with_patterns() {
        let temp = TempDir::new().expect("temp dir");
        let store = create_test_store(&temp);
        let config = PromotionConfig::default();

        // One pattern that meets the thresholds...
        let eligible = create_eligible_pattern();
        store.record_pattern(&eligible, None).expect("record");

        // ...and one seen in a single project only, so it should not qualify.
        let single_project = LearnedPattern::new(
            "test = true",
            "test = <boolean>",
            ClaimTemplate::new("test", "value", ValueType::Boolean, "Test"),
            Language::Rust,
            "project1",
            0.9,
        );
        store.record_pattern(&single_project, None).expect("record");

        let pipeline = pipeline_without_client(&store, &config);
        let summary = pipeline.stats();

        assert_eq!(summary.eligible_patterns, 1);
        assert_eq!(summary.pending_review, 1);
    }

    #[test]
    fn test_generate_candidate_requires_client() {
        let temp = TempDir::new().expect("temp dir");
        let store = create_test_store(&temp);
        let config = PromotionConfig::default();
        let pattern = create_eligible_pattern();
        let pipeline = pipeline_without_client(&store, &config);

        let outcome = pipeline.generate_candidate(&pattern);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("LLM client not configured"));
    }

    #[test]
    fn test_auto_promote_disabled() {
        let temp = TempDir::new().expect("temp dir");
        let store = create_test_store(&temp);
        let config = PromotionConfig { auto_promote: false, ..Default::default() };
        let pipeline = pipeline_without_client(&store, &config);

        let (promoted, errors) = pipeline.auto_promote_all();

        assert_eq!(promoted, 0);
        assert!(errors.is_empty());
    }
}