//! Quality validation for researched claims. //! //! Ensures that claims extracted from research meet quality standards before //! being ingested into the corpus. High-quality data is critical for Aphoria's //! accuracy - false positives erode trust. use serde::{Deserialize, Serialize}; use tracing::{debug, info, warn}; use super::researcher::ResearchedClaim; /// Quality validation report for a set of researched claims. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct QualityReport { /// Overall quality score (0.0 to 1.0). pub overall_score: f32, /// Number of claims that passed validation. pub passed: usize, /// Number of claims that failed validation. pub failed: usize, /// Number of claims that passed with warnings. pub warnings: usize, /// Per-claim validation results. pub claim_results: Vec, /// Source attribution score (0.0 to 1.0). pub source_attribution_score: f32, /// Normative language score (0.0 to 1.0). pub normative_language_score: f32, /// Consistency score (0.0 to 1.0). pub consistency_score: f32, } /// Validation result for a single claim. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ClaimValidationResult { /// Subject of the claim. pub subject: String, /// Whether the claim passed validation. pub passed: bool, /// Confidence in this claim's quality. pub confidence: f32, /// Validation issues found. pub issues: Vec, /// Validation warnings (non-fatal). pub warnings: Vec, } /// A validation issue that caused a claim to fail. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ValidationIssue { /// Issue category. pub category: IssueCategory, /// Human-readable description. pub description: String, /// Severity (higher = worse). pub severity: u8, } /// Categories of validation issues. #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] pub enum IssueCategory { /// Missing or invalid source attribution. SourceAttribution, /// Claim lacks normative language (MUST, SHOULD, etc.). 
NormativeLanguage, /// Claim is too vague or generic. VagueContent, /// Claim conflicts with existing corpus. Conflict, /// Subject path is malformed. MalformedSubject, /// Value is invalid or ambiguous. InvalidValue, /// Description is missing or too short. InsufficientDescription, /// Duplicate of existing claim. Duplicate, } /// Validator for researched claims. pub struct QualityValidator { /// Minimum confidence threshold for accepting claims. min_confidence: f32, /// Minimum description length. min_description_len: usize, /// Whether to allow claims without explicit normative language. allow_implicit_normative: bool, } impl Default for QualityValidator { fn default() -> Self { Self { min_confidence: 0.7, min_description_len: 20, allow_implicit_normative: false } } } impl QualityValidator { /// Create a new validator with custom settings. pub fn new(min_confidence: f32) -> Self { Self { min_confidence, ..Default::default() } } /// Create a strict validator (higher thresholds). pub fn strict() -> Self { Self { min_confidence: 0.85, min_description_len: 40, allow_implicit_normative: false } } /// Create a lenient validator (lower thresholds). pub fn lenient() -> Self { Self { min_confidence: 0.5, min_description_len: 10, allow_implicit_normative: true } } /// Validate a batch of researched claims. 
pub fn validate(&self, claims: &[ResearchedClaim]) -> QualityReport { let mut claim_results = Vec::with_capacity(claims.len()); let mut passed = 0; let mut failed = 0; let mut warnings = 0; let mut source_scores = Vec::new(); let mut normative_scores = Vec::new(); for claim in claims { let result = self.validate_claim(claim); if result.passed { passed += 1; if !result.warnings.is_empty() { warnings += 1; } } else { failed += 1; } // Track component scores source_scores.push(self.score_source_attribution(claim)); normative_scores.push(self.score_normative_language(&claim.description)); claim_results.push(result); } let total = claims.len(); let overall_score = if total > 0 { passed as f32 / total as f32 } else { 0.0 }; let source_attribution_score = if source_scores.is_empty() { 0.0 } else { source_scores.iter().sum::() / source_scores.len() as f32 }; let normative_language_score = if normative_scores.is_empty() { 0.0 } else { normative_scores.iter().sum::() / normative_scores.len() as f32 }; // Consistency score: check for conflicting claims let consistency_score = self.score_consistency(claims); info!( total, passed, failed, warnings, overall_score, source_attribution_score, normative_language_score, consistency_score, "Quality validation complete" ); QualityReport { overall_score, passed, failed, warnings, claim_results, source_attribution_score, normative_language_score, consistency_score, } } /// Validate a single claim. 
fn validate_claim(&self, claim: &ResearchedClaim) -> ClaimValidationResult { let mut issues = Vec::new(); let mut validation_warnings = Vec::new(); let mut confidence = claim.confidence; // Check subject path format if !self.is_valid_subject(&claim.subject) { issues.push(ValidationIssue { category: IssueCategory::MalformedSubject, description: format!("Subject path is malformed: {}", claim.subject), severity: 3, }); confidence *= 0.5; } // Check source attribution if claim.source_url.is_empty() { issues.push(ValidationIssue { category: IssueCategory::SourceAttribution, description: "Missing source URL".to_string(), severity: 2, }); confidence *= 0.7; } else if !self.is_authoritative_source(&claim.source_url) { validation_warnings .push(format!("Source may not be authoritative: {}", claim.source_url)); confidence *= 0.9; } // Check description quality if claim.description.len() < self.min_description_len { issues.push(ValidationIssue { category: IssueCategory::InsufficientDescription, description: format!( "Description too short ({} chars, min {})", claim.description.len(), self.min_description_len ), severity: 2, }); confidence *= 0.8; } // Check normative language let has_normative = self.has_normative_language(&claim.description); if !has_normative && !self.allow_implicit_normative { issues.push(ValidationIssue { category: IssueCategory::NormativeLanguage, description: "Description lacks normative language (MUST, SHOULD, etc.)" .to_string(), severity: 2, }); confidence *= 0.8; } else if !has_normative { validation_warnings.push("Implicit normative statement (no MUST/SHOULD)".to_string()); } // Check for vague content if self.is_vague_content(&claim.description) { issues.push(ValidationIssue { category: IssueCategory::VagueContent, description: "Content is too vague or generic".to_string(), severity: 2, }); confidence *= 0.7; } // Determine pass/fail let passed = issues.is_empty() || confidence >= self.min_confidence; if !passed { debug!( subject = %claim.subject, 
confidence, issues = issues.len(), "Claim failed validation" ); } ClaimValidationResult { subject: claim.subject.clone(), passed, confidence: confidence.min(1.0), issues, warnings: validation_warnings, } } /// Check if a subject path is valid. fn is_valid_subject(&self, subject: &str) -> bool { // Must have scheme://path format if !subject.contains("://") { return false; } // Must have at least 2 path segments let path = subject.find("://").map(|i| &subject[i + 3..]).unwrap_or(""); let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); segments.len() >= 2 } /// Check if a source URL is from an authoritative domain. fn is_authoritative_source(&self, url: &str) -> bool { let authoritative_domains = [ "rfc-editor.org", "ietf.org", "owasp.org", "nist.gov", "w3.org", "postgresql.org", "redis.io", "docs.rs", "go.dev", "python.org", "rust-lang.org", "apache.org", "microsoft.com/docs", "aws.amazon.com/docs", "cloud.google.com/docs", "developer.mozilla.org", ]; authoritative_domains.iter().any(|domain| url.contains(domain)) } /// Check if text contains normative language. fn has_normative_language(&self, text: &str) -> bool { let upper = text.to_uppercase(); let normative_keywords = ["MUST", "SHALL", "SHOULD", "REQUIRED", "RECOMMENDED", "MAY NOT"]; normative_keywords.iter().any(|kw| upper.contains(kw)) } /// Check if content is too vague. fn is_vague_content(&self, text: &str) -> bool { let vague_phrases = [ "should be configured", "it depends", "varies", "may or may not", "could be", "might be", "typically", "usually", "often", "sometimes", "in some cases", ]; let lower = text.to_lowercase(); let vague_count = vague_phrases.iter().filter(|p| lower.contains(*p)).count(); // Too vague if more than 2 vague phrases or text is very short with any vague phrase vague_count > 2 || (text.len() < 50 && vague_count > 0) } /// Score source attribution (0.0 to 1.0). 
fn score_source_attribution(&self, claim: &ResearchedClaim) -> f32 { if claim.source_url.is_empty() { return 0.0; } let mut score: f32 = 0.5; // Base score for having a URL if self.is_authoritative_source(&claim.source_url) { score += 0.3; } if !claim.source_section.is_empty() { score += 0.1; } if claim.source_url.starts_with("https://") { score += 0.1; } score.min(1.0) } /// Score normative language (0.0 to 1.0). fn score_normative_language(&self, text: &str) -> f32 { let upper = text.to_uppercase(); // Strong normative = higher score if upper.contains("MUST") || upper.contains("SHALL") || upper.contains("REQUIRED") { return 1.0; } if upper.contains("SHOULD") || upper.contains("RECOMMENDED") { return 0.8; } if upper.contains("MAY NOT") { return 0.7; } if upper.contains("MAY") { return 0.5; } // Implicit recommendations if text.to_lowercase().contains("recommended") || text.to_lowercase().contains("best practice") { return 0.4; } 0.2 } /// Score consistency among claims (0.0 to 1.0). fn score_consistency(&self, claims: &[ResearchedClaim]) -> f32 { if claims.len() < 2 { return 1.0; } // Check for conflicting claims on the same subject+predicate let mut subject_values: std::collections::HashMap> = std::collections::HashMap::new(); for claim in claims { let key = format!("{}::{}", claim.subject, claim.predicate); subject_values.entry(key).or_default().push(claim); } let mut conflicts = 0; for (key, claims_for_key) in &subject_values { if claims_for_key.len() > 1 { // Check if values differ let first_value = &claims_for_key[0].value; for claim in claims_for_key.iter().skip(1) { if &claim.value != first_value { warn!(key, "Conflicting claims detected"); conflicts += 1; } } } } if conflicts == 0 { 1.0 } else { (1.0 - (conflicts as f32 / claims.len() as f32)).max(0.0) } } /// Filter claims to only those that passed validation. 
///
    /// Runs [`QualityValidator::validate`] over the whole batch, then pairs each
    /// claim with its result by position and keeps, in input order, the claims
    /// whose result is marked as passed.
    pub fn filter_passed(&self, claims: Vec<ResearchedClaim>) -> Vec<ResearchedClaim> {
        let report = self.validate(&claims);

        claims
            .into_iter()
            .zip(report.claim_results)
            .filter(|(_, result)| result.passed)
            .map(|(claim, _)| claim)
            .collect()
    }
}

#[cfg(test)]
#[path = "quality_tests.rs"]
mod tests;