// stemedb/applications/aphoria/src/research/quality.rs

//! Quality validation for researched claims.
//!
//! Ensures that claims extracted from research meet quality standards before
//! being ingested into the corpus. High-quality data is critical for Aphoria's
//! accuracy - false positives erode trust.

use serde::{Deserialize, Serialize};
use tracing::{debug, info, warn};

use super::researcher::ResearchedClaim;

/// Aggregate quality report for one batch of researched claims.
///
/// Built by `QualityValidator::validate`. Every score is in the range 0.0 to 1.0.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityReport {
    /// Overall quality score (0.0 to 1.0): the fraction of claims that passed,
    /// or 0.0 when the batch was empty.
    pub overall_score: f32,

    /// Number of claims that passed validation (with or without warnings).
    pub passed: usize,

    /// Number of claims that failed validation.
    pub failed: usize,

    /// Number of passing claims that carry at least one warning.
    pub warnings: usize,

    /// Per-claim validation results, one entry per input claim in input order.
    pub claim_results: Vec<ClaimValidationResult>,

    /// Source attribution score (0.0 to 1.0), averaged over all claims in the batch.
    pub source_attribution_score: f32,

    /// Normative language score (0.0 to 1.0), averaged over all claims in the batch.
    pub normative_language_score: f32,

    /// Consistency score (0.0 to 1.0). 1.0 means no two claims in the batch gave
    /// different values for the same subject and predicate.
    pub consistency_score: f32,
}

/// Validation outcome for a single claim.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClaimValidationResult {
    /// Subject of the claim, copied from the claim that was validated.
    pub subject: String,

    /// Whether the claim passed validation.
    pub passed: bool,

    /// Confidence in this claim's quality: the claim's own confidence after the
    /// penalties applied during validation, never reported above 1.0.
    pub confidence: f32,

    /// Validation issues found (empty when no check recorded a problem).
    pub issues: Vec<ValidationIssue>,

    /// Validation warnings (non-fatal), as human-readable messages.
    pub warnings: Vec<String>,
}

/// A problem recorded against a claim during validation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationIssue {
    /// Issue category.
    pub category: IssueCategory,

    /// Human-readable description.
    pub description: String,

    /// Severity (higher = worse). The checks in this file use 2 for most issues
    /// and 3 for a malformed subject.
    pub severity: u8,
}

/// Categories of validation issues.
///
/// Only some variants are produced by the per-claim checks in this file; the
/// others are noted below.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub enum IssueCategory {
    /// Missing or invalid source attribution.
    SourceAttribution,

    /// Claim lacks normative language (MUST, SHOULD, etc.).
    NormativeLanguage,

    /// Claim is too vague or generic.
    VagueContent,

    /// Claim conflicts with existing corpus.
    ///
    /// Not emitted by the per-claim checks in this file; conflicts inside a
    /// batch only lower the report's consistency score.
    Conflict,

    /// Subject path is malformed.
    MalformedSubject,

    /// Value is invalid or ambiguous.
    ///
    /// Not emitted by the per-claim checks in this file.
    InvalidValue,

    /// Description is missing or too short.
    InsufficientDescription,

    /// Duplicate of existing claim.
    ///
    /// Not emitted by the per-claim checks in this file.
    Duplicate,
}

/// Validator for researched claims.
///
/// Holds the thresholds applied to every claim. Build one with `Default`,
/// `QualityValidator::new`, `QualityValidator::strict` or
/// `QualityValidator::lenient`.
///
/// The type is `Debug` and `Clone` so a configured validator can be logged and
/// shared between callers.
#[derive(Debug, Clone)]
pub struct QualityValidator {
    /// Minimum confidence threshold for accepting claims.
    min_confidence: f32,

    /// Minimum description length.
    min_description_len: usize,

    /// Whether to allow claims without explicit normative language.
    /// When `true`, a missing keyword is reported as a warning instead of an issue.
    allow_implicit_normative: bool,
}

impl Default for QualityValidator {
    fn default() -> Self {
        Self { min_confidence: 0.7, min_description_len: 20, allow_implicit_normative: false }
    }
}

impl QualityValidator {
    /// Create a new validator with custom settings.
    pub fn new(min_confidence: f32) -> Self {
        Self { min_confidence, ..Default::default() }
    }

    /// Create a strict validator: confidence of at least 0.85, descriptions of
    /// at least 40, and explicit normative language required.
    pub fn strict() -> Self {
        Self {
            allow_implicit_normative: false,
            min_description_len: 40,
            min_confidence: 0.85,
        }
    }

    /// Create a lenient validator: confidence of at least 0.5, descriptions of
    /// at least 10, and implicit normative statements tolerated.
    pub fn lenient() -> Self {
        Self {
            allow_implicit_normative: true,
            min_description_len: 10,
            min_confidence: 0.5,
        }
    }

    /// Validate a batch of researched claims.
    pub fn validate(&self, claims: &[ResearchedClaim]) -> QualityReport {
        let mut claim_results = Vec::with_capacity(claims.len());
        let mut passed = 0;
        let mut failed = 0;
        let mut warnings = 0;

        let mut source_scores = Vec::new();
        let mut normative_scores = Vec::new();

        for claim in claims {
            let result = self.validate_claim(claim);

            if result.passed {
                passed += 1;
                if !result.warnings.is_empty() {
                    warnings += 1;
                }
            } else {
                failed += 1;
            }

            // Track component scores
            source_scores.push(self.score_source_attribution(claim));
            normative_scores.push(self.score_normative_language(&claim.description));

            claim_results.push(result);
        }

        let total = claims.len();
        let overall_score = if total > 0 { passed as f32 / total as f32 } else { 0.0 };

        let source_attribution_score = if source_scores.is_empty() {
            0.0
        } else {
            source_scores.iter().sum::<f32>() / source_scores.len() as f32
        };

        let normative_language_score = if normative_scores.is_empty() {
            0.0
        } else {
            normative_scores.iter().sum::<f32>() / normative_scores.len() as f32
        };

        // Consistency score: check for conflicting claims
        let consistency_score = self.score_consistency(claims);

        info!(
            total,
            passed,
            failed,
            warnings,
            overall_score,
            source_attribution_score,
            normative_language_score,
            consistency_score,
            "Quality validation complete"
        );

        QualityReport {
            overall_score,
            passed,
            failed,
            warnings,
            claim_results,
            source_attribution_score,
            normative_language_score,
            consistency_score,
        }
    }

    /// Validate a single claim.
    ///
    /// Runs the subject, source, description, normative-language and vagueness
    /// checks in that order. Each failed check records a `ValidationIssue` and
    /// scales the claim's confidence down; softer findings (a source that is
    /// not on the authoritative list, an implicit normative statement when
    /// those are allowed) are recorded as warnings.
    ///
    /// A claim passes only when no issue was recorded **and** its adjusted
    /// confidence is at least `min_confidence`. The reported confidence is
    /// clamped to 0.0..=1.0; a NaN confidence is reported as 0.0 and never
    /// passes.
    fn validate_claim(&self, claim: &ResearchedClaim) -> ClaimValidationResult {
        let mut issues = Vec::new();
        let mut validation_warnings = Vec::new();
        let mut confidence = claim.confidence;

        // Check subject path format
        if !self.is_valid_subject(&claim.subject) {
            issues.push(ValidationIssue {
                category: IssueCategory::MalformedSubject,
                description: format!("Subject path is malformed: {}", claim.subject),
                severity: 3,
            });
            confidence *= 0.5;
        }

        // Check source attribution
        if claim.source_url.is_empty() {
            issues.push(ValidationIssue {
                category: IssueCategory::SourceAttribution,
                description: "Missing source URL".to_string(),
                severity: 2,
            });
            confidence *= 0.7;
        } else if !self.is_authoritative_source(&claim.source_url) {
            validation_warnings
                .push(format!("Source may not be authoritative: {}", claim.source_url));
            confidence *= 0.9;
        }

        // Check description quality. Count characters rather than bytes so that
        // non-ASCII text is measured the way the message reports it ("chars").
        let description_chars = claim.description.chars().count();
        if description_chars < self.min_description_len {
            issues.push(ValidationIssue {
                category: IssueCategory::InsufficientDescription,
                description: format!(
                    "Description too short ({} chars, min {})",
                    description_chars, self.min_description_len
                ),
                severity: 2,
            });
            confidence *= 0.8;
        }

        // Check normative language
        let has_normative = self.has_normative_language(&claim.description);
        if !has_normative && !self.allow_implicit_normative {
            issues.push(ValidationIssue {
                category: IssueCategory::NormativeLanguage,
                description: "Description lacks normative language (MUST, SHOULD, etc.)"
                    .to_string(),
                severity: 2,
            });
            confidence *= 0.8;
        } else if !has_normative {
            validation_warnings.push("Implicit normative statement (no MUST/SHOULD)".to_string());
        }

        // Check for vague content
        if self.is_vague_content(&claim.description) {
            issues.push(ValidationIssue {
                category: IssueCategory::VagueContent,
                description: "Content is too vague or generic".to_string(),
                severity: 2,
            });
            confidence *= 0.7;
        }

        // Determine pass/fail. Both conditions must hold: with `||` a clean claim
        // with very low confidence was accepted, and a claim with recorded issues
        // could still pass on confidence alone. A NaN confidence compares false.
        let passed = issues.is_empty() && confidence >= self.min_confidence;

        if !passed {
            debug!(
                subject = %claim.subject,
                confidence,
                issues = issues.len(),
                "Claim failed validation"
            );
        }

        // `f32::min` ignores NaN, so `confidence.min(1.0)` reported NaN as 1.0.
        // Map NaN to 0.0 explicitly and clamp negatives as well.
        let reported_confidence =
            if confidence.is_nan() { 0.0 } else { confidence.clamp(0.0, 1.0) };

        ClaimValidationResult {
            subject: claim.subject.clone(),
            passed,
            confidence: reported_confidence,
            issues,
            warnings: validation_warnings,
        }
    }

    /// Check if a subject path is valid.
    ///
    /// A valid subject contains `://` and has at least two non-empty
    /// `/`-separated segments after its first `://`.
    fn is_valid_subject(&self, subject: &str) -> bool {
        match subject.split_once("://") {
            // No scheme separator at all.
            None => false,
            // Stop counting as soon as two non-empty segments have been seen.
            Some((_scheme, path)) => {
                path.split('/').filter(|segment| !segment.is_empty()).take(2).count() == 2
            }
        }
    }

    /// Check if a source URL is from an authoritative domain.
    ///
    /// The URL's host must equal a listed domain or be a subdomain of it. For
    /// entries written as `domain/path`, the URL path must additionally start
    /// with that path segment. The host comparison is case-insensitive.
    ///
    /// Matching is done on the parsed host rather than with a substring search,
    /// so a listed domain appearing in a query string
    /// (`https://evil.example/?ref=nist.gov`) or as the tail of another name
    /// (`notw3.org`) is not treated as authoritative.
    fn is_authoritative_source(&self, url: &str) -> bool {
        // NOTE(review): the `microsoft.com/docs` and `aws.amazon.com/docs`
        // entries look like they may miss the vendors' dedicated documentation
        // hosts; confirm whether those hosts should be listed too.
        let authoritative_domains = [
            "rfc-editor.org",
            "ietf.org",
            "owasp.org",
            "nist.gov",
            "w3.org",
            "postgresql.org",
            "redis.io",
            "docs.rs",
            "go.dev",
            "python.org",
            "rust-lang.org",
            "apache.org",
            "microsoft.com/docs",
            "aws.amazon.com/docs",
            "cloud.google.com/docs",
            "developer.mozilla.org",
        ];

        let is_delimiter = |c: char| matches!(c, '/' | '?' | '#');

        // Drop the scheme. A `://` that only appears after the first path/query
        // delimiter belongs to the path or query, not to a scheme.
        let without_scheme = match url.split_once("://") {
            Some((scheme, rest)) if !scheme.contains(is_delimiter) => rest,
            _ => url,
        };

        // The authority component ends at the first '/', '?' or '#'.
        let authority_len = without_scheme.find(is_delimiter).unwrap_or(without_scheme.len());
        let (authority, remainder) = without_scheme.split_at(authority_len);

        // Strip `user:password@` and `:port`, then normalise the host.
        let host_and_port = authority.rsplit_once('@').map_or(authority, |(_, host)| host);
        let host = host_and_port
            .split(':')
            .next()
            .unwrap_or(host_and_port)
            .trim_end_matches('.')
            .to_lowercase();

        // The path is whatever precedes the query string or fragment.
        let path_len =
            remainder.find(|c: char| matches!(c, '?' | '#')).unwrap_or(remainder.len());
        let path = &remainder[..path_len];

        authoritative_domains.iter().any(|entry| {
            let (domain, required_path) = match entry.split_once('/') {
                Some((domain, prefix)) => (domain, Some(prefix)),
                None => (*entry, None),
            };

            // Exact host, or a subdomain (`docs.python.org` for `python.org`).
            let host_matches = host == domain
                || host.strip_suffix(domain).map_or(false, |sub| sub.ends_with('.'));

            // The required path prefix must be a whole leading path segment.
            let path_matches = required_path.map_or(true, |prefix| {
                path.strip_prefix('/')
                    .and_then(|trimmed| trimmed.strip_prefix(prefix))
                    .map_or(false, |after| after.is_empty() || after.starts_with('/'))
            });

            host_matches && path_matches
        })
    }

    /// Check if text contains normative language.
    ///
    /// Looks for the keywords MUST, SHALL, SHOULD, REQUIRED, RECOMMENDED and
    /// MAY NOT, ignoring case. A keyword only counts as a whole word, so
    /// "shallow", "mustard" or "shoulder" are not mistaken for SHALL, MUST or
    /// SHOULD. Contractions such as "shouldn't" and "mustn't" still count.
    fn has_normative_language(&self, text: &str) -> bool {
        let upper = text.to_uppercase();
        let normative_keywords = ["MUST", "SHALL", "SHOULD", "REQUIRED", "RECOMMENDED", "MAY NOT"];

        normative_keywords.iter().any(|keyword| {
            upper.match_indices(*keyword).any(|(start, matched)| {
                let rest = &upper[start + matched.len()..];

                // The character before the match must not continue a word.
                let bounded_before =
                    upper[..start].chars().next_back().map_or(true, |c| !c.is_alphanumeric());

                // After the match: end of text, a non-word character, or an
                // "n't" contraction (straight or typographic apostrophe).
                let bounded_after = rest.starts_with("N'T")
                    || rest.starts_with("N\u{2019}T")
                    || rest.chars().next().map_or(true, |c| !c.is_alphanumeric());

                bounded_before && bounded_after
            })
        })
    }

    /// Check if content is too vague.
    ///
    /// Counts how many distinct hedging phrases occur in `text` (ignoring
    /// case). Three or more always make the text vague; one or two do so only
    /// when the text is shorter than 50 bytes.
    fn is_vague_content(&self, text: &str) -> bool {
        const HEDGING_PHRASES: [&str; 11] = [
            "should be configured",
            "it depends",
            "varies",
            "may or may not",
            "could be",
            "might be",
            "typically",
            "usually",
            "often",
            "sometimes",
            "in some cases",
        ];

        let lowered = text.to_lowercase();
        let hits = HEDGING_PHRASES.iter().filter(|phrase| lowered.contains(**phrase)).count();

        match hits {
            0 => false,
            // A short text with any hedging is treated as vague.
            1 | 2 => text.len() < 50,
            _ => true,
        }
    }

    /// Score source attribution (0.0 to 1.0).
    ///
    /// A claim without a source URL scores 0.0. Otherwise it starts at 0.5 and
    /// earns 0.3 for an authoritative source, 0.1 for a non-empty source
    /// section and 0.1 for an `https://` URL, capped at 1.0.
    fn score_source_attribution(&self, claim: &ResearchedClaim) -> f32 {
        let url = &claim.source_url;
        if url.is_empty() {
            return 0.0;
        }

        // (earned?, bonus) pairs, applied in this order on top of the base score.
        let bonuses = [
            (self.is_authoritative_source(url), 0.3),
            (!claim.source_section.is_empty(), 0.1),
            (url.starts_with("https://"), 0.1),
        ];

        let total = bonuses
            .iter()
            .filter(|(earned, _)| *earned)
            .fold(0.5_f32, |running, (_, bonus)| running + bonus);

        total.min(1.0)
    }

    /// Score normative language (0.0 to 1.0).
    ///
    /// Keywords are matched ignoring case and as whole words only, so "maybe"
    /// or "mustard" do not count as MAY or MUST. The first matching tier wins:
    ///
    /// | Keywords                    | Score |
    /// |-----------------------------|-------|
    /// | MUST, SHALL, REQUIRED       | 1.0   |
    /// | SHOULD, RECOMMENDED         | 0.8   |
    /// | MAY NOT                     | 0.7   |
    /// | MAY                         | 0.5   |
    /// | "best practice" (substring) | 0.4   |
    /// | none of the above           | 0.2   |
    fn score_normative_language(&self, text: &str) -> f32 {
        /// True when `keyword` occurs in `haystack` as a whole word. An "n't"
        /// contraction directly after the keyword is accepted as well.
        fn contains_keyword(haystack: &str, keyword: &str) -> bool {
            haystack.match_indices(keyword).any(|(start, matched)| {
                let rest = &haystack[start + matched.len()..];
                let bounded_before =
                    haystack[..start].chars().next_back().map_or(true, |c| !c.is_alphanumeric());
                let bounded_after = rest.starts_with("N'T")
                    || rest.starts_with("N\u{2019}T")
                    || rest.chars().next().map_or(true, |c| !c.is_alphanumeric());
                bounded_before && bounded_after
            })
        }

        // Strongest tier first; "MAY NOT" must be tested before plain "MAY".
        const TIERS: [(&[&str], f32); 4] = [
            (&["MUST", "SHALL", "REQUIRED"], 1.0),
            (&["SHOULD", "RECOMMENDED"], 0.8),
            (&["MAY NOT"], 0.7),
            (&["MAY"], 0.5),
        ];

        let upper = text.to_uppercase();
        for (keywords, score) in TIERS.iter() {
            if keywords.iter().any(|keyword| contains_keyword(&upper, keyword)) {
                return *score;
            }
        }

        // Implicit recommendations. The former lowercase "recommended" test is
        // gone: the case-insensitive RECOMMENDED tier above already returned
        // 0.8 for any text containing it, so that branch could never fire.
        if text.to_lowercase().contains("best practice") {
            return 0.4;
        }

        0.2
    }

    /// Score consistency among claims (0.0 to 1.0).
    ///
    /// Claims are grouped by subject and predicate. Within a group, every claim
    /// whose value differs from the group's first claim counts as one conflict
    /// and is logged. The score is `1 - conflicts / claims.len()`, floored at
    /// 0.0; batches with fewer than two claims always score 1.0.
    fn score_consistency(&self, claims: &[ResearchedClaim]) -> f32 {
        use std::collections::HashMap;

        if claims.len() < 2 {
            return 1.0;
        }

        // Group claims that talk about the same subject+predicate.
        let mut grouped: HashMap<String, Vec<&ResearchedClaim>> = HashMap::new();
        for claim in claims {
            grouped
                .entry(format!("{}::{}", claim.subject, claim.predicate))
                .or_default()
                .push(claim);
        }

        let mut conflicts = 0;
        for (key, group) in &grouped {
            // The first claim of each group is the reference value.
            if let Some((reference, others)) = group.split_first() {
                for other in others {
                    if other.value != reference.value {
                        warn!(key, "Conflicting claims detected");
                        conflicts += 1;
                    }
                }
            }
        }

        match conflicts {
            0 => 1.0,
            count => (1.0 - (count as f32 / claims.len() as f32)).max(0.0),
        }
    }

    /// Filter claims to only those that passed validation.
    pub fn filter_passed(&self, claims: Vec<ResearchedClaim>) -> Vec<ResearchedClaim> {
        let report = self.validate(&claims);

        claims
            .into_iter()
            .zip(report.claim_results.iter())
            .filter(|(_, result)| result.passed)
            .map(|(claim, _)| claim)
            .collect()
    }
}

// Unit tests live in the sibling file `quality_tests.rs` and are compiled only
// in test builds.
#[cfg(test)]
#[path = "quality_tests.rs"]
mod tests;