stemedb/crates/stemedb-ontology/src/validator.rs
jordan 8f6506b70a feat: Aphoria scan modes + stemedb-ontology crate + consumer health UAT
Major additions:
- Staged scanning modes (working tree, staged, committed) with git integration
- Drift detection for baseline vs current state comparisons
- Hosted API handlers for policy CRUD operations via StemeDB API
- stemedb-ontology crate with domain definitions and medical extractors
- Consumer health vertical UAT scenarios (GLP-1, gastroparesis, etc.)
- Aphoria development skill documentation

Code organization:
- Split large files into focused modules to stay under 500-line limit
- Extracted config tests, episteme helpers/drift/aliases, API helpers

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 21:57:33 -07:00

333 lines
11 KiB
Rust

//! Claim validation against domain schemas.
//!
//! Validates that claims conform to the domain ontology before ingestion.
use std::collections::HashMap;
use thiserror::Error;
use crate::domain::{Domain, PredicateSchema};
/// Reasons a claim can be rejected when checked against a domain schema.
///
/// The `Display` text of every variant comes from its `#[error(...)]`
/// attribute.
#[derive(Debug, Error)]
pub enum ValidationError {
    /// No schema in the domain lists this predicate.
    ///
    /// Fields are `(predicate, domain name)`, in that order.
    #[error("Unknown predicate: '{0}' not in domain '{1}'")]
    UnknownPredicate(String, String),

    /// The subject does not have the shape required by the predicate's schema.
    #[error("Subject '{subject}' doesn't match pattern '{pattern}' for predicate '{predicate}'")]
    SubjectMismatch {
        /// Subject string as it was supplied.
        subject: String,
        /// Subject pattern declared by the schema.
        pattern: String,
        /// Predicate whose schema was applied.
        predicate: String,
    },

    /// One part of the subject is empty, so a required entity is absent.
    #[error("Subject missing required entity '{entity}' for predicate '{predicate}'")]
    MissingEntity {
        /// Name of the entity that should have been present.
        entity: String,
        /// Predicate whose schema was applied.
        predicate: String,
    },

    /// The confidence score lies outside the closed interval `[0.0, 1.0]`.
    #[error("Confidence {0} out of range [0.0, 1.0]")]
    ConfidenceOutOfRange(f32),

    /// The object value has a different type than the one expected.
    #[error("Object type mismatch: expected {expected}, got {actual}")]
    ObjectTypeMismatch {
        /// Type that was expected.
        expected: String,
        /// Type that was actually received.
        actual: String,
    },

    /// Several validation failures reported together.
    ///
    /// Each element is one message; they are joined with `"; "` when the
    /// error is displayed.
    #[error("Multiple validation errors: {}", .0.join("; "))]
    Multiple(Vec<String>),
}
/// Checks claims against the predicate schemas of a borrowed [`Domain`].
///
/// Construct with [`Validator::new`]; opt into rejecting unknown predicates
/// with [`Validator::strict`].
#[derive(Debug)]
pub struct Validator<'a> {
    /// Domain whose predicate schemas are consulted during validation.
    domain: &'a Domain,
    /// When `true`, a predicate with no schema is an error instead of a warning.
    strict_mode: bool,
}
impl<'a> Validator<'a> {
/// Create a new validator for the given domain.
pub fn new(domain: &'a Domain) -> Self {
Self { domain, strict_mode: false }
}
/// Enable strict mode (unknown predicates are errors instead of warnings).
pub fn strict(mut self) -> Self {
self.strict_mode = true;
self
}
/// Validate a claim's predicate and subject against the domain.
///
/// # Arguments
///
/// * `predicate` - The predicate name
/// * `subject` - The subject string
/// * `confidence` - The confidence score (0.0 to 1.0)
///
/// # Returns
///
/// Ok if valid, or a ValidationError describing what's wrong.
pub fn validate(
&self,
predicate: &str,
subject: &str,
confidence: f32,
) -> Result<(), ValidationError> {
// Validate confidence first
if !(0.0..=1.0).contains(&confidence) {
return Err(ValidationError::ConfidenceOutOfRange(confidence));
}
// Find the schema for this predicate
let schema = match self.domain.schema_for_predicate(predicate) {
Some(s) => s,
None if self.strict_mode => {
return Err(ValidationError::UnknownPredicate(
predicate.to_string(),
self.domain.name.clone(),
));
}
None => {
// Non-strict: warn but allow
tracing::warn!(
predicate = predicate,
domain = self.domain.name,
"Unknown predicate, skipping subject validation"
);
return Ok(());
}
};
// Validate subject matches pattern
self.validate_subject(subject, schema, predicate)
}
/// Validate just the subject against a schema.
fn validate_subject(
&self,
subject: &str,
schema: &PredicateSchema,
predicate: &str,
) -> Result<(), ValidationError> {
// Count separators in subject
let subject_parts: Vec<&str> = subject.split(':').collect();
let expected_parts = schema.required_entities.len();
if subject_parts.len() != expected_parts {
return Err(ValidationError::SubjectMismatch {
subject: subject.to_string(),
pattern: schema.subject_pattern.clone(),
predicate: predicate.to_string(),
});
}
// Check for empty parts
for (i, part) in subject_parts.iter().enumerate() {
if part.is_empty() {
return Err(ValidationError::MissingEntity {
entity: schema
.required_entities
.get(i)
.cloned()
.unwrap_or_else(|| format!("part_{}", i)),
predicate: predicate.to_string(),
});
}
}
Ok(())
}
/// Validate a batch of claims.
///
/// Returns a map of claim index to validation error.
pub fn validate_batch(
&self,
claims: &[(String, String, f32)], // (predicate, subject, confidence)
) -> HashMap<usize, ValidationError> {
let mut errors = HashMap::new();
for (i, (predicate, subject, confidence)) in claims.iter().enumerate() {
if let Err(e) = self.validate(predicate, subject, *confidence) {
errors.insert(i, e);
}
}
errors
}
/// Check if a predicate is known in the domain.
pub fn is_known_predicate(&self, predicate: &str) -> bool {
self.domain.schema_for_predicate(predicate).is_some()
}
/// Get the expected subject pattern for a predicate.
pub fn expected_pattern(&self, predicate: &str) -> Option<&str> {
self.domain.schema_for_predicate(predicate).map(|s| s.subject_pattern.as_str())
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{Domain, EntityType, PredicateSchema};

    /// Small pharma domain with a two-entity schema (`efficacy`) and a
    /// one-entity schema (`safety`).
    fn test_domain() -> Domain {
        Domain::new("Pharma", "Test pharmaceutical domain")
            .with_entity_type("Drug", EntityType::required("A pharmaceutical compound"))
            .with_entity_type("Indication", EntityType::required("A medical condition"))
            .with_predicate_schema(
                "efficacy",
                PredicateSchema::new("Efficacy predicates", "{Drug}:{Indication}")
                    .with_predicates(vec!["hba1c_reduction", "weight_loss"]),
            )
            .with_predicate_schema(
                "safety",
                PredicateSchema::new("Safety predicates", "{Drug}")
                    .with_predicates(vec!["has_boxed_warning", "adverse_event_rate"]),
            )
    }

    #[test]
    fn test_valid_efficacy_claim() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        let result = validator.validate("hba1c_reduction", "Semaglutide:Type2Diabetes", 0.95);
        assert!(result.is_ok());
    }

    #[test]
    fn test_valid_safety_claim() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        let result = validator.validate("has_boxed_warning", "Semaglutide", 0.99);
        assert!(result.is_ok());
    }

    #[test]
    fn test_subject_mismatch_too_few_parts() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        // Efficacy requires Drug:Indication, but we only provided Drug
        let result = validator.validate("hba1c_reduction", "Semaglutide", 0.95);
        assert!(matches!(result, Err(ValidationError::SubjectMismatch { .. })));
    }

    #[test]
    fn test_subject_mismatch_too_many_parts() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        // Safety requires just Drug, but we provided Drug:Indication
        let result = validator.validate("has_boxed_warning", "Semaglutide:T2D", 0.95);
        // Pin the variant so the test cannot pass because of an unrelated error.
        assert!(matches!(result, Err(ValidationError::SubjectMismatch { .. })));
    }

    #[test]
    fn test_confidence_out_of_range_high() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        let result = validator.validate("has_boxed_warning", "Semaglutide", 1.5);
        assert!(matches!(result, Err(ValidationError::ConfidenceOutOfRange(_))));
    }

    #[test]
    fn test_confidence_out_of_range_negative() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        let result = validator.validate("has_boxed_warning", "Semaglutide", -0.1);
        // Pin the variant so the test cannot pass because of an unrelated error.
        assert!(matches!(result, Err(ValidationError::ConfidenceOutOfRange(_))));
    }

    #[test]
    fn test_confidence_nan_rejected() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        // NaN is not contained in 0.0..=1.0 and must be rejected.
        let result = validator.validate("has_boxed_warning", "Semaglutide", f32::NAN);
        assert!(matches!(result, Err(ValidationError::ConfidenceOutOfRange(_))));
    }

    #[test]
    fn test_confidence_bounds_are_inclusive() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        assert!(validator.validate("has_boxed_warning", "Semaglutide", 0.0).is_ok());
        assert!(validator.validate("has_boxed_warning", "Semaglutide", 1.0).is_ok());
    }

    #[test]
    fn test_unknown_predicate_strict() {
        let domain = test_domain();
        let validator = Validator::new(&domain).strict();
        let result = validator.validate("unknown_predicate", "Semaglutide", 0.5);
        assert!(matches!(result, Err(ValidationError::UnknownPredicate(_, _))));
    }

    #[test]
    fn test_known_predicate_strict() {
        let domain = test_domain();
        let validator = Validator::new(&domain).strict();
        // Strict mode only changes the handling of unknown predicates.
        let result = validator.validate("hba1c_reduction", "Semaglutide:Type2Diabetes", 0.95);
        assert!(result.is_ok());
    }

    #[test]
    fn test_unknown_predicate_nonstrict() {
        let domain = test_domain();
        let validator = Validator::new(&domain); // non-strict
        // Should pass even with unknown predicate
        let result = validator.validate("unknown_predicate", "Semaglutide", 0.5);
        assert!(result.is_ok());
    }

    #[test]
    fn test_unknown_predicate_nonstrict_still_checks_confidence() {
        let domain = test_domain();
        let validator = Validator::new(&domain); // non-strict
        // The confidence check runs before the predicate lookup.
        let result = validator.validate("unknown_predicate", "Semaglutide", 2.0);
        assert!(matches!(result, Err(ValidationError::ConfidenceOutOfRange(_))));
    }

    #[test]
    fn test_empty_subject_part() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        // Empty indication part
        let result = validator.validate("hba1c_reduction", "Semaglutide:", 0.95);
        assert!(matches!(result, Err(ValidationError::MissingEntity { .. })));
    }

    #[test]
    fn test_empty_leading_subject_part() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        // Empty drug part
        let result = validator.validate("hba1c_reduction", ":Type2Diabetes", 0.95);
        assert!(matches!(result, Err(ValidationError::MissingEntity { .. })));
    }

    #[test]
    fn test_validate_batch() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        let claims = vec![
            ("hba1c_reduction".to_string(), "Semaglutide:T2D".to_string(), 0.95),
            ("has_boxed_warning".to_string(), "Semaglutide".to_string(), 0.99),
            ("hba1c_reduction".to_string(), "BadSubject".to_string(), 0.5), // Will fail
            ("has_boxed_warning".to_string(), "Drug".to_string(), 1.5), // Confidence will fail
        ];
        let errors = validator.validate_batch(&claims);
        assert_eq!(errors.len(), 2); // Claims 2 and 3 should fail
        // Each failing index must carry the error that matches its own defect.
        assert!(matches!(errors.get(&2), Some(ValidationError::SubjectMismatch { .. })));
        assert!(matches!(errors.get(&3), Some(ValidationError::ConfidenceOutOfRange(_))));
    }

    #[test]
    fn test_validate_batch_empty() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        let errors = validator.validate_batch(&[]);
        assert!(errors.is_empty());
    }

    #[test]
    fn test_is_known_predicate() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        assert!(validator.is_known_predicate("hba1c_reduction"));
        assert!(validator.is_known_predicate("has_boxed_warning"));
        assert!(!validator.is_known_predicate("unknown"));
    }

    #[test]
    fn test_expected_pattern() {
        let domain = test_domain();
        let validator = Validator::new(&domain);
        assert_eq!(validator.expected_pattern("hba1c_reduction"), Some("{Drug}:{Indication}"));
        assert_eq!(validator.expected_pattern("has_boxed_warning"), Some("{Drug}"));
        assert_eq!(validator.expected_pattern("unknown"), None);
    }
}