//! Content defense layer for spam detection and quality control.
//!
//! This module provides the `ContentDefenseLayer` that coordinates:
//! - Bloom filter for fast duplicate detection
//! - MinHash + LSH for near-duplicate detection
//! - Quality scoring for spam and low-quality content detection
//! - Suspicious pattern detection (untrusted + high confidence)
//!
//! # Usage
//!
//! ```ignore
//! use stemedb_ingest::ContentDefenseLayer;
//!
//! let defense = ContentDefenseLayer::with_defaults(similarity_index, quarantine_store);
//!
//! // Check content before indexing
//! let (decision, quality) = defense
//!     .check(&assertion, &assertion_bytes, assertion_hash, trust_tier)
//!     .await?;
//! match decision {
//!     QuarantineDecision::Pass => {
//!         // index normally, then register it so later duplicates are caught
//!         defense.add_to_index(&assertion, timestamp).await?;
//!     }
//!     QuarantineDecision::Quarantine(reason) => {
//!         // `check` has already written the quarantine event; do not index
//!     }
//! }
//! ```

use std::sync::Arc;

use stemedb_core::types::{
    Assertion, ContentQuality, Hash, QuarantineDecision, QuarantineEvent, QuarantineReason,
    TrustTier,
};
use stemedb_storage::{
    ContentQualityScorer, QualityScoringConfig, QuarantineStore, Result as StorageResult,
    SimilarityIndex,
};
use tracing::{debug, info, instrument};

use crate::error::Result;

/// Configuration for the content defense layer.
///
/// Each `enable_*` flag switches one stage of [`ContentDefenseLayer::check`]
/// on or off; all three are on by default.
#[derive(Debug, Clone)]
pub struct ContentDefenseConfig {
    /// Enable near-duplicate detection via MinHash + LSH.
    ///
    /// When `false`, `check` skips the similarity lookup and `add_to_index`
    /// returns without touching the index.
    pub enable_duplicate_detection: bool,
    /// Enable quality scoring (entropy, length, structure).
    ///
    /// When `false`, `check` skips the threshold test; the quality score is
    /// still computed and returned to the caller.
    pub enable_quality_scoring: bool,
    /// Enable suspicious pattern detection (untrusted + high confidence).
    pub enable_pattern_detection: bool,
    /// Quality scoring configuration, handed to the `ContentQualityScorer`
    /// when the layer is constructed.
    pub quality_config: QualityScoringConfig,
}

impl Default for ContentDefenseConfig {
    /// All checks enabled, with the default `QualityScoringConfig`.
    fn default() -> Self {
        Self {
            enable_duplicate_detection: true,
            enable_quality_scoring: true,
            enable_pattern_detection: true,
            quality_config: QualityScoringConfig::default(),
        }
    }
}

/// Content defense layer that coordinates spam and quality checks.
///
/// This layer sits between signature verification and storage in the
/// ingestion pipeline. `check` scores the assertion and then runs the
/// enabled stages in this order, stopping at the first one that fails:
///
/// 1. **Pattern detection**: Suspicious agent behavior (untrusted agent
///    submitting with high confidence)
/// 2. **Quality scoring**: Entropy, length, structure checks against the
///    scorer's threshold
/// 3. **Duplicate detection**: Lookup in the similarity index (Bloom filter
///    for a fast "definitely not duplicate" check, MinHash + LSH for
///    near-duplicates)
///
/// If any check fails, the assertion is quarantined for admin review.
pub struct ContentDefenseLayer {
    // NOTE(review): `Arc` appears below (and in `new` / `with_defaults`) with no
    // type argument, `quarantine_with_similar` takes a bare `Option`, and two
    // methods return a bare `StorageResult`. The generic arguments look to have
    // been lost from this copy of the file — restore them from the canonical
    // source before building.
    /// Similarity index for duplicate detection.
    similarity_index: Arc,
    /// Quality scorer for content analysis.
    quality_scorer: ContentQualityScorer,
    /// Quarantine store for flagged assertions.
    quarantine_store: Arc,
    /// Configuration.
    config: ContentDefenseConfig,
}

impl ContentDefenseLayer {
    /// Create a new content defense layer.
    ///
    /// The quality scorer is built here from a clone of
    /// `config.quality_config`; the config itself is kept for the `enable_*`
    /// flags.
    pub fn new(
        similarity_index: Arc,
        quarantine_store: Arc,
        config: ContentDefenseConfig,
    ) -> Self {
        let quality_scorer = ContentQualityScorer::new(config.quality_config.clone());
        Self { similarity_index, quality_scorer, quarantine_store, config }
    }

    /// Create a new content defense layer with default configuration.
    ///
    /// Equivalent to calling [`Self::new`] with `ContentDefenseConfig::default()`.
    pub fn with_defaults(similarity_index: Arc, quarantine_store: Arc) -> Self {
        Self::new(similarity_index, quarantine_store, ContentDefenseConfig::default())
    }

    /// Get the configuration.
    pub fn config(&self) -> &ContentDefenseConfig {
        &self.config
    }

    /// Check an assertion against all defense mechanisms.
    ///
    /// Returns a decision on whether to pass or quarantine the assertion.
    /// The quality score is always computed, even when
    /// `enable_quality_scoring` is off; that flag only gates the threshold
    /// test. When the decision is `Quarantine`, the quarantine event has
    /// already been written to the quarantine store by the time this returns.
    ///
    /// # Arguments
    ///
    /// * `assertion` - The assertion to check
    /// * `assertion_bytes` - The serialized assertion (for quarantine storage)
    /// * `assertion_hash` - The content hash of the assertion
    /// * `trust_tier` - The submitting agent's trust tier
    ///
    /// # Returns
    ///
    /// - `Ok((QuarantineDecision::Pass, quality))` - Assertion passed all checks
    /// - `Ok((QuarantineDecision::Quarantine(reason), quality))` - Assertion should be quarantined
    ///
    /// # Errors
    ///
    /// Returns `IngestError::Storage` if the similarity lookup fails or if
    /// writing the quarantine event fails. In the latter case the caller gets
    /// an error rather than a `Quarantine` decision.
    #[instrument(skip(self, assertion, assertion_bytes), fields(
        subject = %assertion.subject,
        predicate = %assertion.predicate,
        trust_tier = ?trust_tier,
    ))]
    pub async fn check(
        &self,
        assertion: &Assertion,
        assertion_bytes: &[u8],
        assertion_hash: Hash,
        trust_tier: TrustTier,
    ) -> Result<(QuarantineDecision, ContentQuality)> {
        // 1. Quality scoring (fast, no I/O)
        // `mut` because the duplicate stage below sets `quality.duplicate`.
        let mut quality = self.quality_scorer.score(assertion, trust_tier);

        // 2. Check for suspicious pattern (untrusted + high confidence)
        if self.config.enable_pattern_detection
            && self.quality_scorer.is_suspicious_pattern(trust_tier, assertion.confidence)
        {
            debug!(
                confidence = assertion.confidence,
                "Suspicious pattern: untrusted agent with high confidence"
            );
            return self
                .quarantine(
                    assertion_hash,
                    assertion_bytes,
                    QuarantineReason::UntrustedHighConfidence,
                    quality,
                    assertion,
                )
                .await;
        }

        // 3. Check quality threshold
        if self.config.enable_quality_scoring && !self.quality_scorer.meets_threshold(&quality) {
            debug!(score = quality.score, entropy = quality.entropy, "Low quality score");
            return self
                .quarantine(
                    assertion_hash,
                    assertion_bytes,
                    QuarantineReason::LowQuality,
                    quality,
                    assertion,
                )
                .await;
        }

        // 4. Check for duplicates (requires I/O)
        // Only the subject and predicate are handed to the index here; the
        // object value is not part of this lookup.
        if self.config.enable_duplicate_detection {
            let result = self
                .similarity_index
                .check_similarity(&assertion.subject, &assertion.predicate)
                .await
                .map_err(crate::error::IngestError::Storage)?;

            if result.is_duplicate {
                quality.duplicate = true;
                debug!(
                    max_similarity = result.max_similarity,
                    similar_count = result.similar_entries.len(),
                    "Near-duplicate detected"
                );
                // Record the first similar entry (if any) on the quarantine event.
                return self
                    .quarantine_with_similar(
                        assertion_hash,
                        assertion_bytes,
                        QuarantineReason::Duplicate,
                        quality,
                        result.similar_entries.first().copied(),
                        assertion,
                    )
                    .await;
            }
        }

        debug!("Content defense: passed all checks");
        Ok((QuarantineDecision::Pass, quality))
    }

    /// Add an assertion to the similarity index after it passes all checks.
    ///
    /// Call this after successfully indexing an assertion so future duplicates
    /// can be detected. The assertion's subject and predicate are registered
    /// together with `timestamp`.
    ///
    /// This is a no-op returning `Ok(())` when `enable_duplicate_detection`
    /// is `false`.
    ///
    /// # Errors
    ///
    /// Returns `IngestError::Storage` if the index write fails.
    #[instrument(skip(self, assertion), fields(
        subject = %assertion.subject,
        predicate = %assertion.predicate,
    ))]
    pub async fn add_to_index(&self, assertion: &Assertion, timestamp: u64) -> Result<()> {
        if self.config.enable_duplicate_detection {
            self.similarity_index
                .add(&assertion.subject, &assertion.predicate, timestamp)
                .await
                .map_err(crate::error::IngestError::Storage)?;
        }
        Ok(())
    }

    /// Quarantine an assertion.
    ///
    /// Thin wrapper over [`Self::quarantine_with_similar`] with no similar
    /// entry attached.
    async fn quarantine(
        &self,
        hash: Hash,
        assertion_bytes: &[u8],
        reason: QuarantineReason,
        quality: ContentQuality,
        assertion: &Assertion,
    ) -> Result<(QuarantineDecision, ContentQuality)> {
        self.quarantine_with_similar(hash, assertion_bytes, reason, quality, None, assertion).await
    }

    /// Quarantine an assertion with a reference to a similar entry.
    ///
    /// Builds a `QuarantineEvent` from the hash, a copy of `assertion_bytes`,
    /// the reason, a clone of `quality` and the current wall-clock time, then
    /// writes it to the quarantine store and logs at `info` level.
    ///
    /// Returns `(QuarantineDecision::Quarantine(reason), quality)` on success.
    ///
    /// # Errors
    ///
    /// Returns `IngestError::Storage` if the quarantine write fails.
    async fn quarantine_with_similar(
        &self,
        hash: Hash,
        assertion_bytes: &[u8],
        reason: QuarantineReason,
        quality: ContentQuality,
        similar_to: Option,
        assertion: &Assertion,
    ) -> Result<(QuarantineDecision, ContentQuality)> {
        // Nanoseconds since the UNIX epoch. `as_nanos` yields a u128 and the
        // `as u64` cast keeps the low 64 bits (sufficient until roughly the
        // year 2554). If the system clock reads before the epoch the
        // timestamp silently falls back to 0.
        let timestamp = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_nanos() as u64)
            .unwrap_or(0);

        let mut event = QuarantineEvent::new(
            hash,
            assertion_bytes.to_vec(),
            reason,
            quality.clone(),
            timestamp,
        );

        if let Some(similar) = similar_to {
            event = event.with_similar_to(similar);
        }

        // Extract agent ID from first signature if available
        if let Some(sig) = assertion.signatures.first() {
            event = event.with_agent_id(sig.agent_id);
        }

        self.quarantine_store
            .write_quarantine(&event)
            .await
            .map_err(crate::error::IngestError::Storage)?;

        info!(
            hash = %hex::encode(hash),
            reason = ?reason,
            "Assertion quarantined"
        );

        Ok((QuarantineDecision::Quarantine(reason), quality))
    }

    /// Rebuild the similarity index Bloom filter from persisted data.
    ///
    /// Call this on startup to restore in-memory state. Delegates directly to
    /// the similarity index and returns its result unchanged.
    pub async fn rebuild_bloom_filter(&self) -> StorageResult {
        self.similarity_index.rebuild_bloom_filter().await
    }

    /// Get the number of pending quarantine events.
    ///
    /// Delegates directly to the quarantine store's `pending_count`.
    pub async fn pending_quarantine_count(&self) -> StorageResult {
        self.quarantine_store.pending_count().await
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use stemedb_core::testing::AssertionBuilder;
    use stemedb_core::types::{LifecycleStage, ObjectValue};
    use stemedb_storage::{GenericQuarantineStore, GenericSimilarityIndex, HybridStore};

    /// Build an assertion with the given subject and predicate, a fixed text
    /// object, confidence 0.5 and the `Proposed` lifecycle stage.
    fn create_test_assertion(subject: &str, predicate: &str) -> Assertion {
        AssertionBuilder::new()
            .subject(subject)
            .predicate(predicate)
            .object(ObjectValue::Text("test value for content defense".to_string()))
            .confidence(0.5)
            .lifecycle(LifecycleStage::Proposed)
            .build()
    }

    /// A well-formed assertion from a verified agent passes every stage.
    #[tokio::test]
    async fn test_pass_normal_assertion() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let similarity_index = Arc::new(GenericSimilarityIndex::with_defaults(Arc::clone(&store)));
        let quarantine_store = Arc::new(GenericQuarantineStore::new(Arc::clone(&store)));

        let defense = ContentDefenseLayer::with_defaults(similarity_index, quarantine_store);

        let assertion = create_test_assertion("Tesla_Inc", "has_revenue");
        let assertion_bytes = stemedb_core::serde::serialize(&assertion).expect("serialize");
        let hash = *blake3::hash(&assertion_bytes).as_bytes();

        let (decision, quality) = defense
            .check(&assertion, &assertion_bytes, hash, TrustTier::Verified)
            .await
            .expect("check");

        assert!(decision.is_pass(), "Normal assertion should pass");
        // NOTE(review): 0.4 presumably mirrors the default quality threshold —
        // confirm against `QualityScoringConfig::default()`.
        assert!(quality.score >= 0.4, "Quality score should be acceptable");
    }

    /// A two-character subject with a one-character predicate is rejected at
    /// the quality-threshold stage.
    #[tokio::test]
    async fn test_quarantine_short_subject() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let similarity_index = Arc::new(GenericSimilarityIndex::with_defaults(Arc::clone(&store)));
        let quarantine_store = Arc::new(GenericQuarantineStore::new(Arc::clone(&store)));

        let defense = ContentDefenseLayer::with_defaults(similarity_index, quarantine_store);

        let assertion = create_test_assertion("AB", "x");
        let assertion_bytes = stemedb_core::serde::serialize(&assertion).expect("serialize");
        let hash = *blake3::hash(&assertion_bytes).as_bytes();

        let (decision, _quality) = defense
            .check(&assertion, &assertion_bytes, hash, TrustTier::Verified)
            .await
            .expect("check");

        assert!(decision.is_quarantine(), "Short content should be quarantined");
        assert_eq!(decision.reason(), Some(QuarantineReason::LowQuality));
    }

    /// An untrusted agent submitting with confidence 0.95 is rejected at the
    /// pattern-detection stage, before the quality threshold is consulted.
    #[tokio::test]
    async fn test_quarantine_untrusted_high_confidence() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let similarity_index = Arc::new(GenericSimilarityIndex::with_defaults(Arc::clone(&store)));
        let quarantine_store = Arc::new(GenericQuarantineStore::new(Arc::clone(&store)));

        let defense = ContentDefenseLayer::with_defaults(similarity_index, quarantine_store);

        let mut assertion = create_test_assertion("Tesla_Inc", "has_revenue");
        assertion.confidence = 0.95;
        let assertion_bytes = stemedb_core::serde::serialize(&assertion).expect("serialize");
        let hash = *blake3::hash(&assertion_bytes).as_bytes();

        let (decision, _quality) = defense
            .check(&assertion, &assertion_bytes, hash, TrustTier::Untrusted)
            .await
            .expect("check");

        assert!(decision.is_quarantine(), "Untrusted + high confidence should be quarantined");
        assert_eq!(decision.reason(), Some(QuarantineReason::UntrustedHighConfidence));
    }

    /// Once an assertion has been added to the index, an identical one is
    /// quarantined as a duplicate and the returned quality carries the flag.
    #[tokio::test]
    async fn test_quarantine_duplicate() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let similarity_index = Arc::new(GenericSimilarityIndex::with_defaults(Arc::clone(&store)));
        let quarantine_store = Arc::new(GenericQuarantineStore::new(Arc::clone(&store)));

        let defense = ContentDefenseLayer::with_defaults(
            Arc::clone(&similarity_index),
            Arc::clone(&quarantine_store),
        );

        // First assertion - should pass
        let assertion1 = create_test_assertion("Tesla_Inc", "has_revenue");
        let assertion_bytes1 = stemedb_core::serde::serialize(&assertion1).expect("serialize");
        let hash1 = *blake3::hash(&assertion_bytes1).as_bytes();

        let (decision1, _) = defense
            .check(&assertion1, &assertion_bytes1, hash1, TrustTier::Verified)
            .await
            .expect("check");
        assert!(decision1.is_pass());

        // Add to index
        defense.add_to_index(&assertion1, 1000).await.expect("add_to_index");

        // Second assertion with identical content - should be quarantined as duplicate
        let assertion2 = create_test_assertion("Tesla_Inc", "has_revenue");
        let assertion_bytes2 = stemedb_core::serde::serialize(&assertion2).expect("serialize");
        let hash2 = *blake3::hash(&assertion_bytes2).as_bytes();

        let (decision2, quality2) = defense
            .check(&assertion2, &assertion_bytes2, hash2, TrustTier::Verified)
            .await
            .expect("check");

        assert!(decision2.is_quarantine(), "Duplicate should be quarantined");
        assert_eq!(decision2.reason(), Some(QuarantineReason::Duplicate));
        assert!(quality2.duplicate, "Quality should indicate duplicate");
    }

    /// With duplicate detection disabled, `check` does not consult the
    /// similarity index.
    #[tokio::test]
    async fn test_config_disable_duplicate_detection() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let similarity_index = Arc::new(GenericSimilarityIndex::with_defaults(Arc::clone(&store)));
        let quarantine_store = Arc::new(GenericQuarantineStore::new(Arc::clone(&store)));

        let config = ContentDefenseConfig { enable_duplicate_detection: false, ..Default::default() };
        let defense = ContentDefenseLayer::new(
            Arc::clone(&similarity_index),
            Arc::clone(&quarantine_store),
            config,
        );

        // Add first assertion
        // NOTE(review): `add_to_index` returns early when
        // `enable_duplicate_detection` is false, so this call leaves the index
        // empty. The test would therefore still pass if `check` ignored the
        // flag; seeding the index through `similarity_index` directly would
        // make the assertion below meaningful.
        let assertion1 = create_test_assertion("Tesla_Inc", "has_revenue");
        defense.add_to_index(&assertion1, 1000).await.expect("add_to_index");

        // Second identical assertion - should pass because duplicate detection is disabled
        let assertion2 = create_test_assertion("Tesla_Inc", "has_revenue");
        let assertion_bytes2 = stemedb_core::serde::serialize(&assertion2).expect("serialize");
        let hash2 = *blake3::hash(&assertion_bytes2).as_bytes();

        let (decision2, _) = defense
            .check(&assertion2, &assertion_bytes2, hash2, TrustTier::Verified)
            .await
            .expect("check");

        assert!(decision2.is_pass(), "Should pass when duplicate detection disabled");
    }
}