Content Defense (Phase 7):
- Add SimilarityIndex with MinHash/LSH for near-duplicate detection
- Add QuarantineStore for flagged assertions awaiting admin review
- Add CircuitBreakerStore for per-agent circuit breaker state
- Add ContentDefenseLayer for ingestion pipeline integration
- Add API endpoints for quarantine and circuit breaker management
- Add research module with gap detection and documentation fetching

Code Structure Improvements:
- Extract research CLI commands to research_commands.rs
- Extract API routers to routers.rs module
- Extract key_codec extraction functions to separate module
- Extract test modules to separate files across multiple crates
- All files now under the 500-line limit enforced by the pre-commit hook

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
315 lines
9.4 KiB
Rust
315 lines
9.4 KiB
Rust
//! Data models for the similarity index.
//!
//! This module defines the core data structures used for near-duplicate detection:
//! - [`MinHashSignature`]: A MinHash signature for an assertion's content
//! - [`LshBucket`]: A bucket of similar assertions in LSH space
//! - [`SimilarityIndexConfig`]: Configuration for MinHash/LSH parameters
//! - [`SimilarityCheckResult`]: Result of a similarity check
|
|
|
|
use rkyv::{Archive, Deserialize, Serialize};
|
|
use stemedb_core::types::Hash;
|
|
|
|
/// Number of hash functions in the MinHash signature.
///
/// 128 provides 95% confidence for 0.9 Jaccard threshold.
pub const DEFAULT_MINHASH_K: usize = 128;

/// Size of character n-grams (shingles) for MinHash.
///
/// 3-grams are language-agnostic and work well for short strings.
pub const DEFAULT_SHINGLE_SIZE: usize = 3;

/// Number of LSH bands.
///
/// 16 bands with 8 rows each = 128 total (matches MinHash k).
pub const DEFAULT_LSH_BANDS: u8 = 16;

/// Number of rows per LSH band.
pub const DEFAULT_LSH_ROWS_PER_BAND: usize = 8;

// Compile-time guard: the default banding must consume the default signature
// exactly (bands * rows == k). Editing one default without the others now
// fails the build instead of silently producing a mismatched layout.
// `as usize` is a lossless widening of the `u8` band count (`From` is not
// usable in a const context).
const _: () = assert!(
    DEFAULT_LSH_BANDS as usize * DEFAULT_LSH_ROWS_PER_BAND == DEFAULT_MINHASH_K,
    "DEFAULT_LSH_BANDS * DEFAULT_LSH_ROWS_PER_BAND must equal DEFAULT_MINHASH_K"
);

/// Default Bloom filter expected items.
pub const DEFAULT_BLOOM_EXPECTED_ITEMS: usize = 1_000_000;

/// Default Bloom filter false positive rate.
pub const DEFAULT_BLOOM_FP_RATE: f64 = 0.01;
|
|
|
|
/// MinHash signature for an assertion's subject+predicate content.
|
|
///
|
|
/// Stored at `\x00MH:{content_hash_hex}` for persistence and Bloom filter rebuild.
|
|
#[derive(Archive, Deserialize, Serialize, Debug, Clone, PartialEq)]
|
|
#[archive(check_bytes)]
|
|
pub struct MinHashSignature {
|
|
/// BLAKE3 hash of the content (subject + predicate).
|
|
pub content_hash: Hash,
|
|
|
|
/// Original subject string (for debugging/auditing).
|
|
pub subject: String,
|
|
|
|
/// Original predicate string (for debugging/auditing).
|
|
pub predicate: String,
|
|
|
|
/// The MinHash signature: k hash values, one per hash function.
|
|
/// Each u64 is the minimum hash value seen for that function.
|
|
pub signature: Vec<u64>,
|
|
|
|
/// Unix timestamp (nanoseconds) when this signature was created.
|
|
pub created_at: u64,
|
|
}
|
|
|
|
impl MinHashSignature {
|
|
/// Create a new MinHash signature.
|
|
pub fn new(
|
|
content_hash: Hash,
|
|
subject: String,
|
|
predicate: String,
|
|
signature: Vec<u64>,
|
|
created_at: u64,
|
|
) -> Self {
|
|
Self { content_hash, subject, predicate, signature, created_at }
|
|
}
|
|
|
|
/// Compute the Jaccard similarity estimate between this signature and another.
|
|
///
|
|
/// Returns a value in [0.0, 1.0] where 1.0 means identical and 0.0 means
|
|
/// completely different.
|
|
pub fn estimate_similarity(&self, other: &Self) -> f32 {
|
|
if self.signature.len() != other.signature.len() {
|
|
return 0.0;
|
|
}
|
|
if self.signature.is_empty() {
|
|
return 0.0;
|
|
}
|
|
|
|
let matches =
|
|
self.signature.iter().zip(other.signature.iter()).filter(|(a, b)| a == b).count();
|
|
|
|
matches as f32 / self.signature.len() as f32
|
|
}
|
|
}
|
|
|
|
/// An LSH bucket containing hashes of similar assertions.
|
|
///
|
|
/// Stored at `\x00LSH:{band:02}:{bucket_hash_hex}`.
|
|
#[derive(Archive, Deserialize, Serialize, Debug, Clone, PartialEq, Default)]
|
|
#[archive(check_bytes)]
|
|
pub struct LshBucket {
|
|
/// Content hashes of assertions that hash to this bucket.
|
|
pub members: Vec<Hash>,
|
|
}
|
|
|
|
impl LshBucket {
|
|
/// Create a new empty LSH bucket.
|
|
pub fn new() -> Self {
|
|
Self { members: Vec::new() }
|
|
}
|
|
|
|
/// Add a content hash to this bucket.
|
|
pub fn add(&mut self, hash: Hash) {
|
|
if !self.members.contains(&hash) {
|
|
self.members.push(hash);
|
|
}
|
|
}
|
|
|
|
/// Check if this bucket contains a given hash.
|
|
pub fn contains(&self, hash: &Hash) -> bool {
|
|
self.members.contains(hash)
|
|
}
|
|
|
|
/// Get the number of members in this bucket.
|
|
pub fn len(&self) -> usize {
|
|
self.members.len()
|
|
}
|
|
|
|
/// Check if this bucket is empty.
|
|
pub fn is_empty(&self) -> bool {
|
|
self.members.is_empty()
|
|
}
|
|
}
|
|
|
|
/// Configuration for the similarity index.
///
/// Groups the MinHash, LSH and Bloom filter parameters together with the
/// similarity threshold used for duplicate detection. Derives `PartialEq`
/// (like the other data types in this module) so two configurations can be
/// compared directly.
#[derive(Debug, Clone, PartialEq)]
pub struct SimilarityIndexConfig {
    /// Number of hash functions for MinHash (default: 128).
    pub minhash_k: usize,

    /// Size of character n-grams for shingling (default: 3).
    pub shingle_size: usize,

    /// Number of LSH bands (default: 16).
    pub lsh_bands: u8,

    /// Number of rows per LSH band (default: 8).
    pub lsh_rows_per_band: usize,

    /// Bloom filter expected number of items (default: 1M).
    pub bloom_expected_items: usize,

    /// Bloom filter target false positive rate (default: 1%).
    pub bloom_fp_rate: f64,

    /// Jaccard similarity threshold for duplicate detection (default: 0.9).
    pub similarity_threshold: f32,
}
|
|
|
|
impl Default for SimilarityIndexConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
minhash_k: DEFAULT_MINHASH_K,
|
|
shingle_size: DEFAULT_SHINGLE_SIZE,
|
|
lsh_bands: DEFAULT_LSH_BANDS,
|
|
lsh_rows_per_band: DEFAULT_LSH_ROWS_PER_BAND,
|
|
bloom_expected_items: DEFAULT_BLOOM_EXPECTED_ITEMS,
|
|
bloom_fp_rate: DEFAULT_BLOOM_FP_RATE,
|
|
similarity_threshold: 0.9,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl SimilarityIndexConfig {
|
|
/// Create a new config with custom similarity threshold.
|
|
pub fn with_threshold(threshold: f32) -> Self {
|
|
Self { similarity_threshold: threshold, ..Default::default() }
|
|
}
|
|
}
|
|
|
|
/// Result of a similarity check against the index.
|
|
#[derive(Debug, Clone)]
|
|
pub struct SimilarityCheckResult {
|
|
/// Whether a near-duplicate was found (similarity >= threshold).
|
|
pub is_duplicate: bool,
|
|
|
|
/// Content hashes of similar entries found.
|
|
pub similar_entries: Vec<Hash>,
|
|
|
|
/// Maximum similarity found (0.0 if no similar entries).
|
|
pub max_similarity: f32,
|
|
}
|
|
|
|
impl SimilarityCheckResult {
|
|
/// Create a result indicating no duplicates found.
|
|
pub fn no_duplicate() -> Self {
|
|
Self { is_duplicate: false, similar_entries: Vec::new(), max_similarity: 0.0 }
|
|
}
|
|
|
|
/// Create a result indicating a duplicate was found.
|
|
pub fn duplicate(similar_entries: Vec<Hash>, max_similarity: f32) -> Self {
|
|
Self { is_duplicate: true, similar_entries, max_similarity }
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a signature fixture whose content hash is `fill` repeated 32 times.
    fn fixture(
        fill: u8,
        subject: &str,
        predicate: &str,
        values: &[u64],
        created_at: u64,
    ) -> MinHashSignature {
        MinHashSignature::new(
            [fill; 32],
            subject.to_string(),
            predicate.to_string(),
            values.to_vec(),
            created_at,
        )
    }

    /// `true` when `actual` is within `f32::EPSILON` of `expected`.
    fn close_to(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < f32::EPSILON
    }

    #[test]
    fn test_minhash_signature_similarity_identical() {
        // Same signature values, different metadata: every slot agrees.
        let left = fixture(1, "Tesla", "revenue", &[100, 200, 300, 400], 1000);
        let right = fixture(2, "Tesla", "profit", &[100, 200, 300, 400], 1001);

        assert!(close_to(left.estimate_similarity(&right), 1.0));
    }

    #[test]
    fn test_minhash_signature_similarity_partial() {
        // Two of the four slots agree.
        let left = fixture(1, "Tesla", "revenue", &[100, 200, 300, 400], 1000);
        let right = fixture(2, "Apple", "profit", &[100, 200, 999, 888], 1001);

        assert!(close_to(left.estimate_similarity(&right), 0.5));
    }

    #[test]
    fn test_minhash_signature_similarity_different_lengths() {
        // Signatures of different lengths are reported as completely dissimilar.
        let left = fixture(1, "Tesla", "revenue", &[100, 200, 300], 1000);
        let right = fixture(2, "Apple", "profit", &[100, 200], 1001);

        assert!(close_to(left.estimate_similarity(&right), 0.0));
    }

    #[test]
    fn test_lsh_bucket_operations() {
        let first = [1u8; 32];
        let second = [2u8; 32];

        let mut bucket = LshBucket::new();
        assert!(bucket.is_empty());
        assert_eq!(bucket.len(), 0);

        bucket.add(first);
        assert_eq!(bucket.len(), 1);
        assert!(bucket.contains(&first));
        assert!(!bucket.contains(&second));

        // Adding same hash again should not duplicate
        bucket.add(first);
        assert_eq!(bucket.len(), 1);

        bucket.add(second);
        assert_eq!(bucket.len(), 2);
        assert!(bucket.contains(&second));
    }

    #[test]
    fn test_similarity_check_result() {
        let miss = SimilarityCheckResult::no_duplicate();
        assert!(!miss.is_duplicate);
        assert!(miss.similar_entries.is_empty());
        assert!(close_to(miss.max_similarity, 0.0));

        let hit = SimilarityCheckResult::duplicate(vec![[1u8; 32]], 0.95);
        assert!(hit.is_duplicate);
        assert_eq!(hit.similar_entries.len(), 1);
        assert!(close_to(hit.max_similarity, 0.95));
    }

    #[test]
    fn test_config_defaults() {
        let defaults = SimilarityIndexConfig::default();

        assert_eq!(defaults.minhash_k, 128);
        assert_eq!(defaults.shingle_size, 3);
        assert_eq!(defaults.lsh_bands, 16);
        assert_eq!(defaults.lsh_rows_per_band, 8);
        assert_eq!(defaults.bloom_expected_items, 1_000_000);
        assert!((defaults.bloom_fp_rate - 0.01).abs() < f64::EPSILON);
        assert!(close_to(defaults.similarity_threshold, 0.9));
    }
}
|