//! Data models for the similarity index. //! //! This module defines the core data structures used for near-duplicate detection: //! - [`MinHashSignature`]: A MinHash signature for an assertion's content //! - [`LshBucket`]: A bucket of similar assertions in LSH space //! - [`SimilarityIndexConfig`]: Configuration for MinHash/LSH parameters //! - [`SimilarityCheckResult`]: Result of a similarity check use rkyv::{Archive, Deserialize, Serialize}; use stemedb_core::types::Hash; /// Number of hash functions in the MinHash signature. /// 128 provides 95% confidence for 0.9 Jaccard threshold. pub const DEFAULT_MINHASH_K: usize = 128; /// Size of character n-grams (shingles) for MinHash. /// 3-grams are language-agnostic and work well for short strings. pub const DEFAULT_SHINGLE_SIZE: usize = 3; /// Number of LSH bands. /// 16 bands with 8 rows each = 128 total (matches MinHash k). pub const DEFAULT_LSH_BANDS: u8 = 16; /// Number of rows per LSH band. pub const DEFAULT_LSH_ROWS_PER_BAND: usize = 8; /// Default Bloom filter expected items. pub const DEFAULT_BLOOM_EXPECTED_ITEMS: usize = 1_000_000; /// Default Bloom filter false positive rate. pub const DEFAULT_BLOOM_FP_RATE: f64 = 0.01; /// MinHash signature for an assertion's subject+predicate content. /// /// Stored at `\x00MH:{content_hash_hex}` for persistence and Bloom filter rebuild. #[derive(Archive, Deserialize, Serialize, Debug, Clone, PartialEq)] #[archive(check_bytes)] pub struct MinHashSignature { /// BLAKE3 hash of the content (subject + predicate). pub content_hash: Hash, /// Original subject string (for debugging/auditing). pub subject: String, /// Original predicate string (for debugging/auditing). pub predicate: String, /// The MinHash signature: k hash values, one per hash function. /// Each u64 is the minimum hash value seen for that function. pub signature: Vec, /// Unix timestamp (nanoseconds) when this signature was created. pub created_at: u64, } impl MinHashSignature { /// Create a new MinHash signature. pub fn new( content_hash: Hash, subject: String, predicate: String, signature: Vec, created_at: u64, ) -> Self { Self { content_hash, subject, predicate, signature, created_at } } /// Compute the Jaccard similarity estimate between this signature and another. /// /// Returns a value in [0.0, 1.0] where 1.0 means identical and 0.0 means /// completely different. pub fn estimate_similarity(&self, other: &Self) -> f32 { if self.signature.len() != other.signature.len() { return 0.0; } if self.signature.is_empty() { return 0.0; } let matches = self.signature.iter().zip(other.signature.iter()).filter(|(a, b)| a == b).count(); matches as f32 / self.signature.len() as f32 } } /// An LSH bucket containing hashes of similar assertions. /// /// Stored at `\x00LSH:{band:02}:{bucket_hash_hex}`. #[derive(Archive, Deserialize, Serialize, Debug, Clone, PartialEq, Default)] #[archive(check_bytes)] pub struct LshBucket { /// Content hashes of assertions that hash to this bucket. pub members: Vec, } impl LshBucket { /// Create a new empty LSH bucket. pub fn new() -> Self { Self { members: Vec::new() } } /// Add a content hash to this bucket. pub fn add(&mut self, hash: Hash) { if !self.members.contains(&hash) { self.members.push(hash); } } /// Check if this bucket contains a given hash. pub fn contains(&self, hash: &Hash) -> bool { self.members.contains(hash) } /// Get the number of members in this bucket. pub fn len(&self) -> usize { self.members.len() } /// Check if this bucket is empty. pub fn is_empty(&self) -> bool { self.members.is_empty() } } /// Configuration for the similarity index. #[derive(Debug, Clone)] pub struct SimilarityIndexConfig { /// Number of hash functions for MinHash (default: 128). pub minhash_k: usize, /// Size of character n-grams for shingling (default: 3). pub shingle_size: usize, /// Number of LSH bands (default: 16). pub lsh_bands: u8, /// Number of rows per LSH band (default: 8). pub lsh_rows_per_band: usize, /// Bloom filter expected number of items (default: 1M). pub bloom_expected_items: usize, /// Bloom filter target false positive rate (default: 1%). pub bloom_fp_rate: f64, /// Jaccard similarity threshold for duplicate detection (default: 0.9). pub similarity_threshold: f32, } impl Default for SimilarityIndexConfig { fn default() -> Self { Self { minhash_k: DEFAULT_MINHASH_K, shingle_size: DEFAULT_SHINGLE_SIZE, lsh_bands: DEFAULT_LSH_BANDS, lsh_rows_per_band: DEFAULT_LSH_ROWS_PER_BAND, bloom_expected_items: DEFAULT_BLOOM_EXPECTED_ITEMS, bloom_fp_rate: DEFAULT_BLOOM_FP_RATE, similarity_threshold: 0.9, } } } impl SimilarityIndexConfig { /// Create a new config with custom similarity threshold. pub fn with_threshold(threshold: f32) -> Self { Self { similarity_threshold: threshold, ..Default::default() } } } /// Result of a similarity check against the index. #[derive(Debug, Clone)] pub struct SimilarityCheckResult { /// Whether a near-duplicate was found (similarity >= threshold). pub is_duplicate: bool, /// Content hashes of similar entries found. pub similar_entries: Vec, /// Maximum similarity found (0.0 if no similar entries). pub max_similarity: f32, } impl SimilarityCheckResult { /// Create a result indicating no duplicates found. pub fn no_duplicate() -> Self { Self { is_duplicate: false, similar_entries: Vec::new(), max_similarity: 0.0 } } /// Create a result indicating a duplicate was found. pub fn duplicate(similar_entries: Vec, max_similarity: f32) -> Self { Self { is_duplicate: true, similar_entries, max_similarity } } } #[cfg(test)] mod tests { use super::*; #[test] fn test_minhash_signature_similarity_identical() { let sig1 = MinHashSignature::new( [1u8; 32], "Tesla".to_string(), "revenue".to_string(), vec![100, 200, 300, 400], 1000, ); let sig2 = MinHashSignature::new( [2u8; 32], "Tesla".to_string(), "profit".to_string(), vec![100, 200, 300, 400], 1001, ); let similarity = sig1.estimate_similarity(&sig2); assert!((similarity - 1.0).abs() < f32::EPSILON); } #[test] fn test_minhash_signature_similarity_partial() { let sig1 = MinHashSignature::new( [1u8; 32], "Tesla".to_string(), "revenue".to_string(), vec![100, 200, 300, 400], 1000, ); let sig2 = MinHashSignature::new( [2u8; 32], "Apple".to_string(), "profit".to_string(), vec![100, 200, 999, 888], 1001, ); let similarity = sig1.estimate_similarity(&sig2); assert!((similarity - 0.5).abs() < f32::EPSILON); } #[test] fn test_minhash_signature_similarity_different_lengths() { let sig1 = MinHashSignature::new( [1u8; 32], "Tesla".to_string(), "revenue".to_string(), vec![100, 200, 300], 1000, ); let sig2 = MinHashSignature::new( [2u8; 32], "Apple".to_string(), "profit".to_string(), vec![100, 200], 1001, ); let similarity = sig1.estimate_similarity(&sig2); assert!((similarity - 0.0).abs() < f32::EPSILON); } #[test] fn test_lsh_bucket_operations() { let mut bucket = LshBucket::new(); assert!(bucket.is_empty()); assert_eq!(bucket.len(), 0); let hash1 = [1u8; 32]; let hash2 = [2u8; 32]; bucket.add(hash1); assert_eq!(bucket.len(), 1); assert!(bucket.contains(&hash1)); assert!(!bucket.contains(&hash2)); // Adding same hash again should not duplicate bucket.add(hash1); assert_eq!(bucket.len(), 1); bucket.add(hash2); assert_eq!(bucket.len(), 2); assert!(bucket.contains(&hash2)); } #[test] fn test_similarity_check_result() { let no_dup = SimilarityCheckResult::no_duplicate(); assert!(!no_dup.is_duplicate); assert!(no_dup.similar_entries.is_empty()); assert!((no_dup.max_similarity - 0.0).abs() < f32::EPSILON); let dup = SimilarityCheckResult::duplicate(vec![[1u8; 32]], 0.95); assert!(dup.is_duplicate); assert_eq!(dup.similar_entries.len(), 1); assert!((dup.max_similarity - 0.95).abs() < f32::EPSILON); } #[test] fn test_config_defaults() { let config = SimilarityIndexConfig::default(); assert_eq!(config.minhash_k, 128); assert_eq!(config.shingle_size, 3); assert_eq!(config.lsh_bands, 16); assert_eq!(config.lsh_rows_per_band, 8); assert_eq!(config.bloom_expected_items, 1_000_000); assert!((config.bloom_fp_rate - 0.01).abs() < f64::EPSILON); assert!((config.similarity_threshold - 0.9).abs() < f32::EPSILON); } }