stemedb/crates/stemedb-storage/src/similarity_index/model.rs
jordan a734be3a0d feat: Phase 7 Content Defense + code structure refactoring
Content Defense (Phase 7):
- Add SimilarityIndex with MinHash/LSH for near-duplicate detection
- Add QuarantineStore for flagged assertions awaiting admin review
- Add CircuitBreakerStore for per-agent circuit breaker state
- Add ContentDefenseLayer for ingestion pipeline integration
- Add API endpoints for quarantine and circuit breaker management
- Add research module with gap detection and documentation fetching

Code Structure Improvements:
- Extract research CLI commands to research_commands.rs
- Extract API routers to routers.rs module
- Extract key_codec extraction functions to separate module
- Extract test modules to separate files across multiple crates
- All files now under 500 line limit per pre-commit hook

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 12:44:05 -07:00

315 lines
9.4 KiB
Rust

//! Data models for the similarity index.
//!
//! This module defines the core data structures used for near-duplicate detection:
//! - [`MinHashSignature`]: A MinHash signature for an assertion's content
//! - [`LshBucket`]: A bucket of similar assertions in LSH space
//! - [`SimilarityIndexConfig`]: Configuration for MinHash/LSH parameters
//! - [`SimilarityCheckResult`]: Result of a similarity check
use rkyv::{Archive, Deserialize, Serialize};
use stemedb_core::types::Hash;
/// Number of hash functions in the MinHash signature.
///
/// The standard error of a MinHash Jaccard estimate is `sqrt(J * (1 - J) / k)`.
/// With `k = 128` that is about 0.027 near `J = 0.9`, i.e. roughly ±0.05 at
/// 95% confidence.
pub const DEFAULT_MINHASH_K: usize = 128;

/// Size of character n-grams (shingles) for MinHash.
///
/// 3-grams are language-agnostic and work well for short strings.
pub const DEFAULT_SHINGLE_SIZE: usize = 3;

/// Number of LSH bands.
///
/// The defaults are chosen so that bands × rows per band covers the whole
/// signature: 16 bands with 8 rows each = 128 = [`DEFAULT_MINHASH_K`].
pub const DEFAULT_LSH_BANDS: u8 = 16;

/// Number of rows per LSH band.
pub const DEFAULT_LSH_ROWS_PER_BAND: usize = 8;

/// Default Bloom filter expected items.
pub const DEFAULT_BLOOM_EXPECTED_ITEMS: usize = 1_000_000;

/// Default Bloom filter false positive rate.
pub const DEFAULT_BLOOM_FP_RATE: f64 = 0.01;

// Compile-time guard: the documented relationship between the three defaults
// must hold, so editing one constant without the others fails the build
// instead of silently producing a mismatched banding scheme.
// (`as usize` is a lossless widening from `u8`; `usize::from` is not usable in
// a const context.)
const _: () = assert!(
    DEFAULT_LSH_BANDS as usize * DEFAULT_LSH_ROWS_PER_BAND == DEFAULT_MINHASH_K,
    "DEFAULT_LSH_BANDS * DEFAULT_LSH_ROWS_PER_BAND must equal DEFAULT_MINHASH_K"
);
/// MinHash signature for an assertion's subject+predicate content.
///
/// Stored at `\x00MH:{content_hash_hex}` for persistence and Bloom filter rebuild.
///
/// Two signatures are compared slot by slot (see
/// [`MinHashSignature::estimate_similarity`]), so they are only meaningfully
/// comparable when both hold the same number of hash values.
///
/// NOTE(review): this type derives the rkyv archive traits; changing its fields
/// presumably changes the persisted representation — confirm before editing.
#[derive(Archive, Deserialize, Serialize, Debug, Clone, PartialEq)]
#[archive(check_bytes)]
pub struct MinHashSignature {
/// BLAKE3 hash of the content (subject + predicate).
pub content_hash: Hash,
/// Original subject string (for debugging/auditing).
pub subject: String,
/// Original predicate string (for debugging/auditing).
pub predicate: String,
/// The MinHash signature: k hash values, one per hash function.
/// Each u64 is the minimum hash value seen for that function.
///
/// The length is not validated by this type; it is stored exactly as given.
pub signature: Vec<u64>,
/// Unix timestamp (nanoseconds) when this signature was created.
pub created_at: u64,
}
impl MinHashSignature {
/// Create a new MinHash signature.
pub fn new(
content_hash: Hash,
subject: String,
predicate: String,
signature: Vec<u64>,
created_at: u64,
) -> Self {
Self { content_hash, subject, predicate, signature, created_at }
}
/// Compute the Jaccard similarity estimate between this signature and another.
///
/// Returns a value in [0.0, 1.0] where 1.0 means identical and 0.0 means
/// completely different.
pub fn estimate_similarity(&self, other: &Self) -> f32 {
if self.signature.len() != other.signature.len() {
return 0.0;
}
if self.signature.is_empty() {
return 0.0;
}
let matches =
self.signature.iter().zip(other.signature.iter()).filter(|(a, b)| a == b).count();
matches as f32 / self.signature.len() as f32
}
}
/// An LSH bucket containing hashes of similar assertions.
///
/// Stored at `\x00LSH:{band:02}:{bucket_hash_hex}`.
///
/// The derived `Default` yields a bucket with no members.
#[derive(Archive, Deserialize, Serialize, Debug, Clone, PartialEq, Default)]
#[archive(check_bytes)]
pub struct LshBucket {
/// Content hashes of assertions that hash to this bucket.
///
/// [`LshBucket::add`] skips hashes that are already present, but the field is
/// public, so pushing to it directly bypasses that check.
pub members: Vec<Hash>,
}
impl LshBucket {
/// Create a new empty LSH bucket.
pub fn new() -> Self {
Self { members: Vec::new() }
}
/// Add a content hash to this bucket.
pub fn add(&mut self, hash: Hash) {
if !self.members.contains(&hash) {
self.members.push(hash);
}
}
/// Check if this bucket contains a given hash.
pub fn contains(&self, hash: &Hash) -> bool {
self.members.contains(hash)
}
/// Get the number of members in this bucket.
pub fn len(&self) -> usize {
self.members.len()
}
/// Check if this bucket is empty.
pub fn is_empty(&self) -> bool {
self.members.is_empty()
}
}
/// Configuration for the similarity index.
///
/// All fields are public and this type performs no validation of them; the
/// `Default` implementation fills each field with the default noted below.
#[derive(Debug, Clone)]
pub struct SimilarityIndexConfig {
/// Number of hash functions for MinHash (default: 128).
pub minhash_k: usize,
/// Size of character n-grams for shingling (default: 3).
pub shingle_size: usize,
/// Number of LSH bands (default: 16).
///
/// With the defaults, `lsh_bands * lsh_rows_per_band` equals `minhash_k`
/// (16 × 8 = 128).
pub lsh_bands: u8,
/// Number of rows per LSH band (default: 8).
pub lsh_rows_per_band: usize,
/// Bloom filter expected number of items (default: 1M).
pub bloom_expected_items: usize,
/// Bloom filter target false positive rate (default: 1%).
pub bloom_fp_rate: f64,
/// Jaccard similarity threshold for duplicate detection (default: 0.9).
///
/// NOTE(review): presumably expected to lie in [0.0, 1.0], matching the range
/// of the similarity estimate — nothing here enforces that; confirm at the
/// call sites.
pub similarity_threshold: f32,
}
/// Default Jaccard similarity threshold for duplicate detection.
///
/// Named like the other `DEFAULT_*` values so the default lives in one place
/// instead of as a bare literal inside `Default::default`.
pub const DEFAULT_SIMILARITY_THRESHOLD: f32 = 0.9;

impl Default for SimilarityIndexConfig {
    /// Build a configuration in which every field takes its module-level
    /// `DEFAULT_*` constant.
    fn default() -> Self {
        Self {
            minhash_k: DEFAULT_MINHASH_K,
            shingle_size: DEFAULT_SHINGLE_SIZE,
            lsh_bands: DEFAULT_LSH_BANDS,
            lsh_rows_per_band: DEFAULT_LSH_ROWS_PER_BAND,
            bloom_expected_items: DEFAULT_BLOOM_EXPECTED_ITEMS,
            bloom_fp_rate: DEFAULT_BLOOM_FP_RATE,
            similarity_threshold: DEFAULT_SIMILARITY_THRESHOLD,
        }
    }
}
impl SimilarityIndexConfig {
/// Create a new config with custom similarity threshold.
pub fn with_threshold(threshold: f32) -> Self {
Self { similarity_threshold: threshold, ..Default::default() }
}
}
/// Result of a similarity check against the index.
///
/// The fields are public and independent: nothing in this type ties
/// `is_duplicate` to the contents of `similar_entries` or to `max_similarity`.
#[derive(Debug, Clone)]
pub struct SimilarityCheckResult {
/// Whether a near-duplicate was found (similarity >= threshold).
pub is_duplicate: bool,
/// Content hashes of similar entries found.
pub similar_entries: Vec<Hash>,
/// Maximum similarity found (0.0 if no similar entries).
pub max_similarity: f32,
}
impl SimilarityCheckResult {
/// Create a result indicating no duplicates found.
pub fn no_duplicate() -> Self {
Self { is_duplicate: false, similar_entries: Vec::new(), max_similarity: 0.0 }
}
/// Create a result indicating a duplicate was found.
pub fn duplicate(similar_entries: Vec<Hash>, max_similarity: f32) -> Self {
Self { is_duplicate: true, similar_entries, max_similarity }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a signature whose content hash is `tag` repeated 32 times.
    fn signature(
        tag: u8,
        subject: &str,
        predicate: &str,
        values: &[u64],
        created_at: u64,
    ) -> MinHashSignature {
        MinHashSignature::new(
            [tag; 32],
            subject.to_string(),
            predicate.to_string(),
            values.to_vec(),
            created_at,
        )
    }

    /// Assert that two `f32` values differ by less than `f32::EPSILON`.
    fn assert_close(actual: f32, expected: f32) {
        assert!((actual - expected).abs() < f32::EPSILON);
    }

    #[test]
    fn test_minhash_signature_similarity_identical() {
        // Metadata differs on purpose: only the hash values feed the estimate.
        let left = signature(1, "Tesla", "revenue", &[100, 200, 300, 400], 1000);
        let right = signature(2, "Tesla", "profit", &[100, 200, 300, 400], 1001);

        assert_close(left.estimate_similarity(&right), 1.0);
    }

    #[test]
    fn test_minhash_signature_similarity_partial() {
        // Two of four slots agree.
        let left = signature(1, "Tesla", "revenue", &[100, 200, 300, 400], 1000);
        let right = signature(2, "Apple", "profit", &[100, 200, 999, 888], 1001);

        assert_close(left.estimate_similarity(&right), 0.5);
    }

    #[test]
    fn test_minhash_signature_similarity_different_lengths() {
        let left = signature(1, "Tesla", "revenue", &[100, 200, 300], 1000);
        let right = signature(2, "Apple", "profit", &[100, 200], 1001);

        assert_close(left.estimate_similarity(&right), 0.0);
    }

    #[test]
    fn test_lsh_bucket_operations() {
        let first = [1u8; 32];
        let second = [2u8; 32];

        let mut bucket = LshBucket::new();
        assert!(bucket.is_empty());
        assert_eq!(bucket.len(), 0);

        bucket.add(first);
        assert_eq!(bucket.len(), 1);
        assert!(bucket.contains(&first));
        assert!(!bucket.contains(&second));

        // Re-adding a known hash must leave the bucket unchanged.
        bucket.add(first);
        assert_eq!(bucket.len(), 1);

        bucket.add(second);
        assert_eq!(bucket.len(), 2);
        assert!(bucket.contains(&second));
    }

    #[test]
    fn test_similarity_check_result() {
        let clean = SimilarityCheckResult::no_duplicate();
        assert!(!clean.is_duplicate);
        assert!(clean.similar_entries.is_empty());
        assert_close(clean.max_similarity, 0.0);

        let flagged = SimilarityCheckResult::duplicate(vec![[1u8; 32]], 0.95);
        assert!(flagged.is_duplicate);
        assert_eq!(flagged.similar_entries.len(), 1);
        assert_close(flagged.max_similarity, 0.95);
    }

    #[test]
    fn test_config_defaults() {
        let defaults = SimilarityIndexConfig::default();

        assert_eq!(defaults.minhash_k, 128);
        assert_eq!(defaults.shingle_size, 3);
        assert_eq!(defaults.lsh_bands, 16);
        assert_eq!(defaults.lsh_rows_per_band, 8);
        assert_eq!(defaults.bloom_expected_items, 1_000_000);
        assert!((defaults.bloom_fp_rate - 0.01).abs() < f64::EPSILON);
        assert_close(defaults.similarity_threshold, 0.9);
    }
}