stemedb/crates/stemedb-storage/src/similarity_index/model.rs

//! Data models for the similarity index.
//!
//! This module defines the core data structures used for near-duplicate detection:
//! - [`MinHashSignature`]: A MinHash signature for an assertion's content
//! - [`LshBucket`]: A bucket of similar assertions in LSH space
//! - [`SimilarityIndexConfig`]: Configuration for MinHash/LSH parameters
//! - [`SimilarityCheckResult`]: Result of a similarity check

use rkyv::{Archive, Deserialize, Serialize};
use stemedb_core::types::Hash;

/// Number of hash functions in the MinHash signature.
///
/// 128 provides 95% confidence for 0.9 Jaccard threshold.
/// Used as the default for [`SimilarityIndexConfig::minhash_k`].
pub const DEFAULT_MINHASH_K: usize = 128;

/// Size of character n-grams (shingles) for MinHash.
///
/// 3-grams are language-agnostic and work well for short strings.
/// Used as the default for [`SimilarityIndexConfig::shingle_size`].
pub const DEFAULT_SHINGLE_SIZE: usize = 3;

/// Number of LSH bands.
///
/// 16 bands with 8 rows each = 128 total (matches MinHash k), i.e.
/// `DEFAULT_LSH_BANDS * DEFAULT_LSH_ROWS_PER_BAND == DEFAULT_MINHASH_K`.
/// Used as the default for [`SimilarityIndexConfig::lsh_bands`].
pub const DEFAULT_LSH_BANDS: u8 = 16;

/// Number of rows per LSH band.
///
/// Used as the default for [`SimilarityIndexConfig::lsh_rows_per_band`].
pub const DEFAULT_LSH_ROWS_PER_BAND: usize = 8;

/// Default number of items the Bloom filter is expected to hold.
///
/// Used as the default for [`SimilarityIndexConfig::bloom_expected_items`].
pub const DEFAULT_BLOOM_EXPECTED_ITEMS: usize = 1_000_000;

/// Default Bloom filter false positive rate (a fraction: `0.01` is 1%).
///
/// Used as the default for [`SimilarityIndexConfig::bloom_fp_rate`].
pub const DEFAULT_BLOOM_FP_RATE: f64 = 0.01;

/// MinHash signature for an assertion's subject+predicate content.
///
/// Stored at `\x00MH:{content_hash_hex}` for persistence and Bloom filter rebuild.
///
/// Two signatures are compared position by position with
/// [`MinHashSignature::estimate_similarity`]; that comparison yields `0.0`
/// whenever the two `signature` vectors differ in length or are empty.
#[derive(Archive, Deserialize, Serialize, Debug, Clone, PartialEq)]
#[archive(check_bytes)]
pub struct MinHashSignature {
    /// BLAKE3 hash of the content (subject + predicate).
    pub content_hash: Hash,

    /// Original subject string (for debugging/auditing).
    pub subject: String,

    /// Original predicate string (for debugging/auditing).
    pub predicate: String,

    /// The MinHash signature: k hash values, one per hash function.
    /// Each u64 is the minimum hash value seen for that function.
    ///
    /// NOTE(review): the length is presumably `SimilarityIndexConfig::minhash_k`,
    /// but nothing in this type enforces it — confirm against the code that
    /// builds signatures.
    pub signature: Vec<u64>,

    /// Unix timestamp (nanoseconds) when this signature was created.
    pub created_at: u64,
}

impl MinHashSignature {
    /// Create a new MinHash signature.
    pub fn new(
        content_hash: Hash,
        subject: String,
        predicate: String,
        signature: Vec<u64>,
        created_at: u64,
    ) -> Self {
        Self { content_hash, subject, predicate, signature, created_at }
    }

    /// Compute the Jaccard similarity estimate between this signature and another.
    ///
    /// Returns a value in [0.0, 1.0] where 1.0 means identical and 0.0 means
    /// completely different.
    pub fn estimate_similarity(&self, other: &Self) -> f32 {
        if self.signature.len() != other.signature.len() {
            return 0.0;
        }
        if self.signature.is_empty() {
            return 0.0;
        }

        let matches =
            self.signature.iter().zip(other.signature.iter()).filter(|(a, b)| a == b).count();

        matches as f32 / self.signature.len() as f32
    }
}

/// An LSH bucket containing hashes of similar assertions.
///
/// Stored at `\x00LSH:{band:02}:{bucket_hash_hex}`.
///
/// The derived `Default` produces an empty bucket, the same value as
/// [`LshBucket::new`].
#[derive(Archive, Deserialize, Serialize, Debug, Clone, PartialEq, Default)]
#[archive(check_bytes)]
pub struct LshBucket {
    /// Content hashes of assertions that hash to this bucket.
    ///
    /// [`LshBucket::add`] never inserts a hash that is already present, but
    /// the field is public, so pushing into it directly bypasses that check.
    pub members: Vec<Hash>,
}

impl LshBucket {
    /// Build a bucket with no members.
    pub fn new() -> Self {
        Self::default()
    }

    /// Insert `hash` unless the bucket already holds it.
    ///
    /// Membership is checked with a linear scan, so repeated insertion of the
    /// same hash leaves the bucket unchanged.
    pub fn add(&mut self, hash: Hash) {
        if self.contains(&hash) {
            return;
        }
        self.members.push(hash);
    }

    /// Report whether `hash` is one of this bucket's members.
    pub fn contains(&self, hash: &Hash) -> bool {
        self.members.contains(hash)
    }

    /// Number of content hashes currently stored in the bucket.
    pub fn len(&self) -> usize {
        self.members.len()
    }

    /// `true` when the bucket holds no members at all.
    pub fn is_empty(&self) -> bool {
        self.members.is_empty()
    }
}

/// Configuration for the similarity index.
///
/// `Default::default()` fills every field from the `DEFAULT_*` constants of
/// this module, with a similarity threshold of `0.9`. In those defaults
/// `lsh_bands * lsh_rows_per_band` (16 * 8) equals `minhash_k` (128); nothing
/// in this type checks that relationship for custom values.
#[derive(Debug, Clone)]
pub struct SimilarityIndexConfig {
    /// Number of hash functions for MinHash (default: 128).
    pub minhash_k: usize,

    /// Size of character n-grams for shingling (default: 3).
    pub shingle_size: usize,

    /// Number of LSH bands (default: 16).
    pub lsh_bands: u8,

    /// Number of rows per LSH band (default: 8).
    pub lsh_rows_per_band: usize,

    /// Bloom filter expected number of items (default: 1M).
    pub bloom_expected_items: usize,

    /// Bloom filter target false positive rate (default: 1%).
    ///
    /// Expressed as a fraction, so 1% is `0.01`.
    pub bloom_fp_rate: f64,

    /// Jaccard similarity threshold for duplicate detection (default: 0.9).
    ///
    /// The value is stored as given; this type does not restrict it to
    /// `[0.0, 1.0]`.
    pub similarity_threshold: f32,
}

impl Default for SimilarityIndexConfig {
    fn default() -> Self {
        Self {
            minhash_k: DEFAULT_MINHASH_K,
            shingle_size: DEFAULT_SHINGLE_SIZE,
            lsh_bands: DEFAULT_LSH_BANDS,
            lsh_rows_per_band: DEFAULT_LSH_ROWS_PER_BAND,
            bloom_expected_items: DEFAULT_BLOOM_EXPECTED_ITEMS,
            bloom_fp_rate: DEFAULT_BLOOM_FP_RATE,
            similarity_threshold: 0.9,
        }
    }
}

impl SimilarityIndexConfig {
    /// Create a new config with custom similarity threshold.
    pub fn with_threshold(threshold: f32) -> Self {
        Self { similarity_threshold: threshold, ..Default::default() }
    }
}

/// Result of a similarity check against the index.
///
/// Values are normally built through [`SimilarityCheckResult::no_duplicate`]
/// or [`SimilarityCheckResult::duplicate`]. Neither constructor compares
/// `max_similarity` with a threshold; the caller decides which one applies.
#[derive(Debug, Clone)]
pub struct SimilarityCheckResult {
    /// Whether a near-duplicate was found (similarity >= threshold).
    pub is_duplicate: bool,

    /// Content hashes of similar entries found.
    ///
    /// Empty in the value returned by `no_duplicate()`.
    pub similar_entries: Vec<Hash>,

    /// Maximum similarity found (0.0 if no similar entries).
    pub max_similarity: f32,
}

impl SimilarityCheckResult {
    /// Create a result indicating no duplicates found.
    pub fn no_duplicate() -> Self {
        Self { is_duplicate: false, similar_entries: Vec::new(), max_similarity: 0.0 }
    }

    /// Create a result indicating a duplicate was found.
    pub fn duplicate(similar_entries: Vec<Hash>, max_similarity: f32) -> Self {
        Self { is_duplicate: true, similar_entries, max_similarity }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a signature whose content hash is `tag` repeated 32 times.
    fn signature_of(
        tag: u8,
        subject: &str,
        predicate: &str,
        values: &[u64],
        created_at: u64,
    ) -> MinHashSignature {
        MinHashSignature::new(
            [tag; 32],
            subject.to_string(),
            predicate.to_string(),
            values.to_vec(),
            created_at,
        )
    }

    /// `true` when two `f32` values differ by less than `f32::EPSILON`.
    fn nearly_equal(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < f32::EPSILON
    }

    #[test]
    fn test_minhash_signature_similarity_identical() {
        // Same MinHash values, different metadata: the estimate only looks at the values.
        let left = signature_of(1, "Tesla", "revenue", &[100, 200, 300, 400], 1000);
        let right = signature_of(2, "Tesla", "profit", &[100, 200, 300, 400], 1001);

        assert!(nearly_equal(left.estimate_similarity(&right), 1.0));
    }

    #[test]
    fn test_minhash_signature_similarity_partial() {
        // Two of four positions agree.
        let left = signature_of(1, "Tesla", "revenue", &[100, 200, 300, 400], 1000);
        let right = signature_of(2, "Apple", "profit", &[100, 200, 999, 888], 1001);

        assert!(nearly_equal(left.estimate_similarity(&right), 0.5));
    }

    #[test]
    fn test_minhash_signature_similarity_different_lengths() {
        // Signatures of different widths are reported as completely dissimilar.
        let left = signature_of(1, "Tesla", "revenue", &[100, 200, 300], 1000);
        let right = signature_of(2, "Apple", "profit", &[100, 200], 1001);

        assert!(nearly_equal(left.estimate_similarity(&right), 0.0));
    }

    #[test]
    fn test_lsh_bucket_operations() {
        let first = [1u8; 32];
        let second = [2u8; 32];

        let mut bucket = LshBucket::new();
        assert!(bucket.is_empty());
        assert_eq!(bucket.len(), 0);

        bucket.add(first);
        assert_eq!(bucket.len(), 1);
        assert!(bucket.contains(&first));
        assert!(!bucket.contains(&second));

        // Re-adding a known hash must not grow the bucket.
        bucket.add(first);
        assert_eq!(bucket.len(), 1);

        bucket.add(second);
        assert_eq!(bucket.len(), 2);
        assert!(bucket.contains(&second));
    }

    #[test]
    fn test_similarity_check_result() {
        let negative = SimilarityCheckResult::no_duplicate();
        assert!(!negative.is_duplicate);
        assert!(negative.similar_entries.is_empty());
        assert!(nearly_equal(negative.max_similarity, 0.0));

        let positive = SimilarityCheckResult::duplicate(vec![[1u8; 32]], 0.95);
        assert!(positive.is_duplicate);
        assert_eq!(positive.similar_entries.len(), 1);
        assert!(nearly_equal(positive.max_similarity, 0.95));
    }

    #[test]
    fn test_config_defaults() {
        let defaults = SimilarityIndexConfig::default();

        assert_eq!(defaults.minhash_k, 128);
        assert_eq!(defaults.shingle_size, 3);
        assert_eq!(defaults.lsh_bands, 16);
        assert_eq!(defaults.lsh_rows_per_band, 8);
        assert_eq!(defaults.bloom_expected_items, 1_000_000);
        assert!((defaults.bloom_fp_rate - 0.01).abs() < f64::EPSILON);
        assert!(nearly_equal(defaults.similarity_threshold, 0.9));
    }
}