stemedb/applications/aphoria/src/learning/normalizer.rs

//! Pattern normalization for learned patterns.
//!
//! Converts code snippets into normalized patterns by replacing
//! literal values with typed placeholders. This enables deduplication
//! and similarity matching across different instances of the same pattern.

use once_cell::sync::Lazy;
use regex::Regex;

/// Try to build a [`Regex`] from `pattern`, yielding `None` if it does not compile.
///
/// The compilation error is discarded rather than unwrapped, so this function
/// never panics. The patterns in this file are fixed string literals that are
/// expected to compile; the `Option` exists so that callers can skip a
/// normalization step instead of aborting if one ever does not.
fn compile_regex(pattern: &str) -> Option<Regex> {
    let compiled = Regex::new(pattern);
    // `ok()` drops the `regex::Error` and keeps only success/failure.
    compiled.ok()
}

// Match quoted version-like strings: "1.0", "TLSv1.2", "SSLv3", etc.
//
// The double-quoted and single-quoted forms are written as two separate
// alternatives because the `regex` crate has no backreferences. This keeps a
// literal from being "closed" by the other kind of quote (e.g. `'1.0"`).
static VERSION_RE: Lazy<Option<Regex>> = Lazy::new(|| {
    compile_regex(r#""(?:TLS|SSL)?v?\d+(?:\.\d+)*"|'(?:TLS|SSL)?v?\d+(?:\.\d+)*'"#)
});

// Match boolean literals as whole words, in lower, Title and UPPER case.
static BOOL_RE: Lazy<Option<Regex>> = Lazy::new(|| {
    let pattern = r"\b(true|false|True|False|TRUE|FALSE)\b";
    compile_regex(pattern)
});

// Match an integer or decimal number that directly follows `:` or `=`
// (optionally separated by whitespace), as is common in configs.
//
// Capture group 1 holds the separator plus any whitespace; group 2 holds the
// number itself.
//
// LIMITATION: because a preceding `:` or `=` is required, this does not match:
// - array elements such as `[1, 2, 3]`
// - bare numbers passed as function arguments
// - numbers in any other syntactic position
//
// That restriction is deliberate: it avoids false positives on line numbers,
// indices, and other numeric literals that are not configuration values.
static NUM_RE: Lazy<Option<Regex>> = Lazy::new(|| {
    let pattern = r"([:=]\s*)(\d+(?:\.\d+)?)\b";
    compile_regex(pattern)
});

// Match a complete quoted string literal: "..." or '...'.
//
// Each quote style is its own alternative so the opening and closing quotes
// must be the same character. This lets a double-quoted string contain an
// apostrophe (`"don't"`) and a single-quoted string contain a double quote
// without the match ending early. Escaped quotes are not handled.
static STRING_RE: Lazy<Option<Regex>> = Lazy::new(|| compile_regex(r#""[^"]*"|'[^']*'"#));

/// Normalize a code pattern by replacing literals with typed placeholders.
///
/// # Placeholder Types
/// - `<string>` - Generic string value
/// - `<string:version>` - Version-like string (e.g., "1.0", "TLSv1.2")
/// - `<number>` - Numeric value
/// - `<boolean>` - true/false
///
/// # Examples
///
/// ```ignore
/// normalize_pattern("const TLS_MIN = \"1.0\"")
/// // => "const TLS_MIN = <string:version>"
///
/// normalize_pattern("pool_size: 25")
/// // => "pool_size: <number>"
///
/// normalize_pattern("verify_ssl = false")
/// // => "verify_ssl = <boolean>"
/// ```
pub fn normalize_pattern(code: &str) -> String {
    let mut result = code.to_string();

    // Order matters: more specific patterns first

    // 1. Version-like strings (1.0, 1.2, TLSv1.2, SSLv3, etc.)
    if let Some(re) = VERSION_RE.as_ref() {
        result = re.replace_all(&result, "<string:version>").to_string();
    }

    // 2. Boolean literals
    if let Some(re) = BOOL_RE.as_ref() {
        result = re.replace_all(&result, "<boolean>").to_string();
    }

    // 3. Numeric literals after : or = (common in configs)
    if let Some(re) = NUM_RE.as_ref() {
        result = re.replace_all(&result, "$1<number>").to_string();
    }

    // 4. Remaining quoted strings (that weren't versions)
    if let Some(re) = STRING_RE.as_ref() {
        result = re.replace_all(&result, "<string>").to_string();
    }

    result
}

/// Calculate similarity score between two normalized patterns.
///
/// The score is `1.0 - distance / max_len`, where `distance` is the
/// Levenshtein edit distance between `a` and `b` and `max_len` is the length
/// of the longer input. Both quantities are measured in `char`s (Unicode
/// scalar values) so that the ratio stays consistent for non-ASCII text;
/// dividing a char-based distance by a byte length would overstate similarity.
///
/// Returns a value between 0.0 (completely different) and 1.0 (identical).
/// Two empty strings are identical and score 1.0.
///
/// # Threshold
///
/// Patterns with similarity >= 0.8 are typically considered duplicates.
pub fn pattern_similarity(a: &str, b: &str) -> f32 {
    if a == b {
        return 1.0;
    }

    // `a != b` at this point, so at least one side is non-empty and `max_len >= 1`.
    let max_len = a.chars().count().max(b.chars().count());
    let distance = levenshtein_distance(a, b);

    1.0 - (distance as f32 / max_len as f32)
}

/// Compute the Levenshtein edit distance between two strings.
///
/// The distance is the minimum number of single-`char` insertions, deletions
/// and substitutions needed to turn `a` into `b`. Comparison is per `char`,
/// not per byte.
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let a_chars: Vec<char> = a.chars().collect();
    let b_chars: Vec<char> = b.chars().collect();

    // Against an empty string the distance is just the other string's length.
    if a_chars.is_empty() {
        return b_chars.len();
    }
    if b_chars.is_empty() {
        return a_chars.len();
    }

    // Only two rows of the DP table are kept alive at a time:
    // `prev_row[j]` is the distance between the first `i` chars of `a` and the
    // first `j` chars of `b`; `curr_row` is being filled in for `i + 1`.
    let mut prev_row: Vec<usize> = (0..=b_chars.len()).collect();
    let mut curr_row: Vec<usize> = vec![0; b_chars.len() + 1];

    for (i, &a_ch) in a_chars.iter().enumerate() {
        // Turning `i + 1` chars into the empty prefix takes `i + 1` deletions.
        curr_row[0] = i + 1;

        for (j, &b_ch) in b_chars.iter().enumerate() {
            let cost = usize::from(a_ch != b_ch);

            curr_row[j + 1] = (prev_row[j + 1] + 1) // deletion
                .min(curr_row[j] + 1) // insertion
                .min(prev_row[j] + cost); // substitution
        }

        std::mem::swap(&mut prev_row, &mut curr_row);
    }

    // After the final swap the completed row lives in `prev_row`.
    prev_row[b_chars.len()]
}

/// Decide whether two patterns are close enough to count as duplicates.
///
/// Computes the similarity of `a` and `b` once and returns it as
/// `Some(similarity)` when it is at least `threshold`; otherwise returns
/// `None`. Returning the score lets a caller that needs both the yes/no answer
/// and the value avoid a second similarity computation.
pub fn are_patterns_similar(a: &str, b: &str, threshold: f32) -> Option<f32> {
    Some(pattern_similarity(a, b)).filter(|score| *score >= threshold)
}

// Unit tests for normalization, edit distance, and similarity scoring.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_version_string() {
        assert_eq!(
            normalize_pattern(r#"const TLS_MIN = "1.0""#),
            "const TLS_MIN = <string:version>"
        );
        assert_eq!(normalize_pattern(r#"min_version: "TLSv1.2""#), "min_version: <string:version>");
        assert_eq!(normalize_pattern(r#"ssl_version = "SSLv3""#), "ssl_version = <string:version>");
    }

    #[test]
    fn test_normalize_boolean() {
        assert_eq!(normalize_pattern("verify_ssl = false"), "verify_ssl = <boolean>");
        assert_eq!(normalize_pattern("enabled: true"), "enabled: <boolean>");
        assert_eq!(normalize_pattern("DEBUG = True"), "DEBUG = <boolean>");
        assert_eq!(normalize_pattern("SKIP_AUTH = FALSE"), "SKIP_AUTH = <boolean>");
    }

    #[test]
    fn test_normalize_number() {
        assert_eq!(normalize_pattern("pool_size: 25"), "pool_size: <number>");
        assert_eq!(normalize_pattern("timeout = 30.5"), "timeout = <number>");
        assert_eq!(normalize_pattern("max_connections: 100"), "max_connections: <number>");
    }

    #[test]
    fn test_normalize_string() {
        assert_eq!(normalize_pattern(r#"algorithm = "AES-256""#), "algorithm = <string>");
        assert_eq!(normalize_pattern(r#"mode: "CBC""#), "mode: <string>");
    }

    #[test]
    fn test_normalize_preserves_identifiers() {
        // Should not replace variable names or function names
        let input = "config.tls_version = 1.0";
        let result = normalize_pattern(input);
        assert!(result.contains("config.tls_version"));
    }

    #[test]
    fn test_normalize_mixed() {
        let input = r#"config = { version: "1.2", enabled: true, max: 100 }"#;
        let result = normalize_pattern(input);
        assert!(result.contains("<string:version>"));
        assert!(result.contains("<boolean>"));
        assert!(result.contains("<number>"));
    }

    #[test]
    fn test_levenshtein_identical() {
        assert_eq!(levenshtein_distance("hello", "hello"), 0);
    }

    #[test]
    fn test_levenshtein_empty() {
        assert_eq!(levenshtein_distance("", "hello"), 5);
        assert_eq!(levenshtein_distance("hello", ""), 5);
        assert_eq!(levenshtein_distance("", ""), 0);
    }

    #[test]
    fn test_levenshtein_single_edit() {
        assert_eq!(levenshtein_distance("hello", "hallo"), 1);
        assert_eq!(levenshtein_distance("hello", "hell"), 1);
        assert_eq!(levenshtein_distance("hello", "helloo"), 1);
    }

    #[test]
    fn test_levenshtein_multiple_edits() {
        assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
        assert_eq!(levenshtein_distance("saturday", "sunday"), 3);
    }

    #[test]
    fn test_similarity_identical() {
        assert!((pattern_similarity("hello", "hello") - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_similarity_empty() {
        assert!((pattern_similarity("", "") - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_similarity_completely_different() {
        let sim = pattern_similarity("abc", "xyz");
        assert!(sim < 0.5);
    }

    #[test]
    fn test_similarity_threshold() {
        // `b` is `a` with the 8 characters "_VERSION" inserted, giving a
        // 40-character string, so the score lands right around 0.8. Assert a
        // looser bound so the check does not depend on float rounding.
        let a = "const TLS_MIN = <string:version>";
        let b = "const TLS_MIN_VERSION = <string:version>";
        let sim = pattern_similarity(a, b);
        assert!(sim > 0.7);
    }

    #[test]
    fn test_are_patterns_similar() {
        let a = "verify_ssl = <boolean>";
        let b = "verify_ssl = <boolean>";
        assert!(are_patterns_similar(a, b, 0.8).is_some());

        let c = "verify_ssl = <boolean>";
        let d = "skip_verification = <boolean>";
        assert!(are_patterns_similar(c, d, 0.8).is_none());

        // Verify we get the actual similarity score back
        let score = are_patterns_similar(a, b, 0.8);
        assert!(matches!(score, Some(s) if (s - 1.0).abs() < 0.001));
    }

    #[test]
    fn test_normalize_does_not_affect_placeholders() {
        // Normalizing already-normalized text must leave it exactly as it was.
        let already_normalized = "verify_ssl = <boolean>";
        assert_eq!(normalize_pattern(already_normalized), already_normalized);
    }
}