//! Pattern normalization for learned patterns. //! //! Converts code snippets into normalized patterns by replacing //! literal values with typed placeholders. This enables deduplication //! and similarity matching across different instances of the same pattern. use once_cell::sync::Lazy; use regex::Regex; /// Compile a regex pattern, returning None on failure. /// /// Returns `Option` instead of panicking because: /// 1. Clippy forbids `expect()` in this codebase for production safety /// 2. Regex compilation is infallible for our known-valid patterns, but /// the type system can't prove this at compile time /// 3. Callers gracefully skip normalization if regex is None, which is /// acceptable degradation (patterns just won't be normalized) fn compile_regex(pattern: &str) -> Option { Regex::new(pattern).ok() } // Match version-like strings: "1.0", "TLSv1.2", "SSLv3", etc. static VERSION_RE: Lazy> = Lazy::new(|| compile_regex(r#"["'](?:TLS|SSL)?v?(\d+(?:\.\d+)*)["']"#)); static BOOL_RE: Lazy> = Lazy::new(|| compile_regex(r"\b(true|false|True|False|TRUE|FALSE)\b")); // Match standalone numbers after : or = (common in configs). // // LIMITATION: This regex requires `:` or `=` context, so it won't match: // - Array elements like `[1, 2, 3]` // - Bare numbers in function arguments // - Numbers in other syntactic positions // // This is intentional to avoid false positives on line numbers, indices, // and other numeric literals that aren't configuration values. static NUM_RE: Lazy> = Lazy::new(|| compile_regex(r"([:=]\s*)(\d+(?:\.\d+)?)\b")); static STRING_RE: Lazy> = Lazy::new(|| compile_regex(r#"["'][^"']*["']"#)); /// Normalize a code pattern by replacing literals with typed placeholders. /// /// # Placeholder Types /// - `` - Generic string value /// - `` - Version-like string (e.g., "1.0", "TLSv1.2") /// - `` - Numeric value /// - `` - true/false /// /// # Examples /// /// ```ignore /// normalize_pattern("const TLS_MIN = \"1.0\"") /// // => "const TLS_MIN = " /// /// normalize_pattern("pool_size: 25") /// // => "pool_size: " /// /// normalize_pattern("verify_ssl = false") /// // => "verify_ssl = " /// ``` pub fn normalize_pattern(code: &str) -> String { let mut result = code.to_string(); // Order matters: more specific patterns first // 1. Version-like strings (1.0, 1.2, TLSv1.2, SSLv3, etc.) if let Some(re) = VERSION_RE.as_ref() { result = re.replace_all(&result, "").to_string(); } // 2. Boolean literals if let Some(re) = BOOL_RE.as_ref() { result = re.replace_all(&result, "").to_string(); } // 3. Numeric literals after : or = (common in configs) if let Some(re) = NUM_RE.as_ref() { result = re.replace_all(&result, "$1").to_string(); } // 4. Remaining quoted strings (that weren't versions) if let Some(re) = STRING_RE.as_ref() { result = re.replace_all(&result, "").to_string(); } result } /// Calculate similarity score between two normalized patterns. /// /// Uses normalized Levenshtein distance for comparison. /// Returns a value between 0.0 (completely different) and 1.0 (identical). /// /// # Threshold /// /// Patterns with similarity >= 0.8 are typically considered duplicates. pub fn pattern_similarity(a: &str, b: &str) -> f32 { if a == b { return 1.0; } let distance = levenshtein_distance(a, b); let max_len = a.len().max(b.len()); if max_len == 0 { return 1.0; } 1.0 - (distance as f32 / max_len as f32) } /// Compute the Levenshtein edit distance between two strings. fn levenshtein_distance(a: &str, b: &str) -> usize { let a_chars: Vec = a.chars().collect(); let b_chars: Vec = b.chars().collect(); let m = a_chars.len(); let n = b_chars.len(); if m == 0 { return n; } if n == 0 { return m; } // Use two rows instead of full matrix for memory efficiency let mut prev_row: Vec = (0..=n).collect(); let mut curr_row: Vec = vec![0; n + 1]; for i in 1..=m { curr_row[0] = i; for j in 1..=n { let cost = if a_chars[i - 1] == b_chars[j - 1] { 0 } else { 1 }; curr_row[j] = (prev_row[j] + 1) // deletion .min(curr_row[j - 1] + 1) // insertion .min(prev_row[j - 1] + cost); // substitution } std::mem::swap(&mut prev_row, &mut curr_row); } prev_row[n] } /// Check if two patterns are similar enough to be considered duplicates. /// /// Returns `Some(similarity)` if patterns meet the threshold, `None` otherwise. /// This avoids computing similarity twice when both the check and score are needed. pub fn are_patterns_similar(a: &str, b: &str, threshold: f32) -> Option { let similarity = pattern_similarity(a, b); if similarity >= threshold { Some(similarity) } else { None } } #[cfg(test)] mod tests { use super::*; #[test] fn test_normalize_version_string() { assert_eq!( normalize_pattern(r#"const TLS_MIN = "1.0""#), "const TLS_MIN = " ); assert_eq!(normalize_pattern(r#"min_version: "TLSv1.2""#), "min_version: "); assert_eq!(normalize_pattern(r#"ssl_version = "SSLv3""#), "ssl_version = "); } #[test] fn test_normalize_boolean() { assert_eq!(normalize_pattern("verify_ssl = false"), "verify_ssl = "); assert_eq!(normalize_pattern("enabled: true"), "enabled: "); assert_eq!(normalize_pattern("DEBUG = True"), "DEBUG = "); assert_eq!(normalize_pattern("SKIP_AUTH = FALSE"), "SKIP_AUTH = "); } #[test] fn test_normalize_number() { assert_eq!(normalize_pattern("pool_size: 25"), "pool_size: "); assert_eq!(normalize_pattern("timeout = 30.5"), "timeout = "); assert_eq!(normalize_pattern("max_connections: 100"), "max_connections: "); } #[test] fn test_normalize_string() { assert_eq!(normalize_pattern(r#"algorithm = "AES-256""#), "algorithm = "); assert_eq!(normalize_pattern(r#"mode: "CBC""#), "mode: "); } #[test] fn test_normalize_preserves_identifiers() { // Should not replace variable names or function names let input = "config.tls_version = 1.0"; let result = normalize_pattern(input); assert!(result.contains("config.tls_version")); } #[test] fn test_normalize_mixed() { let input = r#"config = { version: "1.2", enabled: true, max: 100 }"#; let result = normalize_pattern(input); assert!(result.contains("")); assert!(result.contains("")); assert!(result.contains("")); } #[test] fn test_levenshtein_identical() { assert_eq!(levenshtein_distance("hello", "hello"), 0); } #[test] fn test_levenshtein_empty() { assert_eq!(levenshtein_distance("", "hello"), 5); assert_eq!(levenshtein_distance("hello", ""), 5); assert_eq!(levenshtein_distance("", ""), 0); } #[test] fn test_levenshtein_single_edit() { assert_eq!(levenshtein_distance("hello", "hallo"), 1); assert_eq!(levenshtein_distance("hello", "hell"), 1); assert_eq!(levenshtein_distance("hello", "helloo"), 1); } #[test] fn test_levenshtein_multiple_edits() { assert_eq!(levenshtein_distance("kitten", "sitting"), 3); assert_eq!(levenshtein_distance("saturday", "sunday"), 3); } #[test] fn test_similarity_identical() { assert!((pattern_similarity("hello", "hello") - 1.0).abs() < 0.001); } #[test] fn test_similarity_empty() { assert!((pattern_similarity("", "") - 1.0).abs() < 0.001); } #[test] fn test_similarity_completely_different() { let sim = pattern_similarity("abc", "xyz"); assert!(sim < 0.5); } #[test] fn test_similarity_threshold() { // Similar patterns should be above 0.8 let a = "const TLS_MIN = "; let b = "const TLS_MIN_VERSION = "; let sim = pattern_similarity(a, b); // These are fairly similar but not identical assert!(sim > 0.7); } #[test] fn test_are_patterns_similar() { let a = "verify_ssl = "; let b = "verify_ssl = "; assert!(are_patterns_similar(a, b, 0.8).is_some()); let c = "verify_ssl = "; let d = "skip_verification = "; assert!(are_patterns_similar(c, d, 0.8).is_none()); // Verify we get the actual similarity score back let score = are_patterns_similar(a, b, 0.8); assert!(score.is_some()); assert!((score.unwrap() - 1.0).abs() < 0.001); } #[test] fn test_normalize_does_not_affect_placeholders() { // Placeholders should remain unchanged let already_normalized = "verify_ssl = "; let result = normalize_pattern(already_normalized); // The < and > should survive assert!(result.contains("") || result.contains("")); } }