Enterprise Features: - Hosted mode with remote sync for team pattern aggregation - Community sharing with privacy-preserving anonymization - LLM-based semantic claim extraction with Gemini integration - Pattern learning with promotion to declarative extractors - High-entropy secrets extractor with configurable thresholds - Auth bypass and insecure cookies extractors Module Refactoring: - Split oversized files to comply with 500-line limit - Config split: types/core.rs, types/extractors.rs, types/hosted.rs, etc. - Handlers split: scan.rs, policy.rs, report.rs modules - Extractors split: declarative/, high_entropy_secrets/, insecure_cookies/ - Learning split: store modules with metrics and persistence SDK & Ontology: - stemedb-ontology SDK with fluent builders and StemeDB client - Pharma domain extractors for FDA Orange Book data - Consumer health UAT test infrastructure Code Quality: - Fixed clippy warnings (needless_borrows_for_generic_args) - Added KVStore trait imports where needed - Fixed utoipa path re-exports for OpenAPI docs Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
290 lines
9.3 KiB
Rust
290 lines
9.3 KiB
Rust
//! Pattern normalization for learned patterns.
|
|
//!
|
|
//! Converts code snippets into normalized patterns by replacing
|
|
//! literal values with typed placeholders. This enables deduplication
|
|
//! and similarity matching across different instances of the same pattern.
|
|
|
|
use once_cell::sync::Lazy;
|
|
use regex::Regex;
|
|
|
|
/// Compile a regex pattern, returning None on failure.
|
|
///
|
|
/// Returns `Option<Regex>` instead of panicking because:
|
|
/// 1. Clippy forbids `expect()` in this codebase for production safety
|
|
/// 2. Regex compilation is infallible for our known-valid patterns, but
|
|
/// the type system can't prove this at compile time
|
|
/// 3. Callers gracefully skip normalization if regex is None, which is
|
|
/// acceptable degradation (patterns just won't be normalized)
|
|
fn compile_regex(pattern: &str) -> Option<Regex> {
|
|
Regex::new(pattern).ok()
|
|
}
|
|
|
|
// Match version-like strings: "1.0", "TLSv1.2", "SSLv3", etc.
//
// The match covers the whole quoted literal, quotes included: an optional
// `TLS`/`SSL` prefix, an optional `v`, then one or more dot-separated digit
// groups. A quoted bare integer such as "3" therefore also matches. The opening
// and closing quote are each allowed to be `"` or `'` independently.
static VERSION_RE: Lazy<Option<Regex>> =
    Lazy::new(|| compile_regex(r#"["'](?:TLS|SSL)?v?(\d+(?:\.\d+)*)["']"#));
|
|
|
|
// Match boolean keywords in lowercase, capitalized, or uppercase spelling
// (`true`, `True`, `TRUE`, and the `false` equivalents).
//
// The surrounding `\b` anchors require word boundaries on both sides, so the
// keyword is not matched inside a longer word such as `trueish`.
static BOOL_RE: Lazy<Option<Regex>> =
    Lazy::new(|| compile_regex(r"\b(true|false|True|False|TRUE|FALSE)\b"));
|
|
|
|
// Match standalone numbers after : or = (common in configs).
//
// Capture group 1 is the `:`/`=` separator plus any whitespace after it; group 2
// is the integer or decimal number. `normalize_pattern` re-emits group 1 through
// `$1` so only the number itself is replaced. The trailing `\b` means a number
// glued to a following word character (e.g. `30s`) is not matched.
//
// LIMITATION: This regex requires `:` or `=` context, so it won't match:
// - Array elements like `[1, 2, 3]`
// - Bare numbers in function arguments
// - Numbers in other syntactic positions
//
// This is intentional to avoid false positives on line numbers, indices,
// and other numeric literals that aren't configuration values.
static NUM_RE: Lazy<Option<Regex>> = Lazy::new(|| compile_regex(r"([:=]\s*)(\d+(?:\.\d+)?)\b"));
|
|
|
|
// Match any quoted string literal, quotes included.
//
// Each alternative pairs an opening quote with a closing quote of the SAME kind
// and only forbids that kind inside the literal. This keeps a double-quoted
// string containing an apostrophe (e.g. "don't") as one literal instead of
// cutting it off at the apostrophe, and likewise for single-quoted strings that
// contain a double quote. The regex crate has no backreferences, hence the
// explicit alternation. Escaped quotes inside a literal are not understood.
static STRING_RE: Lazy<Option<Regex>> = Lazy::new(|| compile_regex(r#""[^"]*"|'[^']*'"#));
|
|
|
|
/// Normalize a code pattern by replacing literals with typed placeholders.
|
|
///
|
|
/// # Placeholder Types
|
|
/// - `<string>` - Generic string value
|
|
/// - `<string:version>` - Version-like string (e.g., "1.0", "TLSv1.2")
|
|
/// - `<number>` - Numeric value
|
|
/// - `<boolean>` - true/false
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```ignore
|
|
/// normalize_pattern("const TLS_MIN = \"1.0\"")
|
|
/// // => "const TLS_MIN = <string:version>"
|
|
///
|
|
/// normalize_pattern("pool_size: 25")
|
|
/// // => "pool_size: <number>"
|
|
///
|
|
/// normalize_pattern("verify_ssl = false")
|
|
/// // => "verify_ssl = <boolean>"
|
|
/// ```
|
|
pub fn normalize_pattern(code: &str) -> String {
|
|
let mut result = code.to_string();
|
|
|
|
// Order matters: more specific patterns first
|
|
|
|
// 1. Version-like strings (1.0, 1.2, TLSv1.2, SSLv3, etc.)
|
|
if let Some(re) = VERSION_RE.as_ref() {
|
|
result = re.replace_all(&result, "<string:version>").to_string();
|
|
}
|
|
|
|
// 2. Boolean literals
|
|
if let Some(re) = BOOL_RE.as_ref() {
|
|
result = re.replace_all(&result, "<boolean>").to_string();
|
|
}
|
|
|
|
// 3. Numeric literals after : or = (common in configs)
|
|
if let Some(re) = NUM_RE.as_ref() {
|
|
result = re.replace_all(&result, "$1<number>").to_string();
|
|
}
|
|
|
|
// 4. Remaining quoted strings (that weren't versions)
|
|
if let Some(re) = STRING_RE.as_ref() {
|
|
result = re.replace_all(&result, "<string>").to_string();
|
|
}
|
|
|
|
result
|
|
}
|
|
|
|
/// Calculate similarity score between two normalized patterns.
///
/// Uses normalized Levenshtein distance: the edit distance divided by the length
/// of the longer input, subtracted from one. Both the distance and the length are
/// measured in Unicode scalar values (`char`s), so multi-byte characters do not
/// inflate the score.
///
/// Returns a value between 0.0 (completely different) and 1.0 (identical).
/// Two empty strings are identical and score 1.0.
///
/// # Threshold
///
/// Patterns with similarity >= 0.8 are typically considered duplicates.
pub fn pattern_similarity(a: &str, b: &str) -> f32 {
    // Fast path; also covers the case where both inputs are empty.
    if a == b {
        return 1.0;
    }

    let distance = levenshtein_distance(a, b);

    // Normalize by character count, the same unit the distance is measured in.
    // `str::len()` counts bytes and would overstate similarity for non-ASCII text.
    // The inputs differ, so at least one is non-empty and `max_len` is >= 1.
    let max_len = a.chars().count().max(b.chars().count());

    // Precision loss in the casts is irrelevant at realistic pattern lengths.
    1.0 - (distance as f32 / max_len as f32)
}

/// Compute the Levenshtein edit distance between two strings.
///
/// The distance is the minimum number of single-character insertions, deletions,
/// and substitutions needed to turn `a` into `b`, where a "character" is a
/// Unicode scalar value (`char`).
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let a_chars: Vec<char> = a.chars().collect();
    let b_chars: Vec<char> = b.chars().collect();

    // The distance is symmetric, so put the shorter input along the row. That
    // keeps the two working rows as small as possible.
    let (outer, inner) = if a_chars.len() >= b_chars.len() {
        (&a_chars, &b_chars)
    } else {
        (&b_chars, &a_chars)
    };

    // Against an empty string the distance is just the other string's length.
    if inner.is_empty() {
        return outer.len();
    }

    // Use two rows instead of the full matrix for memory efficiency.
    // `prev_row[j]` is the distance between the processed prefix of `outer` and
    // the first `j` characters of `inner`.
    let mut prev_row: Vec<usize> = (0..=inner.len()).collect();
    let mut curr_row: Vec<usize> = vec![0; inner.len() + 1];

    for (i, &outer_ch) in outer.iter().enumerate() {
        // Turning `i + 1` characters into the empty prefix takes `i + 1` deletions.
        curr_row[0] = i + 1;

        for (j, &inner_ch) in inner.iter().enumerate() {
            let substitution = prev_row[j] + usize::from(outer_ch != inner_ch);
            let deletion = prev_row[j + 1] + 1;
            let insertion = curr_row[j] + 1;

            curr_row[j + 1] = substitution.min(deletion).min(insertion);
        }

        std::mem::swap(&mut prev_row, &mut curr_row);
    }

    // After the final swap the last computed row lives in `prev_row`.
    prev_row[inner.len()]
}
|
|
|
|
/// Check if two patterns are similar enough to be considered duplicates.
|
|
///
|
|
/// Returns `Some(similarity)` if patterns meet the threshold, `None` otherwise.
|
|
/// This avoids computing similarity twice when both the check and score are needed.
|
|
pub fn are_patterns_similar(a: &str, b: &str, threshold: f32) -> Option<f32> {
|
|
let similarity = pattern_similarity(a, b);
|
|
if similarity >= threshold {
|
|
Some(similarity)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Unit tests for normalization, edit distance, and similarity scoring.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_version_string() {
        assert_eq!(
            normalize_pattern(r#"const TLS_MIN = "1.0""#),
            "const TLS_MIN = <string:version>"
        );
        assert_eq!(normalize_pattern(r#"min_version: "TLSv1.2""#), "min_version: <string:version>");
        assert_eq!(normalize_pattern(r#"ssl_version = "SSLv3""#), "ssl_version = <string:version>");
    }

    #[test]
    fn test_normalize_boolean() {
        assert_eq!(normalize_pattern("verify_ssl = false"), "verify_ssl = <boolean>");
        assert_eq!(normalize_pattern("enabled: true"), "enabled: <boolean>");
        assert_eq!(normalize_pattern("DEBUG = True"), "DEBUG = <boolean>");
        assert_eq!(normalize_pattern("SKIP_AUTH = FALSE"), "SKIP_AUTH = <boolean>");
    }

    #[test]
    fn test_normalize_number() {
        assert_eq!(normalize_pattern("pool_size: 25"), "pool_size: <number>");
        assert_eq!(normalize_pattern("timeout = 30.5"), "timeout = <number>");
        assert_eq!(normalize_pattern("max_connections: 100"), "max_connections: <number>");
    }

    #[test]
    fn test_normalize_string() {
        assert_eq!(normalize_pattern(r#"algorithm = "AES-256""#), "algorithm = <string>");
        assert_eq!(normalize_pattern(r#"mode: "CBC""#), "mode: <string>");
    }

    #[test]
    fn test_normalize_preserves_identifiers() {
        // Should not replace variable names or function names
        let input = "config.tls_version = 1.0";
        let result = normalize_pattern(input);
        assert!(result.contains("config.tls_version"));
    }

    #[test]
    fn test_normalize_mixed() {
        let input = r#"config = { version: "1.2", enabled: true, max: 100 }"#;
        let result = normalize_pattern(input);
        assert!(result.contains("<string:version>"));
        assert!(result.contains("<boolean>"));
        assert!(result.contains("<number>"));
    }

    #[test]
    fn test_levenshtein_identical() {
        assert_eq!(levenshtein_distance("hello", "hello"), 0);
    }

    #[test]
    fn test_levenshtein_empty() {
        assert_eq!(levenshtein_distance("", "hello"), 5);
        assert_eq!(levenshtein_distance("hello", ""), 5);
        assert_eq!(levenshtein_distance("", ""), 0);
    }

    #[test]
    fn test_levenshtein_single_edit() {
        assert_eq!(levenshtein_distance("hello", "hallo"), 1);
        assert_eq!(levenshtein_distance("hello", "hell"), 1);
        assert_eq!(levenshtein_distance("hello", "helloo"), 1);
    }

    #[test]
    fn test_levenshtein_multiple_edits() {
        assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
        assert_eq!(levenshtein_distance("saturday", "sunday"), 3);
    }

    #[test]
    fn test_similarity_identical() {
        assert!((pattern_similarity("hello", "hello") - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_similarity_empty() {
        assert!((pattern_similarity("", "") - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_similarity_completely_different() {
        let sim = pattern_similarity("abc", "xyz");
        assert!(sim < 0.5);
    }

    #[test]
    fn test_similarity_threshold() {
        // `b` is `a` with `_VERSION` inserted: 8 edits over 40 characters, which
        // lands right at the 0.8 duplicate mark. Assert a looser bound so the
        // test does not hinge on floating-point rounding at the boundary.
        let a = "const TLS_MIN = <string:version>";
        let b = "const TLS_MIN_VERSION = <string:version>";
        let sim = pattern_similarity(a, b);
        assert!(sim > 0.7);
    }

    #[test]
    fn test_are_patterns_similar() {
        let a = "verify_ssl = <boolean>";
        let b = "verify_ssl = <boolean>";
        assert!(are_patterns_similar(a, b, 0.8).is_some());

        let c = "verify_ssl = <boolean>";
        let d = "skip_verification = <boolean>";
        assert!(are_patterns_similar(c, d, 0.8).is_none());

        // Verify we get the actual similarity score back
        let score = are_patterns_similar(a, b, 0.8);
        assert!(score.is_some());
        assert!((score.unwrap() - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_normalize_does_not_affect_placeholders() {
        // Normalizing already-normalized text must be a no-op: none of the
        // passes may match inside (or rewrite) an existing placeholder.
        let already_normalized = "verify_ssl = <boolean>";
        let result = normalize_pattern(already_normalized);
        assert_eq!(result, already_normalized);
    }
}
|