//! Helper functions for the research module. //! //! Contains extraction, normalization, and scoring logic. use regex::Regex; use stemedb_core::types::ObjectValue; use super::researcher::DocumentationSource; /// Default documentation sources to search. pub(super) fn default_documentation_sources() -> Vec { vec![ DocumentationSource { name: "Redis Official Docs".to_string(), url_pattern: "https://redis.io/docs/management/{topic}/".to_string(), topics: vec!["redis".to_string(), "cache".to_string(), "memory".to_string()], tier: 2, }, DocumentationSource { name: "PostgreSQL Docs".to_string(), url_pattern: "https://www.postgresql.org/docs/current/{topic}.html".to_string(), topics: vec![ "postgres".to_string(), "postgresql".to_string(), "database".to_string(), "connection".to_string(), "pool".to_string(), ], tier: 2, }, DocumentationSource { name: "Go Documentation".to_string(), url_pattern: "https://pkg.go.dev/net/http#{topic}".to_string(), topics: vec!["http".to_string(), "timeout".to_string(), "server".to_string()], tier: 2, }, DocumentationSource { name: "Rust reqwest Docs".to_string(), url_pattern: "https://docs.rs/reqwest/latest/reqwest/".to_string(), topics: vec![ "reqwest".to_string(), "http".to_string(), "client".to_string(), "tls".to_string(), ], tier: 2, }, DocumentationSource { name: "OWASP".to_string(), url_pattern: "https://cheatsheetseries.owasp.org/cheatsheets/{topic}_Cheat_Sheet.html" .to_string(), topics: vec![ "authentication".to_string(), "session".to_string(), "jwt".to_string(), "password".to_string(), "input".to_string(), ], tier: 1, }, DocumentationSource { name: "Kafka Documentation".to_string(), url_pattern: "https://kafka.apache.org/documentation/#{topic}".to_string(), topics: vec![ "kafka".to_string(), "producer".to_string(), "consumer".to_string(), "retention".to_string(), ], tier: 2, }, DocumentationSource { name: "MongoDB Docs".to_string(), url_pattern: "https://www.mongodb.com/docs/manual/reference/{topic}/".to_string(), topics: vec![ "mongo".to_string(), "mongodb".to_string(), "connection".to_string(), "replica".to_string(), ], tier: 2, }, ] } /// Determine scheme from URL. pub(super) fn determine_scheme_from_url(url: &str) -> &'static str { if url.contains("rfc-editor.org") || url.contains("ietf.org") { "rfc" } else if url.contains("owasp.org") { "owasp" } else { "vendor" } } /// Normalize a topic for use in a subject path. pub(super) fn normalize_topic(topic: &str) -> String { topic .to_lowercase() .chars() .map(|c| if c.is_alphanumeric() || c == '/' { c } else { '_' }) .collect::() .trim_matches('_') .to_string() } /// Extract normative statements from content. pub(super) fn extract_normative_statements( content: &str, topic: &str, ) -> Vec<(String, String, u8)> { let mut statements = Vec::new(); // Pattern for normative keywords with context let keyword_pattern = Regex::new( r"(?i)(?P[^.]*?)\b(MUST NOT|MUST|SHALL NOT|SHALL|SHOULD NOT|SHOULD|REQUIRED|RECOMMENDED)\b(?P[^.]*\.)" ).ok(); // Pattern for section headings (HTML and markdown) let heading_pattern = Regex::new(r"(?i)]*>([^<]+)|^#{1,6}\s+(.+)$").ok(); // Extract headings for context let mut current_section = "General".to_string(); for line in content.lines() { // Update section context from headings if let Some(ref pattern) = heading_pattern { if let Some(caps) = pattern.captures(line) { current_section = caps .get(1) .or_else(|| caps.get(2)) .map(|m| m.as_str().trim().to_string()) .unwrap_or_else(|| "General".to_string()); } } // Check if line is relevant to topic let line_lower = line.to_lowercase(); let topic_lower = topic.to_lowercase(); let topic_parts: Vec<&str> = topic_lower.split('/').collect(); let is_relevant = topic_parts.iter().any(|part| line_lower.contains(part)); if !is_relevant { continue; } // Extract normative statements if let Some(ref pattern) = keyword_pattern { for caps in pattern.captures_iter(line) { let keyword = caps.get(2).map(|m| m.as_str().to_uppercase()).unwrap_or_default(); let full_statement = caps.get(0).map(|m| m.as_str().trim().to_string()).unwrap_or_default(); // Determine keyword strength let strength = match keyword.as_str() { "MUST" | "SHALL" | "REQUIRED" => 3, "MUST NOT" | "SHALL NOT" => 3, "SHOULD" | "RECOMMENDED" => 2, "SHOULD NOT" => 2, _ => 1, }; if !full_statement.is_empty() && full_statement.len() > 10 { statements.push((current_section.clone(), full_statement, strength)); } } } } statements } /// Determine value and predicate from a statement. pub(super) fn determine_value_and_predicate( statement: &str, default_predicate: &str, ) -> (ObjectValue, String) { let upper = statement.to_uppercase(); // Check for boolean-like patterns if upper.contains("MUST NOT") || upper.contains("SHALL NOT") || upper.contains("SHOULD NOT") { return (ObjectValue::Boolean(false), "disabled".to_string()); } if upper.contains("MUST") || upper.contains("SHALL") || upper.contains("REQUIRED") { return (ObjectValue::Boolean(true), "required".to_string()); } if upper.contains("SHOULD") || upper.contains("RECOMMENDED") { return (ObjectValue::Boolean(true), "recommended".to_string()); } // Default (ObjectValue::Boolean(true), default_predicate.to_string()) } /// Calculate confidence score based on various factors. pub(super) fn calculate_confidence( keyword_strength: u8, statement: &str, content_length: usize, ) -> f32 { let mut confidence = 0.5; // Base confidence // Keyword strength contribution (0.0 to 0.3) confidence += (keyword_strength as f32) * 0.1; // Statement length contribution (longer = better context) if statement.len() > 50 { confidence += 0.1; } if statement.len() > 100 { confidence += 0.05; } // Content length contribution (more content = more context) if content_length > 5000 { confidence += 0.05; } if content_length > 20000 { confidence += 0.05; } confidence.min(1.0) }