//! XML External Entity (XXE) vulnerability extractor. //! //! Detects patterns where XML parsers are used without disabling external entity //! processing, which can lead to data exfiltration, SSRF, or denial of service. use regex::Regex; use stemedb_core::types::ObjectValue; use super::traits::{build_claim, Extractor}; use crate::types::{Observation, Language}; /// Extractor for XXE vulnerabilities. /// /// Detects patterns indicating potentially unsafe XML parsing: /// - Python: lxml, xml.etree, xml.dom.minidom, xml.sax /// - JavaScript: xml2js, libxmljs /// - Go: encoding/xml /// - Java-style patterns (polyglot detection) /// - DTD entity declarations pub struct XxeExtractor { // Python patterns python_lxml: Regex, python_etree: Regex, python_minidom: Regex, python_sax: Regex, // JavaScript patterns js_xml2js: Regex, js_libxmljs: Regex, // Go patterns go_xml: Regex, // Java-style patterns java_xxe: Regex, // DTD entity declaration entity_decl: Regex, } impl Default for XxeExtractor { fn default() -> Self { Self::new() } } impl XxeExtractor { /// Create a new XXE extractor with compiled regexes. /// /// # Panics /// Panics if any regex pattern is invalid (programmer error). #[allow(clippy::expect_used)] pub fn new() -> Self { Self { // Python: lxml/etree parse python_lxml: Regex::new(r#"(?:etree|lxml)\.(?:parse|fromstring|XML)\s*\("#) .expect("valid regex"), // Python: xml.etree.ElementTree python_etree: Regex::new( r#"(?:xml\.etree\.ElementTree|ET)\.(?:parse|fromstring|XMLParser)\s*\("#, ) .expect("valid regex"), // Python: xml.dom.minidom python_minidom: Regex::new(r#"xml\.dom\.minidom\.(?:parse|parseString)\s*\("#) .expect("valid regex"), // Python: xml.sax python_sax: Regex::new(r#"xml\.sax\.(?:parse|parseString|make_parser)\s*\("#) .expect("valid regex"), // JavaScript: xml2js js_xml2js: Regex::new(r#"xml2js\.(?:parseString|Parser)\s*\("#).expect("valid regex"), // JavaScript: libxmljs js_libxmljs: Regex::new(r#"libxmljs\.parseXml\s*\("#).expect("valid regex"), // Go: encoding/xml go_xml: Regex::new(r#"xml\.(?:Unmarshal|NewDecoder)\s*\("#).expect("valid regex"), // Java-style patterns (polyglot detection in config files, etc.) java_xxe: Regex::new( r#"(?:DocumentBuilder|SAXParser|XMLReader|TransformerFactory)(?:Factory)?\.new"#, ) .expect("valid regex"), // DTD entity declaration (dangerous in untrusted XML) entity_decl: Regex::new(r#" Observation { build_claim( path_segments, &["xml", "parsing"], "parser_config", ObjectValue::Text(parser.to_string()), file, line, matched, confidence, description, ) } } impl Extractor for XxeExtractor { fn name(&self) -> &str { "xxe" } fn languages(&self) -> &[Language] { &[Language::Python, Language::JavaScript, Language::TypeScript, Language::Go] } fn extract( &self, path_segments: &[String], content: &str, language: Language, file: &str, ) -> Vec { let mut claims = Vec::new(); for (line_idx, line) in content.lines().enumerate() { let line_num = line_idx + 1; // Check for DTD entity declarations (high risk in any context) if let Some(m) = self.entity_decl.find(line) { claims.push(Self::make_claim( path_segments, file, line_num, m.as_str(), "dtd_entity", 0.95, "DTD SYSTEM/PUBLIC entity declaration (XXE attack vector)", )); } match language { Language::Python => { // lxml/etree (can be safe with proper configuration) if let Some(m) = self.python_lxml.find(line) { // Lower confidence if defusedxml is imported or resolve_entities=False let confidence = if content.contains("defusedxml") || line.contains("resolve_entities=False") { 0.5 } else { 0.85 }; claims.push(Self::make_claim( path_segments, file, line_num, m.as_str(), "lxml", confidence, "lxml XML parsing may be vulnerable to XXE without proper config", )); } // xml.etree.ElementTree if let Some(m) = self.python_etree.find(line) { // Python 3.8+ has some protections, but external entities still a concern claims.push(Self::make_claim( path_segments, file, line_num, m.as_str(), "elementtree", 0.75, "xml.etree.ElementTree may allow external entity expansion", )); } // xml.dom.minidom (vulnerable by default) if let Some(m) = self.python_minidom.find(line) { claims.push(Self::make_claim( path_segments, file, line_num, m.as_str(), "minidom", 0.85, "xml.dom.minidom is vulnerable to XXE attacks", )); } // xml.sax (needs feature flags to be safe) if let Some(m) = self.python_sax.find(line) { claims.push(Self::make_claim( path_segments, file, line_num, m.as_str(), "sax", 0.85, "xml.sax is vulnerable to XXE without feature_external_ges=False", )); } } Language::JavaScript | Language::TypeScript => { // xml2js (generally safer, but can be misconfigured) if let Some(m) = self.js_xml2js.find(line) { claims.push(Self::make_claim( path_segments, file, line_num, m.as_str(), "xml2js", 0.7, "xml2js XML parsing - verify external entity settings", )); } // libxmljs (can be vulnerable) if let Some(m) = self.js_libxmljs.find(line) { claims.push(Self::make_claim( path_segments, file, line_num, m.as_str(), "libxmljs", 0.85, "libxmljs may be vulnerable to XXE attacks", )); } } Language::Go => { // encoding/xml (safer by default, but DTD expansion can be issue) if let Some(m) = self.go_xml.find(line) { claims.push(Self::make_claim( path_segments, file, line_num, m.as_str(), "encoding_xml", 0.65, "Go xml package - generally safe but verify with untrusted input", )); } } _ => {} } // Check for Java patterns (polyglot detection) if let Some(m) = self.java_xxe.find(line) { claims.push(Self::make_claim( path_segments, file, line_num, m.as_str(), "java_parser", 0.9, "Java XML parser - requires feature flags to prevent XXE", )); } } claims } } #[cfg(test)] mod tests { use super::*; #[test] fn test_python_lxml() { let extractor = XxeExtractor::new(); let content = r#" doc = etree.parse(xml_file) "#; let claims = extractor.extract(&["python".to_string()], content, Language::Python, "parser.py"); assert_eq!(claims.len(), 1); assert!(claims[0].concept_path.contains("xml/parsing")); } #[test] fn test_python_lxml_with_defusedxml() { let extractor = XxeExtractor::new(); let content = r#" import defusedxml.ElementTree as ET doc = etree.parse(xml_file) "#; let claims = extractor.extract(&["python".to_string()], content, Language::Python, "parser.py"); // Should still detect but with lower confidence assert_eq!(claims.len(), 1); assert!(claims[0].confidence < 0.6); } #[test] fn test_python_elementtree() { let extractor = XxeExtractor::new(); let content = r#" import xml.etree.ElementTree as ET tree = ET.parse(source) "#; let claims = extractor.extract(&["python".to_string()], content, Language::Python, "xml.py"); assert_eq!(claims.len(), 1); } #[test] fn test_python_minidom() { let extractor = XxeExtractor::new(); let content = r#" from xml.dom.minidom import parse doc = xml.dom.minidom.parse(xml_string) "#; let claims = extractor.extract(&["python".to_string()], content, Language::Python, "parser.py"); assert_eq!(claims.len(), 1); assert!(claims[0].description.contains("minidom")); } #[test] fn test_python_sax() { let extractor = XxeExtractor::new(); let content = r#" xml.sax.parse(source, handler) "#; let claims = extractor.extract(&["python".to_string()], content, Language::Python, "handler.py"); assert_eq!(claims.len(), 1); } #[test] fn test_js_xml2js() { let extractor = XxeExtractor::new(); let content = r#" xml2js.parseString(xmlData, callback); "#; let claims = extractor.extract(&["js".to_string()], content, Language::JavaScript, "parser.js"); assert_eq!(claims.len(), 1); } #[test] fn test_js_libxmljs() { let extractor = XxeExtractor::new(); let content = r#" const doc = libxmljs.parseXml(xmlString); "#; let claims = extractor.extract(&["js".to_string()], content, Language::JavaScript, "parser.js"); assert_eq!(claims.len(), 1); } #[test] fn test_go_xml() { let extractor = XxeExtractor::new(); let content = r#" err := xml.Unmarshal(data, &result) "#; let claims = extractor.extract(&["go".to_string()], content, Language::Go, "parser.go"); assert_eq!(claims.len(), 1); } #[test] fn test_java_parser() { let extractor = XxeExtractor::new(); let content = r#" DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); "#; let claims = extractor.extract(&["python".to_string()], content, Language::Python, "mixed.py"); assert_eq!(claims.len(), 1); assert!(claims[0].description.contains("Java")); } #[test] fn test_dtd_entity() { let extractor = XxeExtractor::new(); let content = r#" "#; // Use a non-test filename to avoid confidence reduction let claims = extractor.extract(&["python".to_string()], content, Language::Python, "parser.xml"); assert_eq!(claims.len(), 1); assert!(claims[0].confidence >= 0.9); assert!(claims[0].description.contains("XXE attack vector")); } #[test] fn test_dtd_public_entity() { let extractor = XxeExtractor::new(); let content = r#" "#; let claims = extractor.extract(&["python".to_string()], content, Language::Python, "test.xml"); assert_eq!(claims.len(), 1); } }