Major additions: - Community Next.js app (port 18187) for browsing claims with API docs - stemedb-chaos crate: Fault injection, chaos testing, CRDT properties - Latent ingestion system: Reddit/FDA ingesters with ADK-Go agents - Disputed claims handling: Manual review workflows and validation - Aphoria security scanner: New extractors (SQL injection, command injection, weak crypto, TLS version), policy-based ignores, UAT reports - Docker infrastructure: Dockerfile, docker-compose.yml for full stack - VulnBank demo: Intentionally vulnerable multi-language test corpus SDK & API enhancements: - Source registry handlers for tracking data provenance - Metrics endpoint - Skeptic filtering improvements Code quality: - Split 14 large files (>500 lines) into focused modules - All files now under 500-line limit per project guidelines Documentation: - Chaos testing guide, circuit breakers, observability docs - Phase 7 UAT documentation updates - Martin Kleppmann technical writer agent Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
228 lines
7.4 KiB
Rust
228 lines
7.4 KiB
Rust
//! Helper functions for the research module.
|
|
//!
|
|
//! Contains extraction, normalization, and scoring logic.
|
|
|
|
use regex::Regex;
|
|
use stemedb_core::types::ObjectValue;
|
|
|
|
use super::researcher::DocumentationSource;
|
|
|
|
/// Default documentation sources to search.
|
|
pub(super) fn default_documentation_sources() -> Vec<DocumentationSource> {
|
|
vec![
|
|
DocumentationSource {
|
|
name: "Redis Official Docs".to_string(),
|
|
url_pattern: "https://redis.io/docs/management/{topic}/".to_string(),
|
|
topics: vec!["redis".to_string(), "cache".to_string(), "memory".to_string()],
|
|
tier: 2,
|
|
},
|
|
DocumentationSource {
|
|
name: "PostgreSQL Docs".to_string(),
|
|
url_pattern: "https://www.postgresql.org/docs/current/{topic}.html".to_string(),
|
|
topics: vec![
|
|
"postgres".to_string(),
|
|
"postgresql".to_string(),
|
|
"database".to_string(),
|
|
"connection".to_string(),
|
|
"pool".to_string(),
|
|
],
|
|
tier: 2,
|
|
},
|
|
DocumentationSource {
|
|
name: "Go Documentation".to_string(),
|
|
url_pattern: "https://pkg.go.dev/net/http#{topic}".to_string(),
|
|
topics: vec!["http".to_string(), "timeout".to_string(), "server".to_string()],
|
|
tier: 2,
|
|
},
|
|
DocumentationSource {
|
|
name: "Rust reqwest Docs".to_string(),
|
|
url_pattern: "https://docs.rs/reqwest/latest/reqwest/".to_string(),
|
|
topics: vec![
|
|
"reqwest".to_string(),
|
|
"http".to_string(),
|
|
"client".to_string(),
|
|
"tls".to_string(),
|
|
],
|
|
tier: 2,
|
|
},
|
|
DocumentationSource {
|
|
name: "OWASP".to_string(),
|
|
url_pattern: "https://cheatsheetseries.owasp.org/cheatsheets/{topic}_Cheat_Sheet.html"
|
|
.to_string(),
|
|
topics: vec![
|
|
"authentication".to_string(),
|
|
"session".to_string(),
|
|
"jwt".to_string(),
|
|
"password".to_string(),
|
|
"input".to_string(),
|
|
],
|
|
tier: 1,
|
|
},
|
|
DocumentationSource {
|
|
name: "Kafka Documentation".to_string(),
|
|
url_pattern: "https://kafka.apache.org/documentation/#{topic}".to_string(),
|
|
topics: vec![
|
|
"kafka".to_string(),
|
|
"producer".to_string(),
|
|
"consumer".to_string(),
|
|
"retention".to_string(),
|
|
],
|
|
tier: 2,
|
|
},
|
|
DocumentationSource {
|
|
name: "MongoDB Docs".to_string(),
|
|
url_pattern: "https://www.mongodb.com/docs/manual/reference/{topic}/".to_string(),
|
|
topics: vec![
|
|
"mongo".to_string(),
|
|
"mongodb".to_string(),
|
|
"connection".to_string(),
|
|
"replica".to_string(),
|
|
],
|
|
tier: 2,
|
|
},
|
|
]
|
|
}
|
|
|
|
/// Determine scheme from URL.
|
|
pub(super) fn determine_scheme_from_url(url: &str) -> &'static str {
|
|
if url.contains("rfc-editor.org") || url.contains("ietf.org") {
|
|
"rfc"
|
|
} else if url.contains("owasp.org") {
|
|
"owasp"
|
|
} else {
|
|
"vendor"
|
|
}
|
|
}
|
|
|
|
/// Normalize a topic for use in a subject path.
|
|
pub(super) fn normalize_topic(topic: &str) -> String {
|
|
topic
|
|
.to_lowercase()
|
|
.chars()
|
|
.map(|c| if c.is_alphanumeric() || c == '/' { c } else { '_' })
|
|
.collect::<String>()
|
|
.trim_matches('_')
|
|
.to_string()
|
|
}
|
|
|
|
/// Extract normative statements from content.
|
|
pub(super) fn extract_normative_statements(
|
|
content: &str,
|
|
topic: &str,
|
|
) -> Vec<(String, String, u8)> {
|
|
let mut statements = Vec::new();
|
|
|
|
// Pattern for normative keywords with context
|
|
let keyword_pattern = Regex::new(
|
|
r"(?i)(?P<context>[^.]*?)\b(MUST NOT|MUST|SHALL NOT|SHALL|SHOULD NOT|SHOULD|REQUIRED|RECOMMENDED)\b(?P<rest>[^.]*\.)"
|
|
).ok();
|
|
|
|
// Pattern for section headings (HTML and markdown)
|
|
let heading_pattern = Regex::new(r"(?i)<h[1-6][^>]*>([^<]+)</h[1-6]>|^#{1,6}\s+(.+)$").ok();
|
|
|
|
// Extract headings for context
|
|
let mut current_section = "General".to_string();
|
|
|
|
for line in content.lines() {
|
|
// Update section context from headings
|
|
if let Some(ref pattern) = heading_pattern {
|
|
if let Some(caps) = pattern.captures(line) {
|
|
current_section = caps
|
|
.get(1)
|
|
.or_else(|| caps.get(2))
|
|
.map(|m| m.as_str().trim().to_string())
|
|
.unwrap_or_else(|| "General".to_string());
|
|
}
|
|
}
|
|
|
|
// Check if line is relevant to topic
|
|
let line_lower = line.to_lowercase();
|
|
let topic_lower = topic.to_lowercase();
|
|
let topic_parts: Vec<&str> = topic_lower.split('/').collect();
|
|
|
|
let is_relevant = topic_parts.iter().any(|part| line_lower.contains(part));
|
|
|
|
if !is_relevant {
|
|
continue;
|
|
}
|
|
|
|
// Extract normative statements
|
|
if let Some(ref pattern) = keyword_pattern {
|
|
for caps in pattern.captures_iter(line) {
|
|
let keyword = caps.get(2).map(|m| m.as_str().to_uppercase()).unwrap_or_default();
|
|
let full_statement =
|
|
caps.get(0).map(|m| m.as_str().trim().to_string()).unwrap_or_default();
|
|
|
|
// Determine keyword strength
|
|
let strength = match keyword.as_str() {
|
|
"MUST" | "SHALL" | "REQUIRED" => 3,
|
|
"MUST NOT" | "SHALL NOT" => 3,
|
|
"SHOULD" | "RECOMMENDED" => 2,
|
|
"SHOULD NOT" => 2,
|
|
_ => 1,
|
|
};
|
|
|
|
if !full_statement.is_empty() && full_statement.len() > 10 {
|
|
statements.push((current_section.clone(), full_statement, strength));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
statements
|
|
}
|
|
|
|
/// Determine value and predicate from a statement.
|
|
pub(super) fn determine_value_and_predicate(
|
|
statement: &str,
|
|
default_predicate: &str,
|
|
) -> (ObjectValue, String) {
|
|
let upper = statement.to_uppercase();
|
|
|
|
// Check for boolean-like patterns
|
|
if upper.contains("MUST NOT") || upper.contains("SHALL NOT") || upper.contains("SHOULD NOT") {
|
|
return (ObjectValue::Boolean(false), "disabled".to_string());
|
|
}
|
|
|
|
if upper.contains("MUST") || upper.contains("SHALL") || upper.contains("REQUIRED") {
|
|
return (ObjectValue::Boolean(true), "required".to_string());
|
|
}
|
|
|
|
if upper.contains("SHOULD") || upper.contains("RECOMMENDED") {
|
|
return (ObjectValue::Boolean(true), "recommended".to_string());
|
|
}
|
|
|
|
// Default
|
|
(ObjectValue::Boolean(true), default_predicate.to_string())
|
|
}
|
|
|
|
/// Calculate confidence score based on various factors.
|
|
pub(super) fn calculate_confidence(
|
|
keyword_strength: u8,
|
|
statement: &str,
|
|
content_length: usize,
|
|
) -> f32 {
|
|
let mut confidence = 0.5; // Base confidence
|
|
|
|
// Keyword strength contribution (0.0 to 0.3)
|
|
confidence += (keyword_strength as f32) * 0.1;
|
|
|
|
// Statement length contribution (longer = better context)
|
|
if statement.len() > 50 {
|
|
confidence += 0.1;
|
|
}
|
|
if statement.len() > 100 {
|
|
confidence += 0.05;
|
|
}
|
|
|
|
// Content length contribution (more content = more context)
|
|
if content_length > 5000 {
|
|
confidence += 0.05;
|
|
}
|
|
if content_length > 20000 {
|
|
confidence += 0.05;
|
|
}
|
|
|
|
confidence.min(1.0)
|
|
}
|