stemedb/applications/aphoria/src/research/helpers.rs
jordan b3e8a9a058 feat: Multi-application expansion with chaos testing and community UI
Major additions:
- Community Next.js app (port 18187) for browsing claims with API docs
- stemedb-chaos crate: Fault injection, chaos testing, CRDT properties
- Latent ingestion system: Reddit/FDA ingesters with ADK-Go agents
- Disputed claims handling: Manual review workflows and validation
- Aphoria security scanner: New extractors (SQL injection, command
  injection, weak crypto, TLS version), policy-based ignores, UAT reports
- Docker infrastructure: Dockerfile, docker-compose.yml for full stack
- VulnBank demo: Intentionally vulnerable multi-language test corpus

SDK & API enhancements:
- Source registry handlers for tracking data provenance
- Metrics endpoint
- Skeptic filtering improvements

Code quality:
- Split 14 large files (>500 lines) into focused modules
- All files now under 500-line limit per project guidelines

Documentation:
- Chaos testing guide, circuit breakers, observability docs
- Phase 7 UAT documentation updates
- Martin Kleppmann technical writer agent

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 01:24:14 -07:00

228 lines
7.4 KiB
Rust

//! Helper functions for the research module.
//!
//! Contains extraction, normalization, and scoring logic.
use regex::Regex;
use stemedb_core::types::ObjectValue;
use super::researcher::DocumentationSource;
/// Built-in list of documentation sources consulted when researching a topic.
///
/// Each entry carries a display name, a URL pattern (most contain a `{topic}`
/// placeholder; the reqwest entry is a fixed URL), the topic keywords the
/// source is associated with, and a tier number. OWASP is the only tier-1
/// entry; every other source is tier 2.
pub(super) fn default_documentation_sources() -> Vec<DocumentationSource> {
    // Local constructor so that each source below reads as one table row.
    let source = |name: &str, url_pattern: &str, topics: &[&str], tier| DocumentationSource {
        name: name.to_string(),
        url_pattern: url_pattern.to_string(),
        topics: topics.iter().map(|keyword| (*keyword).to_string()).collect(),
        tier,
    };

    vec![
        source(
            "Redis Official Docs",
            "https://redis.io/docs/management/{topic}/",
            &["redis", "cache", "memory"],
            2,
        ),
        source(
            "PostgreSQL Docs",
            "https://www.postgresql.org/docs/current/{topic}.html",
            &["postgres", "postgresql", "database", "connection", "pool"],
            2,
        ),
        source(
            "Go Documentation",
            "https://pkg.go.dev/net/http#{topic}",
            &["http", "timeout", "server"],
            2,
        ),
        source(
            "Rust reqwest Docs",
            "https://docs.rs/reqwest/latest/reqwest/",
            &["reqwest", "http", "client", "tls"],
            2,
        ),
        source(
            "OWASP",
            "https://cheatsheetseries.owasp.org/cheatsheets/{topic}_Cheat_Sheet.html",
            &["authentication", "session", "jwt", "password", "input"],
            1,
        ),
        source(
            "Kafka Documentation",
            "https://kafka.apache.org/documentation/#{topic}",
            &["kafka", "producer", "consumer", "retention"],
            2,
        ),
        source(
            "MongoDB Docs",
            "https://www.mongodb.com/docs/manual/reference/{topic}/",
            &["mongo", "mongodb", "connection", "replica"],
            2,
        ),
    ]
}
/// Classify a URL into a source scheme label.
///
/// Returns `"rfc"` when the URL contains `rfc-editor.org` or `ietf.org`,
/// `"owasp"` when it contains `owasp.org`, and `"vendor"` for anything else.
/// The check is a plain substring match on the whole URL, and the RFC hosts
/// take precedence over OWASP.
pub(super) fn determine_scheme_from_url(url: &str) -> &'static str {
    const RFC_HOSTS: [&str; 2] = ["rfc-editor.org", "ietf.org"];

    if RFC_HOSTS.iter().any(|host| url.contains(*host)) {
        return "rfc";
    }
    if url.contains("owasp.org") {
        return "owasp";
    }
    "vendor"
}
/// Turn a free-form topic into a string usable inside a subject path.
///
/// The topic is lowercased, every character that is neither alphanumeric nor
/// `/` is replaced with `_`, and leading/trailing underscores are stripped.
/// Slashes are kept so that hierarchical topics keep their structure.
pub(super) fn normalize_topic(topic: &str) -> String {
    let lowered = topic.to_lowercase();
    let mut sanitized = String::with_capacity(lowered.len());

    for ch in lowered.chars() {
        let keep = ch == '/' || ch.is_alphanumeric();
        sanitized.push(if keep { ch } else { '_' });
    }

    // Only the outer underscores are removed; interior runs are left untouched.
    sanitized.trim_matches('_').to_string()
}
/// Extract normative statements (RFC 2119-style keywords) from content.
///
/// The content is scanned line by line:
///
/// * HTML (`<h1>`..`<h6>`) and markdown (`#`..`######`) headings update the
///   "current section", which starts out as `"General"`.
/// * A line is only examined when it contains, case-insensitively, at least
///   one of the `/`-separated segments of `topic`. Empty segments (e.g. from
///   a trailing `/`) are ignored; if `topic` has no non-empty segment at all,
///   every line is treated as relevant.
/// * Within a relevant line, each match of a normative keyword up to the next
///   `.` on that same line yields one statement. Statements of 10 bytes or
///   fewer are dropped.
///
/// Returns `(section, statement, strength)` tuples in document order, where
/// `strength` is 3 for MUST / SHALL / REQUIRED (and their negations), 2 for
/// SHOULD / RECOMMENDED (and SHOULD NOT), and 1 as a fallback.
///
/// If a pattern fails to compile, the corresponding step is skipped rather
/// than panicking.
pub(super) fn extract_normative_statements(
    content: &str,
    topic: &str,
) -> Vec<(String, String, u8)> {
    let mut statements = Vec::new();

    // Pattern for normative keywords with context
    let keyword_pattern = Regex::new(
        r"(?i)(?P<context>[^.]*?)\b(MUST NOT|MUST|SHALL NOT|SHALL|SHOULD NOT|SHOULD|REQUIRED|RECOMMENDED)\b(?P<rest>[^.]*\.)"
    ).ok();
    // Pattern for section headings (HTML and markdown)
    let heading_pattern = Regex::new(r"(?i)<h[1-6][^>]*>([^<]+)</h[1-6]>|^#{1,6}\s+(.+)$").ok();

    // The topic does not change per line, so lowercase and split it once.
    // Empty segments are dropped: `str::contains("")` is always true, so a
    // stray `/` would otherwise make every line count as relevant.
    let topic_lower = topic.to_lowercase();
    let topic_parts: Vec<&str> = topic_lower.split('/').filter(|part| !part.is_empty()).collect();

    let mut current_section = "General".to_string();
    for line in content.lines() {
        // Update section context from headings (done before the relevance
        // check so that headings unrelated to the topic still set the section).
        if let Some(ref pattern) = heading_pattern {
            if let Some(caps) = pattern.captures(line) {
                current_section = caps
                    .get(1)
                    .or_else(|| caps.get(2))
                    .map(|m| m.as_str().trim().to_string())
                    .unwrap_or_else(|| "General".to_string());
            }
        }

        // Skip lines that mention none of the topic segments. With no usable
        // segment there is nothing to filter on, so every line is kept.
        if !topic_parts.is_empty() {
            let line_lower = line.to_lowercase();
            if !topic_parts.iter().any(|part| line_lower.contains(*part)) {
                continue;
            }
        }

        // Extract normative statements
        if let Some(ref pattern) = keyword_pattern {
            for caps in pattern.captures_iter(line) {
                // Group 2 is the keyword itself (group 1 is the named `context` group).
                let keyword = caps.get(2).map(|m| m.as_str().to_uppercase()).unwrap_or_default();
                let full_statement =
                    caps.get(0).map(|m| m.as_str().trim().to_string()).unwrap_or_default();

                // Determine keyword strength
                let strength = match keyword.as_str() {
                    "MUST" | "SHALL" | "REQUIRED" | "MUST NOT" | "SHALL NOT" => 3,
                    "SHOULD" | "RECOMMENDED" | "SHOULD NOT" => 2,
                    _ => 1,
                };

                // Very short matches carry too little context to be useful.
                if full_statement.len() > 10 {
                    statements.push((current_section.clone(), full_statement, strength));
                }
            }
        }
    }

    statements
}
/// Derive a boolean object value and a predicate name from a statement.
///
/// The statement is uppercased and searched for normative keywords, checked
/// in this order:
///
/// 1. `MUST NOT` / `SHALL NOT` / `SHOULD NOT` -> `false`, `"disabled"`
/// 2. `MUST` / `SHALL` / `REQUIRED`           -> `true`, `"required"`
/// 3. `SHOULD` / `RECOMMENDED`                -> `true`, `"recommended"`
///
/// When no keyword is found the result is `true` with `default_predicate`.
/// Matching is by substring, so negated forms must be tested first.
pub(super) fn determine_value_and_predicate(
    statement: &str,
    default_predicate: &str,
) -> (ObjectValue, String) {
    let normalized = statement.to_uppercase();
    let mentions_any =
        |keywords: &[&str]| keywords.iter().any(|keyword| normalized.contains(*keyword));

    let (flag, predicate) = if mentions_any(&["MUST NOT", "SHALL NOT", "SHOULD NOT"]) {
        (false, "disabled")
    } else if mentions_any(&["MUST", "SHALL", "REQUIRED"]) {
        (true, "required")
    } else if mentions_any(&["SHOULD", "RECOMMENDED"]) {
        (true, "recommended")
    } else {
        // No normative keyword: fall back to the caller's predicate.
        (true, default_predicate)
    };

    (ObjectValue::Boolean(flag), predicate.to_string())
}
/// Compute a confidence score, capped at 1.0, for an extracted statement.
///
/// Starting from a base of 0.5, the score gains:
///
/// * `0.1 * keyword_strength`
/// * `0.1` when the statement is longer than 50 bytes, plus `0.05` more when
///   it is longer than 100 bytes
/// * `0.05` when the content is longer than 5000, plus `0.05` more when it is
///   longer than 20000
pub(super) fn calculate_confidence(
    keyword_strength: u8,
    statement: &str,
    content_length: usize,
) -> f32 {
    const BASE_CONFIDENCE: f32 = 0.5;

    let statement_len = statement.len();
    // Bonuses are listed (and summed) in a fixed order so the floating-point
    // result does not depend on which combination applies.
    let bonuses: [(bool, f32); 4] = [
        (statement_len > 50, 0.1),
        (statement_len > 100, 0.05),
        (content_length > 5000, 0.05),
        (content_length > 20000, 0.05),
    ];

    let seed = BASE_CONFIDENCE + f32::from(keyword_strength) * 0.1;
    bonuses
        .iter()
        .filter(|(applies, _)| *applies)
        .fold(seed, |score, (_, bonus)| score + bonus)
        .min(1.0)
}