Enterprise Features: - Hosted mode with remote sync for team pattern aggregation - Community sharing with privacy-preserving anonymization - LLM-based semantic claim extraction with Gemini integration - Pattern learning with promotion to declarative extractors - High-entropy secrets extractor with configurable thresholds - Auth bypass and insecure cookies extractors Module Refactoring: - Split oversized files to comply with 500-line limit - Config split: types/core.rs, types/extractors.rs, types/hosted.rs, etc. - Handlers split: scan.rs, policy.rs, report.rs modules - Extractors split: declarative/, high_entropy_secrets/, insecure_cookies/ - Learning split: store modules with metrics and persistence SDK & Ontology: - stemedb-ontology SDK with fluent builders and StemeDB client - Pharma domain extractors for FDA Orange Book data - Consumer health UAT test infrastructure Code Quality: - Fixed clippy warnings (needless_borrows_for_generic_args) - Added KVStore trait imports where needed - Fixed utoipa path re-exports for OpenAPI docs Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
137 lines
4.8 KiB
Rust
137 lines
4.8 KiB
Rust
//! Dynamic prompt builder with ontology vocabulary injection.
|
|
//!
|
|
//! Builds system prompts that constrain LLM output to use authority-compatible
|
|
//! concept paths, ensuring conflict detection works correctly.
|
|
|
|
use crate::llm::ontology::OntologyVocabulary;
|
|
|
|
/// System prompt template with vocabulary placeholder.
///
/// The literal token `{vocabulary_section}` appears exactly once in this text
/// and is the only placeholder; the builder functions in this file fill it in
/// with `str::replace`.
///
/// Written as a raw string (`r#"..."#`) so the embedded JSON example can use
/// double quotes without escaping.
///
/// Note for test authors: the static text already mentions the example paths
/// "rate_limit/enabled" and "tls/cert_verification", so finding those strings
/// in a built prompt does not by itself show that a vocabulary was injected.
|
|
const SYSTEM_PROMPT_TEMPLATE: &str = r#"You are a security code analyzer. Extract security-relevant claims from the provided code.
|
|
|
|
CRITICAL INSTRUCTION: You MUST use ONLY the concept paths listed in the VALID CONCEPT VOCABULARY table below.
|
|
Do NOT invent new paths. If the code doesn't match any known concept, return an empty claims array.
|
|
|
|
## VALID CONCEPT VOCABULARY
|
|
|
|
{vocabulary_section}
|
|
|
|
## CLAIM EXTRACTION RULES
|
|
|
|
1. **Subject Path**: MUST be one of the leaf paths from the table above (e.g., "rate_limit/enabled", "tls/cert_verification")
|
|
2. **Predicate**: MUST match the predicate for that concept from the table
|
|
3. **Value Type**: Use the value type specified in the table (boolean, text, number)
|
|
4. **Confidence**: Only report claims with confidence >= 0.7
|
|
|
|
## OUTPUT FORMAT
|
|
|
|
For each security claim found, provide:
|
|
- subject: A leaf path from the vocabulary table
|
|
- predicate: The predicate for that concept
|
|
- value: The actual value found in the code
|
|
- value_type: One of "text", "number", "boolean" (must match the concept's expected type)
|
|
- line: Line number where found (1-indexed)
|
|
- matched_text: The exact code snippet containing this claim (single line)
|
|
- confidence: How confident you are (0.0-1.0)
|
|
- description: Brief explanation of the security implications
|
|
|
|
Respond with JSON only, no markdown code blocks:
|
|
{
|
|
"claims": [
|
|
{
|
|
"subject": "tls/cert_verification",
|
|
"predicate": "enabled",
|
|
"value": false,
|
|
"value_type": "boolean",
|
|
"line": 42,
|
|
"matched_text": "verify=False",
|
|
"confidence": 0.95,
|
|
"description": "TLS certificate verification disabled, vulnerable to MITM attacks"
|
|
}
|
|
]
|
|
}
|
|
|
|
If no security claims matching the vocabulary are found, return: {"claims": []}"#;
|
|
|
|
/// Build a system prompt with ontology vocabulary injected.
|
|
pub fn build_system_prompt(vocabulary: &OntologyVocabulary) -> String {
|
|
let vocabulary_section = vocabulary.to_prompt_section();
|
|
SYSTEM_PROMPT_TEMPLATE.replace("{vocabulary_section}", &vocabulary_section)
|
|
}
|
|
|
|
/// Build a system prompt from raw vocabulary section string.
|
|
///
|
|
/// Useful when vocabulary is pre-computed or comes from a different source.
|
|
pub fn build_system_prompt_from_section(vocabulary_section: &str) -> String {
|
|
SYSTEM_PROMPT_TEMPLATE.replace("{vocabulary_section}", vocabulary_section)
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use stemedb_core::types::{Assertion, HlcTimestamp, LifecycleStage, ObjectValue, SourceClass};

    /// Template token that must not survive prompt construction.
    const PLACEHOLDER: &str = "{vocabulary_section}";

    /// Build a minimal assertion for the given subject/predicate/value triple.
    ///
    /// All remaining fields are filled with fixed values; only the triple
    /// varies between test cases.
    fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion {
        let source_metadata = serde_json::json!({
            "description": "Test description",
            "source": "test",
        });

        Assertion {
            subject: subject.to_string(),
            predicate: predicate.to_string(),
            object: value,
            parent_hash: None,
            source_hash: [0u8; 32],
            source_class: SourceClass::Clinical,
            visual_hash: None,
            epoch: None,
            source_metadata: serde_json::to_vec(&source_metadata).ok(),
            lifecycle: LifecycleStage::Approved,
            signatures: vec![],
            confidence: 1.0,
            timestamp: 0,
            hlc_timestamp: HlcTimestamp::default(),
            vector: None,
        }
    }

    #[test]
    fn test_build_system_prompt() {
        let assertions = vec![
            make_test_assertion(
                "rfc://5246/tls/cert_verification",
                "enabled",
                ObjectValue::Boolean(true),
            ),
            make_test_assertion(
                "owasp://rate_limit/enabled",
                "enabled",
                ObjectValue::Boolean(true),
            ),
        ];

        let vocab = OntologyVocabulary::from_assertions(&assertions);
        let prompt = build_system_prompt(&vocab);

        // Check vocabulary is included.
        // NOTE: the static template mentions both of these paths in its own
        // examples, so these two checks cannot fail on their own; the
        // placeholder check below is what actually proves substitution ran.
        assert!(prompt.contains("tls/cert_verification"));
        assert!(prompt.contains("rate_limit/enabled"));

        // Check the placeholder was really replaced by the vocabulary section
        assert!(
            !prompt.contains(PLACEHOLDER),
            "placeholder must be substituted, not left in the prompt"
        );

        // Check critical instruction is present
        assert!(prompt.contains("CRITICAL INSTRUCTION"));
        assert!(prompt.contains("MUST use ONLY the concept paths"));

        // Check output format instructions
        assert!(prompt.contains("Respond with JSON only"));
    }

    #[test]
    fn test_build_system_prompt_from_section() {
        let section = "| test/path | enabled | boolean | true | Test |";
        let prompt = build_system_prompt_from_section(section);

        assert!(prompt.contains("test/path"));
        // The whole section is inserted verbatim, not just a fragment of it.
        assert!(prompt.contains(section));
        assert!(
            !prompt.contains(PLACEHOLDER),
            "placeholder must be substituted, not left in the prompt"
        );
        assert!(prompt.contains("CRITICAL INSTRUCTION"));
    }
}
|