## Phase 8: Enterprise Extractor Improvements ✅

- 14 security extractors (TLS, JWT, SQL injection, XSS, etc.)
- 10 framework-specific extractors (Spring, Django, Rails, etc.)
- Config file security detection (YAML, TOML)

## Phase 9: Autonomous Extractor Generation ✅

- Shadow mode executor with TP/FP tracking
- Graduation pipeline with confidence thresholds
- Auto-rollback on regression detection
- Cross-project pattern syncing

## UAT Suite Complete (14 scripts, 90 tests)

- test-core-detection.sh (6 tests)
- test-declarative-extractors.sh (5 tests)
- test-domain-frameworks.sh (5 tests)
- test-domain-unreal.sh (3 tests)
- test-llm-extraction.sh (6 tests)
- test-eval-harness.sh (5 tests)
- test-cross-language.sh (3 tests)
- test-precommit-performance.sh (4 tests)
- test-output-formats.sh (8 tests)
- test-drift-detection.sh (6 tests)
- test-exit-codes.sh (12 tests)
- + 3 more scripts

## Other Changes

- Updated roadmap to mark Phase 8-9 complete
- Added .gitignore entries for build artifacts
- Updated pre-commit: 800 line limit, exclude tests/data/cmd

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
160 lines
5.6 KiB
Rust
160 lines
5.6 KiB
Rust
//! Dynamic prompt builder with ontology vocabulary injection.
|
|
//!
|
|
//! Builds system prompts that constrain LLM output to use authority-compatible
|
|
//! concept paths, ensuring conflict detection works correctly.
|
|
|
|
use crate::llm::ontology::OntologyVocabulary;
|
|
|
|
/// System prompt template with vocabulary placeholder.
///
/// The literal token `{vocabulary_section}` marks the spot where the prompt
/// builder functions in this file substitute the vocabulary text (via
/// `str::replace`). All other text in the template is passed through to the
/// resulting prompt unchanged, so edits here change what the LLM is told.
|
|
const SYSTEM_PROMPT_TEMPLATE: &str = r#"You are a security code analyzer. Extract security-relevant claims from the provided code.
|
|
|
|
CRITICAL INSTRUCTION: You MUST use ONLY the concept paths listed in the VALID CONCEPT VOCABULARY table below.
|
|
Do NOT invent new paths. If the code doesn't match any known concept, return an empty claims array.
|
|
|
|
## VALID CONCEPT VOCABULARY
|
|
|
|
{vocabulary_section}
|
|
|
|
## CLAIM EXTRACTION RULES
|
|
|
|
1. **Subject Path**: MUST be EXACTLY one of the leaf paths from the table above
|
|
2. **Predicate**: MUST EXACTLY match the predicate for that concept from the table
|
|
3. **Value Type**: Use the value type specified in the table (boolean, text, number)
|
|
4. **Confidence**: Only report claims with confidence >= 0.7
|
|
|
|
## EXAMPLES
|
|
|
|
### Example 1: Python with verify=False
|
|
Code: `requests.get(url, verify=False)`
|
|
If vocabulary contains `tls/cert_verification | enabled | boolean`:
|
|
```json
|
|
{"subject": "tls/cert_verification", "predicate": "enabled", "value": false, "value_type": "boolean"}
|
|
```
|
|
|
|
### Example 2: Hardcoded API key
|
|
Code: `API_KEY = "sk-live-abc123"`
|
|
If vocabulary contains `secrets/api_key | hardcoded | boolean`:
|
|
```json
|
|
{"subject": "secrets/api_key", "predicate": "hardcoded", "value": true, "value_type": "boolean"}
|
|
```
|
|
|
|
### Example 3: JWT with algorithm none
|
|
Code: `algorithms: ['HS256', 'none']`
|
|
If vocabulary contains `jwt/algorithms | allows_none | boolean`:
|
|
```json
|
|
{"subject": "jwt/algorithms", "predicate": "allows_none", "value": true, "value_type": "boolean"}
|
|
```
|
|
|
|
## OUTPUT FORMAT
|
|
|
|
For each security claim found, provide:
|
|
- subject: A leaf path from the vocabulary table (MUST match exactly)
|
|
- predicate: The predicate for that concept (MUST match exactly)
|
|
- value: The actual value found in the code
|
|
- value_type: One of "text", "number", "boolean" (must match the concept's expected type)
|
|
- line: Line number where found (1-indexed)
|
|
- matched_text: The exact code snippet containing this claim (single line)
|
|
- confidence: How confident you are (0.0-1.0)
|
|
- description: Brief explanation of the security implications
|
|
|
|
Respond with JSON only, no markdown code blocks:
|
|
{
|
|
"claims": [
|
|
{
|
|
"subject": "tls/cert_verification",
|
|
"predicate": "enabled",
|
|
"value": false,
|
|
"value_type": "boolean",
|
|
"line": 42,
|
|
"matched_text": "verify=False",
|
|
"confidence": 0.95,
|
|
"description": "TLS certificate verification disabled, vulnerable to MITM attacks"
|
|
}
|
|
]
|
|
}
|
|
|
|
If no security claims matching the vocabulary are found, return: {"claims": []}"#;
|
|
|
|
/// Build a system prompt with ontology vocabulary injected.
|
|
pub fn build_system_prompt(vocabulary: &OntologyVocabulary) -> String {
|
|
let vocabulary_section = vocabulary.to_prompt_section();
|
|
SYSTEM_PROMPT_TEMPLATE.replace("{vocabulary_section}", &vocabulary_section)
|
|
}
|
|
|
|
/// Build a system prompt from raw vocabulary section string.
|
|
///
|
|
/// Useful when vocabulary is pre-computed or comes from a different source.
|
|
pub fn build_system_prompt_from_section(vocabulary_section: &str) -> String {
|
|
SYSTEM_PROMPT_TEMPLATE.replace("{vocabulary_section}", vocabulary_section)
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use stemedb_core::types::{Assertion, HlcTimestamp, LifecycleStage, ObjectValue, SourceClass};

    /// Assemble an `Assertion` fixture for the given subject/predicate/value
    /// triple; every remaining field is filled with a fixed placeholder.
    fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion {
        // Metadata is a small JSON object serialized to bytes; a serialization
        // failure is mapped to `None` rather than aborting the test.
        let metadata_bytes = serde_json::to_vec(&serde_json::json!({
            "description": "Test description",
            "source": "test",
        }))
        .ok();

        Assertion {
            subject: subject.to_owned(),
            predicate: predicate.to_owned(),
            object: value,
            parent_hash: None,
            source_hash: [0u8; 32],
            source_class: SourceClass::Clinical,
            visual_hash: None,
            epoch: None,
            source_metadata: metadata_bytes,
            lifecycle: LifecycleStage::Approved,
            signatures: Vec::new(),
            confidence: 1.0,
            timestamp: 0,
            hlc_timestamp: HlcTimestamp::default(),
            vector: None,
        }
    }

    /// The prompt built from a vocabulary must carry both the vocabulary
    /// paths and the fixed instruction text of the template.
    #[test]
    fn test_build_system_prompt() {
        let fixtures: Vec<Assertion> = [
            "rfc://5246/tls/cert_verification",
            "owasp://rate_limit/enabled",
        ]
        .iter()
        .map(|subject| make_test_assertion(subject, "enabled", ObjectValue::Boolean(true)))
        .collect();

        let vocab = OntologyVocabulary::from_assertions(&fixtures);
        let prompt = build_system_prompt(&vocab);

        let expected_fragments = [
            // Vocabulary is included
            "tls/cert_verification",
            "rate_limit/enabled",
            // Critical instruction is present
            "CRITICAL INSTRUCTION",
            "MUST use ONLY the concept paths",
            // Output format instructions
            "Respond with JSON only",
        ];
        for fragment in expected_fragments.iter() {
            assert!(prompt.contains(fragment), "prompt is missing {:?}", fragment);
        }
    }

    /// A raw, pre-rendered section must be embedded alongside the template text.
    #[test]
    fn test_build_system_prompt_from_section() {
        let prompt =
            build_system_prompt_from_section("| test/path | enabled | boolean | true | Test |");

        for fragment in ["test/path", "CRITICAL INSTRUCTION"].iter() {
            assert!(prompt.contains(fragment), "prompt is missing {:?}", fragment);
        }
    }
}
|