// stemedb/applications/aphoria/src/llm/prompt.rs

//! Dynamic prompt builder with ontology vocabulary injection.
//!
//! Builds system prompts that constrain LLM output to use authority-compatible
//! concept paths, ensuring conflict detection works correctly.

use crate::llm::ontology::OntologyVocabulary;

/// System prompt template with vocabulary placeholder.
///
/// The text contains exactly one `{vocabulary_section}` placeholder, located
/// under the "VALID CONCEPT VOCABULARY" heading. [`build_system_prompt`] and
/// [`build_system_prompt_from_section`] substitute the rendered vocabulary for
/// it; everything else (extraction rules, worked examples, output format) is
/// static and appears verbatim in every generated prompt.
///
/// Note that the worked examples mention concrete paths such as
/// `tls/cert_verification`, `secrets/api_key` and `jwt/algorithms`, so those
/// strings are present in the prompt regardless of which vocabulary is injected.
///
/// Declared as a raw string (`r#"..."#`) so the embedded JSON can use plain
/// double quotes without escaping.
const SYSTEM_PROMPT_TEMPLATE: &str = r#"You are a security code analyzer. Extract security-relevant claims from the provided code.

CRITICAL INSTRUCTION: You MUST use ONLY the concept paths listed in the VALID CONCEPT VOCABULARY table below.
Do NOT invent new paths. If the code doesn't match any known concept, return an empty claims array.

## VALID CONCEPT VOCABULARY

{vocabulary_section}

## CLAIM EXTRACTION RULES

1. **Subject Path**: MUST be EXACTLY one of the leaf paths from the table above
2. **Predicate**: MUST EXACTLY match the predicate for that concept from the table
3. **Value Type**: Use the value type specified in the table (boolean, text, number)
4. **Confidence**: Only report claims with confidence >= 0.7

## EXAMPLES

### Example 1: Python with verify=False
Code: `requests.get(url, verify=False)`
If vocabulary contains `tls/cert_verification | enabled | boolean`:
```json
{"subject": "tls/cert_verification", "predicate": "enabled", "value": false, "value_type": "boolean"}
```

### Example 2: Hardcoded API key
Code: `API_KEY = "sk-live-abc123"`
If vocabulary contains `secrets/api_key | hardcoded | boolean`:
```json
{"subject": "secrets/api_key", "predicate": "hardcoded", "value": true, "value_type": "boolean"}
```

### Example 3: JWT with algorithm none
Code: `algorithms: ['HS256', 'none']`
If vocabulary contains `jwt/algorithms | allows_none | boolean`:
```json
{"subject": "jwt/algorithms", "predicate": "allows_none", "value": true, "value_type": "boolean"}
```

## OUTPUT FORMAT

For each security claim found, provide:
- subject: A leaf path from the vocabulary table (MUST match exactly)
- predicate: The predicate for that concept (MUST match exactly)
- value: The actual value found in the code
- value_type: One of "text", "number", "boolean" (must match the concept's expected type)
- line: Line number where found (1-indexed)
- matched_text: The exact code snippet containing this claim (single line)
- confidence: How confident you are (0.0-1.0)
- description: Brief explanation of the security implications

Respond with JSON only, no markdown code blocks:
{
  "claims": [
    {
      "subject": "tls/cert_verification",
      "predicate": "enabled",
      "value": false,
      "value_type": "boolean",
      "line": 42,
      "matched_text": "verify=False",
      "confidence": 0.95,
      "description": "TLS certificate verification disabled, vulnerable to MITM attacks"
    }
  ]
}

If no security claims matching the vocabulary are found, return: {"claims": []}"#;

/// Render the complete system prompt for an ontology vocabulary.
///
/// The vocabulary is turned into its prompt representation with
/// `to_prompt_section()`, and that text replaces the `{vocabulary_section}`
/// placeholder inside `SYSTEM_PROMPT_TEMPLATE`.
///
/// Returns the finished prompt as an owned `String`.
pub fn build_system_prompt(vocabulary: &OntologyVocabulary) -> String {
    // Token inside the template that marks where the vocabulary goes.
    let placeholder = "{vocabulary_section}";
    let rendered_vocabulary = vocabulary.to_prompt_section();
    SYSTEM_PROMPT_TEMPLATE.replace(placeholder, &rendered_vocabulary)
}

/// Render the complete system prompt from an already-rendered vocabulary section.
///
/// `vocabulary_section` is inserted verbatim in place of the
/// `{vocabulary_section}` placeholder inside `SYSTEM_PROMPT_TEMPLATE`; no
/// validation or escaping is applied to it.
///
/// Handy when the vocabulary text was computed ahead of time or does not
/// originate from an `OntologyVocabulary`.
pub fn build_system_prompt_from_section(vocabulary_section: &str) -> String {
    // Token inside the template that marks where the vocabulary goes.
    let placeholder = "{vocabulary_section}";
    str::replace(SYSTEM_PROMPT_TEMPLATE, placeholder, vocabulary_section)
}

#[cfg(test)]
mod tests {
    use super::*;
    use stemedb_core::types::{Assertion, HlcTimestamp, LifecycleStage, ObjectValue, SourceClass};

    /// Placeholder token used by the template; it must never survive into a
    /// rendered prompt.
    const PLACEHOLDER: &str = "{vocabulary_section}";

    /// Build a minimal `Assertion` fixture with the given subject, predicate
    /// and object value. All remaining fields are filled with fixed defaults.
    fn make_test_assertion(subject: &str, predicate: &str, value: ObjectValue) -> Assertion {
        let source_metadata = serde_json::json!({
            "description": "Test description",
            "source": "test",
        });

        Assertion {
            subject: subject.to_string(),
            predicate: predicate.to_string(),
            object: value,
            parent_hash: None,
            source_hash: [0u8; 32],
            source_class: SourceClass::Clinical,
            visual_hash: None,
            epoch: None,
            source_metadata: serde_json::to_vec(&source_metadata).ok(),
            lifecycle: LifecycleStage::Approved,
            signatures: vec![],
            confidence: 1.0,
            timestamp: 0,
            hlc_timestamp: HlcTimestamp::default(),
            vector: None,
        }
    }

    #[test]
    fn test_build_system_prompt() {
        let assertions = vec![
            make_test_assertion(
                "rfc://5246/tls/cert_verification",
                "enabled",
                ObjectValue::Boolean(true),
            ),
            make_test_assertion(
                "owasp://rate_limit/enabled",
                "enabled",
                ObjectValue::Boolean(true),
            ),
        ];

        let vocab = OntologyVocabulary::from_assertions(&assertions);
        let prompt = build_system_prompt(&vocab);

        // The placeholder must have been substituted, not left in the output.
        assert!(!prompt.contains(PLACEHOLDER));

        // Check vocabulary is included.
        // NOTE: "tls/cert_verification" also appears in the template's static
        // examples, so that assertion alone cannot prove injection happened.
        // "rate_limit/enabled" is absent from the template and therefore can
        // only come from the injected vocabulary.
        assert!(prompt.contains("tls/cert_verification"));
        assert!(prompt.contains("rate_limit/enabled"));

        // Check critical instruction is present
        assert!(prompt.contains("CRITICAL INSTRUCTION"));
        assert!(prompt.contains("MUST use ONLY the concept paths"));

        // Check output format instructions
        assert!(prompt.contains("Respond with JSON only"));
    }

    #[test]
    fn test_build_system_prompt_from_section() {
        let section = "| test/path | enabled | boolean | true | Test |";
        let prompt = build_system_prompt_from_section(section);

        // The section is inserted verbatim and replaces the placeholder.
        assert!(prompt.contains(section));
        assert!(!prompt.contains(PLACEHOLDER));

        assert!(prompt.contains("test/path"));
        assert!(prompt.contains("CRITICAL INSTRUCTION"));
    }
}