stemedb/applications/aphoria/src/research/helpers.rs

//! Helper functions for the research module.
//!
//! Contains extraction, normalization, and scoring logic.

use regex::Regex;
use stemedb_core::types::ObjectValue;

use super::researcher::DocumentationSource;

/// Build the built-in list of documentation sources consulted by default.
///
/// Every entry carries a display name, a URL pattern (most of them contain a
/// `{topic}` placeholder), the topic keywords the source is associated with,
/// and a numeric tier. OWASP is listed with tier 1; all other entries use
/// tier 2.
pub(super) fn default_documentation_sources() -> Vec<DocumentationSource> {
    // Turns a slice of keyword literals into the owned list stored per source.
    let keywords = |names: &[&str]| -> Vec<String> {
        names.iter().map(|name| String::from(*name)).collect()
    };

    vec![
        DocumentationSource {
            name: String::from("Redis Official Docs"),
            url_pattern: String::from("https://redis.io/docs/management/{topic}/"),
            topics: keywords(&["redis", "cache", "memory"]),
            tier: 2,
        },
        DocumentationSource {
            name: String::from("PostgreSQL Docs"),
            url_pattern: String::from("https://www.postgresql.org/docs/current/{topic}.html"),
            topics: keywords(&["postgres", "postgresql", "database", "connection", "pool"]),
            tier: 2,
        },
        DocumentationSource {
            name: String::from("Go Documentation"),
            url_pattern: String::from("https://pkg.go.dev/net/http#{topic}"),
            topics: keywords(&["http", "timeout", "server"]),
            tier: 2,
        },
        DocumentationSource {
            name: String::from("Rust reqwest Docs"),
            url_pattern: String::from("https://docs.rs/reqwest/latest/reqwest/"),
            topics: keywords(&["reqwest", "http", "client", "tls"]),
            tier: 2,
        },
        DocumentationSource {
            name: String::from("OWASP"),
            url_pattern: String::from(
                "https://cheatsheetseries.owasp.org/cheatsheets/{topic}_Cheat_Sheet.html",
            ),
            topics: keywords(&["authentication", "session", "jwt", "password", "input"]),
            tier: 1,
        },
        DocumentationSource {
            name: String::from("Kafka Documentation"),
            url_pattern: String::from("https://kafka.apache.org/documentation/#{topic}"),
            topics: keywords(&["kafka", "producer", "consumer", "retention"]),
            tier: 2,
        },
        DocumentationSource {
            name: String::from("MongoDB Docs"),
            url_pattern: String::from("https://www.mongodb.com/docs/manual/reference/{topic}/"),
            topics: keywords(&["mongo", "mongodb", "connection", "replica"]),
            tier: 2,
        },
    ]
}

/// Classify a URL into a source scheme label.
///
/// Returns `"rfc"` when the URL mentions `rfc-editor.org` or `ietf.org`,
/// `"owasp"` when it mentions `owasp.org`, and `"vendor"` for everything
/// else. The checks are plain substring matches on the whole URL, and the
/// RFC hosts take precedence over OWASP.
pub(super) fn determine_scheme_from_url(url: &str) -> &'static str {
    const RFC_HOSTS: [&str; 2] = ["rfc-editor.org", "ietf.org"];

    if RFC_HOSTS.iter().any(|host| url.contains(*host)) {
        return "rfc";
    }
    if url.contains("owasp.org") {
        return "owasp";
    }
    "vendor"
}

/// Turn a free-form topic into a form usable inside a subject path.
///
/// The topic is lowercased, every character that is neither alphanumeric
/// nor `/` is replaced by `_`, and leading/trailing underscores are then
/// stripped. Slashes are kept as-is, including at the ends.
pub(super) fn normalize_topic(topic: &str) -> String {
    let lowered = topic.to_lowercase();
    let mut slug = String::with_capacity(lowered.len());

    for ch in lowered.chars() {
        let keep = ch == '/' || ch.is_alphanumeric();
        slug.push(if keep { ch } else { '_' });
    }

    slug.trim_matches('_').to_string()
}

/// Extract normative statements from content.
///
/// Scans `content` line by line and collects sentences that contain one of
/// the normative keywords `MUST`, `MUST NOT`, `SHALL`, `SHALL NOT`, `SHOULD`,
/// `SHOULD NOT`, `REQUIRED` or `RECOMMENDED` (matched case-insensitively).
/// A match has to end with a `.` on the same line.
///
/// `topic` is split on `/`; a line is only considered when it contains at
/// least one of the resulting parts (case-insensitive substring match).
/// Empty parts (for example from a trailing or doubled `/`) are ignored so
/// that they cannot make every line look relevant. A topic without any
/// usable part applies no relevance filter at all.
///
/// Returns one `(section, statement, strength)` tuple per match:
/// * `section` is the text of the most recent HTML (`<h1>`..`<h6>`) or
///   markdown (`#`..`######`) heading seen so far, or `"General"` before the
///   first heading;
/// * `statement` is the trimmed matched text; matches of 10 bytes or fewer
///   are dropped;
/// * `strength` is 3 for MUST/SHALL/REQUIRED (and their negations), 2 for
///   SHOULD/RECOMMENDED (and SHOULD NOT), and 1 for anything else.
pub(super) fn extract_normative_statements(
    content: &str,
    topic: &str,
) -> Vec<(String, String, u8)> {
    // Pattern for normative keywords with context
    let keyword_pattern = Regex::new(
        r"(?i)(?P<context>[^.]*?)\b(MUST NOT|MUST|SHALL NOT|SHALL|SHOULD NOT|SHOULD|REQUIRED|RECOMMENDED)\b(?P<rest>[^.]*\.)"
    ).ok();

    // Pattern for section headings (HTML and markdown)
    let heading_pattern = Regex::new(r"(?i)<h[1-6][^>]*>([^<]+)</h[1-6]>|^#{1,6}\s+(.+)$").ok();

    // The topic does not change between lines, so prepare its parts once.
    // Empty parts are skipped: `str::contains("")` is always true and would
    // otherwise mark every line as relevant.
    let topic_lower = topic.to_lowercase();
    let topic_parts: Vec<&str> = topic_lower.split('/').filter(|part| !part.is_empty()).collect();

    let mut statements = Vec::new();

    // Section context, updated whenever a heading line is encountered.
    let mut current_section = "General".to_string();

    for line in content.lines() {
        // Update section context from headings
        if let Some(caps) = heading_pattern.as_ref().and_then(|pattern| pattern.captures(line)) {
            current_section = caps
                .get(1)
                .or_else(|| caps.get(2))
                .map(|m| m.as_str().trim().to_string())
                .unwrap_or_else(|| "General".to_string());
        }

        // Skip lines unrelated to the topic; with no usable topic part there
        // is nothing to filter on, so every line is kept.
        if !topic_parts.is_empty() {
            let line_lower = line.to_lowercase();
            if !topic_parts.iter().any(|part| line_lower.contains(*part)) {
                continue;
            }
        }

        // Extract normative statements
        if let Some(ref pattern) = keyword_pattern {
            for caps in pattern.captures_iter(line) {
                // Group 2 is the keyword itself (group 1 is the named `context`).
                let keyword = caps.get(2).map(|m| m.as_str().to_uppercase()).unwrap_or_default();
                let full_statement =
                    caps.get(0).map(|m| m.as_str().trim().to_string()).unwrap_or_default();

                // Determine keyword strength
                let strength: u8 = match keyword.as_str() {
                    "MUST" | "SHALL" | "REQUIRED" | "MUST NOT" | "SHALL NOT" => 3,
                    "SHOULD" | "RECOMMENDED" | "SHOULD NOT" => 2,
                    _ => 1,
                };

                // Very short matches carry too little context to be useful.
                if full_statement.len() > 10 {
                    statements.push((current_section.clone(), full_statement, strength));
                }
            }
        }
    }

    statements
}

/// Derive a boolean value and a predicate name from a normative statement.
///
/// The statement is upper-cased and searched for keywords, in this order:
/// * `MUST NOT` / `SHALL NOT` / `SHOULD NOT` yield `false` with `"disabled"`;
/// * `MUST` / `SHALL` / `REQUIRED` yield `true` with `"required"`;
/// * `SHOULD` / `RECOMMENDED` yield `true` with `"recommended"`;
/// * otherwise the result is `true` with `default_predicate`.
///
/// Matching is by plain substring, so the first group that matches wins.
pub(super) fn determine_value_and_predicate(
    statement: &str,
    default_predicate: &str,
) -> (ObjectValue, String) {
    let shouted = statement.to_uppercase();
    let mentions_any =
        |needles: &[&str]| needles.iter().any(|needle| shouted.contains(*needle));

    // Negated forms are tested first because they contain the positive keywords.
    let (flag, predicate) = if mentions_any(&["MUST NOT", "SHALL NOT", "SHOULD NOT"]) {
        (false, "disabled")
    } else if mentions_any(&["MUST", "SHALL", "REQUIRED"]) {
        (true, "required")
    } else if mentions_any(&["SHOULD", "RECOMMENDED"]) {
        (true, "recommended")
    } else {
        (true, default_predicate)
    };

    (ObjectValue::Boolean(flag), predicate.to_string())
}

/// Compute a confidence score for an extracted statement.
///
/// Starts from a base of 0.5, adds 0.1 per unit of `keyword_strength`, and
/// then applies fixed bonuses:
/// * +0.1 when the statement is longer than 50 bytes, and a further +0.05
///   when it is longer than 100 bytes;
/// * +0.05 when `content_length` exceeds 5000, and a further +0.05 when it
///   exceeds 20000.
///
/// The result is capped at 1.0.
pub(super) fn calculate_confidence(
    keyword_strength: u8,
    statement: &str,
    content_length: usize,
) -> f32 {
    let statement_len = statement.len();

    // (condition, bonus) pairs, applied in this order.
    let bonuses: [(bool, f32); 4] = [
        (statement_len > 50, 0.1),
        (statement_len > 100, 0.05),
        (content_length > 5000, 0.05),
        (content_length > 20000, 0.05),
    ];

    let base: f32 = 0.5 + f32::from(keyword_strength) * 0.1;
    let score = bonuses
        .iter()
        .filter(|(applies, _)| *applies)
        .fold(base, |total, (_, bonus)| total + bonus);

    score.min(1.0)
}