stemedb/crates/stemedb-query/src/query/mod.rs

//! Query types and builder for filtering assertions.
//!
//! The `Query` struct represents a read request against the knowledge graph.
//! Queries can filter by any combination of subject, predicate, lifecycle,
//! epoch, visual similarity (`visual_near`), and point in time (`as_of`).

use stemedb_core::types::{Assertion, EpochId, LifecycleStage, PHash};

// Re-export hamming_distance from stemedb_storage for public API consumers
pub use stemedb_storage::hamming_distance;

mod builder;
mod result;

pub use builder::QueryBuilder;
pub use result::QueryResult;

#[cfg(test)]
mod tests;

/// Decode a 16-character hex string into an 8-byte pHash.
///
/// Yields `None` when the input is not exactly 16 bytes long or when
/// `hex::decode` rejects it (for example, because of a non-hex digit).
/// Upper- and lower-case digits are interchangeable: "A3F2..." and
/// "a3f2..." decode to the same hash.
pub(crate) fn parse_hex_phash(hex_str: &str) -> Option<PHash> {
    // Two hex digits encode one byte, so an 8-byte hash is 16 characters.
    const PHASH_BYTES: usize = 8;
    const PHASH_HEX_LEN: usize = PHASH_BYTES * 2;

    if hex_str.len() != PHASH_HEX_LEN {
        return None;
    }

    match hex::decode(hex_str) {
        // The length guard keeps `copy_from_slice` from ever panicking.
        Ok(decoded) if decoded.len() == PHASH_BYTES => {
            let mut phash = [0u8; PHASH_BYTES];
            phash.copy_from_slice(&decoded);
            Some(phash)
        }
        _ => None,
    }
}

/// A query against the knowledge graph.
///
/// The struct derives `Default`, so every `Option` field starts as `None` and
/// both `bool` flags start as `false`. A filter left as `None` matches any value.
/// Multiple filters combine with AND semantics.
///
/// [`Query::matches`] evaluates `subject`, `predicate`, `lifecycle`, `epoch`,
/// `visual_near` / `visual_threshold`, and `as_of` against a single assertion.
/// The remaining fields are not consulted by `matches`; as described on each
/// field, they steer how the query is executed rather than which individual
/// assertions pass the filter.
#[derive(Debug, Clone, Default)]
pub struct Query {
    /// Filter by subject entity.
    ///
    /// Compared for exact equality by [`Query::matches`], except when
    /// `resolve_aliases` is `true`, in which case `matches` skips the
    /// subject comparison entirely.
    pub subject: Option<String>,

    /// Filter by predicate/relation.
    ///
    /// Compared for exact equality by [`Query::matches`].
    pub predicate: Option<String>,

    /// Filter by lifecycle stage.
    pub lifecycle: Option<LifecycleStage>,

    /// Filter by epoch (paradigm context).
    ///
    /// When set, an assertion that carries no epoch never matches.
    pub epoch: Option<EpochId>,

    /// Maximum number of results to return.
    pub limit: Option<usize>,

    /// Maximum acceptable staleness of materialized views in seconds.
    ///
    /// When set, the fast path (MV lookup) will be skipped if the materialized
    /// view is older than this threshold. This causes the query to fall through
    /// to the slow path, which re-computes the result from all candidate assertions.
    ///
    /// - `None` (default): Accept any MV age (backward-compatible behavior)
    /// - `Some(0)`: Never use MV, always use slow path
    /// - `Some(60)`: Only use MV if materialized within the last 60 seconds
    pub max_stale: Option<u64>,

    /// Filter by visual similarity to a reference pHash (hex-encoded, 16 chars).
    ///
    /// Returns assertions whose `visual_hash` has hamming distance <= `visual_threshold`.
    /// Assertions without a `visual_hash` are excluded from results when this is set.
    ///
    /// The value is not validated when the query is built: if it cannot be parsed
    /// as a 16-character hex string, [`Query::matches`] rejects every assertion.
    pub visual_near: Option<String>,

    /// Maximum hamming distance for `visual_near` matching.
    ///
    /// Range: 0-64 (8 bytes = 64 bits). Default: 8 (12.5% bit difference).
    /// Lower values require closer visual similarity.
    ///
    /// Only consulted when `visual_near` is set.
    pub visual_threshold: Option<u32>,

    /// Query state as of this Unix timestamp (time-travel).
    ///
    /// When set, returns only assertions created at or before this timestamp.
    /// The fast path (MV lookup) is bypassed since MVs reflect current state.
    ///
    /// - `None` (default): Query current state (backward-compatible)
    /// - `Some(ts)`: Query historical state as it existed at timestamp `ts`
    pub as_of: Option<u64>,

    /// Decay half-life in seconds for confidence decay.
    ///
    /// When set, older assertions have their confidence scores reduced based on age.
    /// This implements semantic decay: a Reddit post from 2022 shouldn't compete
    /// equally with a 2024 RCT.
    ///
    /// Formula: `effective_confidence = confidence * 2^(-(age / halflife))`
    ///
    /// - `None` (default): No decay, all assertions weighted by original confidence
    /// - `Some(31536000)`: 1-year half-life (assertions lose ~50% confidence per year)
    /// - `Some(86400)`: 1-day half-life (fast decay for rapidly changing data)
    ///
    /// **Note**: When decay is enabled, the fast path (materialized view lookup) is
    /// bypassed because MVs store pre-computed winners without decay applied.
    /// Queries with decay always use the slow path for accurate results.
    ///
    /// # Example
    /// ```rust
    /// use stemedb_query::Query;
    ///
    /// // Medical queries with 6-month decay half-life
    /// let query = Query::builder()
    ///     .subject("Semaglutide")
    ///     .predicate("muscle_effect")
    ///     .decay_halflife(15768000) // 6 months in seconds
    ///     .build();
    /// ```
    pub decay_halflife: Option<u64>,

    /// Use source-class-aware decay instead of uniform decay.
    ///
    /// When `true` and `decay_halflife` is also set, the decay half-life
    /// is determined by each assertion's `source_class` tier:
    /// - Tier 0 (Regulatory): No decay
    /// - Tier 1 (Clinical): 2-year half-life
    /// - Tier 2 (Observational): 1-year half-life
    /// - Tier 3 (Expert): 6-month half-life
    /// - Tier 4 (Community): 3-month half-life
    /// - Tier 5 (Anecdotal): 1-month half-life
    ///
    /// The `decay_halflife` field serves as a fallback for assertions
    /// without a source_class, or when this flag is `false`.
    pub source_class_decay: bool,

    /// Query by semantic vector similarity (k-nearest neighbors).
    ///
    /// When set, the QueryEngine uses the vector index for candidate retrieval
    /// instead of the standard SP/S indexes. This enables semantic similarity
    /// queries like "find assertions with embeddings similar to this one."
    ///
    /// The `k` field specifies how many nearest neighbors to return.
    ///
    /// - `None` (default): Use standard index-based lookup
    /// - `Some(vec)`: Use vector index for k-NN search
    ///
    /// **Note**: When `vector_near` is set:
    /// - The fast path (MV lookup) is bypassed
    /// - Subject/predicate filters are applied AFTER vector search
    /// - Results are sorted by distance, not by lens resolution
    ///
    /// # Example
    /// ```rust
    /// use stemedb_query::Query;
    ///
    /// // Find 10 assertions with similar embeddings
    /// let embedding = vec![0.1, 0.2, 0.3, /* ... */];
    /// let query = Query::builder()
    ///     .vector_near(embedding, 10)
    ///     .subject("Semaglutide") // Optional: filter results
    ///     .build();
    /// ```
    pub vector_near: Option<Vec<f32>>,

    /// Number of nearest neighbors to return for vector search.
    ///
    /// Only used when `vector_near` is set. Defaults to 10 if not specified.
    pub k: Option<usize>,

    /// Minimum conflict score threshold (0.0 to 1.0).
    ///
    /// When set, only returns results where the materialized view's conflict_score
    /// is >= this value. Used to filter for controversial claims where assertions
    /// significantly disagree.
    ///
    /// - `None` (default): No conflict filtering
    /// - `Some(0.7)`: Only show claims with high conflict (disagreement)
    /// - `Some(0.0)`: Show all claims (equivalent to None)
    ///
    /// **Note**: This is a POST-resolution filter. The query executes normally,
    /// but results are filtered by conflict score after lens resolution.
    ///
    /// # Example
    /// ```rust
    /// use stemedb_query::Query;
    ///
    /// // Only show controversial claims (high disagreement)
    /// let query = Query::builder()
    ///     .subject("Semaglutide")
    ///     .predicate("muscle_effect")
    ///     .min_conflict_score(0.7)
    ///     .build();
    /// ```
    pub min_conflict_score: Option<f32>,

    /// Maximum conflict score threshold (0.0 to 1.0).
    ///
    /// When set, only returns results where the materialized view's conflict_score
    /// is <= this value. Used to filter for claims with strong agreement.
    ///
    /// - `None` (default): No conflict filtering
    /// - `Some(0.2)`: Only show claims with high agreement
    /// - `Some(1.0)`: Show all claims (equivalent to None)
    ///
    /// **Note**: This is a POST-resolution filter. The query executes normally,
    /// but results are filtered by conflict score after lens resolution.
    ///
    /// # Example
    /// ```rust
    /// use stemedb_query::Query;
    ///
    /// // Only show claims with strong consensus
    /// let query = Query::builder()
    ///     .subject("Semaglutide")
    ///     .predicate("muscle_effect")
    ///     .max_conflict_score(0.2)
    ///     .build();
    /// ```
    pub max_conflict_score: Option<f32>,

    /// Resolve aliases when querying by subject.
    ///
    /// When `true` and `subject` is specified, the QueryEngine will:
    /// 1. Call `alias_store.resolve_all(&subject)` to find all related subjects
    /// 2. Fetch assertions for ALL resolved subjects
    /// 3. Deduplicate results by assertion hash
    ///
    /// This enables cross-scheme concept resolution. For example, querying
    /// `code://rust/myapp/tls/cert_verification` with aliases enabled would also
    /// return assertions from `rfc://5246/tls/cert_verification` if they are aliased.
    ///
    /// - `false` (default): Query exact subject only (backward-compatible)
    /// - `true`: Expand subject to all aliased paths before querying
    ///
    /// **Note**: Requires an `AliasStore` to be configured on the `QueryEngine`.
    /// If no alias store is configured, this flag has no effect.
    ///
    /// NOTE(review): [`Query::matches`] skips the subject comparison whenever
    /// this flag is `true`, without checking whether alias expansion actually
    /// took place. Candidate retrieval must therefore already restrict results
    /// to the wanted subject(s) — confirm this holds when no alias store is
    /// configured and when candidates come from `vector_near`.
    ///
    /// # Example
    /// ```rust
    /// use stemedb_query::Query;
    ///
    /// // Find assertions from both code and RFC sources
    /// let query = Query::builder()
    ///     .subject("code://rust/myapp/tls/cert_verification")
    ///     .resolve_aliases(true)
    ///     .build();
    /// ```
    pub resolve_aliases: bool,
}

impl Query {
    /// Construct a query with no filters set, so every assertion matches it.
    pub fn new() -> Self {
        Default::default()
    }

    /// Start a [`QueryBuilder`] for fluent query construction.
    pub fn builder() -> QueryBuilder {
        QueryBuilder::new()
    }

    /// Decide whether a single assertion passes every filter set on this query.
    ///
    /// Filters are checked in order — subject, predicate, lifecycle, epoch,
    /// visual similarity, `as_of` — and the first failing filter rejects the
    /// assertion. Filters left as `None` are ignored. Returns `true` only when
    /// no filter rejects the assertion.
    pub fn matches(&self, assertion: &Assertion) -> bool {
        // Subject: enforced only for exact-subject queries. With alias
        // resolution on, candidates were already fetched via the expanded
        // subject set, so comparing against the literal subject here would
        // wrongly drop the aliased ones.
        let enforce_subject = !self.resolve_aliases;
        match self.subject {
            Some(ref wanted) if enforce_subject && &assertion.subject != wanted => return false,
            _ => {}
        }

        // Predicate: exact match required.
        match self.predicate {
            Some(ref wanted) if &assertion.predicate != wanted => return false,
            _ => {}
        }

        // Lifecycle stage: exact match required.
        match self.lifecycle {
            Some(stage) if assertion.lifecycle != stage => return false,
            _ => {}
        }

        // Epoch: an assertion without an epoch can never satisfy an epoch filter.
        if let Some(wanted) = self.epoch {
            let same_epoch = match assertion.epoch {
                Some(found) => found == wanted,
                None => false,
            };
            if !same_epoch {
                return false;
            }
        }

        // Visual similarity: both the reference hash and the assertion's hash
        // must be present, and their hamming distance must be within the threshold.
        if let Some(reference_hex) = self.visual_near.as_ref() {
            let close_enough = match (
                parse_hex_phash(reference_hex),
                assertion.visual_hash.as_ref(),
            ) {
                (Some(reference), Some(candidate)) => {
                    let max_distance = self.visual_threshold.unwrap_or(8);
                    hamming_distance(&reference, candidate) <= max_distance
                }
                // Unparseable reference hex, or no visual_hash on the assertion.
                _ => false,
            };
            if !close_enough {
                return false;
            }
        }

        // Time travel: keep only assertions stamped at or before the cutoff.
        match self.as_of {
            Some(cutoff) => assertion.timestamp <= cutoff,
            None => true,
        }
    }
}