stemedb/crates/stemedb-query/src/query/mod.rs
jordan d3a88585fe feat: Phase 6 UAT - Admission control, HLC recency, cluster coordination
This commit includes comprehensive work on Phase 6 features:

## Admission Control (Phase 6 admission middleware)
- AdmissionStore implementation backed by TrustRankStore
- PoW verification with tier-based difficulty computation
- Trust tier progression (Newcomer → Established → Trusted → Authority)
- API integration with admission status endpoints

## HLC Recency Lens (Phase 6C)
- HlcRecencyLens for distributed system ordering
- Hybrid logical clock integration with causality preservation

## Cluster Coordination (Phase 6C)
- Multi-node cluster tests (availability, partition tolerance)
- CRDT convergence tests for anti-entropy sync
- Gateway handler improvements

## Aphoria Code Linter (Phase 2A)
- RFC/OWASP corpus builders with network fetching and caching
- Concept hierarchy with auto-alias creation on conflict detection
- Multiple security extractors (TLS, JWT, CORS, secrets, rate limiting)

## Code Organization
- Split large files into modules to comply with 500-line limit
- Improved test organization with separate test modules
- Fixed rkyv serialization for EigenTrustState (AgentScore struct)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 00:43:37 -07:00

324 lines
12 KiB
Rust

//! Query types and builder for filtering assertions.
//!
//! The Query struct represents a read request against the knowledge graph.
//! Queries can filter by any combination of subject, predicate, lifecycle,
//! epoch, visual similarity (pHash), and an as-of timestamp.
use stemedb_core::types::{Assertion, EpochId, LifecycleStage, PHash};
// Re-export hamming_distance from stemedb_storage for public API consumers
pub use stemedb_storage::hamming_distance;
mod builder;
mod result;
pub use builder::QueryBuilder;
pub use result::QueryResult;
#[cfg(test)]
mod tests;
/// Parse hex string to 8-byte pHash.
///
/// Returns `None` if the hex string is not exactly 16 characters
/// or contains invalid hex digits. Case-insensitive: both "A3F2..."
/// and "a3f2..." are valid and produce identical results.
pub(crate) fn parse_hex_phash(hex_str: &str) -> Option<PHash> {
if hex_str.len() != 16 {
return None;
}
let bytes = hex::decode(hex_str).ok()?;
if bytes.len() != 8 {
return None;
}
let mut hash = [0u8; 8];
hash.copy_from_slice(&bytes);
Some(hash)
}
/// A query against the knowledge graph.
///
/// All `Option` fields are optional filters or tuning knobs. If a filter field
/// is `None`, it matches any value. Multiple filters combine with AND semantics.
///
/// The derived `Default` leaves every `Option` as `None` and every flag as
/// `false`, which is what `Query::new()` returns.
///
/// `Query::matches` evaluates only the per-assertion filters: `subject`,
/// `predicate`, `lifecycle`, `epoch`, `visual_near` / `visual_threshold`, and
/// `as_of`. The remaining fields (`limit`, `max_stale`, decay, vector search,
/// conflict-score bounds) are not consulted by `matches`; presumably they are
/// applied by the query engine — confirm against the engine implementation.
#[derive(Debug, Clone, Default)]
pub struct Query {
/// Filter by subject entity.
///
/// Compared for exact equality in `matches`, unless `resolve_aliases` is
/// `true`, in which case `matches` skips the subject comparison.
pub subject: Option<String>,
/// Filter by predicate/relation (exact equality).
pub predicate: Option<String>,
/// Filter by lifecycle stage (exact equality).
pub lifecycle: Option<LifecycleStage>,
/// Filter by epoch (paradigm context).
///
/// When set, assertions that carry no epoch at all are excluded, as are
/// assertions whose epoch differs from this one.
pub epoch: Option<EpochId>,
/// Maximum number of results to return.
///
/// Not enforced by `matches`, which only judges a single assertion.
pub limit: Option<usize>,
/// Maximum acceptable staleness of materialized views in seconds.
///
/// When set, the fast path (MV lookup) will be skipped if the materialized
/// view is older than this threshold. This causes the query to fall through
/// to the slow path, which re-computes the result from all candidate assertions.
///
/// - `None` (default): Accept any MV age (backward-compatible behavior)
/// - `Some(0)`: Never use MV, always use slow path
/// - `Some(60)`: Only use MV if materialized within the last 60 seconds
pub max_stale: Option<u64>,
/// Filter by visual similarity to a reference pHash (hex-encoded, 16 chars).
///
/// Returns assertions whose `visual_hash` has hamming distance <= `visual_threshold`.
/// Assertions without a `visual_hash` are excluded from results when this is set.
///
/// The hex string is parsed case-insensitively. If it is not exactly 16
/// valid hex digits, `matches` rejects every assertion rather than
/// reporting an error.
pub visual_near: Option<String>,
/// Maximum hamming distance for `visual_near` matching.
///
/// Range: 0-64 (8 bytes = 64 bits). Default: 8 (12.5% bit difference).
/// Lower values require closer visual similarity.
///
/// Only consulted when `visual_near` is set. The comparison is inclusive:
/// a distance equal to the threshold still matches.
pub visual_threshold: Option<u32>,
/// Query state as of this Unix timestamp (time-travel).
///
/// When set, returns only assertions created at or before this timestamp.
/// The fast path (MV lookup) is bypassed since MVs reflect current state.
///
/// - `None` (default): Query current state (backward-compatible)
/// - `Some(ts)`: Query historical state as it existed at timestamp `ts`
pub as_of: Option<u64>,
/// Decay half-life in seconds for confidence decay.
///
/// When set, older assertions have their confidence scores reduced based on age.
/// This implements semantic decay: a Reddit post from 2022 shouldn't compete
/// equally with a 2024 RCT.
///
/// Formula: `effective_confidence = confidence * 2^(-(age / halflife))`
///
/// - `None` (default): No decay, all assertions weighted by original confidence
/// - `Some(31536000)`: 1-year half-life (assertions lose 50% confidence per year)
/// - `Some(86400)`: 1-day half-life (fast decay for rapidly changing data)
///
/// **Note**: When decay is enabled, the fast path (materialized view lookup) is
/// bypassed because MVs store pre-computed winners without decay applied.
/// Queries with decay always use the slow path for accurate results.
///
/// # Example
/// ```rust
/// use stemedb_query::Query;
///
/// // Medical queries with 6-month decay half-life
/// let query = Query::builder()
///     .subject("Semaglutide")
///     .predicate("muscle_effect")
///     .decay_halflife(15768000) // 6 months in seconds
///     .build();
/// ```
pub decay_halflife: Option<u64>,
/// Use source-class-aware decay instead of uniform decay.
///
/// When `true` and `decay_halflife` is also set, the decay half-life
/// is determined by each assertion's `source_class` tier:
/// - Tier 0 (Regulatory): No decay
/// - Tier 1 (Clinical): 2-year half-life
/// - Tier 2 (Observational): 1-year half-life
/// - Tier 3 (Expert): 6-month half-life
/// - Tier 4 (Community): 3-month half-life
/// - Tier 5 (Anecdotal): 1-month half-life
///
/// The `decay_halflife` field serves as a fallback for assertions
/// without a source_class, or when this flag is `false`.
pub source_class_decay: bool,
/// Query by semantic vector similarity (k-nearest neighbors).
///
/// When set, the QueryEngine uses the vector index for candidate retrieval
/// instead of the standard SP/S indexes. This enables semantic similarity
/// queries like "find assertions with embeddings similar to this one."
///
/// The `k` field specifies how many nearest neighbors to return.
///
/// - `None` (default): Use standard index-based lookup
/// - `Some(vec)`: Use vector index for k-NN search
///
/// **Note**: When `vector_near` is set:
/// - The fast path (MV lookup) is bypassed
/// - Subject/predicate filters are applied AFTER vector search
/// - Results are sorted by distance, not by lens resolution
///
/// # Example
/// ```rust
/// use stemedb_query::Query;
///
/// // Find 10 assertions with similar embeddings
/// let embedding = vec![0.1, 0.2, 0.3, /* ... */];
/// let query = Query::builder()
///     .vector_near(embedding, 10)
///     .subject("Semaglutide") // Optional: filter results
///     .build();
/// ```
pub vector_near: Option<Vec<f32>>,
/// Number of nearest neighbors to return for vector search.
///
/// Only used when `vector_near` is set. Defaults to 10 if not specified.
pub k: Option<usize>,
/// Minimum conflict score threshold (0.0 to 1.0).
///
/// When set, only returns results where the materialized view's conflict_score
/// is >= this value. Used to filter for controversial claims where assertions
/// significantly disagree.
///
/// - `None` (default): No conflict filtering
/// - `Some(0.7)`: Only show claims with high conflict (disagreement)
/// - `Some(0.0)`: Show all claims (equivalent to None)
///
/// **Note**: This is a POST-resolution filter. The query executes normally,
/// but results are filtered by conflict score after lens resolution.
///
/// # Example
/// ```rust
/// use stemedb_query::Query;
///
/// // Only show controversial claims (high disagreement)
/// let query = Query::builder()
///     .subject("Semaglutide")
///     .predicate("muscle_effect")
///     .min_conflict_score(0.7)
///     .build();
/// ```
pub min_conflict_score: Option<f32>,
/// Maximum conflict score threshold (0.0 to 1.0).
///
/// When set, only returns results where the materialized view's conflict_score
/// is <= this value. Used to filter for claims with strong agreement.
///
/// - `None` (default): No conflict filtering
/// - `Some(0.2)`: Only show claims with high agreement
/// - `Some(1.0)`: Show all claims (equivalent to None)
///
/// **Note**: This is a POST-resolution filter. The query executes normally,
/// but results are filtered by conflict score after lens resolution.
///
/// # Example
/// ```rust
/// use stemedb_query::Query;
///
/// // Only show claims with strong consensus
/// let query = Query::builder()
///     .subject("Semaglutide")
///     .predicate("muscle_effect")
///     .max_conflict_score(0.2)
///     .build();
/// ```
pub max_conflict_score: Option<f32>,
/// Resolve aliases when querying by subject.
///
/// When `true` and `subject` is specified, the QueryEngine will:
/// 1. Call `alias_store.resolve_all(&subject)` to find all related subjects
/// 2. Fetch assertions for ALL resolved subjects
/// 3. Deduplicate results by assertion hash
///
/// This enables cross-scheme concept resolution. For example, querying
/// `code://rust/myapp/tls/cert_verification` with aliases enabled would also
/// return assertions from `rfc://5246/tls/cert_verification` if they are aliased.
///
/// - `false` (default): Query exact subject only (backward-compatible)
/// - `true`: Expand subject to all aliased paths before querying
///
/// **Note**: Requires an `AliasStore` to be configured on the `QueryEngine`.
/// If no alias store is configured, this flag has no effect.
///
/// **Note**: While this flag is `true`, `Query::matches` performs no subject
/// comparison at all; it relies on candidates having been fetched by the
/// expanded subject set.
/// NOTE(review): confirm that candidate retrieval still restricts by
/// subject when no alias store is configured or when `vector_near` is used,
/// since `matches` will not filter by subject in those cases.
///
/// # Example
/// ```rust
/// use stemedb_query::Query;
///
/// // Find assertions from both code and RFC sources
/// let query = Query::builder()
///     .subject("code://rust/myapp/tls/cert_verification")
///     .resolve_aliases(true)
///     .build();
/// ```
pub resolve_aliases: bool,
}
impl Query {
    /// Construct a query with every filter unset, so it matches all assertions.
    pub fn new() -> Self {
        Default::default()
    }

    /// Start a [`QueryBuilder`] for fluent construction of a query.
    pub fn builder() -> QueryBuilder {
        QueryBuilder::new()
    }

    /// Report whether `assertion` passes every per-assertion filter on this query.
    ///
    /// Filters are checked in order (subject, predicate, lifecycle, epoch,
    /// visual similarity, as-of) and the first failing one short-circuits to
    /// `false`. Unset filters accept any value.
    ///
    /// Only the filters listed above are evaluated here; other fields such as
    /// `limit` or the conflict-score bounds are not consulted.
    pub fn matches(&self, assertion: &Assertion) -> bool {
        // Subject: deliberately not compared under alias resolution, because
        // candidates were already fetched using the expanded (aliased)
        // subject set and would otherwise be rejected for not equalling the
        // literal query subject.
        let subject_mismatch = !self.resolve_aliases
            && matches!(&self.subject, Some(wanted) if &assertion.subject != wanted);
        if subject_mismatch {
            return false;
        }

        // Predicate: exact equality.
        if matches!(&self.predicate, Some(wanted) if &assertion.predicate != wanted) {
            return false;
        }

        // Lifecycle stage: exact equality.
        if matches!(self.lifecycle, Some(stage) if assertion.lifecycle != stage) {
            return false;
        }

        // Epoch: the assertion must carry an epoch, and it must be the same one.
        if let Some(wanted) = self.epoch {
            if !matches!(assertion.epoch, Some(found) if found == wanted) {
                return false;
            }
        }

        // Visual similarity: needs both a parseable reference hash and a hash
        // on the assertion; a missing or invalid value on either side is a
        // non-match.
        if let Some(reference_hex) = self.visual_near.as_deref() {
            match (parse_hex_phash(reference_hex), assertion.visual_hash.as_ref()) {
                (Some(reference), Some(candidate)) => {
                    let max_distance = self.visual_threshold.unwrap_or(8);
                    if hamming_distance(&reference, candidate) > max_distance {
                        return false;
                    }
                }
                _ => return false,
            }
        }

        // Time travel: drop anything stamped after the requested point in time.
        if matches!(self.as_of, Some(cutoff) if assertion.timestamp > cutoff) {
            return false;
        }

        true
    }
}