stemedb/crates/stemedb-storage/src/audit_store/mod.rs
jordan 3320c24afa feat: WAL hardening (Phase 5B) - CRC32C, crash recovery, group commit, log rotation
Add CRC32C checksums to WAL record format (v2), implement crash recovery
with automatic truncation of corrupt records, add feature-gated group commit
buffer for batched fsync under concurrent load, and implement log rotation
via segment files with global offset addressing.

Key changes:
- Record format v2: [len:u32][crc32c:u32][blake3:32][payload:N]
- recover_file() scans and truncates corrupt tail records
- GroupCommitBuffer batches fsync via MPSC channel (tokio feature gate)
- SegmentManager with binary search resolution and cursor-based cleanup
- Journal::read() auto-refreshes segments on miss for writer/reader split
- Split recovery.rs and key_codec.rs into directory modules for 500-line max

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 12:36:35 -07:00

286 lines
10 KiB
Rust

//! Query audit trail storage for incident investigation.
//!
//! Every query is logged with provenance to enable "Why did you think that?"
//! debugging. This is critical for SRE incident investigation and agent
//! decision auditing.
//!
//! # Storage Layout
//!
//! | Key Pattern | Value | Purpose |
//! |-------------|-------|---------|
//! | `\x00AUD:{query_id}` | Serialized QueryAudit | Individual audit records |
//! | `\x00AUDA:{agent_id}:{timestamp}:{query_id}` | Empty | Agent index for temporal queries |
//!
//! # Design Philosophy
//!
//! Following the "Deep Module" principle:
//! - Simple interface hiding complex indexing
//! - O(1) individual audit lookups
//! - Agent+time range queries via prefix scan
//!
//! All operations are append-only (audits are never modified or deleted).
mod store_impl;
use async_trait::async_trait;
use stemedb_core::types::{QueryAudit, QueryId};
use crate::error::Result;
pub use store_impl::GenericAuditStore;
/// Specialized storage trait for query audit operations.
///
/// This trait provides audit-specific operations on top of a generic KVStore,
/// enabling efficient audit logging and retrieval for incident investigation.
///
/// Implementors must be `Send + Sync`. All methods are `async` (via
/// `#[async_trait]`) and report failures through the crate-level `Result`.
///
/// # Example
///
/// ```ignore
/// let audit_store = GenericAuditStore::new(kv_store);
///
/// // Log a query audit
/// audit_store.put_audit(&audit).await?;
///
/// // Retrieve a specific audit
/// let audit = audit_store.get_audit(&query_id).await?;
///
/// // Find up to `limit` audits for an agent in a time range
/// let audits = audit_store.get_audits_for_agent(&agent_id, from, to, limit).await?;
/// ```
#[async_trait]
pub trait AuditStore: Send + Sync {
/// Store a query audit record.
///
/// This operation:
/// 1. Serializes the audit using rkyv
/// 2. Stores at `\x00AUD:{query_id}`
/// 3. Creates agent index entry at `\x00AUDA:{agent_id}:{timestamp}:{query_id}`
///
/// NOTE(review): `QueryAudit::agent_id` is optional; presumably step 3 is
/// skipped for anonymous audits — confirm in `store_impl`.
///
/// # Arguments
/// * `audit` - The audit record to persist
///
/// # Returns
/// The query_id for reference.
async fn put_audit(&self, audit: &QueryAudit) -> Result<QueryId>;
/// Get a specific audit record by its query ID.
///
/// # Arguments
/// * `query_id` - Identifier of the audit to look up
///
/// # Returns
/// The audit record if found, None otherwise.
async fn get_audit(&self, query_id: &QueryId) -> Result<Option<QueryAudit>>;
/// Get audit records for a specific agent within a time range.
///
/// Uses the agent index for efficient temporal queries.
///
/// # Arguments
/// * `agent_id` - The agent's public key
/// * `from_timestamp` - Start of time range (inclusive)
/// * `to_timestamp` - End of time range (inclusive), None for unbounded
/// * `limit` - Maximum number of records to return
///
/// # Returns
/// Vector of audit records, sorted by timestamp ascending, capped at limit.
/// The vector is empty when the agent has no audits in the range.
async fn get_audits_for_agent(
&self,
agent_id: &[u8; 32],
from_timestamp: u64,
to_timestamp: Option<u64>,
limit: usize,
) -> Result<Vec<QueryAudit>>;
/// List recent audit records across all agents.
///
/// Scans all `\x00AUD:` keys and returns the most recent audits.
///
/// # Arguments
/// * `limit` - Maximum number of records to return
///
/// # Returns
/// Vector of audit records, sorted by timestamp descending (most recent first).
async fn list_recent_audits(&self, limit: usize) -> Result<Vec<QueryAudit>>;
/// Check if any audits exist for an agent.
///
/// # Arguments
/// * `agent_id` - The agent's public key
///
/// # Returns
/// `true` if at least one audit has been stored for the agent, `false` otherwise.
async fn has_audits_for_agent(&self, agent_id: &[u8; 32]) -> Result<bool>;
}
#[cfg(test)]
mod tests {
use super::*;
use crate::HybridStore;
use std::sync::Arc;
use stemedb_core::types::{ContributingAssertion, LifecycleStage};
/// Builds a fully-populated `QueryAudit` fixture for the tests in this module.
///
/// Only `query_id`, `agent_id`, and `timestamp` vary between calls; the query
/// parameters, result hash, confidence, and the single contributing assertion
/// are fixed placeholder values.
fn create_test_audit(
    query_id: QueryId,
    agent_id: Option<[u8; 32]>,
    timestamp: u64,
) -> QueryAudit {
    // Fixed query shape shared by every fixture.
    let params = stemedb_core::types::QueryParams {
        subject: Some(String::from("Tesla")),
        predicate: Some(String::from("revenue")),
        lifecycle: Some(LifecycleStage::Approved),
        epoch: None,
        lens: Some(String::from("Recency")),
    };

    // A single contributor carrying the full weight of the result.
    let contributor = ContributingAssertion {
        assertion_hash: [2u8; 32],
        weight: 1.0,
        source_hash: [3u8; 32],
        lifecycle: LifecycleStage::Approved,
    };

    QueryAudit {
        query_id,
        agent_id,
        timestamp,
        params,
        result_hash: Some([1u8; 32]),
        result_confidence: 0.95,
        contributing_assertions: vec![contributor],
    }
}
/// Round-trips one audit through `put_audit` / `get_audit` and checks that the
/// stored fields come back unchanged.
#[tokio::test]
async fn test_put_and_get_audit() {
    let kv = HybridStore::open_temp().expect("Failed to create store");
    let audit_store = GenericAuditStore::new(Arc::new(kv));

    let query_id = [10u8; 32];
    let agent_id = Some([1u8; 32]);
    let original = create_test_audit(query_id, agent_id, 1000);

    // Storing must echo back the audit's own query id.
    let stored_id = audit_store.put_audit(&original).await.expect("Failed to put audit");
    assert_eq!(stored_id, query_id);

    // Reading by that id must yield the same record.
    let lookup = audit_store.get_audit(&query_id).await.expect("Failed to get audit");
    assert!(lookup.is_some());
    let fetched = lookup.expect("Audit should exist");
    assert_eq!(fetched.query_id, query_id);
    assert_eq!(fetched.agent_id, agent_id);
    assert_eq!(fetched.timestamp, 1000);

    // Floats are compared with a tolerance rather than for exact equality.
    let confidence_delta = (fetched.result_confidence - 0.95).abs();
    assert!(confidence_delta < f32::EPSILON);
}
/// Exercises the per-agent query: unbounded range, bounded range, isolation
/// between agents, and the `limit` cap.
#[tokio::test]
async fn test_get_audits_for_agent() {
    let kv = HybridStore::open_temp().expect("Failed to create store");
    let audit_store = GenericAuditStore::new(Arc::new(kv));

    let agent1 = [1u8; 32];
    let agent2 = [2u8; 32];

    // Three audits for agent1 (t = 1000, 2000, 3000) and one for agent2 (t = 2500).
    let fixtures = [
        create_test_audit([10u8; 32], Some(agent1), 1000),
        create_test_audit([11u8; 32], Some(agent1), 2000),
        create_test_audit([12u8; 32], Some(agent1), 3000),
        create_test_audit([13u8; 32], Some(agent2), 2500),
    ];
    for fixture in &fixtures {
        audit_store.put_audit(fixture).await.expect("put");
    }

    // Unbounded range returns everything belonging to agent1.
    let everything_for_agent1 =
        audit_store.get_audits_for_agent(&agent1, 0, None, 100).await.expect("get");
    assert_eq!(everything_for_agent1.len(), 3);

    // A window of [1500, 2500] only contains the t = 2000 audit for agent1.
    let windowed =
        audit_store.get_audits_for_agent(&agent1, 1500, Some(2500), 100).await.expect("get");
    assert_eq!(windowed.len(), 1);
    assert_eq!(windowed[0].timestamp, 2000);

    // agent2's results are independent of agent1's.
    let everything_for_agent2 =
        audit_store.get_audits_for_agent(&agent2, 0, None, 100).await.expect("get");
    assert_eq!(everything_for_agent2.len(), 1);

    // The limit caps the number of returned records.
    let capped = audit_store.get_audits_for_agent(&agent1, 0, None, 2).await.expect("get");
    assert_eq!(capped.len(), 2);
}
/// Stores five audits with increasing timestamps and checks that
/// `list_recent_audits` returns the newest ones first, capped at the limit.
#[tokio::test]
async fn test_list_recent_audits() {
    let kv = HybridStore::open_temp().expect("Failed to create store");
    let audit_store = GenericAuditStore::new(Arc::new(kv));

    // Timestamps are 1000, 1100, ..., 1400; the first id byte distinguishes records.
    for i in 0u8..5 {
        let mut query_id = [0u8; 32];
        query_id[0] = i;
        let timestamp = 1000 + u64::from(i) * 100;
        let audit = create_test_audit(query_id, Some([1u8; 32]), timestamp);
        audit_store.put_audit(&audit).await.expect("put");
    }

    let recent = audit_store.list_recent_audits(3).await.expect("list");

    // Exactly three records, in descending timestamp order.
    let timestamps: Vec<u64> = recent.iter().map(|audit| audit.timestamp).collect();
    assert_eq!(timestamps, vec![1400, 1300, 1200]);
}
/// An audit with no `agent_id` (anonymous query) must still be stored and be
/// retrievable by its query id.
#[tokio::test]
async fn test_audit_without_agent() {
    let kv = HybridStore::open_temp().expect("Failed to create store");
    let audit_store = GenericAuditStore::new(Arc::new(kv));

    let query_id = [20u8; 32];
    let anonymous = create_test_audit(query_id, None, 1000);
    audit_store.put_audit(&anonymous).await.expect("put");

    // Lookup by id works and the missing agent is preserved as `None`.
    let lookup = audit_store.get_audit(&query_id).await.expect("get");
    assert!(lookup.is_some());
    let fetched = lookup.expect("exists");
    assert!(fetched.agent_id.is_none());
}
/// `has_audits_for_agent` is false for every agent on an empty store and
/// becomes true only for the agent that has had an audit stored.
#[tokio::test]
async fn test_has_audits_for_agent() {
    let kv = HybridStore::open_temp().expect("Failed to create store");
    let audit_store = GenericAuditStore::new(Arc::new(kv));

    let agent1 = [1u8; 32];
    let agent2 = [2u8; 32];

    // Empty store: neither agent has audits.
    let agent1_before = audit_store.has_audits_for_agent(&agent1).await.expect("has");
    assert!(!agent1_before);
    let agent2_before = audit_store.has_audits_for_agent(&agent2).await.expect("has");
    assert!(!agent2_before);

    // Store one audit attributed to agent1 only.
    let audit = create_test_audit([10u8; 32], Some(agent1), 1000);
    audit_store.put_audit(&audit).await.expect("put");

    // agent1 flips to true; agent2 is unaffected.
    let agent1_after = audit_store.has_audits_for_agent(&agent1).await.expect("has");
    assert!(agent1_after);
    let agent2_after = audit_store.has_audits_for_agent(&agent2).await.expect("has");
    assert!(!agent2_after);
}
/// Looking up a query id that was never stored yields `Ok(None)`, not an error.
#[tokio::test]
async fn test_get_nonexistent_audit() {
    let kv = HybridStore::open_temp().expect("Failed to create store");
    let audit_store = GenericAuditStore::new(Arc::new(kv));

    let missing_id = [99u8; 32];
    let lookup = audit_store.get_audit(&missing_id).await.expect("get");

    assert!(lookup.is_none(), "Should return None for nonexistent audit");
}
/// Querying an agent with no stored audits yields an empty vector.
#[tokio::test]
async fn test_empty_agent_audits() {
    let kv = HybridStore::open_temp().expect("Failed to create store");
    let audit_store = GenericAuditStore::new(Arc::new(kv));

    let agent = [1u8; 32];
    let found = audit_store.get_audits_for_agent(&agent, 0, None, 100).await.expect("get");

    assert!(found.is_empty(), "Should return empty vec for agent with no audits");
}
}