stemedb/crates/stemedb-storage/src/gold_standard_store.rs
jordan 3320c24afa feat: WAL hardening (Phase 5B) - CRC32C, crash recovery, group commit, log rotation
Add CRC32C checksums to WAL record format (v2), implement crash recovery
with automatic truncation of corrupt records, add feature-gated group commit
buffer for batched fsync under concurrent load, and implement log rotation
via segment files with global offset addressing.

Key changes:
- Record format v2: [len:u32][crc32c:u32][blake3:32][payload:N]
- recover_file() scans and truncates corrupt tail records
- GroupCommitBuffer batches fsync via MPSC channel (tokio feature gate)
- SegmentManager with binary search resolution and cursor-based cleanup
- Journal::read() auto-refreshes segments on miss for writer/reader split
- Split recovery.rs and key_codec.rs into directory modules for 500-line max

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 12:36:35 -07:00

321 lines
11 KiB
Rust

//! Storage for gold standard assertions.
//!
//! Gold standards are stored at `{subject}\x00GS:{predicate}` with a secondary
//! index at `\x00GS_LIST:{subject}:{predicate}` for listing all gold standards.
use crate::{key_codec, KVStore, Result, StorageError};
use async_trait::async_trait;
use std::sync::Arc;
use stemedb_core::types::GoldStandard;
use tracing::{debug, instrument};
/// Storage trait for gold standard operations.
///
/// Covers storing, fetching, listing, and removing gold standards, which
/// define ground truth for agent verification. Individual entries are
/// addressed by their `(subject, predicate)` pair.
#[async_trait]
pub trait GoldStandardStore: Send + Sync {
/// Store a gold standard, addressed by `gs.subject` and `gs.predicate`.
///
/// NOTE(review): the key layout is not fixed by this trait. The module docs
/// give the primary key as `{subject}\x00GS:{predicate}`, and the generic
/// implementation builds it with `key_codec::gold_standard_key` — confirm
/// the exact format against `key_codec`.
///
/// # Arguments
///
/// - `gs` - The gold standard to store
async fn set_gold_standard(&self, gs: &GoldStandard) -> Result<()>;
/// Get a gold standard by subject and predicate.
///
/// # Arguments
///
/// - `subject` - Subject entity (e.g., "Earth")
/// - `predicate` - Predicate (e.g., "has_shape")
///
/// # Returns
///
/// `Some(GoldStandard)` if an entry exists for the pair, `None` otherwise.
async fn get_gold_standard(
&self,
subject: &str,
predicate: &str,
) -> Result<Option<GoldStandard>>;
/// List all gold standards.
///
/// # Returns
///
/// Every stored gold standard, ordered by subject and then by predicate.
async fn list_gold_standards(&self) -> Result<Vec<GoldStandard>>;
/// Remove the gold standard for a subject/predicate pair.
///
/// # Arguments
///
/// - `subject` - Subject entity
/// - `predicate` - Predicate
///
/// # Returns
///
/// `true` if the gold standard was found and removed, `false` if not found.
async fn remove_gold_standard(&self, subject: &str, predicate: &str) -> Result<bool>;
}
/// Generic implementation of `GoldStandardStore` backed by any `KVStore`.
///
/// The store is held behind an `Arc`, so the same underlying store can be
/// shared with other components.
pub struct GenericGoldStandardStore<S> {
// Shared handle to the key-value store that holds both the gold standard
// records and their listing index.
store: Arc<S>,
}
impl<S: KVStore> GenericGoldStandardStore<S> {
/// Create a new gold standard store backed by the given KV store.
pub fn new(store: Arc<S>) -> Self {
Self { store }
}
}
#[async_trait]
impl<S: KVStore + 'static> GoldStandardStore for GenericGoldStandardStore<S> {
#[instrument(skip(self, gs), fields(subject = %gs.subject, predicate = %gs.predicate))]
async fn set_gold_standard(&self, gs: &GoldStandard) -> Result<()> {
let key = key_codec::gold_standard_key(&gs.subject, &gs.predicate);
let list_key = key_codec::gs_list_key(&gs.subject, &gs.predicate);
let serialized = stemedb_core::serde::serialize(gs)
.map_err(|e| StorageError::Serialization(e.to_string()))?;
// Write primary key
self.store.put(&key, &serialized).await?;
// Write secondary index for listing (empty value, just presence matters)
self.store.put(&list_key, &[]).await?;
debug!(
subject = %gs.subject,
predicate = %gs.predicate,
assertion_hash = %hex::encode(gs.assertion_hash),
"Stored gold standard"
);
Ok(())
}
#[instrument(skip(self), fields(subject = %subject, predicate = %predicate))]
async fn get_gold_standard(
&self,
subject: &str,
predicate: &str,
) -> Result<Option<GoldStandard>> {
let key = key_codec::gold_standard_key(subject, predicate);
match self.store.get(&key).await? {
Some(data) => {
let gs: GoldStandard = stemedb_core::serde::deserialize(&data)
.map_err(|e| StorageError::Serialization(e.to_string()))?;
debug!(
subject = %subject,
predicate = %predicate,
expected_object = %gs.expected_object,
"Retrieved gold standard"
);
Ok(Some(gs))
}
None => {
debug!(
subject = %subject,
predicate = %predicate,
"Gold standard not found"
);
Ok(None)
}
}
}
#[instrument(skip(self))]
async fn list_gold_standards(&self) -> Result<Vec<GoldStandard>> {
// Scan the GS_LIST secondary index
let list_entries = self.store.scan_prefix(&key_codec::gs_list_scan_prefix()).await?;
let mut gold_standards = Vec::new();
for (list_key, _) in list_entries {
// Extract subject and predicate from GS_LIST key: \x00GS_LIST:{subject}:{predicate}
let tag = key_codec::extract_tag(&list_key);
if let Some(suffix) = tag.strip_prefix(b"GS_LIST:") {
if let Ok(suffix_str) = std::str::from_utf8(suffix) {
// Split by first colon to get subject and predicate
if let Some(colon_pos) = suffix_str.find(':') {
let subject = &suffix_str[..colon_pos];
let predicate = &suffix_str[colon_pos + 1..];
// Fetch the actual gold standard from the primary key
let key = key_codec::gold_standard_key(subject, predicate);
if let Some(data) = self.store.get(&key).await? {
match stemedb_core::serde::deserialize::<GoldStandard>(&data) {
Ok(gs) => gold_standards.push(gs),
Err(e) => {
debug!(error = %e, subject = %subject, predicate = %predicate, "Skipping malformed gold standard");
}
}
}
}
}
}
}
// Sort by subject, then predicate for deterministic output
gold_standards
.sort_by(|a, b| a.subject.cmp(&b.subject).then_with(|| a.predicate.cmp(&b.predicate)));
debug!(count = gold_standards.len(), "Listed gold standards");
Ok(gold_standards)
}
#[instrument(skip(self), fields(subject = %subject, predicate = %predicate))]
async fn remove_gold_standard(&self, subject: &str, predicate: &str) -> Result<bool> {
let key = key_codec::gold_standard_key(subject, predicate);
let list_key = key_codec::gs_list_key(subject, predicate);
// Check if it exists first
let exists = self.store.get(&key).await?.is_some();
if exists {
// Delete both primary key and secondary index
self.store.delete(&key).await?;
self.store.delete(&list_key).await?;
debug!(
subject = %subject,
predicate = %predicate,
"Removed gold standard"
);
Ok(true)
} else {
debug!(
subject = %subject,
predicate = %predicate,
"Gold standard not found for removal"
);
Ok(false)
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::HybridStore;
    use stemedb_core::types::GoldStandard;

    /// Build a gold standard with fixed hash, timestamp, and author so tests
    /// only vary the subject, predicate, and expected object.
    fn create_gold_standard(subject: &str, predicate: &str, expected_object: &str) -> GoldStandard {
        GoldStandard::new(
            [42u8; 32],
            subject.to_string(),
            predicate.to_string(),
            expected_object.to_string(),
            1000,
            "admin".to_string(),
        )
    }

    /// A stored gold standard is returned unchanged by a lookup.
    #[tokio::test]
    async fn test_set_and_get_gold_standard() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let gs_store = GenericGoldStandardStore::new(store);
        let gs = create_gold_standard("Earth", "has_shape", "oblate_spheroid");

        gs_store.set_gold_standard(&gs).await.expect("set");

        let retrieved = gs_store
            .get_gold_standard("Earth", "has_shape")
            .await
            .expect("get")
            .expect("should exist");
        assert_eq!(retrieved, gs);
    }

    /// Looking up a pair that was never stored yields `None`, not an error.
    #[tokio::test]
    async fn test_get_nonexistent_gold_standard() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let gs_store = GenericGoldStandardStore::new(store);

        let result = gs_store.get_gold_standard("NonExistent", "predicate").await.expect("get");
        assert!(result.is_none());
    }

    /// Listing returns every stored entry, sorted by subject then predicate.
    #[tokio::test]
    async fn test_list_gold_standards() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let gs_store = GenericGoldStandardStore::new(store);
        let gs1 = create_gold_standard("Earth", "has_shape", "oblate_spheroid");
        let gs2 = create_gold_standard("Semaglutide", "treats_condition", "type_2_diabetes");
        let gs3 = create_gold_standard("Earth", "has_moon", "Luna");

        gs_store.set_gold_standard(&gs1).await.expect("set gs1");
        gs_store.set_gold_standard(&gs2).await.expect("set gs2");
        gs_store.set_gold_standard(&gs3).await.expect("set gs3");

        let list = gs_store.list_gold_standards().await.expect("list");
        assert_eq!(list.len(), 3);
        // Should be sorted by subject, then predicate
        assert_eq!(list[0].subject, "Earth");
        assert_eq!(list[0].predicate, "has_moon");
        assert_eq!(list[1].subject, "Earth");
        assert_eq!(list[1].predicate, "has_shape");
        assert_eq!(list[2].subject, "Semaglutide");
        assert_eq!(list[2].predicate, "treats_condition");
    }

    /// Removing an entry makes it unreachable through both lookup and listing.
    #[tokio::test]
    async fn test_remove_gold_standard() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let gs_store = GenericGoldStandardStore::new(store);
        let gs = create_gold_standard("Earth", "has_shape", "oblate_spheroid");
        gs_store.set_gold_standard(&gs).await.expect("set");

        // Verify it exists
        let retrieved = gs_store.get_gold_standard("Earth", "has_shape").await.expect("get");
        assert!(retrieved.is_some());

        // Remove it
        let removed = gs_store.remove_gold_standard("Earth", "has_shape").await.expect("remove");
        assert!(removed);

        // Verify it's gone
        let after_removal = gs_store.get_gold_standard("Earth", "has_shape").await.expect("get");
        assert!(after_removal.is_none());

        // The listing must not report the removed entry either
        let listed = gs_store.list_gold_standards().await.expect("list");
        assert!(listed.is_empty());
    }

    /// Removing a pair that was never stored reports `false`.
    #[tokio::test]
    async fn test_remove_nonexistent_gold_standard() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let gs_store = GenericGoldStandardStore::new(store);

        let removed =
            gs_store.remove_gold_standard("NonExistent", "predicate").await.expect("remove");
        assert!(!removed);
    }

    /// Storing the same pair twice replaces the record instead of adding one.
    #[tokio::test]
    async fn test_overwrite_gold_standard() {
        let store = Arc::new(HybridStore::open_temp().expect("store"));
        let gs_store = GenericGoldStandardStore::new(store);
        let gs1 = create_gold_standard("Earth", "has_shape", "sphere");
        gs_store.set_gold_standard(&gs1).await.expect("set");

        // Overwrite with more accurate answer
        let gs2 = create_gold_standard("Earth", "has_shape", "oblate_spheroid");
        gs_store.set_gold_standard(&gs2).await.expect("set");

        let retrieved = gs_store
            .get_gold_standard("Earth", "has_shape")
            .await
            .expect("get")
            .expect("should exist");
        assert_eq!(retrieved.expected_object, "oblate_spheroid");

        // The overwrite must leave exactly one listed entry for the pair
        let listed = gs_store.list_gold_standards().await.expect("list");
        assert_eq!(listed.len(), 1);
        assert_eq!(listed[0].expected_object, "oblate_spheroid");
    }
}