13 KiB
Task 05: Session TTL Auto-Cleanup Sweeper
Delivers
Background thread that scans active sessions every 60 seconds and auto-closes any session that has exceeded its policy's max_session_duration. SessionSummary.auto_closed: bool field to distinguish agent-initiated closes from sweeper-initiated closes. Graceful cancellation on TidalDb::close() with no dangling threads.
Complexity: M
Dependencies
- task-01 (module structure -- sweeper thread field on `TidalDb`)
- m4 sessions (`SessionState`, `SessionHandle`, `close_session()`, `sessions: DashMap`)
- `AgentPolicy.max_session_duration` (`tidal/src/schema/validation/policies.rs`)
Technical Design
1. Add auto_closed to SessionSummary
// In tidal/src/session/types.rs, add to SessionSummary:
/// Summary returned by `close_session()`.
///
/// The same type is produced by `close_session_internal()`, which is the
/// path used when a session is closed without a `SessionHandle`.
#[derive(Debug, Clone)]
pub struct SessionSummary {
/// Identifier of the session that was closed.
pub id: SessionId,
/// Time elapsed since the session's `started_at`, measured at close,
/// in milliseconds.
pub duration_ms: u64,
/// Value of the session's `signals_written` counter at close time.
pub signals_written: u64,
/// Value of the session's `signals_rejected` counter at close time.
pub rejections: u64,
/// `true` if this session was auto-closed by the TTL sweeper
/// rather than explicitly closed by the agent.
pub auto_closed: bool,
}
Update the existing close_session() to set auto_closed: false:
Ok(SessionSummary {
id: session_id,
duration_ms,
signals_written,
rejections,
auto_closed: false,
})
2. Internal close method that does not require SessionHandle
close_session() currently takes SessionHandle by value. The sweeper does not have a SessionHandle -- it only has the SessionId from iterating the DashMap. We need an internal variant that closes by ID.
// In tidal/src/db/sessions.rs:
impl TidalDb {
    /// Internal: close a session by ID without requiring a `SessionHandle`.
    ///
    /// Used by the TTL sweeper, by shutdown cleanup, and by the public
    /// `close_session()` (which passes `auto_closed = false`). Sets the
    /// `closed` AtomicBool on the session state to prevent further signal
    /// writes from any handle that may still be held (defense-in-depth).
    ///
    /// Steps, in order: remove the session from the active map, freeze a
    /// snapshot, persist the snapshot and delete the start record, journal
    /// the close to the WAL, evict one closed session if the cap is reached,
    /// drop the rate limiter bucket, apply the preference update, and store
    /// the snapshot in `closed_sessions`.
    ///
    /// Returns the summary with `auto_closed` set to the caller's value.
    ///
    /// # Errors
    ///
    /// Returns `TidalError::Internal` when `session_id` is not in the active
    /// session map (for example because it was already closed). Failures to
    /// persist the snapshot or to journal the close are logged as warnings
    /// and do not fail the close.
    pub(crate) fn close_session_internal(
        &self,
        session_id: SessionId,
        auto_closed: bool,
    ) -> crate::Result<SessionSummary> {
        // Removing the entry first makes the close single-shot: a concurrent
        // second close of the same ID hits the "not found" error below.
        let (_id, state) = self.sessions.remove(&session_id).ok_or_else(|| {
            TidalError::Internal(format!("session {session_id} not found (already closed?)"))
        })?;

        // Mark as closed to prevent further signal writes from any
        // outstanding SessionHandle references.
        state.closed.store(true, Ordering::Release);

        // `as_millis()` yields a u128; saturate instead of silently
        // truncating with `as u64`.
        let duration_ms =
            u64::try_from(state.started_at.elapsed().as_millis()).unwrap_or(u64::MAX);
        let signals_written = state.signals_written.load(Ordering::Relaxed);
        let rejections = state.signals_rejected.load(Ordering::Relaxed);

        let snapshot = crate::session::build_frozen_snapshot(&state, duration_ms);

        // Persist snapshot and remove the start record in a single batch.
        if let Some(storage) = self.storage.as_ref() {
            let snapshot_key = encode_key(
                EntityId::new(session_id.as_u64()),
                Tag::Session,
                b"snapshot",
            );
            let start_key = encode_key(
                EntityId::new(session_id.as_u64()),
                Tag::Session,
                b"start",
            );
            let snapshot_bytes = crate::session::serialize_snapshot(&snapshot);

            let mut batch = crate::storage::WriteBatch::new();
            batch.put(snapshot_key, snapshot_bytes);
            batch.delete(start_key);

            // Best-effort: the in-memory close still proceeds on failure.
            if let Err(e) = storage.items_engine().write_batch(batch) {
                tracing::warn!(
                    error = %e,
                    session_id = %session_id,
                    "failed to persist session close snapshot"
                );
            }
        }

        // Write session close event to WAL journal. Best-effort as well, but
        // a failure is reported instead of being silently discarded.
        if let Ok(guard) = self.wal.lock()
            && let Some(wal) = guard.as_ref()
            && let Err(e) = wal.session_close(session_id.as_u64())
        {
            tracing::warn!(
                error = %e,
                session_id = %session_id,
                "failed to journal session close"
            );
        }

        // Evict one closed session if the cap is reached. The entry with the
        // smallest session ID is the one treated as oldest.
        if self.closed_sessions.len() >= crate::session::MAX_CLOSED_SESSIONS
            && let Some(oldest_key) = self.closed_sessions.iter().map(|e| *e.key()).min()
        {
            self.closed_sessions.remove(&oldest_key);
        }

        // Clean up rate limiter bucket.
        self.rate_limiter.remove(state.agent_id.as_str(), session_id.as_u64());

        // Cross-session preference update, then retain the snapshot among
        // the closed sessions.
        let user_id = state.user_id;
        self.apply_session_preference_update(user_id, &snapshot);
        self.closed_sessions.insert(session_id, snapshot);

        // `auto_closed` is logged as a field, so the message itself stays
        // neutral: this path also serves agent-initiated closes.
        tracing::info!(
            session_id = %session_id,
            auto_closed,
            signals_written,
            duration_ms,
            "session closed"
        );

        Ok(SessionSummary {
            id: session_id,
            duration_ms,
            signals_written,
            rejections,
            auto_closed,
        })
    }
}
Refactor the existing close_session() to delegate to close_session_internal():
/// Close a session on behalf of the agent holding `handle`.
///
/// Consumes the handle, flags it as closed, and hands the actual teardown to
/// `close_session_internal()` with `auto_closed = false`.
pub fn close_session(&self, handle: SessionHandle) -> crate::Result<SessionSummary> {
    let session_id = handle.id;
    handle.closed.store(true, Ordering::Release);
    self.close_session_internal(session_id, false)
}
3. Sweeper thread
The sweeper is a simple loop: sleep 60 seconds, scan active sessions, close any that are expired. The loop checks a shutdown AtomicBool each iteration.
// In tidal/src/db/mod.rs (or a new tidal/src/db/sweeper.rs):
/// Interval between sweeper scans.
const SWEEPER_INTERVAL: std::time::Duration = std::time::Duration::from_secs(60);

/// How often the sweeper re-checks the shutdown flag while waiting for the
/// next scan. Bounds how long `join()` on the sweeper can block.
const SWEEPER_SHUTDOWN_POLL: std::time::Duration = std::time::Duration::from_secs(1);

impl TidalDb {
    /// Spawn the session TTL sweeper thread.
    ///
    /// Returns a `JoinHandle` that can be joined on shutdown.
    /// The sweeper waits `SWEEPER_INTERVAL` between scans, checking
    /// `shutdown` every `SWEEPER_SHUTDOWN_POLL`, and exits as soon as it
    /// observes the flag set to `true`.
    ///
    /// The thread owns a strong `Arc<Self>` clone, which is released only
    /// when the thread returns, i.e. after the shutdown flag has been set.
    ///
    /// # Panics
    ///
    /// Panics if the OS refuses to spawn the thread.
    pub(crate) fn spawn_sweeper(
        db: &Arc<Self>,
        shutdown: Arc<AtomicBool>,
    ) -> std::thread::JoinHandle<()> {
        let db = Arc::clone(db);
        std::thread::Builder::new()
            .name("tidaldb-session-sweeper".into())
            .spawn(move || {
                tracing::info!("session TTL sweeper started");
                loop {
                    // Interruptible wait: sleep in short slices so that
                    // shutdown is detected within one poll interval.
                    let mut waited = std::time::Duration::ZERO;
                    while waited < SWEEPER_INTERVAL {
                        // Acquire pairs with the Release store made by the
                        // shutdown path.
                        if shutdown.load(Ordering::Acquire) {
                            tracing::info!("session TTL sweeper shutting down");
                            return;
                        }
                        let nap = SWEEPER_SHUTDOWN_POLL.min(SWEEPER_INTERVAL - waited);
                        std::thread::sleep(nap);
                        waited += nap;
                    }
                    // Re-check after the last slice so no sweep starts once
                    // shutdown has been requested.
                    if shutdown.load(Ordering::Acquire) {
                        tracing::info!("session TTL sweeper shutting down");
                        return;
                    }
                    db.sweep_expired_sessions();
                }
            })
            .expect("failed to spawn session sweeper thread")
    }

    /// Scan all active sessions and close any that have exceeded their
    /// policy's `max_session_duration`.
    ///
    /// Sessions with no schema or no matching policy are never expired by
    /// this scan. Expired IDs are collected first and closed afterwards, so
    /// `close_session_internal()` never removes from `sessions` while the
    /// map is being iterated. A session closed by its agent between the scan
    /// and the close surfaces as a logged warning.
    fn sweep_expired_sessions(&self) {
        let now = std::time::Instant::now();

        let expired_ids: Vec<_> = self
            .sessions
            .iter()
            .filter(|entry| {
                let state = entry.value();
                // A session may start after `now` was captured; saturate to
                // zero rather than rely on `duration_since` in that case.
                let elapsed = now.saturating_duration_since(state.started_at);
                self.schema_def
                    .as_ref()
                    .and_then(|s| s.session_policy(&state.policy_name))
                    .is_some_and(|p| elapsed > p.max_session_duration)
            })
            .map(|entry| entry.value().id)
            .collect();

        if !expired_ids.is_empty() {
            tracing::info!(
                count = expired_ids.len(),
                "sweeper: closing expired sessions"
            );
        }

        for session_id in expired_ids {
            if let Err(e) = self.close_session_internal(session_id, true) {
                tracing::warn!(
                    error = %e,
                    session_id = %session_id,
                    "sweeper: failed to close expired session"
                );
            }
        }
    }
}
4. Wire sweeper thread into TidalDb
Add fields:
// In tidal/src/db/mod.rs, add to TidalDb:
shutdown_sweeper: Arc<AtomicBool>,
sweeper_thread: std::sync::Mutex<Option<std::thread::JoinHandle<()>>>,
Initialize in from_parts() and from_config():
shutdown_sweeper: Arc::new(AtomicBool::new(false)),
sweeper_thread: std::sync::Mutex::new(None),
Spawn after construction (in the open() method, after from_parts() returns):
// Only spawn in durable (non-ephemeral) mode:
if config.storage_mode != StorageMode::Ephemeral {
let handle = TidalDb::spawn_sweeper(&db_arc, Arc::clone(&db_arc.shutdown_sweeper));
if let Ok(mut guard) = db_arc.sweeper_thread.lock() {
*guard = Some(handle);
}
}
5. Graceful shutdown
In TidalDb::close() / shutdown_inner(), signal the sweeper to stop and join the thread:
// Signal the sweeper to stop.
self.shutdown_sweeper.store(true, Ordering::Release);

// Take the join handle out of its slot. A poisoned mutex still holds a valid
// `Option<JoinHandle>`, so recover it rather than skip the join and leave the
// thread dangling. The guard is a temporary and is released before joining.
let sweeper = self
    .sweeper_thread
    .lock()
    .unwrap_or_else(std::sync::PoisonError::into_inner)
    .take();

// Join the sweeper thread. `join()` returns `Err` only if the thread
// panicked; report that instead of discarding it.
if let Some(thread) = sweeper
    && thread.join().is_err()
{
    tracing::error!("session TTL sweeper thread panicked");
}
This must happen BEFORE closing the WAL and storage engines, because close_session_internal() (called by the sweeper) writes to the WAL and storage.
6. Sweep on shutdown
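As a final cleanup, call sweep_expired_sessions() one last time during close() to catch any sessions that expired since the last sweep. Then close any still-active sessions (non-expired ones that the agent forgot to close). Both steps must run after the sweeper thread has been joined (step 5), so they do not race with it, and before the WAL and storage engines are closed, because close_session_internal() writes to both: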
// In shutdown_inner():
// One last pass over sessions that expired after the sweeper's final scan.
self.sweep_expired_sessions();

// Everything still active is force-closed. The IDs are copied out first so
// that no iterator over `sessions` is alive while close_session_internal()
// removes entries from it (presumably required to avoid DashMap
// self-deadlock -- confirm against the DashMap docs).
let leftover_ids: Vec<SessionId> = self.sessions.iter().map(|entry| *entry.key()).collect();
leftover_ids.into_iter().for_each(|session_id| {
    if let Err(e) = self.close_session_internal(session_id, true) {
        tracing::warn!(error = %e, session_id = %session_id, "shutdown: failed to close session");
    }
});
Acceptance Criteria
- `SessionSummary.auto_closed: bool` field added
- Existing `close_session()` sets `auto_closed: false`
- `close_session_internal(session_id, auto_closed)` works without `SessionHandle`
- `close_session()` delegates to `close_session_internal(handle.id, false)`
- Sweeper thread scans every 60s (interruptible via 1s sleep intervals)
- Expired sessions detected by comparing elapsed time to `policy.max_session_duration`
- Expired sessions closed with `auto_closed: true`
- `shutdown_sweeper: AtomicBool` signals the sweeper to exit
- Sweeper joins within ~1s on `db.close()`
- No dangling threads after `db.close()` returns
- Final sweep runs during shutdown to catch sessions expired since last scan
- Remaining non-expired sessions force-closed during shutdown
- Rate limiter bucket cleaned up for auto-closed sessions
- `cargo test` passes, `cargo clippy -- -D warnings` clean
Test Strategy
// Test plan for the TTL sweeper. Each stub lists the intended steps and
// assertions; the bodies are still to be written.
#[cfg(test)]
// Presumably allowed so that test code can `unwrap()` freely -- confirm
// against the crate's lint policy.
#[allow(clippy::unwrap_used)]
mod tests {
use super::*;
#[test]
fn session_summary_auto_closed_false_by_default() {
// Open db, start session, close it normally via close_session().
// Assert summary.auto_closed == false.
}
#[test]
fn close_session_internal_sets_auto_closed() {
// Open db, start session.
// Call close_session_internal(session_id, true).
// Assert summary.auto_closed == true.
}
#[test]
fn sweeper_closes_expired_sessions() {
// Open db with a policy that has max_session_duration = 100ms.
// Start a session.
// Sleep 200ms.
// Call sweep_expired_sessions() directly (no need to test the thread).
// Assert the session was removed from the active `sessions` map.
// Assert the snapshot exists in closed_sessions.
}
#[test]
fn sweeper_does_not_close_active_sessions() {
// Open db with max_session_duration = 1 hour.
// Start a session.
// Call sweep_expired_sessions().
// Assert the session is still in the active `sessions` map.
}
#[test]
fn sweeper_thread_cancellation() {
// Open db in a durable (non-ephemeral) storage mode; the sweeper is
// only spawned when storage_mode != Ephemeral.
// Close db (signals sweeper shutdown and joins the thread).
// Assert no panic, no hanging.
// Time the close: should be < 2 seconds (1s shutdown poll + margin).
}
#[test]
fn shutdown_force_closes_remaining_sessions() {
// Open db, start 3 sessions, close none.
// Call db.close().
// Assert all 3 sessions are in closed_sessions and none remain in `sessions`.
// NOTE(review): `auto_closed` is a field of SessionSummary, while
// closed_sessions stores snapshots; asserting auto_closed == true here
// needs either the summaries from the shutdown path or a flag on the
// snapshot -- confirm which is intended.
}
#[test]
fn closed_flag_set_on_auto_close() {
// Open db, start a session, hold the SessionHandle.
// Call close_session_internal(session_id, true).
// Assert handle.closed.load() == true.
// Assert session_signal() with the handle returns error (session closed).
}
// Integration test: sweeper + rate limiter cleanup.
#[test]
fn auto_close_cleans_up_rate_limiter_bucket() {
// Open db, start a session, write a few session signals.
// Assert rate_limiter.active_buckets() >= 1.
// Call close_session_internal(session_id, true).
// Assert rate_limiter.active_buckets() == 0.
// NOTE(review): assumes the rate limiter exposes active_buckets() --
// confirm the accessor exists or add a test-only one.
}
}