tidaldb/tidal/src/entities/preference.rs
jordan 192c473f55 feat: complete Milestone 5 — full-text search, RRF fusion, and creator search
- M5p1: BM25 text indexing via Tantivy with background syncer (0.26ms @ 10K docs)
- M5p2: RRF fusion layer combining BM25 + ANN scores (46µs @ 1K candidates)
- M5p3: unified Search query API (8-stage pipeline, BM25 + vector + ranking)
- M5p4: creator text + vector indexing and creator search executor (< 20ms @ 200 creators)
- Refactor db/mod.rs into focused sub-modules (creators, items, sessions, signals, etc.)
- Decompose monolithic files into directory modules (query/executor, ranking/diversity, etc.)
- Split brute.rs → brute/mod.rs + brute/tests.rs; extract search executor helpers
- Add benches: fusion, search, session, text_index
- Add M5 UAT test suites (m5_uat, m5_search, m5p4_creator_search, text_index)
- Update blog posts, roadmap, content strategy, and M5 planning docs
- Add tmp/ and .claude/worktrees/ to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 23:53:16 -07:00

261 lines
8.5 KiB
Rust

//! Preference vector: per-user taste embedding with L2 normalization invariant.
//!
//! Tracks user taste by maintaining a preference vector that evolves with
//! interactions. The vector is L2-normalized on every update to ensure
//! consistent cosine similarity scoring during personalized ranking.
use dashmap::DashMap;
/// Store of per-user preference vectors, each kept L2-normalized.
///
/// A user's vector evolves via exponential moving average: each new
/// interaction embedding is blended with the current preference using the
/// configured learning rate, and the result is re-normalized.
///
/// Backed by a `DashMap`, so every method takes `&self` and the store can be
/// shared across threads. `DashMap` is sharded internally: operations on users
/// whose keys land in different shards do not block each other, while users
/// that hash to the same shard may briefly contend.
#[derive(Debug)]
pub struct PreferenceVectors {
    /// `user_id` -> normalized preference vector.
    inner: DashMap<u64, Vec<f32>>,
    /// Dimensionality of the embedding space. All vectors must have this length.
    dim: usize,
    /// Learning rate for exponential moving average updates.
    /// Default: 0.1 (the new interaction contributes 10% to the updated preference).
    learning_rate: f32,
}
impl PreferenceVectors {
    /// Learning rate used by `new`: each interaction contributes 10%.
    const DEFAULT_LEARNING_RATE: f32 = 0.1;

    /// Create an empty preference store for embeddings of length `dim`, using
    /// the default learning rate of 0.1.
    #[must_use]
    pub fn new(dim: usize) -> Self {
        Self::with_learning_rate(dim, Self::DEFAULT_LEARNING_RATE)
    }

    /// Create an empty preference store with a custom learning rate.
    ///
    /// `learning_rate` is the weight given to each new interaction in
    /// [`update`](Self::update). It is stored without validation; the blend
    /// `(1 - lr) * pref + lr * interaction` is only a weighted average for
    /// values in `0.0..=1.0`, so callers should stay within that range.
    #[must_use]
    pub fn with_learning_rate(dim: usize, learning_rate: f32) -> Self {
        Self {
            inner: DashMap::new(),
            dim,
            learning_rate,
        }
    }

    /// Whether `embedding` has the configured length and only finite components.
    ///
    /// A single NaN or infinite component would survive normalization as NaN
    /// and then contaminate every later blend for that user, so such input is
    /// refused up front.
    fn accepts(&self, embedding: &[f32]) -> bool {
        embedding.len() == self.dim && embedding.iter().all(|x| x.is_finite())
    }

    /// Get the current preference vector for a user (cloned).
    ///
    /// Returns `None` if no preference has been recorded.
    #[must_use]
    pub fn get(&self, user_id: u64) -> Option<Vec<f32>> {
        self.inner.get(&user_id).map(|entry| entry.value().clone())
    }

    /// Set the preference vector directly (e.g., from cold-start initialization).
    ///
    /// The vector is L2-normalized before storage, replacing any previous
    /// preference for `user_id`.
    ///
    /// Returns `false` (and stores nothing) if the dimension does not match or
    /// the vector contains a NaN or infinite component.
    #[must_use]
    pub fn set(&self, user_id: u64, mut vec: Vec<f32>) -> bool {
        if !self.accepts(&vec) {
            return false;
        }
        l2_normalize(&mut vec);
        self.inner.insert(user_id, vec);
        true
    }

    /// Update a user's preference vector by blending with an interaction embedding.
    ///
    /// Uses exponential moving average:
    /// `pref = (1 - lr) * pref + lr * interaction`
    /// then L2-normalizes the result. The interaction embedding is blended
    /// as given (it is not normalized first), so its magnitude influences how
    /// strongly it pulls the preference.
    ///
    /// If no preference exists yet, the interaction embedding becomes the
    /// initial preference (after normalization) without any blending.
    ///
    /// Returns `false` (leaving the stored preference untouched) if the
    /// interaction embedding has the wrong dimension or contains a NaN or
    /// infinite component.
    #[must_use]
    pub fn update(&self, user_id: u64, interaction_embedding: &[f32]) -> bool {
        use dashmap::mapref::entry::Entry;

        if !self.accepts(interaction_embedding) {
            return false;
        }
        let lr = self.learning_rate;
        // The entry API decides "blend" vs "first insert" in a single lookup,
        // so a brand-new vector is never blended with itself.
        match self.inner.entry(user_id) {
            Entry::Occupied(mut occupied) => {
                let pref = occupied.get_mut();
                for (p, &x) in pref.iter_mut().zip(interaction_embedding) {
                    // (1 - lr) * p + lr * x, with a fused multiply-add.
                    *p = (1.0 - lr).mul_add(*p, lr * x);
                }
                l2_normalize(pref);
            }
            Entry::Vacant(vacant) => {
                let mut initial = interaction_embedding.to_vec();
                l2_normalize(&mut initial);
                vacant.insert(initial);
            }
        }
        true
    }

    /// Compute cosine similarity between a user's preference and a candidate embedding.
    ///
    /// The stored preference is kept L2-normalized, so only the candidate is
    /// normalized on the fly; callers do not need to pre-normalize. The result
    /// is clamped to `[-1.0, 1.0]` to absorb floating-point rounding.
    ///
    /// Returns `None` if the candidate dimension does not match, if the
    /// candidate contains non-finite values (or its squared norm overflows
    /// `f32`), or if the user has no preference vector. Returns `Some(0.0)`
    /// for a candidate of (near-)zero magnitude.
    #[must_use]
    // The map guard has to stay alive while the dot product reads the stored vector.
    #[allow(clippy::significant_drop_tightening)]
    pub fn cosine_similarity(&self, user_id: u64, candidate: &[f32]) -> Option<f32> {
        if candidate.len() != self.dim {
            return None;
        }
        // Done before the lookup so that no map guard is held during this pass.
        let candidate_norm = candidate.iter().map(|x| x * x).sum::<f32>().sqrt();
        if !candidate_norm.is_finite() {
            // NaN/inf input: refuse rather than hand a NaN score to the caller.
            return None;
        }
        let pref = self.inner.get(&user_id)?;
        if candidate_norm < f32::EPSILON {
            return Some(0.0);
        }
        let dot: f32 = pref.iter().zip(candidate).map(|(a, b)| a * b).sum();
        // The preference side is already unit-length, so dividing by the
        // candidate's norm alone yields the cosine.
        Some((dot / candidate_norm).clamp(-1.0, 1.0))
    }

    /// Number of users with stored preferences.
    #[must_use]
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Whether no preferences are stored.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }
}
/// L2-normalize a vector in place so that its Euclidean length becomes 1.
///
/// Components are first rescaled by the largest absolute component before the
/// norm is taken. Squaring raw `f32` values underflows to zero for very small
/// components and overflows to infinity for very large ones; rescaling keeps
/// every squared term in `[0, 1]`, so any non-zero finite vector is normalized
/// correctly regardless of its magnitude.
///
/// The vector is left untouched when it is empty, has zero magnitude, or
/// contains a NaN or infinite component (no meaningful direction exists).
fn l2_normalize(vec: &mut [f32]) {
    // `f32::max` returns the non-NaN operand, so NaN components do not affect
    // `max_abs`; they are caught by the finiteness check on the norm below.
    let max_abs = vec.iter().fold(0.0_f32, |acc, x| acc.max(x.abs()));
    if max_abs <= 0.0 || !max_abs.is_finite() {
        return;
    }
    let scaled_norm = vec
        .iter()
        .map(|x| {
            let scaled = x / max_abs;
            scaled * scaled
        })
        .sum::<f32>()
        .sqrt();
    if !scaled_norm.is_finite() {
        return;
    }
    // Dividing in two steps avoids forming `max_abs * scaled_norm`, which
    // could itself overflow for very large inputs.
    for v in vec.iter_mut() {
        *v = (*v / max_abs) / scaled_norm;
    }
}
// Unit and property tests for the preference store and its normalization helper.
#[cfg(test)]
// `unwrap` and exact float comparison are acceptable in test assertions.
#[allow(clippy::unwrap_used, clippy::float_cmp)]
mod tests {
    use super::*;

    /// `set` stores the L2-normalized form of the input.
    #[test]
    fn set_and_get() {
        let pv = PreferenceVectors::new(3);
        assert!(pv.set(1, vec![3.0, 4.0, 0.0]));
        let v = pv.get(1).unwrap();
        // 3/5, 4/5, 0
        assert!((v[0] - 0.6).abs() < 1e-6);
        assert!((v[1] - 0.8).abs() < 1e-6);
        assert!((v[2] - 0.0).abs() < 1e-6);
    }

    /// A vector of the wrong length is refused and nothing is stored.
    #[test]
    fn set_wrong_dim_rejected() {
        let pv = PreferenceVectors::new(3);
        assert!(!pv.set(1, vec![1.0, 2.0]));
        assert!(pv.get(1).is_none());
    }

    /// The first `update` for a user installs the interaction as the preference.
    #[test]
    fn update_creates_initial() {
        let pv = PreferenceVectors::new(3);
        assert!(pv.update(1, &[1.0, 0.0, 0.0]));
        let v = pv.get(1).unwrap();
        assert!((v[0] - 1.0).abs() < 1e-6);
    }

    /// An interaction of the wrong length is refused and nothing is stored.
    #[test]
    fn update_wrong_dim_rejected() {
        let pv = PreferenceVectors::new(3);
        assert!(!pv.update(1, &[1.0, 0.0]));
        assert!(pv.get(1).is_none());
        assert!(pv.is_empty());
    }

    /// A later `update` blends with the stored preference and re-normalizes.
    #[test]
    fn update_blends() {
        let pv = PreferenceVectors::with_learning_rate(2, 0.5);
        assert!(pv.set(1, vec![1.0, 0.0]));
        assert!(pv.update(1, &[0.0, 1.0]));
        let v = pv.get(1).unwrap();
        // After blend: (0.5, 0.5), normalized: (1/sqrt(2), 1/sqrt(2))
        let expected = 1.0 / 2.0f32.sqrt();
        assert!((v[0] - expected).abs() < 1e-5);
        assert!((v[1] - expected).abs() < 1e-5);
    }

    /// Identical direction gives 1.0, orthogonal direction gives 0.0.
    #[test]
    fn cosine_similarity_normalized() {
        let pv = PreferenceVectors::new(3);
        assert!(pv.set(1, vec![1.0, 0.0, 0.0]));
        // Cosine with self = 1.0
        let sim = pv.cosine_similarity(1, &[1.0, 0.0, 0.0]).unwrap();
        assert!((sim - 1.0).abs() < 1e-6);
        // Orthogonal = 0.0
        let sim = pv.cosine_similarity(1, &[0.0, 1.0, 0.0]).unwrap();
        assert!(sim.abs() < 1e-6);
    }

    /// The candidate's magnitude must not affect the similarity.
    #[test]
    fn cosine_similarity_unnormalized_candidate() {
        let pv = PreferenceVectors::new(3);
        assert!(pv.set(1, vec![1.0, 0.0, 0.0]));
        let sim = pv.cosine_similarity(1, &[5.0, 0.0, 0.0]).unwrap();
        assert!((sim - 1.0).abs() < 1e-6);
    }

    /// A zero-magnitude candidate yields a similarity of exactly 0.0.
    #[test]
    fn cosine_similarity_zero_candidate() {
        let pv = PreferenceVectors::new(3);
        assert!(pv.set(1, vec![1.0, 0.0, 0.0]));
        assert_eq!(pv.cosine_similarity(1, &[0.0, 0.0, 0.0]), Some(0.0));
    }

    /// A candidate of the wrong length yields `None` even for a known user.
    #[test]
    fn cosine_similarity_wrong_dim() {
        let pv = PreferenceVectors::new(3);
        assert!(pv.set(1, vec![1.0, 0.0, 0.0]));
        assert!(pv.cosine_similarity(1, &[1.0, 0.0]).is_none());
    }

    /// An unknown user yields `None`.
    #[test]
    fn cosine_similarity_no_pref() {
        let pv = PreferenceVectors::new(3);
        assert!(pv.cosine_similarity(1, &[1.0, 0.0, 0.0]).is_none());
    }

    /// Normalizing the zero vector leaves it as all zeros.
    #[test]
    fn l2_normalize_zero_vec() {
        let mut v = vec![0.0f32, 0.0, 0.0];
        l2_normalize(&mut v);
        assert!(v.iter().all(|&x| x == 0.0));
    }

    /// `len` / `is_empty` track the number of stored users.
    #[test]
    fn len_and_is_empty() {
        let pv = PreferenceVectors::new(3);
        assert!(pv.is_empty());
        assert_eq!(pv.len(), 0);
        assert!(pv.set(1, vec![1.0, 0.0, 0.0]));
        assert!(!pv.is_empty());
        assert_eq!(pv.len(), 1);
    }

    mod proptests {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            /// After any sequence of updates, the L2 norm stays approximately 1.0.
            #[test]
            fn l2_norm_invariant(
                updates in proptest::collection::vec(
                    proptest::collection::vec(-1.0f32..1.0f32, 4..=4),
                    1..20
                ),
            ) {
                let pv = PreferenceVectors::new(4);
                for emb in &updates {
                    // Every generated embedding has the right length and is
                    // finite, so each update must be accepted.
                    prop_assert!(pv.update(1, emb));
                }
                let v = pv.get(1).unwrap();
                let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
                // After a sequence of updates, the vector should be unit-length
                // (within floating-point tolerance) or (near-)zero if the inputs
                // collapse to the origin.
                prop_assert!(
                    (norm - 1.0).abs() < 1e-4 || norm <= f32::EPSILON,
                    "norm was {norm}, expected ~1.0"
                );
            }
        }
    }
}