tidaldb/applications/forage/engine/src/seed.rs
2026-02-23 22:41:16 -07:00

951 lines
29 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use std::collections::HashMap;
use tidaldb::TidalDb;
use tidaldb::schema::{EntityId, Timestamp};
/// One article in the content corpus.
///
/// `Debug`, `PartialEq` and `Eq` are derived in addition to `Clone` so items
/// can be logged and compared directly in tests and assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SeedItem {
    /// Numeric item identifier.
    pub id: u64,
    /// Headline shown for the article.
    pub title: String,
    /// Publishing site, stored as a host name (e.g. `"nature.com"`).
    pub source: String,
    /// Topic label for the article (e.g. `"jazz"`).
    pub category: String,
    /// Estimated reading time, in minutes.
    pub reading_time_min: u32,
    /// One-sentence summary of the article.
    pub description: String,
    /// Link to the article.
    pub url: String,
}
/// Derives a stable item ID from a URL with the 64-bit FNV-1a hash.
///
/// The same URL always yields the same ID. The result is always greater than
/// 100_000, which keeps it clear of the seed corpus IDs (1-100): the rare
/// hash that falls at or below 100_000 is shifted up by 100_001.
pub fn url_to_item_id(url: &str) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 14_695_981_039_346_656_037;
    const FNV_PRIME: u64 = 1_099_511_628_211;
    // Largest value treated as too close to the seed ID range.
    const RESERVED_CEILING: u64 = 100_000;

    // FNV-1a: XOR the byte in first, then multiply by the prime.
    let digest = url.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });

    if digest > RESERVED_CEILING {
        digest
    } else {
        // Lands in 100_001..=200_001, so this addition can never wrap.
        digest.wrapping_add(RESERVED_CEILING + 1)
    }
}
/// Known content categories.
///
/// Order matters: `category_dim` returns a category's position in this list,
/// and `category_vector` uses that position as the index of the dominant
/// component in its 8-element vector. Reordering entries therefore changes
/// which axis each category maps to.
const CATEGORIES: &[&str] = &[
"technology",
"science",
"jazz",
"travel",
"cooking",
"design",
"history",
"health",
];
/// Returns the index of `cat` within `CATEGORIES`.
///
/// A category that is not in the list falls back to index 0 rather than
/// producing an error.
fn category_dim(cat: &str) -> usize {
    for (axis, name) in CATEGORIES.iter().enumerate() {
        if *name == cat {
            return axis;
        }
    }
    0
}
/// Builds the 8-dimensional embedding for an item: a vector dominated by its
/// category's axis, with a small deterministic per-item jitter, normalised to
/// unit length.
///
/// * `cat` - category name; it selects the dominant axis via `category_dim`,
///   so an unrecognised category lands on axis 0.
/// * `item_id` - seeds the jitter, so the same `(cat, item_id)` pair always
///   produces the same vector.
///
/// The jitter added to each component lies in `[-0.1, 0.1]`. Earlier the LCG
/// output (31 bits after `>> 33`) was divided by `u32::MAX`, which confined
/// the ratio to `[0, 0.5)` and made every jitter value negative; it is now
/// divided by 2^31 so the jitter is centred on zero.
pub fn category_vector(cat: &str, item_id: u64) -> Vec<f32> {
    const LCG_MULTIPLIER: u64 = 6_364_136_223_846_793_005;
    // `state >> 33` keeps the top 31 bits, so raw outputs span 0..2^31.
    const LCG_OUTPUT_SPAN: f32 = (1u64 << 31) as f32;
    // Total width of the jitter interval (i.e. +/- 0.1 around zero).
    const JITTER_WIDTH: f32 = 0.2;

    let dim = category_dim(cat);
    let mut v = vec![0.0f32; 8];
    v[dim] = 0.9;

    // Deterministic LCG seeded from the item id; one step is taken up front
    // and one more before each component is perturbed.
    let mut state = item_id.wrapping_mul(LCG_MULTIPLIER).wrapping_add(1);
    for x in &mut v {
        state = state.wrapping_mul(LCG_MULTIPLIER).wrapping_add(1);
        let unit = (state >> 33) as f32 / LCG_OUTPUT_SPAN;
        *x += (unit - 0.5) * JITTER_WIDTH;
    }

    l2_normalize(&mut v);
    v
}
/// Scales `v` in place to unit Euclidean length.
///
/// Vectors whose norm is not greater than `1e-9` (including all-zero and
/// empty slices) are left untouched, which avoids dividing by zero.
fn l2_normalize(v: &mut [f32]) {
    let mut sum_of_squares = 0.0f32;
    for component in v.iter() {
        sum_of_squares += component * component;
    }

    let norm = sum_of_squares.sqrt();
    if norm > 1e-9 {
        v.iter_mut().for_each(|component| *component /= norm);
    }
}
/// Returns the built-in seed corpus: 100 articles across eight categories.
///
/// IDs are assigned from table position, starting at 1, so the order of the
/// rows below is significant: the first seven categories hold 12 articles
/// each (IDs 1-84) and `health` holds the remaining 16 (IDs 85-100).
pub fn default_corpus() -> Vec<SeedItem> {
    /// (title, source, category, reading time in minutes, description, url)
    type Row = (
        &'static str,
        &'static str,
        &'static str,
        u32,
        &'static str,
        &'static str,
    );

    const ROWS: &[Row] = &[
        // technology (ids 1-12)
        (
            "Rust's Ownership Model Explained",
            "blog.rustlang.org",
            "technology",
            8,
            "A deep dive into Rust's ownership, borrowing, and lifetimes.",
            "https://blog.rustlang.org/ownership",
        ),
        (
            "Building Fast APIs with Axum",
            "tokio.rs",
            "technology",
            6,
            "How to build high-performance HTTP APIs with Axum and Tokio.",
            "https://tokio.rs/axum-guide",
        ),
        (
            "WebAssembly in 2025",
            "webassembly.org",
            "technology",
            10,
            "State of the WebAssembly ecosystem and what's coming next.",
            "https://webassembly.org/2025",
        ),
        (
            "LLMs Are Not Magic",
            "ml.substack.com",
            "technology",
            7,
            "Demystifying large language models for engineers.",
            "https://ml.substack.com/llms",
        ),
        (
            "Kubernetes Cost Optimization",
            "infra.dev",
            "technology",
            9,
            "Practical techniques for cutting your cloud bill.",
            "https://infra.dev/k8s-costs",
        ),
        (
            "Database Internals: LSM Trees",
            "db-internals.com",
            "technology",
            12,
            "How log-structured merge trees power modern databases.",
            "https://db-internals.com/lsm",
        ),
        (
            "TypeScript 5.5 Features",
            "devblog.ts",
            "technology",
            5,
            "Everything new in TypeScript 5.5 with code examples.",
            "https://devblog.ts/5.5",
        ),
        (
            "Zero-Copy Networking in Linux",
            "lwn.net",
            "technology",
            11,
            "io_uring, splice, and the future of zero-copy I/O.",
            "https://lwn.net/zero-copy",
        ),
        (
            "AI Code Assistants Compared",
            "dev.to",
            "technology",
            6,
            "GitHub Copilot vs Cursor vs Codeium: real-world benchmarks.",
            "https://dev.to/ai-assistants",
        ),
        (
            "Serverless Is Not Stateless",
            "serverless.blog",
            "technology",
            8,
            "Why serverless architectures still need to manage state.",
            "https://serverless.blog/state",
        ),
        (
            "Open Source Observability Stack",
            "grafana.com",
            "technology",
            7,
            "Building a full observability pipeline with OSS tools.",
            "https://grafana.com/oss-stack",
        ),
        (
            "Columnar Storage for Analytics",
            "clickhouse.blog",
            "technology",
            9,
            "Why column-oriented storage dominates analytical workloads.",
            "https://clickhouse.blog/columnar",
        ),
        // science (ids 13-24)
        (
            "The Dark Matter Problem",
            "physics.org",
            "science",
            9,
            "What we know — and don't know — about dark matter.",
            "https://physics.org/dark-matter",
        ),
        (
            "CRISPR Off-Target Effects",
            "nature.com",
            "science",
            11,
            "New research on precision genome editing and its risks.",
            "https://nature.com/crispr",
        ),
        (
            "Ocean Acidification Accelerates",
            "oceanresearch.edu",
            "science",
            8,
            "Latest measurements show faster-than-predicted pH decline.",
            "https://oceanresearch.edu/acidification",
        ),
        (
            "Quantum Entanglement at Scale",
            "quanta.mag",
            "science",
            10,
            "Experiments demonstrating entanglement across 100km distances.",
            "https://quanta.mag/entanglement",
        ),
        (
            "Antibiotic Resistance Crisis",
            "who.int",
            "science",
            7,
            "The global health emergency that antibiotics are losing.",
            "https://who.int/amr",
        ),
        (
            "How Sleep Cleans Your Brain",
            "neuroscience.review",
            "science",
            6,
            "The glymphatic system and why sleep deprivation is dangerous.",
            "https://neuroscience.review/sleep",
        ),
        (
            "Mars Soil Composition Update",
            "nasa.gov",
            "science",
            9,
            "Perseverance rover findings on Martian mineralogy.",
            "https://nasa.gov/mars-soil",
        ),
        (
            "Photosynthesis Efficiency Breakthrough",
            "biotech.news",
            "science",
            8,
            "Engineers achieve 12% improvement in plant energy conversion.",
            "https://biotech.news/photosynthesis",
        ),
        (
            "The Gut-Brain Axis",
            "microbiome.journal",
            "science",
            10,
            "How intestinal bacteria influence mood and cognition.",
            "https://microbiome.journal/gut-brain",
        ),
        (
            "Superconductors at Room Temperature?",
            "condensed.matter.news",
            "science",
            12,
            "Examining the controversial LK-99 claims and new research.",
            "https://condensed.matter.news/superconductors",
        ),
        (
            "Mitochondrial DNA Inheritance",
            "genetics.lab",
            "science",
            7,
            "Paternal inheritance of mitochondria: exceptions to the rule.",
            "https://genetics.lab/mitochondria",
        ),
        (
            "James Webb Telescope: Year Two",
            "space.telescope.com",
            "science",
            11,
            "Key discoveries from JWST's second year of observations.",
            "https://space.telescope.com/jwst-y2",
        ),
        // jazz (ids 25-36)
        (
            "Miles Davis and Modal Jazz",
            "jazzhistory.com",
            "jazz",
            7,
            "How Kind of Blue changed everything about jazz harmony.",
            "https://jazzhistory.com/miles-modal",
        ),
        (
            "The Art of Improvisation",
            "berklee.edu",
            "jazz",
            9,
            "Techniques for developing a personal improvisational voice.",
            "https://berklee.edu/improvisation",
        ),
        (
            "John Coltrane's Sheets of Sound",
            "jazztimes.com",
            "jazz",
            8,
            "Analysis of Coltrane's density-of-notes approach.",
            "https://jazztimes.com/coltrane",
        ),
        (
            "Thelonious Monk's Rhythmic Genius",
            "allaboutjazz.com",
            "jazz",
            6,
            "How Monk's unconventional rhythms shaped modern piano.",
            "https://allaboutjazz.com/monk",
        ),
        (
            "Bill Evans and Impressionism",
            "piano.quarterly",
            "jazz",
            10,
            "The harmonic language of Bill Evans and its classical roots.",
            "https://piano.quarterly/evans",
        ),
        (
            "Free Jazz: Ornette Coleman",
            "freeform.press",
            "jazz",
            8,
            "Liberation from chord changes and its lasting influence.",
            "https://freeform.press/coleman",
        ),
        (
            "Brazilian Jazz Fusion",
            "musica.rio",
            "jazz",
            7,
            "How bossa nova and samba shaped modern jazz harmony.",
            "https://musica.rio/fusion",
        ),
        (
            "The Jazz Rhythm Section",
            "drummerworld.com",
            "jazz",
            5,
            "Roles and interactions between bass, drums, and piano.",
            "https://drummerworld.com/rhythm-section",
        ),
        (
            "Hard Bop's Social Context",
            "civilrights.music",
            "jazz",
            9,
            "How Art Blakey's music reflected the civil rights era.",
            "https://civilrights.music/hardbop",
        ),
        (
            "Vocal Jazz: Sarah Vaughan",
            "singing.academy",
            "jazz",
            6,
            "Technique, range, and phrasing in Vaughan's recordings.",
            "https://singing.academy/vaughan",
        ),
        (
            "Electric Miles: Bitches Brew",
            "fusion.history",
            "jazz",
            11,
            "The recording that launched jazz fusion.",
            "https://fusion.history/bitches-brew",
        ),
        (
            "Jazz Standards and Their Stories",
            "greatamericansongbook.com",
            "jazz",
            8,
            "Origins of the most-played jazz compositions.",
            "https://greatamericansongbook.com/standards",
        ),
        // travel (ids 37-48)
        (
            "Solo Travel in Patagonia",
            "lonelyhiker.com",
            "travel",
            9,
            "A week on the W Trek: logistics, gear, and memories.",
            "https://lonelyhiker.com/patagonia",
        ),
        (
            "Japan by Rail Pass",
            "japantravel.guide",
            "travel",
            10,
            "How to maximize the JR Pass for a two-week Japan trip.",
            "https://japantravel.guide/rail",
        ),
        (
            "Hidden Gems of Portugal",
            "slowtravel.eu",
            "travel",
            7,
            "Beyond Lisbon: the Alentejo region's villages and wine.",
            "https://slowtravel.eu/portugal",
        ),
        (
            "Crossing the Sahara",
            "overland.adventures",
            "travel",
            12,
            "A guide to trans-Saharan routes for experienced overlanders.",
            "https://overland.adventures/sahara",
        ),
        (
            "Southeast Asia on $40/Day",
            "budgetbacker.com",
            "travel",
            8,
            "Real costs and tips for Thailand, Vietnam, and Cambodia.",
            "https://budgetbacker.com/sea",
        ),
        (
            "Antarctic Expedition Diary",
            "polartravels.net",
            "travel",
            11,
            "Fourteen days on the white continent, firsthand.",
            "https://polartravels.net/antarctica",
        ),
        (
            "Urban Photography: Tokyo Streets",
            "streetphoto.jp",
            "travel",
            6,
            "Finding the best photogenic spots in Tokyo's neighborhoods.",
            "https://streetphoto.jp/tokyo",
        ),
        (
            "Road Tripping New Zealand",
            "campervanlife.nz",
            "travel",
            9,
            "South Island in a campervan: route and practical tips.",
            "https://campervanlife.nz/south-island",
        ),
        (
            "The Camino de Santiago Experience",
            "pilgrim.path",
            "travel",
            8,
            "What nobody tells you about the Camino Frances.",
            "https://pilgrim.path/camino",
        ),
        (
            "Morocco's Imperial Cities",
            "wanderlust.maroc",
            "travel",
            7,
            "Fez, Marrakech, Meknes, and Rabat in one trip.",
            "https://wanderlust.maroc/imperial",
        ),
        (
            "Slow Travel in Sicily",
            "mediterranean.wander",
            "travel",
            9,
            "Food, history, and beaches on a three-week Sicily circuit.",
            "https://mediterranean.wander/sicily",
        ),
        (
            "Iceland in January",
            "nordic.adventures",
            "travel",
            10,
            "Northern lights, geysers, and surviving the darkness.",
            "https://nordic.adventures/iceland-winter",
        ),
        // cooking (ids 49-60)
        (
            "The Maillard Reaction Explained",
            "scienceofeating.com",
            "cooking",
            6,
            "Why browning makes food taste better: the chemistry.",
            "https://scienceofeating.com/maillard",
        ),
        (
            "Sourdough from Scratch",
            "breadbakers.club",
            "cooking",
            11,
            "Building a starter and baking your first sourdough loaf.",
            "https://breadbakers.club/sourdough",
        ),
        (
            "Japanese Knife Skills",
            "culinary.tokyo",
            "cooking",
            8,
            "Mastering the santoku and nakiri for home cooks.",
            "https://culinary.tokyo/knives",
        ),
        (
            "Fermentation Fundamentals",
            "cultured.foods",
            "cooking",
            9,
            "Kimchi, miso, and kombucha: the science of fermentation.",
            "https://cultured.foods/fermentation",
        ),
        (
            "French Mother Sauces",
            "cordonbleu.tips",
            "cooking",
            7,
            "The five foundations of classical French cuisine.",
            "https://cordonbleu.tips/sauces",
        ),
        (
            "Cooking with Cast Iron",
            "homestead.kitchen",
            "cooking",
            5,
            "Seasoning, maintenance, and what cast iron does best.",
            "https://homestead.kitchen/cast-iron",
        ),
        (
            "Ramen Broth Deep Dive",
            "ramen.lab",
            "cooking",
            12,
            "Tonkotsu, shio, shoyu, miso: building each from scratch.",
            "https://ramen.lab/broths",
        ),
        (
            "Spice Blends of the World",
            "spiceroute.org",
            "cooking",
            7,
            "Za'atar, garam masala, berbere, and how to use them.",
            "https://spiceroute.org/blends",
        ),
        (
            "Chocolate Tempering Technique",
            "chocolatier.craft",
            "cooking",
            8,
            "Achieving snap, shine, and bloom resistance in chocolate.",
            "https://chocolatier.craft/tempering",
        ),
        (
            "Plant-Based Protein Cooking",
            "vegkitchen.io",
            "cooking",
            6,
            "Tempeh, seitan, and lentils: maximizing flavor and texture.",
            "https://vegkitchen.io/protein",
        ),
        (
            "Wine Pairing Fundamentals",
            "sommelier.school",
            "cooking",
            9,
            "How acidity, tannin, and weight guide food-wine pairing.",
            "https://sommelier.school/pairing",
        ),
        (
            "Knife Sharpening Guide",
            "bladecraft.com",
            "cooking",
            5,
            "Whetstones, strops, and maintaining a razor edge.",
            "https://bladecraft.com/sharpening",
        ),
        // design (ids 61-72)
        (
            "Typography Rules Everyone Breaks",
            "typestudio.co",
            "design",
            7,
            "The ten typography mistakes even experienced designers make.",
            "https://typestudio.co/rules",
        ),
        (
            "Dieter Rams: Ten Principles",
            "vitsoe.com",
            "design",
            8,
            "The philosophy behind the world's most influential designer.",
            "https://vitsoe.com/rams",
        ),
        (
            "Color Theory for UI Designers",
            "smashingmagazine.com",
            "design",
            9,
            "How to build accessible, beautiful color systems.",
            "https://smashingmagazine.com/color",
        ),
        (
            "The Grid System in Print and Web",
            "gridness.net",
            "design",
            6,
            "How 12-column grids evolved from Swiss typography.",
            "https://gridness.net/history",
        ),
        (
            "Icon Design Best Practices",
            "icons8.com",
            "design",
            5,
            "Clarity, consistency, and the right level of detail.",
            "https://icons8.com/design-guide",
        ),
        (
            "Motion Design Principles",
            "motion.design",
            "design",
            8,
            "Timing, easing, and purposeful animation in UI.",
            "https://motion.design/principles",
        ),
        (
            "Design Systems at Scale",
            "figma.design",
            "design",
            10,
            "How Figma, Shopify, and IBM manage design systems.",
            "https://figma.design/systems",
        ),
        (
            "The Psychology of Negative Space",
            "whitespace.studio",
            "design",
            6,
            "Why what you leave out matters more than what you include.",
            "https://whitespace.studio/negative",
        ),
        (
            "Brutalist Web Design Revival",
            "brutalist.web",
            "design",
            7,
            "Raw HTML aesthetics and their place in modern web design.",
            "https://brutalist.web/revival",
        ),
        (
            "Human Factors in Interface Design",
            "nngroup.com",
            "design",
            11,
            "Applying cognitive science to create usable interfaces.",
            "https://nngroup.com/human-factors",
        ),
        (
            "Hand-Lettering for Beginners",
            "letteringco.com",
            "design",
            6,
            "Tools, techniques, and practice drills to start lettering.",
            "https://letteringco.com/beginners",
        ),
        (
            "Accessible Design Checklist",
            "a11y.project",
            "design",
            8,
            "WCAG 2.2 requirements translated into actionable steps.",
            "https://a11y.project/checklist",
        ),
        // history (ids 73-84)
        (
            "The Fall of Constantinople",
            "byzantine.history",
            "history",
            10,
            "1453 and the end of the Eastern Roman Empire.",
            "https://byzantine.history/1453",
        ),
        (
            "How the Printing Press Changed Europe",
            "gutenberg.archive",
            "history",
            8,
            "From scarcity to information explosion in 50 years.",
            "https://gutenberg.archive/press",
        ),
        (
            "Silk Road Trade Networks",
            "ancientroads.org",
            "history",
            9,
            "How goods, ideas, and disease traveled between civilizations.",
            "https://ancientroads.org/silk-road",
        ),
        (
            "The Irish Famine: Causes and Response",
            "ireland.history",
            "history",
            11,
            "Examining the political economy of the Great Hunger.",
            "https://ireland.history/famine",
        ),
        (
            "Genghis Khan's Administrative Genius",
            "mongolia.chronicle",
            "history",
            9,
            "How the Mongol Empire managed the largest land empire.",
            "https://mongolia.chronicle/admin",
        ),
        (
            "The Haitian Revolution",
            "caribbean.history",
            "history",
            10,
            "The only successful slave revolt in history and its legacy.",
            "https://caribbean.history/haiti",
        ),
        (
            "Victorian-Era Technological Anxiety",
            "modernism.review",
            "history",
            7,
            "How the 19th century responded to rapid industrialization.",
            "https://modernism.review/victorian",
        ),
        (
            "The Origins of Money",
            "economic.anthropology",
            "history",
            8,
            "From barter myths to commodity money to fiat currency.",
            "https://economic.anthropology/money",
        ),
        (
            "Medieval Islamic Science",
            "scholar.islam",
            "history",
            9,
            "How the House of Wisdom preserved and advanced knowledge.",
            "https://scholar.islam/golden-age",
        ),
        (
            "World War I: The July Crisis",
            "ww1.centenary",
            "history",
            12,
            "The 37 days that turned assassination into world war.",
            "https://ww1.centenary/july",
        ),
        (
            "The Scramble for Africa",
            "colonial.history",
            "history",
            10,
            "The Berlin Conference and partition of a continent.",
            "https://colonial.history/africa",
        ),
        (
            "Ancient Roman Supply Chains",
            "logistics.antiquity",
            "history",
            7,
            "How Rome fed a million-person city before refrigeration.",
            "https://logistics.antiquity/rome",
        ),
        // health (ids 85-100, 16 items)
        (
            "Zone 2 Cardio Explained",
            "endurance.science",
            "health",
            7,
            "Why low-intensity aerobic work is the foundation of fitness.",
            "https://endurance.science/zone2",
        ),
        (
            "Sleep Hygiene That Actually Works",
            "sleepdoctor.com",
            "health",
            6,
            "Evidence-based habits for better sleep quality.",
            "https://sleepdoctor.com/hygiene",
        ),
        (
            "Resistance Training for Longevity",
            "muscleandlife.com",
            "health",
            9,
            "How strength training slows aging at the cellular level.",
            "https://muscleandlife.com/longevity",
        ),
        (
            "The Science of Intermittent Fasting",
            "metabolic.health",
            "health",
            8,
            "What IF actually does — and doesn't do — to your body.",
            "https://metabolic.health/if",
        ),
        (
            "Managing Chronic Inflammation",
            "inflammation.md",
            "health",
            10,
            "Diet, sleep, and stress interventions for inflammation.",
            "https://inflammation.md/chronic",
        ),
        (
            "Cold Exposure Benefits and Risks",
            "wim.hof.institute",
            "health",
            7,
            "What the evidence says about cold showers and ice baths.",
            "https://wim.hof.institute/cold",
        ),
        (
            "VO2 Max: Your Longevity Metric",
            "peter.attia.md",
            "health",
            9,
            "Why VO2 max predicts lifespan better than most biomarkers.",
            "https://peter.attia.md/vo2max",
        ),
        (
            "Gut Health and the Microbiome",
            "fermented.health",
            "health",
            8,
            "Practical steps to improve microbiome diversity.",
            "https://fermented.health/microbiome",
        ),
        (
            "Breathing and Autonomic Regulation",
            "breathwork.lab",
            "health",
            6,
            "Box breathing, physiological sighs, and HRV training.",
            "https://breathwork.lab/autonomic",
        ),
        (
            "Blue Light and Circadian Disruption",
            "chronobiology.net",
            "health",
            7,
            "How screens affect melatonin and what to do about it.",
            "https://chronobiology.net/blue-light",
        ),
        (
            "Sauna Use and Cardiovascular Health",
            "nordic.health.research",
            "health",
            8,
            "Finnish studies on sauna frequency and heart disease risk.",
            "https://nordic.health.research/sauna",
        ),
        (
            "Mindfulness-Based Stress Reduction",
            "mbsr.clinic",
            "health",
            10,
            "Eight-week MBSR protocol outcomes and how to practice.",
            "https://mbsr.clinic/protocol",
        ),
        (
            "Vitamin D and Immune Function",
            "vitamind.council",
            "health",
            5,
            "Deficiency, supplementation, and what the research shows.",
            "https://vitamind.council/immune",
        ),
        (
            "Running Economy and Cadence",
            "running.science",
            "health",
            7,
            "How stride rate affects efficiency and injury risk.",
            "https://running.science/cadence",
        ),
        (
            "Cognitive Training and Brain Plasticity",
            "neuro.performance",
            "health",
            9,
            "What actually improves cognitive function with aging.",
            "https://neuro.performance/plasticity",
        ),
        (
            "Postural Health in the Desk Era",
            "ergonomics.pro",
            "health",
            6,
            "Evidence-based fixes for tech neck and lower back pain.",
            "https://ergonomics.pro/posture",
        ),
    ];

    // Pair each row with its 1-based position, which becomes the item id.
    (1u64..)
        .zip(ROWS)
        .map(
            |(id, &(title, source, category, reading_time_min, description, url))| SeedItem {
                id,
                title: String::from(title),
                source: String::from(source),
                category: String::from(category),
                reading_time_min,
                description: String::from(description),
                url: String::from(url),
            },
        )
        .collect()
}
/// Writes the three demo users and their engagement signals into `db`.
///
/// - User 1: cold start, written with no signals.
/// - User 2: explorer, with 10 `view` signals interleaved across jazz,
///   science and travel items, plus 3 `share` signals (one per category).
/// - User 3: convergent. Every technology item (ids 1-12) gets 3 views, a
///   save and a share; every jazz item (ids 25-36) gets 2 views and a share.
///
/// All signals carry weight `1.0` and share a single timestamp taken after
/// user 2 is written.
///
/// # Errors
///
/// Returns the first error reported by `write_user` or `signal_with_context`;
/// writes made before the failure are not undone here.
pub fn seed_users(db: &TidalDb) -> tidaldb::Result<()> {
    let no_attributes: HashMap<String, String> = HashMap::new();

    // User 1: cold start, deliberately left without any signals.
    db.write_user(EntityId::new(1), &no_attributes)?;

    // User 2: explorer. Views rotate jazz -> science -> travel.
    db.write_user(EntityId::new(2), &no_attributes)?;
    let now = Timestamp::now();
    let explorer_views = [25u64, 13, 37, 26, 14, 38, 27, 15, 39, 28];
    for item in explorer_views {
        db.signal_with_context("view", EntityId::new(item), 1.0, now, Some(2), None)?;
    }
    // Shares of the first pick in each explored category. The original author
    // notes that shares seed the user's preference vector; that behaviour
    // lives in tidaldb and is not visible from this file.
    let explorer_shares = [25u64, 13, 37];
    for item in explorer_shares {
        db.signal_with_context("share", EntityId::new(item), 1.0, now, Some(2), None)?;
    }

    // User 3: convergent, heavy on technology and jazz.
    db.write_user(EntityId::new(3), &no_attributes)?;
    // (item id range, views per item, whether each item is also saved)
    let convergent_plan = [(1u64..=12, 3, true), (25u64..=36, 2, false)];
    for (items, views_per_item, also_saved) in convergent_plan {
        for item in items {
            for _ in 0..views_per_item {
                db.signal_with_context("view", EntityId::new(item), 1.0, now, Some(3), None)?;
            }
            if also_saved {
                db.signal_with_context("save", EntityId::new(item), 1.0, now, Some(3), None)?;
            }
            db.signal_with_context("share", EntityId::new(item), 1.0, now, Some(3), None)?;
        }
    }

    Ok(())
}