From c1c5a10fbca31473fe032fd8a9d14fb09c377548 Mon Sep 17 00:00:00 2001 From: jordan Date: Mon, 23 Feb 2026 23:16:32 -0700 Subject: [PATCH] chore: reuse reqwest::Client across requests in forage embedder; minor forage updates Co-Authored-By: Claude Sonnet 4.6 --- applications/forage/embedder/src/main.rs | 21 ++++-- applications/forage/engine/src/lib.rs | 77 +++++++++++++++++++-- applications/forage/extension/manifest.json | 2 +- applications/forage/js/capture.js | 9 ++- 4 files changed, 96 insertions(+), 13 deletions(-) diff --git a/applications/forage/embedder/src/main.rs b/applications/forage/embedder/src/main.rs index 10dcde9..3e538d9 100644 --- a/applications/forage/embedder/src/main.rs +++ b/applications/forage/embedder/src/main.rs @@ -53,7 +53,12 @@ struct Args { #[derive(Clone)] enum Mode { Mock, - OpenAi { api_key: String }, + /// `client` is created once at startup and reused across requests. + /// `reqwest::Client` is cheaply cloneable (`Arc`-backed connection pool). + OpenAi { + api_key: String, + client: reqwest::Client, + }, } #[derive(Deserialize)] @@ -70,7 +75,7 @@ struct EmbedResp { async fn post_embed(State(mode): State>, Json(req): Json) -> impl IntoResponse { let vector = match mode.as_ref() { Mode::Mock => mock_embed(&req.text), - Mode::OpenAi { api_key } => match openai_embed(api_key, &req.text).await { + Mode::OpenAi { api_key, client } => match openai_embed(client, api_key, &req.text).await { Ok(v) => v, Err(e) => { return ( @@ -119,8 +124,11 @@ fn mock_embed(text: &str) -> Vec { } /// OpenAI text-embedding-3-small call. -async fn openai_embed(api_key: &str, text: &str) -> Result, String> { - let client = reqwest::Client::new(); +async fn openai_embed( + client: &reqwest::Client, + api_key: &str, + text: &str, +) -> Result, String> { let resp = client .post("https://api.openai.com/v1/embeddings") .bearer_auth(api_key) @@ -178,7 +186,10 @@ async fn main() { Mode::Mock } else { println!("forage-embedder: OpenAI mode (text-embedding-3-small)"); - Mode::OpenAi { api_key: key } + Mode::OpenAi { + api_key: key, + client: reqwest::Client::new(), + } } }; diff --git a/applications/forage/engine/src/lib.rs b/applications/forage/engine/src/lib.rs index a1b48f0..3c8063d 100644 --- a/applications/forage/engine/src/lib.rs +++ b/applications/forage/engine/src/lib.rs @@ -1175,10 +1175,10 @@ fn call_embedder( /// 3. Strip a trailing slash from the path unless the path is the root `/`. fn canonicalize_url(url: &str) -> String { // 1. Strip amp. subdomain - let s = if url.starts_with("https://amp.") { - format!("https://{}", &url["https://amp.".len()..]) - } else if url.starts_with("http://amp.") { - format!("http://{}", &url["http://amp.".len()..]) + let s = if let Some(rest) = url.strip_prefix("https://amp.") { + format!("https://{rest}") + } else if let Some(rest) = url.strip_prefix("http://amp.") { + format!("http://{rest}") } else { url.to_owned() }; @@ -1199,7 +1199,7 @@ fn canonicalize_url(url: &str) -> String { // Find position of the first slash after "://" let after_scheme = base.find("://").map_or(0, |i| i + 3); let first_path_slash = base[after_scheme..].find('/'); - let has_real_path = first_path_slash.map_or(false, |j| base.len() > after_scheme + j + 1); + let has_real_path = first_path_slash.is_some_and(|j| base.len() > after_scheme + j + 1); if has_real_path && base.ends_with('/') { base[..base.len() - 1].to_owned() } else { @@ -1229,6 +1229,73 @@ fn canonicalize_url(url: &str) -> String { } } +#[cfg(test)] +mod canon_tests { + use super::canonicalize_url; + + #[test] + fn strips_amp_subdomain_https() { + assert_eq!( + canonicalize_url("https://amp.example.com/article/123"), + "https://example.com/article/123" + ); + } + + #[test] + fn strips_amp_subdomain_http() { + assert_eq!( + canonicalize_url("http://amp.cnn.com/story"), + "http://cnn.com/story" + ); + } + + #[test] + fn removes_amp_query_param_standalone() { + assert_eq!( + canonicalize_url("https://example.com/article?amp=1"), + "https://example.com/article" + ); + } + + #[test] + fn removes_amp_tf_query_param_among_others() { + assert_eq!( + canonicalize_url("https://example.com/a?q=rust&_tf=1&page=2"), + "https://example.com/a?q=rust&page=2" + ); + } + + #[test] + fn strips_trailing_slash_from_real_path() { + assert_eq!( + canonicalize_url("https://example.com/article/"), + "https://example.com/article" + ); + } + + #[test] + fn preserves_root_slash() { + assert_eq!( + canonicalize_url("https://example.com/"), + "https://example.com/" + ); + } + + #[test] + fn preserves_meaningful_query_params() { + let url = "https://example.com/search?q=rust&lang=en"; + assert_eq!(canonicalize_url(url), url); + } + + #[test] + fn amp_subdomain_plus_amp_param_both_stripped() { + assert_eq!( + canonicalize_url("https://amp.example.com/post/?amp=1"), + "https://example.com/post" + ); + } +} + /// Round-robin interleave items by category to ensure the cold-start exploit /// pool spans ≥3 categories. Preserves score ordering within each category. /// diff --git a/applications/forage/extension/manifest.json b/applications/forage/extension/manifest.json index d08a1bb..9cf86a7 100644 --- a/applications/forage/extension/manifest.json +++ b/applications/forage/extension/manifest.json @@ -3,7 +3,7 @@ "name": "Forage", "version": "0.1.0", "description": "Automatically capture browsing signals for your Forage personalized feed", - "permissions": ["storage"], + "permissions": ["storage", "tabs"], "host_permissions": ["http://*/*", "https://*/*"], "content_scripts": [ { diff --git a/applications/forage/js/capture.js b/applications/forage/js/capture.js index 87bd758..39a02b6 100644 --- a/applications/forage/js/capture.js +++ b/applications/forage/js/capture.js @@ -15,10 +15,13 @@ * Configuration: * Change USER_ID to match the Forage user you are browsing as (1, 2, or 3 * for the seed users; any positive integer for a new user). + * Set TOKEN to the value passed with --token when starting the server, + * or leave empty ('') if the server was started without --token. */ (function forageCapture() { const SERVER = 'http://localhost:4242'; const USER_ID = 1; + const TOKEN = ''; const DWELL_MS = 30_000; const url = location.href; @@ -46,9 +49,11 @@ let itemId = null; + const authHeaders = TOKEN ? { Authorization: `Bearer ${TOKEN}` } : {}; + fetch(`${SERVER}/capture`, { method: 'POST', - headers: { 'Content-Type': 'application/json' }, + headers: { 'Content-Type': 'application/json', ...authHeaders }, body: JSON.stringify({ url, canonical_url: canonicalUrl, @@ -73,7 +78,7 @@ if (itemId == null) return; fetch(`${SERVER}/signal`, { method: 'POST', - headers: { 'Content-Type': 'application/json' }, + headers: { 'Content-Type': 'application/json', ...authHeaders }, body: JSON.stringify({ user_id: USER_ID, item_id: itemId,