chore: reuse reqwest::Client across requests in forage embedder; tidy canonicalize_url and add unit tests; add optional bearer token and `tabs` permission to the browser extension

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
jordan 2026-02-23 23:16:32 -07:00
parent 213b8efcca
commit c1c5a10fbc
4 changed files with 96 additions and 13 deletions

View File

@ -53,7 +53,12 @@ struct Args {
#[derive(Clone)]
enum Mode {
Mock,
OpenAi { api_key: String },
/// `client` is created once at startup and reused across requests.
/// `reqwest::Client` is cheaply cloneable (`Arc`-backed connection pool).
OpenAi {
api_key: String,
client: reqwest::Client,
},
}
#[derive(Deserialize)]
@ -70,7 +75,7 @@ struct EmbedResp {
async fn post_embed(State(mode): State<Arc<Mode>>, Json(req): Json<EmbedReq>) -> impl IntoResponse {
let vector = match mode.as_ref() {
Mode::Mock => mock_embed(&req.text),
Mode::OpenAi { api_key } => match openai_embed(api_key, &req.text).await {
Mode::OpenAi { api_key, client } => match openai_embed(client, api_key, &req.text).await {
Ok(v) => v,
Err(e) => {
return (
@ -119,8 +124,11 @@ fn mock_embed(text: &str) -> Vec<f32> {
}
/// OpenAI text-embedding-3-small call.
async fn openai_embed(api_key: &str, text: &str) -> Result<Vec<f32>, String> {
let client = reqwest::Client::new();
async fn openai_embed(
client: &reqwest::Client,
api_key: &str,
text: &str,
) -> Result<Vec<f32>, String> {
let resp = client
.post("https://api.openai.com/v1/embeddings")
.bearer_auth(api_key)
@ -178,7 +186,10 @@ async fn main() {
Mode::Mock
} else {
println!("forage-embedder: OpenAI mode (text-embedding-3-small)");
Mode::OpenAi { api_key: key }
Mode::OpenAi {
api_key: key,
client: reqwest::Client::new(),
}
}
};

View File

@ -1175,10 +1175,10 @@ fn call_embedder(
/// 3. Strip a trailing slash from the path unless the path is the root `/`.
fn canonicalize_url(url: &str) -> String {
// 1. Strip amp. subdomain
let s = if url.starts_with("https://amp.") {
format!("https://{}", &url["https://amp.".len()..])
} else if url.starts_with("http://amp.") {
format!("http://{}", &url["http://amp.".len()..])
let s = if let Some(rest) = url.strip_prefix("https://amp.") {
format!("https://{rest}")
} else if let Some(rest) = url.strip_prefix("http://amp.") {
format!("http://{rest}")
} else {
url.to_owned()
};
@ -1199,7 +1199,7 @@ fn canonicalize_url(url: &str) -> String {
// Find position of the first slash after "://"
let after_scheme = base.find("://").map_or(0, |i| i + 3);
let first_path_slash = base[after_scheme..].find('/');
let has_real_path = first_path_slash.map_or(false, |j| base.len() > after_scheme + j + 1);
let has_real_path = first_path_slash.is_some_and(|j| base.len() > after_scheme + j + 1);
if has_real_path && base.ends_with('/') {
base[..base.len() - 1].to_owned()
} else {
@ -1229,6 +1229,73 @@ fn canonicalize_url(url: &str) -> String {
}
}
#[cfg(test)]
mod canon_tests {
    //! Unit tests pinning the expected output of `canonicalize_url` for
    //! `amp.` subdomains, `amp` / `amp_tf` query parameters, and trailing
    //! slashes.

    use super::canonicalize_url;

    /// Asserts that canonicalizing `input` yields exactly `expected`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather
    /// than at this helper.
    #[track_caller]
    fn assert_canon(input: &str, expected: &str) {
        assert_eq!(canonicalize_url(input), expected);
    }

    /// An `https://amp.` host prefix is dropped; the path is kept.
    #[test]
    fn strips_amp_subdomain_https() {
        assert_canon(
            "https://amp.example.com/article/123",
            "https://example.com/article/123",
        );
    }

    /// An `http://amp.` host prefix is dropped and the scheme stays `http`.
    #[test]
    fn strips_amp_subdomain_http() {
        assert_canon("http://amp.cnn.com/story", "http://cnn.com/story");
    }

    /// A lone `amp=1` parameter is removed together with the `?`.
    #[test]
    fn removes_amp_query_param_standalone() {
        assert_canon(
            "https://example.com/article?amp=1",
            "https://example.com/article",
        );
    }

    /// `amp_tf` is removed while the surrounding parameters keep their order.
    #[test]
    fn removes_amp_tf_query_param_among_others() {
        assert_canon(
            "https://example.com/a?q=rust&amp_tf=1&page=2",
            "https://example.com/a?q=rust&page=2",
        );
    }

    /// A trailing slash after a non-root path is stripped.
    #[test]
    fn strips_trailing_slash_from_real_path() {
        assert_canon("https://example.com/article/", "https://example.com/article");
    }

    /// The root path `/` is left untouched.
    #[test]
    fn preserves_root_slash() {
        assert_canon("https://example.com/", "https://example.com/");
    }

    /// A URL with only non-AMP query parameters comes back unchanged.
    #[test]
    fn preserves_meaningful_query_params() {
        let unchanged = "https://example.com/search?q=rust&lang=en";
        assert_canon(unchanged, unchanged);
    }

    /// Subdomain stripping, `amp` parameter removal and trailing-slash
    /// removal all apply to the same URL.
    #[test]
    fn amp_subdomain_plus_amp_param_both_stripped() {
        assert_canon(
            "https://amp.example.com/post/?amp=1",
            "https://example.com/post",
        );
    }
}
/// Round-robin interleave items by category to ensure the cold-start exploit
/// pool spans ≥3 categories. Preserves score ordering within each category.
///

View File

@ -3,7 +3,7 @@
"name": "Forage",
"version": "0.1.0",
"description": "Automatically capture browsing signals for your Forage personalized feed",
"permissions": ["storage"],
"permissions": ["storage", "tabs"],
"host_permissions": ["http://*/*", "https://*/*"],
"content_scripts": [
{

View File

@ -15,10 +15,13 @@
* Configuration:
* Change USER_ID to match the Forage user you are browsing as (1, 2, or 3
* for the seed users; any positive integer for a new user).
* Set TOKEN to the value passed with --token when starting the server,
* or leave empty ('') if the server was started without --token.
*/
(function forageCapture() {
const SERVER = 'http://localhost:4242';
const USER_ID = 1;
const TOKEN = '';
const DWELL_MS = 30_000;
const url = location.href;
@ -46,9 +49,11 @@
let itemId = null;
const authHeaders = TOKEN ? { Authorization: `Bearer ${TOKEN}` } : {};
fetch(`${SERVER}/capture`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
headers: { 'Content-Type': 'application/json', ...authHeaders },
body: JSON.stringify({
url,
canonical_url: canonicalUrl,
@ -73,7 +78,7 @@
if (itemId == null) return;
fetch(`${SERVER}/signal`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
headers: { 'Content-Type': 'application/json', ...authHeaders },
body: JSON.stringify({
user_id: USER_ID,
item_id: itemId,