# stemedb/latent/ingest-reddit/adk-agent/tools.py

"""
ADK tools for the Reddit Adverse Event Agent.

These are plain functions that will be registered with the ADK Agent.
"""

import json
import time
from typing import Optional

import requests

# Support both package and script imports
try:
    from .config import (
        REDDIT_HEADERS,
        ADVERSE_EVENT_KEYWORDS,
        DRUG_MAP,
        MIN_CONFIDENCE,
        MAX_CONFIDENCE,
        SOURCE_CLASS_SOCIAL,
        DEFAULT_LIFECYCLE,
        STEMEDB_URL,
        ENV_STEMEDB_AGENT_SEED,
    )
    from .signer import Signer
    from .stemedb_client import StemeDBClient
except ImportError:
    from config import (
        REDDIT_HEADERS,
        ADVERSE_EVENT_KEYWORDS,
        DRUG_MAP,
        MIN_CONFIDENCE,
        MAX_CONFIDENCE,
        SOURCE_CLASS_SOCIAL,
        DEFAULT_LIFECYCLE,
        STEMEDB_URL,
        ENV_STEMEDB_AGENT_SEED,
    )
    from signer import Signer
    from stemedb_client import StemeDBClient

# Module-level StemeDB client cache. It starts as None, so importing this
# module does not construct a client; _get_client() fills it in on first use
# and returns the same instance on later calls.
_client: Optional[StemeDBClient] = None


def _get_client() -> StemeDBClient:
    """Return the shared StemeDB client, building it on the first call.

    The instance is cached in the module-level ``_client`` variable, so the
    signer is only loaded from the environment once per process.
    """
    global _client
    # Fast path: a client was already built by an earlier call.
    if _client is not None:
        return _client
    # First use: load the signer via ENV_STEMEDB_AGENT_SEED and bind it to
    # the configured StemeDB URL.
    _client = StemeDBClient(STEMEDB_URL, Signer.from_env(ENV_STEMEDB_AGENT_SEED))
    return _client


def fetch_reddit_posts(subreddit: str, limit: int = 25) -> dict:
    """
    Fetch recent posts from a subreddit matching adverse event keywords.

    Requests the subreddit's ``new.json`` listing from Reddit's public JSON
    endpoint and keeps only the posts whose lower-cased title or body contains
    at least one entry of ADVERSE_EVENT_KEYWORDS.

    Args:
        subreddit: Name of the subreddit to scan (e.g., "Ozempic", "Mounjaro").
            It is percent-encoded before being placed in the URL path.
        limit: Maximum number of posts to fetch (default: 25). Clamped to the
            range 1-100.

    Returns:
        A dictionary with:
        - subreddit: The scanned subreddit name
        - total_fetched: Number of posts retrieved from Reddit
        - matched_posts: Number of posts matching adverse event keywords
        - posts: List of matching posts with id, title, text, url, created_utc,
                 score, author, and detected_drug fields
        - error: Present only on failure (request error, HTTP error status,
                 or a body that is not the expected JSON object); the counts
                 are then 0 and posts is empty

    Example:
        result = fetch_reddit_posts("Ozempic", limit=50)
        for post in result["posts"]:
            print(f"{post['title']} - {post['detected_drug']}")
    """
    # Function-scope import so this block does not depend on file-level imports.
    from urllib.parse import quote

    def _failure(message: str) -> dict:
        # Single place that defines the shape of an unsuccessful result.
        return {
            "subreddit": subreddit,
            "error": message,
            "total_fetched": 0,
            "matched_posts": 0,
            "posts": [],
        }

    limit = max(1, min(limit, 100))  # Reddit API limit; also reject <= 0
    # Escape the name so characters such as "/" or "?" cannot alter the URL.
    safe_name = quote(subreddit, safe="")
    url = f"https://www.reddit.com/r/{safe_name}/new.json?limit={limit}"

    try:
        response = requests.get(url, headers=REDDIT_HEADERS, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions whose
        # decode error is not a RequestException subclass.
        return _failure(str(e))

    if not isinstance(data, dict):
        return _failure("Unexpected response format from Reddit")

    # Missing or null "data"/"children" is treated as an empty listing.
    listing = data.get("data")
    children = listing.get("children") if isinstance(listing, dict) else None
    if not isinstance(children, list):
        children = []

    # The drug is derived from the subreddit name only, so look it up once.
    detected_drug = DRUG_MAP.get(subreddit.lower(), "glp1_agonist")

    matched_posts = []
    for child in children:
        post_data = child.get("data") if isinstance(child, dict) else None
        if not isinstance(post_data, dict):
            continue

        # `or ""` also covers keys that are present but null in the payload.
        title = post_data.get("title") or ""
        body = post_data.get("selftext") or ""
        content = f"{title} {body}".lower()

        # Check for adverse event keywords.
        # NOTE(review): content is lower-cased, so this assumes the configured
        # keywords are lower-case too - confirm in config.
        if not any(keyword in content for keyword in ADVERSE_EVENT_KEYWORDS):
            continue

        matched_posts.append(
            {
                "id": post_data.get("id"),
                "title": title,
                "text": body[:2000],  # Truncate long posts
                "url": f"https://reddit.com{post_data.get('permalink') or ''}",
                "created_utc": int(post_data.get("created_utc") or 0),
                "score": post_data.get("score", 0),
                "author": post_data.get("author") or "deleted",
                "detected_drug": detected_drug,
            }
        )

    return {
        "subreddit": subreddit,
        "total_fetched": len(children),
        "matched_posts": len(matched_posts),
        "posts": matched_posts,
    }


def store_assertion(
    subject: str,
    predicate: str,
    object_value: str,
    confidence: float,
    source_url: str,
    severity: Optional[str] = None,
    reddit_post_id: Optional[str] = None,
) -> dict:
    """
    Store an assertion in StemeDB through the shared, signer-backed client.

    The assertion represents an extracted adverse event. Every failure,
    including an invalid ``confidence``, is reported through the returned
    dictionary rather than raised, so the function is safe to expose as a tool.

    Args:
        subject: The drug or entity (e.g., "semaglutide", "tirzepatide")
        predicate: The type of assertion (e.g., "side_effect", "adverse_event")
        object_value: The specific effect (e.g., "nausea", "gastroparesis")
        confidence: Confidence score; clamped to the configured
            [MIN_CONFIDENCE, MAX_CONFIDENCE] range. Must be a real number and
            must not be NaN.
        source_url: Reddit post URL for provenance
        severity: Optional severity level ("low", "medium", "high")
        reddit_post_id: Optional Reddit post ID for tracking

    Returns:
        A dictionary with:
        - success: Boolean indicating if the assertion was stored
        - hash: Hash returned by the client for the assertion (if successful)
        - subject, predicate, object: The submitted values
        - confidence: The clamped confidence (if successful)
        - source_hash: First 16 characters of ``hash`` followed by "..."
          (if successful)
        - error: Error message (if failed)

    Example:
        result = store_assertion(
            subject="semaglutide",
            predicate="side_effect",
            object_value="nausea",
            confidence=0.5,
            source_url="https://reddit.com/r/Ozempic/comments/abc123/...",
            severity="low"
        )
        if result["success"]:
            print(f"Stored with hash: {result['hash']}")
    """
    # Function-scope import so this block does not depend on file-level imports.
    import math

    def _failure(message: str) -> dict:
        # Single place that defines the shape of an unsuccessful result.
        return {
            "success": False,
            "error": message,
            "subject": subject,
            "predicate": predicate,
            "object": object_value,
        }

    # Validate before clamping: min()/max() would raise TypeError on a
    # non-number and would silently turn NaN into MAX_CONFIDENCE.
    try:
        is_nan = math.isnan(confidence)
    except OverflowError:
        is_nan = False  # An oversized int is not NaN; the clamp handles it.
    except TypeError:
        return _failure(
            f"confidence must be a number, got {type(confidence).__name__}"
        )
    if is_nan:
        return _failure("confidence must not be NaN")

    # Clamp confidence to allowed range for anecdotal data
    clamped_confidence = max(MIN_CONFIDENCE, min(MAX_CONFIDENCE, confidence))

    # Build source metadata; severity is always included, even when None.
    metadata = {"type": "reddit_post", "severity": severity}
    if reddit_post_id:
        metadata["reddit_id"] = reddit_post_id
    metadata_json = json.dumps(metadata)

    try:
        client = _get_client()
        result = client.assert_fact(
            subject=subject,
            predicate=predicate,
            object_value=object_value,
            confidence=clamped_confidence,
            source_url=source_url,
            source_class=SOURCE_CLASS_SOCIAL,
            lifecycle=DEFAULT_LIFECYCLE,
            source_metadata=metadata_json,
        )
        return {
            "success": True,
            "hash": result.hash,
            "subject": subject,
            "predicate": predicate,
            "object": object_value,
            "confidence": clamped_confidence,
            # NOTE(review): this is a shortened copy of the assertion hash,
            # not a separate hash of source_url - confirm that is intended.
            "source_hash": result.hash[:16] + "...",  # Truncated for display
        }
    except Exception as e:
        # Tool boundary: report any client/signing failure to the caller
        # instead of raising.
        return _failure(str(e))