#!/usr/bin/env python3
"""
LATENT: Tier 5 (Social) Ingestor - Reddit
No API credentials needed - uses public .json endpoints
"""

import hashlib
import json
import os
import re
import time
import uuid
from datetime import datetime

import requests
from dotenv import load_dotenv

load_dotenv()

# Configuration
TARGET_SUBREDDITS = ["Ozempic", "Mounjaro", "Semaglutide", "Wegovy"]

KEYWORDS = [
    "stomach", "paralysis", "gastroparesis", "vomit", "nausea",
    "er", "emergency", "hospital", "pain", "stopped working",
    "hair loss", "side effect"
]

# StemeDB Source Class for Social/Anecdotal
SOURCE_CLASS_SOCIAL = 5

# Headers to avoid being blocked
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "application/json",
}

# Keywords this short (e.g. "er") are matched as whole words only; as plain
# substrings they would hit "after", "never", "other", ... and let almost
# every post through the filter.
_WHOLE_WORD_MAX_LEN = 3


def _build_keyword_pattern(keywords) -> re.Pattern:
    """Compile *keywords* into one regex used to pre-filter post text.

    Longer keywords keep plain substring semantics (so "vomit" still matches
    "vomiting"); keywords of at most ``_WHOLE_WORD_MAX_LEN`` characters must
    appear as standalone words.
    """
    alternatives = []
    for keyword in keywords:
        escaped = re.escape(keyword)
        if len(keyword) <= _WHOLE_WORD_MAX_LEN:
            escaped = rf"\b{escaped}\b"
        alternatives.append(escaped)
    # "(?!)" never matches, mirroring any() over an empty keyword list.
    return re.compile("|".join(alternatives) if alternatives else r"(?!)")


_KEYWORD_PATTERN = _build_keyword_pattern(KEYWORDS)

# Subreddit (lower-cased) -> drug the subreddit is most likely about.
_SUBREDDIT_DRUG_MAP = {
    "ozempic": "semaglutide",
    "wegovy": "semaglutide",
    "mounjaro": "tirzepatide",
    "semaglutide": "semaglutide",
}

# Ordered heuristic rules: (trigger pattern, predicate, object, severity).
# Patterns are applied to lower-cased text; "er" is matched as a whole word
# so "went to the ER." is caught while "after" is not.
_HEURISTIC_RULES = (
    (re.compile(r"paralysis|gastroparesis"), "side_effect", "gastroparesis", "high"),
    (re.compile(r"vomit|throw up|throwing up"), "side_effect", "vomiting", "medium"),
    (re.compile(r"nausea"), "side_effect", "nausea", "low"),
    (re.compile(r"hair loss|losing hair"), "side_effect", "hair_loss", "medium"),
    (re.compile(r"hospital|emergency|\ber\b"), "adverse_event", "hospitalization", "high"),
    (re.compile(r"stopped working"), "efficacy_issue", "tolerance", "medium"),
)


def fetch_posts_json(subreddit_name: str, limit: int = 50) -> list:
    """
    Fetch posts from a subreddit using public .json endpoint.
    No API credentials required.

    Args:
        subreddit_name: Subreddit name without the ``r/`` prefix.
        limit: Maximum number of newest posts to request.

    Returns:
        A list of dicts (id, title, text, url, created_utc, score, author,
        subreddit) for posts whose title or body matches ``KEYWORDS``.
        Returns an empty list when the request fails or the response body is
        not valid JSON; the error is printed rather than raised.
    """
    print(f"[*] Scanning r/{subreddit_name}...")
    posts = []

    url = f"https://www.reddit.com/r/{subreddit_name}/new.json?limit={limit}"

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions where
        # the decode error is not a RequestException subclass.
        print(f"[!] Error fetching r/{subreddit_name}: {e}")
        return posts

    for child in data.get("data", {}).get("children", []):
        post_data = child.get("data", {})
        # Coerce null fields to "" so later string operations are safe.
        title = post_data.get("title") or ""
        body = post_data.get("selftext") or ""
        content = f"{title} {body}".lower()

        # Keyword filter
        if _KEYWORD_PATTERN.search(content):
            posts.append({
                "id": post_data.get("id"),
                "title": title,
                "text": body,
                "url": f"https://reddit.com{post_data.get('permalink', '')}",
                "created_utc": post_data.get("created_utc", 0),
                "score": post_data.get("score", 0),
                "author": post_data.get("author", "deleted"),
                "subreddit": subreddit_name
            })

    return posts


def extract_assertion_mock(post: dict) -> list:
    """
    Heuristic extraction when OpenAI key is missing.

    Args:
        post: Post dict with at least ``title``, ``text`` and ``subreddit``.

    Returns:
        A list of finding dicts (subject, predicate, object, severity), one
        per matching heuristic rule, in rule order. The subject is derived
        from the subreddit name, defaulting to ``"glp1_agonist"``.
    """
    text = f"{post['title']} {post['text']}".lower()

    # Map subreddit to likely drug
    subject = _SUBREDDIT_DRUG_MAP.get(post["subreddit"].lower(), "glp1_agonist")

    # Extract side effects based on keywords
    return [
        {
            "subject": subject,
            "predicate": predicate,
            "object": obj,
            "severity": severity
        }
        for pattern, predicate, obj, severity in _HEURISTIC_RULES
        if pattern.search(text)
    ]


def extract_assertion_llm(post: dict) -> list:
    """
    Uses OpenAI to extract structured assertions from raw text.

    Args:
        post: Post dict with at least ``title``, ``text`` and ``subreddit``.

    Returns:
        The list of finding dicts returned by the model under ``findings``.
        On any failure (missing ``openai`` package, API error, malformed
        response) the error is printed and the heuristic extractor's result
        is returned instead.
    """
    prompt = f"""Analyze this Reddit post about GLP-1 weight loss drugs.
Extract any adverse health events (side effects) mentioned.
Return JSON with findings list.

Post Title: {post['title']}
Post Body: {post['text'][:1500]}

Format:
{{
    "findings": [
        {{
            "subject": "drug name (semaglutide/tirzepatide)",
            "predicate": "side_effect",
            "object": "specific symptom",
            "severity": "low/medium/high"
        }}
    ]
}}"""

    try:
        # Imported lazily, and inside the try, so a missing package degrades
        # to the heuristic fallback instead of crashing the run.
        import openai

        client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        data = json.loads(response.choices[0].message.content)
        findings = data.get("findings", [])
        if not isinstance(findings, list):
            return []
        # Downstream formatting calls .get() on each finding; drop anything
        # that is not a dict.
        return [finding for finding in findings if isinstance(finding, dict)]
    except Exception as e:
        print(f"[!] LLM extraction failed: {e}")
        return extract_assertion_mock(post)  # Fallback to mock


def _stable_author_hash(author) -> int:
    """Return a deterministic signed 64-bit hash of *author*.

    The built-in ``hash()`` is salted per process for strings, so its value
    changes between runs; a SHA-256 prefix stays stable while keeping the
    field an integer.
    """
    digest = hashlib.sha256(str(author).encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big", signed=True)


def format_as_stemedb(raw_findings: list, post: dict) -> list:
    """
    Wraps extracted findings in the StemeDB Assertion envelope.

    Args:
        raw_findings: Finding dicts with optional ``subject``, ``predicate``,
            ``object`` and ``severity`` keys.
        post: Source post dict (``id``, ``url``, ``subreddit``, ``author``,
            ``created_utc``).

    Returns:
        One assertion dict per finding, each with a fresh UUID, fixed 0.5
        confidence, the social source class and Reddit source metadata.
    """
    steme_assertions = []

    for finding in raw_findings:
        assertion = {
            "id": str(uuid.uuid4()),
            "subject": finding.get("subject", "unknown_molecule"),
            "predicate": finding.get("predicate", "related_to"),
            "object": finding.get("object", "unknown_effect"),
            "confidence": 0.5,  # Tier 5 starts low
            "source_class": SOURCE_CLASS_SOCIAL,
            "source_metadata": {
                "type": "reddit_post",
                "reddit_id": post["id"],
                "url": post["url"],
                "subreddit": post["subreddit"],
                "author_hash": _stable_author_hash(post["author"]),
                "severity": finding.get("severity")
            },
            "timestamp": int(post["created_utc"]),
            "ingested_at": int(time.time())
        }
        steme_assertions.append(assertion)

    return steme_assertions


def main():
    """Scan the target subreddits and write assertions to a JSONL file.

    Uses the LLM extractor when ``OPENAI_API_KEY`` is set, otherwise the
    keyword heuristics. Output goes to ``tier5_social_graph.jsonl`` in the
    current working directory, one JSON object per line.
    """
    print("=" * 50)
    print("LATENT: Tier 5 (Social) Ingestor - Reddit")
    print("=" * 50)
    print("No API credentials needed - using public JSON endpoints\n")

    all_assertions = []
    use_llm = bool(os.getenv("OPENAI_API_KEY"))

    if use_llm:
        print("[*] Using OpenAI for extraction\n")
    else:
        print("[*] Using heuristic extraction (set OPENAI_API_KEY for LLM)\n")

    extract = extract_assertion_llm if use_llm else extract_assertion_mock

    for sub in TARGET_SUBREDDITS:
        posts = fetch_posts_json(sub, limit=50)
        print(f"[+] Found {len(posts)} keyword-matched posts in r/{sub}")

        for post in posts:
            raw_findings = extract(post)

            if raw_findings:
                assertions = format_as_stemedb(raw_findings, post)
                all_assertions.extend(assertions)
                print(f" -> {post['id']}: {len(assertions)} assertions")

        time.sleep(2)  # Rate limit politeness

    # Output
    output_file = "tier5_social_graph.jsonl"
    # Explicit encoding so the output does not depend on the platform locale.
    with open(output_file, "w", encoding="utf-8") as f:
        for a in all_assertions:
            f.write(json.dumps(a) + "\n")

    print(f"\n{'=' * 50}")
    print(f"[OK] {len(all_assertions)} assertions written to {output_file}")
    print("=" * 50)


if __name__ == "__main__":
    main()