# stemedb/latent/ingest-fda/main.py

import requests
import json
import time
import uuid
from datetime import datetime

# Generic molecule names whose FDA labels are ingested, in processing order.
TARGET_MOLECULES = ["semaglutide", "tirzepatide", "liraglutide"]

# Endpoint queried for drug label records.
API_BASE = "https://api.fda.gov/drug/label.json"

# StemeDB source class value stamped on every regulatory assertion.
SOURCE_CLASS_REGULATORY = 0

def fetch_label(molecule, timeout=10.0):
    """
    Fetches the most recent FDA label for a given generic molecule name.

    Args:
        molecule: Generic drug name matched against ``openfda.generic_name``.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The first record of the response's ``results`` list (the query sorts
        by ``effective_time`` descending), or None when there are no results
        or the request fails. Failures are printed, never raised.
    """
    print(f"[*] Fetching FDA label for: {molecule}...")

    # Query for the generic name, prioritize recent effective_time
    params = {
        "search": f'openfda.generic_name:"{molecule}"',
        "limit": 1,
        "sort": "effective_time:desc",
    }

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(API_BASE, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"[!] Error fetching {molecule}: {e}")
        return None

    # Guard against an unexpected payload shape instead of relying on a
    # catch-all exception handler.
    results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(results, list) or not results:
        print(f"[!] No results found for {molecule}")
        return None

    return results[0]

def extract_assertions(label_data, molecule):
    """
    Parses raw OpenFDA JSON into StemeDB Assertions.

    One assertion is emitted for each of the ``boxed_warning``,
    ``adverse_reactions`` and ``warnings_and_precautions`` sections present
    in the label, in that order.

    Args:
        label_data: Label record (dict) as returned by the API.
        molecule: Name used as the ``subject`` of every assertion.

    Returns:
        A list of assertion dicts; empty when none of the sections exist.
    """
    # Label section key -> assertion predicate, in emission order.
    sections = (
        ("boxed_warning", "has_boxed_warning"),            # critical safety info
        ("adverse_reactions", "has_adverse_reactions_section"),  # side effects
        ("warnings_and_precautions", "has_warnings"),
    )

    # Metadata for the Source
    set_id = label_data.get("set_id", "unknown")
    # Falls back to today's local date (YYYYMMDD) when the label has none.
    effective_time = label_data.get("effective_time", datetime.now().strftime("%Y%m%d"))

    # A missing/null "openfda" block or an empty brand list must not raise.
    brand_names = (label_data.get("openfda") or {}).get("brand_name") or ["Unknown"]
    if isinstance(brand_names, (list, tuple)):
        brand_name = brand_names[0]
    else:
        brand_name = brand_names

    # Sample the clock once so all assertions from one label share a timestamp.
    timestamp = int(time.time())

    assertions = []
    for section, predicate in sections:
        raw = label_data.get(section)
        if raw is None:
            continue

        # Sections are joined line by line; a bare string is used as-is
        # rather than being split into characters by join().
        text = raw if isinstance(raw, str) else "\n".join(raw)

        assertions.append({
            "id": str(uuid.uuid4()),
            "subject": molecule,
            "predicate": predicate,
            "object": text,  # Raw text for now, NLP would extract specific risks
            "confidence": 1.0,
            "source_class": SOURCE_CLASS_REGULATORY,
            "source_metadata": {
                "type": "openfda_label",
                "set_id": set_id,
                "section": section,
                "effective_date": effective_time,
                "brand_name": brand_name,
            },
            "timestamp": timestamp,
        })

    return assertions

def main():
    """
    Fetches a label for every molecule in TARGET_MOLECULES, extracts its
    assertions, and writes them to ``tier0_regulatory_graph.jsonl`` as one
    JSON object per line. Progress is printed to stdout.
    """
    all_assertions = []

    for index, molecule in enumerate(TARGET_MOLECULES):
        # Be nice to the API: pause between requests, but not after the
        # last one where there is nothing left to throttle.
        if index:
            time.sleep(1)

        label = fetch_label(molecule)
        if label:
            drug_assertions = extract_assertions(label, molecule)
            all_assertions.extend(drug_assertions)
            print(f"[+] Extracted {len(drug_assertions)} Tier 0 assertions for {molecule}")

    # Output to JSONL (simulating StemeDB Write)
    output_file = "tier0_regulatory_graph.jsonl"
    # Explicit encoding keeps the output independent of the platform default.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(assertion) + "\n" for assertion in all_assertions)

    print(f"\n[OK] Successfully generated {len(all_assertions)} regulatory assertions.")
    print(f"[->] Graph data written to {output_file}")

# Run the ingest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()