"""Fetch FDA drug labels from the openFDA API and emit them as JSONL assertions.

For every molecule in ``TARGET_MOLECULES`` the most recent label is requested
from ``API_BASE``; selected label sections are converted into assertion dicts
and written, one JSON object per line, to ``tier0_regulatory_graph.jsonl``.
"""

import json
import time
import uuid
from datetime import datetime

import requests

# Configuration
TARGET_MOLECULES = [
    "semaglutide",
    "tirzepatide",
    "liraglutide",
]

API_BASE = "https://api.fda.gov/drug/label.json"

# StemeDB Source Class for Regulatory
SOURCE_CLASS_REGULATORY = 0

# Upper bound (seconds) on a single HTTP request. Without a timeout,
# requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# (label section key, assertion predicate) pairs, in emission order:
# boxed warnings, adverse reactions, then warnings and precautions.
_LABEL_SECTIONS = (
    ("boxed_warning", "has_boxed_warning"),
    ("adverse_reactions", "has_adverse_reactions_section"),
    ("warnings_and_precautions", "has_warnings"),
)


def fetch_label(molecule):
    """Fetch the most recent FDA label for a generic molecule name.

    Args:
        molecule: Generic drug name, matched against ``openfda.generic_name``.

    Returns:
        The first entry of the response's ``results`` list (sorted by
        ``effective_time`` descending), or ``None`` when nothing matched or
        the request/decoding failed. Failures are reported on stdout rather
        than raised, so one bad molecule does not abort the whole run.
    """
    print(f"[*] Fetching FDA label for: {molecule}...")

    # Query for the generic name, prioritize recent effective_time
    params = {
        "search": f'openfda.generic_name:"{molecule}"',
        "limit": 1,
        "sort": "effective_time:desc",
    }

    try:
        response = requests.get(
            API_BASE, params=params, timeout=REQUEST_TIMEOUT_SECONDS
        )
        # NOTE(review): openFDA is documented to answer a search with no
        # matches with HTTP 404; report that as "no results", not an error.
        if response.status_code == 404:
            print(f"[!] No results found for {molecule}")
            return None
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on older requests versions.
        print(f"[!] Error fetching {molecule}: {e}")
        return None

    results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(results, list) or not results:
        print(f"[!] No results found for {molecule}")
        return None
    return results[0]


def _first_or_default(values, default):
    """Return the first item of *values*, or *default* if it is empty/None."""
    if not values:
        return default
    if isinstance(values, str):
        # A bare string is taken whole instead of indexing its first character.
        return values
    return values[0]


def _section_text(value):
    """Join a label section (a list of strings) into one newline-separated text."""
    if isinstance(value, str):
        # Joining a bare string would put a newline between every character.
        return value
    return "\n".join(value)


def extract_assertions(label_data, molecule):
    """Convert one raw openFDA label record into StemeDB assertion dicts.

    Args:
        label_data: A single label record (dict) as returned by ``fetch_label``.
        molecule: Name stored as the ``subject`` of every assertion.

    Returns:
        A list with one assertion dict per section listed in
        ``_LABEL_SECTIONS`` that is present in ``label_data``. The list is
        empty when none of those sections are present.
    """
    # Metadata for the Source
    set_id = label_data.get("set_id", "unknown")
    effective_time = label_data.get(
        "effective_time", datetime.now().strftime("%Y%m%d")
    )
    openfda = label_data.get("openfda") or {}
    brand_name = _first_or_default(openfda.get("brand_name"), "Unknown")

    # Captured once so all assertions from the same label share a timestamp.
    timestamp = int(time.time())

    assertions = []
    for section, predicate in _LABEL_SECTIONS:
        value = label_data.get(section)
        if value is None:
            continue
        assertions.append({
            "id": str(uuid.uuid4()),
            "subject": molecule,
            "predicate": predicate,
            # Raw text for now, NLP would extract specific risks
            "object": _section_text(value),
            "confidence": 1.0,
            "source_class": SOURCE_CLASS_REGULATORY,
            # Built fresh per assertion so entries never share a mutable dict.
            "source_metadata": {
                "type": "openfda_label",
                "set_id": set_id,
                "section": section,
                "effective_date": effective_time,
                "brand_name": brand_name,
            },
            "timestamp": timestamp,
        })

    return assertions


def main():
    """Fetch labels for all target molecules and write the assertions as JSONL."""
    all_assertions = []

    for molecule in TARGET_MOLECULES:
        label = fetch_label(molecule)
        if label:
            drug_assertions = extract_assertions(label, molecule)
            all_assertions.extend(drug_assertions)
            print(f"[+] Extracted {len(drug_assertions)} Tier 0 assertions for {molecule}")

        # Be nice to the API
        time.sleep(1)

    # Output to JSONL (simulating StemeDB Write)
    output_file = "tier0_regulatory_graph.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(assertion) + "\n" for assertion in all_assertions)

    print(f"\n[OK] Successfully generated {len(all_assertions)} regulatory assertions.")
    print(f"[->] Graph data written to {output_file}")


if __name__ == "__main__":
    main()