stemedb/latent/ingest-fda/main.py
jordan b3e8a9a058 feat: Multi-application expansion with chaos testing and community UI
Major additions:
- Community Next.js app (port 18187) for browsing claims with API docs
- stemedb-chaos crate: Fault injection, chaos testing, CRDT properties
- Latent ingestion system: Reddit/FDA ingesters with ADK-Go agents
- Disputed claims handling: Manual review workflows and validation
- Aphoria security scanner: New extractors (SQL injection, command
  injection, weak crypto, TLS version), policy-based ignores, UAT reports
- Docker infrastructure: Dockerfile, docker-compose.yml for full stack
- VulnBank demo: Intentionally vulnerable multi-language test corpus

SDK & API enhancements:
- Source registry handlers for tracking data provenance
- Metrics endpoint
- Skeptic filtering improvements

Code quality:
- Split 14 large files (>500 lines) into focused modules
- All files now under 500-line limit per project guidelines

Documentation:
- Chaos testing guide, circuit breakers, observability docs
- Phase 7 UAT documentation updates
- Martin Kleppmann technical writer agent

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 01:24:14 -07:00

145 lines
4.5 KiB
Python

import requests
import json
import time
import uuid
from datetime import datetime
# Configuration
# Generic molecule names processed by main(), in this order.
TARGET_MOLECULES = ["semaglutide", "tirzepatide", "liraglutide"]

# Endpoint that fetch_label() sends its GET request to.
API_BASE = "https://api.fda.gov/drug/label.json"

# StemeDB Source Class for Regulatory
SOURCE_CLASS_REGULATORY = 0
def fetch_label(molecule, timeout=10):
    """
    Fetches the most recent FDA label for a given generic molecule name.

    Args:
        molecule: Generic name matched against ``openfda.generic_name``.
        timeout: Seconds to wait for the API before giving up (passed
            straight to ``requests.get``).

    Returns:
        The first element of the response's ``results`` list, or ``None``
        when the request fails, the body is not valid JSON, or the response
        carries no results. Failures are printed rather than raised so that
        a caller looping over several molecules can keep going.
    """
    print(f"[*] Fetching FDA label for: {molecule}...")
    # Query for the generic name, prioritize recent effective_time
    params = {
        "search": f'openfda.generic_name:"{molecule}"',
        "limit": 1,
        "sort": "effective_time:desc",
    }
    try:
        # Without an explicit timeout requests will wait forever on a
        # stalled connection, which would hang the whole ingestion run.
        response = requests.get(API_BASE, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP-status errors;
        # ValueError covers a body that cannot be decoded as JSON.
        print(f"[!] Error fetching {molecule}: {e}")
        return None

    # Validate the shape before indexing instead of relying on a blanket
    # exception handler to absorb unexpected payloads.
    results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(results, list) or not results:
        print(f"[!] No results found for {molecule}")
        return None
    return results[0]
def extract_assertions(label_data, molecule):
    """
    Parses raw OpenFDA JSON into StemeDB Assertions.

    One assertion dict is produced for each of the label sections
    ``boxed_warning``, ``adverse_reactions`` and ``warnings_and_precautions``
    that is present in ``label_data``, in that order.

    Args:
        label_data: Mapping holding a single label record.
        molecule: Value stored as the ``subject`` of every assertion.

    Returns:
        A list (possibly empty) of dicts with the keys ``id``, ``subject``,
        ``predicate``, ``object``, ``confidence``, ``source_class``,
        ``source_metadata`` and ``timestamp``. All assertions from one call
        share the same ``timestamp``; each gets its own random UUID ``id``.
    """
    # (section key in the label, predicate recorded on the assertion)
    sections = (
        ("boxed_warning", "has_boxed_warning"),  # Critical Safety Info
        ("adverse_reactions", "has_adverse_reactions_section"),  # The "Side Effects"
        ("warnings_and_precautions", "has_warnings"),
    )

    # Metadata for the Source
    set_id = label_data.get("set_id", "unknown")
    effective_time = label_data.get("effective_time", datetime.now().strftime("%Y%m%d"))
    # "openfda" may be absent or None and "brand_name" may be absent or an
    # empty list; fall back to "Unknown" instead of raising on [0].
    brand_names = (label_data.get("openfda") or {}).get("brand_name") or ["Unknown"]
    brand_name = brand_names if isinstance(brand_names, str) else brand_names[0]

    # Taken once so every assertion extracted from this label agrees.
    created_at = int(time.time())

    assertions = []
    for section, predicate in sections:
        paragraphs = label_data.get(section)
        if paragraphs is None:
            continue
        if isinstance(paragraphs, str):
            # A bare string would otherwise be joined character by character.
            paragraphs = [paragraphs]
        assertions.append({
            "id": str(uuid.uuid4()),
            "subject": molecule,
            "predicate": predicate,
            # Raw text for now, NLP would extract specific risks
            "object": "\n".join(paragraphs),
            "confidence": 1.0,
            "source_class": SOURCE_CLASS_REGULATORY,
            "source_metadata": {
                "type": "openfda_label",
                "set_id": set_id,
                "section": section,
                "effective_date": effective_time,
                "brand_name": brand_name,
            },
            "timestamp": created_at,
        })
    return assertions
def main():
    """
    Fetch a label for every entry in TARGET_MOLECULES, extract its
    assertions, and dump them all to a JSONL file (one JSON object per line).
    """
    collected = []
    for molecule in TARGET_MOLECULES:
        label = fetch_label(molecule)
        if label:
            extracted = extract_assertions(label, molecule)
            collected += extracted
            print(f"[+] Extracted {len(extracted)} Tier 0 assertions for {molecule}")
        # Be nice to the API
        # NOTE(review): the pause is assumed to run once per molecule,
        # whether or not a label came back - confirm against the original
        # indentation.
        time.sleep(1)

    # Output to JSONL (simulating StemeDB Write)
    destination = "tier0_regulatory_graph.jsonl"
    with open(destination, "w") as sink:
        sink.writelines(json.dumps(record) + "\n" for record in collected)

    print(f"\n[OK] Successfully generated {len(collected)} regulatory assertions.")
    print(f"[->] Graph data written to {destination}")


# Run the ingester only when executed as a script, not when imported.
if __name__ == "__main__":
    main()