#!/bin/bash
# Setup and validate PagerDuty integration for StemeDB alerting
#
# Usage:
#   ./setup-pagerduty.sh                  # Full validation
#   ./setup-pagerduty.sh --validate-only  # Skip test incident creation
#   ./setup-pagerduty.sh --dry-run        # Show what would be done
#
# Environment variables:
#   PAGERDUTY_SERVICE_KEY  Integration (routing) key of the PagerDuty service
#   PAGERDUTY_API_TOKEN    REST API token (optional; enables the policy check)
#   PAGERDUTY_SERVICE_ID   Service ID (optional; enables the policy check)
#   PAGERDUTY_EVENTS_URL   Events API v2 endpoint (has a default)
#   PAGERDUTY_API_URL      REST API base URL (has a default)
#   ALERTMANAGER_CONFIG    Alertmanager config path (has a default)
#
# Exit status: 0 when every validation step passed, 1 otherwise.

set -euo pipefail

# Colors for output (escape sequences are expanded by printf's %b)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

# Configuration (override with environment variables)
PAGERDUTY_SERVICE_KEY="${PAGERDUTY_SERVICE_KEY:-}"
PAGERDUTY_API_TOKEN="${PAGERDUTY_API_TOKEN:-}"
PAGERDUTY_SERVICE_ID="${PAGERDUTY_SERVICE_ID:-}"
PAGERDUTY_EVENTS_URL="${PAGERDUTY_EVENTS_URL:-https://events.pagerduty.com/v2/enqueue}"
PAGERDUTY_API_URL="${PAGERDUTY_API_URL:-https://api.pagerduty.com}"
ALERTMANAGER_CONFIG="${ALERTMANAGER_CONFIG:-/etc/prometheus/alertmanager.yml}"

# Upper bound (seconds) for each HTTP request so the script cannot hang.
readonly CURL_MAX_TIME=30

# Modes
VALIDATE_ONLY=false
DRY_RUN=false

#######################################
# Print the help text to stdout.
# Globals:   none ($0 is used for the program name)
# Arguments: none
#######################################
usage() {
  cat <<EOF
Usage: $0 [--validate-only] [--dry-run] [--help]

Options:
  --validate-only    Skip test incident creation
  --dry-run          Show what would be done without executing
  --help             Show this help message

Environment variables:
  PAGERDUTY_SERVICE_KEY    Integration key from PagerDuty service
  PAGERDUTY_API_TOKEN      API token for PagerDuty API
  PAGERDUTY_SERVICE_ID     Service ID (for policy validation)
  PAGERDUTY_EVENTS_URL     Events API endpoint (default: https://events.pagerduty.com/v2/enqueue)
  PAGERDUTY_API_URL        REST API base URL (default: https://api.pagerduty.com)
  ALERTMANAGER_CONFIG      Alertmanager config (default: /etc/prometheus/alertmanager.yml)
EOF
}

# Parse arguments. Iterating "$@" directly needs no shift.
for arg in "$@"; do
  case "$arg" in
    --validate-only)
      VALIDATE_ONLY=true
      ;;
    --dry-run)
      DRY_RUN=true
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      echo "Use --help for usage information" >&2
      exit 1
      ;;
  esac
done

# Helper functions
#
# The message is printed with %s so that backslashes inside it (for example
# in a JSON response being logged) are emitted literally; only the color
# codes go through %b.

# Log an informational message ($1) to stdout.
log_info() {
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}

# Log a warning message ($1) to stderr.
log_warn() {
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1" >&2
}

# Log an error message ($1) to stderr.
log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}

#######################################
# Check that a command is available on PATH.
# Arguments: $1 - command name
# Returns:   0 if found, 1 (after logging an error) otherwise
#######################################
check_dependency() {
  if ! command -v "$1" &> /dev/null; then
    log_error "Required command '$1' not found"
    return 1
  fi
}

#######################################
# Validation step 1: Check dependencies (curl and jq).
# Returns: 0 if both are present, 1 otherwise
#######################################
validate_dependencies() {
  log_info "Checking dependencies..."

  local missing=0
  local cmd
  for cmd in curl jq; do
    # Keep looping so every missing tool is reported, not just the first.
    if ! check_dependency "$cmd"; then
      missing=1
    fi
  done

  if (( missing == 1 )); then
    log_error "Missing required dependencies. Install curl and jq."
    return 1
  fi

  log_info "✓ All dependencies present"
  return 0
}

#######################################
# Validation step 2: Check service key format.
# Globals: PAGERDUTY_SERVICE_KEY (read)
# Returns: 0 if the key is set (an unusual length only warns), 1 if empty
#######################################
validate_service_key() {
  log_info "Validating PagerDuty service key..."

  if [[ -z "$PAGERDUTY_SERVICE_KEY" ]]; then
    log_error "PAGERDUTY_SERVICE_KEY environment variable not set"
    log_info "Set it with: export PAGERDUTY_SERVICE_KEY='your-key-here'"
    return 1
  fi

  # Service keys are typically 32 characters; other lengths only warn.
  if (( ${#PAGERDUTY_SERVICE_KEY} != 32 )); then
    log_warn "Service key length (${#PAGERDUTY_SERVICE_KEY}) is unusual (expected 32)"
  fi

  log_info "✓ Service key format validated"
  return 0
}

#######################################
# Validation step 3: Test incident creation.
# Sends an info-severity trigger event to the Events API endpoint unless
# running in --dry-run or --validate-only mode.
# Globals: DRY_RUN, VALIDATE_ONLY, PAGERDUTY_SERVICE_KEY,
#          PAGERDUTY_EVENTS_URL, CURL_MAX_TIME (all read)
# Returns: 0 on success or when skipped, 1 on any failure
#######################################
test_incident_creation() {
  log_info "Testing incident creation..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would send test alert to PagerDuty"
    return 0
  fi

  if [[ "$VALIDATE_ONLY" == true ]]; then
    log_info "Skipping test incident (--validate-only mode)"
    return 0
  fi

  # Without a routing key the request cannot succeed; fail before sending.
  if [[ -z "$PAGERDUTY_SERVICE_KEY" ]]; then
    log_error "Failed to create test incident"
    log_error "Response: PAGERDUTY_SERVICE_KEY is empty"
    return 1
  fi

  # Build the body with jq so the key and timestamp are JSON-escaped
  # rather than spliced into a hand-written string.
  local payload=""
  payload=$(
    jq -n \
      --arg key "$PAGERDUTY_SERVICE_KEY" \
      --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
      '{
        routing_key: $key,
        event_action: "trigger",
        payload: {
          summary: "StemeDB Setup Test - Safe to Acknowledge",
          severity: "info",
          source: "stemedb-setup-script",
          custom_details: {
            test: true,
            timestamp: $ts
          }
        }
      }'
  ) || {
    log_error "Failed to create test incident"
    log_error "Response: could not build request payload"
    return 1
  }

  # -sS: no progress meter (it would otherwise be captured via 2>&1 and make
  # the response unparseable) but transport errors are still reported.
  # The body is passed on stdin so the routing key is not visible in argv.
  # Events API v2 authenticates through routing_key in the body, so no
  # Authorization header is sent.
  local response=""
  response=$(
    curl -sS --max-time "$CURL_MAX_TIME" -X POST "$PAGERDUTY_EVENTS_URL" \
      -H 'Content-Type: application/json' \
      --data-binary @- <<<"$payload" 2>&1
  ) || {
    log_error "Failed to create test incident"
    log_error "Response: $response"
    return 1
  }

  # Check response
  if jq -e '.status == "success"' <<<"$response" > /dev/null 2>&1; then
    local dedup_key
    dedup_key=$(jq -r '.dedup_key' <<<"$response")
    log_info "✓ Test incident created successfully"
    log_info "  Incident key: $dedup_key"
    log_info "  Please acknowledge this test incident in PagerDuty"
    return 0
  else
    log_error "Failed to create test incident"
    log_error "Response: $response"
    return 1
  fi
}

#######################################
# Validation step 4: Verify escalation policy.
# Fetches the service from the REST API and logs its name and escalation
# policy summary. Skipped (returns 0) when the token or service ID is unset
# or in --dry-run mode.
# Globals: PAGERDUTY_API_TOKEN, PAGERDUTY_SERVICE_ID, PAGERDUTY_API_URL,
#          DRY_RUN, CURL_MAX_TIME (all read)
# Returns: 0 on success or when skipped, 1 if the service cannot be fetched
#######################################
verify_escalation_policy() {
  log_info "Verifying escalation policy..."

  if [[ -z "$PAGERDUTY_API_TOKEN" ]]; then
    log_warn "PAGERDUTY_API_TOKEN not set, skipping policy validation"
    log_info "Set it with: export PAGERDUTY_API_TOKEN='your-token-here'"
    return 0
  fi

  if [[ -z "$PAGERDUTY_SERVICE_ID" ]]; then
    log_warn "PAGERDUTY_SERVICE_ID not set, skipping policy validation"
    return 0
  fi

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify escalation policy via API"
    return 0
  fi

  # Fetch service details. stderr is captured so a transport error shows up
  # in the logged response instead of an empty string.
  local response=""
  response=$(
    curl -sS --max-time "$CURL_MAX_TIME" -X GET \
      "$PAGERDUTY_API_URL/services/$PAGERDUTY_SERVICE_ID" \
      -H 'Accept: application/vnd.pagerduty+json;version=2' \
      -H "Authorization: Token token=$PAGERDUTY_API_TOKEN" 2>&1
  ) || {
    log_error "Failed to fetch service details"
    log_error "Response: $response"
    return 1
  }

  if jq -e '.service' <<<"$response" > /dev/null 2>&1; then
    local service_name
    local escalation_policy
    service_name=$(jq -r '.service.name' <<<"$response")
    escalation_policy=$(jq -r '.service.escalation_policy.summary' <<<"$response")

    log_info "✓ Service found: $service_name"
    log_info "  Escalation policy: $escalation_policy"
    return 0
  else
    log_error "Failed to fetch service details"
    log_error "Response: $response"
    return 1
  fi
}

#######################################
# Validation step 5: Check routing configuration.
# Greps the Alertmanager config for a PagerDuty receiver and for
# critical/warning severity matchers. Purely advisory: findings are logged
# but never fail the run.
# Globals: ALERTMANAGER_CONFIG (read)
# Returns: always 0
#######################################
verify_routing() {
  log_info "Verifying alert routing configuration..."

  # Check if Alertmanager config exists
  local alertmanager_config="$ALERTMANAGER_CONFIG"

  if [[ ! -f "$alertmanager_config" ]]; then
    log_warn "Alertmanager config not found at $alertmanager_config"
    log_info "Ensure PagerDuty routing is configured in Alertmanager"
    return 0
  fi

  # Verify PagerDuty receiver is configured
  if grep -q -- "pagerduty" "$alertmanager_config"; then
    log_info "✓ PagerDuty receiver configured in Alertmanager"

    # Check for critical/warning routing
    if grep -q -- "severity.*critical" "$alertmanager_config"; then
      log_info "  ✓ Critical severity routing found"
    else
      log_warn "  Warning: No explicit critical severity routing"
    fi

    if grep -q -- "severity.*warning" "$alertmanager_config"; then
      log_info "  ✓ Warning severity routing found"
    else
      log_warn "  Warning: No explicit warning severity routing"
    fi
  else
    log_warn "PagerDuty receiver not found in Alertmanager config"
    log_info "Add a PagerDuty receiver to $alertmanager_config"
  fi

  return 0
}

#######################################
# Main execution: run every validation step, then print a summary.
# All steps are attempted so one run reports every problem; the steps that
# need curl/jq are skipped when those tools are missing.
# Globals: DRY_RUN (read)
# Exits:   0 if all steps passed, 1 otherwise
#######################################
main() {
  echo "========================================="
  echo "StemeDB PagerDuty Setup Validation"
  echo "========================================="
  echo ""

  if [[ "$DRY_RUN" == true ]]; then
    log_info "Running in DRY RUN mode - no changes will be made"
  fi

  local failed=0
  local deps_ok=true

  # Run validation steps
  if ! validate_dependencies; then
    deps_ok=false
    failed=1
  fi

  validate_service_key || failed=1

  if [[ "$deps_ok" == true ]]; then
    test_incident_creation || failed=1
    verify_escalation_policy || failed=1
  else
    # Without jq a created incident could not be confirmed, so do not send.
    log_warn "Skipping PagerDuty API checks because dependencies are missing"
  fi

  verify_routing || failed=1

  echo ""
  echo "========================================="
  if (( failed == 0 )); then
    log_info "✓ PagerDuty validation PASSED"
    echo "========================================="
    exit 0
  else
    log_error "✗ PagerDuty validation FAILED"
    echo "========================================="
    exit 1
  fi
}

# Run main function
main