stemedb/scripts/setup-pagerduty.sh

#!/bin/bash
# Setup and validate PagerDuty integration for StemeDB alerting
#
# Usage:
#   ./setup-pagerduty.sh                    # Full validation
#   ./setup-pagerduty.sh --validate-only    # Skip test incident creation
#   ./setup-pagerduty.sh --dry-run          # Show what would be done

# Strict mode: stop on unhandled command failures, on use of unset
# variables, and on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ANSI color sequences for log output. They are stored with literal
# backslashes and only turned into escape characters when printed.
NC='\033[0m' # reset / no color
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'

# PagerDuty settings. Values already present in the environment win;
# anything missing becomes the empty string so later emptiness checks are
# safe under nounset.
: "${PAGERDUTY_SERVICE_KEY:=}"
: "${PAGERDUTY_API_TOKEN:=}"
: "${PAGERDUTY_SERVICE_ID:=}"

# Run-mode switches, both off unless a command-line flag turns them on.
DRY_RUN=false
VALIDATE_ONLY=false

# Parse arguments
#
# Consumes the positional parameters one at a time. Flags may appear in any
# order. --help prints usage to stdout and exits 0; an unrecognized argument
# is reported on stderr and exits 1.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --validate-only)
      VALIDATE_ONLY=true
      ;;
    --dry-run)
      DRY_RUN=true
      ;;
    --help)
      echo "Usage: $0 [--validate-only] [--dry-run] [--help]"
      echo ""
      echo "Options:"
      echo "  --validate-only  Skip test incident creation"
      echo "  --dry-run        Show what would be done without executing"
      echo "  --help           Show this help message"
      echo ""
      echo "Environment variables:"
      echo "  PAGERDUTY_SERVICE_KEY  Integration key from PagerDuty service"
      echo "  PAGERDUTY_API_TOKEN    API token for PagerDuty API"
      echo "  PAGERDUTY_SERVICE_ID   Service ID (for policy validation)"
      exit 0
      ;;
    *)
      # Usage errors are diagnostics, so they go to stderr.
      printf 'Unknown argument: %s\n' "$1" >&2
      printf 'Use --help for usage information\n' >&2
      exit 1
      ;;
  esac
  shift
done

# Helper functions
# Print an informational message to stdout, prefixed with a green [INFO] tag.
# $1 is the message text; it is printed verbatim, so backslash sequences
# inside it (for example in text taken from an API response) are not
# interpreted.
log_info() {
  # %b expands the escape sequences stored in the color variables, while %s
  # keeps the message itself literal.
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "${1-}"
}

# Print a warning message to stdout, prefixed with a yellow [WARN] tag.
# $1 is the message text; it is printed verbatim, so backslash sequences
# inside it are not interpreted.
log_warn() {
  # %b expands the escape sequences stored in the color variables, while %s
  # keeps the message itself literal.
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "${1-}"
}

# Print an error message to stdout, prefixed with a red [ERROR] tag.
# $1 is the message text; it is printed verbatim. This matters here because
# callers pass raw API responses, and JSON escapes such as \n or \uXXXX must
# be shown as received rather than expanded.
log_error() {
  # %b expands the escape sequences stored in the color variables, while %s
  # keeps the message itself literal.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}"
}

# Check that the command named by $1 can be resolved by the shell.
# Returns 0 silently when it can; otherwise logs an error and returns 1.
check_dependency() {
  local tool=$1

  command -v "$tool" > /dev/null 2>&1 && return 0

  log_error "Required command '$tool' not found"
  return 1
}

# Validation step 1: Check dependencies
#
# Confirms that curl and jq are both available. Every tool is checked (and
# each missing one logged) before the overall verdict is given.
# Returns 0 when both are present, 1 otherwise.
validate_dependencies() {
  log_info "Checking dependencies..."

  local tool
  local all_found=true
  for tool in curl jq; do
    check_dependency "$tool" || all_found=false
  done

  if [ "$all_found" = false ]; then
    log_error "Missing required dependencies. Install curl and jq."
    return 1
  fi

  log_info "✓ All dependencies present"
  return 0
}

# Validation step 2: Check service key format
#
# Reads PAGERDUTY_SERVICE_KEY. An empty key is an error (returns 1). A key
# whose length is not 32 only produces a warning; the function still
# returns 0 in that case.
validate_service_key() {
  log_info "Validating PagerDuty service key..."

  local key_len=${#PAGERDUTY_SERVICE_KEY}

  if ((key_len == 0)); then
    log_error "PAGERDUTY_SERVICE_KEY environment variable not set"
    log_info "Set it with: export PAGERDUTY_SERVICE_KEY='your-key-here'"
    return 1
  fi

  # The length check is advisory only: 32 characters is the usual size.
  if ((key_len != 32)); then
    log_warn "Service key length ($key_len) is unusual (expected 32)"
  fi

  log_info "✓ Service key format validated"
  return 0
}

# Validation step 3: Test incident creation
#
# Sends one "info"-severity trigger event to the PagerDuty Events API v2 and
# checks that the reply reports success.
#
# Globals read:
#   DRY_RUN, VALIDATE_ONLY  - when either is "true" no request is sent
#   PAGERDUTY_SERVICE_KEY   - integration key, sent as the event routing_key
# Returns:
#   0 when the step is skipped or the event is accepted;
#   1 when the key is empty, the payload cannot be built, the HTTP request
#     fails, or the reply does not report success.
test_incident_creation() {
  log_info "Testing incident creation..."

  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY RUN] Would send test alert to PagerDuty"
    return 0
  fi

  if [ "$VALIDATE_ONLY" = true ]; then
    log_info "Skipping test incident (--validate-only mode)"
    return 0
  fi

  # Never send a request that cannot be routed.
  if [ -z "$PAGERDUTY_SERVICE_KEY" ]; then
    log_error "Cannot create test incident: PAGERDUTY_SERVICE_KEY is not set"
    return 1
  fi

  # Build the body with jq so the key is always valid, correctly escaped JSON
  # instead of being spliced into a string by hand.
  local payload
  if ! payload=$(jq -nc \
    --arg routing_key "$PAGERDUTY_SERVICE_KEY" \
    --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    '{
      routing_key: $routing_key,
      event_action: "trigger",
      payload: {
        summary: "StemeDB Setup Test - Safe to Acknowledge",
        severity: "info",
        source: "stemedb-setup-script",
        custom_details: {test: true, timestamp: $timestamp}
      }
    }'); then
    log_error "Failed to build test incident payload"
    return 1
  fi

  # -sS silences the progress meter but keeps real errors on stderr, so
  # $response holds only the API reply. The body is passed on stdin to keep
  # the key out of the process list. The Events API authenticates through
  # routing_key in the body, so no Authorization header is sent.
  local response
  if ! response=$(curl -sS -X POST https://events.pagerduty.com/v2/enqueue \
    --connect-timeout 10 --max-time 30 \
    -H 'Content-Type: application/json' \
    --data-binary @- <<< "$payload"); then
    log_error "Request to PagerDuty Events API failed"
    return 1
  fi

  # Check response
  if printf '%s\n' "$response" | jq -e '.status == "success"' > /dev/null 2>&1; then
    local dedup_key
    dedup_key=$(printf '%s\n' "$response" | jq -r '.dedup_key')
    log_info "✓ Test incident created successfully"
    log_info "  Incident key: $dedup_key"
    log_info "  Please acknowledge this test incident in PagerDuty"
    return 0
  fi

  log_error "Failed to create test incident"
  log_error "Response: $response"
  return 1
}

# Validation step 4: Verify escalation policy
#
# Looks up the service identified by PAGERDUTY_SERVICE_ID through the
# PagerDuty REST API, authenticating with PAGERDUTY_API_TOKEN, and logs the
# service name and its escalation policy summary.
#
# The check is optional: if either variable is empty it is skipped with a
# warning, and in dry-run mode it is only announced. All of those cases
# return 0. Returns 1 only when the reply has no usable "service" object.
verify_escalation_policy() {
  log_info "Verifying escalation policy..."

  if [[ -z "$PAGERDUTY_API_TOKEN" ]]; then
    log_warn "PAGERDUTY_API_TOKEN not set, skipping policy validation"
    log_info "Set it with: export PAGERDUTY_API_TOKEN='your-token-here'"
    return 0
  fi

  if [[ -z "$PAGERDUTY_SERVICE_ID" ]]; then
    log_warn "PAGERDUTY_SERVICE_ID not set, skipping policy validation"
    return 0
  fi

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify escalation policy via API"
    return 0
  fi

  # Fetch the service record.
  local service_url="https://api.pagerduty.com/services/$PAGERDUTY_SERVICE_ID"
  local body
  body=$(curl -s -X GET \
    "$service_url" \
    -H 'Accept: application/vnd.pagerduty+json;version=2' \
    -H "Authorization: Token token=$PAGERDUTY_API_TOKEN")

  # Anything without a truthy .service (error document, empty or non-JSON
  # reply) counts as a failed lookup.
  if ! echo "$body" | jq -e '.service' > /dev/null 2>&1; then
    log_error "Failed to fetch service details"
    log_error "Response: $body"
    return 1
  fi

  local name
  local policy
  name=$(echo "$body" | jq -r '.service.name')
  policy=$(echo "$body" | jq -r '.service.escalation_policy.summary')

  log_info "✓ Service found: $name"
  log_info "  Escalation policy: $policy"
  return 0
}

# Validation step 5: Check routing configuration
#
# Inspects an Alertmanager config file for signs of PagerDuty routing. The
# checks are plain text searches, not YAML parsing: the string "pagerduty"
# anywhere in the file counts as a configured receiver, and the patterns
# "severity.*critical" / "severity.*warning" count as severity routing.
#
# Arguments:
#   $1 - optional path to the Alertmanager config file
# Globals read:
#   ALERTMANAGER_CONFIG - optional path, used when $1 is not given
# When neither is supplied, /etc/prometheus/alertmanager.yml is used.
#
# Returns: always 0. Every finding is reported as info or warning only.
verify_routing() {
  log_info "Verifying alert routing configuration..."

  # Resolve the config location: explicit argument, then environment, then
  # the conventional default.
  local default_config="/etc/prometheus/alertmanager.yml"
  local alertmanager_config="${1:-${ALERTMANAGER_CONFIG:-$default_config}}"

  if [ ! -f "$alertmanager_config" ]; then
    log_warn "Alertmanager config not found at $alertmanager_config"
    log_info "Ensure PagerDuty routing is configured in Alertmanager"
    return 0
  fi

  # Verify PagerDuty receiver is configured
  if grep -q -- "pagerduty" "$alertmanager_config"; then
    log_info "✓ PagerDuty receiver configured in Alertmanager"

    # Check for critical/warning routing
    if grep -q -- "severity.*critical" "$alertmanager_config"; then
      log_info "  ✓ Critical severity routing found"
    else
      log_warn "  Warning: No explicit critical severity routing"
    fi

    if grep -q -- "severity.*warning" "$alertmanager_config"; then
      log_info "  ✓ Warning severity routing found"
    else
      log_warn "  Warning: No explicit warning severity routing"
    fi
  else
    log_warn "PagerDuty receiver not found in Alertmanager config"
    log_info "Add a PagerDuty receiver to $alertmanager_config"
  fi

  return 0
}

# Main execution
#
# Prints a banner, runs every validation step in order, and exits the script
# with 0 if all steps succeeded or 1 if any step failed. A failing step does
# not stop the later steps from running.
main() {
  local banner="========================================="

  printf '%s\n' "$banner" "StemeDB PagerDuty Setup Validation" "$banner" ""

  if [[ "$DRY_RUN" == true ]]; then
    log_info "Running in DRY RUN mode - no changes will be made"
  fi

  # Run the steps; remember whether any of them reported failure.
  local step
  local status=0
  for step in \
    validate_dependencies \
    validate_service_key \
    test_incident_creation \
    verify_escalation_policy \
    verify_routing; do
    "$step" || status=1
  done

  printf '\n%s\n' "$banner"

  if ((status != 0)); then
    log_error "✗ PagerDuty validation FAILED"
    printf '%s\n' "$banner"
    exit 1
  fi

  log_info "✓ PagerDuty validation PASSED"
  printf '%s\n' "$banner"
  exit 0
}

# Run main function. Command-line flags were already handled by the
# top-level argument parser, so main is called without arguments; it ends
# the script itself via exit.
main