stemedb/scripts/test-alerting.sh

#!/bin/bash
# End-to-end alerting test for StemeDB monitoring
#
# Tests complete alerting pipeline: Prometheus → Alertmanager → PagerDuty + Slack
#
# Usage:
#   ./test-alerting.sh                 # Full end-to-end test
#   ./test-alerting.sh --dry-run       # Show what would be done
#   ./test-alerting.sh --help          # Show usage and environment variables

# Strict mode: stop on a failing command, on expansion of an unset variable,
# and on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ANSI color sequences for output (kept as literal backslash escapes; the
# log helpers are responsible for expanding them)
NC='\033[0m' # No Color
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Configuration: each value may be supplied through the environment; an unset
# or empty variable falls back to the default given here.
: "${ALERTMANAGER_URL:=http://localhost:9093}"
: "${PROMETHEUS_URL:=http://localhost:9090}"
: "${PAGERDUTY_SERVICE_KEY:=}"
: "${SLACK_WEBHOOK_CRITICAL:=}"

# Seconds to wait for downstream delivery before asking for manual checks
MAX_WAIT_SECONDS=30

# Modes (switched on by the --dry-run flag)
DRY_RUN=false

# Parse arguments
#
# Recognized flags:
#   --dry-run   set DRY_RUN=true
#   --help      print usage to stdout and exit 0
# Anything else is reported on stderr and the script exits 1.
#
# The loop walks a snapshot of "$@", so no 'shift' is needed; the positional
# parameters are left untouched.
for arg in "$@"; do
  case "$arg" in
    --dry-run)
      DRY_RUN=true
      ;;
    --help)
      echo "Usage: $0 [--dry-run] [--help]"
      echo ""
      echo "Options:"
      echo "  --dry-run        Show what would be done without executing"
      echo "  --help           Show this help message"
      echo ""
      echo "Environment variables:"
      echo "  ALERTMANAGER_URL           URL for Alertmanager API (default: http://localhost:9093)"
      echo "  PROMETHEUS_URL             URL for Prometheus API (default: http://localhost:9090)"
      echo "  PAGERDUTY_SERVICE_KEY      PagerDuty integration key (required for validation)"
      echo "  SLACK_WEBHOOK_CRITICAL     Slack webhook URL (required for validation)"
      exit 0
      ;;
    *)
      # Usage errors are diagnostics, so they go to stderr.
      echo "Unknown argument: $arg" >&2
      echo "Use --help for usage information" >&2
      exit 1
      ;;
  esac
done

# Helper functions
# Print an informational message with a green [INFO] tag on stdout.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim
log_info() {
  # Only the color codes go through %b (which expands their \033 escapes);
  # the message goes through %s so backslashes in it are not interpreted.
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}

# Print a test-step heading with a blue [STEP] tag on stdout.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text, printed verbatim
log_step() {
  # Only the color codes go through %b (which expands their \033 escapes);
  # the message goes through %s so backslashes in it are not interpreted.
  printf '%b[STEP]%b %s\n' "$BLUE" "$NC" "$1"
}

# Print a warning with a yellow [WARN] tag on stdout.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim
log_warn() {
  # Only the color codes go through %b (which expands their \033 escapes);
  # the message goes through %s so backslashes in it (for example in an
  # HTTP response body) are not interpreted.
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"
}

# Print an error with a red [ERROR] tag on stdout.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim
log_error() {
  # Only the color codes go through %b (which expands their \033 escapes);
  # the message goes through %s so backslashes in it (for example in an
  # HTTP response body) are not interpreted.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"
}

# Check that a single command is available to the shell.
# Arguments: $1 - command name to look up
# Returns:   0 if found; 1 (after logging an error) if not
check_dependency() {
  local tool=$1

  if command -v "$tool" >/dev/null 2>&1; then
    return 0
  fi

  log_error "Required command '$tool' not found"
  return 1
}

# Test step 1: Verify dependencies
#
# Checks curl, jq and date with check_dependency. Every tool is checked even
# after one is found missing, so each missing tool is reported.
# Returns: 0 if all are present, 1 otherwise
verify_dependencies() {
  log_step "Verifying dependencies..."

  local all_found=true
  local tool
  for tool in curl jq date; do
    check_dependency "$tool" || all_found=false
  done

  if [ "$all_found" = true ]; then
    log_info "✓ All dependencies present"
    return 0
  fi

  log_error "Missing required dependencies"
  return 1
}

# Test step 2: Check Alertmanager connectivity
#
# Requests "$ALERTMANAGER_URL/-/healthy" and expects HTTP 200.
# Globals: DRY_RUN, ALERTMANAGER_URL (read)
# Returns: 0 if healthy (or in dry-run mode), 1 otherwise
check_alertmanager() {
  log_step "Checking Alertmanager connectivity..."

  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY RUN] Would check Alertmanager at $ALERTMANAGER_URL"
    return 0
  fi

  # Upper bound on the whole request so an unreachable host cannot hang the run.
  local -r timeout_seconds=10

  # When no HTTP response is received curl exits non-zero and -w reports 000.
  # '|| true' keeps errexit from aborting before the failure is logged below.
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --max-time "$timeout_seconds" \
    "$ALERTMANAGER_URL/-/healthy") || true

  if [ "$http_code" = "200" ]; then
    log_info "✓ Alertmanager is healthy"
    return 0
  fi

  log_error "Alertmanager health check failed (HTTP $http_code)"
  return 1
}

# Test step 3: Send test alert to Alertmanager
#
# Posts one firing alert named StemeDBTestAlert (severity=critical) to the
# Alertmanager v2 API. The v1 alerts endpoint is not served by Alertmanager
# 0.27 and later; v2 acknowledges a successful POST with a 2xx status and no
# JSON status body, so success is judged by the HTTP status code.
# Globals: DRY_RUN, ALERTMANAGER_URL (read)
# Returns: 0 if the alert was accepted (or in dry-run mode), 1 otherwise
send_test_alert() {
  log_step "Sending test alert to Alertmanager..."

  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY RUN] Would send test alert to Alertmanager"
    return 0
  fi

  local timestamp
  timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  # -w appends the status code on its own final line so body and code can be
  # separated below. '|| true' lets a transport failure (reported as code 000)
  # fall through to the error branch instead of tripping errexit.
  local response
  response=$(curl -s -w '\n%{http_code}' --max-time 10 \
    -X POST "$ALERTMANAGER_URL/api/v2/alerts" \
    -H 'Content-Type: application/json' \
    -d '[
      {
        "labels": {
          "alertname": "StemeDBTestAlert",
          "severity": "critical",
          "instance": "test-instance",
          "job": "stemedb-api"
        },
        "annotations": {
          "summary": "End-to-end alerting test",
          "description": "This is a test alert from test-alerting.sh. Safe to acknowledge."
        },
        "startsAt": "'"$timestamp"'",
        "generatorURL": "http://localhost:9090/graph"
      }
    ]') || true

  # Last line is the status code; everything before it is the response body.
  local http_code=${response##*$'\n'}
  local body=${response%$'\n'*}

  case "$http_code" in
    2??)
      log_info "✓ Test alert sent successfully"
      log_info "  Alert will be processed by Alertmanager routing rules"
      return 0
      ;;
  esac

  log_error "Failed to send test alert"
  log_error "Response (HTTP $http_code): $body"
  return 1
}

# Test step 4: Verify PagerDuty incident creation
#
# No PagerDuty API call is made here: when PAGERDUTY_SERVICE_KEY is set the
# function waits MAX_WAIT_SECONDS and then prompts for a manual check.
# Globals: DRY_RUN, PAGERDUTY_SERVICE_KEY, MAX_WAIT_SECONDS (read)
# Returns: always 0
verify_pagerduty_incident() {
  log_step "Verifying PagerDuty incident creation..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify PagerDuty incident"
  elif [[ -z "$PAGERDUTY_SERVICE_KEY" ]]; then
    log_warn "PAGERDUTY_SERVICE_KEY not set, skipping PagerDuty verification"
    log_info "Set it to verify PagerDuty integration"
  else
    log_info "Waiting ${MAX_WAIT_SECONDS}s for incident to be created..."
    sleep "$MAX_WAIT_SECONDS"

    log_info "✓ Please check PagerDuty for incident titled 'StemeDBTestAlert'"
    log_info "  Expected: Incident should appear within $MAX_WAIT_SECONDS seconds"
    log_info "  Remember to acknowledge/resolve the test incident"
  fi

  return 0
}

# Test step 5: Verify Slack message
#
# No Slack API call is made here: when SLACK_WEBHOOK_CRITICAL is set the
# function only prompts for a manual check of the channel.
# Globals: DRY_RUN, SLACK_WEBHOOK_CRITICAL (read)
# Returns: always 0
verify_slack_message() {
  log_step "Verifying Slack message delivery..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify Slack message"
  elif [[ -n "$SLACK_WEBHOOK_CRITICAL" ]]; then
    log_info "✓ Please check Slack #stemedb-alerts-critical channel"
    log_info "  Expected: Message titled 'StemeDBTestAlert' should appear"
    log_info "  Verify color coding (red) and formatting are correct"
  else
    log_warn "SLACK_WEBHOOK_CRITICAL not set, skipping Slack verification"
    log_info "Set it to verify Slack integration"
  fi

  return 0
}

# Test step 6: Measure end-to-end latency
#
# Waits MAX_WAIT_SECONDS and reports the elapsed wall-clock time against the
# same MAX_WAIT_SECONDS target, so the wait and the threshold cannot drift
# apart.
# NOTE(review): the start timestamp is taken inside this function, not when
# the alert was posted, so the reported figure is the duration of the wait
# itself rather than a true send-to-delivery latency.
# Globals: DRY_RUN, MAX_WAIT_SECONDS (read)
# Returns: always 0 (exceeding the target only produces a warning)
measure_latency() {
  log_step "Measuring end-to-end latency..."

  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY RUN] Would measure latency"
    return 0
  fi

  local start_time
  start_time=$(date +%s)

  log_info "Alert sent at: $(date -u +%H:%M:%S)"
  log_info "Waiting ${MAX_WAIT_SECONDS}s for delivery..."

  sleep "$MAX_WAIT_SECONDS"

  local end_time
  end_time=$(date +%s)
  local latency=$((end_time - start_time))

  log_info "✓ End-to-end latency: ${latency}s"

  if [ "$latency" -le "$MAX_WAIT_SECONDS" ]; then
    log_info "  ✓ Latency within target (<${MAX_WAIT_SECONDS}s)"
  else
    log_warn "  Warning: Latency exceeds target (${latency}s > ${MAX_WAIT_SECONDS}s)"
  fi

  return 0
}

# Test step 7: Cleanup test alert
#
# Re-posts the StemeDBTestAlert label set with endsAt set to the current time
# so Alertmanager treats the alert as resolved. Uses the v2 API: the v1 alerts
# endpoint is not served by Alertmanager 0.27 and later, and v2 acknowledges a
# successful POST with a 2xx status and no JSON status body.
# Globals: DRY_RUN, ALERTMANAGER_URL (read)
# Returns: always 0 (a failed resolve is only a warning)
cleanup_test_alert() {
  log_step "Cleaning up test alert..."

  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY RUN] Would resolve test alert"
    return 0
  fi

  local timestamp
  timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  # Send resolve signal. -w appends the status code on its own final line;
  # '|| true' keeps a transport failure (code 000) on the warning path.
  local response
  response=$(curl -s -w '\n%{http_code}' --max-time 10 \
    -X POST "$ALERTMANAGER_URL/api/v2/alerts" \
    -H 'Content-Type: application/json' \
    -d '[
      {
        "labels": {
          "alertname": "StemeDBTestAlert",
          "severity": "critical",
          "instance": "test-instance",
          "job": "stemedb-api"
        },
        "annotations": {
          "summary": "End-to-end alerting test",
          "description": "This is a test alert from test-alerting.sh. Safe to acknowledge."
        },
        "endsAt": "'"$timestamp"'"
      }
    ]') || true

  # Last line is the status code; everything before it is the response body.
  local http_code=${response##*$'\n'}
  local body=${response%$'\n'*}

  case "$http_code" in
    2??)
      log_info "✓ Test alert resolved in Alertmanager"
      ;;
    *)
      log_warn "Failed to resolve test alert (may auto-resolve)"
      log_warn "Response (HTTP $http_code): $body"
      ;;
  esac

  log_info "Please manually resolve/acknowledge any test incidents in:"
  log_info "  - PagerDuty (incident titled 'StemeDBTestAlert')"
  log_info "  - Slack (message in #stemedb-alerts-critical)"

  return 0
}

# Generate test report
#
# Prints the endpoints in use, whether the PagerDuty and Slack settings are
# non-empty, and the manual verification / cleanup checklists.
# Globals: ALERTMANAGER_URL, PROMETHEUS_URL, PAGERDUTY_SERVICE_KEY,
#          SLACK_WEBHOOK_CRITICAL, MAX_WAIT_SECONDS (read)
generate_report() {
  log_step "Generating test report..."

  local pagerduty_state="Not configured"
  if [ -n "$PAGERDUTY_SERVICE_KEY" ]; then
    pagerduty_state="Configured"
  fi

  local slack_state="Not configured"
  if [ -n "$SLACK_WEBHOOK_CRITICAL" ]; then
    slack_state="Configured"
  fi

  local rule="========================================="

  # One argument per output line.
  printf '%s\n' \
    "" \
    "$rule" \
    "End-to-End Alerting Test Report" \
    "$rule" \
    "" \
    "Test Components:" \
    "  - Alertmanager URL: $ALERTMANAGER_URL" \
    "  - Prometheus URL: $PROMETHEUS_URL" \
    "  - PagerDuty: $pagerduty_state" \
    "  - Slack: $slack_state" \
    "" \
    "Manual Verification Checklist:" \
    "  [ ] PagerDuty incident received within ${MAX_WAIT_SECONDS}s" \
    "  [ ] Slack message posted to #stemedb-alerts-critical" \
    "  [ ] Message formatting is correct (color, fields, emoji)" \
    "  [ ] Escalation policy triggered correctly" \
    "  [ ] End-to-end latency < 30s" \
    "" \
    "Cleanup Tasks:" \
    "  [ ] Acknowledge/resolve PagerDuty test incident" \
    "  [ ] Optionally delete Slack test message" \
    "" \
    "$rule"
}

# Main execution
#
# Runs every test step in order. A failing step is recorded but does not stop
# the remaining steps. Afterwards the report is printed and the script exits
# 0 if all steps succeeded, 1 otherwise.
# Globals: DRY_RUN (read)
main() {
  local banner="========================================="
  printf '%s\n' "$banner" "StemeDB End-to-End Alerting Test" "$banner" ""

  if [ "$DRY_RUN" = true ]; then
    log_info "Running in DRY RUN mode - no alerts will be sent"
  fi

  # Test steps, in execution order
  local -a steps=(
    verify_dependencies
    check_alertmanager
    send_test_alert
    verify_pagerduty_incident
    verify_slack_message
    measure_latency
    cleanup_test_alert
  )

  local status=0
  local step
  for step in "${steps[@]}"; do
    "$step" || status=1
  done

  # Generate report
  generate_report

  echo ""
  if [ "$status" -ne 0 ]; then
    log_error "✗ End-to-end alerting test FAILED"
    log_error "  Fix errors before deploying to production"
    exit 1
  fi

  log_info "✓ End-to-end alerting test COMPLETED"
  log_info "  Please complete manual verification checklist above"
  exit 0
}

# Run main function. No arguments are forwarded: the command-line flags were
# already handled by the top-level argument loop, and main reads the resulting
# DRY_RUN global. main ends the script itself via exit.
main