#!/bin/bash
#
# End-to-end alerting test for StemeDB monitoring
#
# Tests complete alerting pipeline: Prometheus → Alertmanager → PagerDuty + Slack
#
# The script injects a synthetic "StemeDBTestAlert" into Alertmanager, waits
# for the delivery window to elapse, prints a manual verification checklist
# and finally resolves the alert again. Delivery to PagerDuty and Slack is
# NOT observed by the script itself; those checks are manual.
#
# Usage:
#   ./test-alerting.sh            # Full end-to-end test
#   ./test-alerting.sh --dry-run  # Show what would be done
#   ./test-alerting.sh --help     # Show usage

set -euo pipefail

# Colors for output (real escape bytes, so no `echo -e` is needed to print them)
readonly RED=$'\033[0;31m'
readonly GREEN=$'\033[0;32m'
readonly YELLOW=$'\033[1;33m'
readonly BLUE=$'\033[0;34m'
readonly NC=$'\033[0m' # No Color

# Configuration (all overridable from the environment unless noted)
ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://localhost:9093}"
ALERTMANAGER_URL="${ALERTMANAGER_URL%/}" # tolerate a trailing slash
PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:9090}"
PROMETHEUS_URL="${PROMETHEUS_URL%/}"
PAGERDUTY_SERVICE_KEY="${PAGERDUTY_SERVICE_KEY:-}"
SLACK_WEBHOOK_CRITICAL="${SLACK_WEBHOOK_CRITICAL:-}"
# Path of the "post alerts" endpoint; override (e.g. /api/v2/alerts) if the
# target Alertmanager does not serve the v1 API.
ALERTMANAGER_ALERTS_PATH="${ALERTMANAGER_ALERTS_PATH:-/api/v1/alerts}"
# Upper bound for every HTTP call so an unresponsive endpoint cannot hang the test.
CURL_TIMEOUT_SECONDS="${CURL_TIMEOUT_SECONDS:-10}"
readonly MAX_WAIT_SECONDS=30

# Modes
DRY_RUN=false

# Set by send_test_alert once the alert has been accepted; read by the wait
# and latency steps so they refer to the real send time.
ALERT_SENT_EPOCH=""
ALERT_SENT_CLOCK=""

#######################################
# Prints the usage/help text to stdout.
# Arguments: none
#######################################
usage() {
  echo "Usage: $0 [--dry-run] [--help]"
  echo ""
  echo "Options:"
  echo "  --dry-run    Show what would be done without executing"
  echo "  --help       Show this help message"
  echo ""
  echo "Environment variables:"
  echo "  ALERTMANAGER_URL          URL for Alertmanager API (default: http://localhost:9093)"
  echo "  PROMETHEUS_URL            URL for Prometheus API (default: http://localhost:9090)"
  echo "  PAGERDUTY_SERVICE_KEY     PagerDuty integration key (required for validation)"
  echo "  SLACK_WEBHOOK_CRITICAL    Slack webhook URL (required for validation)"
  echo "  ALERTMANAGER_ALERTS_PATH  Alert POST endpoint path (default: /api/v1/alerts)"
  echo "  CURL_TIMEOUT_SECONDS      Per-request timeout in seconds (default: 10)"
}

#######################################
# Parses command-line arguments.
# Globals:   DRY_RUN (written)
# Arguments: the script's positional parameters
# Returns:   exits 0 after --help, exits 1 on an unknown argument
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --dry-run)
        DRY_RUN=true
        ;;
      --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $arg" >&2
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}

# Helper functions
#
# Messages are passed through '%s' so backslashes in them (e.g. inside a JSON
# response body) are printed literally instead of being interpreted.
log_info() {
  printf '%s[INFO]%s %s\n' "$GREEN" "$NC" "$*"
}

log_step() {
  printf '%s[STEP]%s %s\n' "$BLUE" "$NC" "$*"
}

log_warn() {
  printf '%s[WARN]%s %s\n' "$YELLOW" "$NC" "$*"
}

log_error() {
  printf '%s[ERROR]%s %s\n' "$RED" "$NC" "$*"
}

#######################################
# Checks that a command is available on PATH.
# Arguments: $1 - command name
# Returns:   0 if found, 1 (after logging an error) otherwise
#######################################
check_dependency() {
  if ! command -v "$1" &> /dev/null; then
    log_error "Required command '$1' not found"
    return 1
  fi
}

#######################################
# Test step 1: Verify dependencies
# Checks every required tool so all missing ones are reported in one run.
# Returns: 0 if curl, jq and date are all present, 1 otherwise
#######################################
verify_dependencies() {
  log_step "Verifying dependencies..."

  local missing=0
  local cmd
  for cmd in curl jq date; do
    if ! check_dependency "$cmd"; then
      missing=1
    fi
  done

  if (( missing == 1 )); then
    log_error "Missing required dependencies"
    return 1
  fi

  log_info "✓ All dependencies present"
  return 0
}

#######################################
# Test step 2: Check Alertmanager connectivity
# Globals: ALERTMANAGER_URL, CURL_TIMEOUT_SECONDS, DRY_RUN (read)
# Returns: 0 if /-/healthy answers HTTP 200 (or in dry-run mode), 1 otherwise
#######################################
check_alertmanager() {
  log_step "Checking Alertmanager connectivity..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would check Alertmanager at $ALERTMANAGER_URL"
    return 0
  fi

  local http_code
  # curl still prints the write-out value when the transfer fails, so the
  # non-zero exit status is tolerated here and the code is inspected instead.
  http_code=$(curl -s -o /dev/null -w '%{http_code}' \
    --max-time "$CURL_TIMEOUT_SECONDS" \
    "$ALERTMANAGER_URL/-/healthy") || true
  http_code=${http_code:-000}

  if [[ "$http_code" == "200" ]]; then
    log_info "✓ Alertmanager is healthy"
    return 0
  fi

  log_error "Alertmanager health check failed (HTTP $http_code)"
  return 1
}

#######################################
# Posts the synthetic StemeDBTestAlert to Alertmanager.
# Shared by the send and resolve steps so both use the exact same label set
# (Alertmanager matches the resolve to the firing alert by its labels).
# Globals:   ALERTMANAGER_URL, ALERTMANAGER_ALERTS_PATH, CURL_TIMEOUT_SECONDS (read)
# Arguments: $1 - time field to set: "startsAt" or "endsAt"
#            $2 - timestamp for that field (RFC 3339, UTC)
#            $3 - optional generatorURL; omitted from the payload when empty
# Outputs:   "HTTP <code> <body>" on stdout, for diagnostics
# Returns:   0 if the POST was accepted, 1 otherwise
#######################################
_post_test_alert() {
  local time_field=$1
  local timestamp=$2
  local generator_url=${3:-}

  local payload
  # Built with jq so every value is JSON-escaped rather than spliced into a string.
  payload=$(jq -n \
    --arg field "$time_field" \
    --arg ts "$timestamp" \
    --arg gen "$generator_url" \
    '[
      {
        labels: {
          alertname: "StemeDBTestAlert",
          severity: "critical",
          instance: "test-instance",
          job: "stemedb-api"
        },
        annotations: {
          summary: "End-to-end alerting test",
          description: "This is a test alert from test-alerting.sh. Safe to acknowledge."
        }
      }
      + {($field): $ts}
      + (if $gen == "" then {} else {generatorURL: $gen} end)
    ]') || return 1

  local raw http_code body
  # The HTTP status is appended on its own last line so it can be split from
  # the body below.
  raw=$(curl -s -X POST "${ALERTMANAGER_URL}${ALERTMANAGER_ALERTS_PATH}" \
    --max-time "$CURL_TIMEOUT_SECONDS" \
    -H 'Content-Type: application/json' \
    -d "$payload" \
    -w '\n%{http_code}') || true
  http_code=${raw##*$'\n'}
  body=${raw%$'\n'*}
  if [[ "$raw" != *$'\n'* ]]; then
    # No separator means there was no body at all, only (at most) the status code.
    body=""
  fi

  printf 'HTTP %s %s' "${http_code:-000}" "$body"

  if [[ ! "$http_code" =~ ^2[0-9][0-9]$ ]]; then
    return 1
  fi

  # The v1 API wraps its answer in {"status": "success"}; honour that field
  # when it is present. An empty or differently shaped 2xx body is accepted
  # on the strength of the HTTP status alone.
  local api_status
  api_status=$(jq -r 'if type == "object" then (.status // "") else "" end' \
    <<<"$body" 2> /dev/null) || api_status=""
  if [[ -n "$api_status" && "$api_status" != "success" ]]; then
    return 1
  fi

  return 0
}

#######################################
# Sleeps until MAX_WAIT_SECONDS have passed since the test alert was sent.
# If the send time is unknown, waits for the full window.
# Globals: ALERT_SENT_EPOCH, MAX_WAIT_SECONDS (read)
#######################################
_wait_for_delivery_window() {
  local now elapsed remaining
  now=$(date +%s)
  elapsed=$(( now - ${ALERT_SENT_EPOCH:-$now} ))
  remaining=$(( MAX_WAIT_SECONDS - elapsed ))

  if (( remaining > 0 )); then
    log_info "Waiting ${remaining}s for delivery..."
    sleep "$remaining"
  fi
}

#######################################
# Test step 3: Send test alert to Alertmanager
# Globals: ALERT_SENT_EPOCH, ALERT_SENT_CLOCK (written on success),
#          PROMETHEUS_URL, DRY_RUN (read)
# Returns: 0 if the alert was accepted (or in dry-run mode), 1 otherwise
#######################################
send_test_alert() {
  log_step "Sending test alert to Alertmanager..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would send test alert to Alertmanager"
    return 0
  fi

  local timestamp sent_epoch sent_clock
  timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  sent_epoch=$(date +%s)
  sent_clock=$(date -u +%H:%M:%S)

  local response
  if response=$(_post_test_alert "startsAt" "$timestamp" "$PROMETHEUS_URL/graph"); then
    # Recorded here (not inside the helper, which runs in a subshell) so the
    # later wait/latency steps are measured from the real send time.
    ALERT_SENT_EPOCH=$sent_epoch
    ALERT_SENT_CLOCK=$sent_clock
    log_info "✓ Test alert sent successfully"
    log_info "  Alert will be processed by Alertmanager routing rules"
    return 0
  fi

  log_error "Failed to send test alert"
  log_error "Response: $response"
  return 1
}

#######################################
# Test step 4: Verify PagerDuty incident creation
# Waits out the delivery window, then asks the operator to check PagerDuty.
# The key is only used to decide whether PagerDuty is configured.
# Globals: PAGERDUTY_SERVICE_KEY, MAX_WAIT_SECONDS, DRY_RUN (read)
# Returns: 0 always
#######################################
verify_pagerduty_incident() {
  log_step "Verifying PagerDuty incident creation..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify PagerDuty incident"
    return 0
  fi

  if [[ -z "$PAGERDUTY_SERVICE_KEY" ]]; then
    log_warn "PAGERDUTY_SERVICE_KEY not set, skipping PagerDuty verification"
    log_info "Set it to verify PagerDuty integration"
    return 0
  fi

  _wait_for_delivery_window

  log_info "✓ Please check PagerDuty for incident titled 'StemeDBTestAlert'"
  log_info "  Expected: Incident should appear within $MAX_WAIT_SECONDS seconds"
  log_info "  Remember to acknowledge/resolve the test incident"

  return 0
}

#######################################
# Test step 5: Verify Slack message
# Asks the operator to check the Slack channel; nothing is fetched from Slack.
# Globals: SLACK_WEBHOOK_CRITICAL, DRY_RUN (read)
# Returns: 0 always
#######################################
verify_slack_message() {
  log_step "Verifying Slack message delivery..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify Slack message"
    return 0
  fi

  if [[ -z "$SLACK_WEBHOOK_CRITICAL" ]]; then
    log_warn "SLACK_WEBHOOK_CRITICAL not set, skipping Slack verification"
    log_info "Set it to verify Slack integration"
    return 0
  fi

  log_info "✓ Please check Slack #stemedb-alerts-critical channel"
  log_info "  Expected: Message titled 'StemeDBTestAlert' should appear"
  log_info "  Verify color coding (red) and formatting are correct"

  return 0
}

#######################################
# Test step 6: Measure end-to-end latency
# Reports the time elapsed since the alert was actually sent, after the
# delivery window has passed. The script cannot observe the notification
# arriving, so the operator confirms arrival against this figure.
# Globals: ALERT_SENT_EPOCH, ALERT_SENT_CLOCK, MAX_WAIT_SECONDS, DRY_RUN (read)
# Returns: 0 always (exceeding the target only produces a warning)
#######################################
measure_latency() {
  log_step "Measuring end-to-end latency..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would measure latency"
    return 0
  fi

  local start_time
  # Fall back to "now" when no send time was recorded.
  start_time=${ALERT_SENT_EPOCH:-$(date +%s)}
  log_info "Alert sent at: ${ALERT_SENT_CLOCK:-$(date -u +%H:%M:%S)}"

  _wait_for_delivery_window

  local end_time latency
  end_time=$(date +%s)
  latency=$(( end_time - start_time ))

  log_info "✓ End-to-end latency: ${latency}s"

  if (( latency <= MAX_WAIT_SECONDS )); then
    log_info "  ✓ Latency within target (<${MAX_WAIT_SECONDS}s)"
  else
    log_warn "  Warning: Latency exceeds target (${latency}s > ${MAX_WAIT_SECONDS}s)"
  fi

  return 0
}

#######################################
# Test step 7: Cleanup test alert
# Re-posts the alert with endsAt set so Alertmanager marks it resolved.
# Failure is only a warning.
# Globals: DRY_RUN (read)
# Returns: 0 always
#######################################
cleanup_test_alert() {
  log_step "Cleaning up test alert..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would resolve test alert"
    return 0
  fi

  local timestamp
  timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  # Send resolve signal
  local response
  if response=$(_post_test_alert "endsAt" "$timestamp"); then
    log_info "✓ Test alert resolved in Alertmanager"
  else
    log_warn "Failed to resolve test alert (may auto-resolve)"
    log_warn "Response: $response"
  fi

  log_info "Please manually resolve/acknowledge any test incidents in:"
  log_info "  - PagerDuty (incident titled 'StemeDBTestAlert')"
  log_info "  - Slack (message in #stemedb-alerts-critical)"

  return 0
}

#######################################
# Prints "Configured" if the given value is non-empty, else "Not configured".
# Arguments: $1 - value to inspect
#######################################
_configured_label() {
  if [[ -n "${1:-}" ]]; then
    echo "Configured"
  else
    echo "Not configured"
  fi
}

#######################################
# Generate test report
# Prints the configuration summary and the manual verification checklist.
# Globals: ALERTMANAGER_URL, PROMETHEUS_URL, PAGERDUTY_SERVICE_KEY,
#          SLACK_WEBHOOK_CRITICAL, MAX_WAIT_SECONDS (read)
#######################################
generate_report() {
  log_step "Generating test report..."

  echo ""
  echo "========================================="
  echo "End-to-End Alerting Test Report"
  echo "========================================="
  echo ""
  echo "Test Components:"
  echo "  - Alertmanager URL: $ALERTMANAGER_URL"
  echo "  - Prometheus URL: $PROMETHEUS_URL"
  echo "  - PagerDuty: $(_configured_label "$PAGERDUTY_SERVICE_KEY")"
  echo "  - Slack: $(_configured_label "$SLACK_WEBHOOK_CRITICAL")"
  echo ""
  echo "Manual Verification Checklist:"
  echo "  [ ] PagerDuty incident received within ${MAX_WAIT_SECONDS}s"
  echo "  [ ] Slack message posted to #stemedb-alerts-critical"
  echo "  [ ] Message formatting is correct (color, fields, emoji)"
  echo "  [ ] Escalation policy triggered correctly"
  echo "  [ ] End-to-end latency < ${MAX_WAIT_SECONDS}s"
  echo ""
  echo "Cleanup Tasks:"
  echo "  [ ] Acknowledge/resolve PagerDuty test incident"
  echo "  [ ] Optionally delete Slack test message"
  echo ""
  echo "========================================="
}

#######################################
# Main execution
# Runs the test steps in order. Delivery checks and cleanup only run once the
# alert has actually been sent; an earlier failure skips them instead of
# waiting out the delivery window for an alert that was never sent.
# Arguments: the script's positional parameters
# Returns:   exits 0 if every executed step succeeded, 1 otherwise
#######################################
main() {
  parse_args "$@"

  echo "========================================="
  echo "StemeDB End-to-End Alerting Test"
  echo "========================================="
  echo ""

  if [[ "$DRY_RUN" == true ]]; then
    log_info "Running in DRY RUN mode - no alerts will be sent"
  fi

  local failed=0

  # Run test steps
  if verify_dependencies && check_alertmanager && send_test_alert; then
    verify_pagerduty_incident || failed=1
    verify_slack_message || failed=1
    measure_latency || failed=1
    cleanup_test_alert || failed=1
  else
    failed=1
    log_warn "Skipping delivery verification and cleanup because an earlier step failed"
  fi

  # Generate report
  generate_report

  echo ""
  if (( failed == 0 )); then
    log_info "✓ End-to-end alerting test COMPLETED"
    log_info "  Please complete manual verification checklist above"
    exit 0
  else
    log_error "✗ End-to-end alerting test FAILED"
    log_error "  Fix errors before deploying to production"
    exit 1
  fi
}

# Run main function
main "$@"