stemedb/scripts/test-alerting.sh
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

359 lines
9.3 KiB
Bash
Executable File

#!/bin/bash
#
# test-alerting.sh - end-to-end alerting test for StemeDB monitoring.
#
# Exercises the complete alerting pipeline:
#   Prometheus → Alertmanager → PagerDuty + Slack
#
# Usage:
#   ./test-alerting.sh            # Full end-to-end test
#   ./test-alerting.sh --dry-run  # Show what would be done

# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# ANSI color sequences. They are stored with a literal backslash; the log_*
# helpers expand the escapes when they print.
NC='\033[0m' # No Color (reset)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Endpoints: taken from the environment, falling back to local defaults when
# unset or empty.
: "${ALERTMANAGER_URL:=http://localhost:9093}"
: "${PROMETHEUS_URL:=http://localhost:9090}"

# Receiver credentials: optional; normalised to an empty string when absent so
# later references are safe under `set -u`.
: "${PAGERDUTY_SERVICE_KEY:=}"
: "${SLACK_WEBHOOK_CRITICAL:=}"

# How long (seconds) the verification steps wait for delivery.
MAX_WAIT_SECONDS=30

# Run mode; switched to true by the --dry-run flag.
DRY_RUN=false
# Parse arguments: consume the positional parameters one at a time.
#   --dry-run  switch the script into dry-run mode
#   --help     print usage to stdout and exit 0
#   other      report the unknown argument and exit 1
while [[ $# -gt 0 ]]; do
  case "$1" in
    --help)
      printf '%s\n' \
        "Usage: $0 [--dry-run] [--help]" \
        "" \
        "Options:" \
        " --dry-run Show what would be done without executing" \
        " --help Show this help message" \
        "" \
        "Environment variables:" \
        " ALERTMANAGER_URL URL for Alertmanager API (default: http://localhost:9093)" \
        " PROMETHEUS_URL URL for Prometheus API (default: http://localhost:9090)" \
        " PAGERDUTY_SERVICE_KEY PagerDuty integration key (required for validation)" \
        " SLACK_WEBHOOK_CRITICAL Slack webhook URL (required for validation)"
      exit 0
      ;;
    --dry-run)
      DRY_RUN=true
      ;;
    *)
      printf '%s\n' \
        "Unknown argument: $1" \
        "Use --help for usage information"
      exit 1
      ;;
  esac
  shift
done
# Helper functions

# Print an informational line to stdout: a [INFO] tag wrapped in the
# GREEN/NC color sequences, followed by the message in $1. Backslash escapes
# in the whole line (colors and message) are expanded.
log_info() {
  local message=$1
  printf '%b\n' "${GREEN}[INFO]${NC} ${message}"
}
# Print a step heading to stdout: a [STEP] tag wrapped in the BLUE/NC color
# sequences, followed by the message in $1. Backslash escapes in the whole
# line are expanded.
log_step() {
  local message=$1
  printf '%b\n' "${BLUE}[STEP]${NC} ${message}"
}
# Print a warning line: a [WARN] tag wrapped in the YELLOW/NC color sequences,
# followed by the message in $1. Backslash escapes in the whole line are
# expanded.
#
# Warnings are diagnostics, so they are written to stderr; this keeps them
# visible on the terminal even when stdout is redirected or captured.
log_warn() {
  local message=$1
  printf '%b\n' "${YELLOW}[WARN]${NC} ${message}" >&2
}
# Print an error line: an [ERROR] tag wrapped in the RED/NC color sequences,
# followed by the message in $1. Backslash escapes in the whole line are
# expanded.
#
# Errors are diagnostics, so they are written to stderr; this keeps them
# visible on the terminal even when stdout is redirected or captured.
log_error() {
  local message=$1
  printf '%b\n' "${RED}[ERROR]${NC} ${message}" >&2
}
# Check that the command named in $1 can be resolved by the shell.
# Returns 0 silently when it can; otherwise logs an error and returns 1.
check_dependency() {
  local tool=$1

  if command -v "$tool" > /dev/null 2>&1; then
    return 0
  fi

  log_error "Required command '$tool' not found"
  return 1
}
# Test step 1: Verify dependencies
#
# Checks that curl, jq and date are all available. Every tool is checked even
# after a failure, so one run reports all missing commands.
#
# Returns: 0 when all tools are present, 1 when at least one is missing.
verify_dependencies() {
  log_step "Verifying dependencies..."

  # The loop variable is local so a caller's own `cmd` is not overwritten.
  local cmd
  local missing=0
  for cmd in curl jq date; do
    check_dependency "$cmd" || missing=1
  done

  if (( missing != 0 )); then
    log_error "Missing required dependencies"
    return 1
  fi

  log_info "✓ All dependencies present"
  return 0
}
# Test step 2: Check Alertmanager connectivity
#
# Requests $ALERTMANAGER_URL/-/healthy and treats HTTP 200 as healthy.
# In dry-run mode nothing is contacted.
#
# Globals: DRY_RUN, ALERTMANAGER_URL (read)
# Returns: 0 when healthy (or dry run), 1 otherwise.
check_alertmanager() {
  log_step "Checking Alertmanager connectivity..."
  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY RUN] Would check Alertmanager at $ALERTMANAGER_URL"
    return 0
  fi

  # Bound the request so an unreachable or black-holed endpoint cannot hang
  # the whole test run.
  local -r connect_timeout=5
  local -r max_time=10

  # curl prints only the status code (-o /dev/null -w); on a transport failure
  # it prints 000 and exits non-zero. That exit status is deliberately ignored
  # here because the status code comparison below already covers it.
  local response
  response=$(curl -s -o /dev/null \
    --connect-timeout "$connect_timeout" \
    --max-time "$max_time" \
    -w "%{http_code}" \
    "$ALERTMANAGER_URL/-/healthy" 2>&1) || true

  if [ "$response" = "200" ]; then
    log_info "✓ Alertmanager is healthy"
    return 0
  fi

  log_error "Alertmanager health check failed (HTTP $response)"
  return 1
}
# Test step 3: Send test alert to Alertmanager
#
# Posts a single critical alert named StemeDBTestAlert, with startsAt set to
# the current UTC time, to the Alertmanager v2 API. The v1 alerts endpoint was
# removed in Alertmanager 0.27 (it answers 410), and the v2 endpoint signals
# success with HTTP 200 rather than a {"status":"success"} body, so success is
# decided from the HTTP status code.
# In dry-run mode nothing is sent.
#
# Globals: DRY_RUN, ALERTMANAGER_URL (read)
# Returns: 0 when the alert was accepted (or dry run), 1 otherwise.
send_test_alert() {
  log_step "Sending test alert to Alertmanager..."
  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY RUN] Would send test alert to Alertmanager"
    return 0
  fi

  local timestamp
  timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  local payload='[
  {
    "labels": {
      "alertname": "StemeDBTestAlert",
      "severity": "critical",
      "instance": "test-instance",
      "job": "stemedb-api"
    },
    "annotations": {
      "summary": "End-to-end alerting test",
      "description": "This is a test alert from test-alerting.sh. Safe to acknowledge."
    },
    "startsAt": "'"$timestamp"'",
    "generatorURL": "http://localhost:9090/graph"
  }
]'

  # -w appends a newline plus the HTTP status after the response body, so both
  # can be recovered from one capture. Timeouts keep an unresponsive server
  # from hanging the run. curl's own exit status is ignored on purpose: a
  # transport failure shows up as status 000 below.
  local response
  response=$(curl -s \
    --connect-timeout 5 \
    --max-time 15 \
    -w '\n%{http_code}' \
    -X POST "$ALERTMANAGER_URL/api/v2/alerts" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>&1) || true

  local http_code=${response##*$'\n'}   # text after the last newline
  local body=${response%$'\n'*}         # everything before it

  if [ "$http_code" = "200" ]; then
    log_info "✓ Test alert sent successfully"
    log_info " Alert will be processed by Alertmanager routing rules"
    return 0
  fi

  log_error "Failed to send test alert (HTTP $http_code)"
  log_error "Response: $body"
  return 1
}
# Test step 4: Verify PagerDuty incident creation
#
# This step does not query PagerDuty. When PAGERDUTY_SERVICE_KEY is set it
# waits MAX_WAIT_SECONDS and then asks the operator to check for the incident
# by hand; otherwise it logs that the check is skipped.
#
# Globals: DRY_RUN, PAGERDUTY_SERVICE_KEY, MAX_WAIT_SECONDS (read)
# Returns: always 0.
verify_pagerduty_incident() {
  log_step "Verifying PagerDuty incident creation..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify PagerDuty incident"
  elif [[ -z "$PAGERDUTY_SERVICE_KEY" ]]; then
    log_warn "PAGERDUTY_SERVICE_KEY not set, skipping PagerDuty verification"
    log_info "Set it to verify PagerDuty integration"
  else
    log_info "Waiting ${MAX_WAIT_SECONDS}s for incident to be created..."
    sleep "$MAX_WAIT_SECONDS"
    log_info "✓ Please check PagerDuty for incident titled 'StemeDBTestAlert'"
    log_info " Expected: Incident should appear within $MAX_WAIT_SECONDS seconds"
    log_info " Remember to acknowledge/resolve the test incident"
  fi

  return 0
}
# Test step 5: Verify Slack message
#
# This step does not contact Slack. When SLACK_WEBHOOK_CRITICAL is set it
# prints what the operator should look for in the channel; otherwise it logs
# that the check is skipped.
#
# Globals: DRY_RUN, SLACK_WEBHOOK_CRITICAL (read)
# Returns: always 0.
verify_slack_message() {
  log_step "Verifying Slack message delivery..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify Slack message"
  elif [[ -z "$SLACK_WEBHOOK_CRITICAL" ]]; then
    log_warn "SLACK_WEBHOOK_CRITICAL not set, skipping Slack verification"
    log_info "Set it to verify Slack integration"
  else
    log_info "✓ Please check Slack #stemedb-alerts-critical channel"
    log_info " Expected: Message titled 'StemeDBTestAlert' should appear"
    log_info " Verify color coding (red) and formatting are correct"
  fi

  return 0
}
# Test step 6: Measure end-to-end latency
#
# Records the time, sleeps MAX_WAIT_SECONDS, records the time again and
# reports the difference, comparing it against a fixed 30s target.
#
# NOTE(review): the interval timed here is this function's own sleep, not an
# observed delivery event, and the "Alert sent at" line prints the time this
# step started. The reported figure therefore reflects the wait duration
# only; confirm whether a real delivery measurement is wanted.
#
# Globals: DRY_RUN, MAX_WAIT_SECONDS (read)
# Returns: always 0.
measure_latency() {
  log_step "Measuring end-to-end latency..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would measure latency"
    return 0
  fi

  local began finished elapsed

  began=$(date +%s)
  log_info "Alert sent at: $(date -u +%H:%M:%S)"
  log_info "Waiting ${MAX_WAIT_SECONDS}s for delivery..."
  sleep "$MAX_WAIT_SECONDS"
  finished=$(date +%s)

  elapsed=$((finished - began))
  log_info "✓ End-to-end latency: ${elapsed}s"

  if (( elapsed > 30 )); then
    log_warn " Warning: Latency exceeds target (${elapsed}s > 30s)"
  else
    log_info " ✓ Latency within target (<30s)"
  fi

  return 0
}
# Test step 7: Cleanup test alert
#
# Re-posts the StemeDBTestAlert alert with endsAt set to the current UTC time
# so Alertmanager marks it resolved, then reminds the operator to tidy up the
# downstream receivers. Uses the Alertmanager v2 API: the v1 alerts endpoint
# was removed in Alertmanager 0.27 (it answers 410), and v2 signals success
# with HTTP 200 rather than a {"status":"success"} body.
# In dry-run mode nothing is sent.
#
# Cleanup is best effort: a failed resolve is logged as a warning only.
#
# Globals: DRY_RUN, ALERTMANAGER_URL (read)
# Returns: always 0.
cleanup_test_alert() {
  log_step "Cleaning up test alert..."
  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY RUN] Would resolve test alert"
    return 0
  fi

  local timestamp
  timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  # The label set must match the alert that was sent so it resolves that alert.
  local payload='[
  {
    "labels": {
      "alertname": "StemeDBTestAlert",
      "severity": "critical",
      "instance": "test-instance",
      "job": "stemedb-api"
    },
    "annotations": {
      "summary": "End-to-end alerting test",
      "description": "This is a test alert from test-alerting.sh. Safe to acknowledge."
    },
    "endsAt": "'"$timestamp"'"
  }
]'

  # -w appends a newline plus the HTTP status after the response body, so both
  # can be recovered from one capture. Timeouts keep an unresponsive server
  # from hanging the run. curl's own exit status is ignored on purpose: a
  # transport failure shows up as status 000 below.
  local response
  response=$(curl -s \
    --connect-timeout 5 \
    --max-time 15 \
    -w '\n%{http_code}' \
    -X POST "$ALERTMANAGER_URL/api/v2/alerts" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>&1) || true

  local http_code=${response##*$'\n'}   # text after the last newline
  local body=${response%$'\n'*}         # everything before it

  if [ "$http_code" = "200" ]; then
    log_info "✓ Test alert resolved in Alertmanager"
  else
    log_warn "Failed to resolve test alert (may auto-resolve)"
    log_warn "Response (HTTP $http_code): $body"
  fi

  log_info "Please manually resolve/acknowledge any test incidents in:"
  log_info " - PagerDuty (incident titled 'StemeDBTestAlert')"
  log_info " - Slack (message in #stemedb-alerts-critical)"
  return 0
}
# Generate test report
#
# Prints a summary to stdout: the configured endpoints, whether the PagerDuty
# and Slack credentials are set (their values are never printed), a manual
# verification checklist and the cleanup tasks.
#
# Globals: ALERTMANAGER_URL, PROMETHEUS_URL, PAGERDUTY_SERVICE_KEY,
#          SLACK_WEBHOOK_CRITICAL, MAX_WAIT_SECONDS (read)
generate_report() {
  log_step "Generating test report..."

  # A receiver counts as configured when its credential is non-empty.
  local pagerduty_state="Not configured"
  local slack_state="Not configured"
  [[ -z "$PAGERDUTY_SERVICE_KEY" ]] || pagerduty_state="Configured"
  [[ -z "$SLACK_WEBHOOK_CRITICAL" ]] || slack_state="Configured"

  local -r rule="========================================="

  printf '%s\n' \
    "" \
    "$rule" \
    "End-to-End Alerting Test Report" \
    "$rule" \
    "" \
    "Test Components:" \
    " - Alertmanager URL: $ALERTMANAGER_URL" \
    " - Prometheus URL: $PROMETHEUS_URL" \
    " - PagerDuty: $pagerduty_state" \
    " - Slack: $slack_state" \
    "" \
    "Manual Verification Checklist:" \
    " [ ] PagerDuty incident received within ${MAX_WAIT_SECONDS}s" \
    " [ ] Slack message posted to #stemedb-alerts-critical" \
    " [ ] Message formatting is correct (color, fields, emoji)" \
    " [ ] Escalation policy triggered correctly" \
    " [ ] End-to-end latency < 30s" \
    "" \
    "Cleanup Tasks:" \
    " [ ] Acknowledge/resolve PagerDuty test incident" \
    " [ ] Optionally delete Slack test message" \
    "" \
    "$rule"
}
# Main execution
#
# Prints the banner, runs every test step in order, prints the report and
# exits the script. A failing step does not stop the sequence; it only marks
# the run as failed.
#
# Globals: DRY_RUN (read)
# Exits:   0 when every step returned success, 1 otherwise.
main() {
  local -r banner="========================================="
  printf '%s\n' "$banner" "StemeDB End-to-End Alerting Test" "$banner" ""

  if [[ "$DRY_RUN" == true ]]; then
    log_info "Running in DRY RUN mode - no alerts will be sent"
  fi

  # Run test steps
  local failed=0
  local step
  for step in \
    verify_dependencies \
    check_alertmanager \
    send_test_alert \
    verify_pagerduty_incident \
    verify_slack_message \
    measure_latency \
    cleanup_test_alert; do
    "$step" || failed=1
  done

  # Generate report
  generate_report
  echo ""

  if (( failed != 0 )); then
    log_error "✗ End-to-end alerting test FAILED"
    log_error " Fix errors before deploying to production"
    exit 1
  fi

  log_info "✓ End-to-end alerting test COMPLETED"
  log_info " Please complete manual verification checklist above"
  exit 0
}
# Entry point: run the full test sequence. main ends the script itself via
# exit (0 when every step succeeded, 1 otherwise), so nothing placed after
# this call would run.
main