This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
359 lines
9.3 KiB
Bash
Executable File
359 lines
9.3 KiB
Bash
Executable File
#!/bin/bash
# End-to-end alerting test for StemeDB monitoring
#
# Tests complete alerting pipeline: Prometheus → Alertmanager → PagerDuty + Slack
#
# Usage:
#   ./test-alerting.sh            # Full end-to-end test
#   ./test-alerting.sh --dry-run  # Show what would be done
#
# Environment (all optional):
#   ALERTMANAGER_URL        Alertmanager base URL (default: http://localhost:9093)
#   PROMETHEUS_URL          Prometheus base URL (default: http://localhost:9090)
#   PAGERDUTY_SERVICE_KEY   when empty, the PagerDuty verification step is skipped
#   SLACK_WEBHOOK_CRITICAL  when empty, the Slack verification step is skipped

set -euo pipefail

# Colors for output. The values are single-quoted, so they hold literal
# backslash sequences that only become escape codes when a logging helper
# prints them with escape interpretation enabled.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Configuration (each value may be overridden from the environment)
ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://localhost:9093}"
PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:9090}"
PAGERDUTY_SERVICE_KEY="${PAGERDUTY_SERVICE_KEY:-}"
SLACK_WEBHOOK_CRITICAL="${SLACK_WEBHOOK_CRITICAL:-}"
# Seconds the verification steps wait for notifications to be delivered
readonly MAX_WAIT_SECONDS=30

# Modes
DRY_RUN=false
# Parse arguments
#
# Globals:   DRY_RUN (written)
# Arguments: the script's command-line arguments
# Outputs:   usage text on stdout for --help; error text on stderr for an
#            unrecognised argument
# Returns:   0 when every argument was recognised. Exits the script with 0
#            after printing --help and with 1 on the first unknown argument.
parse_args() {
    local arg
    for arg in "$@"; do
        case "$arg" in
            --dry-run)
                # No `shift`: the loop iterates over a list that was expanded
                # up front, so shifting has no effect on the iteration.
                DRY_RUN=true
                ;;
            --help)
                echo "Usage: $0 [--dry-run] [--help]"
                echo ""
                echo "Options:"
                echo " --dry-run Show what would be done without executing"
                echo " --help Show this help message"
                echo ""
                echo "Environment variables:"
                echo " ALERTMANAGER_URL URL for Alertmanager API (default: http://localhost:9093)"
                echo " PROMETHEUS_URL URL for Prometheus API (default: http://localhost:9090)"
                echo " PAGERDUTY_SERVICE_KEY PagerDuty integration key (required for validation)"
                echo " SLACK_WEBHOOK_CRITICAL Slack webhook URL (required for validation)"
                exit 0
                ;;
            *)
                echo "Unknown argument: $arg" >&2
                echo "Use --help for usage information" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"
# Helper functions

# Print an informational message.
#
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
# Outputs:   "[INFO] <message>" on stdout, with the tag wrapped in GREEN/NC
log_info() {
    # %b expands the escape sequences stored in the color variables; %s prints
    # the message verbatim, so backslashes inside it are not interpreted.
    printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}
# Print a step heading.
#
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
# Outputs:   "[STEP] <message>" on stdout, with the tag wrapped in BLUE/NC
log_step() {
    # %b expands the color escapes; %s keeps the message text literal.
    printf '%b[STEP]%b %s\n' "${BLUE}" "${NC}" "${1-}"
}
# Print a warning.
#
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
# Outputs:   "[WARN] <message>" on stderr, with the tag wrapped in YELLOW/NC
log_warn() {
    # Diagnostics go to stderr so they stay visible when stdout is redirected.
    # %b expands the color escapes; %s keeps the message text literal.
    printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "${1-}" >&2
}
# Print an error message.
#
# Globals:   RED, NC (read)
# Arguments: $1 - message text
# Outputs:   "[ERROR] <message>" on stderr, with the tag wrapped in RED/NC
log_error() {
    # Diagnostics go to stderr so they stay visible when stdout is redirected.
    # %s prints the message verbatim: callers pass raw server responses, whose
    # backslashes must not be interpreted as escape sequences.
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}" >&2
}
# Check that a command can be resolved by the shell.
#
# Arguments: $1 - command name
# Outputs:   an error via log_error when the command is not found
# Returns:   0 if `command -v` resolves the name, 1 otherwise
check_dependency() {
    local cmd=$1

    if command -v "$cmd" >/dev/null 2>&1; then
        return 0
    fi

    log_error "Required command '$cmd' not found"
    return 1
}
# Test step 1: Verify dependencies
#
# Checks every required command (curl, jq, date) so that all missing tools
# are reported in one run rather than one at a time.
# Returns: 0 if all are present, 1 if at least one is missing
verify_dependencies() {
    log_step "Verifying dependencies..."

    # Both locals: the loop variable must not leak into the caller's scope.
    local cmd
    local missing=0
    for cmd in curl jq date; do
        check_dependency "$cmd" || missing=1
    done

    if (( missing )); then
        log_error "Missing required dependencies"
        return 1
    fi

    log_info "✓ All dependencies present"
    return 0
}
# Test step 2: Check Alertmanager connectivity
#
# Globals:  DRY_RUN, ALERTMANAGER_URL (read)
# Returns:  0 if GET <ALERTMANAGER_URL>/-/healthy answers HTTP 200 (or in
#           dry-run mode, where no request is made), 1 otherwise
check_alertmanager() {
    log_step "Checking Alertmanager connectivity..."

    if [[ "$DRY_RUN" == true ]]; then
        log_info "[DRY RUN] Would check Alertmanager at $ALERTMANAGER_URL"
        return 0
    fi

    # -o /dev/null with -w leaves only the status code on stdout; curl prints
    # 000 when no HTTP response was received. The timeouts stop an
    # unresponsive host from hanging the test. curl's own exit status is
    # deliberately ignored: the decision is made from the status code below.
    local http_code
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
        --connect-timeout 5 --max-time 10 \
        "$ALERTMANAGER_URL/-/healthy" 2>&1) || true

    if [[ "$http_code" == "200" ]]; then
        log_info "✓ Alertmanager is healthy"
        return 0
    fi

    log_error "Alertmanager health check failed (HTTP $http_code)"
    return 1
}
# Test step 3: Send test alert to Alertmanager
#
# Posts a synthetic critical alert named StemeDBTestAlert.
#
# Uses the v2 API: the v1 alerts endpoint was removed in Alertmanager 0.27.
# v2 answers a successful POST with HTTP 200 and an empty body, so success is
# decided from the HTTP status code rather than from a JSON "status" field.
#
# Globals:  DRY_RUN, ALERTMANAGER_URL, PROMETHEUS_URL (read)
# Returns:  0 if Alertmanager accepted the alert (or in dry-run mode, where
#           no request is made), 1 otherwise
send_test_alert() {
    log_step "Sending test alert to Alertmanager..."

    if [[ "$DRY_RUN" == true ]]; then
        log_info "[DRY RUN] Would send test alert to Alertmanager"
        return 0
    fi

    local started_at
    started_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)

    # generatorURL points at the configured Prometheus; the fallback matches
    # the script's default when PROMETHEUS_URL is not set.
    local prometheus_url="${PROMETHEUS_URL:-http://localhost:9090}"

    # The format string contains no '%' or backslash other than the two %s
    # placeholders, so printf emits the JSON unchanged.
    local payload
    payload=$(printf '[
  {
    "labels": {
      "alertname": "StemeDBTestAlert",
      "severity": "critical",
      "instance": "test-instance",
      "job": "stemedb-api"
    },
    "annotations": {
      "summary": "End-to-end alerting test",
      "description": "This is a test alert from test-alerting.sh. Safe to acknowledge."
    },
    "startsAt": "%s",
    "generatorURL": "%s/graph"
  }
]' "$started_at" "$prometheus_url")

    # -w appends the status code on its own final line so body and code can
    # be split apart below. -S keeps curl's error text (captured via 2>&1)
    # for the failure message. curl's exit status is ignored on purpose: a
    # transport failure shows up as a non-2xx code.
    local reply
    reply=$(curl -sS -X POST "$ALERTMANAGER_URL/api/v2/alerts" \
        --connect-timeout 5 --max-time 10 \
        -H 'Content-Type: application/json' \
        -d "$payload" \
        -w '\n%{http_code}' 2>&1) || true

    local http_code=${reply##*$'\n'}
    local body=${reply%$'\n'*}

    if [[ "$http_code" =~ ^2[0-9][0-9]$ ]]; then
        log_info "✓ Test alert sent successfully"
        log_info " Alert will be processed by Alertmanager routing rules"
        return 0
    fi

    log_error "Failed to send test alert"
    log_error "HTTP status: $http_code"
    log_error "Response: $body"
    return 1
}
# Test step 4: Verify PagerDuty incident creation
#
# This step does not query PagerDuty. It waits MAX_WAIT_SECONDS and then
# asks the operator to confirm the incident by hand. PAGERDUTY_SERVICE_KEY is
# only used to decide whether the step runs at all.
#
# Globals:  DRY_RUN, PAGERDUTY_SERVICE_KEY, MAX_WAIT_SECONDS (read)
# Returns:  0 in every case
verify_pagerduty_incident() {
    log_step "Verifying PagerDuty incident creation..."

    if [[ "$DRY_RUN" == true ]]; then
        log_info "[DRY RUN] Would verify PagerDuty incident"
        return 0
    fi

    if [[ -z "$PAGERDUTY_SERVICE_KEY" ]]; then
        log_warn "PAGERDUTY_SERVICE_KEY not set, skipping PagerDuty verification"
        log_info "Set it to verify PagerDuty integration"
        return 0
    fi

    log_info "Waiting ${MAX_WAIT_SECONDS}s for incident to be created..."
    sleep "$MAX_WAIT_SECONDS"

    log_info "✓ Please check PagerDuty for incident titled 'StemeDBTestAlert'"
    log_info " Expected: Incident should appear within $MAX_WAIT_SECONDS seconds"
    log_info " Remember to acknowledge/resolve the test incident"

    return 0
}
# Test step 5: Verify Slack message
#
# This step does not call Slack. It only tells the operator what to look for
# in the channel. SLACK_WEBHOOK_CRITICAL is only used to decide whether the
# step runs at all.
#
# Globals:  DRY_RUN, SLACK_WEBHOOK_CRITICAL (read)
# Returns:  0 in every case
verify_slack_message() {
    log_step "Verifying Slack message delivery..."

    if [[ "$DRY_RUN" == true ]]; then
        log_info "[DRY RUN] Would verify Slack message"
        return 0
    fi

    if [[ -z "$SLACK_WEBHOOK_CRITICAL" ]]; then
        log_warn "SLACK_WEBHOOK_CRITICAL not set, skipping Slack verification"
        log_info "Set it to verify Slack integration"
        return 0
    fi

    log_info "✓ Please check Slack #stemedb-alerts-critical channel"
    log_info " Expected: Message titled 'StemeDBTestAlert' should appear"
    log_info " Verify color coding (red) and formatting are correct"

    return 0
}
# Test step 6: Measure end-to-end latency
#
# NOTE(review): the reported figure is the wall-clock time spent in this
# function's own MAX_WAIT_SECONDS sleep. No delivery event is observed, so it
# does not measure how long a notification actually took to arrive.
#
# Globals:  DRY_RUN, MAX_WAIT_SECONDS (read)
# Returns:  0 in every case; exceeding the 30 s target only logs a warning
measure_latency() {
    log_step "Measuring end-to-end latency..."

    if [[ "$DRY_RUN" == true ]]; then
        log_info "[DRY RUN] Would measure latency"
        return 0
    fi

    local start_time
    start_time=$(date +%s)

    log_info "Alert sent at: $(date -u +%H:%M:%S)"
    log_info "Waiting ${MAX_WAIT_SECONDS}s for delivery..."

    sleep "$MAX_WAIT_SECONDS"

    local end_time
    end_time=$(date +%s)
    local latency=$(( end_time - start_time ))

    log_info "✓ End-to-end latency: ${latency}s"

    if (( latency <= 30 )); then
        log_info " ✓ Latency within target (<30s)"
    else
        log_warn " Warning: Latency exceeds target (${latency}s > 30s)"
    fi

    return 0
}
# Test step 7: Cleanup test alert
#
# Re-posts the StemeDBTestAlert label set with endsAt set to the current
# time, which marks the alert as resolved in Alertmanager.
#
# Uses the v2 API: the v1 alerts endpoint was removed in Alertmanager 0.27.
# v2 answers a successful POST with HTTP 200 and an empty body, so success is
# decided from the HTTP status code rather than from a JSON "status" field.
#
# Globals:  DRY_RUN, ALERTMANAGER_URL (read)
# Returns:  0 in every case: cleanup is best-effort and a failed resolve is
#           only logged as a warning
cleanup_test_alert() {
    log_step "Cleaning up test alert..."

    if [[ "$DRY_RUN" == true ]]; then
        log_info "[DRY RUN] Would resolve test alert"
        return 0
    fi

    local ended_at
    ended_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)

    # The labels must be identical to the ones used when the alert was sent;
    # they are what identifies the alert being resolved. The format string
    # contains no '%' or backslash other than the single %s placeholder.
    local payload
    payload=$(printf '[
  {
    "labels": {
      "alertname": "StemeDBTestAlert",
      "severity": "critical",
      "instance": "test-instance",
      "job": "stemedb-api"
    },
    "annotations": {
      "summary": "End-to-end alerting test",
      "description": "This is a test alert from test-alerting.sh. Safe to acknowledge."
    },
    "endsAt": "%s"
  }
]' "$ended_at")

    # Send resolve signal. -w appends the status code on its own final line
    # so body and code can be split apart below. curl's exit status is
    # ignored on purpose: a transport failure shows up as a non-2xx code.
    local reply
    reply=$(curl -sS -X POST "$ALERTMANAGER_URL/api/v2/alerts" \
        --connect-timeout 5 --max-time 10 \
        -H 'Content-Type: application/json' \
        -d "$payload" \
        -w '\n%{http_code}' 2>&1) || true

    local http_code=${reply##*$'\n'}
    local body=${reply%$'\n'*}

    if [[ "$http_code" =~ ^2[0-9][0-9]$ ]]; then
        log_info "✓ Test alert resolved in Alertmanager"
    else
        log_warn "Failed to resolve test alert (may auto-resolve)"
        log_warn "HTTP status: $http_code"
        log_warn "Response: $body"
    fi

    log_info "Please manually resolve/acknowledge any test incidents in:"
    log_info " - PagerDuty (incident titled 'StemeDBTestAlert')"
    log_info " - Slack (message in #stemedb-alerts-critical)"

    return 0
}
# Generate test report
#
# Prints the configuration that was used, a manual verification checklist,
# and the cleanup tasks left to the operator.
#
# Globals:  ALERTMANAGER_URL, PROMETHEUS_URL, PAGERDUTY_SERVICE_KEY,
#           SLACK_WEBHOOK_CRITICAL, MAX_WAIT_SECONDS (read)
# Outputs:  the report on stdout
generate_report() {
    log_step "Generating test report..."

    # Plain if-statements instead of `test && echo A || echo B`, which would
    # also run the fallback if the first echo failed.
    local pagerduty_state="Not configured"
    if [[ -n "$PAGERDUTY_SERVICE_KEY" ]]; then
        pagerduty_state="Configured"
    fi

    local slack_state="Not configured"
    if [[ -n "$SLACK_WEBHOOK_CRITICAL" ]]; then
        slack_state="Configured"
    fi

    printf '%s\n' \
        "" \
        "=========================================" \
        "End-to-End Alerting Test Report" \
        "=========================================" \
        "" \
        "Test Components:" \
        " - Alertmanager URL: $ALERTMANAGER_URL" \
        " - Prometheus URL: $PROMETHEUS_URL" \
        " - PagerDuty: $pagerduty_state" \
        " - Slack: $slack_state" \
        "" \
        "Manual Verification Checklist:" \
        " [ ] PagerDuty incident received within ${MAX_WAIT_SECONDS}s" \
        " [ ] Slack message posted to #stemedb-alerts-critical" \
        " [ ] Message formatting is correct (color, fields, emoji)" \
        " [ ] Escalation policy triggered correctly" \
        " [ ] End-to-end latency < 30s" \
        "" \
        "Cleanup Tasks:" \
        " [ ] Acknowledge/resolve PagerDuty test incident" \
        " [ ] Optionally delete Slack test message" \
        "" \
        "========================================="
}
# Main execution
#
# Runs the test steps in order, prints the report, and exits the script.
#
# The delivery checks (PagerDuty, Slack, latency) and the cleanup only run
# when the dependencies are present, Alertmanager is healthy and the test
# alert was accepted. Without that, there is no alert to look for, and the
# waits inside those steps would only delay the failure report.
#
# Globals:  DRY_RUN (read)
# Exits:    0 if every executed step succeeded, 1 otherwise
main() {
    echo "========================================="
    echo "StemeDB End-to-End Alerting Test"
    echo "========================================="
    echo ""

    if [[ "$DRY_RUN" == true ]]; then
        log_info "Running in DRY RUN mode - no alerts will be sent"
    fi

    local failed=0

    # Run test steps
    if verify_dependencies && check_alertmanager && send_test_alert; then
        # Each check runs even if a sibling check failed, so that one run
        # reports every problem.
        verify_pagerduty_incident || failed=1
        verify_slack_message || failed=1
        measure_latency || failed=1
        cleanup_test_alert || failed=1
    else
        failed=1
        log_warn "Skipping delivery verification because an earlier step failed"
    fi

    # Generate report
    generate_report

    echo ""
    if (( failed == 0 )); then
        log_info "✓ End-to-end alerting test COMPLETED"
        log_info " Please complete manual verification checklist above"
        exit 0
    fi

    log_error "✗ End-to-end alerting test FAILED"
    log_error " Fix errors before deploying to production"
    exit 1
}
# Run main function
main "$@"