This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
281 lines
7.4 KiB
Bash
Executable File
281 lines
7.4 KiB
Bash
Executable File
#!/bin/bash
#
# Setup and validate PagerDuty integration for StemeDB alerting
#
# Usage:
#   ./setup-pagerduty.sh                  # Full validation
#   ./setup-pagerduty.sh --validate-only  # Skip test incident creation
#   ./setup-pagerduty.sh --dry-run        # Show what would be done
#
# Environment variables (each defaults to empty when unset):
#   PAGERDUTY_SERVICE_KEY  Integration key from PagerDuty service
#   PAGERDUTY_API_TOKEN    API token for PagerDuty API
#   PAGERDUTY_SERVICE_ID   Service ID (for policy validation)

# Abort on unhandled errors, unset variables and failures inside pipelines.
set -euo pipefail

# Colors for output. These hold backslash escapes rather than raw ESC bytes,
# so they only render when printed by an escape-interpreting command
# (echo -e / printf %b).
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

# Configuration (override with environment variables).
# The ":-" default keeps "set -u" from aborting when a variable is not exported.
PAGERDUTY_SERVICE_KEY="${PAGERDUTY_SERVICE_KEY:-}"
PAGERDUTY_API_TOKEN="${PAGERDUTY_API_TOKEN:-}"
PAGERDUTY_SERVICE_ID="${PAGERDUTY_SERVICE_ID:-}"

# Modes (string flags compared against "true" by the validation steps)
VALIDATE_ONLY=false
DRY_RUN=false

# Parse arguments
#
# parse_args [ARG...]
#   Walks the given command-line words and updates the mode flags:
#     --validate-only  sets VALIDATE_ONLY=true
#     --dry-run        sets DRY_RUN=true
#     --help           prints usage to stdout and exits 0
#   Any other word prints an error to stderr and exits 1.
parse_args() {
  local arg
  # "$@" is expanded once when the loop starts, so no shifting is needed to
  # advance through the arguments.
  for arg in "$@"; do
    case "$arg" in
      --validate-only)
        VALIDATE_ONLY=true
        ;;
      --dry-run)
        DRY_RUN=true
        ;;
      --help)
        printf '%s\n' \
          "Usage: $0 [--validate-only] [--dry-run] [--help]" \
          "" \
          "Options:" \
          " --validate-only Skip test incident creation" \
          " --dry-run Show what would be done without executing" \
          " --help Show this help message" \
          "" \
          "Environment variables:" \
          " PAGERDUTY_SERVICE_KEY Integration key from PagerDuty service" \
          " PAGERDUTY_API_TOKEN API token for PagerDuty API" \
          " PAGERDUTY_SERVICE_ID Service ID (for policy validation)"
        exit 0
        ;;
      *)
        # Usage errors are diagnostics, so they go to stderr.
        printf '%s\n' "Unknown argument: $arg" >&2
        printf '%s\n' "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Helper functions

# log_info MESSAGE
#   Writes MESSAGE to stdout behind a green "[INFO]" tag.
#   Only the color variables are escape-expanded (%b); MESSAGE itself is
#   printed verbatim (%s) so backslashes in logged data are not rewritten.
log_info() {
  printf '%b[INFO]%b %s\n' "${GREEN:-}" "${NC:-}" "${1-}"
}

# log_warn MESSAGE
#   Writes MESSAGE to stdout behind a yellow "[WARN]" tag.
#   Only the color variables are escape-expanded (%b); MESSAGE itself is
#   printed verbatim (%s) so backslashes in logged data are not rewritten.
log_warn() {
  printf '%b[WARN]%b %s\n' "${YELLOW:-}" "${NC:-}" "${1-}"
}

# log_error MESSAGE
#   Writes MESSAGE to stdout behind a red "[ERROR]" tag.
#   Only the color variables are escape-expanded (%b); MESSAGE itself is
#   printed verbatim (%s), which matters because callers pass raw API
#   responses that can contain backslash sequences.
log_error() {
  printf '%b[ERROR]%b %s\n' "${RED:-}" "${NC:-}" "${1-}"
}

# check_dependency COMMAND
#   Returns 0 when COMMAND can be resolved with `command -v`.
#   Otherwise reports the problem through log_error and returns 1.
#   A missing or empty COMMAND is reported the same way instead of tripping
#   "set -u".
check_dependency() {
  local cmd="${1:-}"

  if [[ -z "$cmd" ]]; then
    log_error "check_dependency: no command name given"
    return 1
  fi

  if ! command -v "$cmd" > /dev/null 2>&1; then
    log_error "Required command '$cmd' not found"
    return 1
  fi

  return 0
}

# Validation step 1: Check dependencies
#
# validate_dependencies
#   Runs check_dependency for curl and jq. Every tool is checked (no early
#   exit) so each missing one gets its own error line.
#   Returns 0 when both are available, 1 otherwise.
validate_dependencies() {
  log_info "Checking dependencies..."

  local missing=0
  local tool # kept local so the loop does not leak a global variable
  for tool in curl jq; do
    check_dependency "$tool" || missing=1
  done

  if (( missing )); then
    log_error "Missing required dependencies. Install curl and jq."
    return 1
  fi

  log_info "✓ All dependencies present"
  return 0
}

# Validation step 2: Check service key format
#
# validate_service_key
#   Reads PAGERDUTY_SERVICE_KEY.
#   Returns 1 when the key is empty. Otherwise returns 0: an unexpected shape
#   only produces warnings, and the "validated" line is printed only when no
#   warning was raised. The key itself is never written to the log.
validate_service_key() {
  log_info "Validating PagerDuty service key..."

  local key="${PAGERDUTY_SERVICE_KEY:-}"
  if [[ -z "$key" ]]; then
    log_error "PAGERDUTY_SERVICE_KEY environment variable not set"
    log_info "Set it with: export PAGERDUTY_SERVICE_KEY='your-key-here'"
    return 1
  fi

  local warned=0

  # Service keys are typically 32 characters
  if (( ${#key} != 32 )); then
    log_warn "Service key length (${#key}) is unusual (expected 32)"
    warned=1
  fi

  # NOTE(review): integration keys are assumed to be purely alphanumeric;
  # this mainly catches stray whitespace or quotes from copy/paste.
  if [[ "$key" =~ [^[:alnum:]] ]]; then
    log_warn "Service key contains non-alphanumeric characters"
    warned=1
  fi

  if (( ! warned )); then
    log_info "✓ Service key format validated"
  fi
  return 0
}

# Validation step 3: Test incident creation
#
# test_incident_creation
#   Sends one "trigger" event with severity "info" to the PagerDuty Events
#   API v2 and checks that the reply reports status "success".
#   Reads DRY_RUN, VALIDATE_ONLY and PAGERDUTY_SERVICE_KEY.
#   Returns 0 on success or when the step is skipped (--dry-run /
#   --validate-only), 1 when the key is empty, the payload cannot be built,
#   the request fails, or the reply is not a success.
test_incident_creation() {
  log_info "Testing incident creation..."

  if [[ "${DRY_RUN:-false}" == true ]]; then
    log_info "[DRY RUN] Would send test alert to PagerDuty"
    return 0
  fi

  if [[ "${VALIDATE_ONLY:-false}" == true ]]; then
    log_info "Skipping test incident (--validate-only mode)"
    return 0
  fi

  # Without a routing key the request can only be rejected; do not send it.
  if [[ -z "${PAGERDUTY_SERVICE_KEY:-}" ]]; then
    log_error "Failed to create test incident"
    log_error "PAGERDUTY_SERVICE_KEY is empty; no event was sent"
    return 1
  fi

  local timestamp
  timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  # Let jq build the JSON so every value is escaped correctly. The key is
  # handed over through the environment to keep it out of the process list.
  local payload
  if ! payload=$(
    PD_ROUTING_KEY="$PAGERDUTY_SERVICE_KEY" PD_TIMESTAMP="$timestamp" \
      jq -n '{
        routing_key: env.PD_ROUTING_KEY,
        event_action: "trigger",
        payload: {
          summary: "StemeDB Setup Test - Safe to Acknowledge",
          severity: "info",
          source: "stemedb-setup-script",
          custom_details: {
            test: true,
            timestamp: env.PD_TIMESTAMP
          }
        }
      }'
  ); then
    log_error "Failed to create test incident"
    log_error "Could not build the event payload with jq"
    return 1
  fi

  # Create test incident.
  # -sS: no progress meter (it would otherwise be mixed into the captured
  #      reply and break JSON parsing) while real curl errors still reach
  #      stderr.
  # The body is read from stdin; the Events API identifies the service by
  # the routing_key in the body, so no Authorization header is sent.
  local response
  if ! response=$(curl -sS --max-time 30 -X POST \
    'https://events.pagerduty.com/v2/enqueue' \
    -H 'Content-Type: application/json' \
    --data-binary @- <<< "$payload"); then
    log_error "Failed to create test incident"
    log_error "Response: $response"
    return 1
  fi

  # Check response
  if jq -e '.status == "success"' <<< "$response" > /dev/null 2>&1; then
    local dedup_key
    dedup_key=$(jq -r '.dedup_key' <<< "$response")
    log_info "✓ Test incident created successfully"
    log_info " Incident key: $dedup_key"
    log_info " Please acknowledge this test incident in PagerDuty"
    return 0
  fi

  log_error "Failed to create test incident"
  log_error "Response: $response"
  return 1
}

# Validation step 4: Verify escalation policy
#
# verify_escalation_policy
#   Fetches the service PAGERDUTY_SERVICE_ID from the PagerDuty REST API with
#   PAGERDUTY_API_TOKEN and logs the service name and the summary of its
#   escalation policy.
#   Returns 0 when the service was found or when the step is skipped (token
#   or service ID unset, or --dry-run); returns 1 when the request fails or
#   the reply has no ".service" object.
verify_escalation_policy() {
  log_info "Verifying escalation policy..."

  if [[ -z "${PAGERDUTY_API_TOKEN:-}" ]]; then
    log_warn "PAGERDUTY_API_TOKEN not set, skipping policy validation"
    log_info "Set it with: export PAGERDUTY_API_TOKEN='your-token-here'"
    return 0
  fi

  if [[ -z "${PAGERDUTY_SERVICE_ID:-}" ]]; then
    log_warn "PAGERDUTY_SERVICE_ID not set, skipping policy validation"
    return 0
  fi

  if [[ "${DRY_RUN:-false}" == true ]]; then
    log_info "[DRY RUN] Would verify escalation policy via API"
    return 0
  fi

  # Fetch service details.
  # -sS keeps the progress meter quiet but lets curl print transport errors;
  # --max-time stops an unreachable API from hanging the script.
  local response
  if ! response=$(curl -sS --max-time 30 -X GET \
    "https://api.pagerduty.com/services/${PAGERDUTY_SERVICE_ID}" \
    -H 'Accept: application/vnd.pagerduty+json;version=2' \
    -H "Authorization: Token token=${PAGERDUTY_API_TOKEN}"); then
    log_error "Failed to fetch service details"
    log_error "Response: $response"
    return 1
  fi

  if ! jq -e '.service' <<< "$response" > /dev/null 2>&1; then
    log_error "Failed to fetch service details"
    log_error "Response: $response"
    return 1
  fi

  local service_name
  local escalation_policy
  service_name=$(jq -r '.service.name' <<< "$response")
  escalation_policy=$(jq -r '.service.escalation_policy.summary' <<< "$response")

  log_info "✓ Service found: $service_name"
  log_info " Escalation policy: $escalation_policy"
  return 0
}

# Validation step 5: Check routing configuration
#
# verify_routing
#   Looks through the Alertmanager config for the word "pagerduty" and for
#   lines mentioning critical / warning severities.
#   The file defaults to /etc/prometheus/alertmanager.yml and can be
#   overridden with the ALERTMANAGER_CONFIG environment variable.
#   Purely advisory: findings are logged and the function always returns 0.
verify_routing() {
  log_info "Verifying alert routing configuration..."

  # Check if Alertmanager config exists
  local alertmanager_config="${ALERTMANAGER_CONFIG:-/etc/prometheus/alertmanager.yml}"

  if [[ ! -f "$alertmanager_config" ]]; then
    log_warn "Alertmanager config not found at $alertmanager_config"
    log_info "Ensure PagerDuty routing is configured in Alertmanager"
    return 0
  fi

  # An unreadable file would make every grep below fail and be misreported
  # as "receiver not found", so call it out explicitly.
  if [[ ! -r "$alertmanager_config" ]]; then
    log_warn "Alertmanager config at $alertmanager_config is not readable"
    log_info "Ensure PagerDuty routing is configured in Alertmanager"
    return 0
  fi

  # Verify PagerDuty receiver is configured
  if ! grep -q -- "pagerduty" "$alertmanager_config"; then
    log_warn "PagerDuty receiver not found in Alertmanager config"
    log_info "Add a PagerDuty receiver to $alertmanager_config"
    return 0
  fi

  log_info "✓ PagerDuty receiver configured in Alertmanager"

  # Check for critical/warning routing
  if grep -q -- "severity.*critical" "$alertmanager_config"; then
    log_info " ✓ Critical severity routing found"
  else
    log_warn " Warning: No explicit critical severity routing"
  fi

  if grep -q -- "severity.*warning" "$alertmanager_config"; then
    log_info " ✓ Warning severity routing found"
  else
    log_warn " Warning: No explicit warning severity routing"
  fi

  return 0
}

# Main execution
#
# main
#   Prints a banner, runs the validation steps in order and exits the script
#   with 0 when every executed step succeeded, 1 otherwise.
#   A failing step does not stop the later ones, with one exception: when the
#   dependency check fails, the two steps that call the PagerDuty APIs through
#   curl/jq are skipped, because they cannot produce a meaningful result.
main() {
  echo "========================================="
  echo "StemeDB PagerDuty Setup Validation"
  echo "========================================="
  echo ""

  if [[ "${DRY_RUN:-false}" == true ]]; then
    log_info "Running in DRY RUN mode - no changes will be made"
  fi

  local failed=0
  local deps_ok=true

  # Run validation steps
  if ! validate_dependencies; then
    failed=1
    deps_ok=false
  fi

  validate_service_key || failed=1

  if [[ "$deps_ok" == true ]]; then
    test_incident_creation || failed=1
    verify_escalation_policy || failed=1
  else
    log_warn "Skipping PagerDuty API checks: required tools are missing"
  fi

  verify_routing || failed=1

  echo ""
  echo "========================================="
  if (( failed == 0 )); then
    log_info "✓ PagerDuty validation PASSED"
    echo "========================================="
    exit 0
  fi

  log_error "✗ PagerDuty validation FAILED"
  echo "========================================="
  exit 1
}

# Run main function.
# main exits the script itself with the overall validation status. The
# command-line flags are already handled by the top-level argument parsing
# earlier in the file; "$@" is forwarded only so main sees the original
# arguments should it ever need them.
main "$@"