stemedb/scripts/setup-pagerduty.sh
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

281 lines
7.4 KiB
Bash
Executable File

#!/bin/bash
# Validate the PagerDuty integration used for StemeDB alerting.
#
# Invocation:
#   ./setup-pagerduty.sh                   run every validation step
#   ./setup-pagerduty.sh --validate-only   skip test incident creation
#   ./setup-pagerduty.sh --dry-run         show what would be done

# Strict mode: stop on errors, on unset variables, and on failures anywhere
# in a pipeline.
set -o errexit -o nounset -o pipefail

# ANSI colour sequences, kept as literal backslash escapes; they are expanded
# at print time by the logging helpers.
NC='\033[0m' # reset / no colour
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

# Settings taken from the environment; each falls back to an empty string so
# that later references are safe under `nounset`.
: "${PAGERDUTY_SERVICE_KEY:=}"
: "${PAGERDUTY_API_TOKEN:=}"
: "${PAGERDUTY_SERVICE_ID:=}"

# Run modes, switched on by the command-line flags.
DRY_RUN=false
VALIDATE_ONLY=false
# Parse arguments
#
# parse_args ARG...
#   Sets the global mode flags from the command-line flags:
#     --validate-only  -> VALIDATE_ONLY=true
#     --dry-run        -> DRY_RUN=true
#     --help           -> print usage on stdout and exit 0
#   Any other argument prints a diagnostic on stderr and exits 1.
parse_args() {
  local arg
  # Iterating over "$@" already visits every argument once, so no `shift`
  # is needed inside the loop.
  for arg in "$@"; do
    case "$arg" in
      --validate-only)
        VALIDATE_ONLY=true
        ;;
      --dry-run)
        DRY_RUN=true
        ;;
      --help)
        printf '%s\n' \
          "Usage: $0 [--validate-only] [--dry-run] [--help]" \
          "" \
          "Options:" \
          " --validate-only Skip test incident creation" \
          " --dry-run Show what would be done without executing" \
          " --help Show this help message" \
          "" \
          "Environment variables:" \
          " PAGERDUTY_SERVICE_KEY Integration key from PagerDuty service" \
          " PAGERDUTY_API_TOKEN API token for PagerDuty API" \
          " PAGERDUTY_SERVICE_ID Service ID (for policy validation)"
        exit 0
        ;;
      *)
        # Usage errors are diagnostics, so they belong on stderr.
        printf '%s\n' \
          "Unknown argument: $arg" \
          "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}
parse_args "$@"
# Helper functions
# log_info MESSAGE
#   Print MESSAGE on stdout behind a green "[INFO]" tag.
#   Only the colour variables are escape-expanded (%b); the message itself is
#   printed verbatim (%s) so backslashes in paths or JSON are not rewritten.
log_info() {
  printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "$1"
}
# log_warn MESSAGE
#   Print MESSAGE on stdout behind a yellow "[WARN]" tag.
#   Only the colour variables are escape-expanded (%b); the message itself is
#   printed verbatim (%s) so backslashes in it are not rewritten.
log_warn() {
  printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$1"
}
# log_error MESSAGE
#   Print MESSAGE on stdout behind a red "[ERROR]" tag.
#   Only the colour variables are escape-expanded (%b); the message itself is
#   printed verbatim (%s). This matters here because callers pass raw API
#   responses, whose JSON escapes (\n, \", \\) must be shown unmodified.
log_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1"
}
# check_dependency COMMAND
#   Succeeds when COMMAND resolves via `command -v`; otherwise logs an error
#   naming the missing command and returns 1.
check_dependency() {
  local tool="$1"
  command -v "$tool" > /dev/null 2>&1 && return 0
  log_error "Required command '$tool' not found"
  return 1
}
# Validation step 1: Check dependencies
#
# Probes for curl and jq with check_dependency. Both are always probed, so
# every missing tool is reported before the summary error.
# Returns 0 when both are available, 1 otherwise.
validate_dependencies() {
  log_info "Checking dependencies..."

  local tool
  local all_present=true
  for tool in curl jq; do
    check_dependency "$tool" || all_present=false
  done

  if [ "$all_present" = false ]; then
    log_error "Missing required dependencies. Install curl and jq."
    return 1
  fi

  log_info "✓ All dependencies present"
  return 0
}
# Validation step 2: Check service key format
#
# Reads PAGERDUTY_SERVICE_KEY. An empty key is an error (returns 1). A key
# whose length is not 32 only produces a warning; the step still returns 0.
validate_service_key() {
  log_info "Validating PagerDuty service key..."

  local key_length=${#PAGERDUTY_SERVICE_KEY}

  if (( key_length == 0 )); then
    log_error "PAGERDUTY_SERVICE_KEY environment variable not set"
    log_info "Set it with: export PAGERDUTY_SERVICE_KEY='your-key-here'"
    return 1
  fi

  # Service keys are typically 32 characters; other lengths are only flagged.
  if (( key_length != 32 )); then
    log_warn "Service key length (${key_length}) is unusual (expected 32)"
  fi

  log_info "✓ Service key format validated"
  return 0
}
# Validation step 3: Test incident creation
#
# Sends one "trigger" event with severity "info" to the PagerDuty Events
# endpoint, using PAGERDUTY_SERVICE_KEY as the routing key, and checks that
# the reply reports status "success".
#
# Globals read: DRY_RUN, VALIDATE_ONLY, PAGERDUTY_SERVICE_KEY
# Returns: 0 when the event was accepted, or when the step is skipped because
#          of --dry-run / --validate-only; 1 when the key is empty, the
#          payload cannot be built, or the reply is not a success.
test_incident_creation() {
  log_info "Testing incident creation..."

  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY RUN] Would send test alert to PagerDuty"
    return 0
  fi
  if [ "$VALIDATE_ONLY" = true ]; then
    log_info "Skipping test incident (--validate-only mode)"
    return 0
  fi

  # Without a routing key the request cannot succeed, so skip the network
  # round-trip and report the cause directly.
  if [ -z "$PAGERDUTY_SERVICE_KEY" ]; then
    log_error "Cannot create test incident: PAGERDUTY_SERVICE_KEY is not set"
    return 1
  fi

  # Build the body with jq so that any quote or backslash in the key is
  # JSON-escaped instead of corrupting a hand-spliced string.
  local payload
  if ! payload=$(jq -cn \
    --arg routing_key "$PAGERDUTY_SERVICE_KEY" \
    --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    '{
      routing_key: $routing_key,
      event_action: "trigger",
      payload: {
        summary: "StemeDB Setup Test - Safe to Acknowledge",
        severity: "info",
        source: "stemedb-setup-script",
        custom_details: {test: true, timestamp: $timestamp}
      }
    }'); then
    log_error "Failed to build test event payload"
    return 1
  fi

  # Create test incident.
  # -s suppresses the progress meter: with 2>&1 it would otherwise be captured
  #    into $response and make the JSON unparseable.
  # -S keeps real transfer errors, so they still appear in the logged response.
  # The body is passed on stdin, which keeps the routing key out of the
  # process argument list; the key travels only in the JSON body.
  local response
  response=$(curl -sS -X POST https://events.pagerduty.com/v2/enqueue \
    -H 'Content-Type: application/json' \
    --data-binary @- <<<"$payload" 2>&1)

  # Check response
  if printf '%s\n' "$response" | jq -e '.status == "success"' > /dev/null 2>&1; then
    local dedup_key
    dedup_key=$(printf '%s\n' "$response" | jq -r '.dedup_key')
    log_info "✓ Test incident created successfully"
    log_info " Incident key: $dedup_key"
    log_info " Please acknowledge this test incident in PagerDuty"
    return 0
  fi

  log_error "Failed to create test incident"
  log_error "Response: $response"
  return 1
}
# Validation step 4: Verify escalation policy
#
# Fetches the service identified by PAGERDUTY_SERVICE_ID from the PagerDuty
# API, authenticating with PAGERDUTY_API_TOKEN, and logs the service name and
# its escalation policy summary.
# The step is skipped (returns 0) when the token or service ID is empty, or
# in dry-run mode. Returns 1 when the reply has no `.service` object.
verify_escalation_policy() {
  log_info "Verifying escalation policy..."

  if [[ -z "$PAGERDUTY_API_TOKEN" ]]; then
    log_warn "PAGERDUTY_API_TOKEN not set, skipping policy validation"
    log_info "Set it with: export PAGERDUTY_API_TOKEN='your-token-here'"
    return 0
  fi

  if [[ -z "$PAGERDUTY_SERVICE_ID" ]]; then
    log_warn "PAGERDUTY_SERVICE_ID not set, skipping policy validation"
    return 0
  fi

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify escalation policy via API"
    return 0
  fi

  # Fetch service details
  local service_url="https://api.pagerduty.com/services/$PAGERDUTY_SERVICE_ID"
  local reply
  reply=$(curl -s -X GET "$service_url" \
    -H 'Accept: application/vnd.pagerduty+json;version=2' \
    -H "Authorization: Token token=$PAGERDUTY_API_TOKEN")

  # Anything without a `.service` object counts as a failed lookup.
  if ! jq -e '.service' <<<"$reply" > /dev/null 2>&1; then
    log_error "Failed to fetch service details"
    log_error "Response: $reply"
    return 1
  fi

  local name policy
  name=$(jq -r '.service.name' <<<"$reply")
  policy=$(jq -r '.service.escalation_policy.summary' <<<"$reply")
  log_info "✓ Service found: $name"
  log_info " Escalation policy: $policy"
  return 0
}
# Validation step 5: Check routing configuration
#
# verify_routing [CONFIG_PATH]
#   Looks through an Alertmanager config file for a "pagerduty" receiver and
#   for critical / warning severity routing, using plain text matching.
#
#   The file to inspect is chosen in this order:
#     1. the optional CONFIG_PATH argument,
#     2. the ALERTMANAGER_CONFIG environment variable,
#     3. /etc/prometheus/alertmanager.yml.
#
#   This step is advisory: it only logs findings and always returns 0.
verify_routing() {
  log_info "Verifying alert routing configuration..."

  # Check if Alertmanager config exists
  local alertmanager_config="${1:-${ALERTMANAGER_CONFIG:-/etc/prometheus/alertmanager.yml}}"
  if [ ! -f "$alertmanager_config" ]; then
    log_warn "Alertmanager config not found at $alertmanager_config"
    log_info "Ensure PagerDuty routing is configured in Alertmanager"
    return 0
  fi

  # Verify PagerDuty receiver is configured.
  # `-e PATTERN -- FILE` keeps a path beginning with '-' from being read as
  # a grep option.
  if ! grep -q -e "pagerduty" -- "$alertmanager_config"; then
    log_warn "PagerDuty receiver not found in Alertmanager config"
    log_info "Add a PagerDuty receiver to $alertmanager_config"
    return 0
  fi
  log_info "✓ PagerDuty receiver configured in Alertmanager"

  # Check for critical/warning routing
  if grep -q -e "severity.*critical" -- "$alertmanager_config"; then
    log_info " ✓ Critical severity routing found"
  else
    log_warn " Warning: No explicit critical severity routing"
  fi
  if grep -q -e "severity.*warning" -- "$alertmanager_config"; then
    log_info " ✓ Warning severity routing found"
  else
    log_warn " Warning: No explicit warning severity routing"
  fi
  return 0
}
# Main execution
#
# Prints a banner, runs the five validation steps in order, and exits the
# script with 0 when all of them succeeded or 1 when any failed. A failing
# step does not stop the later steps from running.
main() {
  local rule="========================================="

  printf '%s\n' "$rule" "StemeDB PagerDuty Setup Validation" "$rule" ""

  if [ "$DRY_RUN" = true ]; then
    log_info "Running in DRY RUN mode - no changes will be made"
  fi

  # Run validation steps; remember whether any of them failed.
  local step
  local status=0
  for step in \
    validate_dependencies \
    validate_service_key \
    test_incident_creation \
    verify_escalation_policy \
    verify_routing; do
    "$step" || status=1
  done

  printf '\n%s\n' "$rule"

  if (( status != 0 )); then
    log_error "✗ PagerDuty validation FAILED"
    echo "$rule"
    exit 1
  fi

  log_info "✓ PagerDuty validation PASSED"
  echo "$rule"
  exit 0
}
# Run main function
# No arguments are forwarded: the command-line flags are parsed earlier at
# top level, and main only reads the resulting mode variable (DRY_RUN).
# main ends the script itself via `exit`.
main