This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
372 lines
10 KiB
Bash
Executable File
372 lines
10 KiB
Bash
Executable File
#!/bin/bash
#
# Setup and validate Slack integration for StemeDB alerting
#
# Usage:
#   ./setup-slack.sh                  # Full validation
#   ./setup-slack.sh --validate-only  # Skip test message posting
#   ./setup-slack.sh --dry-run        # Show what would be done

# Abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Colors for output. Stored as literal backslash sequences; they are turned
# into real escape characters only when a message is printed.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

# Configuration (override with environment variables).
# Webhook URLs default to empty; channel names default to the StemeDB channels.
SLACK_WEBHOOK_CRITICAL="${SLACK_WEBHOOK_CRITICAL:-}"
SLACK_WEBHOOK_WARNING="${SLACK_WEBHOOK_WARNING:-}"
SLACK_WEBHOOK_INFO="${SLACK_WEBHOOK_INFO:-}"
SLACK_CHANNEL_CRITICAL="${SLACK_CHANNEL_CRITICAL:-#stemedb-alerts-critical}"
SLACK_CHANNEL_WARNING="${SLACK_CHANNEL_WARNING:-#stemedb-alerts-warning}"
SLACK_CHANNEL_INFO="${SLACK_CHANNEL_INFO:-#stemedb-alerts-info}"

# Modes (switched on by --validate-only / --dry-run)
VALIDATE_ONLY=false
DRY_RUN=false

# Parse arguments.
# The for-loop walks a snapshot of "$@", so no `shift` is needed to advance.
for arg in "$@"; do
  case "$arg" in
    --validate-only)
      VALIDATE_ONLY=true
      ;;
    --dry-run)
      DRY_RUN=true
      ;;
    --help)
      echo "Usage: $0 [--validate-only] [--dry-run] [--help]"
      echo ""
      echo "Options:"
      echo " --validate-only Skip test message posting"
      echo " --dry-run Show what would be done without executing"
      echo " --help Show this help message"
      echo ""
      echo "Environment variables:"
      echo " SLACK_WEBHOOK_CRITICAL Webhook URL for critical alerts"
      echo " SLACK_WEBHOOK_WARNING Webhook URL for warning alerts"
      echo " SLACK_WEBHOOK_INFO Webhook URL for info alerts"
      echo " SLACK_CHANNEL_CRITICAL Channel name (default: #stemedb-alerts-critical)"
      echo " SLACK_CHANNEL_WARNING Channel name (default: #stemedb-alerts-warning)"
      echo " SLACK_CHANNEL_INFO Channel name (default: #stemedb-alerts-info)"
      exit 0
      ;;
    *)
      # Usage errors are diagnostics, so they go to stderr.
      echo "Unknown argument: $arg" >&2
      echo "Use --help for usage information" >&2
      exit 1
      ;;
  esac
done

# Helper functions

#######################################
# Print an informational line tagged [INFO] to stdout.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
#######################################
log_info() {
  # %b expands the colour escapes; %s prints the message verbatim so that
  # backslashes inside it (e.g. in a captured response) are not interpreted.
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}

#######################################
# Print a warning line tagged [WARN] to stdout.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
#######################################
log_warn() {
  # %b expands the colour escapes; %s prints the message verbatim so that
  # backslashes inside it are not interpreted.
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"
}

#######################################
# Print an error line tagged [ERROR] to stdout.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
#######################################
log_error() {
  # %b expands the colour escapes; %s prints the message verbatim so that
  # backslashes inside it (e.g. in a captured response) are not interpreted.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"
}

#######################################
# Check that a command name can be resolved by the shell (`command -v`).
# Arguments: $1 - command name
# Outputs:   an error line via log_error when the command is not found
# Returns:   0 if found, 1 otherwise
#######################################
check_dependency() {
  if ! command -v "$1" > /dev/null 2>&1; then
    log_error "Required command '$1' not found"
    return 1
  fi
  return 0
}

# Validation step 1: Check dependencies

#######################################
# Verify that every external command this script relies on is available.
# All commands are checked (no early exit) so each missing one is reported.
# Returns: 0 if curl and jq are both found, 1 otherwise
#######################################
validate_dependencies() {
  log_info "Checking dependencies..."

  local missing=0
  local cmd # keep the loop variable out of the global scope
  for cmd in curl jq; do
    check_dependency "$cmd" || missing=1
  done

  if ((missing)); then
    log_error "Missing required dependencies. Install curl and jq."
    return 1
  fi

  log_info "✓ All dependencies present"
  return 0
}

# Validation step 2: Validate webhook URLs

#######################################
# Check one webhook URL for presence and shape.
# Arguments:
#   $1 - name of the environment variable (used in messages)
#   $2 - human-readable label for the success message
#   $3 - the URL value to check
#   $4 - "true" if the webhook is mandatory; an empty optional webhook only
#        warns, and the extra "how to fix" hints are printed only for the
#        mandatory one
# Returns: 0 if the URL is valid (or empty and optional), 1 otherwise
#######################################
_check_webhook_url() {
  local var_name=$1 label=$2 url=$3 required=$4
  # Must have something after /services/ and contain no whitespace.
  local -r url_pattern='^https://hooks\.slack\.com/services/[^[:space:]]+$'

  if [[ -z "$url" ]]; then
    if [[ "$required" == true ]]; then
      log_error "$var_name not set"
      log_info "Set it with: export $var_name='https://hooks.slack.com/services/...'"
      return 1
    fi
    log_warn "$var_name not set (optional)"
    return 0
  fi

  if [[ ! "$url" =~ $url_pattern ]]; then
    log_error "$var_name has invalid format"
    if [[ "$required" == true ]]; then
      log_info "Expected format: https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXX"
    fi
    return 1
  fi

  log_info "✓ $label webhook URL format valid"
  return 0
}

#######################################
# Validate the three Slack webhook URLs taken from the environment.
# The critical webhook is mandatory; warning and info webhooks may be empty.
# Globals: SLACK_WEBHOOK_CRITICAL, SLACK_WEBHOOK_WARNING, SLACK_WEBHOOK_INFO (read)
# Returns: 0 if every check passed, 1 if any failed
#######################################
validate_webhook_urls() {
  log_info "Validating Slack webhook URLs..."

  local failed=0

  _check_webhook_url SLACK_WEBHOOK_CRITICAL Critical "$SLACK_WEBHOOK_CRITICAL" true || failed=1
  _check_webhook_url SLACK_WEBHOOK_WARNING Warning "$SLACK_WEBHOOK_WARNING" false || failed=1
  _check_webhook_url SLACK_WEBHOOK_INFO Info "$SLACK_WEBHOOK_INFO" false || failed=1

  return "$failed"
}

# Validation step 3: Test message posting

#######################################
# Build one test alert and POST it to a Slack webhook.
# Arguments:
#   $1 - webhook URL
#   $2 - channel name
#   $3 - severity label shown in the "Severity" field
#   $4 - attachment color
#   $5 - icon emoji for the bot
#   $6 - attachment title
# Outputs: on failure, the error/response text on stdout (nothing on success)
# Returns: 0 if the response body is exactly "ok", 1 otherwise
#######################################
_post_slack_test_message() {
  local webhook=$1 channel=$2 severity=$3 color=$4 icon=$5 title=$6
  local payload response

  # Let jq assemble the JSON so that quotes or backslashes in any value are
  # escaped correctly instead of corrupting a hand-spliced string.
  if ! payload=$(
    jq -n \
      --arg channel "$channel" \
      --arg icon "$icon" \
      --arg color "$color" \
      --arg title "$title" \
      --arg severity "$severity" \
      --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
      '{
        channel: $channel,
        username: "StemeDB Alerts",
        icon_emoji: $icon,
        attachments: [{
          color: $color,
          title: $title,
          text: "This is a test message from setup-slack.sh. Safe to ignore.",
          fields: [
            {title: "Severity", value: $severity, short: true},
            {title: "Timestamp", value: $timestamp, short: true}
          ],
          footer: "StemeDB Monitoring"
        }]
      }' 2>&1
  ); then
    printf 'could not build JSON payload: %s\n' "$payload"
    return 1
  fi

  # -sS: suppress the progress meter (stderr is captured below, and the meter
  # would otherwise make the "ok" comparison fail) but still report errors.
  # --max-time: do not hang the validation run on an unresponsive endpoint.
  response=$(curl -sS --max-time 15 -X POST "$webhook" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>&1) || true

  if [[ "$response" == "ok" ]]; then
    return 0
  fi
  printf '%s\n' "$response"
  return 1
}

#######################################
# Send a test alert to each configured Slack webhook.
# Does nothing in --dry-run or --validate-only mode. A failure on the critical
# webhook fails the step; failures on the warning/info webhooks only warn.
# Globals: DRY_RUN, VALIDATE_ONLY, SLACK_WEBHOOK_*, SLACK_CHANNEL_* (read)
# Returns: 0 on success or when skipped, 1 if the critical post failed
#######################################
test_message_posting() {
  log_info "Testing message posting to Slack channels..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would send test messages to Slack"
    return 0
  fi

  if [[ "$VALIDATE_ONLY" == true ]]; then
    log_info "Skipping test messages (--validate-only mode)"
    return 0
  fi

  local failed=0
  local response

  # Test critical channel
  if [[ -n "$SLACK_WEBHOOK_CRITICAL" ]]; then
    log_info "Sending test message to $SLACK_CHANNEL_CRITICAL..."
    if response=$(_post_slack_test_message "$SLACK_WEBHOOK_CRITICAL" \
      "$SLACK_CHANNEL_CRITICAL" "CRITICAL" "danger" ":warning:" \
      "🔴 CRITICAL: StemeDB Setup Test"); then
      log_info "✓ Test message sent to $SLACK_CHANNEL_CRITICAL"
    else
      log_error "Failed to send message to $SLACK_CHANNEL_CRITICAL"
      log_error "Response: $response"
      failed=1
    fi
  fi

  # Test warning channel
  if [[ -n "$SLACK_WEBHOOK_WARNING" ]]; then
    log_info "Sending test message to $SLACK_CHANNEL_WARNING..."
    if response=$(_post_slack_test_message "$SLACK_WEBHOOK_WARNING" \
      "$SLACK_CHANNEL_WARNING" "WARNING" "warning" ":warning:" \
      "🟡 WARNING: StemeDB Setup Test"); then
      log_info "✓ Test message sent to $SLACK_CHANNEL_WARNING"
    else
      log_warn "Failed to send message to $SLACK_CHANNEL_WARNING"
      log_warn "Response: $response"
    fi
  fi

  # Test info channel
  if [[ -n "$SLACK_WEBHOOK_INFO" ]]; then
    log_info "Sending test message to $SLACK_CHANNEL_INFO..."
    if response=$(_post_slack_test_message "$SLACK_WEBHOOK_INFO" \
      "$SLACK_CHANNEL_INFO" "INFO" "good" ":information_source:" \
      "ℹ️ INFO: StemeDB Setup Test"); then
      log_info "✓ Test message sent to $SLACK_CHANNEL_INFO"
    else
      log_warn "Failed to send message to $SLACK_CHANNEL_INFO"
      log_warn "Response: $response"
    fi
  fi

  return "$failed"
}

# Validation step 4: Verify formatting renders correctly

#######################################
# Print the manual checklist for confirming the test messages in Slack.
# Nothing is verified automatically; the function only logs instructions,
# and skips them in --dry-run / --validate-only mode.
# Globals: DRY_RUN, VALIDATE_ONLY (read)
# Returns: 0 always
#######################################
verify_formatting() {
  log_info "Verifying message formatting..."

  if [[ "$DRY_RUN" == true || "$VALIDATE_ONLY" == true ]]; then
    log_info "Skipping formatting verification (requires manual check)"
    return 0
  fi

  log_info "Please check Slack channels to verify:"
  log_info " 1. Messages appear in correct channels"
  log_info " 2. Color coding is correct (red=critical, yellow=warning, green=info)"
  log_info " 3. Formatting renders properly (fields, footer, emoji)"
  log_info " 4. Bot icon and username are correct"

  return 0
}

# Validation step 5: Check Alertmanager configuration

#######################################
# Inspect the Alertmanager config file for Slack receiver settings.
# This is a text search (grep), not a YAML parse, and is advisory only.
# Globals:   ALERTMANAGER_CONFIG (read, optional) - config path override
# Arguments: $1 - config path (optional); falls back to ALERTMANAGER_CONFIG,
#                 then to /etc/prometheus/alertmanager.yml
# Returns:   0 always; problems are reported as warnings
#######################################
verify_alertmanager_config() {
  log_info "Verifying Alertmanager Slack configuration..."

  local alertmanager_config="${1:-${ALERTMANAGER_CONFIG:-/etc/prometheus/alertmanager.yml}}"

  if [[ ! -f "$alertmanager_config" ]]; then
    log_warn "Alertmanager config not found at $alertmanager_config"
    log_info "Ensure Slack receivers are configured in Alertmanager"
    return 0
  fi

  # Verify Slack receiver is configured
  if ! grep -q -- "slack_configs" "$alertmanager_config"; then
    log_warn "No Slack receivers found in Alertmanager config"
    log_info "Add Slack receivers to $alertmanager_config"
    return 0
  fi

  log_info "✓ Slack receivers configured in Alertmanager"

  # Count configured Slack receivers.
  # grep -c already prints "0" when nothing matches but exits non-zero, so the
  # status is ignored rather than echoing a second "0" into the value.
  local slack_count
  slack_count=$(grep -c -- "api_url:" "$alertmanager_config") || true
  log_info " Found ${slack_count:-0} Slack webhook(s) configured"

  # Check for channel routing
  if grep -q -- "channel:" "$alertmanager_config"; then
    log_info " ✓ Channel routing configured"
  else
    log_warn " Warning: No explicit channel routing found"
  fi

  return 0
}

# Main execution

#######################################
# Run every validation step in order and print a PASSED/FAILED summary.
# A failing step does not stop the run; later steps still execute so that
# all problems are reported in one pass.
# Globals: DRY_RUN (read)
# Exits:   0 if every step succeeded, 1 if any step failed
#######################################
main() {
  echo "========================================="
  echo "StemeDB Slack Setup Validation"
  echo "========================================="
  echo ""

  if [[ "$DRY_RUN" == true ]]; then
    log_info "Running in DRY RUN mode - no changes will be made"
  fi

  local failed=0
  local step

  # Run validation steps
  for step in \
    validate_dependencies \
    validate_webhook_urls \
    test_message_posting \
    verify_formatting \
    verify_alertmanager_config; do
    "$step" || failed=1
  done

  echo ""
  echo "========================================="
  if ((failed == 0)); then
    log_info "✓ Slack validation PASSED"
    echo "========================================="
    exit 0
  else
    log_error "✗ Slack validation FAILED"
    echo "========================================="
    exit 1
  fi
}

# Run main function
main "$@"