stemedb/scripts/setup-slack.sh
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

372 lines
10 KiB
Bash
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Setup and validate Slack integration for StemeDB alerting
#
# Usage:
# ./setup-slack.sh # Full validation
# ./setup-slack.sh --validate-only # Skip test message posting
# ./setup-slack.sh --dry-run # Show what would be done
set -euo pipefail

# --- Terminal colours -------------------------------------------------------
# Stored as literal backslash sequences (not raw ESC bytes), so they only take
# effect when printed by a command that interprets backslash escapes.
NC='\033[0m' # reset / no colour
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# --- Slack settings ---------------------------------------------------------
# Each value is taken from the environment when present and non-empty;
# otherwise it falls back to the default given here.
: "${SLACK_WEBHOOK_CRITICAL:=}"
: "${SLACK_WEBHOOK_WARNING:=}"
: "${SLACK_WEBHOOK_INFO:=}"
: "${SLACK_CHANNEL_CRITICAL:=#stemedb-alerts-critical}"
: "${SLACK_CHANNEL_WARNING:=#stemedb-alerts-warning}"
: "${SLACK_CHANNEL_INFO:=#stemedb-alerts-info}"

# --- Run modes (switched on by command-line flags) --------------------------
DRY_RUN=false
VALIDATE_ONLY=false
# Parse arguments
#
# parse_args [FLAG...]
#   Translates command-line flags into the VALIDATE_ONLY / DRY_RUN mode
#   switches.
#
#   --validate-only  sets VALIDATE_ONLY=true
#   --dry-run        sets DRY_RUN=true
#   --help           prints usage and exits 0
#   anything else    prints a diagnostic on stderr and exits 1
#
# The flags are plain switches without values, so a simple iteration over the
# argument list is sufficient; no `shift` is needed (and it would not affect a
# `for ... in "$@"` loop anyway, since the list is expanded once up front).
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --validate-only)
        VALIDATE_ONLY=true
        ;;
      --dry-run)
        DRY_RUN=true
        ;;
      --help)
        echo "Usage: $0 [--validate-only] [--dry-run] [--help]"
        echo ""
        echo "Options:"
        echo " --validate-only Skip test message posting"
        echo " --dry-run Show what would be done without executing"
        echo " --help Show this help message"
        echo ""
        echo "Environment variables:"
        echo " SLACK_WEBHOOK_CRITICAL Webhook URL for critical alerts"
        echo " SLACK_WEBHOOK_WARNING Webhook URL for warning alerts"
        echo " SLACK_WEBHOOK_INFO Webhook URL for info alerts"
        echo " SLACK_CHANNEL_CRITICAL Channel name (default: #stemedb-alerts-critical)"
        echo " SLACK_CHANNEL_WARNING Channel name (default: #stemedb-alerts-warning)"
        echo " SLACK_CHANNEL_INFO Channel name (default: #stemedb-alerts-info)"
        exit 0
        ;;
      *)
        # Usage errors are diagnostics, so they belong on stderr.
        echo "Unknown argument: $arg" >&2
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}
parse_args "$@"
# Helper functions
# log_info MESSAGE
#   Prints MESSAGE on stdout behind a green "[INFO]" tag.
#
#   Only the colour variables are run through escape interpretation (%b);
#   MESSAGE itself is printed verbatim (%s), so backslashes in paths or in
#   captured command output are not turned into tabs or newlines.
log_info() {
  printf '%b[INFO]%b %s\n' "${GREEN-}" "${NC-}" "${1-}"
}
# log_warn MESSAGE
#   Prints MESSAGE on stdout behind a yellow "[WARN]" tag.
#
#   Only the colour variables are run through escape interpretation (%b);
#   MESSAGE itself is printed verbatim (%s), so backslashes in paths or in
#   captured command output are not turned into tabs or newlines.
log_warn() {
  printf '%b[WARN]%b %s\n' "${YELLOW-}" "${NC-}" "${1-}"
}
# log_error MESSAGE
#   Prints MESSAGE on stdout behind a red "[ERROR]" tag.
#
#   Only the colour variables are run through escape interpretation (%b);
#   MESSAGE itself is printed verbatim (%s), so backslashes in paths or in
#   captured command output are not turned into tabs or newlines.
log_error() {
  printf '%b[ERROR]%b %s\n' "${RED-}" "${NC-}" "${1-}"
}
# check_dependency COMMAND
#   Succeeds silently when COMMAND can be resolved by the shell; otherwise
#   reports it through log_error and returns 1.
check_dependency() {
  local tool=$1
  command -v "$tool" > /dev/null 2>&1 && return 0
  log_error "Required command '$tool' not found"
  return 1
}
# Validation step 1: Check dependencies
#
# Verifies that every external tool the script relies on (curl, jq) is
# available. All tools are checked even after the first miss so that each
# missing one is reported individually.
#
# Returns: 0 when everything is present, 1 when at least one tool is missing.
validate_dependencies() {
  log_info "Checking dependencies..."
  # Both variables are function-local; in particular the loop variable must
  # not leak into (or overwrite) a global of the same name.
  local missing=0
  local cmd
  for cmd in curl jq; do
    if ! check_dependency "$cmd"; then
      missing=1
    fi
  done
  if [[ "$missing" -eq 1 ]]; then
    log_error "Missing required dependencies. Install curl and jq."
    return 1
  fi
  log_info "✓ All dependencies present"
  return 0
}
# Validation step 2: Validate webhook URLs

# _check_webhook_url VAR_NAME URL LABEL REQUIRED
#   Checks one webhook setting.
#     VAR_NAME  name of the environment variable (used in messages)
#     URL       its current value
#     LABEL     human-readable tier name used in the success message
#     REQUIRED  "true" when an empty value is an error; any other value makes
#               an empty URL a warning only. Required webhooks also get
#               remediation hints on failure.
#   Returns 0 when the setting is acceptable, 1 otherwise.
_check_webhook_url() {
  local var_name=$1 url=$2 label=$3 required=$4
  local prefix_re='^https://hooks\.slack\.com/services/'

  if [[ -z "$url" ]]; then
    if [[ "$required" == true ]]; then
      log_error "$var_name not set"
      log_info "Set it with: export $var_name='https://hooks.slack.com/services/...'"
      return 1
    fi
    log_warn "$var_name not set (optional)"
    return 0
  fi

  if [[ "$url" =~ $prefix_re ]]; then
    log_info "✓ $label webhook URL format valid"
    return 0
  fi

  log_error "$var_name has invalid format"
  if [[ "$required" == true ]]; then
    log_info "Expected format: https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXX"
  fi
  return 1
}

# Checks the three webhook settings in order (critical, warning, info).
# The critical webhook is mandatory; the other two may be left empty.
# Returns 1 if any setting is rejected, 0 otherwise.
validate_webhook_urls() {
  log_info "Validating Slack webhook URLs..."
  local status=0
  _check_webhook_url SLACK_WEBHOOK_CRITICAL "$SLACK_WEBHOOK_CRITICAL" Critical true || status=1
  _check_webhook_url SLACK_WEBHOOK_WARNING "$SLACK_WEBHOOK_WARNING" Warning false || status=1
  _check_webhook_url SLACK_WEBHOOK_INFO "$SLACK_WEBHOOK_INFO" Info false || status=1
  return "$status"
}
# Validation step 3: Test message posting

# _post_slack_test_message WEBHOOK CHANNEL SEVERITY COLOR ICON TITLE
#   Sends one test alert to WEBHOOK.
#   Writes the raw reply (or the reason the request could not be made) to
#   stdout and returns 0 only when the reply body is exactly "ok".
_post_slack_test_message() {
  local webhook=$1 channel=$2 severity=$3 color=$4 icon=$5 title=$6
  local payload reply

  # Build the JSON with jq so that quotes or backslashes in any value are
  # escaped correctly instead of corrupting a hand-spliced string.
  if ! payload=$(jq -n \
    --arg channel "$channel" \
    --arg icon "$icon" \
    --arg color "$color" \
    --arg title "$title" \
    --arg severity "$severity" \
    --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    '{
      channel: $channel,
      username: "StemeDB Alerts",
      icon_emoji: $icon,
      attachments: [{
        color: $color,
        title: $title,
        text: "This is a test message from setup-slack.sh. Safe to ignore.",
        fields: [
          {title: "Severity", value: $severity, short: true},
          {title: "Timestamp", value: $timestamp, short: true}
        ],
        footer: "StemeDB Monitoring"
      }]
    }' 2>&1); then
    printf 'could not build JSON payload: %s' "$payload"
    return 1
  fi

  # -s suppresses the progress meter, which curl writes to stderr and which
  # would otherwise be captured by 2>&1 and make the "ok" comparison fail.
  # -S keeps real error messages so they still show up in the reply text.
  # --max-time bounds the request so an unreachable endpoint cannot hang us.
  # `|| true`: a non-zero curl exit is handled through the reply comparison.
  reply=$(curl -sS --max-time 15 -X POST \
    -H 'Content-Type: application/json' \
    --data "$payload" \
    "$webhook" 2>&1) || true

  printf '%s' "$reply"
  [[ "$reply" == "ok" ]]
}

# Posts a test alert to every configured webhook.
#
# Globals read: DRY_RUN, VALIDATE_ONLY, SLACK_WEBHOOK_{CRITICAL,WARNING,INFO},
#               SLACK_CHANNEL_{CRITICAL,WARNING,INFO}
# Returns: 0 in --dry-run / --validate-only mode (nothing is sent);
#          1 when the critical webhook rejects the message; 0 otherwise.
#          Failures on the warning and info webhooks are only logged as
#          warnings and do not affect the return value.
test_message_posting() {
  log_info "Testing message posting to Slack channels..."
  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would send test messages to Slack"
    return 0
  fi
  if [[ "$VALIDATE_ONLY" == true ]]; then
    log_info "Skipping test messages (--validate-only mode)"
    return 0
  fi

  local failed=0
  local response

  # Test critical channel
  if [[ -n "$SLACK_WEBHOOK_CRITICAL" ]]; then
    log_info "Sending test message to $SLACK_CHANNEL_CRITICAL..."
    if response=$(_post_slack_test_message "$SLACK_WEBHOOK_CRITICAL" \
      "$SLACK_CHANNEL_CRITICAL" "CRITICAL" "danger" ":warning:" \
      "🔴 CRITICAL: StemeDB Setup Test"); then
      log_info "✓ Test message sent to $SLACK_CHANNEL_CRITICAL"
    else
      log_error "Failed to send message to $SLACK_CHANNEL_CRITICAL"
      log_error "Response: $response"
      failed=1
    fi
  fi

  # Test warning channel
  if [[ -n "$SLACK_WEBHOOK_WARNING" ]]; then
    log_info "Sending test message to $SLACK_CHANNEL_WARNING..."
    if response=$(_post_slack_test_message "$SLACK_WEBHOOK_WARNING" \
      "$SLACK_CHANNEL_WARNING" "WARNING" "warning" ":warning:" \
      "🟡 WARNING: StemeDB Setup Test"); then
      log_info "✓ Test message sent to $SLACK_CHANNEL_WARNING"
    else
      log_warn "Failed to send message to $SLACK_CHANNEL_WARNING"
      log_warn "Response: $response"
    fi
  fi

  # Test info channel
  # NOTE(review): the info title carried only a leading space where the other
  # tiers have an icon; the icon below is a best-guess restoration - confirm.
  if [[ -n "$SLACK_WEBHOOK_INFO" ]]; then
    log_info "Sending test message to $SLACK_CHANNEL_INFO..."
    if response=$(_post_slack_test_message "$SLACK_WEBHOOK_INFO" \
      "$SLACK_CHANNEL_INFO" "INFO" "good" ":information_source:" \
      "ℹ️ INFO: StemeDB Setup Test"); then
      log_info "✓ Test message sent to $SLACK_CHANNEL_INFO"
    else
      log_warn "Failed to send message to $SLACK_CHANNEL_INFO"
      log_warn "Response: $response"
    fi
  fi

  return "$failed"
}
# Validation step 4: Verify formatting renders correctly
#
# Rendering can only be judged by a person looking at Slack, so this step
# prints a checklist. In --dry-run or --validate-only mode it only notes that
# the check is skipped. Always returns 0.
verify_formatting() {
  log_info "Verifying message formatting..."

  if [[ "$DRY_RUN" != true && "$VALIDATE_ONLY" != true ]]; then
    local line
    for line in \
      "Please check Slack channels to verify:" \
      " 1. Messages appear in correct channels" \
      " 2. Color coding is correct (red=critical, yellow=warning, green=info)" \
      " 3. Formatting renders properly (fields, footer, emoji)" \
      " 4. Bot icon and username are correct"; do
      log_info "$line"
    done
  else
    log_info "Skipping formatting verification (requires manual check)"
  fi

  return 0
}
# Validation step 5: Check Alertmanager configuration
#
# verify_alertmanager_config [CONFIG_PATH]
#   Performs a textual sanity check of an Alertmanager configuration file:
#   whether it mentions "slack_configs", how many lines contain "api_url:",
#   and whether any "channel:" entry exists.
#
#   CONFIG_PATH  file to inspect (default: /etc/prometheus/alertmanager.yml)
#
#   Purely advisory: findings are logged and the function always returns 0,
#   including when the file does not exist.
verify_alertmanager_config() {
  log_info "Verifying Alertmanager Slack configuration..."
  local alertmanager_config="${1:-/etc/prometheus/alertmanager.yml}"

  if [[ ! -f "$alertmanager_config" ]]; then
    log_warn "Alertmanager config not found at $alertmanager_config"
    log_info "Ensure Slack receivers are configured in Alertmanager"
    return 0
  fi

  # Verify Slack receiver is configured
  if grep -q -- "slack_configs" "$alertmanager_config"; then
    log_info "✓ Slack receivers configured in Alertmanager"

    # Count configured Slack receivers.
    # grep -c already prints "0" when nothing matches but exits with status 1,
    # so the status is ignored rather than echoing a second "0" on top of it.
    local slack_count
    slack_count=$(grep -c -- "api_url:" "$alertmanager_config") || true
    log_info " Found ${slack_count:-0} Slack webhook(s) configured"

    # Check for channel routing
    if grep -q -- "channel:" "$alertmanager_config"; then
      log_info " ✓ Channel routing configured"
    else
      log_warn " Warning: No explicit channel routing found"
    fi
  else
    log_warn "No Slack receivers found in Alertmanager config"
    log_info "Add Slack receivers to $alertmanager_config"
  fi
  return 0
}
# Main execution
#
# Prints a banner, runs every validation step in order (a failing step does
# not stop the later ones), then prints the verdict and exits the script:
# status 0 when all steps succeeded, 1 when any of them failed.
main() {
  local rule="========================================="

  echo "$rule"
  echo "StemeDB Slack Setup Validation"
  echo "$rule"
  echo ""

  if [[ "$DRY_RUN" == true ]]; then
    log_info "Running in DRY RUN mode - no changes will be made"
  fi

  # Run validation steps
  local failed=0
  local step
  for step in \
    validate_dependencies \
    validate_webhook_urls \
    test_message_posting \
    verify_formatting \
    verify_alertmanager_config; do
    "$step" || failed=1
  done

  echo ""
  echo "$rule"
  if [[ "$failed" -ne 0 ]]; then
    log_error "✗ Slack validation FAILED"
    echo "$rule"
    exit 1
  fi
  log_info "✓ Slack validation PASSED"
  echo "$rule"
  exit 0
}
# Script entry point: hand control (and the original argument list) to main.
main "$@"