This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
388 lines
11 KiB
Bash
Executable File
388 lines
11 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
#
|
|
# StemeDB Backup & DR Integration Tests
|
|
#
|
|
# End-to-end test suite validating all P5.3 components:
|
|
# - Backup creation
|
|
# - Retention policy
|
|
# - Backup verification
|
|
# - WAL archival
|
|
# - S3 upload
|
|
# - Metrics export
|
|
# - Alert rules
|
|
#
|
|
# Usage:
|
|
# ./uat/production-readiness/backup-dr-tests.sh
|
|
#
|
|
# Exit codes:
|
|
# 0 - All tests passed
|
|
# 1 - One or more tests failed
|
|
#
|
|
|
|
# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Configuration
#
#   SCRIPT_DIR      absolute directory containing this script
#   PROJECT_DIR     two directory levels above SCRIPT_DIR
#   TEST_DATA_DIR   scratch root; the remaining directories live beneath it
#
# Values produced by command substitution are assigned first and marked
# readonly afterwards: `readonly VAR="$(cmd)"` reports the status of
# `readonly` itself, which would hide a failing cd/dirname from `set -e`.
#
# Because these names are readonly, bash refuses to re-assign them in a
# `VAR=value command` prefix; use `env VAR=value command` to hand them to a
# child process.
SCRIPT_DIR="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
readonly PROJECT_DIR
readonly TEST_DATA_DIR="/tmp/stemedb-backup-dr-test"
readonly TEST_WAL_DIR="${TEST_DATA_DIR}/wal"
readonly TEST_DB_DIR="${TEST_DATA_DIR}/db"
readonly TEST_BACKUP_DIR="${TEST_DATA_DIR}/backups"
readonly METRICS_DIR="${TEST_DATA_DIR}/metrics"
|
|
|
|
# Colors
#
# ANSI escape sequences kept in backslash form; whatever prints them is
# responsible for expanding the escapes (e.g. `echo -e` or printf's %b).
# They never change, so they are declared readonly.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # reset ("no color")

# Test results
#
# Mutable run state: counters for executed/passed/failed tests and the names
# of the tests that failed, in the order they failed.
TESTS_RUN=0
TESTS_PASSED=0
TESTS_FAILED=0
declare -a FAILED_TESTS=()
|
|
|
|
# Logging
#
# Each helper writes one line to stdout: a colored tag followed by all
# arguments joined with single spaces.
#
# printf is used with %b for the color variables (which hold backslash
# escapes) and %s for the message, so escape sequences are expanded in the
# colors only. Message text such as a path containing "\n" or "\t" is printed
# verbatim instead of being interpreted.
info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"; }
success() { printf '%b[PASS]%b %s\n' "$GREEN" "$NC" "$*"; }
fail_test() { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }
|
|
|
|
# Test helpers
|
|
#######################################
# Build a fresh fixture tree for the test run.
#
# Removes TEST_DATA_DIR, recreates the WAL/DB/backup/metrics directories,
# then writes 10 fake WAL segments (the four bytes "STEM" followed by 100
# blocks of 1K read from /dev/urandom) and 5 fake DB files (10 blocks of 1M
# read from /dev/urandom).
#
# Globals:  TEST_DATA_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#           METRICS_DIR (all read)
# Outputs:  progress messages via info/success
#######################################
setup() {
  # Loop counter and path buffers are local so nothing leaks into the
  # caller's scope.
  local idx segment_path db_path

  info "Setting up test environment..."

  # Clean previous test data. ":?" aborts rather than letting an empty or
  # unset variable reach rm -rf.
  rm -rf -- "${TEST_DATA_DIR:?}"

  # Create test directories
  mkdir -p -- "$TEST_WAL_DIR" "$TEST_DB_DIR" "$TEST_BACKUP_DIR" "$METRICS_DIR"

  # Create fake WAL files
  for idx in {1..10}; do
    printf -v segment_path '%s/segment-%05d.wal' "$TEST_WAL_DIR" "$idx"
    # Write STEM magic bytes + some data
    printf '\x53\x54\x45\x4d' > "$segment_path"
    dd if=/dev/urandom bs=1K count=100 >> "$segment_path" 2>/dev/null
  done

  # Create fake DB files
  for idx in {1..5}; do
    printf -v db_path '%s/data-%03d.kv' "$TEST_DB_DIR" "$idx"
    dd if=/dev/urandom of="$db_path" bs=1M count=10 2>/dev/null
  done

  success "Test environment ready"
}
|
|
|
|
#######################################
# Remove the fixture tree created for the test run.
#
# Globals:  TEST_DATA_DIR (read)
# Outputs:  progress messages via info/success
#######################################
teardown() {
  info "Cleaning up test environment..."
  # ":?" makes the shell abort instead of running rm -rf on an empty path.
  rm -rf -- "${TEST_DATA_DIR:?}"
  success "Cleanup complete"
}
|
|
|
|
#######################################
# Run one test function and record its outcome.
#
# Globals:   TESTS_RUN, TESTS_PASSED, TESTS_FAILED, FAILED_TESTS (written)
# Arguments: $1 - human-readable test name
#            $2 - name of the function (or command) to execute
# Outputs:   a banner with the running test number and name, then a
#            PASS/FAIL line via success/fail_test
#######################################
run_test() {
  local test_name="$1"
  local test_func="$2"

  # Counters use assignment rather than ((var++)): the arithmetic command
  # returns status 1 when the expression evaluates to 0, which happens on the
  # first increment and would make `set -e` terminate the whole script.
  TESTS_RUN=$((TESTS_RUN + 1))
  echo ""
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "Test $TESTS_RUN: $test_name"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # Running the function as an `if` condition means errexit is not applied
  # inside it; test functions must report failure through their return status.
  if "$test_func"; then
    TESTS_PASSED=$((TESTS_PASSED + 1))
    success "$test_name"
  else
    TESTS_FAILED=$((TESTS_FAILED + 1))
    FAILED_TESTS+=("$test_name")
    fail_test "$test_name"
  fi
}
|
|
|
|
# Test 1: Backup creation
#
# Runs scripts/backup-stemedb.sh against the fixture directories and inspects
# the result on disk: exactly one stemedb-backup-* directory must exist under
# TEST_BACKUP_DIR, containing backup-metadata.json, a wal/ directory with 10
# *.wal files and a db/ directory with 5 *.kv files (the numbers of files that
# setup generates).
#
# Globals:  PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#           METRICS_DIR (all read)
# Returns:  0 when every check passes, 1 otherwise
test_backup_creation() {
  local backup_count backup_dir wal_count db_count

  info "Testing backup creation..."

  # The settings are passed through env(1) instead of a `VAR=value command`
  # prefix: METRICS_DIR is declared readonly at file scope, and bash rejects a
  # prefix assignment to a readonly variable without running the command.
  env \
    STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" || return 1

  # Verify backup exists
  backup_count=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)
  if [[ $backup_count -ne 1 ]]; then
    fail_test "Expected 1 backup, found $backup_count"
    return 1
  fi

  # Verify backup structure
  backup_dir=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | head -n1)

  if [[ ! -f "${backup_dir}/backup-metadata.json" ]]; then
    fail_test "Missing backup-metadata.json"
    return 1
  fi
  if [[ ! -d "${backup_dir}/wal" ]]; then
    fail_test "Missing wal/ directory"
    return 1
  fi
  if [[ ! -d "${backup_dir}/db" ]]; then
    fail_test "Missing db/ directory"
    return 1
  fi

  # Verify file counts
  wal_count=$(find "${backup_dir}/wal" -name "*.wal" | wc -l)
  if [[ $wal_count -ne 10 ]]; then
    fail_test "Expected 10 WAL files, found $wal_count"
    return 1
  fi

  db_count=$(find "${backup_dir}/db" -name "*.kv" | wc -l)
  if [[ $db_count -ne 5 ]]; then
    fail_test "Expected 5 DB files, found $db_count"
    return 1
  fi

  success "Backup created successfully with correct structure"
  return 0
}
|
|
|
|
# Test 2: Retention policy
#
# Creates five backups one second apart, then runs the backup script once
# more with `--keep-last 2d` and checks that at least 3 stemedb-backup-*
# directories are still present afterwards.
#
# NOTE(review): per the original author's comments, "2d" means "two days" and
# the backup script is expected to always retain a minimum of 3 backups;
# confirm both against scripts/backup-stemedb.sh. Only the lower bound is
# asserted here.
#
# Globals:  PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#           METRICS_DIR (all read)
# Returns:  0 when at least 3 backups remain, 1 on any failure
test_retention_policy() {
  local round backup_count

  info "Testing retention policy..."

  # Create 5 backups with different timestamps.
  # env(1) is used instead of a `VAR=value command` prefix because
  # METRICS_DIR is readonly at file scope and bash rejects prefix assignments
  # to readonly variables.
  for round in {1..5}; do
    # This function is executed as an `if` condition by run_test, so errexit
    # does not apply here; a failed backup has to be checked explicitly.
    if ! env \
      STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
      STEMEDB_DB_DIR="$TEST_DB_DIR" \
      METRICS_DIR="$METRICS_DIR" \
      "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" >/dev/null; then
      fail_test "Backup $round of 5 could not be created"
      return 1
    fi

    sleep 1 # Ensure different timestamps
  done

  # Apply retention
  env \
    STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" \
    --output "$TEST_BACKUP_DIR" \
    --keep-last 2d || return 1

  # Count remaining backups
  backup_count=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

  # Should have at least 3 (minimum retention)
  if [[ $backup_count -lt 3 ]]; then
    fail_test "Retention policy too aggressive: only $backup_count backups remain"
    return 1
  fi

  success "Retention policy working correctly (kept $backup_count backups)"
  return 0
}
|
|
|
|
# Test 3: Backup verification
#
# Creates a backup, runs scripts/verify-backup.sh on the most recently
# modified stemedb-backup-* directory, and then checks that
# ${METRICS_DIR}/stemedb_backup.prom contains a
# stemedb_backup_verification_status sample whose value is 1.
#
# Globals:  PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#           METRICS_DIR (all read)
# Returns:  0 when verification succeeds and the metric reports 1,
#           1 otherwise
test_backup_verification() {
  local metrics_file="${METRICS_DIR}/stemedb_backup.prom"
  local candidate newest_backup=""

  info "Testing backup verification..."

  # Create a backup.
  # env(1) is used instead of a `VAR=value command` prefix because
  # METRICS_DIR is readonly at file scope and bash rejects prefix assignments
  # to readonly variables. The status is checked explicitly: run_test executes
  # this function as an `if` condition, where errexit does not apply.
  if ! env \
    STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" >/dev/null; then
    fail_test "Backup creation failed"
    return 1
  fi

  # Pick the newest backup directory by modification time. A glob plus -nt
  # avoids parsing `ls` output and ignores entries that are not backups.
  for candidate in "$TEST_BACKUP_DIR"/stemedb-backup-*/; do
    [[ -d "$candidate" ]] || continue # unmatched glob stays literal
    if [[ -z "$newest_backup" || "$candidate" -nt "$newest_backup" ]]; then
      newest_backup="$candidate"
    fi
  done
  newest_backup="${newest_backup%/}"
  if [[ -z "$newest_backup" ]]; then
    fail_test "No backup found in $TEST_BACKUP_DIR"
    return 1
  fi

  # Verify it
  env METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/verify-backup.sh" "$newest_backup" || return 1

  # Check metrics were written
  if [[ ! -f "$metrics_file" ]]; then
    fail_test "Metrics file not created"
    return 1
  fi

  # Verify metrics content. The pattern is anchored to a sample line
  # (name, optional {labels}, value) so that digits inside "# HELP" text,
  # label values or timestamps cannot satisfy the check.
  if ! grep -Eq '^stemedb_backup_verification_status(\{[^}]*\})?[[:space:]]+1(\.0+)?([[:space:]]|$)' "$metrics_file"; then
    fail_test "Verification status not set to 1 (passed)"
    return 1
  fi

  success "Backup verification passed and metrics written"
  return 0
}
|
|
|
|
# Test 4: WAL magic byte detection
#
# Creates a backup, overwrites one WAL file inside a stemedb-backup-*
# directory with four zero bytes, and expects scripts/verify-backup.sh to
# exit non-zero for that backup and to record a
# stemedb_backup_verification_status value of 0 in
# ${METRICS_DIR}/stemedb_backup.prom.
#
# Globals:  PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#           METRICS_DIR (all read)
# Returns:  0 when the corruption is reported, 1 otherwise
test_wal_magic_validation() {
  local metrics_file="${METRICS_DIR}/stemedb_backup.prom"
  local backup_dir wal_entry target_wal=""

  info "Testing WAL magic byte validation..."

  # Create the backup that will be corrupted.
  # env(1) is used instead of a `VAR=value command` prefix because
  # METRICS_DIR is readonly at file scope and bash rejects prefix assignments
  # to readonly variables. The status is checked explicitly: run_test executes
  # this function as an `if` condition, where errexit does not apply.
  if ! env \
    STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" >/dev/null; then
    fail_test "Backup creation failed"
    return 1
  fi

  backup_dir=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | head -n1)
  if [[ -z "$backup_dir" ]]; then
    fail_test "No backup found in $TEST_BACKUP_DIR"
    return 1
  fi

  # Take the first regular file in wal/ (glob order) without parsing `ls`.
  for wal_entry in "${backup_dir}/wal"/*; do
    [[ -f "$wal_entry" ]] || continue # also skips an unmatched literal glob
    target_wal="$wal_entry"
    break
  done
  if [[ -z "$target_wal" ]]; then
    fail_test "No WAL files found in ${backup_dir}/wal"
    return 1
  fi

  # Corrupt first WAL file (wrong magic bytes); this truncates it to 4 bytes.
  printf '\x00\x00\x00\x00' > "$target_wal"

  # Verification should fail. Its stderr is discarded on purpose: error output
  # is the expected outcome here.
  if env METRICS_DIR="$METRICS_DIR" "${PROJECT_DIR}/scripts/verify-backup.sh" "$backup_dir" 2>/dev/null; then
    fail_test "Verification should have failed for corrupted WAL"
    return 1
  fi

  # Check metrics show failure. The pattern is anchored to a sample line
  # (name, optional {labels}, value) so that digits inside "# HELP" text,
  # label values or timestamps cannot satisfy the check.
  if ! grep -Eq '^stemedb_backup_verification_status(\{[^}]*\})?[[:space:]]+0(\.0+)?([[:space:]]|$)' "$metrics_file"; then
    fail_test "Verification status not set to 0 (failed)"
    return 1
  fi

  success "WAL corruption detected correctly"
  return 0
}
|
|
|
|
# Test 5: Dry run mode
#
# Counts the stemedb-backup-* directories under TEST_BACKUP_DIR, runs the
# backup script with --dry-run, and requires the count to be unchanged
# afterwards.
#
# Globals:  PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#           METRICS_DIR (all read)
# Returns:  0 when the script succeeds and no backup was added or removed,
#           1 otherwise
test_dry_run() {
  local count_before count_after

  info "Testing dry run mode..."

  count_before=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

  # Run backup in dry-run mode.
  # env(1) is used instead of a `VAR=value command` prefix because
  # METRICS_DIR is readonly at file scope and bash rejects prefix assignments
  # to readonly variables.
  env \
    STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" \
    --output "$TEST_BACKUP_DIR" \
    --dry-run || return 1

  count_after=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

  if [[ $count_before -ne $count_after ]]; then
    fail_test "Dry run created a backup (should not have)"
    return 1
  fi

  success "Dry run mode working correctly (no backup created)"
  return 0
}
|
|
|
|
# Test 6: Metrics export
#
# Runs the backup script and checks that ${METRICS_DIR}/stemedb_backup.prom
# exists and contains a line starting with each required metric name.
#
# Globals:  PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#           METRICS_DIR (all read)
# Returns:  0 when all required metrics are present, 1 otherwise
test_metrics_export() {
  local metrics_file="${METRICS_DIR}/stemedb_backup.prom"
  local metric
  local -a required_metrics=(
    "stemedb_backup_last_success_timestamp"
    "stemedb_backup_age_seconds"
    "stemedb_backup_size_bytes"
    "stemedb_backup_wal_files"
    "stemedb_backup_db_files"
  )

  info "Testing metrics export..."

  # Create backup with metrics.
  # env(1) is used instead of a `VAR=value command` prefix because
  # METRICS_DIR is readonly at file scope and bash rejects prefix assignments
  # to readonly variables.
  env \
    STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" || return 1

  # Verify metrics file exists
  if [[ ! -f "$metrics_file" ]]; then
    fail_test "Metrics file not created"
    return 1
  fi

  # Verify required metrics present. The name must be followed by "{", by
  # whitespace or by end of line, so that a longer name sharing the same
  # prefix (e.g. "<name>_total") is not mistaken for the required metric.
  for metric in "${required_metrics[@]}"; do
    if ! grep -Eq -- "^${metric}([{[:space:]]|\$)" "$metrics_file"; then
      fail_test "Missing metric: $metric"
      return 1
    fi
  done

  success "All required metrics exported correctly"
  return 0
}
|
|
|
|
# Test 7: Alert rules syntax
#
# Checks docs/operations/deployment/prometheus/backup-alerts.yml under
# PROJECT_DIR: the file must exist, must pass `yamllint -d relaxed` when
# yamllint is installed, and must define every required alert name.
#
# A missing yamllint only skips the lint step; the required-alert check runs
# regardless, because it needs nothing but grep.
#
# Globals:  PROJECT_DIR (read)
# Returns:  0 when all applicable checks pass, 1 otherwise
test_alert_rules() {
  local alert_file="${PROJECT_DIR}/docs/operations/deployment/prometheus/backup-alerts.yml"
  local alert
  local linted=0
  local -a required_alerts=(
    "StemeDBBackupFailed"
    "StemeDBBackupVerificationFailed"
    "StemeDBWALArchivalLag"
    "StemeDBBackupStale"
  )

  info "Testing Prometheus alert rules syntax..."

  if [[ ! -f "$alert_file" ]]; then
    fail_test "Alert rules file not found"
    return 1
  fi

  # Basic YAML syntax check
  if command -v yamllint &>/dev/null; then
    if ! yamllint -d relaxed "$alert_file" 2>/dev/null; then
      fail_test "Alert rules YAML syntax invalid"
      return 1
    fi
    linted=1
  else
    warn "yamllint not installed, skipping syntax validation"
  fi

  # Check required alerts exist. The name must end at a non-identifier
  # character or end of line, so a longer alert name sharing the same prefix
  # does not count.
  for alert in "${required_alerts[@]}"; do
    if ! grep -Eq -- "alert:[[:space:]]*${alert}([^[:alnum:]_]|\$)" "$alert_file"; then
      fail_test "Missing alert: $alert"
      return 1
    fi
  done

  if ((linted)); then
    success "Alert rules syntax valid and all required alerts present"
  else
    # Do not claim the syntax is valid when it was never checked.
    success "All required alerts present (syntax validation skipped)"
  fi
  return 0
}
|
|
|
|
# Main test execution
#
# Prints a banner, builds the fixture tree, runs the seven tests in a fixed
# order, removes the fixture tree and prints a summary.
#
# Globals:  TESTS_RUN, TESTS_PASSED, TESTS_FAILED, FAILED_TESTS,
#           GREEN, RED, NC (all read)
# Exits:    0 when no test failed, 1 when at least one test failed
main() {
  local failed_name

  echo ""
  echo "=========================================="
  echo " StemeDB Backup & DR Integration Tests"
  echo "=========================================="
  echo ""

  # Guarantee cleanup if the run is cut short (for example a command failing
  # under `set -e`); the trap is cleared again once the regular teardown has
  # run so that cleanup happens exactly once.
  trap teardown EXIT

  setup

  # Run all tests
  run_test "Backup Creation" test_backup_creation
  run_test "Retention Policy" test_retention_policy
  run_test "Backup Verification" test_backup_verification
  run_test "WAL Magic Validation" test_wal_magic_validation
  run_test "Dry Run Mode" test_dry_run
  run_test "Metrics Export" test_metrics_export
  run_test "Alert Rules" test_alert_rules

  teardown
  trap - EXIT

  # Summary
  echo ""
  echo "=========================================="
  echo " Test Summary"
  echo "=========================================="
  echo ""
  echo " Total: $TESTS_RUN"
  echo -e " Passed: ${GREEN}${TESTS_PASSED}${NC}"
  echo -e " Failed: ${RED}${TESTS_FAILED}${NC}"
  echo ""

  if [[ $TESTS_FAILED -gt 0 ]]; then
    echo "Failed tests:"
    for failed_name in "${FAILED_TESTS[@]}"; do
      echo " - $failed_name"
    done
    echo ""
    exit 1
  else
    echo -e "${GREEN}All tests passed!${NC}"
    echo ""
    exit 0
  fi
}
|
|
|
|
# Script entry point: invoke main, forwarding the command-line arguments
# (main itself does not read them).
main "$@"
|