#!/usr/bin/env bash
#
# StemeDB Backup & DR Integration Tests
#
# End-to-end test suite for the P5.3 components:
#   - Backup creation
#   - Retention policy
#   - Backup verification
#   - WAL archival
#   - S3 upload
#   - Metrics export
#   - Alert rules
#
# NOTE(review): no test in this script exercises WAL archival or S3 upload;
# the tests that actually run are the seven listed in main().
#
# Usage:
#   ./uat/production-readiness/backup-dr-tests.sh
#
# Exit codes:
#   0 - All tests passed
#   1 - One or more tests failed
#

set -euo pipefail

# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
readonly PROJECT_DIR
readonly TEST_DATA_DIR="/tmp/stemedb-backup-dr-test"
readonly TEST_WAL_DIR="${TEST_DATA_DIR}/wal"
readonly TEST_DB_DIR="${TEST_DATA_DIR}/db"
readonly TEST_BACKUP_DIR="${TEST_DATA_DIR}/backups"
# Deliberately NOT named METRICS_DIR: bash refuses a per-command
# `METRICS_DIR=... cmd` assignment when METRICS_DIR itself is readonly, which
# would make every backup/verify invocation fail before it even starts.
readonly TEST_METRICS_DIR="${TEST_DATA_DIR}/metrics"

# Colors (escape sequences are expanded by printf's %b at print time)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

# Test results
TESTS_RUN=0
TESTS_PASSED=0
TESTS_FAILED=0
FAILED_TESTS=()

# Logging helpers: print a colored tag followed by the message on stdout.
# The message goes through %s so backslashes in it are printed verbatim.
info()      { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"; }
success()   { printf '%b[PASS]%b %s\n' "$GREEN" "$NC" "$*"; }
fail_test() { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"; }
warn()      { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }

#######################################
# Run backup-stemedb.sh against the test fixtures.
# Arguments: extra flags appended after `--output "$TEST_BACKUP_DIR"`.
# Returns:   exit status of backup-stemedb.sh.
#######################################
run_backup() {
  STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$TEST_METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" "$@"
}

#######################################
# Run verify-backup.sh with metrics redirected to the test metrics dir.
# Arguments: passed through unchanged (the backup directory to verify).
# Returns:   exit status of verify-backup.sh.
#######################################
run_verify() {
  METRICS_DIR="$TEST_METRICS_DIR" \
    "${PROJECT_DIR}/scripts/verify-backup.sh" "$@"
}

#######################################
# Print the number of stemedb-backup-* directories in TEST_BACKUP_DIR.
#######################################
count_backups() {
  find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l
}

#######################################
# Print the path of the most recently modified stemedb-backup-* directory.
# Only backup directories are considered (stray files in the output dir are
# ignored); on equal mtimes the entry that sorts later by name wins.
# Returns: 1 (printing nothing) when no backup directory exists.
#######################################
newest_backup() {
  local candidate newest=""
  for candidate in "$TEST_BACKUP_DIR"/stemedb-backup-*; do
    # Without nullglob an unmatched pattern stays literal; -d filters it out.
    [[ -d "$candidate" ]] || continue
    if [[ -z "$newest" ]] || ! [[ "$candidate" -ot "$newest" ]]; then
      newest="$candidate"
    fi
  done
  [[ -n "$newest" ]] || return 1
  printf '%s\n' "$newest"
}

# Test helpers

#######################################
# Create a clean fixture tree: 10 fake WAL segments (4 "STEM" magic bytes
# followed by 100 KiB of random data) and 5 fake 10 MiB DB files.
#######################################
setup() {
  info "Setting up test environment..."

  # Clean previous test data (":?" aborts rather than expanding to nothing)
  rm -rf -- "${TEST_DATA_DIR:?}"

  # Create test directories
  mkdir -p "$TEST_WAL_DIR" "$TEST_DB_DIR" "$TEST_BACKUP_DIR" "$TEST_METRICS_DIR"

  local i wal_file db_file

  # Create fake WAL files
  for i in {1..10}; do
    printf -v wal_file '%s/segment-%05d.wal' "$TEST_WAL_DIR" "$i"
    # Write STEM magic bytes + some data
    printf '\x53\x54\x45\x4d' > "$wal_file"
    dd if=/dev/urandom bs=1024 count=100 >> "$wal_file" 2>/dev/null
  done

  # Create fake DB files
  for i in {1..5}; do
    printf -v db_file '%s/data-%03d.kv' "$TEST_DB_DIR" "$i"
    dd if=/dev/urandom of="$db_file" bs=1048576 count=10 2>/dev/null
  done

  success "Test environment ready"
}

#######################################
# Remove the whole fixture tree.
#######################################
teardown() {
  info "Cleaning up test environment..."
  rm -rf -- "${TEST_DATA_DIR:?}"
  success "Cleanup complete"
}

#######################################
# Run one test function and record its result.
# Globals:   TESTS_RUN, TESTS_PASSED, TESTS_FAILED, FAILED_TESTS (written)
# Arguments: $1 - human-readable test name
#            $2 - name of the function to call; status 0 means pass
# Note: the function runs as an `if` condition, so `set -e` is suspended
# inside it; test functions must check and return failures explicitly.
#######################################
run_test() {
  local test_name="$1"
  local test_func="$2"

  # Plain assignment instead of ((x++)): the latter returns status 1 when the
  # old value is 0, which aborts the script under `set -e`.
  TESTS_RUN=$((TESTS_RUN + 1))

  echo ""
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "Test $TESTS_RUN: $test_name"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  if "$test_func"; then
    TESTS_PASSED=$((TESTS_PASSED + 1))
    success "$test_name"
  else
    TESTS_FAILED=$((TESTS_FAILED + 1))
    FAILED_TESTS+=("$test_name")
    fail_test "$test_name"
  fi
}

# Test 1: Backup creation
# Expects exactly one backup directory containing backup-metadata.json,
# wal/ with 10 *.wal files and db/ with 5 *.kv files.
test_backup_creation() {
  info "Testing backup creation..."

  run_backup || return 1

  # Verify backup exists
  local backup_count
  backup_count=$(count_backups)
  if [[ $backup_count -ne 1 ]]; then
    fail_test "Expected 1 backup, found $backup_count"
    return 1
  fi

  # Verify backup structure
  local backup_dir
  backup_dir=$(newest_backup) || { fail_test "Expected 1 backup, found 0"; return 1; }
  [[ -f "${backup_dir}/backup-metadata.json" ]] || { fail_test "Missing metadata.json"; return 1; }
  [[ -d "${backup_dir}/wal" ]] || { fail_test "Missing wal/ directory"; return 1; }
  [[ -d "${backup_dir}/db" ]] || { fail_test "Missing db/ directory"; return 1; }

  # Verify file counts
  local wal_count
  wal_count=$(find "${backup_dir}/wal" -name "*.wal" | wc -l)
  if [[ $wal_count -ne 10 ]]; then
    fail_test "Expected 10 WAL files, found $wal_count"
    return 1
  fi

  local db_count
  db_count=$(find "${backup_dir}/db" -name "*.kv" | wc -l)
  if [[ $db_count -ne 5 ]]; then
    fail_test "Expected 5 DB files, found $db_count"
    return 1
  fi

  success "Backup created successfully with correct structure"
  return 0
}

# Test 2: Retention policy
# Creates five more backups, runs a backup with `--keep-last 2d`, and checks
# that at least three backups survive.
test_retention_policy() {
  info "Testing retention policy..."

  # Create 5 backups with different timestamps
  local i
  for i in {1..5}; do
    run_backup >/dev/null || { fail_test "Backup $i of 5 failed"; return 1; }
    sleep 1  # Ensure different timestamps
  done

  # Apply retention: keep the last 2 days. All backups here are only seconds
  # old; the test expects the backup script to retain a minimum of 3.
  run_backup --keep-last 2d || return 1

  # Count remaining backups
  local backup_count
  backup_count=$(count_backups)

  # Should have at least 3 (minimum retention)
  if [[ $backup_count -lt 3 ]]; then
    fail_test "Retention policy too aggressive: only $backup_count backups remain"
    return 1
  fi

  success "Retention policy working correctly (kept $backup_count backups)"
  return 0
}

# Test 3: Backup verification
# Verifies the newest backup and checks that stemedb_backup.prom reports a
# verification status of 1.
test_backup_verification() {
  info "Testing backup verification..."

  # Create a backup
  run_backup >/dev/null || { fail_test "Backup creation failed"; return 1; }

  # Verify it
  local latest
  latest=$(newest_backup) || { fail_test "No backup found to verify"; return 1; }
  run_verify "$latest" || return 1

  # Check metrics were written
  [[ -f "${TEST_METRICS_DIR}/stemedb_backup.prom" ]] || { fail_test "Metrics file not created"; return 1; }

  # Verify metrics content
  if ! grep -q "stemedb_backup_verification_status.*1" "${TEST_METRICS_DIR}/stemedb_backup.prom"; then
    fail_test "Verification status not set to 1 (passed)"
    return 1
  fi

  success "Backup verification passed and metrics written"
  return 0
}

# Test 4: WAL magic byte detection
# Overwrites the first WAL file of the newest backup with four zero bytes and
# expects verify-backup.sh to fail and the metrics to report status 0.
test_wal_magic_validation() {
  info "Testing WAL magic byte validation..."

  # Create the backup that will be corrupted
  run_backup >/dev/null || { fail_test "Backup creation failed"; return 1; }

  local backup_dir
  backup_dir=$(newest_backup) || { fail_test "No backup found to corrupt"; return 1; }

  # Corrupt first WAL file (wrong magic bytes)
  local -a wal_files
  wal_files=("${backup_dir}"/wal/*)
  # An unmatched glob stays literal, so -e also catches an empty wal/ directory.
  [[ -e "${wal_files[0]}" ]] || { fail_test "Backup contains no WAL files to corrupt"; return 1; }
  printf '\x00\x00\x00\x00' > "${wal_files[0]}"

  # Verification should fail
  if run_verify "$backup_dir" 2>/dev/null; then
    fail_test "Verification should have failed for corrupted WAL"
    return 1
  fi

  # Check metrics show failure
  if ! grep -q "stemedb_backup_verification_status.*0" "${TEST_METRICS_DIR}/stemedb_backup.prom"; then
    fail_test "Verification status not set to 0 (failed)"
    return 1
  fi

  success "WAL corruption detected correctly"
  return 0
}

# Test 5: Dry run mode
# A `--dry-run` backup must leave the number of backup directories unchanged.
test_dry_run() {
  info "Testing dry run mode..."

  local backup_count_before
  backup_count_before=$(count_backups)

  # Run backup in dry-run mode
  run_backup --dry-run || return 1

  local backup_count_after
  backup_count_after=$(count_backups)

  if [[ $backup_count_before -ne $backup_count_after ]]; then
    fail_test "Dry run created a backup (should not have)"
    return 1
  fi

  success "Dry run mode working correctly (no backup created)"
  return 0
}

# Test 6: Metrics export
# After a backup, stemedb_backup.prom must contain a line starting with each
# required metric name.
test_metrics_export() {
  info "Testing metrics export..."

  # Create backup with metrics
  run_backup || return 1

  # Verify metrics file exists
  [[ -f "${TEST_METRICS_DIR}/stemedb_backup.prom" ]] || { fail_test "Metrics file not created"; return 1; }

  # Verify required metrics present
  local -a required_metrics=(
    "stemedb_backup_last_success_timestamp"
    "stemedb_backup_age_seconds"
    "stemedb_backup_size_bytes"
    "stemedb_backup_wal_files"
    "stemedb_backup_db_files"
  )

  local metric
  for metric in "${required_metrics[@]}"; do
    if ! grep -q "^${metric}" "${TEST_METRICS_DIR}/stemedb_backup.prom"; then
      fail_test "Missing metric: $metric"
      return 1
    fi
  done

  success "All required metrics exported correctly"
  return 0
}

# Test 7: Alert rules syntax
# Lints backup-alerts.yml with yamllint when it is installed, and always
# checks that every required alert name is present.
test_alert_rules() {
  info "Testing Prometheus alert rules syntax..."

  local alert_file="${PROJECT_DIR}/docs/operations/deployment/prometheus/backup-alerts.yml"

  [[ -f "$alert_file" ]] || { fail_test "Alert rules file not found"; return 1; }

  # Basic YAML syntax check. A missing yamllint skips only the lint step; the
  # required-alert checks below need nothing but grep and must still run.
  local syntax_checked=false
  if command -v yamllint &>/dev/null; then
    if ! yamllint -d relaxed "$alert_file" 2>/dev/null; then
      fail_test "Alert rules YAML syntax invalid"
      return 1
    fi
    syntax_checked=true
  else
    warn "yamllint not installed, skipping syntax validation"
  fi

  # Check required alerts exist
  local -a required_alerts=(
    "StemeDBBackupFailed"
    "StemeDBBackupVerificationFailed"
    "StemeDBWALArchivalLag"
    "StemeDBBackupStale"
  )

  local alert
  for alert in "${required_alerts[@]}"; do
    if ! grep -qF -- "alert: $alert" "$alert_file"; then
      fail_test "Missing alert: $alert"
      return 1
    fi
  done

  if [[ "$syntax_checked" == true ]]; then
    success "Alert rules syntax valid and all required alerts present"
  else
    success "All required alerts present (syntax validation skipped)"
  fi
  return 0
}

# Main test execution: set up fixtures, run every test, clean up, print a
# summary, and exit 1 if any test failed (0 otherwise).
main() {
  echo ""
  echo "=========================================="
  echo " StemeDB Backup & DR Integration Tests"
  echo "=========================================="
  echo ""

  # Safety net: if the script aborts before teardown (e.g. setup fails under
  # `set -e`), do not leave the fixture tree behind in /tmp.
  trap 'rm -rf -- "${TEST_DATA_DIR:?}"' EXIT

  setup

  # Run all tests
  run_test "Backup Creation" test_backup_creation
  run_test "Retention Policy" test_retention_policy
  run_test "Backup Verification" test_backup_verification
  run_test "WAL Magic Validation" test_wal_magic_validation
  run_test "Dry Run Mode" test_dry_run
  run_test "Metrics Export" test_metrics_export
  run_test "Alert Rules" test_alert_rules

  teardown

  # Summary
  echo ""
  echo "=========================================="
  echo " Test Summary"
  echo "=========================================="
  echo ""
  echo " Total: $TESTS_RUN"
  printf ' Passed: %b%s%b\n' "$GREEN" "$TESTS_PASSED" "$NC"
  printf ' Failed: %b%s%b\n' "$RED" "$TESTS_FAILED" "$NC"
  echo ""

  if [[ $TESTS_FAILED -gt 0 ]]; then
    echo "Failed tests:"
    local failed_name
    for failed_name in "${FAILED_TESTS[@]}"; do
      echo " - $failed_name"
    done
    echo ""
    exit 1
  else
    printf '%bAll tests passed!%b\n' "$GREEN" "$NC"
    echo ""
    exit 0
  fi
}

main "$@"