stemedb/uat/production-readiness/backup-dr-tests.sh

#!/usr/bin/env bash
#
# StemeDB Backup & DR Integration Tests
#
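# End-to-end test suite for the P5.3 backup & DR components. Covers:
# - Backup creation
# - Retention policy
# - Backup verification (including WAL magic-byte corruption detection)
# - Dry-run mode
# - Metrics export
# - Alert rules (including the WAL archival lag alert)
#
# WAL archival and S3 upload themselves are not exercised by this script.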
#
# Usage:
#   ./uat/production-readiness/backup-dr-tests.sh
#
# Exit codes:
#   0 - All tests passed
#   1 - One or more tests failed
#

set -euo pipefail

# Configuration
# The two resolved paths are assigned first and frozen afterwards:
# `readonly VAR="$(cmd)"` reports the status of `readonly`, which would hide a
# failing `cd` from `set -e`.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
readonly SCRIPT_DIR PROJECT_DIR
readonly TEST_DATA_DIR="/tmp/stemedb-backup-dr-test"
readonly TEST_WAL_DIR="${TEST_DATA_DIR}/wal"
readonly TEST_DB_DIR="${TEST_DATA_DIR}/db"
readonly TEST_BACKUP_DIR="${TEST_DATA_DIR}/backups"
# Deliberately NOT readonly: bash rejects a `METRICS_DIR=... command` prefix
# assignment when the variable is readonly in the calling shell, so a readonly
# METRICS_DIR could not be handed to child scripts that way.
METRICS_DIR="${TEST_DATA_DIR}/metrics"

# Colors (ANSI escape sequences, expanded when printed)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Test results (updated as tests run)
TESTS_RUN=0
TESTS_PASSED=0
TESTS_FAILED=0
FAILED_TESTS=()

# Logging
# Each helper prints a colored tag followed by its arguments joined by spaces.
# printf is used instead of `echo -e` so that only the color constants are
# escape-expanded (%b); the message itself (%s) is printed verbatim, even if
# it contains backslashes.
info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"; }
success() { printf '%b[PASS]%b %s\n' "$GREEN" "$NC" "$*"; }
fail_test() { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }

# Test helpers
# Build a fresh fixture tree for the test run.
#
# Globals:   TEST_DATA_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#            METRICS_DIR (read)
# Outputs:   progress messages via info/success
#
# Creates 10 WAL segments (the 4 magic bytes "STEM" followed by 100 KiB of
# random data) and 5 DB files holding 10 MiB of random data each.
setup() {
    local i wal_file db_file

    info "Setting up test environment..."

    # Clean previous test data; ${VAR:?} aborts instead of expanding to an
    # empty path should the variable ever be unset or empty.
    rm -rf -- "${TEST_DATA_DIR:?}"

    # Create test directories
    mkdir -p -- "$TEST_WAL_DIR" "$TEST_DB_DIR" "$TEST_BACKUP_DIR" "$METRICS_DIR"

    # Create fake WAL files
    for i in {1..10}; do
        printf -v wal_file '%s/segment-%05d.wal' "$TEST_WAL_DIR" "$i"
        # Write STEM magic bytes + some data
        printf '\x53\x54\x45\x4d' > "$wal_file"
        # Block sizes are spelled out in bytes: the K/M suffixes are not
        # accepted by every dd implementation.
        dd if=/dev/urandom bs=1024 count=100 >> "$wal_file" 2>/dev/null
    done

    # Create fake DB files
    for i in {1..5}; do
        printf -v db_file '%s/data-%03d.kv' "$TEST_DB_DIR" "$i"
        dd if=/dev/urandom of="$db_file" bs=1048576 count=10 2>/dev/null
    done

    success "Test environment ready"
}

# Remove the whole fixture tree rooted at TEST_DATA_DIR and report progress
# through info/success.
teardown() {
    info "Cleaning up test environment..."

    # -f keeps this silent when the tree has already been removed
    rm -r -f "$TEST_DATA_DIR"

    success "Cleanup complete"
}

# Run a single test function and record its result.
#
# Globals:   TESTS_RUN, TESTS_PASSED, TESTS_FAILED, FAILED_TESTS (updated)
# Arguments: $1 - human-readable test name shown in the report
#            $2 - name of the function (or command) that implements the test
# Outputs:   a banner for the test, then a PASS/FAIL line via
#            success/fail_test
#
# The test's own status is recorded in the counters rather than returned.
# Because the test runs as an `if` condition, bash suspends `set -e` inside
# it, so a test function must signal failure through its return status.
run_test() {
    local test_name="$1"
    local test_func="$2"

    # Counters use plain arithmetic assignment: `((VAR++))` evaluates to the
    # old value, so on the first increment (0) it returns status 1 and would
    # abort the whole script under `set -e`.
    TESTS_RUN=$((TESTS_RUN + 1))
    echo ""
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo "Test $TESTS_RUN: $test_name"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

    if "$test_func"; then
        TESTS_PASSED=$((TESTS_PASSED + 1))
        success "$test_name"
    else
        TESTS_FAILED=$((TESTS_FAILED + 1))
        FAILED_TESTS+=("$test_name")
        fail_test "$test_name"
    fi
}

# Test 1: Backup creation
# Runs the backup script once and checks the result: exactly one
# stemedb-backup-* directory under TEST_BACKUP_DIR, containing
# backup-metadata.json, wal/ with 10 *.wal files and db/ with 5 *.kv files.
#
# Globals: TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR, METRICS_DIR,
#          PROJECT_DIR (read)
# Returns: 0 when every check holds, 1 otherwise
test_backup_creation() {
    local backup_count backup_dir wal_count db_count

    info "Testing backup creation..."

    # Settings are handed over with env(1) rather than `VAR=value cmd`
    # prefixes: bash rejects a prefix assignment to a variable that is
    # readonly in the calling shell, so a readonly METRICS_DIR would never
    # reach the script that way.
    env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
        STEMEDB_DB_DIR="$TEST_DB_DIR" \
        METRICS_DIR="$METRICS_DIR" \
        "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" || return 1

    # Verify backup exists
    backup_count=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

    if [[ $backup_count -ne 1 ]]; then
        fail_test "Expected 1 backup, found $backup_count"
        return 1
    fi

    # Verify backup structure
    backup_dir=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | head -n1)

    [[ -f "${backup_dir}/backup-metadata.json" ]] || { fail_test "Missing backup-metadata.json"; return 1; }
    [[ -d "${backup_dir}/wal" ]] || { fail_test "Missing wal/ directory"; return 1; }
    [[ -d "${backup_dir}/db" ]] || { fail_test "Missing db/ directory"; return 1; }

    # Verify file counts (they mirror the fixture sizes: 10 WAL, 5 DB files)
    wal_count=$(find "${backup_dir}/wal" -name "*.wal" | wc -l)
    if [[ $wal_count -ne 10 ]]; then
        fail_test "Expected 10 WAL files, found $wal_count"
        return 1
    fi

    db_count=$(find "${backup_dir}/db" -name "*.kv" | wc -l)
    if [[ $db_count -ne 5 ]]; then
        fail_test "Expected 5 DB files, found $db_count"
        return 1
    fi

    success "Backup created successfully with correct structure"
    return 0
}

# Test 2: Retention policy
# Creates five more backups, then runs the backup script with
# `--keep-last 2d` and checks that at least 3 backups are left.
#
# Globals: TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR, METRICS_DIR,
#          PROJECT_DIR (read)
# Returns: 0 when at least 3 backups remain, 1 on any failure
test_retention_policy() {
    local i backup_count

    info "Testing retention policy..."

    # Create 5 backups with different timestamps.
    # Failures are checked explicitly: this function runs as an `if`
    # condition, where `set -e` is suspended, so an unchecked failure would
    # go unnoticed. env(1) is used because bash rejects a `VAR=value cmd`
    # prefix for a variable that is readonly in the calling shell.
    for i in {1..5}; do
        env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
            STEMEDB_DB_DIR="$TEST_DB_DIR" \
            METRICS_DIR="$METRICS_DIR" \
            "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" >/dev/null \
            || { fail_test "Backup $i of 5 could not be created"; return 1; }

        sleep 1  # Ensure different timestamps
    done

    # Apply retention with a 2-day window.
    # NOTE(review): the check below presumes the backup script never goes
    # below a minimum of 3 backups — confirm against backup-stemedb.sh.
    env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
        STEMEDB_DB_DIR="$TEST_DB_DIR" \
        METRICS_DIR="$METRICS_DIR" \
        "${PROJECT_DIR}/scripts/backup-stemedb.sh" \
        --output "$TEST_BACKUP_DIR" \
        --keep-last 2d || return 1

    # Count remaining backups
    backup_count=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

    # Should have at least 3 (minimum retention)
    if [[ $backup_count -lt 3 ]]; then
        fail_test "Retention policy too aggressive: only $backup_count backups remain"
        return 1
    fi

    success "Retention policy working correctly (kept $backup_count backups)"
    return 0
}

# Test 3: Backup verification
# Creates a backup, runs verify-backup.sh on the most recently modified
# stemedb-backup-* directory, and checks that the metrics file reports a
# passed verification.
#
# Globals: TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR, METRICS_DIR,
#          PROJECT_DIR (read)
# Returns: 0 when verification passes and the metric is 1, 1 otherwise
test_backup_verification() {
    local metrics_file="${METRICS_DIR}/stemedb_backup.prom"
    local candidate
    local newest_backup=""

    info "Testing backup verification..."

    # Create a backup. The failure is checked explicitly because `set -e` is
    # suspended while this function runs as an `if` condition. env(1) is used
    # because bash rejects a `VAR=value cmd` prefix for a variable that is
    # readonly in the calling shell.
    env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
        STEMEDB_DB_DIR="$TEST_DB_DIR" \
        METRICS_DIR="$METRICS_DIR" \
        "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" >/dev/null \
        || { fail_test "Backup creation failed"; return 1; }

    # Pick the most recently modified backup directory without parsing `ls`,
    # and without picking up unrelated entries in the backup root.
    for candidate in "$TEST_BACKUP_DIR"/stemedb-backup-*; do
        [[ -d "$candidate" ]] || continue
        if [[ -z "$newest_backup" || "$candidate" -nt "$newest_backup" ]]; then
            newest_backup="$candidate"
        fi
    done
    [[ -n "$newest_backup" ]] || { fail_test "No backup directory found to verify"; return 1; }

    # Verify it
    env METRICS_DIR="$METRICS_DIR" \
        "${PROJECT_DIR}/scripts/verify-backup.sh" "$newest_backup" || return 1

    # Check metrics were written
    [[ -f "$metrics_file" ]] || { fail_test "Metrics file not created"; return 1; }

    # Verify metrics content. The pattern is anchored to the sample line
    # (metric name, optional label set, value) so that "# HELP" text or digits
    # inside labels cannot satisfy the check.
    # NOTE(review): assumes the value is written as 1 or 1.0 — confirm against
    # verify-backup.sh.
    if ! grep -Eq '^stemedb_backup_verification_status(\{[^}]*\})?[[:space:]]+1(\.0+)?([[:space:]]|$)' "$metrics_file"; then
        fail_test "Verification status not set to 1 (passed)"
        return 1
    fi

    success "Backup verification passed and metrics written"
    return 0
}

# Test 4: WAL magic byte detection
# Creates a backup, overwrites the first WAL file of a backup with four zero
# bytes, and checks that verify-backup.sh fails and reports status 0 in the
# metrics file.
#
# Globals: TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR, METRICS_DIR,
#          PROJECT_DIR (read)
# Returns: 0 when the corruption is detected and reported, 1 otherwise
test_wal_magic_validation() {
    local metrics_file="${METRICS_DIR}/stemedb_backup.prom"
    local candidate
    local backup_dir=""
    local -a wal_files

    info "Testing WAL magic byte validation..."

    # Create the backup that will be corrupted. The failure is checked
    # explicitly because `set -e` is suspended while this function runs as an
    # `if` condition. env(1) is used because bash rejects a `VAR=value cmd`
    # prefix for a variable that is readonly in the calling shell.
    env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
        STEMEDB_DB_DIR="$TEST_DB_DIR" \
        METRICS_DIR="$METRICS_DIR" \
        "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" >/dev/null \
        || { fail_test "Backup creation failed"; return 1; }

    # Take the first backup directory in glob (sorted) order.
    for candidate in "$TEST_BACKUP_DIR"/stemedb-backup-*; do
        if [[ -d "$candidate" ]]; then
            backup_dir="$candidate"
            break
        fi
    done
    [[ -n "$backup_dir" ]] || { fail_test "No backup directory found to corrupt"; return 1; }

    # Corrupt first WAL file (wrong magic bytes); the redirect truncates the
    # file to just these four bytes.
    wal_files=("${backup_dir}/wal/"*)
    [[ -f "${wal_files[0]}" ]] || { fail_test "Backup contains no WAL file to corrupt"; return 1; }
    printf '\x00\x00\x00\x00' > "${wal_files[0]}"

    # Verification should fail
    if env METRICS_DIR="$METRICS_DIR" "${PROJECT_DIR}/scripts/verify-backup.sh" "$backup_dir" 2>/dev/null; then
        fail_test "Verification should have failed for corrupted WAL"
        return 1
    fi

    # Check metrics show failure. The pattern is anchored to the sample line
    # (metric name, optional label set, value) so that digits inside labels or
    # comment lines cannot satisfy the check.
    # NOTE(review): assumes the value is written as 0 or 0.0 — confirm against
    # verify-backup.sh.
    if ! grep -Eq '^stemedb_backup_verification_status(\{[^}]*\})?[[:space:]]+0(\.0+)?([[:space:]]|$)' "$metrics_file"; then
        fail_test "Verification status not set to 0 (failed)"
        return 1
    fi

    success "WAL corruption detected correctly"
    return 0
}

# Test 5: Dry run mode
# Runs the backup script with --dry-run and checks that the number of
# stemedb-backup-* directories under TEST_BACKUP_DIR does not change.
#
# Globals: TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR, METRICS_DIR,
#          PROJECT_DIR (read)
# Returns: 0 when no backup was created, 1 otherwise
test_dry_run() {
    local backup_count_before backup_count_after

    info "Testing dry run mode..."

    backup_count_before=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

    # Run backup in dry-run mode. Settings are handed over with env(1) rather
    # than `VAR=value cmd` prefixes: bash rejects a prefix assignment to a
    # variable that is readonly in the calling shell, so a readonly
    # METRICS_DIR would never reach the script that way.
    env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
        STEMEDB_DB_DIR="$TEST_DB_DIR" \
        METRICS_DIR="$METRICS_DIR" \
        "${PROJECT_DIR}/scripts/backup-stemedb.sh" \
        --output "$TEST_BACKUP_DIR" \
        --dry-run || return 1

    backup_count_after=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

    if [[ $backup_count_before -ne $backup_count_after ]]; then
        fail_test "Dry run created a backup (should not have)"
        return 1
    fi

    success "Dry run mode working correctly (no backup created)"
    return 0
}

# Test 6: Metrics export
# Runs the backup script and checks that ${METRICS_DIR}/stemedb_backup.prom
# exists and contains a sample line for every required metric.
#
# Globals: TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR, METRICS_DIR,
#          PROJECT_DIR (read)
# Returns: 0 when all metrics are present, 1 otherwise
test_metrics_export() {
    local metrics_file="${METRICS_DIR}/stemedb_backup.prom"
    local metric
    local required_metrics=(
        "stemedb_backup_last_success_timestamp"
        "stemedb_backup_age_seconds"
        "stemedb_backup_size_bytes"
        "stemedb_backup_wal_files"
        "stemedb_backup_db_files"
    )

    info "Testing metrics export..."

    # Create backup with metrics. Settings are handed over with env(1) rather
    # than `VAR=value cmd` prefixes: bash rejects a prefix assignment to a
    # variable that is readonly in the calling shell, so a readonly
    # METRICS_DIR would never reach the script that way.
    env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
        STEMEDB_DB_DIR="$TEST_DB_DIR" \
        METRICS_DIR="$METRICS_DIR" \
        "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" || return 1

    # Verify metrics file exists
    [[ -f "$metrics_file" ]] || { fail_test "Metrics file not created"; return 1; }

    # Verify required metrics present. The name must be followed by a label
    # set, whitespace or end of line, so a longer metric name that merely
    # starts with the required one does not count.
    for metric in "${required_metrics[@]}"; do
        if ! grep -Eq "^${metric}([{[:space:]]|\$)" "$metrics_file"; then
            fail_test "Missing metric: $metric"
            return 1
        fi
    done

    success "All required metrics exported correctly"
    return 0
}

# Test 7: Alert rules syntax
# Checks the Prometheus alert rules file: it must exist, pass
# `yamllint -d relaxed` when yamllint is available, and contain an
# "alert: <name>" entry for every required alert.
#
# Globals: PROJECT_DIR (read)
# Returns: 0 when all checks that could be run pass, 1 otherwise
test_alert_rules() {
    local alert_file="${PROJECT_DIR}/docs/operations/deployment/prometheus/backup-alerts.yml"
    local alert
    local lint_ran=0
    local required_alerts=(
        "StemeDBBackupFailed"
        "StemeDBBackupVerificationFailed"
        "StemeDBWALArchivalLag"
        "StemeDBBackupStale"
    )

    info "Testing Prometheus alert rules syntax..."

    [[ -f "$alert_file" ]] || { fail_test "Alert rules file not found"; return 1; }

    # Basic YAML syntax check. Only this step depends on yamllint; when the
    # tool is missing the lint is skipped, but the required-alert checks
    # below still run.
    if command -v yamllint &>/dev/null; then
        if ! yamllint -d relaxed "$alert_file" 2>/dev/null; then
            fail_test "Alert rules YAML syntax invalid"
            return 1
        fi
        lint_ran=1
    else
        warn "yamllint not installed, skipping syntax validation"
    fi

    # Check required alerts exist
    for alert in "${required_alerts[@]}"; do
        if ! grep -q "alert: $alert" "$alert_file"; then
            fail_test "Missing alert: $alert"
            return 1
        fi
    done

    if (( lint_ran )); then
        success "Alert rules syntax valid and all required alerts present"
    else
        success "All required alerts present (YAML syntax check skipped)"
    fi
    return 0
}

# Main test execution
# Run the whole suite: set up fixtures, execute every test through run_test,
# clean up, print a summary, and exit.
#
# Globals: TESTS_RUN, TESTS_PASSED, TESTS_FAILED, FAILED_TESTS (read)
# Exits:   0 when no test failed, 1 when at least one test failed
main() {
    local failed_name

    echo ""
    echo "=========================================="
    echo "  StemeDB Backup & DR Integration Tests"
    echo "=========================================="
    echo ""

    # Make sure the fixture tree is removed even if the run aborts early
    # (for example through `set -e` or a signal).
    trap teardown EXIT

    setup

    # Run all tests
    run_test "Backup Creation" test_backup_creation
    run_test "Retention Policy" test_retention_policy
    run_test "Backup Verification" test_backup_verification
    run_test "WAL Magic Validation" test_wal_magic_validation
    run_test "Dry Run Mode" test_dry_run
    run_test "Metrics Export" test_metrics_export
    run_test "Alert Rules" test_alert_rules

    # Normal path: clean up now, then disarm the trap so teardown does not
    # run a second time on exit.
    teardown
    trap - EXIT

    # Summary
    echo ""
    echo "=========================================="
    echo "  Test Summary"
    echo "=========================================="
    echo ""
    echo "  Total:  $TESTS_RUN"
    echo -e "  Passed: ${GREEN}${TESTS_PASSED}${NC}"
    echo -e "  Failed: ${RED}${TESTS_FAILED}${NC}"
    echo ""

    if [[ $TESTS_FAILED -gt 0 ]]; then
        echo "Failed tests:"
        for failed_name in "${FAILED_TESTS[@]}"; do
            echo "  - $failed_name"
        done
        echo ""
        exit 1
    else
        echo -e "${GREEN}All tests passed!${NC}"
        echo ""
        exit 0
    fi
}

# Entry point: run the suite, passing along any command-line arguments.
main "$@"