stemedb/uat/production-readiness/backup-dr-tests.sh
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

388 lines
11 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# StemeDB Backup & DR Integration Tests
#
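# End-to-end test suite for the P5.3 backup & DR components. Tests in this file:
# - Backup creation
# - Retention policy
# - Backup verification
# - WAL magic-byte validation
# - Dry-run mode
# - Metrics export
# - Alert rules
#
# NOTE: WAL archival and S3 upload are also P5.3 components, but no test in
# this script exercises them yet.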
#
# Usage:
# ./uat/production-readiness/backup-dr-tests.sh
#
# Exit codes:
# 0 - All tests passed
# 1 - One or more tests failed
#
set -euo pipefail

# Configuration
#
# Values are assigned first and frozen afterwards. `readonly X="$(cmd)"` would
# report the status of `readonly` itself, hiding a failing `cd`/`dirname` and
# leaving a wrong path behind; with a plain assignment `set -e` sees the
# failure and stops the script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# The project root is two directory levels above this script.
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
# Scratch area for fixtures; setup() and teardown() remove it recursively.
TEST_DATA_DIR="/tmp/stemedb-backup-dr-test"
TEST_WAL_DIR="${TEST_DATA_DIR}/wal"
TEST_DB_DIR="${TEST_DATA_DIR}/db"
TEST_BACKUP_DIR="${TEST_DATA_DIR}/backups"
METRICS_DIR="${TEST_DATA_DIR}/metrics"
readonly SCRIPT_DIR PROJECT_DIR TEST_DATA_DIR
readonly TEST_WAL_DIR TEST_DB_DIR TEST_BACKUP_DIR METRICS_DIR
# ANSI color sequences. They are stored as literal backslash escapes and only
# turned into control characters where they are printed with `echo -e`.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'

# Suite bookkeeping: names of failed tests plus run/pass/fail counters.
declare -a FAILED_TESTS=()
TESTS_RUN=0 TESTS_PASSED=0 TESTS_FAILED=0
# Logging
#
# Shared formatter: prints "<color>[TAG]<reset> message" on stdout.
#   $1 - color escape sequence, $2 - tag text, remaining args - message words
_emit_tagged() {
  local color="$1" tag="$2"
  shift 2
  echo -e "${color}[${tag}]${NC} $*"
}

# One helper per severity; each forwards its arguments as the message.
info() { _emit_tagged "$BLUE" "INFO" "$@"; }
success() { _emit_tagged "$GREEN" "PASS" "$@"; }
fail_test() { _emit_tagged "$RED" "FAIL" "$@"; }
warn() { _emit_tagged "$YELLOW" "WARN" "$@"; }
# Test helpers

# Build a fresh fixture tree: wipe TEST_DATA_DIR, recreate the WAL, DB, backup
# and metrics directories, then fill the WAL and DB directories with files of
# random content.
setup() {
  local seq wal_path db_path

  info "Setting up test environment..."

  # Start from a clean slate, then lay out the directory tree.
  rm -rf "$TEST_DATA_DIR"
  mkdir -p "$TEST_WAL_DIR" "$TEST_DB_DIR" "$TEST_BACKUP_DIR" "$METRICS_DIR"

  # Ten fake WAL segments: the 4 bytes "STEM" followed by 100 KiB of noise.
  for ((seq = 1; seq <= 10; seq++)); do
    printf -v wal_path '%s/segment-%05d.wal' "$TEST_WAL_DIR" "$seq"
    printf '\x53\x54\x45\x4d' > "$wal_path"
    dd if=/dev/urandom bs=1K count=100 >> "$wal_path" 2>/dev/null
  done

  # Five fake DB files of 10 MiB each.
  for ((seq = 1; seq <= 5; seq++)); do
    printf -v db_path '%s/data-%03d.kv' "$TEST_DB_DIR" "$seq"
    dd if=/dev/urandom of="$db_path" bs=1M count=10 2>/dev/null
  done

  success "Test environment ready"
}
# Remove the whole fixture tree created by setup().
teardown() {
  info "Cleaning up test environment..."

  local scratch_root="$TEST_DATA_DIR"
  rm -rf "$scratch_root"

  success "Cleanup complete"
}
# Run one test function and record its outcome.
#
# Arguments: $1 - human-readable test name (printed, and stored on failure)
#            $2 - function to run; its exit status decides pass/fail
# Globals:   TESTS_RUN, TESTS_PASSED, TESTS_FAILED, FAILED_TESTS (updated)
# Outputs:   a banner for the test followed by a [PASS] or [FAIL] line
#
# The test function runs as the condition of an `if`, so `set -e` is not in
# effect inside it: test functions must check command failures themselves.
run_test() {
  local test_name="$1"
  local test_func="$2"

  # Counters use plain arithmetic assignment rather than ((VAR++)): the
  # post-increment form evaluates to the old value, so incrementing from 0
  # returns status 1 and `set -e` would abort the script on the first test.
  TESTS_RUN=$((TESTS_RUN + 1))

  echo ""
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "Test $TESTS_RUN: $test_name"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # Left unquoted on purpose so existing callers keep the original word
  # splitting of $2.
  if $test_func; then
    TESTS_PASSED=$((TESTS_PASSED + 1))
    success "$test_name"
  else
    TESTS_FAILED=$((TESTS_FAILED + 1))
    FAILED_TESTS+=("$test_name")
    fail_test "$test_name"
  fi
}
# Test 1: Backup creation
#
# Runs scripts/backup-stemedb.sh with --output pointing at TEST_BACKUP_DIR and
# checks the result: exactly one stemedb-backup-* directory containing
# backup-metadata.json, a wal/ directory with 10 *.wal files and a db/
# directory with 5 *.kv files.
#
# Globals: PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#          METRICS_DIR (read)
# Returns: 0 when every check passes, 1 otherwise
test_backup_creation() {
  info "Testing backup creation..."

  # Settings reach the child through env(1) rather than a `VAR=value cmd`
  # prefix: METRICS_DIR is declared readonly in this file, and bash rejects
  # prefix assignments to readonly variables.
  env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" || return 1

  # Verify backup exists
  local backup_count
  backup_count=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)
  if [[ $backup_count -ne 1 ]]; then
    fail_test "Expected 1 backup, found $backup_count"
    return 1
  fi

  # Verify backup structure
  local backup_dir
  backup_dir=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | head -n1)
  [[ -f "${backup_dir}/backup-metadata.json" ]] || { fail_test "Missing metadata.json"; return 1; }
  [[ -d "${backup_dir}/wal" ]] || { fail_test "Missing wal/ directory"; return 1; }
  [[ -d "${backup_dir}/db" ]] || { fail_test "Missing db/ directory"; return 1; }

  # Verify file counts against what setup() generated
  local wal_count
  wal_count=$(find "${backup_dir}/wal" -name "*.wal" | wc -l)
  if [[ $wal_count -ne 10 ]]; then
    fail_test "Expected 10 WAL files, found $wal_count"
    return 1
  fi

  local db_count
  db_count=$(find "${backup_dir}/db" -name "*.kv" | wc -l)
  if [[ $db_count -ne 5 ]]; then
    fail_test "Expected 5 DB files, found $db_count"
    return 1
  fi

  success "Backup created successfully with correct structure"
  return 0
}
# Test 2: Retention policy
#
# Creates five backups one second apart, runs backup-stemedb.sh once more with
# `--keep-last 2d`, then requires that at least 3 stemedb-backup-* directories
# remain in TEST_BACKUP_DIR.
#
# Globals: PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#          METRICS_DIR (read)
# Returns: 0 when at least 3 backups remain, 1 otherwise
test_retention_policy() {
  info "Testing retention policy..."

  # Create 5 backups with different timestamps.
  # Each run is checked explicitly: this function executes as an `if`
  # condition in run_test, where `set -e` does not stop on failures.
  # Settings go through env(1) because METRICS_DIR is readonly in this file
  # and bash rejects `VAR=value cmd` prefixes on readonly variables.
  local i
  for i in {1..5}; do
    env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
      STEMEDB_DB_DIR="$TEST_DB_DIR" \
      METRICS_DIR="$METRICS_DIR" \
      "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" >/dev/null \
      || { fail_test "Backup $i of 5 failed while preparing retention test"; return 1; }
    sleep 1 # Ensure different timestamps
  done

  # Apply retention with a 2-day window.
  # NOTE(review): the flag is a time window (2d), not a count; the ">= 3"
  # assertion below presumably reflects a minimum-retention rule inside
  # backup-stemedb.sh - confirm against that script.
  env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" \
    --output "$TEST_BACKUP_DIR" \
    --keep-last 2d || return 1

  # Count remaining backups
  local backup_count
  backup_count=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

  # Should have at least 3 (minimum retention)
  if [[ $backup_count -lt 3 ]]; then
    fail_test "Retention policy too aggressive: only $backup_count backups remain"
    return 1
  fi

  success "Retention policy working correctly (kept $backup_count backups)"
  return 0
}
# Test 3: Backup verification
#
# Creates a backup, runs scripts/verify-backup.sh on the most recently
# modified stemedb-backup-* directory, and checks that stemedb_backup.prom in
# METRICS_DIR reports stemedb_backup_verification_status as 1.
#
# Globals: PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#          METRICS_DIR (read)
# Returns: 0 when verification passes and the metric is 1, 1 otherwise
test_backup_verification() {
  info "Testing backup verification..."

  # Create a backup. The result is checked explicitly because `set -e` is
  # inactive while run_test evaluates this function as an `if` condition;
  # otherwise a failed backup would let an older backup be verified instead.
  # Settings go through env(1) because METRICS_DIR is readonly in this file
  # and bash rejects `VAR=value cmd` prefixes on readonly variables.
  env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" >/dev/null \
    || { fail_test "Backup creation failed"; return 1; }

  # Pick the newest backup directory by modification time without parsing
  # `ls`. On equal timestamps the later name in glob order wins.
  local newest_backup="" candidate
  for candidate in "$TEST_BACKUP_DIR"/stemedb-backup-*; do
    [[ -d "$candidate" ]] || continue
    if [[ -z "$newest_backup" || ! "$newest_backup" -nt "$candidate" ]]; then
      newest_backup="$candidate"
    fi
  done
  [[ -n "$newest_backup" ]] || { fail_test "No backup directory found to verify"; return 1; }

  # Verify it
  env METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/verify-backup.sh" "$newest_backup" || return 1

  # Check metrics were written
  [[ -f "${METRICS_DIR}/stemedb_backup.prom" ]] || { fail_test "Metrics file not created"; return 1; }

  # Verify metrics content. The pattern is anchored to the sample line
  # (optional {labels}, then the value 1) so that comment lines or other
  # metrics sharing the name prefix cannot satisfy the check.
  if ! grep -Eq '^stemedb_backup_verification_status(\{[^}]*\})?[[:space:]]+1(\.0+)?([[:space:]]|$)' \
    "${METRICS_DIR}/stemedb_backup.prom"; then
    fail_test "Verification status not set to 1 (passed)"
    return 1
  fi

  success "Backup verification passed and metrics written"
  return 0
}
# Test 4: WAL magic byte detection
#
# Creates a backup, overwrites the first WAL file of a stemedb-backup-*
# directory with four zero bytes, then expects scripts/verify-backup.sh to
# fail and stemedb_backup.prom to report stemedb_backup_verification_status
# as 0.
#
# Globals: PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#          METRICS_DIR (read)
# Returns: 0 when the corruption is detected and reported, 1 otherwise
test_wal_magic_validation() {
  info "Testing WAL magic byte validation..."

  # Create the backup that will be corrupted. Checked explicitly because
  # `set -e` is inactive while run_test evaluates this function as an `if`
  # condition. Settings go through env(1) because METRICS_DIR is readonly in
  # this file and bash rejects `VAR=value cmd` prefixes on readonly variables.
  env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" >/dev/null \
    || { fail_test "Backup creation failed"; return 1; }

  # First backup directory in glob order.
  local backup_dir="" candidate
  for candidate in "$TEST_BACKUP_DIR"/stemedb-backup-*; do
    if [[ -d "$candidate" ]]; then
      backup_dir="$candidate"
      break
    fi
  done
  [[ -n "$backup_dir" ]] || { fail_test "No backup directory found to corrupt"; return 1; }

  # First WAL file of that backup (glob instead of parsing `ls`). Without the
  # guard an empty name would turn the redirect target into the wal/ directory.
  local wal_file=""
  for candidate in "${backup_dir}/wal"/*; do
    if [[ -f "$candidate" ]]; then
      wal_file="$candidate"
      break
    fi
  done
  [[ -n "$wal_file" ]] || { fail_test "Backup contains no WAL file to corrupt"; return 1; }

  # Corrupt it: the file is replaced by four zero bytes (wrong magic bytes).
  printf '\x00\x00\x00\x00' > "$wal_file" || return 1

  # Verification should fail
  if env METRICS_DIR="$METRICS_DIR" "${PROJECT_DIR}/scripts/verify-backup.sh" "$backup_dir" 2>/dev/null; then
    fail_test "Verification should have failed for corrupted WAL"
    return 1
  fi

  # Check metrics show failure. The pattern is anchored to the sample line
  # (optional {labels}, then the value 0) so that comment lines, timestamps
  # or a stale "1" sample cannot satisfy the check.
  if ! grep -Eq '^stemedb_backup_verification_status(\{[^}]*\})?[[:space:]]+0(\.0+)?([[:space:]]|$)' \
    "${METRICS_DIR}/stemedb_backup.prom"; then
    fail_test "Verification status not set to 0 (failed)"
    return 1
  fi

  success "WAL corruption detected correctly"
  return 0
}
# Test 5: Dry run mode
#
# Runs scripts/backup-stemedb.sh with --dry-run and checks that the number of
# stemedb-backup-* directories in TEST_BACKUP_DIR is unchanged.
#
# Globals: PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#          METRICS_DIR (read)
# Returns: 0 when the dry run succeeds without adding a backup, 1 otherwise
test_dry_run() {
  info "Testing dry run mode..."

  local backup_count_before
  backup_count_before=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

  # Run backup in dry-run mode. Settings go through env(1) rather than a
  # `VAR=value cmd` prefix: METRICS_DIR is declared readonly in this file, and
  # bash rejects prefix assignments to readonly variables.
  env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" \
    --output "$TEST_BACKUP_DIR" \
    --dry-run || return 1

  local backup_count_after
  backup_count_after=$(find "$TEST_BACKUP_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)

  if [[ $backup_count_before -ne $backup_count_after ]]; then
    fail_test "Dry run created a backup (should not have)"
    return 1
  fi

  success "Dry run mode working correctly (no backup created)"
  return 0
}
# Test 6: Metrics export
#
# Runs scripts/backup-stemedb.sh and checks that stemedb_backup.prom exists in
# METRICS_DIR and has a line starting with each required metric name.
#
# Globals: PROJECT_DIR, TEST_WAL_DIR, TEST_DB_DIR, TEST_BACKUP_DIR,
#          METRICS_DIR (read)
# Returns: 0 when the file exists and every metric is present, 1 otherwise
test_metrics_export() {
  info "Testing metrics export..."

  # Create backup with metrics. Settings go through env(1) rather than a
  # `VAR=value cmd` prefix: METRICS_DIR is declared readonly in this file, and
  # bash rejects prefix assignments to readonly variables, so the child would
  # not receive the metrics location.
  env STEMEDB_WAL_DIR="$TEST_WAL_DIR" \
    STEMEDB_DB_DIR="$TEST_DB_DIR" \
    METRICS_DIR="$METRICS_DIR" \
    "${PROJECT_DIR}/scripts/backup-stemedb.sh" --output "$TEST_BACKUP_DIR" || return 1

  # Verify metrics file exists
  [[ -f "${METRICS_DIR}/stemedb_backup.prom" ]] || { fail_test "Metrics file not created"; return 1; }

  # Verify required metrics present
  local required_metrics=(
    "stemedb_backup_last_success_timestamp"
    "stemedb_backup_age_seconds"
    "stemedb_backup_size_bytes"
    "stemedb_backup_wal_files"
    "stemedb_backup_db_files"
  )
  local metric
  for metric in "${required_metrics[@]}"; do
    # Anchored at line start so comment lines do not count as samples.
    if ! grep -q "^${metric}" "${METRICS_DIR}/stemedb_backup.prom"; then
      fail_test "Missing metric: $metric"
      return 1
    fi
  done

  success "All required metrics exported correctly"
  return 0
}
# Test 7: Alert rules syntax
#
# Checks the Prometheus alert rules file under PROJECT_DIR: it must exist,
# pass `yamllint -d relaxed` when yamllint is installed, and contain an
# "alert: <name>" entry for every required alert.
#
# Globals: PROJECT_DIR (read)
# Returns: 0 when all applicable checks pass, 1 otherwise
test_alert_rules() {
  info "Testing Prometheus alert rules syntax..."

  local alert_file="${PROJECT_DIR}/docs/operations/deployment/prometheus/backup-alerts.yml"
  [[ -f "$alert_file" ]] || { fail_test "Alert rules file not found"; return 1; }

  # Basic YAML syntax check. A missing yamllint only skips this step; the
  # required-alert checks below do not depend on it and must still run.
  local syntax_checked=0
  if command -v yamllint &>/dev/null; then
    if ! yamllint -d relaxed "$alert_file" 2>/dev/null; then
      fail_test "Alert rules YAML syntax invalid"
      return 1
    fi
    syntax_checked=1
  else
    warn "yamllint not installed, skipping syntax validation"
  fi

  # Check required alerts exist
  local required_alerts=(
    "StemeDBBackupFailed"
    "StemeDBBackupVerificationFailed"
    "StemeDBWALArchivalLag"
    "StemeDBBackupStale"
  )
  local alert
  for alert in "${required_alerts[@]}"; do
    if ! grep -qF -- "alert: $alert" "$alert_file"; then
      fail_test "Missing alert: $alert"
      return 1
    fi
  done

  if [[ $syntax_checked -eq 1 ]]; then
    success "Alert rules syntax valid and all required alerts present"
  else
    success "All required alerts present (syntax validation skipped)"
  fi
  return 0
}
# Main test execution
#
# Prints the banner, builds the fixtures, runs every test through run_test in
# a fixed order, removes the fixtures and prints the summary. Exits the script
# with 0 when no test failed and 1 otherwise.
main() {
  local rule="=========================================="
  # "display name|function" pairs, in execution order.
  local -a suite=(
    "Backup Creation|test_backup_creation"
    "Retention Policy|test_retention_policy"
    "Backup Verification|test_backup_verification"
    "WAL Magic Validation|test_wal_magic_validation"
    "Dry Run Mode|test_dry_run"
    "Metrics Export|test_metrics_export"
    "Alert Rules|test_alert_rules"
  )
  local entry failed_name

  echo
  echo "$rule"
  echo " StemeDB Backup & DR Integration Tests"
  echo "$rule"
  echo

  setup

  # Run all tests
  for entry in "${suite[@]}"; do
    run_test "${entry%%|*}" "${entry##*|}"
  done

  teardown

  # Summary
  echo
  echo "$rule"
  echo " Test Summary"
  echo "$rule"
  echo
  echo " Total: $TESTS_RUN"
  echo -e " Passed: ${GREEN}${TESTS_PASSED}${NC}"
  echo -e " Failed: ${RED}${TESTS_FAILED}${NC}"
  echo

  # Nothing failed: report success and stop here.
  if [[ ! $TESTS_FAILED -gt 0 ]]; then
    echo -e "${GREEN}All tests passed!${NC}"
    echo
    exit 0
  fi

  echo "Failed tests:"
  for failed_name in "${FAILED_TESTS[@]}"; do
    echo " - $failed_name"
  done
  echo
  exit 1
}
# Script entry point: run the suite, forwarding the command-line arguments.
main "$@"