stemedb/uat/production-readiness/backup-dr-tests-simple.sh
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

127 lines
4.0 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# StemeDB Backup & DR Integration Tests (Simplified)
#
# Quick validation that P5.3 components work together.
#
# Strict mode: stop on a failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Root of the StemeDB checkout whose scripts/ and docs/ trees are exercised
# below. Defaults to the original developer path; export PROJECT_DIR to run the
# tests against a checkout somewhere else.
PROJECT_DIR="${PROJECT_DIR:-/home/jml/Workspace/stemedb}"

# Scratch directory for fixtures, backups and metrics. mktemp -d creates the
# directory itself under an unpredictable name, so a pre-existing or
# attacker-planted /tmp entry can never be reused by accident.
TEST_DIR="$(mktemp -d "${TMPDIR:-/tmp}/stemedb-backup-test.XXXXXX")"

# ANSI colour escapes used by the logging helpers; NC resets the colour.
GREEN='\033[0;32m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m'
# Print a progress line tagged with a blue "[INFO]".
# Arguments: the message words; they are joined with single spaces.
# Outputs:   one line on stdout.
# The colour variables hold backslash escapes and are expanded with %b; the
# message goes through %s so any backslash in it is printed literally.
info() {
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"
}
# Print a success line tagged with a green "[PASS]".
# Arguments: the message words; they are joined with single spaces.
# Outputs:   one line on stdout.
# The colour variables hold backslash escapes and are expanded with %b; the
# message goes through %s so any backslash in it is printed literally.
pass() {
  printf '%b[PASS]%b %s\n' "$GREEN" "$NC" "$*"
}
# Print a failure line tagged with a red "[FAIL]" and abort the script.
# Arguments: the message words; they are joined with single spaces.
# Outputs:   one line on stdout.
# Returns:   never returns; exits the shell with status 1.
# The colour variables hold backslash escapes and are expanded with %b; the
# message goes through %s so any backslash in it is printed literally.
fail() {
  printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"
  exit 1
}
# Delete the scratch directory together with everything the tests wrote into it.
cleanup() {
  rm -rf -- "${TEST_DIR}"
}

# Run cleanup whenever the script exits, whether it reaches the end or fail()
# calls exit part-way through.
trap 'cleanup' EXIT
# Opening banner: a blank line, the framed title, and a trailing blank line.
printf '%s\n' \
  "" \
  "==========================================" \
  " P5.3 Backup & DR Tests" \
  "==========================================" \
  ""
# Setup: build the scratch tree that the backup scripts read from and write to.
info "Setting up test environment..."
for subdir in wal db backups metrics; do
  mkdir -p "$TEST_DIR/$subdir"
done

# Minimal fixtures: a WAL file that begins with the four bytes "STEM" followed
# by one line of text, and a one-line key/value file.
{
  printf '\x53\x54\x45\x4d'
  echo "test data"
} > "$TEST_DIR/wal/test.wal"
echo "test data" > "$TEST_DIR/db/test.kv"

pass "Test environment ready"
# Test 1: Backup creation
# Runs the backup script against the fixture tree and expects exactly one
# stemedb-backup-* directory directly under the output directory.
info "Test 1: Backup creation..."
# The script's output is discarded, so report a non-zero exit explicitly;
# otherwise set -e would end the run here without printing any [FAIL] line.
STEMEDB_WAL_DIR="$TEST_DIR/wal" \
STEMEDB_DB_DIR="$TEST_DIR/db" \
METRICS_DIR="$TEST_DIR/metrics" \
  "$PROJECT_DIR/scripts/backup-stemedb.sh" --output "$TEST_DIR/backups" >/dev/null 2>&1 \
  || fail "Backup script exited with an error"
BACKUP_COUNT=$(find "$TEST_DIR/backups" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)
if [[ $BACKUP_COUNT -eq 1 ]]; then
  pass "Backup created"
else
  fail "Backup not created (found $BACKUP_COUNT backups)"
fi
# Test 2: Backup structure
# Picks the first backup directory and checks that it contains the metadata
# file plus wal/ and db/ subdirectories. BACKUP is reused by Test 4.
info "Test 2: Backup structure..."
# -maxdepth 1 keeps the search at the same level Test 1 counts at. -print -quit
# stops after the first match without piping into head, which under pipefail
# could abort the script if find were killed by SIGPIPE.
BACKUP=$(find "$TEST_DIR/backups" -maxdepth 1 -type d -name "stemedb-backup-*" -print -quit)
# Without this guard an empty result would turn the checks below into tests of
# absolute paths such as /wal and /db.
[[ -n "$BACKUP" ]] || fail "No backup directory found"
[[ -f "$BACKUP/backup-metadata.json" ]] || fail "Missing backup-metadata.json"
[[ -d "$BACKUP/wal" ]] || fail "Missing wal/"
[[ -d "$BACKUP/db" ]] || fail "Missing db/"
pass "Backup structure valid"
# Test 3: Metrics export
# The earlier backup run is expected to have left a .prom file in the metrics
# directory that mentions the last-success timestamp metric.
info "Test 3: Metrics export..."
prom_file="$TEST_DIR/metrics/stemedb_backup.prom"
if [[ ! -f "$prom_file" ]]; then
  fail "Metrics not exported"
fi
if ! grep -q "stemedb_backup_last_success_timestamp" "$prom_file"; then
  fail "Missing metrics"
fi
pass "Metrics exported"
# Test 4: Verification
# Runs the verification script on the backup found by Test 2 and then checks
# that the metrics file reports a verification status of 1.
info "Test 4: Backup verification..."
METRICS_DIR="$TEST_DIR/metrics" \
  "$PROJECT_DIR/scripts/verify-backup.sh" "$BACKUP" >/dev/null 2>&1 \
  || fail "Verification failed"
# Match only a sample line for this metric whose value is 1 (optionally with a
# label set and/or written as 1.0). A loose "name.*1" pattern would also accept
# a "# HELP" line containing a 1, a trailing timestamp, or a value such as 10.
# NOTE(review): assumes the file uses the Prometheus text exposition format
# ("name{labels} value") - confirm against verify-backup.sh.
grep -Eq '^stemedb_backup_verification_status(\{[^}]*\})?[[:space:]]+1(\.0+)?([[:space:]]|$)' \
  "$TEST_DIR/metrics/stemedb_backup.prom" || fail "Verification status incorrect"
pass "Verification passed"
# Test 5: Retention
# Adds three more backups (four in total with the one from Test 1), then runs
# the backup script once more with --keep-last 1d and checks that the retention
# pass did not delete the freshly created backups.
info "Test 5: Retention policy..."
for attempt in 1 2 3; do
  # Space the runs one second apart.
  # NOTE(review): presumably needed so each backup gets a distinct
  # timestamp-based directory name - confirm against backup-stemedb.sh.
  sleep 1
  # Output is discarded, so surface a failing run explicitly instead of letting
  # set -e end the script without a [FAIL] line.
  STEMEDB_WAL_DIR="$TEST_DIR/wal" \
  STEMEDB_DB_DIR="$TEST_DIR/db" \
  METRICS_DIR="$TEST_DIR/metrics" \
    "$PROJECT_DIR/scripts/backup-stemedb.sh" --output "$TEST_DIR/backups" >/dev/null 2>&1 \
    || fail "Backup run $attempt of 3 exited with an error"
done
# -maxdepth 1 counts only top-level backup directories, as Test 1 does.
BACKUP_COUNT=$(find "$TEST_DIR/backups" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)
[[ $BACKUP_COUNT -eq 4 ]] || fail "Expected 4 backups, found $BACKUP_COUNT"
STEMEDB_WAL_DIR="$TEST_DIR/wal" \
STEMEDB_DB_DIR="$TEST_DIR/db" \
METRICS_DIR="$TEST_DIR/metrics" \
  "$PROJECT_DIR/scripts/backup-stemedb.sh" \
  --output "$TEST_DIR/backups" \
  --keep-last 1d >/dev/null 2>&1 \
  || fail "Backup with --keep-last exited with an error"
BACKUP_COUNT_AFTER=$(find "$TEST_DIR/backups" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)
# NOTE(review): every backup here is seconds old, so a 1d retention window
# should arguably keep all of them; the threshold of 3 is kept as a lower bound.
[[ $BACKUP_COUNT_AFTER -ge 3 ]] || fail "Retention too aggressive"
pass "Retention policy working"
# Test 6: Dry run
# Counts the backups, runs the backup script with --dry-run, and checks that
# the count is unchanged afterwards.
info "Test 6: Dry run mode..."
# -maxdepth 1 counts only top-level backup directories, as Test 1 does.
BEFORE=$(find "$TEST_DIR/backups" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)
# METRICS_DIR is pinned to the sandbox like every other invocation, so anything
# the script might still write in dry-run mode stays inside TEST_DIR. Output is
# discarded, so a non-zero exit is reported explicitly rather than left to
# set -e, which would end the run without a [FAIL] line.
STEMEDB_WAL_DIR="$TEST_DIR/wal" \
STEMEDB_DB_DIR="$TEST_DIR/db" \
METRICS_DIR="$TEST_DIR/metrics" \
  "$PROJECT_DIR/scripts/backup-stemedb.sh" \
  --output "$TEST_DIR/backups" \
  --dry-run >/dev/null 2>&1 \
  || fail "Dry run exited with an error"
AFTER=$(find "$TEST_DIR/backups" -maxdepth 1 -type d -name "stemedb-backup-*" | wc -l)
[[ $BEFORE -eq $AFTER ]] || fail "Dry run created backup"
pass "Dry run mode working"
# Test 7: Alert rules
# Only checks that the Prometheus alert rules file exists in the checkout; its
# contents are not inspected.
info "Test 7: Alert rules..."
alert_rules="$PROJECT_DIR/docs/operations/deployment/prometheus/backup-alerts.yml"
if [[ -f "$alert_rules" ]]; then
  pass "Alert rules present"
else
  fail "Alert rules missing"
fi
# Summary
# Reached only when every test above passed, because fail() exits the script.
printf '\n%s\n' "=========================================="
printf ' %b%s%b\n' "$GREEN" "All tests passed (7/7)" "$NC"
printf '%s\n\n' "=========================================="