stemedb/scripts/verify-backup.sh
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

290 lines
7.7 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# StemeDB Backup Verification Script
#
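# Validates backup integrity by checking:
# - Backup metadata file and WAL directory are present
# - Magic bytes of every WAL file (STEM = 0x5354454d)
# - DB files (*.kv), when present, are readable
# Helpers for CRC32C and BLAKE3 checks against "<file>.meta" sidecars are
# defined below, but main() does not invoke them.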
#
# Usage:
# ./scripts/verify-backup.sh # Verify latest backup
# ./scripts/verify-backup.sh backups/stemedb-backup-* # Verify specific backup
#
# Exit codes:
# 0 - Verification passed
# 1 - Verification failed
#
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Configuration
#   SCRIPT_DIR  - absolute directory containing this script
#   PROJECT_DIR - parent directory of SCRIPT_DIR
#   METRICS_DIR - directory for the Prometheus textfile; overridable via the
#                 environment, defaults to the node_exporter collector path
#
# Assignment is kept separate from `readonly` on purpose: `readonly X="$(cmd)"`
# returns the status of `readonly`, hiding a failing `cd`/`dirname` from set -e.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
METRICS_DIR="${METRICS_DIR:-/var/lib/node_exporter/textfile_collector}"
readonly SCRIPT_DIR PROJECT_DIR METRICS_DIR
# ANSI color escapes for log prefixes. They start out empty and are only
# filled in when stdout (fd 1) is attached to a terminal.
RED=''
GREEN=''
YELLOW=''
BLUE=''
NC=''
if [[ -t 1 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
fi
# Logging helpers
#
# Each helper prints a colored tag followed by its arguments joined by spaces.
# The color variables are expanded with %b; the message itself is printed with
# %s so backslashes in it (e.g. in paths) are shown verbatim.
#
# info/success write to stdout. warn/fail write to stderr so their text is
# still visible when the caller runs inside a command substitution, where
# stdout is captured. fail additionally terminates the current shell with
# exit status 1.
info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"; }
success() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*" >&2; }
fail() {
  printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*" >&2
  exit 1
}
# Find latest backup
#
# Arguments: $1 - directory to search (default: ${PROJECT_DIR}/backups)
# Outputs:   path of the "stemedb-backup-*" directory (direct child of $1)
#            that sorts last, on stdout
# Exits:     via fail() when the directory is missing or holds no backups
#
# This function is meant to be called as $(find_latest_backup), so stdout is
# reserved for the result; failure messages are explicitly sent to stderr.
find_latest_backup() {
  local backup_dir="${1:-${PROJECT_DIR}/backups}"

  if [[ ! -d "$backup_dir" ]]; then
    fail "Backup directory not found: ${backup_dir}" >&2
  fi

  local latest
  # `sort | tail -n 1` selects the same entry as `sort -r | head -n 1`, but
  # tail consumes all of its input, so sort can never be killed by SIGPIPE
  # (which `set -o pipefail` would turn into a pipeline failure).
  latest=$(find "$backup_dir" -maxdepth 1 -type d -name "stemedb-backup-*" \
    | sort \
    | tail -n 1)

  if [[ -z "$latest" ]]; then
    fail "No backups found in ${backup_dir}" >&2
  fi

  printf '%s\n' "$latest"
}
# Check the 4-byte magic header of a WAL file.
#
# Arguments: $1 - path to the WAL file
# Returns:   0 when the first four bytes are "STEM" (hex 5354454d), 1 otherwise
validate_wal_magic() {
  local wal_file="$1"
  local header_hex

  # Hex-dump the first four bytes and squeeze out od's spacing and newline.
  header_hex=$(head -c 4 "$wal_file" | od -A n -t x1 | tr -d ' \n')

  case "$header_hex" in
    5354454d) return 0 ;; # "STEM"
    *) return 1 ;;
  esac
}
# Validate a file's checksum against the value stored in "<file>.meta"
# (requires the crc32 utility).
#
# Arguments: $1 - path of the file to check
# Returns:   0 when the checksum matches, or when validation is skipped
#            (crc32 not installed, or no "crc32c" entry readable from the
#            sidecar); 1 on mismatch or when crc32 itself fails
#
# The stored value is taken from the first sidecar line containing "crc32c",
# second ':'-separated field. Whitespace, double quotes and commas are
# stripped and hex case is folded, so `crc32c: ABCD` and `"crc32c": "abcd",`
# are treated alike.
#
# NOTE(review): the crc32 tool shipped in libarchive-zip-perl computes the
# zlib/IEEE CRC-32, while the metadata key is named crc32c (Castagnoli).
# Confirm which polynomial the backup writer uses before relying on this.
validate_crc32c() {
  local file="$1"

  # Check if crc32 is available
  if ! command -v crc32 &> /dev/null; then
    warn "crc32 utility not found (install libarchive-zip-perl), skipping CRC validation"
    return 0
  fi

  # Read stored checksum from metadata (if exists); a missing or unreadable
  # sidecar yields an empty string rather than an error.
  local stored_crc
  stored_crc=$(grep -m1 "crc32c" "${file}.meta" 2>/dev/null \
    | cut -d: -f2 \
    | tr -d '[:space:]",' \
    | tr '[:upper:]' '[:lower:]' || echo "")
  if [[ -z "$stored_crc" ]]; then
    # No stored checksum, can't validate
    return 0
  fi

  local computed_crc
  computed_crc=$(crc32 "$file" | tr '[:upper:]' '[:lower:]') || return 1

  if [[ "$computed_crc" == "$stored_crc" ]]; then
    return 0
  fi
  return 1
}
# Validate a file's BLAKE3 hash against the value stored in "<file>.meta"
# (requires the b3sum utility).
#
# Arguments: $1 - path of the file to check
# Returns:   0 when the hash matches, or when validation is skipped (b3sum
#            not installed, or no "blake3" entry readable from the sidecar);
#            1 on mismatch or when b3sum itself fails
#
# The stored value is taken from the first sidecar line containing "blake3",
# second ':'-separated field. Whitespace, double quotes and commas are
# stripped and hex case is folded, so `blake3: ABCD` and `"blake3": "abcd",`
# are treated alike.
validate_blake3() {
  local file="$1"

  # Check if b3sum is available
  if ! command -v b3sum &> /dev/null; then
    warn "b3sum utility not found (install from https://github.com/BLAKE3-team/BLAKE3), skipping BLAKE3 validation"
    return 0
  fi

  # Read stored hash from metadata (if exists); a missing or unreadable
  # sidecar yields an empty string rather than an error.
  local stored_hash
  stored_hash=$(grep -m1 "blake3" "${file}.meta" 2>/dev/null \
    | cut -d: -f2 \
    | tr -d '[:space:]",' \
    | tr '[:upper:]' '[:lower:]' || echo "")
  if [[ -z "$stored_hash" ]]; then
    # No stored hash, can't validate
    return 0
  fi

  local computed_hash
  # b3sum prints "<hash>  <file>"; keep only the hash column.
  computed_hash=$(b3sum "$file" | cut -d' ' -f1 | tr '[:upper:]' '[:lower:]') || return 1

  if [[ "$computed_hash" == "$stored_hash" ]]; then
    return 0
  fi
  return 1
}
# Write Prometheus metrics
#
# Rewrites ${METRICS_DIR}/stemedb_backup.prom with the verification gauges,
# keeping any lines already in that file that are not verification-related.
#
# Arguments: $1 - verification status (1=passed, 0=failed)
#            $2 - backup path; its basename becomes the "backup" label
#            $3 - number of checks passed
#            $4 - total number of checks performed
# Returns:   0 on success, 1 when the metrics file could not be written
#
# The new content is staged in a sibling temp file and renamed into place, so
# a reader of the .prom file never observes a half-written file.
write_metrics() {
  local status="$1"
  local backup_path="$2"
  local checks_passed="$3"
  local checks_total="$4"
  local metrics_file="${METRICS_DIR}/stemedb_backup.prom"
  # Same directory as the target so the final mv is a rename, not a copy.
  local tmp_file="${metrics_file}.$$"

  # Best effort; a directory that still does not exist is reported below.
  mkdir -p "$(dirname "$metrics_file")" 2>/dev/null || true

  # Read existing backup metrics (preserve them)
  local existing_metrics=""
  if [[ -f "$metrics_file" ]]; then
    existing_metrics=$(grep -v "^#.*verification" "$metrics_file" | grep -v "stemedb_backup_verification" || true)
  fi

  # Only emit the preserved section when there is something to preserve, so
  # the file does not start with an empty line.
  local preserved=""
  if [[ -n "$existing_metrics" ]]; then
    preserved="${existing_metrics}"$'\n'
  fi

  if ! cat > "$tmp_file" <<METRICS
${preserved}# HELP stemedb_backup_verification_status Last verification result (1=passed, 0=failed)
# TYPE stemedb_backup_verification_status gauge
stemedb_backup_verification_status{backup="$(basename "$backup_path")"} $status
# HELP stemedb_backup_verification_last_check_timestamp Unix timestamp of last verification
# TYPE stemedb_backup_verification_last_check_timestamp gauge
stemedb_backup_verification_last_check_timestamp $(date +%s)
# HELP stemedb_backup_verification_checks_passed Number of validation checks passed
# TYPE stemedb_backup_verification_checks_passed gauge
stemedb_backup_verification_checks_passed $checks_passed
# HELP stemedb_backup_verification_checks_total Total number of validation checks performed
# TYPE stemedb_backup_verification_checks_total gauge
stemedb_backup_verification_checks_total $checks_total
METRICS
  then
    rm -f -- "$tmp_file"
    warn "Could not write metrics to: ${metrics_file}"
    return 1
  fi

  if ! mv -f -- "$tmp_file" "$metrics_file"; then
    rm -f -- "$tmp_file"
    warn "Could not write metrics to: ${metrics_file}"
    return 1
  fi

  success "Metrics written to: ${metrics_file}"
}
# Record a failed verification for a backup, then abort via fail().
#
# Arguments: $1 - backup path, $2 - checks passed, $3 - checks total,
#            $4 - failure message
_abort_verification() {
  local backup_path="$1"
  local passed="$2"
  local total="$3"
  local message="$4"

  # Best effort: a problem writing metrics must not mask the real failure.
  write_metrics 0 "$backup_path" "$passed" "$total" || true
  fail "$message"
}

# Verify one backup directory and report the result.
#
# Arguments: $1 - backup directory (optional; defaults to the result of
#                 find_latest_backup)
# Outputs:   progress and a summary on stdout; metrics via write_metrics
# Exits:     1 when verification fails; returns 0 when it passes
#
# Checks performed: backup-metadata.json exists, a wal/ directory exists with
# at least one *.wal file, every WAL file passes validate_wal_magic, and every
# db/*.kv file (if any) is readable.
#
# Once a backup directory has been identified, every failing outcome is
# recorded through write_metrics before exiting, so a structurally broken
# backup does not leave the previous "passed" value in place.
#
# NOTE(review): validate_crc32c and validate_blake3 are not invoked here.
main() {
  local backup_path="${1:-}"
  local wal_file db_file

  echo ""
  echo "=========================================="
  echo " StemeDB Backup Verification"
  echo "=========================================="
  echo ""

  # Find backup to verify
  if [[ -z "$backup_path" ]]; then
    info "Finding latest backup..."
    if ! backup_path=$(find_latest_backup); then
      # Anything the helper printed on stdout before failing was captured by
      # the substitution instead of being shown; surface it here.
      if [[ -n "$backup_path" ]]; then
        printf '%s\n' "$backup_path" >&2
      fi
      exit 1
    fi
  fi

  if [[ ! -d "$backup_path" ]]; then
    fail "Backup not found: ${backup_path}"
  fi
  info "Verifying: $(basename "$backup_path")"

  # Check metadata exists
  if [[ ! -f "${backup_path}/backup-metadata.json" ]]; then
    _abort_verification "$backup_path" 0 0 "Backup metadata not found (invalid backup)"
  fi
  success "Metadata found"

  # Validate WAL files
  local wal_checked=0
  local wal_passed=0
  local wal_failed=0
  info "Validating WAL files..."
  if [[ ! -d "${backup_path}/wal" ]]; then
    _abort_verification "$backup_path" 0 0 "WAL directory not found in backup"
  fi

  for wal_file in "${backup_path}/wal"/*.wal; do
    # An unmatched glob stays literal; skip anything that is not a file.
    [[ -f "$wal_file" ]] || continue
    wal_checked=$((wal_checked + 1))
    if validate_wal_magic "$wal_file"; then
      wal_passed=$((wal_passed + 1))
    else
      wal_failed=$((wal_failed + 1))
      warn "WAL magic validation failed: $(basename "$wal_file")"
    fi
  done

  if [[ $wal_checked -eq 0 ]]; then
    _abort_verification "$backup_path" 0 0 "No WAL files found in backup"
  fi
  success "WAL validation: ${wal_passed}/${wal_checked} passed"

  # Validate DB files (if present)
  local db_checked=0
  local db_passed=0
  if [[ -d "${backup_path}/db" ]]; then
    info "Validating DB files..."
    for db_file in "${backup_path}/db"/*.kv; do
      [[ -f "$db_file" ]] || continue
      db_checked=$((db_checked + 1))
      # DB files don't have magic bytes, just check they're readable
      if [[ -r "$db_file" ]]; then
        db_passed=$((db_passed + 1))
      fi
    done
    if [[ $db_checked -gt 0 ]]; then
      success "DB validation: ${db_passed}/${db_checked} readable"
    fi
  fi

  # Overall result
  local total_checks=$((wal_checked + db_checked))
  local total_passed=$((wal_passed + db_passed))
  local verification_status=0

  echo ""
  echo "=========================================="
  if [[ $wal_failed -eq 0 && $total_passed -eq $total_checks ]]; then
    echo -e " ${GREEN}Verification PASSED${NC}"
    verification_status=1
  else
    echo -e " ${RED}Verification FAILED${NC}"
    verification_status=0
  fi
  echo "=========================================="
  echo ""
  echo " Backup: $(basename "$backup_path")"
  echo " Checks: ${total_passed}/${total_checks} passed"
  echo " WAL: ${wal_passed}/${wal_checked} valid"
  if [[ $db_checked -gt 0 ]]; then
    echo " DB: ${db_passed}/${db_checked} readable"
  fi
  echo ""

  # Write metrics
  write_metrics "$verification_status" "$backup_path" "$total_passed" "$total_checks"

  if [[ $verification_status -eq 0 ]]; then
    exit 1
  fi
}
# Entry point: run the verification, forwarding all command-line arguments.
main "$@"