This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
290 lines
7.7 KiB
Bash
Executable File
290 lines
7.7 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
#
|
|
# StemeDB Backup Verification Script
|
|
#
|
|
# Validates backup integrity by checking:
|
|
# - Magic bytes (STEM = 0x5354454d)
|
|
# - CRC32C checksums
|
|
# - BLAKE3 hashes
|
|
#
|
|
# Usage:
|
|
# ./scripts/verify-backup.sh # Verify latest backup
|
|
# ./scripts/verify-backup.sh backups/stemedb-backup-* # Verify specific backup (only the first path given is verified)
|
|
#
|
|
# Exit codes:
|
|
# 0 - Verification passed
|
|
# 1 - Verification failed
|
|
#
|
|
|
|
set -euo pipefail
|
|
|
|
# Configuration
#
# SCRIPT_DIR  - absolute directory containing this script
# PROJECT_DIR - parent directory of SCRIPT_DIR
# METRICS_DIR - directory the metrics file is written to; may be overridden
#               from the environment
#
# Each value is assigned first and marked readonly afterwards: combining
# `readonly` with a command substitution would hide a failing `cd`/`pwd`
# behind readonly's own (successful) exit status.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
readonly PROJECT_DIR
METRICS_DIR="${METRICS_DIR:-/var/lib/node_exporter/textfile_collector}"
readonly METRICS_DIR
|
|
|
|
# Colors (if terminal supports it)
#
# Start with empty strings so redirected output stays free of escape
# sequences, then switch to ANSI color codes only when stdout is a terminal.
# The codes are stored as literal backslash sequences; whoever prints them is
# responsible for interpreting the escapes.
RED=''
GREEN=''
YELLOW=''
BLUE=''
NC=''

if [[ -t 1 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
fi
|
|
|
|
# Logging helpers
#
# Each helper prints its arguments behind a colored tag. The color variables
# are expanded with %b so their backslash escapes are interpreted, while the
# message is printed with %s and therefore appears verbatim (a backslash in a
# path is not treated as an escape).
info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"; }
success() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }

# fail prints the message and terminates the script with status 1.
# The message goes to stderr: fail is also reached from code running inside
# $(...), where anything written to stdout is captured by the caller instead
# of being shown to the user.
fail() {
  printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*" >&2
  exit 1
}
|
|
|
|
# Find latest backup
#
# Prints the path of the stemedb-backup-* directory whose name sorts last.
#
# Arguments:
#   $1 - directory to search (optional; defaults to ${PROJECT_DIR}/backups)
# Outputs:
#   stdout - path of the selected backup directory
#   stderr - diagnostic when the directory is missing or holds no backups
# Exits with status 1 (via fail) when no backup can be returned.
find_latest_backup() {
  local backup_dir="${1:-${PROJECT_DIR}/backups}"

  # This function is meant to be called inside $(...), so anything written to
  # stdout ends up in the caller's variable. Diagnostics are redirected to
  # stderr to keep them visible.
  if [[ ! -d "$backup_dir" ]]; then
    fail "Backup directory not found: ${backup_dir}" >&2
  fi

  # `sort | tail -n 1` selects the same entry as `sort -r | head -n1`, but
  # tail consumes its whole input, so sort cannot be killed by SIGPIPE and
  # make the pipeline fail under `set -o pipefail`.
  local latest
  latest=$(find "$backup_dir" -maxdepth 1 -type d -name "stemedb-backup-*" \
    | sort \
    | tail -n 1)

  if [[ -z "$latest" ]]; then
    fail "No backups found in ${backup_dir}" >&2
  fi

  echo "$latest"
}
|
|
|
|
# Validate WAL magic bytes
#
# Reads the first four bytes of a file and compares their hex dump with the
# expected magic value.
#
# Arguments:
#   $1 - path of the WAL file
# Returns:
#   0 when the file starts with the magic bytes, 1 otherwise
validate_wal_magic() {
  local wal_file="$1"
  # "STEM" in ASCII = 0x5354454d
  local -r expected_hex="5354454d"

  # od prints space-separated hex bytes; tr squeezes them into one token.
  local header_hex
  header_hex=$(head -c 4 "$wal_file" | od -A n -t x1 | tr -d ' \n')

  [[ "$header_hex" != "$expected_hex" ]] && return 1
  return 0
}
|
|
|
|
# Validate CRC32C checksum (requires crc32 utility)
#
# Compares the output of `crc32 FILE` with the value recorded in FILE.meta:
# the second ':'-separated field of the first line containing "crc32c", with
# spaces removed.
#
# Arguments:
#   $1 - file to check
# Returns:
#   0 - values match, or the check was skipped because the crc32 utility is
#       not installed or no value is recorded
#   1 - values differ
#
# NOTE(review): the crc32 tool from libarchive-zip-perl appears to compute the
# standard CRC-32 rather than CRC-32C (Castagnoli) — confirm which checksum
# the .meta files actually store.
validate_crc32c() {
  local file="$1"

  # Without the tool the check is skipped rather than failed.
  if ! command -v crc32 > /dev/null 2>&1; then
    warn "crc32 utility not found (install libarchive-zip-perl), skipping CRC validation"
    return 0
  fi

  # A missing .meta file and a missing "crc32c" line both yield an empty value.
  local expected
  expected=$(grep -m1 "crc32c" "$file.meta" 2>/dev/null | cut -d: -f2 | tr -d ' ' || echo "")
  [[ -n "$expected" ]] || return 0

  local actual
  actual=$(crc32 "$file")

  [[ "$actual" != "$expected" ]] && return 1
  return 0
}
|
|
|
|
# Validate BLAKE3 hash (requires b3sum utility)
#
# Compares the first space-separated field of `b3sum FILE` with the value
# recorded in FILE.meta: the second ':'-separated field of the first line
# containing "blake3", with spaces removed.
#
# Arguments:
#   $1 - file to check
# Returns:
#   0 - hashes match, or the check was skipped because the b3sum utility is
#       not installed or no hash is recorded
#   1 - hashes differ
validate_blake3() {
  local file="$1"

  # Without the tool the check is skipped rather than failed.
  if ! command -v b3sum > /dev/null 2>&1; then
    warn "b3sum utility not found (install from https://github.com/BLAKE3-team/BLAKE3), skipping BLAKE3 validation"
    return 0
  fi

  # A missing .meta file and a missing "blake3" line both yield an empty value.
  local expected
  expected=$(grep -m1 "blake3" "$file.meta" 2>/dev/null | cut -d: -f2 | tr -d ' ' || echo "")
  [[ -n "$expected" ]] || return 0

  local actual
  actual=$(b3sum "$file" | cut -d' ' -f1)

  [[ "$actual" != "$expected" ]] && return 1
  return 0
}
|
|
|
|
# Write Prometheus metrics
#
# Rewrites ${METRICS_DIR}/stemedb_backup.prom: lines already in the file that
# do not mention verification are carried over, and the verification gauges
# are regenerated from the arguments.
#
# Arguments:
#   $1 - verification status (1=passed, 0=failed)
#   $2 - path of the verified backup; its basename becomes the "backup" label
#   $3 - number of checks that passed
#   $4 - total number of checks performed
# Returns:
#   0 also when the file cannot be written. Metrics are best-effort: a warning
#   is printed instead of aborting, so that an unwritable metrics directory is
#   not reported as a failed verification by `set -e`.
write_metrics() {
  local status="$1"
  local backup_path="$2"
  local checks_passed="$3"
  local checks_total="$4"

  local metrics_file="${METRICS_DIR}/stemedb_backup.prom"
  local metrics_dir
  metrics_dir="$(dirname "$metrics_file")"

  if ! mkdir -p "$metrics_dir" 2>/dev/null; then
    warn "Cannot create metrics directory, skipping metrics: ${metrics_dir}"
    return 0
  fi

  # Read existing backup metrics (preserve them)
  local existing_metrics=""
  if [[ -f "$metrics_file" ]]; then
    existing_metrics=$(grep -v "^#.*verification" "$metrics_file" | grep -v "stemedb_backup_verification" || true)
  fi

  # Write to a sibling temp file and rename it into place, so a reader never
  # sees a half-written file. The temp name does not end in .prom.
  local tmp_file="${metrics_file}.$$"

  if ! cat > "$tmp_file" <<METRICS
$existing_metrics

# HELP stemedb_backup_verification_status Last verification result (1=passed, 0=failed)
# TYPE stemedb_backup_verification_status gauge
stemedb_backup_verification_status{backup="$(basename "$backup_path")"} $status

# HELP stemedb_backup_verification_last_check_timestamp Unix timestamp of last verification
# TYPE stemedb_backup_verification_last_check_timestamp gauge
stemedb_backup_verification_last_check_timestamp $(date +%s)

# HELP stemedb_backup_verification_checks_passed Number of validation checks passed
# TYPE stemedb_backup_verification_checks_passed gauge
stemedb_backup_verification_checks_passed $checks_passed

# HELP stemedb_backup_verification_checks_total Total number of validation checks performed
# TYPE stemedb_backup_verification_checks_total gauge
stemedb_backup_verification_checks_total $checks_total
METRICS
  then
    rm -f -- "$tmp_file" 2>/dev/null || true
    warn "Could not write metrics file, skipping metrics: ${metrics_file}"
    return 0
  fi

  if ! mv -f -- "$tmp_file" "$metrics_file"; then
    rm -f -- "$tmp_file" 2>/dev/null || true
    warn "Could not update metrics file, skipping metrics: ${metrics_file}"
    return 0
  fi

  success "Metrics written to: ${metrics_file}"
}
|
|
|
|
# Entry point: verify one backup directory and report the result.
#
# Arguments:
#   $1 - backup directory to verify (optional; when omitted the latest backup
#        reported by find_latest_backup is used)
# Exits with status 1 (directly or via fail) when the backup or its expected
# contents are missing or when any check fails; returns normally when the
# verification passes.
main() {
  local backup_path="${1:-}"
  # Loop variables and per-file state are local so they do not leak into the
  # global scope.
  local wal_file db_file wal_ok

  echo ""
  echo "=========================================="
  echo " StemeDB Backup Verification"
  echo "=========================================="
  echo ""

  # Find backup to verify
  if [[ -z "$backup_path" ]]; then
    info "Finding latest backup..."
    backup_path=$(find_latest_backup)
  fi

  if [[ ! -d "$backup_path" ]]; then
    fail "Backup not found: ${backup_path}"
  fi

  info "Verifying: $(basename "$backup_path")"

  # Check metadata exists
  if [[ ! -f "${backup_path}/backup-metadata.json" ]]; then
    fail "Backup metadata not found (invalid backup)"
  fi

  success "Metadata found"

  # Validate WAL files
  local wal_checked=0
  local wal_passed=0
  local wal_failed=0

  info "Validating WAL files..."

  if [[ ! -d "${backup_path}/wal" ]]; then
    fail "WAL directory not found in backup"
  fi

  for wal_file in "${backup_path}/wal"/*.wal; do
    # An unmatched glob stays literal; skip anything that is not a file.
    [[ -f "$wal_file" ]] || continue

    wal_checked=$((wal_checked + 1))

    # A WAL file passes only when every check succeeds. The checksum
    # validators return 0 when their tool or the stored value is absent, so
    # they report failure only when a stored value does not match.
    wal_ok=1
    if ! validate_wal_magic "$wal_file"; then
      wal_ok=0
      warn "WAL magic validation failed: $(basename "$wal_file")"
    fi
    if ! validate_crc32c "$wal_file"; then
      wal_ok=0
      warn "WAL CRC32C validation failed: $(basename "$wal_file")"
    fi
    if ! validate_blake3 "$wal_file"; then
      wal_ok=0
      warn "WAL BLAKE3 validation failed: $(basename "$wal_file")"
    fi

    if [[ $wal_ok -eq 1 ]]; then
      wal_passed=$((wal_passed + 1))
    else
      wal_failed=$((wal_failed + 1))
    fi
  done

  if [[ $wal_checked -eq 0 ]]; then
    fail "No WAL files found in backup"
  fi

  success "WAL validation: ${wal_passed}/${wal_checked} passed"

  # Validate DB files (if present)
  local db_checked=0
  local db_passed=0

  if [[ -d "${backup_path}/db" ]]; then
    info "Validating DB files..."

    for db_file in "${backup_path}/db"/*.kv; do
      [[ -f "$db_file" ]] || continue
      db_checked=$((db_checked + 1))
      # DB files don't have magic bytes, just check they're readable
      if [[ -r "$db_file" ]]; then
        db_passed=$((db_passed + 1))
      fi
    done

    if [[ $db_checked -gt 0 ]]; then
      success "DB validation: ${db_passed}/${db_checked} readable"
    fi
  fi

  # Overall result
  local total_checks=$((wal_checked + db_checked))
  local total_passed=$((wal_passed + db_passed))
  local verification_status=0

  echo ""
  echo "=========================================="

  if [[ $wal_failed -eq 0 && $total_passed -eq $total_checks ]]; then
    echo -e " ${GREEN}Verification PASSED${NC}"
    verification_status=1
  else
    echo -e " ${RED}Verification FAILED${NC}"
    verification_status=0
  fi

  echo "=========================================="
  echo ""
  echo " Backup: $(basename "$backup_path")"
  echo " Checks: ${total_passed}/${total_checks} passed"
  echo " WAL: ${wal_passed}/${wal_checked} valid"
  if [[ $db_checked -gt 0 ]]; then
    echo " DB: ${db_passed}/${db_checked} readable"
  fi
  echo ""

  # Write metrics
  write_metrics "$verification_status" "$backup_path" "$total_passed" "$total_checks"

  # Metrics are written first so a failed verification is recorded as well.
  if [[ $verification_status -eq 0 ]]; then
    exit 1
  fi
}
|
|
|
|
main "$@"
|