This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
426 lines
13 KiB
Bash
Executable File
426 lines
13 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# StemeDB Backup Script
#
# Creates a timestamped backup of WAL and database files.
#
# Usage:
#   ./scripts/backup-stemedb.sh                      # Default backup to backups/
#   ./scripts/backup-stemedb.sh --output /mnt/nfs    # Custom output directory
#   ./scripts/backup-stemedb.sh --wal-only           # Backup WAL only (faster)
#   ./scripts/backup-stemedb.sh --dry-run            # Show what would be done
#   ./scripts/backup-stemedb.sh --keep-last 30d      # Apply a retention policy
#   ./scripts/backup-stemedb.sh --upload-s3 --s3-bucket my-bucket
#
# Environment:
#   STEMEDB_WAL_DIR  WAL directory (default: <project>/data/wal)
#   STEMEDB_DB_DIR   Database directory (default: <project>/data/db)
#
# Exit codes:
#   0 - Backup completed successfully
#   1 - Backup failed
#

set -euo pipefail

# Configuration
#
# Command substitutions are assigned first and marked readonly afterwards:
# `readonly VAR="$(cmd)"` would report the status of `readonly` and hide a
# failing command from `set -e`.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
readonly PROJECT_DIR
readonly WAL_DIR="${STEMEDB_WAL_DIR:-${PROJECT_DIR}/data/wal}"
readonly DB_DIR="${STEMEDB_DB_DIR:-${PROJECT_DIR}/data/db}"
TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
readonly TIMESTAMP

# Colors (only when stdout is a terminal; empty strings otherwise so that
# redirected output contains no escape sequences)
if [[ -t 1 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi
# Logging helpers
#
# Each helper prints its arguments behind a level tag, coloured with the
# RED/GREEN/YELLOW/BLUE/NC globals. Only the tag is passed through %b, so
# backslashes inside the message itself are printed literally.
#
# info and success write to stdout. warn and fail write to stderr so that a
# diagnostic is still visible when the caller captures stdout, e.g. when a
# function that calls fail runs inside a command substitution.
info() { printf '%b %s\n' "${BLUE}[INFO]${NC}" "$*"; }
success() { printf '%b %s\n' "${GREEN}[OK]${NC}" "$*"; }
warn() { printf '%b %s\n' "${YELLOW}[WARN]${NC}" "$*" >&2; }

# Prints the message and terminates the current shell with exit status 1.
fail() {
  printf '%b %s\n' "${RED}[FAIL]${NC}" "$*" >&2
  exit 1
}
# Defaults
OUTPUT_DIR="${PROJECT_DIR}/backups"
WAL_ONLY=false
DRY_RUN=false
KEEP_LAST=""
UPLOAD_S3=false
S3_BUCKET="${AWS_S3_BUCKET:-}"

# Print the command-line help text to stdout.
print_usage() {
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "" \
    "Create a timestamped backup of StemeDB data." \
    "" \
    "Options:" \
    " --output <dir> Output directory (default: backups/)" \
    " --wal-only Backup WAL directory only (skip DB)" \
    " --dry-run Show what would be done without executing" \
    " --keep-last <dur> Delete backups older than duration (e.g., 30d, 7d)" \
    " --upload-s3 Upload backup to S3 after creation" \
    " --s3-bucket <name> S3 bucket name (default: AWS_S3_BUCKET env var)" \
    " --help Show this help message" \
    "" \
    "Environment:" \
    " STEMEDB_WAL_DIR WAL directory (default: data/wal)" \
    " STEMEDB_DB_DIR Database directory (default: data/db)" \
    " AWS_S3_BUCKET S3 bucket for uploads (default: none)" \
    " AWS_REGION AWS region (default: us-east-1)" \
    "" \
    "Examples:" \
    " $0 # Basic backup" \
    " $0 --keep-last 30d # Backup with 30-day retention" \
    " $0 --upload-s3 --s3-bucket my-bucket # Backup to S3" \
    " $0 --dry-run --keep-last 7d # Preview cleanup"
}

#######################################
# Parse command-line options into the global settings.
# Globals:   OUTPUT_DIR, WAL_ONLY, DRY_RUN, KEEP_LAST, UPLOAD_S3, S3_BUCKET
#            (written)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for --help/-h
# Returns:   exits 0 after --help; exits 1 (via fail) on an unknown option,
#            an option that is missing its value, or a malformed --keep-last
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --output)
        # Without this check a missing value surfaces as a bare
        # "unbound variable" error under `set -u`.
        [[ $# -ge 2 ]] || fail "Option $1 requires a value (use --help for usage)"
        OUTPUT_DIR="$2"
        shift 2
        ;;
      --wal-only)
        WAL_ONLY=true
        shift
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      --keep-last)
        [[ $# -ge 2 ]] || fail "Option $1 requires a value (use --help for usage)"
        # Reject a malformed duration up front: the retention step only runs
        # after the backup has been written, which is too late to find out.
        # An explicitly empty value is accepted and means "no retention".
        if [[ -n "$2" && ! "$2" =~ ^[0-9]+[dhm]$ ]]; then
          fail "Invalid --keep-last duration: $2 (expected <number><d|h|m>, e.g. 30d)"
        fi
        KEEP_LAST="$2"
        shift 2
        ;;
      --upload-s3)
        UPLOAD_S3=true
        shift
        ;;
      --s3-bucket)
        [[ $# -ge 2 ]] || fail "Option $1 requires a value (use --help for usage)"
        S3_BUCKET="$2"
        shift 2
        ;;
      --help | -h)
        print_usage
        exit 0
        ;;
      *)
        fail "Unknown argument: $1 (use --help for usage)"
        ;;
    esac
  done
}

# Parse arguments
parse_args "$@"
readonly BACKUP_DIR="${OUTPUT_DIR}/stemedb-backup-${TIMESTAMP}"

#######################################
# EXIT trap handler: remove a partially written backup when the script is
# terminating with a non-zero status. Does nothing on success, in dry-run
# mode, or when the backup directory does not exist.
# Globals: BACKUP_DIR (read), DRY_RUN (read)
#######################################
cleanup() {
  # Must be the first statement: captures the status the script is exiting with.
  local exit_code=$?

  if [[ $exit_code -eq 0 || "$DRY_RUN" != "false" || ! -d "$BACKUP_DIR" ]]; then
    return 0
  fi

  warn "Backup failed, removing partial backup at ${BACKUP_DIR}"
  # `--` stops option parsing; `:?` aborts instead of expanding to an empty path.
  rm -rf -- "${BACKUP_DIR:?}"
}
trap cleanup EXIT
#######################################
# Parse duration string (e.g., "30d", "7d") to seconds.
# Accepted form: one or more decimal digits followed by a single unit letter
# (d=days, h=hours, m=minutes).
# Arguments: $1 - duration string
# Outputs:   the duration in seconds on stdout; an error message on stderr
#            for malformed input
# Returns:   exits with status 1 on malformed input. Callers normally run this
#            inside a command substitution, so the error has to go to stderr
#            or it would be captured as the result instead of being shown.
#######################################
parse_duration() {
  local duration="${1:-}"

  if [[ ! "$duration" =~ ^([0-9]+)([dhm])$ ]]; then
    printf '%b %s\n' "${RED:-}[FAIL]${NC:-}" \
      "Invalid duration: '${duration}' (use <number> followed by d=days, h=hours, m=minutes)" >&2
    exit 1
  fi

  # Force base 10 so a leading zero ("08d") is not parsed as octal.
  local value=$((10#${BASH_REMATCH[1]}))
  local unit="${BASH_REMATCH[2]}"

  case "$unit" in
    d) echo $((value * 86400)) ;;
    h) echo $((value * 3600)) ;;
    m) echo $((value * 60)) ;;
  esac
}
#######################################
# Cleanup old backups based on retention policy.
#
# Removes "stemedb-backup-*" directories directly under OUTPUT_DIR whose
# modification time is older than KEEP_LAST, oldest name first, but never
# lets the number of remaining backups drop below three. In dry-run mode the
# same decisions are made and reported without deleting anything.
#
# Globals:   KEEP_LAST (read), OUTPUT_DIR (read), DRY_RUN (read)
# Outputs:   progress and a summary through info/warn/success
# Returns:   1 when KEEP_LAST cannot be parsed, 0 otherwise
#######################################
cleanup_old_backups() {
  local -r min_keep=3

  local retention_seconds
  retention_seconds=$(parse_duration "$KEEP_LAST") || return 1

  local cutoff_time
  cutoff_time=$(($(date +%s) - retention_seconds))

  info "Enforcing retention policy: keep backups from last ${KEEP_LAST}"

  # Collect the candidates once, sorted by name (the timestamped names sort
  # oldest first). LC_ALL=C keeps the order independent of the locale.
  local -a backups=()
  local backup_path
  if [[ -d "$OUTPUT_DIR" ]]; then
    while IFS= read -r -d '' backup_path; do
      backups+=("$backup_path")
    done < <(find "$OUTPUT_DIR" -maxdepth 1 -type d -name "stemedb-backup-*" -print0 \
      | LC_ALL=C sort -z)
  fi

  # Tracked in a counter rather than re-counted on disk, so that a dry run
  # (which deletes nothing) reports exactly what a real run would do.
  local remaining=${#backups[@]}
  local removed_count=0
  local kept_count=0
  local backup_name backup_time

  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on old bash.
  for backup_path in ${backups[@]+"${backups[@]}"}; do
    backup_name=$(basename "$backup_path")

    # GNU stat first, BSD stat as fallback.
    backup_time=$(stat -c %Y "$backup_path" 2>/dev/null \
      || stat -f %m "$backup_path" 2>/dev/null) || backup_time=""

    # An unknown age must never be read as "very old": keep the backup.
    if [[ ! "$backup_time" =~ ^[0-9]+$ ]]; then
      warn "Cannot determine age of backup, keeping it: ${backup_name}"
      kept_count=$((kept_count + 1))
      continue
    fi

    if ((backup_time >= cutoff_time)); then
      kept_count=$((kept_count + 1))
      continue
    fi

    # Keep at least 3 most recent backups regardless of age
    if ((remaining <= min_keep)); then
      info "Keeping backup (minimum ${min_keep} retained): ${backup_name}"
      kept_count=$((kept_count + 1))
      continue
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
      info "[DRY RUN] Would remove: ${backup_name}"
    else
      warn "Removing old backup: ${backup_name}"
      if ! rm -rf -- "$backup_path"; then
        warn "Failed to remove backup: ${backup_name}"
        kept_count=$((kept_count + 1))
        continue
      fi
    fi
    remaining=$((remaining - 1))
    removed_count=$((removed_count + 1))
  done

  if [[ "$DRY_RUN" == "false" ]]; then
    success "Retention: removed ${removed_count}, kept ${kept_count} backups"
  else
    info "[DRY RUN] Would remove: ${removed_count}, would keep: ${kept_count}"
  fi
}
#######################################
# Upload backup to S3.
#
# Syncs BACKUP_DIR to s3://<S3_BUCKET>/<backup directory name> with the AWS
# CLI, using the STANDARD_IA storage class and AWS_REGION (us-east-1 when
# unset), then records the upload via write_s3_metrics.
#
# Globals:   S3_BUCKET, BACKUP_DIR, DRY_RUN, AWS_REGION, TMPDIR (read)
# Outputs:   AWS CLI output on stdout; status through info/warn/success
# Returns:   0 on success or in dry-run mode, 1 when the upload fails;
#            exits 1 (via fail) when no bucket is configured or the AWS CLI
#            is not installed
#######################################
upload_to_s3() {
  if [[ -z "$S3_BUCKET" ]]; then
    fail "S3 bucket not specified (use --s3-bucket or set AWS_S3_BUCKET)"
  fi

  # Check if aws CLI is available
  if ! command -v aws &> /dev/null; then
    fail "AWS CLI not found. Install with: apt install awscli"
  fi

  local s3_path
  s3_path="s3://${S3_BUCKET}/$(basename "$BACKUP_DIR")"

  info "Uploading backup to S3..."
  info "Destination: ${s3_path}"

  if [[ "$DRY_RUN" == "true" ]]; then
    info "[DRY RUN] Would upload: ${BACKUP_DIR} -> ${s3_path}"
    return 0
  fi

  # Unique log file from mktemp instead of a fixed, predictable name in /tmp.
  local upload_log
  if ! upload_log=$(mktemp "${TMPDIR:-/tmp}/stemedb-s3-upload.XXXXXX"); then
    warn "S3 upload failed (could not create upload log file)"
    return 1
  fi

  # Upload with progress, use STANDARD_IA storage class for cost savings.
  # PIPESTATUS[0] is the exit status of `aws` itself, so the result does not
  # depend on `pipefail` being set and is not affected by `tee`.
  if aws s3 sync "$BACKUP_DIR" "$s3_path" \
    --storage-class STANDARD_IA \
    --region "${AWS_REGION:-us-east-1}" \
    2>&1 | tee "$upload_log"; [[ ${PIPESTATUS[0]} -eq 0 ]]; then
    rm -f -- "$upload_log"
    success "Uploaded to S3: ${s3_path}"

    # Write S3 metrics
    write_s3_metrics "$s3_path"
  else
    warn "S3 upload failed (backup still available locally)"
    warn "Upload log: ${upload_log}"
    return 1
  fi
}
#######################################
# Write Prometheus metrics describing the backup in BACKUP_DIR.
#
# The file is <METRICS_DIR>/stemedb_backup.prom, with METRICS_DIR defaulting
# to /var/lib/node_exporter/textfile_collector. Export is best-effort: any
# problem is reported with warn and the function still returns 0.
#
# The content is written to a temporary file in the same directory and then
# renamed over the target, so a reader never sees a half-written file.
#
# Globals:   METRICS_DIR, BACKUP_DIR, DRY_RUN (read)
# Returns:   0 always
#######################################
write_backup_metrics() {
  local metrics_file="${METRICS_DIR:-/var/lib/node_exporter/textfile_collector}/stemedb_backup.prom"

  if [[ "$DRY_RUN" == "true" ]]; then
    info "[DRY RUN] Would write metrics to: ${metrics_file}"
    return 0
  fi

  # Create directory if it doesn't exist (for local dev)
  if ! mkdir -p "$(dirname "$metrics_file")" 2>/dev/null; then
    warn "Cannot create metrics directory, skipping metrics export"
    return 0
  fi

  # The temporary name does not end in ".prom".
  local tmp_file="${metrics_file}.$$"
  if ! touch "$tmp_file" 2>/dev/null; then
    warn "Cannot write to metrics file, skipping metrics export"
    return 0
  fi

  local now
  now=$(date +%s)

  # `|| true`: a missing directory (e.g. no db/ after --wal-only) must yield
  # a zero value, not abort the script under `set -e -o pipefail`.
  local backup_bytes wal_count db_count
  backup_bytes=$(du -sb "$BACKUP_DIR" 2>/dev/null | cut -f1) || true
  if [[ ! "$backup_bytes" =~ ^[0-9]+$ ]]; then
    backup_bytes=0
  fi
  wal_count=$(find "${BACKUP_DIR}/wal" -type f 2>/dev/null | wc -l) || true
  db_count=$(find "${BACKUP_DIR}/db" -type f 2>/dev/null | wc -l) || true
  # Some `wc` implementations pad the count with spaces.
  wal_count=${wal_count//[[:space:]]/}
  db_count=${db_count//[[:space:]]/}

  if ! cat > "$tmp_file" <<METRICS
# HELP stemedb_backup_last_success_timestamp Unix timestamp of last successful backup
# TYPE stemedb_backup_last_success_timestamp gauge
stemedb_backup_last_success_timestamp ${now}

# HELP stemedb_backup_age_seconds Time since last successful backup
# TYPE stemedb_backup_age_seconds gauge
stemedb_backup_age_seconds 0

# HELP stemedb_backup_size_bytes Total backup size in bytes
# TYPE stemedb_backup_size_bytes gauge
stemedb_backup_size_bytes ${backup_bytes}

# HELP stemedb_backup_wal_files Number of WAL files in backup
# TYPE stemedb_backup_wal_files gauge
stemedb_backup_wal_files ${wal_count:-0}

# HELP stemedb_backup_db_files Number of DB files in backup
# TYPE stemedb_backup_db_files gauge
stemedb_backup_db_files ${db_count:-0}
METRICS
  then
    rm -f -- "$tmp_file"
    warn "Cannot write to metrics file, skipping metrics export"
    return 0
  fi

  if ! mv -f -- "$tmp_file" "$metrics_file"; then
    rm -f -- "$tmp_file"
    warn "Cannot write to metrics file, skipping metrics export"
    return 0
  fi

  success "Metrics written to: ${metrics_file}"
}
#######################################
# Record a successful S3 upload in the existing Prometheus metrics file
# <METRICS_DIR>/stemedb_backup.prom (METRICS_DIR defaults to
# /var/lib/node_exporter/textfile_collector).
#
# Any S3 series already present in the file are replaced rather than
# appended to, so calling this more than once never produces duplicate
# series. The new content is written to a temporary file and renamed into
# place. Export is best-effort: problems are reported with warn and the
# function still returns 0.
#
# Globals:   METRICS_DIR (read)
# Arguments: $1 - S3 destination of the upload (currently not written to the
#                 metrics; accepted so callers keep working)
# Returns:   0 always
#######################################
write_s3_metrics() {
  # shellcheck disable=SC2034  # part of the interface, not used in the output
  local s3_path="$1"
  local metrics_file="${METRICS_DIR:-/var/lib/node_exporter/textfile_collector}/stemedb_backup.prom"
  local tmp_file="${metrics_file}.$$"

  # Check if metrics file exists and its directory is writable
  if [[ ! -f "$metrics_file" ]] || ! touch "$tmp_file" 2>/dev/null; then
    warn "Cannot write S3 metrics (metrics file not writable)"
    return 0
  fi

  # Keep everything except earlier S3 series. grep exits 1 when nothing is
  # left to print, which is fine; any other failure means the file could not
  # be read and must not be replaced.
  local base_metrics
  base_metrics=$(grep -v 'stemedb_backup_s3_' "$metrics_file") || [[ $? -eq 1 ]] || {
    rm -f -- "$tmp_file"
    warn "Cannot write S3 metrics (metrics file not writable)"
    return 0
  }

  local now
  now=$(date +%s)

  if ! cat > "$tmp_file" <<METRICS
${base_metrics}

# HELP stemedb_backup_s3_last_upload_timestamp Unix timestamp of last S3 upload
# TYPE stemedb_backup_s3_last_upload_timestamp gauge
stemedb_backup_s3_last_upload_timestamp ${now}

# HELP stemedb_backup_s3_uploaded Boolean indicating if latest backup was uploaded to S3
# TYPE stemedb_backup_s3_uploaded gauge
stemedb_backup_s3_uploaded 1
METRICS
  then
    rm -f -- "$tmp_file"
    warn "Cannot write S3 metrics (metrics file not writable)"
    return 0
  fi

  if ! mv -f -- "$tmp_file" "$metrics_file"; then
    rm -f -- "$tmp_file"
    warn "Cannot write S3 metrics (metrics file not writable)"
    return 0
  fi
}
# Escape backslashes and double quotes so a string can be embedded in a JSON
# double-quoted value.
_json_escape() {
  local text="$1"
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  printf '%s' "$text"
}

#######################################
# Run the backup: validate the source directories, copy WAL (and DB unless
# --wal-only) into BACKUP_DIR with rsync, write backup-metadata.json, export
# metrics, apply the retention policy and upload to S3 when requested, then
# print a summary. In dry-run mode only the planned actions are reported.
#
# Globals:   WAL_DIR, DB_DIR, BACKUP_DIR, WAL_ONLY, DRY_RUN, KEEP_LAST,
#            UPLOAD_S3, S3_BUCKET, BLUE, GREEN, NC (read)
# Returns:   0 on success; exits 1 (via fail) when a source directory is
#            missing or empty or a copy step fails; non-zero when a later
#            step (retention, S3 upload) fails
#######################################
main() {
  echo ""
  echo "=========================================="
  if [[ "$DRY_RUN" == "true" ]]; then
    echo " StemeDB Backup (DRY RUN)"
  else
    echo " StemeDB Backup"
  fi
  echo "=========================================="
  echo ""

  # Validate source directories
  if [[ ! -d "$WAL_DIR" ]]; then
    fail "WAL directory not found: ${WAL_DIR}"
  fi

  if [[ -z "$(ls -A "$WAL_DIR" 2>/dev/null)" ]]; then
    fail "WAL directory is empty: ${WAL_DIR}"
  fi

  if [[ "$WAL_ONLY" == "false" ]]; then
    if [[ ! -d "$DB_DIR" ]]; then
      fail "DB directory not found: ${DB_DIR}"
    fi
    if [[ -z "$(ls -A "$DB_DIR" 2>/dev/null)" ]]; then
      fail "DB directory is empty: ${DB_DIR}"
    fi
  fi

  # Handle dry run
  if [[ "$DRY_RUN" == "true" ]]; then
    info "[DRY RUN] Would create backup at: ${BACKUP_DIR}"
    info "[DRY RUN] WAL source: ${WAL_DIR}"
    if [[ "$WAL_ONLY" == "false" ]]; then
      info "[DRY RUN] DB source: ${DB_DIR}"
    fi
    if [[ -n "$KEEP_LAST" ]]; then
      cleanup_old_backups
    fi
    if [[ "$UPLOAD_S3" == "true" ]]; then
      info "[DRY RUN] Would upload to S3 bucket: ${S3_BUCKET}"
    fi
    echo ""
    echo "=========================================="
    echo -e " ${BLUE}Dry run complete (no changes made)${NC}"
    echo "=========================================="
    return 0
  fi

  # Create backup directory
  mkdir -p "$BACKUP_DIR" || fail "Cannot create backup directory: ${BACKUP_DIR}"
  info "Backup directory: ${BACKUP_DIR}"

  # Backup WAL (append-only, safe to copy live)
  info "Copying WAL directory..."
  rsync -a "${WAL_DIR}/" "${BACKUP_DIR}/wal/" || fail "Failed to copy WAL directory"
  local wal_files
  wal_files=$(find "${BACKUP_DIR}/wal" -type f | wc -l)
  wal_files=${wal_files//[[:space:]]/}
  local wal_size
  wal_size=$(du -sh "${BACKUP_DIR}/wal" | cut -f1)
  success "WAL: ${wal_files} files, ${wal_size}"

  # Backup DB (unless --wal-only)
  local db_files=0
  local db_size="0"
  if [[ "$WAL_ONLY" == "false" ]]; then
    info "Copying DB directory..."
    rsync -a "${DB_DIR}/" "${BACKUP_DIR}/db/" || fail "Failed to copy DB directory"
    db_files=$(find "${BACKUP_DIR}/db" -type f | wc -l)
    db_files=${db_files//[[:space:]]/}
    db_size=$(du -sh "${BACKUP_DIR}/db" | cut -f1)
    success "DB: ${db_files} files, ${db_size}"
  else
    info "Skipping DB (--wal-only)"
  fi

  # Compute total size
  local total_size
  total_size=$(du -sh "$BACKUP_DIR" | cut -f1)

  # Gather metadata values. Each lookup falls back to a placeholder instead
  # of aborting the backup or leaving an empty field.
  local created_at host_name stemedb_version
  created_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  host_name=$(hostname 2>/dev/null) || host_name="${HOSTNAME:-unknown}"
  stemedb_version=""
  if command -v cargo &> /dev/null; then
    stemedb_version=$(cargo metadata --format-version=1 --no-deps 2>/dev/null \
      | grep -o '"stemedb-api","version":"[^"]*"' | head -1 | cut -d'"' -f6) || true
  fi
  if [[ -z "$stemedb_version" ]]; then
    stemedb_version="unknown"
  fi

  # Write metadata
  cat > "${BACKUP_DIR}/backup-metadata.json" <<METADATA || fail "Failed to write backup metadata"
{
  "timestamp": "${created_at}",
  "source_wal_dir": "$(_json_escape "$WAL_DIR")",
  "source_db_dir": "$(_json_escape "$DB_DIR")",
  "wal_only": ${WAL_ONLY},
  "wal_file_count": ${wal_files},
  "db_file_count": ${db_files},
  "total_size": "$(_json_escape "$total_size")",
  "hostname": "$(_json_escape "$host_name")",
  "stemedb_version": "$(_json_escape "$stemedb_version")"
}
METADATA
  success "Metadata written"

  # The local backup is complete from here on. Disarm the EXIT trap so that a
  # failure in metrics export, retention cleanup or the S3 upload cannot
  # delete a valid backup.
  trap - EXIT

  # Write metrics
  write_backup_metrics

  # Cleanup old backups if retention policy specified
  if [[ -n "$KEEP_LAST" ]]; then
    cleanup_old_backups
  fi

  # Upload to S3 if requested
  if [[ "$UPLOAD_S3" == "true" ]]; then
    upload_to_s3
  fi

  # Summary
  echo ""
  echo "=========================================="
  echo -e " ${GREEN}Backup complete${NC}"
  echo "=========================================="
  echo ""
  echo " Location: ${BACKUP_DIR}"
  echo " WAL files: ${wal_files} (${wal_size})"
  if [[ "$WAL_ONLY" == "false" ]]; then
    echo " DB files: ${db_files} (${db_size})"
  fi
  echo " Total: ${total_size}"
  if [[ "$UPLOAD_S3" == "true" && -n "$S3_BUCKET" ]]; then
    echo " S3 Upload: s3://${S3_BUCKET}/$(basename "$BACKUP_DIR")"
  fi
  echo ""
  echo "Restore with:"
  echo " ./scripts/restore-stemedb.sh ${BACKUP_DIR}"
  echo ""
}
# Entry point: run the backup with the script's command-line arguments.
main "$@"