stemedb/scripts/archive-wal-to-s3.sh
jml 3e7eddc074 feat: add enterprise production readiness infrastructure
This commit implements comprehensive production hardening across multiple
layers to prepare StemeDB for enterprise pilot deployments:

## API Layer
- Add rate limiting middleware with configurable limits per endpoint
- Enhance error handling with detailed context and proper HTTP status codes
- Add security hardening tests for input validation and boundary conditions
- Create store_helpers module for defensive storage access patterns

## Storage & WAL
- Optimize group commit batching for higher throughput
- Add defensive error handling in hybrid backend with proper fallbacks
- Enhance WAL journal durability guarantees with fsync validation
- Improve index store query performance with better caching

## Operations & Deployment
- Add comprehensive operations documentation (deployment, monitoring, DR)
- Create systemd units for backup, WAL archival, and verification
- Add monitoring configs (Prometheus alerts, metrics exporters)
- Implement backup/restore scripts with verification and S3 archival
- Add DR drill automation and runbook procedures
- Create load balancer configs (nginx, envoy) with health checks

## Documentation
- Update CLAUDE.md with operations and troubleshooting guides
- Expand roadmap with production readiness milestones
- Add pilot success criteria and deployment reference architecture
- Document TLS setup, monitoring integration, and incident response

## Configuration
- Add .env.example with all required environment variables
- Document resource sizing for different deployment scales
- Add configuration examples for various deployment topologies

This positions StemeDB for successful enterprise pilots with proper
operational discipline, monitoring, backup/DR, and security hardening.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 06:08:15 +00:00

268 lines
7.2 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# StemeDB WAL Archival to S3
#
# Ships WAL segments to S3 every 15 minutes to achieve RPO=15min.
# Tracks archival state to avoid re-uploading already archived segments.
#
# Usage:
# ./scripts/archive-wal-to-s3.sh
#
# Exit codes:
# 0 - Archival completed successfully (or nothing to archive)
# 1 - Archival failed
#
set -euo pipefail

# Configuration
#
# All settings are frozen with `readonly` once resolved. The environment
# variables read here are:
#   STEMEDB_WAL_DIR - directory holding *.wal segments (default: <project>/data/wal)
#   STATE_FILE      - JSON file recording archival progress
#   AWS_S3_BUCKET   - destination bucket (empty default; main() rejects an empty value)
#   AWS_S3_PREFIX   - key prefix inside the bucket (default: wal-archive)
#   METRICS_DIR     - directory that receives the Prometheus .prom file
#
# SCRIPT_DIR and PROJECT_DIR are assigned first and marked readonly afterwards:
# `readonly VAR="$(cmd)"` returns the status of `readonly`, which would hide a
# failing `cd`/`dirname` from `set -e` and leave the paths silently empty.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
readonly PROJECT_DIR
readonly WAL_DIR="${STEMEDB_WAL_DIR:-${PROJECT_DIR}/data/wal}"
readonly STATE_FILE="${STATE_FILE:-/var/lib/stemedb/wal-archival-state.json}"
readonly S3_BUCKET="${AWS_S3_BUCKET:-}"
readonly S3_PREFIX="${AWS_S3_PREFIX:-wal-archive}"
readonly METRICS_DIR="${METRICS_DIR:-/var/lib/node_exporter/textfile_collector}"
# Colors (if terminal supports it)
# Default every colour variable to the empty string, then switch the ANSI
# escape sequences on only when stdout (fd 1) is attached to a terminal.
# The values are stored as literal backslash sequences; they are turned into
# real escape bytes later by whichever command prints them.
RED='' GREEN='' YELLOW='' BLUE='' NC=''
if [[ -t 1 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
fi
# Logging helpers
#
# Each helper prints one line to stdout: a coloured severity tag followed by
# all arguments joined into a single string. Backslash escapes in the whole
# line (tag colours and message alike) are interpreted when printing.

# Informational progress message.
info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $*"
}

# Confirmation that a step finished successfully.
success() {
  printf '%b\n' "${GREEN}[OK]${NC} $*"
}

# Non-fatal problem; execution continues.
warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $*"
}

# Fatal problem: print the message, then terminate the shell with status 1.
fail() {
  printf '%b\n' "${RED}[FAIL]${NC} $*"
  exit 1
}
# Load archival state
#
# Prints the archival state JSON to stdout.
# Globals: STATE_FILE (read)
# Outputs: the contents of STATE_FILE when it is a regular file; otherwise a
#          default document with an empty segment name and zeroed counters.
load_state() {
  if [[ ! -f "$STATE_FILE" ]]; then
    # Nothing recorded yet: hand back the initial state.
    printf '%s\n' '{"last_archived_segment": "", "last_archival_timestamp": 0, "total_segments_archived": 0}'
    return
  fi

  cat "$STATE_FILE"
}
# Save archival state
#
# Persists archival progress as a small JSON document in STATE_FILE.
# Globals:   STATE_FILE (read; the file it names is replaced)
# Arguments: $1 - name of the most recently archived segment
#            $2 - running total of archived segments
# Returns:   0 on success, 1 if the directory or file could not be written
#
# The document is written to a sibling temp file and then renamed over
# STATE_FILE, so an interrupted run cannot leave a truncated state file behind
# for the next run to parse.
save_state() {
  local last_segment="$1"
  local total_archived="$2"
  local tmp_file="${STATE_FILE}.tmp.$$"

  mkdir -p "$(dirname "$STATE_FILE")" || return 1

  if ! cat > "$tmp_file" <<STATE
{
"last_archived_segment": "$last_segment",
"last_archival_timestamp": $(date +%s),
"total_segments_archived": $total_archived
}
STATE
  then
    rm -f -- "$tmp_file"
    return 1
  fi

  # Same directory as the target, so the rename stays on one filesystem.
  if ! mv -f -- "$tmp_file" "$STATE_FILE"; then
    rm -f -- "$tmp_file"
    return 1
  fi
}
# Get list of WAL segments to archive
#
# Lists the *.wal files under WAL_DIR that still need uploading.
# Globals:   WAL_DIR (read)
# Arguments: $1 - basename of the last archived segment ("" if none yet)
# Outputs:   one path per line, in the order produced by `find | sort -z`;
#            prints nothing at all when there is nothing to archive
# Returns:   0
#
# Selection rules:
#   * segments whose basename compares less than or equal to $1 are skipped;
#   * the last remaining candidate is always withheld, because it is likely
#     the segment still being written.
#
# Paths are emitted newline-separated, so file names containing a newline are
# not supported by callers that read this output line by line.
get_segments_to_archive() {
  local last_archived="$1"
  local -a segments=()
  local wal_file segment_name

  while IFS= read -r -d '' wal_file; do
    segment_name=$(basename "$wal_file")

    # Skip everything up to and including the last archived segment.
    if [[ -n "$last_archived" \
      && ( "$segment_name" < "$last_archived" || "$segment_name" == "$last_archived" ) ]]; then
      continue
    fi

    segments+=("$wal_file")
  done < <(find "$WAL_DIR" -name "*.wal" -type f -print0 | sort -z)

  # With zero or one candidate there is nothing safe to ship: a single
  # candidate could be the active segment. Return without printing, so the
  # caller does not receive a blank line that looks like a segment path.
  local count=${#segments[@]}
  if (( count <= 1 )); then
    return 0
  fi

  # Withhold the newest candidate (it's likely still being written).
  unset 'segments[count-1]'

  printf '%s\n' "${segments[@]}"
}
# Upload segment to S3
#
# Copies one WAL file to s3://S3_BUCKET/S3_PREFIX/<basename> with `aws s3 cp`.
# Globals:   S3_BUCKET, S3_PREFIX (read); AWS_REGION (read, default us-east-1)
# Arguments: $1 - path of the WAL file to upload
# Outputs:   progress and result lines via info/success/warn
# Returns:   0 when the copy succeeded, 1 when it failed
upload_segment() {
  local wal_file="$1"
  local segment_name
  segment_name=$(basename "$wal_file")
  local destination="s3://${S3_BUCKET}/${S3_PREFIX}/${segment_name}"

  # Options passed to the copy: STANDARD_IA storage class, explicit region,
  # and error-only output.
  local -a cp_options=(
    --storage-class STANDARD_IA
    --region "${AWS_REGION:-us-east-1}"
    --only-show-errors
  )

  info "Uploading: ${segment_name}"

  if ! aws s3 cp "$wal_file" "$destination" "${cp_options[@]}"; then
    warn "Upload failed: ${segment_name}"
    return 1
  fi

  success "Uploaded: ${destination}"
  return 0
}
# Calculate archival lag
#
# Prints how many seconds have passed between the file's modification time and
# the moment this function runs.
# Arguments: $1 - path of the WAL file
# Outputs:   the age in whole seconds on stdout
calculate_archival_lag() {
  local wal_file="$1"
  local modified_at
  local current_time

  # Try the GNU stat syntax first, then the BSD/macOS one; errors from the
  # variant that does not apply are discarded.
  modified_at=$(
    stat -c %Y "$wal_file" 2>/dev/null \
      || stat -f %m "$wal_file" 2>/dev/null
  )
  current_time=$(date +%s)

  printf '%s\n' "$((current_time - modified_at))"
}
# Write Prometheus metrics
#
# Publishes the outcome of this run as METRICS_DIR/stemedb_wal_archival.prom.
# Globals:   METRICS_DIR (read)
# Arguments: $1 - segments uploaded in this run
#            $2 - segments that failed to upload in this run
#            $3 - largest archival lag seen in this run, in seconds
# Returns:   0 on success, 1 if the metrics file could not be written
#
# The content is written to a temp file whose name does not end in ".prom" and
# is then renamed into place, so a scraper reading the directory never sees a
# half-written metrics file. The temp file is created by plain redirection
# (not mktemp) so it gets the normal umask permissions and remains readable by
# a scraper running as another user.
#
# NOTE(review): the two *_total series are declared as counters but carry
# per-run values that are overwritten on every run — confirm that dashboards
# and alert rules expect this.
write_metrics() {
  local segments_uploaded="$1"
  local segments_failed="$2"
  local max_lag="$3"
  local metrics_file="${METRICS_DIR}/stemedb_wal_archival.prom"
  local tmp_file="${metrics_file}.$$"

  # Best effort: the directory usually exists already; a real problem shows
  # up as a write failure just below.
  mkdir -p "$(dirname "$metrics_file")" 2>/dev/null || true

  if ! cat > "$tmp_file" <<METRICS
# HELP stemedb_wal_archival_last_run_timestamp Unix timestamp of last archival run
# TYPE stemedb_wal_archival_last_run_timestamp gauge
stemedb_wal_archival_last_run_timestamp $(date +%s)
# HELP stemedb_wal_archival_segments_uploaded_total Number of segments uploaded in last run
# TYPE stemedb_wal_archival_segments_uploaded_total counter
stemedb_wal_archival_segments_uploaded_total $segments_uploaded
# HELP stemedb_wal_archival_segments_failed_total Number of segments that failed to upload
# TYPE stemedb_wal_archival_segments_failed_total counter
stemedb_wal_archival_segments_failed_total $segments_failed
# HELP stemedb_wal_archival_lag_seconds Time between WAL creation and S3 upload (max across segments)
# TYPE stemedb_wal_archival_lag_seconds gauge
stemedb_wal_archival_lag_seconds $max_lag
METRICS
  then
    rm -f -- "$tmp_file"
    return 1
  fi

  if ! mv -f -- "$tmp_file" "$metrics_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi

  success "Metrics written to: ${metrics_file}"
}
# Orchestrates one archival run: validate configuration, read the saved
# state, upload pending segments in order, persist the new state, publish
# metrics and print a summary.
# Globals:   S3_BUCKET, S3_PREFIX, WAL_DIR, GREEN, YELLOW, NC (read)
# Outputs:   banner, progress and summary on stdout
# Returns:   0 when nothing failed (including "nothing to archive");
#            exits the shell with status 1 on a configuration error or when
#            an upload failed
#
# Implementation notes:
#   * Counters use `x=$((x + 1))`. The form `((x++))` returns status 1 when
#     the old value is 0, which terminates the script under `set -e`.
#   * Uploading stops at the first failure. Saved state is a single
#     "last archived segment" high-water mark, so continuing past a failed
#     segment would move the mark beyond it and the segment would never be
#     retried.
#   * Blank lines from the segment lister are ignored, so an empty result is
#     never mistaken for a segment path.
main() {
  echo ""
  echo "=========================================="
  echo " StemeDB WAL Archival to S3"
  echo "=========================================="
  echo ""

  # Validate configuration
  if [[ -z "$S3_BUCKET" ]]; then
    fail "S3 bucket not specified (set AWS_S3_BUCKET environment variable)"
  fi
  if ! command -v aws &> /dev/null; then
    fail "AWS CLI not found. Install with: apt install awscli"
  fi
  if [[ ! -d "$WAL_DIR" ]]; then
    fail "WAL directory not found: ${WAL_DIR}"
  fi

  # Load state. Fields are pulled out with bash regexes so that an
  # unparsable state file yields defaults and a warning instead of a silent
  # abort from a non-matching grep under `pipefail`.
  local state
  state=$(load_state)
  local last_archived=""
  local total_archived=0
  local segment_re='"last_archived_segment":[[:space:]]*"([^"]*)"'
  local total_re='"total_segments_archived":[[:space:]]*([0-9]+)'
  if [[ "$state" =~ $segment_re ]]; then
    last_archived="${BASH_REMATCH[1]}"
  else
    warn "Could not read last_archived_segment from state; treating all segments as unarchived"
  fi
  if [[ "$state" =~ $total_re ]]; then
    total_archived="${BASH_REMATCH[1]}"
  fi
  info "Last archived: ${last_archived:-none}"
  info "Total archived: ${total_archived}"

  # Get segments to archive (dropping any blank lines)
  local -a segments=()
  local candidate
  while IFS= read -r candidate; do
    if [[ -n "$candidate" ]]; then
      segments+=("$candidate")
    fi
  done < <(get_segments_to_archive "$last_archived")

  if [[ ${#segments[@]} -eq 0 ]]; then
    info "No new segments to archive"
    write_metrics 0 0 0
    return 0
  fi
  info "Found ${#segments[@]} segment(s) to archive"

  # Upload segments in order, stopping at the first failure
  local uploaded=0
  local failed=0
  local max_lag=0
  local new_last_archived=""
  local wal_file lag
  for wal_file in "${segments[@]}"; do
    if ! upload_segment "$wal_file"; then
      failed=$((failed + 1))
      warn "Stopping at first failed upload; $(( ${#segments[@]} - uploaded - failed )) later segment(s) deferred to the next run"
      break
    fi

    uploaded=$((uploaded + 1))
    new_last_archived=$(basename "$wal_file")

    # Track archival lag (largest value across uploaded segments)
    lag=$(calculate_archival_lag "$wal_file")
    if [[ $lag -gt $max_lag ]]; then
      max_lag=$lag
    fi
  done

  # Update state (only when at least one segment was uploaded).
  # 10# forces base 10 so a zero-padded total is not read as octal.
  if [[ -n "$new_last_archived" ]]; then
    total_archived=$((10#$total_archived + uploaded))
    save_state "$new_last_archived" "$total_archived"
  fi

  # Write metrics
  write_metrics "$uploaded" "$failed" "$max_lag"

  # Summary
  echo ""
  echo "=========================================="
  if [[ $failed -eq 0 ]]; then
    echo -e " ${GREEN}Archival complete${NC}"
  else
    echo -e " ${YELLOW}Archival completed with errors${NC}"
  fi
  echo "=========================================="
  echo ""
  echo " Uploaded: ${uploaded}"
  echo " Failed: ${failed}"
  echo " Max lag: ${max_lag}s"
  echo " S3 path: s3://${S3_BUCKET}/${S3_PREFIX}/"
  echo ""

  if [[ $failed -gt 0 ]]; then
    exit 1
  fi
}
# Script entry point: pass every command-line argument through to main.
main "$@"