This commit implements comprehensive production hardening across multiple layers to prepare StemeDB for enterprise pilot deployments: ## API Layer - Add rate limiting middleware with configurable limits per endpoint - Enhance error handling with detailed context and proper HTTP status codes - Add security hardening tests for input validation and boundary conditions - Create store_helpers module for defensive storage access patterns ## Storage & WAL - Optimize group commit batching for higher throughput - Add defensive error handling in hybrid backend with proper fallbacks - Enhance WAL journal durability guarantees with fsync validation - Improve index store query performance with better caching ## Operations & Deployment - Add comprehensive operations documentation (deployment, monitoring, DR) - Create systemd units for backup, WAL archival, and verification - Add monitoring configs (Prometheus alerts, metrics exporters) - Implement backup/restore scripts with verification and S3 archival - Add DR drill automation and runbook procedures - Create load balancer configs (nginx, envoy) with health checks ## Documentation - Update CLAUDE.md with operations and troubleshooting guides - Expand roadmap with production readiness milestones - Add pilot success criteria and deployment reference architecture - Document TLS setup, monitoring integration, and incident response ## Configuration - Add .env.example with all required environment variables - Document resource sizing for different deployment scales - Add configuration examples for various deployment topologies This positions StemeDB for successful enterprise pilots with proper operational discipline, monitoring, backup/DR, and security hardening. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
268 lines
7.2 KiB
Bash
Executable File
268 lines
7.2 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
#
|
|
# StemeDB WAL Archival to S3
|
|
#
|
|
# Ships WAL segments to S3 every 15 minutes to achieve RPO=15min.
|
|
# Tracks archival state to avoid re-uploading already archived segments.
|
|
#
|
|
# Usage:
|
|
# ./scripts/archive-wal-to-s3.sh
|
|
#
|
|
# Exit codes:
|
|
# 0 - Archival completed successfully (or nothing to archive)
|
|
# 1 - Archival failed
|
|
#
|
|
|
|
# Strict mode: abort on unhandled command failures (-e), on use of unset
# variables (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Configuration
#
# Paths derived from this script's own location. The assignment is kept
# separate from `readonly` so that a failing `cd`/`dirname` aborts the script
# under `set -e` instead of being masked by the exit status of `readonly`.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
readonly PROJECT_DIR

# Settings that can be overridden from the environment:
#   STEMEDB_WAL_DIR - directory scanned for *.wal segments
#   STATE_FILE      - JSON file that records the archival watermark
#   AWS_S3_BUCKET   - destination bucket (empty by default; main rejects empty)
#   AWS_S3_PREFIX   - key prefix inside the bucket
#   METRICS_DIR     - directory the Prometheus metrics file is written to
readonly WAL_DIR="${STEMEDB_WAL_DIR:-${PROJECT_DIR}/data/wal}"
readonly STATE_FILE="${STATE_FILE:-/var/lib/stemedb/wal-archival-state.json}"
readonly S3_BUCKET="${AWS_S3_BUCKET:-}"
readonly S3_PREFIX="${AWS_S3_PREFIX:-wal-archive}"
readonly METRICS_DIR="${METRICS_DIR:-/var/lib/node_exporter/textfile_collector}"
|
|
|
|
# Colour escape sequences used by the log helpers and the summary banner.
# They default to empty strings and are only populated when stdout is a
# terminal, so redirected output stays free of escape codes.
RED=''
GREEN=''
YELLOW=''
BLUE=''
NC=''
if [[ -t 1 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
fi
|
|
|
|
# Logging helpers
#
# Each helper prints one line: a coloured tag followed by all of its arguments
# joined by spaces. Only the colour variables are expanded with %b; the message
# itself is printed with %s so backslashes in it (e.g. in a file name) are
# emitted literally instead of being interpreted as escape sequences.
#
#   info, success - progress messages, written to stdout
#   warn          - diagnostic, written to stderr
#   fail          - diagnostic written to stderr, then exits the script with 1
info() { printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$*"; }
success() { printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$*" >&2; }
fail() {
  printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$*" >&2
  exit 1
}
|
|
|
|
# Load archival state
#
# Prints the archival state JSON on stdout.
# Globals:   STATE_FILE (read)
# Outputs:   the contents of STATE_FILE when it is a regular file; otherwise a
#            default document with an empty watermark and zeroed counters.
load_state() {
  # No state recorded yet: report an empty watermark.
  if [[ ! -f "$STATE_FILE" ]]; then
    echo '{"last_archived_segment": "", "last_archival_timestamp": 0, "total_segments_archived": 0}'
    return
  fi

  cat "$STATE_FILE"
}
|
|
|
|
# Save archival state
#
# Persists the archival watermark as a JSON document in STATE_FILE, creating
# the parent directory if needed.
# Globals:   STATE_FILE (read)
# Arguments: $1 - name of the last archived segment
#            $2 - running total of archived segments
# Returns:   0 on success, 1 if the state could not be written
save_state() {
  local last_segment="$1"
  local total_archived="$2"
  # Sibling temp file: same directory as the target so the final rename stays
  # on one filesystem.
  local tmp_file="${STATE_FILE}.tmp.$$"

  mkdir -p "$(dirname "$STATE_FILE")"

  # Write the complete document to the temp file first. Redirecting straight
  # into STATE_FILE would truncate it up front, so an interrupted run could
  # leave an empty or partial watermark behind.
  if ! cat > "$tmp_file" <<STATE
{
  "last_archived_segment": "$last_segment",
  "last_archival_timestamp": $(date +%s),
  "total_segments_archived": $total_archived
}
STATE
  then
    rm -f -- "$tmp_file"
    return 1
  fi

  # Rename into place so readers see either the old or the new document.
  if ! mv -f -- "$tmp_file" "$STATE_FILE"; then
    rm -f -- "$tmp_file"
    return 1
  fi
}
|
|
|
|
# Get list of WAL segments to archive
#
# Globals:   WAL_DIR (read)
# Arguments: $1 - name of the last archived segment ("" when nothing has been
#                 archived yet)
# Outputs:   one path per line, in sorted order, for every *.wal file under
#            WAL_DIR whose name sorts after $1, excluding the last candidate.
#            Prints nothing at all when there is nothing to archive.
get_segments_to_archive() {
  local last_archived="$1"
  local -a segments=()
  local wal_file name

  # Collect all .wal files in sorted order (NUL-delimited to be safe with
  # unusual file names).
  while IFS= read -r -d '' wal_file; do
    name=$(basename "$wal_file")

    # Skip everything at or before the watermark.
    if [[ -n "$last_archived" ]] \
      && [[ "$name" < "$last_archived" || "$name" == "$last_archived" ]]; then
      continue
    fi

    segments+=("$wal_file")
  done < <(find "$WAL_DIR" -name "*.wal" -type f -print0 | sort -z)

  # The last candidate is treated as the active segment (it may still be
  # being written), so it is never archived. With zero or one candidate there
  # is nothing left; return without printing, because `printf '%s\n'` with no
  # arguments would emit a blank line that callers read as an empty path.
  local count=${#segments[@]}
  if ((count <= 1)); then
    return 0
  fi

  printf '%s\n' "${segments[@]:0:count-1}"
}
|
|
|
|
# Upload segment to S3
#
# Copies one WAL file to s3://S3_BUCKET/S3_PREFIX/<file name> with the AWS CLI.
# Globals:   S3_BUCKET, S3_PREFIX (read); AWS_REGION (read, default us-east-1)
# Arguments: $1 - path of the WAL file to upload
# Returns:   0 if the copy succeeded, 1 otherwise
upload_segment() {
  local wal_file="$1"
  local segment_name
  segment_name=$(basename "$wal_file")

  local destination="s3://${S3_BUCKET}/${S3_PREFIX}/${segment_name}"
  local -a cp_options=(
    --storage-class STANDARD_IA
    --region "${AWS_REGION:-us-east-1}"
    --only-show-errors
  )

  info "Uploading: ${segment_name}"

  # Guard clause: report the failure and bail out early.
  if ! aws s3 cp "$wal_file" "$destination" "${cp_options[@]}"; then
    warn "Upload failed: ${segment_name}"
    return 1
  fi

  success "Uploaded: ${destination}"
  return 0
}
|
|
|
|
# Calculate archival lag
#
# Prints the number of seconds between the file's modification time and now.
# The lag is informational, so this function never fails: when the
# modification time cannot be determined (for example the file no longer
# exists) it prints 0, and a modification time in the future is clamped to 0.
# Arguments: $1 - path of the WAL file
# Outputs:   a non-negative integer on stdout
# Returns:   0 always
calculate_archival_lag() {
  local wal_file="$1"
  local wal_mtime=""
  local now lag

  # Try GNU stat first, then the BSD/macOS form. If both fail, fall through
  # with an empty value instead of aborting the caller under `set -e`.
  wal_mtime=$(stat -c %Y "$wal_file" 2>/dev/null) \
    || wal_mtime=$(stat -f %m "$wal_file" 2>/dev/null) \
    || wal_mtime=""

  # A failed stat can still leave partial, non-numeric output behind.
  if [[ ! "$wal_mtime" =~ ^[0-9]+$ ]]; then
    echo 0
    return 0
  fi

  now=$(date +%s)
  lag=$((now - wal_mtime))
  if ((lag < 0)); then
    lag=0
  fi

  echo "$lag"
}
|
|
|
|
# Write Prometheus metrics
#
# Writes the result of this run to METRICS_DIR/stemedb_wal_archival.prom in
# the Prometheus text exposition format.
# Globals:   METRICS_DIR (read)
# Arguments: $1 - number of segments uploaded in this run
#            $2 - number of segments that failed to upload
#            $3 - maximum archival lag in seconds
# Returns:   0 on success, 1 if the metrics file could not be written
# NOTE(review): the two *_total series are declared as counters but carry
# per-run values that reset every run; left unchanged here because existing
# alerts/dashboards may depend on the declared type. Confirm and revisit.
write_metrics() {
  local segments_uploaded="$1"
  local segments_failed="$2"
  local max_lag="$3"

  local metrics_file="${METRICS_DIR}/stemedb_wal_archival.prom"
  # Temp name deliberately does not end in ".prom", so a collector that only
  # picks up *.prom files ignores it while it is being written.
  local tmp_file="${metrics_file}.$$"

  # Directory creation is best-effort; if it is missing or unwritable the
  # write below fails and reports the real error.
  mkdir -p "$(dirname "$metrics_file")" 2>/dev/null || true

  if ! cat > "$tmp_file" <<METRICS
# HELP stemedb_wal_archival_last_run_timestamp Unix timestamp of last archival run
# TYPE stemedb_wal_archival_last_run_timestamp gauge
stemedb_wal_archival_last_run_timestamp $(date +%s)

# HELP stemedb_wal_archival_segments_uploaded_total Number of segments uploaded in last run
# TYPE stemedb_wal_archival_segments_uploaded_total counter
stemedb_wal_archival_segments_uploaded_total $segments_uploaded

# HELP stemedb_wal_archival_segments_failed_total Number of segments that failed to upload
# TYPE stemedb_wal_archival_segments_failed_total counter
stemedb_wal_archival_segments_failed_total $segments_failed

# HELP stemedb_wal_archival_lag_seconds Time between WAL creation and S3 upload (max across segments)
# TYPE stemedb_wal_archival_lag_seconds gauge
stemedb_wal_archival_lag_seconds $max_lag
METRICS
  then
    rm -f -- "$tmp_file"
    return 1
  fi

  # Rename into place so a concurrent scrape never reads a half-written file.
  if ! mv -f -- "$tmp_file" "$metrics_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi

  success "Metrics written to: ${metrics_file}"
}
|
|
|
|
#######################################
# Archive completed WAL segments to S3 and record the outcome.
#
# Validates configuration, loads the archival watermark, uploads every pending
# segment in order, saves the new watermark, writes metrics and prints a
# summary.
# Globals:   S3_BUCKET, S3_PREFIX, WAL_DIR, STATE_FILE, GREEN, YELLOW, NC (read)
# Outputs:   progress and summary on stdout
# Returns:   0 when every attempted upload succeeded (or nothing was pending);
#            exits 1 on invalid configuration, unreadable state, or a failed
#            upload
#######################################
main() {
  echo ""
  echo "=========================================="
  echo " StemeDB WAL Archival to S3"
  echo "=========================================="
  echo ""

  # Validate configuration
  if [[ -z "$S3_BUCKET" ]]; then
    fail "S3 bucket not specified (set AWS_S3_BUCKET environment variable)"
  fi

  if ! command -v aws &> /dev/null; then
    fail "AWS CLI not found. Install with: apt install awscli"
  fi

  if [[ ! -d "$WAL_DIR" ]]; then
    fail "WAL directory not found: ${WAL_DIR}"
  fi

  # Load state. grep exits non-zero when a key is missing; with pipefail and
  # errexit that would end the script without any message, so report it.
  local state last_archived total_archived
  state=$(load_state)
  last_archived=$(echo "$state" \
    | grep -o '"last_archived_segment": "[^"]*"' | cut -d'"' -f4) \
    || fail "Cannot read last_archived_segment from state file: ${STATE_FILE}"
  total_archived=$(echo "$state" \
    | grep -o '"total_segments_archived": [0-9]*' | cut -d: -f2 | tr -d ' ') \
    || fail "Cannot read total_segments_archived from state file: ${STATE_FILE}"
  total_archived=${total_archived:-0}

  info "Last archived: ${last_archived:-none}"
  info "Total archived: ${total_archived}"

  # Get segments to archive. Blank lines are dropped so that an empty result
  # is never mistaken for one segment with an empty path.
  local -a segments=()
  local wal_file
  while IFS= read -r wal_file; do
    if [[ -n "$wal_file" ]]; then
      segments+=("$wal_file")
    fi
  done < <(get_segments_to_archive "$last_archived")

  if [[ ${#segments[@]} -eq 0 ]]; then
    info "No new segments to archive"
    write_metrics 0 0 0
    return 0
  fi

  info "Found ${#segments[@]} segment(s) to archive"

  # Upload segments
  local uploaded=0
  local failed=0
  local max_lag=0
  local lag
  local new_last_archived=""

  for wal_file in "${segments[@]}"; do
    if ! upload_segment "$wal_file"; then
      # Counters use $((...)) assignments: `((n++))` returns status 1 when n
      # was 0, which aborts the script under `set -e`.
      failed=$((failed + 1))
      # Stop at the first failure. The watermark is a single "last archived"
      # name, so archiving a later segment would move it past this one and
      # the failed segment would never be retried.
      warn "Stopping after failed upload; $((${#segments[@]} - uploaded - failed)) later segment(s) deferred to the next run"
      break
    fi

    uploaded=$((uploaded + 1))
    new_last_archived=$(basename "$wal_file")

    # Track archival lag. The lag is informational only, so a failure to
    # compute it must not abort the run before the state is saved.
    lag=$(calculate_archival_lag "$wal_file") || lag=0
    if [[ "$lag" =~ ^-?[0-9]+$ ]] && ((lag > max_lag)); then
      max_lag=$lag
    fi
  done

  # Update state
  if [[ -n "$new_last_archived" ]]; then
    total_archived=$((total_archived + uploaded))
    save_state "$new_last_archived" "$total_archived"
  fi

  # Write metrics
  write_metrics "$uploaded" "$failed" "$max_lag"

  # Summary
  echo ""
  echo "=========================================="
  if [[ $failed -eq 0 ]]; then
    echo -e " ${GREEN}Archival complete${NC}"
  else
    echo -e " ${YELLOW}Archival completed with errors${NC}"
  fi
  echo "=========================================="
  echo ""
  echo " Uploaded: ${uploaded}"
  echo " Failed: ${failed}"
  echo " Max lag: ${max_lag}s"
  echo " S3 path: s3://${S3_BUCKET}/${S3_PREFIX}/"
  echo ""

  if [[ $failed -gt 0 ]]; then
    exit 1
  fi
}
|
|
|
|
# Entry point: run main, forwarding the script's command-line arguments.
main "$@"
|