stemedb/scripts/archive-wal-to-s3.sh

#!/usr/bin/env bash
#
# StemeDB WAL Archival to S3
#
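# Ships completed WAL segments to S3. The script performs a single pass and
# exits, so it must be invoked every 15 minutes by an external scheduler to
# achieve RPO=15min.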
# Tracks archival state to avoid re-uploading already archived segments.
#
# Usage:
#   ./scripts/archive-wal-to-s3.sh
#
# Exit codes:
#   0 - Archival completed successfully (or nothing to archive)
#   1 - Archival failed
#

# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Configuration
# Values computed by a command substitution are assigned first and marked
# readonly afterwards; `readonly NAME="$(cmd)"` would report the status of
# `readonly` itself and hide a failure of cmd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
readonly PROJECT_DIR
# Directory scanned for *.wal segments (override: STEMEDB_WAL_DIR).
readonly WAL_DIR="${STEMEDB_WAL_DIR:-${PROJECT_DIR}/data/wal}"
# JSON file recording the last archived segment (override: STATE_FILE).
readonly STATE_FILE="${STATE_FILE:-/var/lib/stemedb/wal-archival-state.json}"
# Destination bucket (AWS_S3_BUCKET); empty when unset, which main rejects.
readonly S3_BUCKET="${AWS_S3_BUCKET:-}"
# Key prefix inside the bucket (override: AWS_S3_PREFIX).
readonly S3_PREFIX="${AWS_S3_PREFIX:-wal-archive}"
# Directory that receives the Prometheus textfile metrics (override: METRICS_DIR).
readonly METRICS_DIR="${METRICS_DIR:-/var/lib/node_exporter/textfile_collector}"

# Color escape sequences used by the logging helpers.
# They default to empty strings and are only filled in when stdout is a terminal.
RED=''
GREEN=''
YELLOW=''
BLUE=''
NC=''
if [[ -t 1 ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    NC='\033[0m'
fi

# Logging helpers
# Print an [INFO]-tagged message (all arguments joined) to stdout.
info() {
    local message="$*"
    echo -e "${BLUE}[INFO]${NC} ${message}"
}
# Print an [OK]-tagged message (all arguments joined) to stdout.
success() {
    local message="$*"
    echo -e "${GREEN}[OK]${NC} ${message}"
}
# Print a [WARN]-tagged message (all arguments joined) to stdout.
warn() {
    local message="$*"
    echo -e "${YELLOW}[WARN]${NC} ${message}"
}
# Print a [FAIL]-tagged message (all arguments joined) to stdout, then
# terminate the shell with exit status 1.
fail() {
    local message="$*"
    echo -e "${RED}[FAIL]${NC} ${message}"
    exit 1
}

# Emit the archival state JSON on stdout.
# Prints the contents of STATE_FILE when it exists as a regular file; otherwise
# prints an initial document with no archived segment and zeroed counters.
load_state() {
    if [[ ! -f "$STATE_FILE" ]]; then
        echo '{"last_archived_segment": "", "last_archival_timestamp": 0, "total_segments_archived": 0}'
        return
    fi

    cat "$STATE_FILE"
}

# Persist archival progress to STATE_FILE as a small JSON document.
# Arguments:
#   $1 - basename of the most recently archived segment
#   $2 - running total of archived segments
# Returns:
#   0 on success, 1 if the directory, the temporary file or the rename failed.
# The document is written to a sibling temporary file and then renamed over
# STATE_FILE, so an interrupted run cannot leave a truncated state file behind.
save_state() {
    local last_segment="$1"
    local total_archived="$2"
    local tmp_file="${STATE_FILE}.tmp.$$"

    mkdir -p "$(dirname "$STATE_FILE")" || return 1

    if ! cat > "$tmp_file" <<STATE
{
  "last_archived_segment": "$last_segment",
  "last_archival_timestamp": $(date +%s),
  "total_segments_archived": $total_archived
}
STATE
    then
        rm -f -- "$tmp_file"
        return 1
    fi

    # Rename within the same directory replaces the old state in one step.
    if ! mv -f -- "$tmp_file" "$STATE_FILE"; then
        rm -f -- "$tmp_file"
        return 1
    fi
}

# List the WAL segments that are ready to be archived, one path per line.
# Arguments:
#   $1 - basename of the last archived segment ("" when nothing was archived yet)
# Outputs:
#   Paths of *.wal files under WAL_DIR whose basename sorts after $1, in sorted
#   order, excluding the newest candidate. Prints nothing at all when there is
#   nothing to archive, so a caller reading lines gets an empty list.
get_segments_to_archive() {
    local last_archived="$1"
    local -a segments=()
    local wal_file segment_name

    # Collect every .wal file in sorted order (NUL-delimited to survive odd names).
    while IFS= read -r -d '' wal_file; do
        segment_name=$(basename "$wal_file")

        # Skip everything up to and including the last archived segment.
        if [[ -n "$last_archived" ]] \
            && [[ "$segment_name" < "$last_archived" || "$segment_name" == "$last_archived" ]]; then
            continue
        fi

        segments+=("$wal_file")
    done < <(find "$WAL_DIR" -name "*.wal" -type f -print0 | sort -z)

    # The newest candidate is likely still being written, so it is never
    # archived; with zero or one candidate there is nothing left to report.
    local count=${#segments[@]}
    if (( count <= 1 )); then
        return 0
    fi
    unset "segments[$((count - 1))]"

    printf '%s\n' "${segments[@]}"
}

# Copy a single WAL segment to s3://S3_BUCKET/S3_PREFIX/<basename>.
# Arguments:
#   $1 - path of the segment file
# Returns:
#   0 when `aws s3 cp` succeeds, 1 otherwise (a warning is logged).
upload_segment() {
    local segment_path="$1"
    local segment_name
    segment_name=$(basename "$segment_path")

    local destination="s3://${S3_BUCKET}/${S3_PREFIX}/${segment_name}"
    local -a cp_options=(
        --storage-class STANDARD_IA
        --region "${AWS_REGION:-us-east-1}"
        --only-show-errors
    )

    info "Uploading: ${segment_name}"

    if ! aws s3 cp "$segment_path" "$destination" "${cp_options[@]}"; then
        warn "Upload failed: ${segment_name}"
        return 1
    fi

    success "Uploaded: ${destination}"
    return 0
}

# Print the age of a WAL segment in seconds: current time minus the file's
# modification time.
# Arguments:
#   $1 - path of the segment file
calculate_archival_lag() {
    local segment_path="$1"
    local modified_at current_time

    # Try the `stat -c %Y` form first and fall back to `stat -f %m` if it fails.
    if ! modified_at=$(stat -c %Y "$segment_path" 2>/dev/null); then
        modified_at=$(stat -f %m "$segment_path" 2>/dev/null)
    fi

    current_time=$(date +%s)

    echo $((current_time - modified_at))
}

# Write the per-run metrics to ${METRICS_DIR}/stemedb_wal_archival.prom.
# Arguments:
#   $1 - number of segments uploaded in this run
#   $2 - number of segments that failed to upload in this run
#   $3 - largest archival lag in seconds observed in this run
# Returns:
#   0 on success, 1 if the metrics file could not be written.
# The content is written to a temporary file whose name does not end in
# ".prom" and then renamed into place, so a reader of *.prom files never
# observes a partially written file.
# NOTE(review): the two *_total series hold per-run values but are declared as
# counters; left unchanged here to keep the exported series stable.
write_metrics() {
    local segments_uploaded="$1"
    local segments_failed="$2"
    local max_lag="$3"

    local metrics_file="${METRICS_DIR}/stemedb_wal_archival.prom"
    local tmp_file="${metrics_file}.$$"
    # Best effort: the directory normally exists already; a real problem
    # surfaces when the temporary file is created below.
    mkdir -p "$(dirname "$metrics_file")" 2>/dev/null || true

    if ! cat > "$tmp_file" <<METRICS
# HELP stemedb_wal_archival_last_run_timestamp Unix timestamp of last archival run
# TYPE stemedb_wal_archival_last_run_timestamp gauge
stemedb_wal_archival_last_run_timestamp $(date +%s)

# HELP stemedb_wal_archival_segments_uploaded_total Number of segments uploaded in last run
# TYPE stemedb_wal_archival_segments_uploaded_total counter
stemedb_wal_archival_segments_uploaded_total $segments_uploaded

# HELP stemedb_wal_archival_segments_failed_total Number of segments that failed to upload
# TYPE stemedb_wal_archival_segments_failed_total counter
stemedb_wal_archival_segments_failed_total $segments_failed

# HELP stemedb_wal_archival_lag_seconds Time between WAL creation and S3 upload (max across segments)
# TYPE stemedb_wal_archival_lag_seconds gauge
stemedb_wal_archival_lag_seconds $max_lag
METRICS
    then
        rm -f -- "$tmp_file"
        return 1
    fi

    if ! mv -f -- "$tmp_file" "$metrics_file"; then
        rm -f -- "$tmp_file"
        return 1
    fi

    success "Metrics written to: ${metrics_file}"
}

# Entry point: validate configuration, upload every pending WAL segment in
# order, persist the new high-water mark and write metrics.
# Exit status:
#   0 - all pending segments were uploaded (or there was nothing to upload)
#   1 - configuration/state error or at least one upload failed
# Uploading stops at the first failure. The state file only records the last
# archived segment, so advancing it past a failed segment would mean that
# segment is never retried and the archive would have a gap.
main() {
    echo ""
    echo "=========================================="
    echo "  StemeDB WAL Archival to S3"
    echo "=========================================="
    echo ""

    # Validate configuration
    if [[ -z "$S3_BUCKET" ]]; then
        fail "S3 bucket not specified (set AWS_S3_BUCKET environment variable)"
    fi

    if ! command -v aws &> /dev/null; then
        fail "AWS CLI not found. Install with: apt install awscli"
    fi

    if [[ ! -d "$WAL_DIR" ]]; then
        fail "WAL directory not found: ${WAL_DIR}"
    fi

    # Load state. Parse failures are reported explicitly instead of letting a
    # failed grep pipeline abort the script without any message.
    local state
    state=$(load_state)

    local last_archived total_archived
    local segment_re='"last_archived_segment": "([^"]*)"'
    local total_re='"total_segments_archived": ([0-9]+)'

    if [[ "$state" =~ $segment_re ]]; then
        last_archived="${BASH_REMATCH[1]}"
    else
        fail "Cannot parse last_archived_segment from state file: ${STATE_FILE}"
    fi

    if [[ "$state" =~ $total_re ]]; then
        total_archived="${BASH_REMATCH[1]}"
    else
        fail "Cannot parse total_segments_archived from state file: ${STATE_FILE}"
    fi

    info "Last archived: ${last_archived:-none}"
    info "Total archived: ${total_archived}"

    # Get segments to archive; blank lines are ignored so that an empty
    # result is really treated as "nothing to do".
    local -a segments=()
    local candidate
    while IFS= read -r candidate; do
        if [[ -n "$candidate" ]]; then
            segments+=("$candidate")
        fi
    done < <(get_segments_to_archive "$last_archived")

    if [[ ${#segments[@]} -eq 0 ]]; then
        info "No new segments to archive"
        write_metrics 0 0 0
        return 0
    fi

    info "Found ${#segments[@]} segment(s) to archive"

    # Upload segments
    local uploaded=0
    local failed=0
    local max_lag=0
    local new_last_archived=""
    local wal_file lag

    for wal_file in "${segments[@]}"; do
        if upload_segment "$wal_file"; then
            # Plain assignment: ((uploaded++)) returns status 1 when the old
            # value is 0, which aborts the script under `set -e`.
            uploaded=$((uploaded + 1))
            new_last_archived=$(basename "$wal_file")

            # Track archival lag; a lag that cannot be determined counts as 0
            # rather than aborting before the state is saved.
            if ! lag=$(calculate_archival_lag "$wal_file") || [[ ! "$lag" =~ ^-?[0-9]+$ ]]; then
                lag=0
            fi
            if [[ $lag -gt $max_lag ]]; then
                max_lag=$lag
            fi
        else
            failed=$((failed + 1))
            warn "Stopping at first failed upload; remaining segment(s) will be retried on the next run"
            break
        fi
    done

    # Update state
    if [[ -n "$new_last_archived" ]]; then
        # 10# forces base 10 so a stored value with leading zeros is not read as octal.
        total_archived=$((10#${total_archived} + uploaded))
        save_state "$new_last_archived" "$total_archived"
    fi

    # Write metrics
    write_metrics "$uploaded" "$failed" "$max_lag"

    # Summary
    echo ""
    echo "=========================================="
    if [[ $failed -eq 0 ]]; then
        echo -e "  ${GREEN}Archival complete${NC}"
    else
        echo -e "  ${YELLOW}Archival completed with errors${NC}"
    fi
    echo "=========================================="
    echo ""
    echo "  Uploaded: ${uploaded}"
    echo "  Failed:   ${failed}"
    echo "  Max lag:  ${max_lag}s"
    echo "  S3 path:  s3://${S3_BUCKET}/${S3_PREFIX}/"
    echo ""

    if [[ $failed -gt 0 ]]; then
        exit 1
    fi
}

# Run the archival, forwarding all command-line arguments to main unchanged.
main "$@"