# rdev/cookbooks/scripts/common.sh

#!/bin/bash
# Common utilities for rdev cookbook scripts
#
# Usage:
#   source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
#
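# Provides:
#   - api_call() - Make authenticated API calls
#   - wait_for_build() - Poll for build completion
#   - wait_for_pipeline() - Poll for CI pipeline completion
#   - wait_for_site() - Wait for site to respond
#   - diagnose_pipeline_failure() / diagnose_site_failure() - Print failure diagnostics
#   - register_cleanup_trap() / parse_auto_teardown_flag() - Optional auto-teardown support
#   - print_*() helpers and colors for output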

set -o errexit -o nounset -o pipefail

# API endpoint and credentials.
# Missing values are normalised to empty strings here instead of being
# rejected: validation is left to preflight_check at runtime, so commands
# such as 'list' can still run without credentials.
: "${RDEV_API_URL:=}"
: "${RDEV_API_KEY:=}"

# Opt-in automatic teardown.
# With AUTO_TEARDOWN=true, projects are cleaned up automatically on exit.
: "${AUTO_TEARDOWN:=false}"

# Project to remove during auto-teardown.
# Scripts should assign this once project creation has succeeded.
CLEANUP_PROJECT=

# Exit/signal handler that performs auto-teardown.
#
# Globals:
#   CLEANUP_PROJECT (read), AUTO_TEARDOWN (read), CYAN/GREEN/YELLOW/NC (read)
# Behaviour:
#   - Captures the status of the last command on entry and exits with it.
#   - When CLEANUP_PROJECT is non-empty and AUTO_TEARDOWN is "true", issues a
#     DELETE for the project through api_call (response discarded).
#   - Prints the success line only when api_call returned 0, otherwise a
#     warning. Teardown stays best-effort: a failed delete never changes the
#     exit status. api_call returns curl's status and curl is not run with
#     --fail, so an HTTP-level error response is still reported as success.
cleanup_on_exit() {
    local exit_code=$?

    # Disarm the traps first. This handler ends with `exit`; when it was entered
    # through INT/TERM that `exit` would fire the EXIT trap as well and run the
    # teardown a second time.
    trap - EXIT INT TERM

    if [[ -n "$CLEANUP_PROJECT" && "$AUTO_TEARDOWN" == "true" ]]; then
        echo ""
        echo -e "${CYAN}Auto-teardown: Cleaning up $CLEANUP_PROJECT...${NC}"
        # NOTE(review): the other endpoints in this file live under /projects/...;
        # confirm that the singular /project/ path is intended.
        if api_call DELETE "/project/$CLEANUP_PROJECT" > /dev/null 2>&1; then
            echo -e "${GREEN}✓ Project $CLEANUP_PROJECT deleted${NC}"
        else
            echo -e "${YELLOW}⚠ Failed to delete project $CLEANUP_PROJECT (remove it manually)${NC}"
        fi
    fi
    exit "$exit_code"
}

# Install cleanup_on_exit as the handler for normal exit and for INT/TERM.
# Opt-in: a script that wants auto-cleanup calls this after sourcing common.sh.
register_cleanup_trap() {
    local sig
    for sig in EXIT INT TERM; do
        trap cleanup_on_exit "$sig"
    done
}

# Strip --auto-teardown from an argument list.
#
# Arguments: the caller's arguments ("$@")
# Outputs:   the remaining arguments on stdout, joined by single spaces
#            (boundaries of arguments that contain spaces are not preserved)
# Globals:   AUTO_TEARDOWN is set to "true" when the flag is present
#
# Usage: args=$(parse_auto_teardown_flag "$@")
#
# NOTE: in the command-substitution form above the function runs in a
# subshell, so the AUTO_TEARDOWN assignment does not reach the caller. A
# caller that needs the variable must also invoke the function directly, e.g.
#   parse_auto_teardown_flag "$@" > /dev/null
parse_auto_teardown_flag() {
    local arg
    local args=()
    for arg in "$@"; do
        if [[ "$arg" == "--auto-teardown" ]]; then
            AUTO_TEARDOWN="true"
        else
            args+=("$arg")
        fi
    done

    # Join with a space regardless of the caller's IFS.
    local IFS=' '
    # printf rather than echo: echo would consume a leading -n/-e/-E argument
    # as one of its own options. The "-" default keeps an empty list from
    # tripping `set -u` on older bash releases.
    printf '%s\n' "${args[*]-}"
}

# ANSI colour sequences, kept in backslash form for use with `echo -e`
NC='\033[0m'          # No Color (reset)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'

# Timeout in seconds for API requests; a pre-set API_TIMEOUT env var wins,
# otherwise the default is 60.
: "${API_TIMEOUT:=60}"

# Issue an authenticated request against the rdev API.
#
# Arguments: method endpoint [data]
#   method   - HTTP verb passed to curl -X
#   endpoint - path appended to RDEV_API_URL
#   data     - optional request body; when non-empty it is sent with a
#              JSON Content-Type header
# Globals:   RDEV_API_URL, RDEV_API_KEY, API_TIMEOUT (all read)
# Outputs:   whatever curl writes to stdout
# Returns:   curl's exit status
#
# Example: api_call GET "/projects"
# Example: api_call POST "/projects" '{"name": "test"}'
api_call() {
    local http_method="$1"
    local path="$2"
    local payload="${3:-}"

    # Arguments shared by every request
    local -a curl_args=(
        -s --max-time "$API_TIMEOUT"
        -X "$http_method" "$RDEV_API_URL$path"
        -H "X-API-Key: $RDEV_API_KEY"
    )

    # Body and content type are only added when a payload was supplied
    if [[ -n "$payload" ]]; then
        curl_args+=(-H "Content-Type: application/json" -d "$payload")
    fi

    curl "${curl_args[@]}"
}

# Poll the builds endpoint until the build finishes.
#
# Arguments: task_id [max_attempts] [poll_interval]
#   task_id       - build task to query via GET /builds/<task_id>
#   max_attempts  - number of polls before giving up (default 120)
#   poll_interval - seconds to sleep between polls (default 5)
# Returns: 0 on success, 1 on failure, 2 on timeout
#
# The status is read from .status or .data.status of the response. A
# "completed" build only counts as success when its result.success is true.
wait_for_build() {
    local task_id="$1"
    local max_attempts="${2:-120}"  # 10 minutes default (5s * 120)
    local poll_interval="${3:-5}"
    local attempt=0

    echo -e "${CYAN}Waiting for build to complete (task: $task_id)...${NC}"

    while [[ $attempt -lt $max_attempts ]]; do
        local result
        result=$(api_call GET "/builds/$task_id")
        local status
        status=$(echo "$result" | jq -r '.status // .data.status // "unknown"')

        case "$status" in
            completed)
                local success
                success=$(echo "$result" | jq -r '.result.success // .data.result.success // false')
                if [[ "$success" == "true" ]]; then
                    echo -e "${GREEN}Build completed successfully!${NC}"
                    echo "$result" | jq '.result // .data.result'
                    return 0
                else
                    echo -e "${RED}Build completed but failed:${NC}"
                    echo "$result" | jq '.result // .data.result'
                    return 1
                fi
                ;;
            failed)
                echo -e "${RED}Build failed:${NC}"
                echo "$result" | jq '.'
                return 1
                ;;
            running|pending)
                echo "  Build $status... (attempt $((attempt + 1))/$max_attempts)"
                ;;
            *)
                echo "  Unknown status: $status (attempt $((attempt + 1))/$max_attempts)"
                ;;
        esac

        sleep "$poll_interval"
        # Plain assignment instead of ((attempt++)): the arithmetic command
        # returns status 1 when the old value is 0, which aborts the script
        # under the `set -e` enabled by this file.
        attempt=$((attempt + 1))
    done

    echo -e "${YELLOW}Timeout waiting for build to complete${NC}"
    return 2
}

# Wait for CI pipeline to complete
#
# Arguments: project_id [max_attempts] [poll_interval]
#   project_id    - project whose pipelines are polled
#   max_attempts  - number of polls before giving up (default 120)
#   poll_interval - seconds to sleep between polls (default 10)
# Returns: 0 on success, 1 on failure, 2 on timeout
# On failure, automatically runs diagnostics. On timeout, diagnostics run only
# if a pipeline was being tracked.
#
# Fast-fail behavior: Returns immediately on failure/error/killed states
# instead of waiting for timeout. This prevents "blind waiting" when
# the pipeline has already failed.
#
# NOTE(review): number and status are always read from the newest pipeline
# (.data[0]); tracked_pipeline only records that tracking has started. If a
# newer pipeline appears mid-wait, its status is the one reported.
wait_for_pipeline() {
    local project_id="$1"
    local max_attempts="${2:-120}"  # 10 minutes default
    local poll_interval="${3:-10}"
    local attempt=0
    local tracked_pipeline=""  # Track specific pipeline once found

    echo -e "${CYAN}Waiting for new CI pipeline...${NC}"

    # Record the current latest pipeline number BEFORE waiting
    # so we only track pipelines triggered AFTER this point.
    # Race condition guard: if the triggering step pushed fast enough that its pipeline
    # already appears as the latest, track that pipeline directly instead of waiting for
    # a newer one that will never come.
    local baseline_number=0
    local initial_result initial_status
    initial_result=$(api_call GET "/projects/$project_id/pipelines" 2>/dev/null)
    if echo "$initial_result" | jq -e '.data[0]' >/dev/null 2>&1; then
        baseline_number=$(echo "$initial_result" | jq -r '.data[0].number // 0')
        initial_status=$(echo "$initial_result" | jq -r '.data[0].status // "unknown"')
        # If the latest pipeline is already running or pending, it was triggered by the
        # preceding step — track it directly rather than waiting for a newer one.
        if [[ "$initial_status" == "running" || "$initial_status" == "pending" || "$initial_status" == "started" ]]; then
            tracked_pipeline="$baseline_number"
            echo "  Detected in-progress pipeline #$baseline_number (status: $initial_status) — tracking it"
        else
            echo "  Baseline pipeline: #$baseline_number (status: $initial_status) — waiting for a newer one"
        fi
    fi

    # Counter updates below use plain assignment instead of ((attempt++)):
    # the arithmetic command returns status 1 when the old value is 0, which
    # aborts the script under the `set -e` enabled by this file.
    while [[ $attempt -lt $max_attempts ]]; do
        local result
        result=$(api_call GET "/projects/$project_id/pipelines")

        # Check if we have any pipelines
        local pipeline_count
        pipeline_count=$(echo "$result" | jq '.data | length // 0')

        if [[ "$pipeline_count" -eq 0 ]]; then
            echo "  No pipelines yet... (attempt $((attempt + 1))/$max_attempts)"
            sleep "$poll_interval"
            attempt=$((attempt + 1))
            continue
        fi

        # Get latest pipeline number and status
        local pipeline_number status
        pipeline_number=$(echo "$result" | jq -r '.data[0].number // 0')
        status=$(echo "$result" | jq -r '.data[0].status // "unknown"')

        # Skip any pipeline that is not newer than our baseline.
        # Exception: if tracked_pipeline is already set (we detected an in-progress
        # pipeline at startup), bypass the baseline check and go straight to status.
        if [[ -z "$tracked_pipeline" && "$pipeline_number" -le "$baseline_number" ]]; then
            echo "  Waiting for new pipeline (latest is #$pipeline_number, baseline #$baseline_number)... (attempt $((attempt + 1))/$max_attempts)"
            sleep "$poll_interval"
            attempt=$((attempt + 1))
            continue
        fi

        # A new pipeline exists — track it (if not already tracking)
        if [[ -z "$tracked_pipeline" ]]; then
            tracked_pipeline="$pipeline_number"
            echo "  Tracking new pipeline #$tracked_pipeline"
        fi

        case "$status" in
            success)
                echo -e "${GREEN}✓ Pipeline #$pipeline_number completed successfully!${NC}"
                return 0
                ;;
            failure|error|killed)
                # FAST FAIL: Don't wait for timeout, fail immediately
                echo ""
                echo -e "${RED}✗ Pipeline #$pipeline_number failed (status: $status)${NC}"
                echo ""

                # Quick inline step summary before full diagnostics
                local steps_response
                steps_response=$(api_call GET "/projects/$project_id/pipelines/$pipeline_number/steps" 2>/dev/null || echo '{}')
                local has_steps
                has_steps=$(echo "$steps_response" | jq 'has("data")' 2>/dev/null || echo "false")

                if [[ "$has_steps" == "true" ]]; then
                    # Show failed steps inline for quick diagnosis.
                    # "// []" keeps a response without a steps list from making
                    # jq fail, which would abort before the diagnostics run.
                    local failed_count
                    failed_count=$(echo "$steps_response" | jq '[(.data.steps // [])[] | select(.status == "failure" or .status == "error" or .status == "killed")] | length')
                    if [[ "$failed_count" -gt 0 ]]; then
                        echo -e "${RED}  Failed steps:${NC}"
                        echo "$steps_response" | jq -r '(.data.steps // [])[] | select(.status == "failure" or .status == "error" or .status == "killed") | "    ✗ \(.name): exit \(.exit_code // "?")"'
                    fi
                fi

                # Full diagnostics
                diagnose_pipeline_failure "$project_id"
                return 1
                ;;
            running|pending)
                echo "  Pipeline #$pipeline_number $status... (attempt $((attempt + 1))/$max_attempts)"
                ;;
            *)
                echo "  Pipeline #$pipeline_number status: $status (attempt $((attempt + 1))/$max_attempts)"
                ;;
        esac

        sleep "$poll_interval"
        attempt=$((attempt + 1))
    done

    echo -e "${YELLOW}Timeout waiting for new pipeline${NC}"
    # On timeout, still run diagnostics to help debug
    if [[ -n "$tracked_pipeline" ]]; then
        diagnose_pipeline_failure "$project_id"
    fi
    return 2
}

# Wait for site to be accessible
#
# Arguments: domain [max_attempts] [poll_interval] [project_id]
#   domain        - host probed as https://<domain>
#   max_attempts  - number of probes before giving up (default 30)
#   poll_interval - seconds to sleep between probes (default 5)
#   project_id    - optional; enables automatic diagnostics on timeout
# Returns: 0 once the site answers HTTP 200, 1 on timeout
# On timeout, automatically runs diagnostics if project_id is provided
wait_for_site() {
    local domain="$1"
    local max_attempts="${2:-30}"
    local poll_interval="${3:-5}"
    local project_id="${4:-}"
    local attempt=0
    local last_http_code=""

    echo -e "${CYAN}Waiting for site to be accessible at https://$domain...${NC}"

    while [[ $attempt -lt $max_attempts ]]; do
        local http_code
        # --max-time bounds each probe so a hung connection cannot stall the
        # loop. When curl fails, keep whatever code -w already printed and fall
        # back to "000" only if nothing was printed (appending a second "000"
        # would yield "000000").
        http_code=$(curl -s -o /dev/null --max-time "${API_TIMEOUT:-60}" -w "%{http_code}" "https://$domain" 2>/dev/null) \
            || http_code="${http_code:-000}"

        if [[ "$http_code" == "200" ]]; then
            echo -e "${GREEN}Site is live! (HTTP $http_code)${NC}"
            return 0
        fi

        # Only print status change or every 5th attempt to reduce noise
        if [[ "$http_code" != "$last_http_code" ]] || (( attempt % 5 == 0 )); then
            echo "  HTTP $http_code... (attempt $((attempt + 1))/$max_attempts)"
        fi
        last_http_code="$http_code"

        sleep "$poll_interval"
        # Plain assignment instead of ((attempt++)): the arithmetic command
        # returns status 1 when the old value is 0, which aborts the script
        # under the `set -e` enabled by this file.
        attempt=$((attempt + 1))
    done

    echo -e "${YELLOW}Timeout waiting for site to respond (last: HTTP $last_http_code)${NC}"

    # Automatically diagnose if we have project_id
    if [[ -n "$project_id" ]]; then
        diagnose_site_failure "$domain" "$project_id"
    else
        echo ""
        echo "  Tip: Pass project_id to wait_for_site for automatic diagnostics"
    fi

    return 1
}

# Emit a section banner: the title wrapped in "===", coloured blue, with an
# empty line before and after.
# Arguments: title
print_header() {
    local title="$1"
    printf '\n'
    printf '%b\n' "${BLUE}=== $title ===${NC}"
    printf '\n'
}

# Emit a green, check-marked success line.
# Arguments: message
print_success() {
    printf '%b\n' "${GREEN}✓ $1${NC}"
}

# Emit a red, cross-marked error line.
# Arguments: message
print_error() {
    printf '%b\n' "${RED}✗ $1${NC}"
}

# Emit a yellow warning line prefixed with a warning sign.
# Arguments: message
print_warning() {
    printf '%b\n' "${YELLOW}⚠ $1${NC}"
}

# Emit a cyan boxed banner introducing a diagnostic section, preceded by an
# empty line. The title row has no closing border.
# Arguments: title
print_diagnostic_header() {
    local title="$1"
    printf '\n'
    printf '%b\n' \
        "${CYAN}┌─────────────────────────────────────────────────────────────────┐${NC}" \
        "${CYAN}│ DIAGNOSTIC: $title${NC}" \
        "${CYAN}└─────────────────────────────────────────────────────────────────┘${NC}"
}

# Emit a yellow, indented "→ FIX:" line carrying a suggested remedy.
# Arguments: suggestion text
print_fix() {
    printf '%b\n' "${YELLOW}  → FIX: $1${NC}"
}

# Emit a blue, indented line showing a command behind a "$" prompt, for the
# user to run themselves.
# Arguments: command text
print_cmd() {
    printf '%b\n' "${BLUE}  \$ $1${NC}"
}

# Print the git owner: GITEA_DEFAULT_ORG when it is set and non-empty,
# otherwise the default "jordan".
get_git_owner() {
    local owner="${GITEA_DEFAULT_ORG:-jordan}"
    echo "$owner"
}

# Diagnose a failed pipeline - fetches details and prints actionable info
#
# Arguments: project_id
# Outputs:   a diagnostic report on stdout covering
#            - number, status and first commit-message line of the newest
#              pipeline (.data[0] of the pipelines response),
#            - pipeline-level errors, if any,
#            - a per-step summary plus error text and the last 20 log lines of
#              each failed step, when the steps endpoint returns a "data" key,
#            - a link to the CI UI and a checklist of common causes.
# Best-effort: a failed pipelines request degrades to "?"/"unknown" values
# rather than aborting the caller.
diagnose_pipeline_failure() {
    local project_id="$1"
    local git_owner
    git_owner=$(get_git_owner)

    print_diagnostic_header "Pipeline Failure Analysis"

    # Get the latest pipeline; fall back to an empty object if the call fails
    local pipelines
    pipelines=$(api_call GET "/projects/$project_id/pipelines") || pipelines='{}'

    local pipeline_number
    pipeline_number=$(echo "$pipelines" | jq -r '.data[0].number // "?"')
    local pipeline_status
    pipeline_status=$(echo "$pipelines" | jq -r '.data[0].status // "unknown"')
    local pipeline_errors
    pipeline_errors=$(echo "$pipelines" | jq -r '.data[0].errors // []')
    local commit_msg
    commit_msg=$(echo "$pipelines" | jq -r '.data[0].message // ""' | head -1)

    echo ""
    echo "  Pipeline #$pipeline_number: $pipeline_status"
    echo "  Commit: $commit_msg"

    # Show any pipeline-level errors (YAML validation, etc.)
    local error_count
    error_count=$(echo "$pipeline_errors" | jq 'length // 0')
    if [[ "$error_count" -gt 0 ]]; then
        echo ""
        echo -e "${RED}  Pipeline Errors:${NC}"
        echo "$pipeline_errors" | jq -r '.[] | "    - \(.type): \(.message)"'
    fi

    # Try to get step details from the steps API (if available)
    local steps_response
    steps_response=$(api_call GET "/projects/$project_id/pipelines/$pipeline_number/steps" 2>/dev/null || echo '{"error":"not available"}')

    local has_steps
    has_steps=$(echo "$steps_response" | jq 'has("data")' 2>/dev/null || echo "false")

    if [[ "$has_steps" == "true" ]]; then
        echo ""
        echo "  Steps:"
        # Format steps with status icons, duration, and exit code for failures.
        # "(.data.steps // [])" tolerates a response without a steps list, which
        # would otherwise make jq fail and abort the report under `set -e`.
        echo "$steps_response" | jq -r '(.data.steps // [])[] |
            (if .duration_seconds > 0 then " (\(.duration_seconds)s)" else "" end) as $dur |
            if .status == "failure" or .status == "error" or .status == "killed" then
                "    \u001b[31m✗\u001b[0m \(.name): FAILED (exit \(.exit_code // "?"))\($dur)"
            elif .status == "success" then
                "    \u001b[32m✓\u001b[0m \(.name): success\($dur)"
            elif .status == "running" then
                "    \u001b[33m◐\u001b[0m \(.name): running..."
            elif .status == "pending" then
                "    ○ \(.name): pending"
            elif .status == "skipped" then
                "    ○ \(.name): skipped"
            else
                "    ? \(.name): \(.status)"
            end'

        # Show logs from failed steps
        local failed_count
        failed_count=$(echo "$steps_response" | jq '[(.data.steps // [])[] | select(.status == "failure" or .status == "error" or .status == "killed")] | length')

        if [[ "$failed_count" -gt 0 ]]; then
            echo ""
            echo -e "${RED}  Failed Step Details:${NC}"

            # For each failed step, show error and the last 20 log lines.
            # Every log line is indented with map() rather than a zero-width
            # gsub("^"; ...) pattern.
            # NOTE(review): zero-width gsub patterns are believed to hang on
            # older jq releases - confirm against the jq changelog.
            echo "$steps_response" | jq -r '(.data.steps // [])[] | select(.status == "failure" or .status == "error" or .status == "killed") |
                "\n  Step: \(.name)" +
                (if .error and .error != "" then "\n  Error: \(.error)" else "" end) +
                (if .log and .log != "" then "\n  Last lines of log:\n" + (.log | split("\n") | .[-20:] | map("    " + .) | join("\n")) else "" end)'
        fi
    else
        echo ""
        echo -e "${YELLOW}  Steps API not available - upgrade rdev-api for detailed step info${NC}"
    fi

    # Always provide direct links
    echo ""
    echo "  View full logs:"
    print_cmd "open https://ci.threesix.ai/$git_owner/$project_id/$pipeline_number"

    # Pattern match common errors and suggest fixes
    echo ""
    diagnose_common_pipeline_errors "$project_id" "$pipeline_number"
}

# Print a fixed checklist of frequent pipeline failure causes, each with a
# command to investigate it.
# Arguments: project_id pipeline_number
#   project_id      - used as the app label in the suggested kubectl command
#   pipeline_number - accepted but not used by the checks below
diagnose_common_pipeline_errors() {
    local project_id="$1"
    local pipeline_number="$2"

    # Item 1: Kubernetes deployment missing (most common issue)
    printf '%s\n' \
        "  Common issues to check:" \
        "" \
        "  1. Missing Kubernetes Deployment?" \
        "     The CI pipeline tries to 'kubectl set image' but deployment may not exist."
    print_cmd "kubectl get deployment -n projects -l app=$project_id"
    print_fix "Component may need initial deployment created"

    # Item 2: Docker build problems
    printf '%s\n' \
        "" \
        "  2. Docker Build Failed?" \
        "     Check if Dockerfile exists and workspace files are correct."
    print_cmd "Check the build step in Woodpecker UI for specific error"

    # Item 3: registry authentication
    printf '%s\n' \
        "" \
        "  3. Registry Push Failed?" \
        "     Kaniko may not have credentials to push to registry."
    print_cmd "kubectl get secret -n woodpecker-agents | grep registry"
}

# Diagnose why a site is not accessible
#
# Arguments: domain project_id
# Outputs:   a report on stdout built from kubectl queries against the
#            'projects' namespace: pods and services labelled app=<project_id>,
#            ingresses mentioning the project or domain, and recent events,
#            followed by commands for manual investigation.
# Returns early (status 0) when kubectl is missing or the pod query fails.
diagnose_site_failure() {
    local domain="$1"
    local project_id="$2"

    print_diagnostic_header "Site Accessibility Analysis"

    echo ""
    echo "  Domain: https://$domain"
    echo "  Project: $project_id"
    echo ""

    # Check if kubectl is available and configured
    if ! command -v kubectl &> /dev/null; then
        echo -e "${YELLOW}  kubectl not found - cannot check K8s state${NC}"
        echo "  Install kubectl and set KUBECONFIG to diagnose further"
        return
    fi

    if [[ -z "${KUBECONFIG:-}" ]]; then
        echo -e "${YELLOW}  KUBECONFIG not set - trying default context${NC}"
    fi

    # Check pods
    echo "  Checking pods in 'projects' namespace:"
    local pods
    pods=$(kubectl get pods -n projects -l "app=$project_id" --no-headers 2>/dev/null || echo "ERROR")

    if [[ "$pods" == "ERROR" ]]; then
        echo -e "${RED}    Failed to query K8s (check KUBECONFIG)${NC}"
        print_cmd "export KUBECONFIG=~/.kube/orchard9-k3sf.yaml"
        return
    elif [[ -z "$pods" ]]; then
        echo -e "${RED}    No pods found for app=$project_id${NC}"
        print_fix "Deployment doesn't exist - CI may have failed or component needs initial deploy"
        print_cmd "kubectl get deployments -n projects"
    else
        echo "$pods" | sed 's/^/    /'

        # Check for common pod issues
        if grep -Eq 'ImagePullBackOff|ErrImagePull' <<< "$pods"; then
            echo ""
            echo -e "${RED}  Issue: ImagePullBackOff${NC}"
            print_fix "Image doesn't exist in registry - check CI build step"
            print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A5 'Events:'"
        fi

        if grep -q 'CrashLoopBackOff' <<< "$pods"; then
            echo ""
            echo -e "${RED}  Issue: CrashLoopBackOff${NC}"
            print_fix "Container is crashing - check application logs"
            print_cmd "kubectl logs -n projects -l app=$project_id --tail=50"
        fi

        if grep -q 'Pending' <<< "$pods"; then
            echo ""
            echo -e "${RED}  Issue: Pod stuck in Pending${NC}"
            print_fix "Likely resource constraints or scheduling issues"
            print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A10 'Events:'"
        fi

        # A whitespace-delimited "0/N" field means no container is ready,
        # whatever the container count N is.
        if grep -Eq '(^|[[:space:]])0/[0-9]+([[:space:]]|$)' <<< "$pods"; then
            echo ""
            echo -e "${YELLOW}  Issue: Container not ready${NC}"
            print_fix "Container may still be starting or failing health checks"
            print_cmd "kubectl logs -n projects -l app=$project_id --tail=20"
        fi
    fi

    # Check services
    echo ""
    echo "  Checking services:"
    local svc
    svc=$(kubectl get svc -n projects -l "app=$project_id" --no-headers 2>/dev/null || echo "")
    if [[ -z "$svc" ]]; then
        echo -e "${RED}    No service found for app=$project_id${NC}"
        print_fix "Service needs to be created along with deployment"
    else
        echo "$svc" | sed 's/^/    /'
    fi

    # Check ingress
    echo ""
    echo "  Checking ingress:"
    local ingress
    # Fixed-string match with two -e patterns: the dots of the domain are
    # taken literally and no non-POSIX "\|" alternation is needed.
    ingress=$(kubectl get ingress -n projects --no-headers 2>/dev/null \
        | grep -F -e "$project_id" -e "$domain" || true)
    if [[ -z "$ingress" ]]; then
        echo -e "${YELLOW}    No ingress found matching $project_id or $domain${NC}"
    else
        echo "$ingress" | sed 's/^/    /'
    fi

    # Recent events
    echo ""
    echo "  Recent events:"
    local events
    # Decide on the captured text rather than the pipeline status, so the
    # fallback line does not depend on whether pipefail is active.
    events=$(kubectl get events -n projects --sort-by='.lastTimestamp' 2>/dev/null \
        | grep -F -e "$project_id" | tail -5) || true
    if [[ -n "$events" ]]; then
        echo "$events" | sed 's/^/    /'
    else
        echo "    No recent events"
    fi

    echo ""
    echo "  Manual investigation commands:"
    print_cmd "kubectl logs -n projects -l app=$project_id -f"
    print_cmd "kubectl describe pod -n projects -l app=$project_id"
    print_cmd "kubectl get events -n projects --sort-by='.lastTimestamp' | tail -20"
}