#!/bin/bash
# Common utilities for rdev cookbook scripts
#
# Usage:
#   source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
#
# Provides:
#   - api_call()          - Make authenticated API calls
#   - wait_for_build()    - Poll for build completion
#   - wait_for_pipeline() - Poll for CI pipeline completion
#   - wait_for_site()     - Wait for site to respond
#   - Colors for output
#
# NOTE: sourcing this file enables `set -euo pipefail` in the calling script.

set -euo pipefail

# Environment variables (checked at runtime by preflight_check, not on source)
# This allows commands like 'list' to work without credentials
RDEV_API_URL="${RDEV_API_URL:-}"
RDEV_API_KEY="${RDEV_API_KEY:-}"

# Auto-cleanup configuration
# Set AUTO_TEARDOWN=true to automatically clean up projects on exit
AUTO_TEARDOWN="${AUTO_TEARDOWN:-false}"

# Track created project for cleanup
# Scripts should set this after successful project creation
CLEANUP_PROJECT=""

# Arguments left over after the most recent parse_auto_teardown_flag call
REMAINING_ARGS=()

# Cleanup handler for auto-teardown
# Called on script exit when AUTO_TEARDOWN=true and CLEANUP_PROJECT is set.
# Exits with the status that was current when the handler was entered.
cleanup_on_exit() {
  local exit_code=$?

  # Disarm the traps first: this handler is registered for EXIT, INT and TERM
  # and ends with `exit`, so on INT/TERM the EXIT trap would otherwise fire
  # again and run the teardown a second time.
  trap - EXIT INT TERM

  if [[ -n "$CLEANUP_PROJECT" && "$AUTO_TEARDOWN" == "true" ]]; then
    echo ""
    echo -e "${CYAN}Auto-teardown: Cleaning up $CLEANUP_PROJECT...${NC}"
    # NOTE(review): every other endpoint in this file uses the plural
    # "/projects/..."; confirm that the singular "/project/..." is intended.
    # Teardown stays best-effort: a failed request never changes the exit code.
    if api_call DELETE "/project/$CLEANUP_PROJECT" > /dev/null 2>&1; then
      echo -e "${GREEN}✓ Project $CLEANUP_PROJECT deleted${NC}"
    else
      echo -e "${YELLOW}⚠ Could not delete project $CLEANUP_PROJECT (request failed)${NC}" >&2
    fi
  fi

  exit $exit_code
}

# Register cleanup handler
# Scripts should call this after sourcing common.sh if they want auto-cleanup
register_cleanup_trap() {
  trap cleanup_on_exit EXIT INT TERM
}

# Parse --auto-teardown from args and return remaining args
# Arguments: the script's arguments ("$@")
# Outputs:   remaining arguments joined by single spaces on stdout
# Globals:   AUTO_TEARDOWN (set to "true" when the flag is present)
#            REMAINING_ARGS (array of the remaining arguments)
#
# Usage (legacy): args=$(parse_auto_teardown_flag "$@")
#   The command substitution runs in a subshell, so the AUTO_TEARDOWN
#   assignment does NOT reach the caller and word boundaries are lost.
# Usage (preferred):
#   parse_auto_teardown_flag "$@" > /dev/null
#   set -- ${REMAINING_ARGS[@]+"${REMAINING_ARGS[@]}"}
parse_auto_teardown_flag() {
  local arg
  local IFS=' ' # "${REMAINING_ARGS[*]}" joins on the first character of IFS

  REMAINING_ARGS=()
  for arg in "$@"; do
    if [[ "$arg" == "--auto-teardown" ]]; then
      AUTO_TEARDOWN="true"
    else
      REMAINING_ARGS+=("$arg")
    fi
  done

  # printf instead of echo so arguments such as "-n" are printed literally;
  # the "-" default keeps an empty array from tripping `set -u` on older bash.
  printf '%s\n' "${REMAINING_ARGS[*]-}"
}

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Default API timeout in seconds (can be overridden with API_TIMEOUT env var)
API_TIMEOUT="${API_TIMEOUT:-60}"

# Make an authenticated API call
# Arguments: method endpoint [data]
# Outputs:   the response body on stdout
# Returns:   curl's exit status (HTTP error statuses are not treated as failure)
# Example: api_call GET "/projects"
# Example: api_call POST "/projects" '{"name": "test"}'
api_call() {
  local method="$1"
  local endpoint="$2"
  local data="${3:-}"
  local -a curl_args=(
    -s --max-time "$API_TIMEOUT"
    -X "$method" "$RDEV_API_URL$endpoint"
    -H "X-API-Key: $RDEV_API_KEY"
  )

  # Only send a JSON body (and its content type) when data was supplied
  if [[ -n "$data" ]]; then
    curl_args+=(-H "Content-Type: application/json" -d "$data")
  fi

  curl "${curl_args[@]}"
}

# Wait for a build to complete
# Arguments: task_id [max_attempts] [poll_interval]
# Returns: 0 on success, 1 on failure, 2 on timeout
wait_for_build() {
  local task_id="$1"
  local max_attempts="${2:-120}" # 10 minutes default (5s * 120)
  local poll_interval="${3:-5}"
  local attempt=0
  local result status success

  echo -e "${CYAN}Waiting for build to complete (task: $task_id)...${NC}"

  while [[ $attempt -lt $max_attempts ]]; do
    # A transient request or parse failure must not abort the poll under
    # `set -e`; treat it as an unknown status and try again.
    result=$(api_call GET "/builds/$task_id") || result=""
    status=$(jq -r '.status // .data.status // "unknown"' <<<"$result" 2>/dev/null) || status="unknown"
    [[ -n "$status" ]] || status="unknown"

    case "$status" in
      completed)
        success=$(jq -r '.result.success // .data.result.success // false' <<<"$result" 2>/dev/null) || success="false"
        if [[ "$success" == "true" ]]; then
          echo -e "${GREEN}Build completed successfully!${NC}"
          jq '.result // .data.result' <<<"$result"
          return 0
        else
          echo -e "${RED}Build completed but failed:${NC}"
          jq '.result // .data.result' <<<"$result"
          return 1
        fi
        ;;
      failed)
        echo -e "${RED}Build failed:${NC}"
        jq '.' <<<"$result"
        return 1
        ;;
      running)
        echo " Build running... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      pending)
        echo " Build pending... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      *)
        echo " Unknown status: $status (attempt $((attempt + 1))/$max_attempts)"
        ;;
    esac

    sleep "$poll_interval"
    # Not ((attempt++)): that returns status 1 when attempt is 0, which
    # terminates the script under `set -e`.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for build to complete${NC}"
  return 2
}

# Wait for CI pipeline to complete
# Arguments: project_id [max_attempts] [poll_interval]
# Returns: 0 on success, 1 on failure, 2 on timeout
# On failure, automatically runs diagnostics
#
# Fast-fail behavior: Returns immediately on failure/error/killed states
# instead of waiting for timeout. This prevents "blind waiting" when
# the pipeline has already failed.
wait_for_pipeline() {
  local project_id="$1"
  local max_attempts="${2:-120}" # 10 minutes default
  local poll_interval="${3:-10}"
  local attempt=0
  local tracked_pipeline="" # Track specific pipeline once found
  local baseline_number=0
  local initial_result initial_status
  local result pipeline_count pipeline_number status
  local steps_response has_steps failed_count

  echo -e "${CYAN}Waiting for new CI pipeline...${NC}"

  # Record the current latest pipeline number BEFORE waiting
  # so we only track pipelines triggered AFTER this point.
  # Race condition guard: if the triggering step pushed fast enough that its
  # pipeline already appears as the latest, track that pipeline directly
  # instead of waiting for a newer one that will never come.
  initial_result=$(api_call GET "/projects/$project_id/pipelines" 2>/dev/null) || initial_result=""
  if jq -e '.data[0]' <<<"$initial_result" > /dev/null 2>&1; then
    baseline_number=$(jq -r '.data[0].number // 0' <<<"$initial_result")
    initial_status=$(jq -r '.data[0].status // "unknown"' <<<"$initial_result")

    # If the latest pipeline is already running or pending, it was triggered
    # by the preceding step — track it directly rather than waiting for a
    # newer one.
    if [[ "$initial_status" == "running" || "$initial_status" == "pending" || "$initial_status" == "started" ]]; then
      tracked_pipeline="$baseline_number"
      echo " Detected in-progress pipeline #$baseline_number (status: $initial_status) — tracking it"
    else
      echo " Baseline pipeline: #$baseline_number (status: $initial_status) — waiting for a newer one"
    fi
  fi

  while [[ $attempt -lt $max_attempts ]]; do
    # Transient request/parse failures are treated as "no pipelines yet"
    # instead of killing the script under `set -e`.
    result=$(api_call GET "/projects/$project_id/pipelines") || result=""

    # Check if we have any pipelines
    pipeline_count=$(jq '.data | length // 0' <<<"$result" 2>/dev/null) || pipeline_count=0
    if [[ "${pipeline_count:-0}" -eq 0 ]]; then
      echo " No pipelines yet... (attempt $((attempt + 1))/$max_attempts)"
      sleep "$poll_interval"
      attempt=$((attempt + 1))
      continue
    fi

    # Get latest pipeline number and status
    pipeline_number=$(jq -r '.data[0].number // 0' <<<"$result" 2>/dev/null) || pipeline_number=0
    status=$(jq -r '.data[0].status // "unknown"' <<<"$result" 2>/dev/null) || status="unknown"

    # Skip any pipeline that is not newer than our baseline.
    # Exception: if tracked_pipeline is already set (we detected an in-progress
    # pipeline at startup), bypass the baseline check and go straight to status.
    if [[ -z "$tracked_pipeline" && "$pipeline_number" -le "$baseline_number" ]]; then
      echo " Waiting for new pipeline (latest is #$pipeline_number, baseline #$baseline_number)... (attempt $((attempt + 1))/$max_attempts)"
      sleep "$poll_interval"
      attempt=$((attempt + 1))
      continue
    fi

    # A new pipeline exists — track it (if not already tracking)
    if [[ -z "$tracked_pipeline" ]]; then
      tracked_pipeline="$pipeline_number"
      echo " Tracking new pipeline #$tracked_pipeline"
    fi

    case "$status" in
      success)
        echo -e "${GREEN}✓ Pipeline #$pipeline_number completed successfully!${NC}"
        return 0
        ;;
      failure | error | killed)
        # FAST FAIL: Don't wait for timeout, fail immediately
        echo ""
        echo -e "${RED}✗ Pipeline #$pipeline_number failed (status: $status)${NC}"
        echo ""

        # Quick inline step summary before full diagnostics
        steps_response=$(api_call GET "/projects/$project_id/pipelines/$pipeline_number/steps" 2>/dev/null || echo '{}')
        has_steps=$(jq 'has("data")' <<<"$steps_response" 2>/dev/null || echo "false")

        if [[ "$has_steps" == "true" ]]; then
          # Show failed steps inline for quick diagnosis.
          # `// []` keeps a missing/null steps list from aborting the summary.
          failed_count=$(jq '[(.data.steps // [])[] | select(.status == "failure" or .status == "error" or .status == "killed")] | length' <<<"$steps_response" 2>/dev/null) || failed_count=0
          if [[ "${failed_count:-0}" -gt 0 ]]; then
            echo -e "${RED} Failed steps:${NC}"
            jq -r '(.data.steps // [])[] | select(.status == "failure" or .status == "error" or .status == "killed") | " ✗ \(.name): exit \(.exit_code // "?")"' <<<"$steps_response"
          fi
        fi

        # Full diagnostics
        diagnose_pipeline_failure "$project_id"
        return 1
        ;;
      running | pending)
        echo " Pipeline #$pipeline_number $status... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      *)
        echo " Pipeline #$pipeline_number status: $status (attempt $((attempt + 1))/$max_attempts)"
        ;;
    esac

    sleep "$poll_interval"
    # Not ((attempt++)): it returns 1 when attempt is 0 and trips `set -e`.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for new pipeline${NC}"

  # On timeout, still run diagnostics to help debug
  if [[ -n "$tracked_pipeline" ]]; then
    diagnose_pipeline_failure "$project_id"
  fi

  return 2
}

# Wait for site to be accessible
# Arguments: domain [max_attempts] [poll_interval] [project_id]
# Returns: 0 on success (HTTP 200), 1 on timeout
# On timeout, automatically runs diagnostics if project_id is provided
wait_for_site() {
  local domain="$1"
  local max_attempts="${2:-30}"
  local poll_interval="${3:-5}"
  local project_id="${4:-}"
  local attempt=0
  local last_http_code=""
  local http_code

  echo -e "${CYAN}Waiting for site to be accessible at https://$domain...${NC}"

  while [[ $attempt -lt $max_attempts ]]; do
    # curl itself prints "000" through -w when the request fails, so the
    # fallback must not print a second "000". --max-time keeps one hung
    # connection from stalling the whole poll loop.
    http_code=$(curl -s -o /dev/null --max-time "$API_TIMEOUT" -w "%{http_code}" "https://$domain" 2>/dev/null) || true
    http_code="${http_code:-000}"

    if [[ "$http_code" == "200" ]]; then
      echo -e "${GREEN}Site is live! (HTTP $http_code)${NC}"
      return 0
    fi

    # Only print status change or every 5th attempt to reduce noise
    if [[ "$http_code" != "$last_http_code" ]] || ((attempt % 5 == 0)); then
      echo " HTTP $http_code... (attempt $((attempt + 1))/$max_attempts)"
    fi
    last_http_code="$http_code"

    sleep "$poll_interval"
    # Not ((attempt++)): it returns 1 when attempt is 0 and trips `set -e`.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for site to respond (last: HTTP $last_http_code)${NC}"

  # Automatically diagnose if we have project_id
  if [[ -n "$project_id" ]]; then
    diagnose_site_failure "$domain" "$project_id"
  else
    echo ""
    echo " Tip: Pass project_id to wait_for_site for automatic diagnostics"
  fi

  return 1
}

# Print a section header
# Arguments: title
print_header() {
  local title="$1"
  echo ""
  echo -e "${BLUE}=== $title ===${NC}"
  echo ""
}

# Print success message
# Arguments: message
print_success() {
  echo -e "${GREEN}✓ $1${NC}"
}

# Print error message
# Arguments: message
print_error() {
  echo -e "${RED}✗ $1${NC}"
}

# Print warning message
# Arguments: message
print_warning() {
  echo -e "${YELLOW}⚠ $1${NC}"
}

# Print diagnostic section header
# Arguments: title
print_diagnostic_header() {
  local title="$1"
  echo ""
  echo -e "${CYAN}┌─────────────────────────────────────────────────────────────────┐${NC}"
  echo -e "${CYAN}│ DIAGNOSTIC: $title${NC}"
  echo -e "${CYAN}└─────────────────────────────────────────────────────────────────┘${NC}"
}

# Print a suggested fix
# Arguments: message
print_fix() {
  echo -e "${YELLOW} → FIX: $1${NC}"
}

# Print a command the user can run
# Arguments: command text
print_cmd() {
  echo -e "${BLUE} \$ $1${NC}"
}

# Get git owner from environment or default
# Outputs: $GITEA_DEFAULT_ORG, or "jordan" when it is unset or empty
get_git_owner() {
  echo "${GITEA_DEFAULT_ORG:-jordan}"
}

# Diagnose a failed pipeline - fetches details and prints actionable info
# Arguments: project_id
# Always inspects the LATEST pipeline of the project (.data[0]).
diagnose_pipeline_failure() {
  local project_id="$1"
  local git_owner
  local pipelines pipeline_number pipeline_status pipeline_errors commit_msg
  local error_count steps_response has_steps failed_steps

  git_owner=$(get_git_owner)

  print_diagnostic_header "Pipeline Failure Analysis"

  # Get the latest pipeline. Diagnostics are best-effort: fall back to an
  # empty object so a failed request cannot abort the caller under `set -e`.
  pipelines=$(api_call GET "/projects/$project_id/pipelines") || pipelines=""
  if [[ -z "$pipelines" ]]; then
    pipelines='{}'
  fi

  pipeline_number=$(jq -r '.data[0].number // "?"' <<<"$pipelines" 2>/dev/null) || pipeline_number="?"
  pipeline_status=$(jq -r '.data[0].status // "unknown"' <<<"$pipelines" 2>/dev/null) || pipeline_status="unknown"
  pipeline_errors=$(jq -r '.data[0].errors // []' <<<"$pipelines" 2>/dev/null) || pipeline_errors='[]'
  # Only the first line of the commit message is shown
  commit_msg=$(jq -r '.data[0].message // ""' <<<"$pipelines" 2>/dev/null | head -1) || commit_msg=""

  echo ""
  echo " Pipeline #$pipeline_number: $pipeline_status"
  echo " Commit: $commit_msg"

  # Show any pipeline-level errors (YAML validation, etc.)
  error_count=$(jq 'length // 0' <<<"$pipeline_errors" 2>/dev/null) || error_count=0
  if [[ "${error_count:-0}" -gt 0 ]]; then
    echo ""
    echo -e "${RED} Pipeline Errors:${NC}"
    jq -r '.[] | " - \(.type): \(.message)"' <<<"$pipeline_errors"
  fi

  # Try to get step details from the steps API (if available)
  steps_response=$(api_call GET "/projects/$project_id/pipelines/$pipeline_number/steps" 2>/dev/null || echo '{"error":"not available"}')
  has_steps=$(jq 'has("data")' <<<"$steps_response" 2>/dev/null || echo "false")

  if [[ "$has_steps" == "true" ]]; then
    echo ""
    echo " Steps:"
    # Format steps with status icons, duration, and exit code for failures.
    # `// []` tolerates a response whose data has no steps list.
    jq -r '
      (.data.steps // [])[]
      | (if .duration_seconds > 0 then " (\(.duration_seconds)s)" else "" end) as $dur
      | if .status == "failure" or .status == "error" or .status == "killed" then
          " \u001b[31m✗\u001b[0m \(.name): FAILED (exit \(.exit_code // "?"))\($dur)"
        elif .status == "success" then
          " \u001b[32m✓\u001b[0m \(.name): success\($dur)"
        elif .status == "running" then
          " \u001b[33m◐\u001b[0m \(.name): running..."
        elif .status == "pending" then
          " ○ \(.name): pending"
        elif .status == "skipped" then
          " ○ \(.name): skipped"
        else
          " ? \(.name): \(.status)"
        end
    ' <<<"$steps_response"

    # Show logs from failed steps
    failed_steps=$(jq -r '(.data.steps // [])[] | select(.status == "failure" or .status == "error" or .status == "killed")' <<<"$steps_response" 2>/dev/null) || failed_steps=""

    if [[ -n "$failed_steps" ]]; then
      echo ""
      echo -e "${RED} Failed Step Details:${NC}"
      # For each failed step, show error and the last 20 log lines
      jq -r '
        (.data.steps // [])[]
        | select(.status == "failure" or .status == "error" or .status == "killed")
        | "\n Step: \(.name)"
          + (if .error and .error != "" then "\n Error: \(.error)" else "" end)
          + (if .log and .log != "" then "\n Last lines of log:\n\(.log | split("\n") | .[-20:] | join("\n") | gsub("^"; " "))" else "" end)
      ' <<<"$steps_response"
    fi
  else
    echo ""
    echo -e "${YELLOW} Steps API not available - upgrade rdev-api for detailed step info${NC}"
  fi

  # Always provide direct links
  echo ""
  echo " View full logs:"
  print_cmd "open https://ci.threesix.ai/$git_owner/$project_id/$pipeline_number"

  # Pattern match common errors and suggest fixes
  echo ""
  diagnose_common_pipeline_errors "$project_id" "$pipeline_number"
}

# Print a checklist of common pipeline errors with suggested fixes
# Arguments: project_id [pipeline_number]
#   pipeline_number is accepted for call compatibility but currently unused.
diagnose_common_pipeline_errors() {
  local project_id="$1"

  echo " Common issues to check:"
  echo ""

  # Check 1: Missing K8s deployment (most common issue)
  echo " 1. Missing Kubernetes Deployment?"
  echo " The CI pipeline tries to 'kubectl set image' but deployment may not exist."
  print_cmd "kubectl get deployment -n projects -l app=$project_id"
  print_fix "Component may need initial deployment created"
  echo ""

  # Check 2: Docker build issues
  echo " 2. Docker Build Failed?"
  echo " Check if Dockerfile exists and workspace files are correct."
  print_cmd "Check the build step in Woodpecker UI for specific error"
  echo ""

  # Check 3: Registry auth
  echo " 3. Registry Push Failed?"
  echo " Kaniko may not have credentials to push to registry."
  print_cmd "kubectl get secret -n woodpecker-agents | grep registry"
}

# Diagnose why a site is not accessible
# Arguments: domain project_id
# Inspects pods, services, ingress and events in the 'projects' namespace via
# kubectl; returns early when kubectl is missing or the cluster query fails.
diagnose_site_failure() {
  local domain="$1"
  local project_id="$2"
  local pods svc ingress

  print_diagnostic_header "Site Accessibility Analysis"

  echo ""
  echo " Domain: https://$domain"
  echo " Project: $project_id"
  echo ""

  # Check if kubectl is available and configured
  if ! command -v kubectl &> /dev/null; then
    echo -e "${YELLOW} kubectl not found - cannot check K8s state${NC}"
    echo " Install kubectl and set KUBECONFIG to diagnose further"
    return
  fi

  if [[ -z "${KUBECONFIG:-}" ]]; then
    echo -e "${YELLOW} KUBECONFIG not set - trying default context${NC}"
  fi

  # Check pods
  echo " Checking pods in 'projects' namespace:"
  pods=$(kubectl get pods -n projects -l "app=$project_id" --no-headers 2>/dev/null || echo "ERROR")

  if [[ "$pods" == "ERROR" ]]; then
    echo -e "${RED} Failed to query K8s (check KUBECONFIG)${NC}"
    print_cmd "export KUBECONFIG=~/.kube/orchard9-k3sf.yaml"
    return
  elif [[ -z "$pods" ]]; then
    echo -e "${RED} No pods found for app=$project_id${NC}"
    print_fix "Deployment doesn't exist - CI may have failed or component needs initial deploy"
    print_cmd "kubectl get deployments -n projects"
  else
    sed 's/^/ /' <<<"$pods"

    # Check for common pod issues. Here-strings are used instead of
    # `echo | grep -q` so an early grep exit cannot turn into a SIGPIPE
    # failure under `pipefail`.
    if grep -q "ImagePullBackOff\|ErrImagePull" <<<"$pods"; then
      echo ""
      echo -e "${RED} Issue: ImagePullBackOff${NC}"
      print_fix "Image doesn't exist in registry - check CI build step"
      print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A5 'Events:'"
    fi

    if grep -q "CrashLoopBackOff" <<<"$pods"; then
      echo ""
      echo -e "${RED} Issue: CrashLoopBackOff${NC}"
      print_fix "Container is crashing - check application logs"
      print_cmd "kubectl logs -n projects -l app=$project_id --tail=50"
    fi

    if grep -q "Pending" <<<"$pods"; then
      echo ""
      echo -e "${RED} Issue: Pod stuck in Pending${NC}"
      print_fix "Likely resource constraints or scheduling issues"
      print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A10 'Events:'"
    fi

    if grep -q "0/1\|0/2" <<<"$pods"; then
      echo ""
      echo -e "${YELLOW} Issue: Container not ready${NC}"
      print_fix "Container may still be starting or failing health checks"
      print_cmd "kubectl logs -n projects -l app=$project_id --tail=20"
    fi
  fi

  # Check services
  echo ""
  echo " Checking services:"
  svc=$(kubectl get svc -n projects -l "app=$project_id" --no-headers 2>/dev/null || echo "")
  if [[ -z "$svc" ]]; then
    echo -e "${RED} No service found for app=$project_id${NC}"
    print_fix "Service needs to be created along with deployment"
  else
    sed 's/^/ /' <<<"$svc"
  fi

  # Check ingress
  echo ""
  echo " Checking ingress:"
  # -F: match project id and domain literally (dots in a domain are not
  # regex wildcards)
  ingress=$(kubectl get ingress -n projects --no-headers 2>/dev/null | grep -F -e "$project_id" -e "$domain" || echo "")
  if [[ -z "$ingress" ]]; then
    echo -e "${YELLOW} No ingress found matching $project_id or $domain${NC}"
  else
    sed 's/^/ /' <<<"$ingress"
  fi

  # Recent events
  echo ""
  echo " Recent events:"
  # With pipefail, a grep that matches nothing fails the pipeline and
  # triggers the fallback message.
  kubectl get events -n projects --sort-by='.lastTimestamp' 2>/dev/null \
    | grep -F -- "$project_id" \
    | tail -5 \
    | sed 's/^/ /' \
    || echo " No recent events"

  echo ""
  echo " Manual investigation commands:"
  print_cmd "kubectl logs -n projects -l app=$project_id -f"
  print_cmd "kubectl describe pod -n projects -l app=$project_id"
  print_cmd "kubectl get events -n projects --sort-by='.lastTimestamp' | tail -20"
}