rdev/cookbooks/scripts/common.sh
jordan 863dfd3214
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
fix: skip root deployment for empty template (defaults to skeleton)
When req.Template is empty, it defaults to 'skeleton' but the check
in createInitialDeployment only matched 'skeleton' explicitly, not
empty string. This caused a broken deployment to be created for
monorepo projects with a non-existent image.

Root cause: slackpath-5 creates project with empty template, which
defaults to skeleton, but createInitialDeployment was still creating
a root deployment that references registry.threesix.ai/{project}:latest
which never gets built (skeleton has no root Dockerfile).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-07 19:32:19 -07:00

561 lines
20 KiB
Bash
Executable File

#!/bin/bash
# Common utilities for rdev cookbook scripts
#
# Usage:
# source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
#
# Provides:
# - api_call() - Make authenticated API calls
# - wait_for_build() - Poll for build completion
# - wait_for_pipeline() - Poll for CI pipeline completion
# - wait_for_site() - Wait for site to respond
# - Colors for output
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# API endpoint and credentials.
# Both default to empty so that sourcing this file never fails; they are
# checked at runtime by preflight_check rather than on source, which allows
# commands like 'list' to work without credentials.
: "${RDEV_API_URL:=}"
: "${RDEV_API_KEY:=}"

# Auto-cleanup switch.
# Set AUTO_TEARDOWN=true to automatically clean up projects on exit.
: "${AUTO_TEARDOWN:=false}"

# Project to delete during auto-cleanup.
# Scripts should set this after successful project creation.
CLEANUP_PROJECT=""
# Cleanup handler for auto-teardown.
# Intended to run as a trap handler; deletes the tracked project when
# AUTO_TEARDOWN=true and CLEANUP_PROJECT is non-empty.
#
# Globals:  CLEANUP_PROJECT (read), AUTO_TEARDOWN (read),
#           CYAN, GREEN, YELLOW, NC (read; colour codes)
# Outputs:  progress on stdout; a warning on stderr if the delete call fails
# Exits:    always, with the status that was current when the handler started
cleanup_on_exit() {
  # Capture the status first: every command below overwrites $?.
  local exit_code=$?

  # This handler may be installed for EXIT as well as INT/TERM. Calling `exit`
  # from a signal trap fires the EXIT trap too, which would run the teardown a
  # second time - so drop the traps before doing any work.
  trap - EXIT INT TERM

  if [[ -n "$CLEANUP_PROJECT" && "$AUTO_TEARDOWN" == "true" ]]; then
    echo ""
    echo -e "${CYAN}Auto-teardown: Cleaning up $CLEANUP_PROJECT...${NC}"
    # Best effort: a failed delete must not change the script's exit status,
    # but it must not be reported as a success either.
    # NOTE(review): this uses "/project/..." (singular) while the other calls
    # in this file use "/projects/..." - confirm the endpoint with the API.
    if api_call DELETE "/project/$CLEANUP_PROJECT" > /dev/null 2>&1; then
      echo -e "${GREEN}✓ Project $CLEANUP_PROJECT deleted${NC}"
    else
      echo -e "${YELLOW}⚠ Could not delete project $CLEANUP_PROJECT (delete request failed)${NC}" >&2
    fi
  fi

  exit "$exit_code"
}
# Register the cleanup handler.
# Scripts should call this after sourcing common.sh if they want auto-cleanup.
#
# Only EXIT runs cleanup_on_exit. INT and TERM are turned into a plain `exit`
# with the conventional 128+signal status; that exit then fires the EXIT trap.
# Registering the handler directly for all three would run it twice on a
# signal (once for the signal, once for the EXIT caused by its own `exit`) and
# would make an interrupted script exit with whatever status the last command
# happened to have.
register_cleanup_trap() {
  trap cleanup_on_exit EXIT
  trap 'exit 130' INT   # 128 + SIGINT(2)
  trap 'exit 143' TERM  # 128 + SIGTERM(15)
}
# Strip --auto-teardown from an argument list.
#
# Arguments: the caller's arguments ("$@")
# Globals:   AUTO_TEARDOWN (set to "true" when the flag is present)
# Outputs:   the remaining arguments on stdout, joined by single spaces
#
# Usage: args=$(parse_auto_teardown_flag "$@")
#
# NOTE: inside $(...) this function runs in a subshell, so the AUTO_TEARDOWN
# assignment is NOT visible to the caller with the usage above. Call the
# function directly (e.g. redirecting stdout) when the flag must take effect
# in the current shell. Joining with spaces also loses the boundaries of
# arguments that themselves contain whitespace.
parse_auto_teardown_flag() {
  local -a remaining=()
  local arg  # keep the loop variable out of the caller's namespace

  for arg in "$@"; do
    if [[ "$arg" == "--auto-teardown" ]]; then
      AUTO_TEARDOWN="true"
    else
      remaining+=("$arg")
    fi
  done

  # printf instead of echo: echo would treat a leading -n/-e/-E argument as
  # its own option and drop it. "${remaining[*]-}" joins on the first IFS
  # character and stays safe under `set -u` when nothing is left.
  local IFS=' '
  printf '%s\n' "${remaining[*]-}"
}
# ANSI colour codes for output.
# The values are literal backslash sequences, so they only take effect when
# printed through something that interprets escapes (this file uses echo -e).
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
CYAN="\033[0;36m"
NC="\033[0m" # No Color

# Timeout, in seconds, applied to each API request.
# Defaults to 60; override by setting API_TIMEOUT in the environment.
: "${API_TIMEOUT:=60}"
# Send one authenticated request to the rdev API via curl.
#
# Arguments: method endpoint [data]
#   method    HTTP method passed to curl -X
#   endpoint  path appended to RDEV_API_URL
#   data      optional request body; when non-empty it is sent with a JSON
#             Content-Type header
# Globals:   RDEV_API_URL, RDEV_API_KEY, API_TIMEOUT (read)
# Outputs:   whatever curl writes to stdout (silent mode)
# Returns:   curl's exit status
#
# Example: api_call GET "/projects"
# Example: api_call POST "/projects" '{"name": "test"}'
api_call() {
  local http_method="$1"
  local path="$2"
  local payload="${3:-}"

  # Arguments shared by both request shapes.
  local -a request=(
    -s --max-time "$API_TIMEOUT"
    -X "$http_method" "$RDEV_API_URL$path"
    -H "X-API-Key: $RDEV_API_KEY"
  )

  # A body is only attached when the caller supplied one.
  if [[ -n "$payload" ]]; then
    request+=(-H "Content-Type: application/json" -d "$payload")
  fi

  curl "${request[@]}"
}
# Wait for a build to complete by polling GET /builds/<task_id>.
#
# Arguments: task_id [max_attempts] [poll_interval]
#   task_id        build task to poll
#   max_attempts   number of polls before giving up (default 60)
#   poll_interval  seconds to sleep between polls (default 5)
# Returns: 0 on success, 1 on failure, 2 on timeout
#
# A poll whose request or JSON parsing fails is reported as an unknown status
# and retried, rather than aborting the caller under `set -e`.
wait_for_build() {
  local task_id="$1"
  local max_attempts="${2:-60}"  # 5 minutes default (5s * 60)
  local poll_interval="${3:-5}"
  local attempt=0
  local result status success

  echo -e "${CYAN}Waiting for build to complete (task: $task_id)...${NC}"

  while [[ $attempt -lt $max_attempts ]]; do
    # Tolerate transient request/parse errors: keep polling until the budget
    # of attempts is spent.
    result=$(api_call GET "/builds/$task_id") || result=""
    status=$(printf '%s\n' "$result" | jq -r '.status // .data.status // "unknown"') || status=""
    status="${status:-unknown}"

    case "$status" in
      completed)
        # "completed" only means the build finished; the outcome is in
        # result.success.
        success=$(printf '%s\n' "$result" | jq -r '.result.success // .data.result.success // false') || success="false"
        if [[ "$success" == "true" ]]; then
          echo -e "${GREEN}Build completed successfully!${NC}"
          printf '%s\n' "$result" | jq '.result // .data.result' || true
          return 0
        else
          echo -e "${RED}Build completed but failed:${NC}"
          printf '%s\n' "$result" | jq '.result // .data.result' || true
          return 1
        fi
        ;;
      failed)
        echo -e "${RED}Build failed:${NC}"
        printf '%s\n' "$result" | jq '.' || true
        return 1
        ;;
      running)
        echo " Build running... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      pending)
        echo " Build pending... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      *)
        echo " Unknown status: $status (attempt $((attempt + 1))/$max_attempts)"
        ;;
    esac

    sleep "$poll_interval"
    # Not ((attempt++)): that expression evaluates to 0 on the first pass,
    # returns status 1, and terminates the script under `set -e`.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for build to complete${NC}"
  return 2
}
# Wait for CI pipeline to complete by polling the project's pipeline list.
#
# Arguments: project_id [max_attempts] [poll_interval]
#   project_id     project whose pipelines are polled
#   max_attempts   number of polls before giving up (default 60)
#   poll_interval  seconds to sleep between polls (default 5)
# Returns: 0 on success, 1 on failure, 2 on timeout
# On failure, automatically runs diagnostics
#
# Fast-fail behavior: Returns immediately on failure/error/killed states
# instead of waiting for timeout. This prevents "blind waiting" when
# the pipeline has already failed.
#
# A poll whose request or JSON parsing fails is treated as "no data yet" and
# retried instead of aborting the caller under `set -e`.
#
# NOTE(review): the status is always read from .data[0]; tracked_pipeline only
# records the first number seen. If a newer pipeline is listed first later on,
# the loop follows that one - confirm this is intended.
wait_for_pipeline() {
  local project_id="$1"
  local max_attempts="${2:-60}"  # 5 minutes default
  local poll_interval="${3:-5}"
  local attempt=0
  local tracked_pipeline=""  # Track specific pipeline once found
  local result pipeline_count status pipeline_number
  local steps_response has_steps failed_count

  echo -e "${CYAN}Waiting for CI pipeline...${NC}"

  # Wait a bit for pipeline to be created
  sleep 5

  while [[ $attempt -lt $max_attempts ]]; do
    result=$(api_call GET "/projects/$project_id/pipelines") || result=""

    # Check if we have any pipelines
    pipeline_count=$(printf '%s\n' "$result" | jq '.data | length // 0') || pipeline_count=0
    if [[ "${pipeline_count:-0}" -eq 0 ]]; then
      echo " No pipelines yet... (attempt $((attempt + 1))/$max_attempts)"
      sleep "$poll_interval"
      # Not ((attempt++)): it returns status 1 when attempt is 0 and would
      # terminate the script under `set -e`.
      attempt=$((attempt + 1))
      continue
    fi

    # Get latest pipeline status
    status=$(printf '%s\n' "$result" | jq -r '.data[0].status // "unknown"') || status="unknown"
    pipeline_number=$(printf '%s\n' "$result" | jq -r '.data[0].number // "?"') || pipeline_number="?"

    # Track the pipeline we're monitoring
    if [[ -z "$tracked_pipeline" ]]; then
      tracked_pipeline="$pipeline_number"
      echo " Tracking pipeline #$pipeline_number"
    fi

    case "$status" in
      success)
        echo -e "${GREEN}✓ Pipeline #$pipeline_number completed successfully!${NC}"
        return 0
        ;;
      failure|error|killed)
        # FAST FAIL: Don't wait for timeout, fail immediately
        echo ""
        echo -e "${RED}✗ Pipeline #$pipeline_number failed (status: $status)${NC}"
        echo ""

        # Quick inline step summary before full diagnostics
        steps_response=$(api_call GET "/projects/$project_id/pipelines/$pipeline_number/steps" 2>/dev/null || echo '{}')
        has_steps=$(printf '%s\n' "$steps_response" | jq 'has("data")' 2>/dev/null || echo "false")
        if [[ "$has_steps" == "true" ]]; then
          # Show failed steps inline for quick diagnosis. The summary is
          # optional, so a malformed steps payload must not stop us from
          # reaching the full diagnostics below.
          failed_count=$(printf '%s\n' "$steps_response" | jq '[.data.steps[] | select(.status == "failure" or .status == "error" or .status == "killed")] | length') || failed_count=0
          if [[ "${failed_count:-0}" -gt 0 ]]; then
            echo -e "${RED} Failed steps:${NC}"
            printf '%s\n' "$steps_response" | jq -r '.data.steps[] | select(.status == "failure" or .status == "error" or .status == "killed") | " ✗ \(.name): exit \(.exit_code // "?")"' || true
          fi
        fi

        # Full diagnostics (best effort; the documented return code wins)
        diagnose_pipeline_failure "$project_id" || true
        return 1
        ;;
      running|pending)
        echo " Pipeline #$pipeline_number $status... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      *)
        echo " Pipeline #$pipeline_number status: $status (attempt $((attempt + 1))/$max_attempts)"
        ;;
    esac

    sleep "$poll_interval"
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for pipeline to complete${NC}"

  # On timeout, still run diagnostics to help debug
  if [[ -n "$tracked_pipeline" ]]; then
    diagnose_pipeline_failure "$project_id" || true
  fi
  return 2
}
# Wait for site to be accessible (HTTP 200 from https://<domain>).
#
# Arguments: domain [max_attempts] [poll_interval] [project_id]
#   domain         host name to probe over https
#   max_attempts   number of probes before giving up (default 30)
#   poll_interval  seconds to sleep between probes (default 5)
#   project_id     optional; enables automatic diagnostics on timeout
# Globals:   API_TIMEOUT (read; per-probe time limit, 60s when unset)
# Returns: 0 on success, 1 on timeout
# On timeout, automatically runs diagnostics if project_id is provided
wait_for_site() {
  local domain="$1"
  local max_attempts="${2:-30}"
  local poll_interval="${3:-5}"
  local project_id="${4:-}"
  local attempt=0
  local last_http_code=""
  local http_code

  echo -e "${CYAN}Waiting for site to be accessible at https://$domain...${NC}"

  while [[ $attempt -lt $max_attempts ]]; do
    # --max-time bounds each probe so a hung connection cannot defeat
    # max_attempts. On failure curl has already written "000" through -w, so
    # the fallback replaces the value instead of appending a second "000".
    http_code=$(curl -s -o /dev/null --max-time "${API_TIMEOUT:-60}" \
      -w "%{http_code}" "https://$domain" 2>/dev/null) || http_code="000"

    if [[ "$http_code" == "200" ]]; then
      echo -e "${GREEN}Site is live! (HTTP $http_code)${NC}"
      return 0
    fi

    # Only print status change or every 5th attempt to reduce noise
    if [[ "$http_code" != "$last_http_code" ]] || (( attempt % 5 == 0 )); then
      echo " HTTP $http_code... (attempt $((attempt + 1))/$max_attempts)"
    fi
    last_http_code="$http_code"

    sleep "$poll_interval"
    # Not ((attempt++)): it returns status 1 when attempt is 0 and would
    # terminate the script under `set -e`.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for site to respond (last: HTTP $last_http_code)${NC}"

  # Automatically diagnose if we have project_id
  if [[ -n "$project_id" ]]; then
    # Best effort: diagnostics must not change the documented return code.
    diagnose_site_failure "$domain" "$project_id" || true
  else
    echo ""
    echo " Tip: Pass project_id to wait_for_site for automatic diagnostics"
  fi
  return 1
}
# Print a section header: the title in blue between "===" markers, with one
# blank line before and one after.
# Arguments: title
print_header() {
  local heading="$1"

  printf '\n'
  echo -e "${BLUE}=== ${heading} ===${NC}"
  printf '\n'
}
# Print a success message in green.
# Arguments: message (backslash escapes are interpreted by echo -e)
print_success() {
  local message="$1"
  echo -e "${GREEN}${message}${NC}"
}
# Print an error message in red (written to stdout).
# Arguments: message (backslash escapes are interpreted by echo -e)
print_error() {
  local message="$1"
  echo -e "${RED}${message}${NC}"
}
# Print a warning message in yellow.
# Arguments: message (backslash escapes are interpreted by echo -e)
print_warning() {
  local message="$1"
  echo -e "${YELLOW}${message}${NC}"
}
# Print a diagnostic section header: a blank line followed by a cyan box
# whose middle row reads "DIAGNOSTIC: <title>".
# Arguments: title
print_diagnostic_header() {
  local heading="$1"
  local top_edge="┌─────────────────────────────────────────────────────────────────┐"
  local bottom_edge="└─────────────────────────────────────────────────────────────────┘"

  printf '\n'
  echo -e "${CYAN}${top_edge}${NC}"
  echo -e "${CYAN}│ DIAGNOSTIC: ${heading}${NC}"
  echo -e "${CYAN}${bottom_edge}${NC}"
}
# Print a suggested fix in yellow, prefixed with "→ FIX:".
# Arguments: suggestion text
print_fix() {
  local suggestion="$1"
  echo -e "${YELLOW} → FIX: ${suggestion}${NC}"
}
# Print a command the user can run, in blue, prefixed with a "$" prompt.
# The command is only displayed, never executed.
# Arguments: command text
print_cmd() {
  local command_text="$1"
  echo -e "${BLUE} \$ ${command_text}${NC}"
}
# Print the git owner/organisation name.
# Uses GITEA_DEFAULT_ORG when it is set and non-empty, otherwise "jordan".
get_git_owner() {
  local owner="${GITEA_DEFAULT_ORG:-}"

  if [[ -z "$owner" ]]; then
    owner="jordan"
  fi
  echo "$owner"
}
# Diagnose a failed pipeline - fetches details and prints actionable info
#
# Re-fetches the project's pipeline list, reports on the first entry
# (.data[0]), lists its steps when the steps endpoint returns a "data" key,
# prints a link to the CI UI and finishes with generic troubleshooting hints.
#
# Arguments: project_id
# Outputs:   human-readable report on stdout
#
# NOTE(review): presumably the API lists pipelines newest-first, so .data[0]
# is the latest one - confirm against the API.
diagnose_pipeline_failure() {
local project_id="$1"
local git_owner
git_owner=$(get_git_owner)
print_diagnostic_header "Pipeline Failure Analysis"
# Get the latest pipeline
local pipelines
pipelines=$(api_call GET "/projects/$project_id/pipelines")
# Each field falls back to a placeholder when it is missing from the response
local pipeline_number
pipeline_number=$(echo "$pipelines" | jq -r '.data[0].number // "?"')
local pipeline_status
pipeline_status=$(echo "$pipelines" | jq -r '.data[0].status // "unknown"')
local pipeline_errors
pipeline_errors=$(echo "$pipelines" | jq -r '.data[0].errors // []')
# Only the first line of the commit message is shown
local commit_msg
commit_msg=$(echo "$pipelines" | jq -r '.data[0].message // ""' | head -1)
echo ""
echo " Pipeline #$pipeline_number: $pipeline_status"
echo " Commit: $commit_msg"
# Show any pipeline-level errors (YAML validation, etc.)
local error_count
error_count=$(echo "$pipeline_errors" | jq 'length // 0')
if [[ "$error_count" -gt 0 ]]; then
echo ""
echo -e "${RED} Pipeline Errors:${NC}"
echo "$pipeline_errors" | jq -r '.[] | " - \(.type): \(.message)"'
fi
# Try to get step details from the steps API (if available)
# A failed request is replaced by a JSON object without "data", which routes
# execution to the "Steps API not available" branch below.
local steps_response
steps_response=$(api_call GET "/projects/$project_id/pipelines/$pipeline_number/steps" 2>/dev/null || echo '{"error":"not available"}')
local has_steps
has_steps=$(echo "$steps_response" | jq 'has("data")' 2>/dev/null || echo "false")
if [[ "$has_steps" == "true" ]]; then
echo ""
echo " Steps:"
# Format steps with status icons, duration, and exit code for failures
# (the \u001b[..m sequences are ANSI colours emitted by jq itself)
echo "$steps_response" | jq -r '.data.steps[] |
(if .duration_seconds > 0 then " (\(.duration_seconds)s)" else "" end) as $dur |
if .status == "failure" or .status == "error" or .status == "killed" then
" \u001b[31m✗\u001b[0m \(.name): FAILED (exit \(.exit_code // "?"))\($dur)"
elif .status == "success" then
" \u001b[32m✓\u001b[0m \(.name): success\($dur)"
elif .status == "running" then
" \u001b[33m◐\u001b[0m \(.name): running..."
elif .status == "pending" then
" ○ \(.name): pending"
elif .status == "skipped" then
" ○ \(.name): skipped"
else
" ? \(.name): \(.status)"
end'
# Show logs from failed steps
# failed_steps is only used as a non-empty check; the details are re-queried
local failed_steps
failed_steps=$(echo "$steps_response" | jq -r '.data.steps[] | select(.status == "failure" or .status == "error" or .status == "killed")')
if [[ -n "$failed_steps" ]]; then
echo ""
echo -e "${RED} Failed Step Details:${NC}"
# For each failed step, show error and log
# (.error and .log are printed only when present and non-empty; the log is
# cut down to its last 20 lines)
echo "$steps_response" | jq -r '.data.steps[] | select(.status == "failure" or .status == "error" or .status == "killed") |
"\n Step: \(.name)" +
(if .error and .error != "" then "\n Error: \(.error)" else "" end) +
(if .log and .log != "" then "\n Last lines of log:\n\(.log | split("\n") | .[-20:] | join("\n") | gsub("^"; " "))" else "" end)'
fi
else
echo ""
echo -e "${YELLOW} Steps API not available - upgrade rdev-api for detailed step info${NC}"
fi
# Always provide direct links
echo ""
echo " View full logs:"
print_cmd "open https://ci.threesix.ai/$git_owner/$project_id/$pipeline_number"
# Pattern match common errors and suggest fixes
echo ""
diagnose_common_pipeline_errors "$project_id" "$pipeline_number"
}
# Print a fixed checklist of common pipeline failure causes, each with a
# command to investigate it. No matching is done against the actual pipeline
# output: the same three hints are always printed.
# Arguments: project_id pipeline_number
diagnose_common_pipeline_errors() {
  local project_id="$1"
  local pipeline_number="$2"  # accepted from the caller; not used below

  printf '%s\n' " Common issues to check:" ""

  # Check 1: Missing K8s deployment (most common issue)
  printf '%s\n' \
    " 1. Missing Kubernetes Deployment?" \
    " The CI pipeline tries to 'kubectl set image' but deployment may not exist."
  print_cmd "kubectl get deployment -n projects -l app=$project_id"
  print_fix "Component may need initial deployment created"
  printf '\n'

  # Check 2: Docker build issues
  printf '%s\n' \
    " 2. Docker Build Failed?" \
    " Check if Dockerfile exists and workspace files are correct."
  print_cmd "Check the build step in Woodpecker UI for specific error"
  printf '\n'

  # Check 3: Registry auth
  printf '%s\n' \
    " 3. Registry Push Failed?" \
    " Kaniko may not have credentials to push to registry."
  print_cmd "kubectl get secret -n woodpecker-agents | grep registry"
}
# Diagnose why a site is not accessible
#
# Uses kubectl to inspect pods, services, ingress and recent events in the
# 'projects' namespace for resources labelled app=<project_id>, printing a
# likely cause and a follow-up command for each problem it recognises.
# Returns early when kubectl is missing or the pod query fails.
#
# Arguments: domain project_id
# Outputs:   human-readable report on stdout
diagnose_site_failure() {
local domain="$1"
local project_id="$2"
print_diagnostic_header "Site Accessibility Analysis"
echo ""
echo " Domain: https://$domain"
echo " Project: $project_id"
echo ""
# Check if kubectl is available and configured
if ! command -v kubectl &> /dev/null; then
echo -e "${YELLOW} kubectl not found - cannot check K8s state${NC}"
echo " Install kubectl and set KUBECONFIG to diagnose further"
return
fi
# A missing KUBECONFIG is only a warning: kubectl is still tried
if [[ -z "${KUBECONFIG:-}" ]]; then
echo -e "${YELLOW} KUBECONFIG not set - trying default context${NC}"
fi
# Check pods
echo " Checking pods in 'projects' namespace:"
# "ERROR" is a sentinel that distinguishes a failed query from an empty result
local pods
pods=$(kubectl get pods -n projects -l "app=$project_id" --no-headers 2>/dev/null || echo "ERROR")
if [[ "$pods" == "ERROR" ]]; then
echo -e "${RED} Failed to query K8s (check KUBECONFIG)${NC}"
print_cmd "export KUBECONFIG=~/.kube/orchard9-k3sf.yaml"
return
elif [[ -z "$pods" ]]; then
echo -e "${RED} No pods found for app=$project_id${NC}"
print_fix "Deployment doesn't exist - CI may have failed or component needs initial deploy"
print_cmd "kubectl get deployments -n projects"
else
echo "$pods" | sed 's/^/ /'
# Check for common pod issues
# The checks below are independent text matches on the pod listing, so more
# than one issue can be reported for the same output.
if echo "$pods" | grep -q "ImagePullBackOff\|ErrImagePull"; then
echo ""
echo -e "${RED} Issue: ImagePullBackOff${NC}"
print_fix "Image doesn't exist in registry - check CI build step"
print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A5 'Events:'"
fi
if echo "$pods" | grep -q "CrashLoopBackOff"; then
echo ""
echo -e "${RED} Issue: CrashLoopBackOff${NC}"
print_fix "Container is crashing - check application logs"
print_cmd "kubectl logs -n projects -l app=$project_id --tail=50"
fi
if echo "$pods" | grep -q "Pending"; then
echo ""
echo -e "${RED} Issue: Pod stuck in Pending${NC}"
print_fix "Likely resource constraints or scheduling issues"
print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A10 'Events:'"
fi
# Matches the literal text 0/1 or 0/2 anywhere in the listing
# (presumably the READY column - TODO confirm)
if echo "$pods" | grep -q "0/1\|0/2"; then
echo ""
echo -e "${YELLOW} Issue: Container not ready${NC}"
print_fix "Container may still be starting or failing health checks"
print_cmd "kubectl logs -n projects -l app=$project_id --tail=20"
fi
fi
# Check services
echo ""
echo " Checking services:"
# Unlike the pod query, a failed service query is treated the same as "none found"
local svc
svc=$(kubectl get svc -n projects -l "app=$project_id" --no-headers 2>/dev/null || echo "")
if [[ -z "$svc" ]]; then
echo -e "${RED} No service found for app=$project_id${NC}"
print_fix "Service needs to be created along with deployment"
else
echo "$svc" | sed 's/^/ /'
fi
# Check ingress
echo ""
echo " Checking ingress:"
# Ingresses are filtered by text match rather than by label; project_id and
# domain are used as grep patterns, so characters such as '.' act as regex
# metacharacters.
local ingress
ingress=$(kubectl get ingress -n projects --no-headers 2>/dev/null | grep "$project_id\|$domain" || echo "")
if [[ -z "$ingress" ]]; then
echo -e "${YELLOW} No ingress found matching $project_id or $domain${NC}"
else
echo "$ingress" | sed 's/^/ /'
fi
# Recent events
echo ""
echo " Recent events:"
# The fallback message relies on 'set -o pipefail' (enabled at the top of this
# file): without it the pipeline's status would be sed's, and a grep that
# matched nothing would not trigger the echo.
kubectl get events -n projects --sort-by='.lastTimestamp' 2>/dev/null | grep "$project_id" | tail -5 | sed 's/^/ /' || echo " No recent events"
echo ""
echo " Manual investigation commands:"
print_cmd "kubectl logs -n projects -l app=$project_id -f"
print_cmd "kubectl describe pod -n projects -l app=$project_id"
print_cmd "kubectl get events -n projects --sort-by='.lastTimestamp' | tail -20"
}