rdev/cookbooks/scripts/common.sh

481 lines
16 KiB
Bash
Executable File

#!/bin/bash
# Common utilities for rdev cookbook scripts
#
# Usage:
# source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
#
# Required environment (sourcing aborts with a message if unset or empty):
# - RDEV_API_URL - prefix that api_call() puts in front of every endpoint
# - RDEV_API_KEY - value api_call() sends in the X-API-Key header
#
# Optional environment:
# - GITEA_DEFAULT_ORG - git owner used by get_git_owner() (default "jordan")
# - KUBECONFIG - diagnose_site_failure() prints a notice when it is unset
#
# External tools used by the functions below: curl, jq, and kubectl
# (kubectl only for site diagnostics, which are skipped when it is missing)
#
# Provides:
# - api_call() - Make authenticated API calls
# - wait_for_build() - Poll for build completion
# - wait_for_pipeline() - Poll for CI pipeline completion
# - wait_for_site() - Wait for site to respond with HTTP 200
# - diagnose_pipeline_failure() - Print details about the latest pipeline
# - diagnose_common_pipeline_errors() - Print a checklist of common CI issues
# - diagnose_site_failure() - Inspect pods/services/ingress via kubectl
# - get_git_owner() - Print the git owner used in CI links
# - print_header(), print_success(), print_error(), print_warning(),
#   print_diagnostic_header(), print_fix(), print_cmd() - Output helpers
# - Colors for output
# Strict mode: exit on error, on unset variables, and on failed pipeline
# stages. This file is sourced, so these options also apply to the
# sourcing script.
set -euo pipefail
# Require environment variables (":?" also rejects empty values)
: "${RDEV_API_URL:?RDEV_API_URL must be set}"
: "${RDEV_API_KEY:?RDEV_API_KEY must be set}"
# Colors for output
# Stored as literal backslash sequences; the helpers below print them with
# `echo -e`, which turns them into ANSI escape codes.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# Issue an authenticated request against the rdev API and write the raw
# response body to stdout.
# Globals:   RDEV_API_URL (read), RDEV_API_KEY (read)
# Arguments: method endpoint [data]
#   method   - HTTP verb handed to curl -X
#   endpoint - path appended verbatim to RDEV_API_URL
#   data     - optional request body; when non-empty it is sent together
#              with a JSON Content-Type header
# Returns:   curl's exit status
# Example: api_call GET "/projects"
# Example: api_call POST "/projects" '{"name": "test"}'
api_call() {
  local method="$1"
  local endpoint="$2"
  local data="${3:-}"

  # Arguments shared by every request; body-specific ones are appended below.
  local -a curl_args=(
    -s
    -X "$method"
    "$RDEV_API_URL$endpoint"
    -H "X-API-Key: $RDEV_API_KEY"
  )

  if [[ -n "$data" ]]; then
    curl_args+=(-H "Content-Type: application/json" -d "$data")
  fi

  curl "${curl_args[@]}"
}
# Poll the build endpoint until the build finishes or the attempts run out.
# Arguments: task_id [max_attempts] [poll_interval]
#   task_id       - build task identifier, queried at /builds/<task_id>
#   max_attempts  - number of polls before giving up (default 60)
#   poll_interval - seconds to sleep between polls (default 5)
# Outputs:   progress lines and the build result JSON on stdout
# Returns:   0 on success, 1 on failure, 2 on timeout
wait_for_build() {
  local task_id="$1"
  local max_attempts="${2:-60}" # 5 minutes default (5s * 60)
  local poll_interval="${3:-5}"
  local attempt=0
  local result status success

  echo -e "${CYAN}Waiting for build to complete (task: $task_id)...${NC}"

  while [[ $attempt -lt $max_attempts ]]; do
    # A failed request or an unparsable body is treated as an unknown status
    # so that one transient error does not abort the caller under `set -e`.
    result=$(api_call GET "/builds/$task_id") || result=""
    status=$(echo "$result" | jq -r '.status // .data.status // "unknown"') || status=""
    status="${status:-unknown}"

    case "$status" in
      completed)
        success=$(echo "$result" | jq -r '.result.success // .data.result.success // false') || success="false"
        if [[ "$success" == "true" ]]; then
          echo -e "${GREEN}Build completed successfully!${NC}"
          # Pretty-printing is informational; never let it change the result.
          echo "$result" | jq '.result // .data.result' || true
          return 0
        else
          echo -e "${RED}Build completed but failed:${NC}"
          echo "$result" | jq '.result // .data.result' || true
          return 1
        fi
        ;;
      failed)
        echo -e "${RED}Build failed:${NC}"
        echo "$result" | jq '.' || true
        return 1
        ;;
      running)
        echo " Build running... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      pending)
        echo " Build pending... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      *)
        echo " Unknown status: $status (attempt $((attempt + 1))/$max_attempts)"
        ;;
    esac

    sleep "$poll_interval"
    # Not ((attempt++)): that form returns status 1 when attempt is 0, which
    # terminates the script under `set -e` after the very first poll.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for build to complete${NC}"
  return 2
}
# Poll the project's pipeline list until the newest pipeline finishes or the
# attempts run out.
# Arguments: project_id [max_attempts] [poll_interval]
#   project_id    - project whose pipelines are queried
#   max_attempts  - number of polls before giving up (default 60)
#   poll_interval - seconds to sleep between polls (default 5)
# Outputs:   progress lines on stdout
# Returns:   0 on success, 1 on failure, 2 on timeout
# On failure, automatically runs diagnostics
wait_for_pipeline() {
  local project_id="$1"
  local max_attempts="${2:-60}" # 5 minutes default
  local poll_interval="${3:-5}"
  local attempt=0
  local result pipeline_count status pipeline_number

  echo -e "${CYAN}Waiting for CI pipeline...${NC}"
  # Wait a bit for pipeline to be created
  sleep 5

  while [[ $attempt -lt $max_attempts ]]; do
    # A failed request or unparsable body counts as "no pipelines yet" so a
    # transient error does not abort the caller under `set -e`.
    result=$(api_call GET "/projects/$project_id/pipelines") || result=""
    pipeline_count=$(echo "$result" | jq '.data | length // 0') || pipeline_count=0

    if [[ "${pipeline_count:-0}" -eq 0 ]]; then
      echo " No pipelines yet... (attempt $((attempt + 1))/$max_attempts)"
    else
      # Get latest pipeline status (first element of .data)
      status=$(echo "$result" | jq -r '.data[0].status // "unknown"') || status="unknown"
      pipeline_number=$(echo "$result" | jq -r '.data[0].number // "?"') || pipeline_number="?"

      case "$status" in
        success)
          echo -e "${GREEN}Pipeline #$pipeline_number completed successfully!${NC}"
          return 0
          ;;
        failure|error|killed)
          echo -e "${RED}Pipeline #$pipeline_number failed with status: $status${NC}"
          # Diagnostics are best-effort: a problem while collecting them must
          # not replace the documented return code.
          diagnose_pipeline_failure "$project_id" || true
          return 1
          ;;
        running|pending)
          echo " Pipeline #$pipeline_number $status... (attempt $((attempt + 1))/$max_attempts)"
          ;;
        *)
          echo " Pipeline #$pipeline_number status: $status (attempt $((attempt + 1))/$max_attempts)"
          ;;
      esac
    fi

    sleep "$poll_interval"
    # Not ((attempt++)): that form returns status 1 when attempt is 0, which
    # terminates the script under `set -e` after the very first poll.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for pipeline to complete${NC}"
  return 2
}
# Poll https://<domain> until it answers with HTTP 200 or the attempts run
# out.
# Arguments: domain [max_attempts] [poll_interval] [project_id]
#   domain        - host name, requested as https://<domain>
#   max_attempts  - number of polls before giving up (default 30)
#   poll_interval - seconds to sleep between polls (default 5)
#   project_id    - optional; enables automatic diagnostics on timeout
# Outputs:   progress lines on stdout
# Returns:   0 on success, 1 on timeout
# On timeout, automatically runs diagnostics if project_id is provided
wait_for_site() {
  local domain="$1"
  local max_attempts="${2:-30}"
  local poll_interval="${3:-5}"
  local project_id="${4:-}"
  local attempt=0
  local http_code=""
  local last_http_code=""

  echo -e "${CYAN}Waiting for site to be accessible at https://$domain...${NC}"

  while [[ $attempt -lt $max_attempts ]]; do
    # On a failed transfer curl still prints "000" through -w, so appending
    # another "000" would produce "000000". Keep whatever curl printed and
    # fall back to "000" only when it printed nothing at all.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" "https://$domain" 2>/dev/null) || true
    http_code="${http_code:-000}"

    if [[ "$http_code" == "200" ]]; then
      echo -e "${GREEN}Site is live! (HTTP $http_code)${NC}"
      return 0
    fi

    # Only print status change or every 5th attempt to reduce noise
    if [[ "$http_code" != "$last_http_code" ]] || (( attempt % 5 == 0 )); then
      echo " HTTP $http_code... (attempt $((attempt + 1))/$max_attempts)"
    fi
    last_http_code="$http_code"

    sleep "$poll_interval"
    # Not ((attempt++)): that form returns status 1 when attempt is 0, which
    # terminates the script under `set -e` after the very first poll.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for site to respond (last: HTTP $last_http_code)${NC}"

  # Automatically diagnose if we have project_id
  if [[ -n "$project_id" ]]; then
    # Best-effort: a failure while diagnosing must not replace the timeout
    # return code.
    diagnose_site_failure "$domain" "$project_id" || true
  else
    echo ""
    echo " Tip: Pass project_id to wait_for_site for automatic diagnostics"
  fi
  return 1
}
# Emit a blue "=== title ===" banner with a blank line above and below.
# Globals:   BLUE (read), NC (read)
# Arguments: title
print_header() {
  local heading="$1"

  printf '\n'
  # %b expands backslash escapes the same way `echo -e` does
  printf '%b\n' "${BLUE}=== ${heading} ===${NC}"
  printf '\n'
}
# Write a message in green, followed by a newline.
# Globals:   GREEN (read), NC (read)
# Arguments: message
print_success() {
  # %b expands backslash escapes the same way `echo -e` does
  printf '%b\n' "${GREEN}${1}${NC}"
}
# Write a message in red, followed by a newline.
# Globals:   RED (read), NC (read)
# Arguments: message
print_error() {
  # %b expands backslash escapes the same way `echo -e` does
  printf '%b\n' "${RED}${1}${NC}"
}
# Write a message in yellow, followed by a newline.
# Globals:   YELLOW (read), NC (read)
# Arguments: message
print_warning() {
  # %b expands backslash escapes the same way `echo -e` does
  printf '%b\n' "${YELLOW}${1}${NC}"
}
# Emit a blank line followed by a cyan box-drawing banner that reads
# "DIAGNOSTIC: <title>".
# Globals:   CYAN (read), NC (read)
# Arguments: title
print_diagnostic_header() {
  local heading="$1"

  printf '\n'
  # One printf per line; %b expands backslash escapes like `echo -e`
  printf '%b\n' "${CYAN}┌─────────────────────────────────────────────────────────────────┐${NC}"
  printf '%b\n' "${CYAN}│ DIAGNOSTIC: ${heading}${NC}"
  printf '%b\n' "${CYAN}└─────────────────────────────────────────────────────────────────┘${NC}"
}
# Write a yellow "→ FIX: <text>" hint line.
# Globals:   YELLOW (read), NC (read)
# Arguments: text describing the suggested fix
print_fix() {
  # %b expands backslash escapes the same way `echo -e` does
  printf '%b\n' "${YELLOW} → FIX: ${1}${NC}"
}
# Write a blue "$ <command>" line showing a command the user can run.
# Globals:   BLUE (read), NC (read)
# Arguments: command text (printed, never executed)
print_cmd() {
  # %b expands backslash escapes the same way `echo -e` does
  printf '%b\n' "${BLUE} \$ ${1}${NC}"
}
# Print the git owner used when building CI links.
# Globals:   GITEA_DEFAULT_ORG (read, optional)
# Outputs:   GITEA_DEFAULT_ORG when it is set and non-empty, else "jordan"
get_git_owner() {
  local owner="jordan"

  if [[ -n "${GITEA_DEFAULT_ORG:-}" ]]; then
    owner="$GITEA_DEFAULT_ORG"
  fi

  echo "$owner"
}
# Diagnose a failed pipeline - fetches details and prints actionable info
# Looks at the first entry returned by /projects/<project_id>/pipelines,
# prints its number, status, first commit-message line and any pipeline-level
# errors, then (when the steps endpoint answers with a "data" key) a per-step
# summary plus the last 20 log lines of each failed step.
# Arguments: project_id
# Outputs:   diagnostics on stdout
diagnose_pipeline_failure() {
  local project_id="$1"
  local git_owner
  git_owner=$(get_git_owner)

  print_diagnostic_header "Pipeline Failure Analysis"

  # Get the latest pipeline
  local pipelines
  pipelines=$(api_call GET "/projects/$project_id/pipelines")
  local pipeline_number
  pipeline_number=$(echo "$pipelines" | jq -r '.data[0].number // "?"')
  local pipeline_status
  pipeline_status=$(echo "$pipelines" | jq -r '.data[0].status // "unknown"')
  local pipeline_errors
  pipeline_errors=$(echo "$pipelines" | jq -r '.data[0].errors // []')
  local commit_msg
  # Only the first line of the commit message is shown
  commit_msg=$(echo "$pipelines" | jq -r '.data[0].message // ""' | head -1)

  echo ""
  echo " Pipeline #$pipeline_number: $pipeline_status"
  echo " Commit: $commit_msg"

  # Show any pipeline-level errors (YAML validation, etc.)
  local error_count
  error_count=$(echo "$pipeline_errors" | jq 'length // 0')
  if [[ "$error_count" -gt 0 ]]; then
    echo ""
    echo -e "${RED} Pipeline Errors:${NC}"
    echo "$pipeline_errors" | jq -r '.[] | " - \(.type): \(.message)"'
  fi

  # Try to get step details from the steps API (if available)
  local steps_response
  steps_response=$(api_call GET "/projects/$project_id/pipelines/$pipeline_number/steps" 2>/dev/null || echo '{"error":"not available"}')
  local has_steps
  has_steps=$(echo "$steps_response" | jq 'has("data")' 2>/dev/null || echo "false")

  if [[ "$has_steps" == "true" ]]; then
    echo ""
    echo " Steps:"
    # Format steps with status icons, duration, and exit code for failures.
    # "(.data.steps // [])" keeps jq from failing with "Cannot iterate over
    # null" when the response has "data" but no "steps" array; such a failure
    # would abort the caller under `set -e`.
    echo "$steps_response" | jq -r '(.data.steps // [])[] |
      (if .duration_seconds > 0 then " (\(.duration_seconds)s)" else "" end) as $dur |
      if .status == "failure" or .status == "error" or .status == "killed" then
        " \u001b[31m✗\u001b[0m \(.name): FAILED (exit \(.exit_code // "?"))\($dur)"
      elif .status == "success" then
        " \u001b[32m✓\u001b[0m \(.name): success\($dur)"
      elif .status == "running" then
        " \u001b[33m◐\u001b[0m \(.name): running..."
      elif .status == "pending" then
        " ○ \(.name): pending"
      elif .status == "skipped" then
        " ○ \(.name): skipped"
      else
        " ? \(.name): \(.status)"
      end'

    # Show logs from failed steps
    local failed_steps
    failed_steps=$(echo "$steps_response" | jq -r '(.data.steps // [])[] | select(.status == "failure" or .status == "error" or .status == "killed")')
    if [[ -n "$failed_steps" ]]; then
      echo ""
      echo -e "${RED} Failed Step Details:${NC}"
      # For each failed step, show error and log. The last 20 log lines are
      # indented with map(" " + .) rather than gsub("^"; " "): an empty-match
      # regex in gsub is not handled consistently across jq releases.
      echo "$steps_response" | jq -r '(.data.steps // [])[] | select(.status == "failure" or .status == "error" or .status == "killed") |
        "\n Step: \(.name)" +
        (if .error and .error != "" then "\n Error: \(.error)" else "" end) +
        (if .log and .log != "" then "\n Last lines of log:\n\(.log | split("\n") | .[-20:] | map(" " + .) | join("\n"))" else "" end)'
    fi
  else
    echo ""
    echo -e "${YELLOW} Steps API not available - upgrade rdev-api for detailed step info${NC}"
  fi

  # Always provide direct links
  echo ""
  echo " View full logs:"
  print_cmd "open https://ci.threesix.ai/$git_owner/$project_id/$pipeline_number"

  # Pattern match common errors and suggest fixes
  echo ""
  diagnose_common_pipeline_errors "$project_id" "$pipeline_number"
}
# Print a fixed checklist of frequent CI failure causes together with the
# commands that help confirm each one. Nothing is inspected here; the text
# is only parameterised by the project id.
# Arguments: project_id pipeline_number
diagnose_common_pipeline_errors() {
  local project_id="$1"
  local pipeline_number="$2" # received from the caller; not referenced below

  printf '%s\n' " Common issues to check:" ""

  # Check 1: Missing K8s deployment (most common issue)
  printf '%s\n' \
    " 1. Missing Kubernetes Deployment?" \
    " The CI pipeline tries to 'kubectl set image' but deployment may not exist."
  print_cmd "kubectl get deployment -n projects -l app=$project_id"
  print_fix "Component may need initial deployment created"
  printf '\n'

  # Check 2: Docker build issues
  printf '%s\n' \
    " 2. Docker Build Failed?" \
    " Check if Dockerfile exists and workspace files are correct."
  print_cmd "Check the build step in Woodpecker UI for specific error"
  printf '\n'

  # Check 3: Registry auth
  printf '%s\n' \
    " 3. Registry Push Failed?" \
    " Kaniko may not have credentials to push to registry."
  print_cmd "kubectl get secret -n woodpecker-agents | grep registry"
}
# Diagnose why a site is not accessible
# Uses kubectl to list pods, services, ingress and recent events in the
# "projects" namespace for the given project, and prints hints for the pod
# states it recognises. Returns early when kubectl is missing or the pod
# query fails.
# Globals:   KUBECONFIG (read; only to print a notice when unset)
# Arguments: domain project_id
# Outputs:   diagnostics on stdout
diagnose_site_failure() {
  local domain="$1"
  local project_id="$2"

  print_diagnostic_header "Site Accessibility Analysis"
  echo ""
  echo " Domain: https://$domain"
  echo " Project: $project_id"
  echo ""

  # Check if kubectl is available and configured
  if ! command -v kubectl &> /dev/null; then
    echo -e "${YELLOW} kubectl not found - cannot check K8s state${NC}"
    echo " Install kubectl and set KUBECONFIG to diagnose further"
    return 0
  fi
  if [[ -z "${KUBECONFIG:-}" ]]; then
    echo -e "${YELLOW} KUBECONFIG not set - trying default context${NC}"
  fi

  # Check pods
  echo " Checking pods in 'projects' namespace:"
  local pods
  # Decide on kubectl's exit status rather than on a sentinel string mixed
  # into its output.
  if ! pods=$(kubectl get pods -n projects -l "app=$project_id" --no-headers 2>/dev/null); then
    echo -e "${RED} Failed to query K8s (check KUBECONFIG)${NC}"
    print_cmd "export KUBECONFIG=~/.kube/orchard9-k3sf.yaml"
    return 0
  fi

  if [[ -z "$pods" ]]; then
    echo -e "${RED} No pods found for app=$project_id${NC}"
    print_fix "Deployment doesn't exist - CI may have failed or component needs initial deploy"
    print_cmd "kubectl get deployments -n projects"
  else
    echo "$pods" | sed 's/^/ /'

    # Check for common pod issues. Matching is done in-shell so the result
    # does not depend on `echo | grep -q` pipeline status under pipefail.
    if [[ "$pods" == *ImagePullBackOff* || "$pods" == *ErrImagePull* ]]; then
      echo ""
      echo -e "${RED} Issue: ImagePullBackOff${NC}"
      print_fix "Image doesn't exist in registry - check CI build step"
      print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A5 'Events:'"
    fi
    if [[ "$pods" == *CrashLoopBackOff* ]]; then
      echo ""
      echo -e "${RED} Issue: CrashLoopBackOff${NC}"
      print_fix "Container is crashing - check application logs"
      print_cmd "kubectl logs -n projects -l app=$project_id --tail=50"
    fi
    if [[ "$pods" == *Pending* ]]; then
      echo ""
      echo -e "${RED} Issue: Pod stuck in Pending${NC}"
      print_fix "Likely resource constraints or scheduling issues"
      print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A10 'Events:'"
    fi
    # A whitespace-delimited "0/<n>" field means no container is ready.
    # Anchoring on whitespace keeps "10/10" from matching and covers pods
    # with any number of containers.
    local not_ready_re='(^|[[:space:]])0/[0-9]+([[:space:]]|$)'
    if [[ "$pods" =~ $not_ready_re ]]; then
      echo ""
      echo -e "${YELLOW} Issue: Container not ready${NC}"
      print_fix "Container may still be starting or failing health checks"
      print_cmd "kubectl logs -n projects -l app=$project_id --tail=20"
    fi
  fi

  # Check services
  echo ""
  echo " Checking services:"
  local svc
  svc=$(kubectl get svc -n projects -l "app=$project_id" --no-headers 2>/dev/null || echo "")
  if [[ -z "$svc" ]]; then
    echo -e "${RED} No service found for app=$project_id${NC}"
    print_fix "Service needs to be created along with deployment"
  else
    echo "$svc" | sed 's/^/ /'
  fi

  # Check ingress
  echo ""
  echo " Checking ingress:"
  local ingress
  # -F: project id and domain are literal text, not regexes (a "." in the
  # domain must not match arbitrary characters).
  ingress=$(kubectl get ingress -n projects --no-headers 2>/dev/null \
    | grep -F -e "$project_id" -e "$domain" || true)
  if [[ -z "$ingress" ]]; then
    echo -e "${YELLOW} No ingress found matching $project_id or $domain${NC}"
  else
    echo "$ingress" | sed 's/^/ /'
  fi

  # Recent events
  echo ""
  echo " Recent events:"
  local events
  # Capture first and test for emptiness so the fallback message does not
  # rely on `pipefail` being enabled in the calling shell.
  events=$(kubectl get events -n projects --sort-by='.lastTimestamp' 2>/dev/null \
    | grep -F -- "$project_id" | tail -n 5 || true)
  if [[ -z "$events" ]]; then
    echo " No recent events"
  else
    echo "$events" | sed 's/^/ /'
  fi

  echo ""
  echo " Manual investigation commands:"
  print_cmd "kubectl logs -n projects -l app=$project_id -f"
  print_cmd "kubectl describe pod -n projects -l app=$project_id"
  print_cmd "kubectl get events -n projects --sort-by='.lastTimestamp' | tail -20"
}