rdev/cookbooks/scripts/common.sh
jordan e42c18a9a3 feat: add session web UI mode + aeries-daeya cookbook tree
Session WebUI:
- Add `web_ui` flag to session create — launches claude-code-ui in pod on port 3001
- Install @siteboon/claude-code-ui in claudebox Dockerfile, expose port 3001
- Migration 027: add web_ui column to sessions table
- startWebUI/stopWebUI fire-and-forget helpers in SessionsHandler
- Service selects preview port 3001 (web UI) vs 8080 (sidecar) based on flag

Aeries Daeya cookbook:
- Add cookbooks/trees/aeries-daeya.yaml: privacy-first avatar social platform
  (infra → avatar data model → AI generation pipeline → studio UI)
- Add cookbooks/scripts/aeries-daeya-test.sh: run/status/diagnose/teardown harness
- Fix race condition in common.sh wait_for_pipeline: detect already-running pipelines
  at startup and track directly instead of waiting for a newer one

Docs/tooling:
- Add SDK Update Workflow section to CLAUDE.md
- Add `make sdk` and `make sdk-check` targets for OpenAPI spec management

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-26 23:14:08 -07:00

588 lines
21 KiB
Bash
Executable File

#!/bin/bash
# Common utilities for rdev cookbook scripts
#
# Usage:
# source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
#
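# Provides:
# - api_call() - Make authenticated API calls
# - wait_for_build() - Poll for build completion
# - wait_for_pipeline() - Poll for CI pipeline completion
# - wait_for_site() - Wait for site to respond
# - diagnose_pipeline_failure(), diagnose_site_failure() - Print failure diagnostics
# - register_cleanup_trap(), parse_auto_teardown_flag() - Opt-in auto-teardown support
# - print_*() helpers and colors for output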
# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline. Because this file is sourced, these options also
# apply to the sourcing script.
set -o errexit -o nounset -o pipefail

# API endpoint and key. They default to empty here (rather than failing) so
# that sourcing never aborts under nounset; the original note says they are
# validated later by preflight_check, which lets credential-free commands such
# as 'list' run.
: "${RDEV_API_URL:=}"
: "${RDEV_API_KEY:=}"

# Auto-cleanup switch: AUTO_TEARDOWN=true makes cleanup_on_exit delete the
# project named in CLEANUP_PROJECT when the script exits.
: "${AUTO_TEARDOWN:=false}"

# Name of the project to delete on exit. Always starts empty; scripts assign it
# after a project has been created successfully.
CLEANUP_PROJECT=
# Exit/signal handler for auto-teardown.
#
# When CLEANUP_PROJECT is non-empty and AUTO_TEARDOWN is "true", issues a
# DELETE for that project through api_call and reports whether the call
# succeeded. In every case the script then exits with the status that was
# current when the handler was entered.
#
# Globals: CLEANUP_PROJECT, AUTO_TEARDOWN, CYAN, GREEN, YELLOW, NC (read)
# Arguments: none
# Returns: does not return; exits with the captured status
cleanup_on_exit() {
  # Must be the first statement so $? still holds the pre-handler status.
  local exit_code=$?

  # Disarm the traps before doing anything else. This handler ends with
  # `exit`, which would otherwise fire the EXIT trap again when the handler
  # was entered via INT/TERM, running the teardown (and the DELETE) twice.
  trap - EXIT INT TERM

  if [[ -n "$CLEANUP_PROJECT" && "$AUTO_TEARDOWN" == "true" ]]; then
    echo ""
    echo -e "${CYAN}Auto-teardown: Cleaning up $CLEANUP_PROJECT...${NC}"
    # NOTE(review): this uses "/project/..." (singular) while other calls in
    # this file use "/projects/..." — confirm against the API routes.
    # Teardown is best-effort: a failed call is reported but never changes
    # the exit status of the script.
    if api_call DELETE "/project/$CLEANUP_PROJECT" > /dev/null 2>&1; then
      echo -e "${GREEN}✓ Project $CLEANUP_PROJECT deleted${NC}"
    else
      echo -e "${YELLOW}⚠ Could not delete project $CLEANUP_PROJECT (API call failed)${NC}"
    fi
  fi
  exit "$exit_code"
}
# Install the auto-teardown handler.
# Scripts that want auto-cleanup call this once after sourcing common.sh.
#
# Only EXIT runs cleanup_on_exit. INT and TERM are turned into an ordinary
# `exit` with the conventional 128+signal status, which then fires the EXIT
# trap exactly once. Binding cleanup_on_exit to all three would run it twice
# on a signal (once for the signal, once for the exit it performs) and would
# let an interrupted script exit with the status of whatever command happened
# to be running.
#
# Arguments: none
# Returns: status of the last trap builtin call
register_cleanup_trap() {
  trap cleanup_on_exit EXIT
  trap 'exit 130' INT   # 128 + SIGINT(2)
  trap 'exit 143' TERM  # 128 + SIGTERM(15)
}
# Strip --auto-teardown from an argument list.
#
# Sets AUTO_TEARDOWN="true" if the flag is present and prints the remaining
# arguments on one line, separated by single spaces.
#
# Usage: args=$(parse_auto_teardown_flag "$@")
#
# NOTE: command substitution runs this function in a subshell, so with the
# usage above the AUTO_TEARDOWN assignment is NOT visible to the caller; only
# the printed arguments are. Call the function directly (output redirected)
# when the flag itself must take effect in the current shell.
#
# Globals: AUTO_TEARDOWN (written)
# Arguments: the caller's argument list
# Outputs: remaining arguments, space-joined, followed by a newline
parse_auto_teardown_flag() {
  local arg
  local -a args=()

  for arg in "$@"; do
    if [[ "$arg" == "--auto-teardown" ]]; then
      AUTO_TEARDOWN="true"
    else
      args+=("$arg")
    fi
  done

  # Join on a single space regardless of the caller's IFS.
  local IFS=' '
  if (( ${#args[@]} > 0 )); then
    # printf rather than echo: echo would treat a leading "-n"/"-e" argument
    # as its own option and drop it from the output.
    printf '%s\n' "${args[*]}"
  else
    # Expanding an empty array is an "unbound variable" error under `set -u`
    # on bash older than 4.4, so the empty case is handled without expanding.
    printf '\n'
  fi
}
# ANSI color escapes used by the print helpers. They are stored with a literal
# backslash and only become real escape characters when emitted via `echo -e`.
RED='\033[0;31m'     # errors
GREEN='\033[0;32m'   # success
YELLOW='\033[1;33m'  # warnings / suggested fixes
BLUE='\033[0;34m'    # headers / commands
CYAN='\033[0;36m'    # progress and diagnostics
NC='\033[0m'         # reset (No Color)

# Per-request timeout, in seconds, passed to curl by api_call.
# Honors a pre-set API_TIMEOUT from the environment; otherwise 60.
: "${API_TIMEOUT:=60}"
# Send one authenticated request to the rdev API and print the response body.
#
# Arguments: method endpoint [data]
#   method   - HTTP verb handed to curl -X
#   endpoint - path appended to RDEV_API_URL
#   data     - optional request body; when non-empty it is sent with a
#              JSON Content-Type header
# Globals: RDEV_API_URL, RDEV_API_KEY, API_TIMEOUT (read)
# Returns: curl's exit status
#
# Example: api_call GET "/projects"
# Example: api_call POST "/projects" '{"name": "test"}'
api_call() {
  local http_verb="$1"
  local route="$2"
  local payload="${3:-}"

  # Options shared by every request; body-related options are appended below.
  local -a curl_args=(
    -s --max-time "$API_TIMEOUT"
    -X "$http_verb" "$RDEV_API_URL$route"
    -H "X-API-Key: $RDEV_API_KEY"
  )

  if [[ -n "$payload" ]]; then
    curl_args+=(-H "Content-Type: application/json" -d "$payload")
  fi

  curl "${curl_args[@]}"
}
# Poll GET /builds/<task_id> until the build reaches a terminal state.
#
# Arguments: task_id [max_attempts] [poll_interval]
#   task_id       - build task identifier used in the request path
#   max_attempts  - number of polls before giving up (default 120)
#   poll_interval - seconds to sleep between polls (default 5)
# Returns: 0 on success, 1 on failure, 2 on timeout
#
# The status is read from `.status` or `.data.status`. A "completed" build
# only counts as success when its `result.success` field is true.
wait_for_build() {
  local task_id="$1"
  local max_attempts="${2:-120}" # 10 minutes default (5s * 120)
  local poll_interval="${3:-5}"
  local attempt=0
  local result status success

  echo -e "${CYAN}Waiting for build to complete (task: $task_id)...${NC}"
  while [[ $attempt -lt $max_attempts ]]; do
    # A failed request (e.g. a curl timeout) consumes one attempt and is
    # retried instead of aborting the whole script under `set -e`.
    if ! result=$(api_call GET "/builds/$task_id"); then
      echo " API request failed, retrying... (attempt $((attempt + 1))/$max_attempts)"
      sleep "$poll_interval"
      attempt=$((attempt + 1))
      continue
    fi

    # A body jq cannot parse is treated like an unknown status.
    status=$(echo "$result" | jq -r '.status // .data.status // "unknown"') || status="unknown"

    case "$status" in
      completed)
        success=$(echo "$result" | jq -r '.result.success // .data.result.success // false') || success="false"
        if [[ "$success" == "true" ]]; then
          echo -e "${GREEN}Build completed successfully!${NC}"
          echo "$result" | jq '.result // .data.result'
          return 0
        else
          echo -e "${RED}Build completed but failed:${NC}"
          echo "$result" | jq '.result // .data.result'
          return 1
        fi
        ;;
      failed)
        echo -e "${RED}Build failed:${NC}"
        echo "$result" | jq '.'
        return 1
        ;;
      running)
        echo " Build running... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      pending)
        echo " Build pending... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      *)
        echo " Unknown status: $status (attempt $((attempt + 1))/$max_attempts)"
        ;;
    esac

    sleep "$poll_interval"
    # Not ((attempt++)): that expression evaluates to the old value, so it
    # returns status 1 when attempt is 0 and would terminate the script under
    # `set -e` on the very first non-terminal poll.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for build to complete${NC}"
  return 2
}
# Wait for the CI pipeline triggered by the preceding step to finish.
#
# Arguments: project_id [max_attempts] [poll_interval]
#   project_id    - project whose pipelines are polled
#   max_attempts  - number of polls before giving up (default 120)
#   poll_interval - seconds to sleep between polls (default 10)
# Returns: 0 on success, 1 on failure, 2 on timeout
#
# Fast-fail: returns as soon as the pipeline reports failure/error/killed
# instead of waiting for the timeout, printing a short failed-step summary and
# then running diagnose_pipeline_failure. Diagnostics also run on timeout when
# a pipeline was being tracked.
#
# NOTE(review): tracked_pipeline records which pipeline was picked, but every
# poll still evaluates `.data[0]` (the first pipeline in the response), so if a
# newer pipeline appears mid-wait its status is what gets acted on.
wait_for_pipeline() {
  local project_id="$1"
  local max_attempts="${2:-120}" # 10 minutes default
  local poll_interval="${3:-10}"
  local attempt=0
  local tracked_pipeline="" # Track specific pipeline once found
  local baseline_number=0
  local initial_result initial_status
  local result pipeline_count pipeline_number status
  local steps_response has_steps failed_count

  echo -e "${CYAN}Waiting for new CI pipeline...${NC}"

  # Record the current latest pipeline number BEFORE waiting so that only
  # pipelines triggered AFTER this point are tracked.
  # Race condition guard: if the triggering step pushed fast enough that its
  # pipeline already appears as the latest, track that pipeline directly
  # instead of waiting for a newer one that will never come.
  initial_result=$(api_call GET "/projects/$project_id/pipelines" 2>/dev/null)
  if echo "$initial_result" | jq -e '.data[0]' >/dev/null 2>&1; then
    baseline_number=$(echo "$initial_result" | jq -r '.data[0].number // 0')
    initial_status=$(echo "$initial_result" | jq -r '.data[0].status // "unknown"')
    # An already running/pending/started latest pipeline is assumed to belong
    # to the preceding step, so it is tracked as-is.
    if [[ "$initial_status" == "running" || "$initial_status" == "pending" || "$initial_status" == "started" ]]; then
      tracked_pipeline="$baseline_number"
      echo " Detected in-progress pipeline #$baseline_number (status: $initial_status) — tracking it"
    else
      echo " Baseline pipeline: #$baseline_number (status: $initial_status) — waiting for a newer one"
    fi
  fi

  while [[ $attempt -lt $max_attempts ]]; do
    # A failed poll consumes one attempt and is retried rather than aborting
    # the script under `set -e`.
    if ! result=$(api_call GET "/projects/$project_id/pipelines"); then
      echo " API request failed, retrying... (attempt $((attempt + 1))/$max_attempts)"
      sleep "$poll_interval"
      attempt=$((attempt + 1))
      continue
    fi

    # Check if we have any pipelines (an unparsable body counts as none).
    pipeline_count=$(echo "$result" | jq '.data | length // 0') || pipeline_count=0
    if [[ "$pipeline_count" -eq 0 ]]; then
      echo " No pipelines yet... (attempt $((attempt + 1))/$max_attempts)"
      sleep "$poll_interval"
      attempt=$((attempt + 1))
      continue
    fi

    # Get latest pipeline number and status
    pipeline_number=$(echo "$result" | jq -r '.data[0].number // 0')
    status=$(echo "$result" | jq -r '.data[0].status // "unknown"')

    # Skip any pipeline that is not newer than our baseline.
    # Exception: if tracked_pipeline is already set (an in-progress pipeline
    # was detected at startup), bypass the baseline check and go straight to
    # the status handling.
    if [[ -z "$tracked_pipeline" && "$pipeline_number" -le "$baseline_number" ]]; then
      echo " Waiting for new pipeline (latest is #$pipeline_number, baseline #$baseline_number)... (attempt $((attempt + 1))/$max_attempts)"
      sleep "$poll_interval"
      attempt=$((attempt + 1))
      continue
    fi

    # A new pipeline exists — track it (if not already tracking)
    if [[ -z "$tracked_pipeline" ]]; then
      tracked_pipeline="$pipeline_number"
      echo " Tracking new pipeline #$tracked_pipeline"
    fi

    case "$status" in
      success)
        echo -e "${GREEN}✓ Pipeline #$pipeline_number completed successfully!${NC}"
        return 0
        ;;
      failure|error|killed)
        # FAST FAIL: Don't wait for timeout, fail immediately
        echo ""
        echo -e "${RED}✗ Pipeline #$pipeline_number failed (status: $status)${NC}"
        echo ""
        # Quick inline step summary before full diagnostics. The summary is
        # optional, so problems reading it must not prevent the `return 1`.
        steps_response=$(api_call GET "/projects/$project_id/pipelines/$pipeline_number/steps" 2>/dev/null || echo '{}')
        has_steps=$(echo "$steps_response" | jq 'has("data")' 2>/dev/null || echo "false")
        if [[ "$has_steps" == "true" ]]; then
          # Show failed steps inline for quick diagnosis
          failed_count=$(echo "$steps_response" | jq '[.data.steps[] | select(.status == "failure" or .status == "error" or .status == "killed")] | length') || failed_count=0
          if [[ "$failed_count" -gt 0 ]]; then
            echo -e "${RED} Failed steps:${NC}"
            echo "$steps_response" | jq -r '.data.steps[] | select(.status == "failure" or .status == "error" or .status == "killed") | " ✗ \(.name): exit \(.exit_code // "?")"' || true
          fi
        fi
        # Full diagnostics
        diagnose_pipeline_failure "$project_id"
        return 1
        ;;
      running|pending)
        echo " Pipeline #$pipeline_number $status... (attempt $((attempt + 1))/$max_attempts)"
        ;;
      *)
        echo " Pipeline #$pipeline_number status: $status (attempt $((attempt + 1))/$max_attempts)"
        ;;
    esac

    sleep "$poll_interval"
    # Not ((attempt++)): that returns status 1 when attempt is 0, which
    # terminates the script under `set -e` on the first non-terminal poll.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for new pipeline${NC}"
  # On timeout, still run diagnostics to help debug
  if [[ -n "$tracked_pipeline" ]]; then
    diagnose_pipeline_failure "$project_id"
  fi
  return 2
}
# Poll https://<domain> until it answers with HTTP 200.
#
# Arguments: domain [max_attempts] [poll_interval] [project_id]
#   domain        - host name (without scheme) to request over HTTPS
#   max_attempts  - number of polls before giving up (default 30)
#   poll_interval - seconds to sleep between polls (default 5)
#   project_id    - optional; enables diagnose_site_failure on timeout
# Returns: 0 on success, 1 on timeout
wait_for_site() {
  local domain="$1"
  local max_attempts="${2:-30}"
  local poll_interval="${3:-5}"
  local project_id="${4:-}"
  local attempt=0
  local last_http_code=""
  local http_code

  echo -e "${CYAN}Waiting for site to be accessible at https://$domain...${NC}"
  while [[ $attempt -lt $max_attempts ]]; do
    # When curl fails it has usually already written "000" via -w, so the
    # fallback replaces the captured value instead of appending to it (an
    # appended fallback yields "000000").
    http_code=$(curl -s -o /dev/null -w "%{http_code}" "https://$domain" 2>/dev/null) || http_code="000"

    if [[ "$http_code" == "200" ]]; then
      echo -e "${GREEN}Site is live! (HTTP $http_code)${NC}"
      return 0
    fi

    # Only print status change or every 5th attempt to reduce noise
    if [[ "$http_code" != "$last_http_code" ]] || (( attempt % 5 == 0 )); then
      echo " HTTP $http_code... (attempt $((attempt + 1))/$max_attempts)"
    fi
    last_http_code="$http_code"

    sleep "$poll_interval"
    # Not ((attempt++)): that returns status 1 when attempt is 0, which
    # terminates the script under `set -e` after the first failed probe.
    attempt=$((attempt + 1))
  done

  echo -e "${YELLOW}Timeout waiting for site to respond (last: HTTP $last_http_code)${NC}"
  # Automatically diagnose if we have project_id
  if [[ -n "$project_id" ]]; then
    diagnose_site_failure "$domain" "$project_id"
  else
    echo ""
    echo " Tip: Pass project_id to wait_for_site for automatic diagnostics"
  fi
  return 1
}
# Print a blue "=== title ===" banner with a blank line above and below it.
# Arguments: title
print_header() {
  local heading="$1"
  echo
  printf '%b\n' "${BLUE}=== $heading ===${NC}"
  echo
}
# Print a message in green, followed by a color reset.
# Arguments: message
print_success() {
  printf '%b\n' "${GREEN}$1${NC}"
}
# Print a message in red, followed by a color reset (written to stdout).
# Arguments: message
print_error() {
  printf '%b\n' "${RED}$1${NC}"
}
# Print a message in yellow, followed by a color reset.
# Arguments: message
print_warning() {
  printf '%b\n' "${YELLOW}$1${NC}"
}
# Print a cyan boxed "DIAGNOSTIC: <title>" banner, preceded by a blank line.
# Only the top and bottom rules carry a closing corner; the title line is
# left open on the right.
# Arguments: title
print_diagnostic_header() {
  local heading="$1"
  local top_rule="┌─────────────────────────────────────────────────────────────────┐"
  local bottom_rule="└─────────────────────────────────────────────────────────────────┘"

  echo
  printf '%b\n' "${CYAN}${top_rule}${NC}"
  printf '%b\n' "${CYAN}│ DIAGNOSTIC: $heading${NC}"
  printf '%b\n' "${CYAN}${bottom_rule}${NC}"
}
# Print a suggested remediation in yellow, prefixed with "→ FIX:".
# Arguments: message
print_fix() {
  printf '%b\n' "${YELLOW} → FIX: $1${NC}"
}
# Print a shell command for the user to copy, in blue with a "$" prompt.
# The command is only displayed, never executed.
# Arguments: command text
print_cmd() {
  printf '%b\n' "${BLUE} \$ $1${NC}"
}
# Print the git owner/organization name.
# Uses GITEA_DEFAULT_ORG when it is set and non-empty, otherwise "jordan".
get_git_owner() {
  local owner="${GITEA_DEFAULT_ORG:-jordan}"
  echo "$owner"
}
# Diagnose a failed pipeline - fetches details and prints actionable info
# Arguments: project_id
#
# Prints, in order: the first pipeline's number/status/commit line, any
# pipeline-level errors, a per-step status list with details for failed steps
# (when the steps endpoint returns a "data" key), a link to the full logs, and
# the generic checklist from diagnose_common_pipeline_errors.
#
# Note: the pipeline inspected is `.data[0]` of a fresh pipelines request made
# here; no pipeline number is accepted from the caller.
diagnose_pipeline_failure() {
local project_id="$1"
local git_owner
git_owner=$(get_git_owner)
print_diagnostic_header "Pipeline Failure Analysis"
# Get the latest pipeline
local pipelines
pipelines=$(api_call GET "/projects/$project_id/pipelines")
# Each field falls back to a placeholder when missing from the response.
local pipeline_number
pipeline_number=$(echo "$pipelines" | jq -r '.data[0].number // "?"')
local pipeline_status
pipeline_status=$(echo "$pipelines" | jq -r '.data[0].status // "unknown"')
local pipeline_errors
pipeline_errors=$(echo "$pipelines" | jq -r '.data[0].errors // []')
# Only the first line of the commit message is kept.
local commit_msg
commit_msg=$(echo "$pipelines" | jq -r '.data[0].message // ""' | head -1)
echo ""
echo " Pipeline #$pipeline_number: $pipeline_status"
echo " Commit: $commit_msg"
# Show any pipeline-level errors (YAML validation, etc.)
local error_count
error_count=$(echo "$pipeline_errors" | jq 'length // 0')
if [[ "$error_count" -gt 0 ]]; then
echo ""
echo -e "${RED} Pipeline Errors:${NC}"
echo "$pipeline_errors" | jq -r '.[] | " - \(.type): \(.message)"'
fi
# Try to get step details from the steps API (if available)
# A failed request is replaced by a stub object without a "data" key, which
# routes execution to the "Steps API not available" branch below.
local steps_response
steps_response=$(api_call GET "/projects/$project_id/pipelines/$pipeline_number/steps" 2>/dev/null || echo '{"error":"not available"}')
local has_steps
has_steps=$(echo "$steps_response" | jq 'has("data")' 2>/dev/null || echo "false")
if [[ "$has_steps" == "true" ]]; then
echo ""
echo " Steps:"
# Format steps with status icons, duration, and exit code for failures
# The \u001b[..m sequences are ANSI color codes emitted by jq itself
# (red for failed, green for success, yellow for running).
echo "$steps_response" | jq -r '.data.steps[] |
(if .duration_seconds > 0 then " (\(.duration_seconds)s)" else "" end) as $dur |
if .status == "failure" or .status == "error" or .status == "killed" then
" \u001b[31m✗\u001b[0m \(.name): FAILED (exit \(.exit_code // "?"))\($dur)"
elif .status == "success" then
" \u001b[32m✓\u001b[0m \(.name): success\($dur)"
elif .status == "running" then
" \u001b[33m◐\u001b[0m \(.name): running..."
elif .status == "pending" then
" ○ \(.name): pending"
elif .status == "skipped" then
" ○ \(.name): skipped"
else
" ? \(.name): \(.status)"
end'
# Show logs from failed steps
# failed_steps is used only as an emptiness check; the detail output below
# re-runs the selection against steps_response.
local failed_steps
failed_steps=$(echo "$steps_response" | jq -r '.data.steps[] | select(.status == "failure" or .status == "error" or .status == "killed")')
if [[ -n "$failed_steps" ]]; then
echo ""
echo -e "${RED} Failed Step Details:${NC}"
# For each failed step, show error and log
# The log is trimmed to its last 20 lines before printing.
# NOTE(review): gsub("^"; " ") is applied to the re-joined multi-line string;
# whether it prefixes every line or only the first depends on jq's regex
# anchoring — verify against the jq version in use.
echo "$steps_response" | jq -r '.data.steps[] | select(.status == "failure" or .status == "error" or .status == "killed") |
"\n Step: \(.name)" +
(if .error and .error != "" then "\n Error: \(.error)" else "" end) +
(if .log and .log != "" then "\n Last lines of log:\n\(.log | split("\n") | .[-20:] | join("\n") | gsub("^"; " "))" else "" end)'
fi
else
echo ""
echo -e "${YELLOW} Steps API not available - upgrade rdev-api for detailed step info${NC}"
fi
# Always provide direct links
echo ""
echo " View full logs:"
print_cmd "open https://ci.threesix.ai/$git_owner/$project_id/$pipeline_number"
# Pattern match common errors and suggest fixes
echo ""
diagnose_common_pipeline_errors "$project_id" "$pipeline_number"
}
# Print a fixed checklist of common CI pipeline failure causes, each with a
# command to run or a suggested fix. Nothing is inspected here: the output is
# the same for every failure apart from the project id substituted into the
# first command.
# Arguments: project_id pipeline_number
diagnose_common_pipeline_errors() {
  local project_id="$1"
  local pipeline_number="$2" # accepted from the caller; no check below uses it

  printf '%s\n' " Common issues to check:" ""

  # 1) Deployment missing (described in the original as the most common issue)
  printf '%s\n' \
    " 1. Missing Kubernetes Deployment?" \
    " The CI pipeline tries to 'kubectl set image' but deployment may not exist."
  print_cmd "kubectl get deployment -n projects -l app=$project_id"
  print_fix "Component may need initial deployment created"
  printf '\n'

  # 2) Image build problems
  printf '%s\n' \
    " 2. Docker Build Failed?" \
    " Check if Dockerfile exists and workspace files are correct."
  print_cmd "Check the build step in Woodpecker UI for specific error"
  printf '\n'

  # 3) Registry credentials
  printf '%s\n' \
    " 3. Registry Push Failed?" \
    " Kaniko may not have credentials to push to registry."
  print_cmd "kubectl get secret -n woodpecker-agents | grep registry"
}
# Diagnose why a site is not accessible by inspecting Kubernetes state.
#
# Arguments: domain project_id
#   domain     - host name that failed to respond (printed and matched
#                against ingress rows)
#   project_id - value of the `app` label used to select pods and services
#
# Queries the "projects" namespace for pods, services, ingresses and recent
# events, flags well-known pod states, and finishes with commands for manual
# investigation. Returns early when kubectl is missing or the pod query fails.
diagnose_site_failure() {
  local domain="$1"
  local project_id="$2"
  local pods svc ingress events

  print_diagnostic_header "Site Accessibility Analysis"
  echo ""
  echo " Domain: https://$domain"
  echo " Project: $project_id"
  echo ""

  # Check if kubectl is available and configured
  if ! command -v kubectl &> /dev/null; then
    echo -e "${YELLOW} kubectl not found - cannot check K8s state${NC}"
    echo " Install kubectl and set KUBECONFIG to diagnose further"
    return
  fi
  if [[ -z "${KUBECONFIG:-}" ]]; then
    echo -e "${YELLOW} KUBECONFIG not set - trying default context${NC}"
  fi

  # Check pods
  echo " Checking pods in 'projects' namespace:"
  pods=$(kubectl get pods -n projects -l "app=$project_id" --no-headers 2>/dev/null || echo "ERROR")
  if [[ "$pods" == "ERROR" ]]; then
    echo -e "${RED} Failed to query K8s (check KUBECONFIG)${NC}"
    print_cmd "export KUBECONFIG=~/.kube/orchard9-k3sf.yaml"
    return
  elif [[ -z "$pods" ]]; then
    echo -e "${RED} No pods found for app=$project_id${NC}"
    print_fix "Deployment doesn't exist - CI may have failed or component needs initial deploy"
    print_cmd "kubectl get deployments -n projects"
  else
    echo "$pods" | sed 's/^/ /'
    # Check for common pod issues. Substring matches on the captured text are
    # used instead of `echo | grep -q`: under pipefail, grep -q exiting early
    # can make the pipeline report failure even though the pattern matched.
    if [[ "$pods" == *ImagePullBackOff* || "$pods" == *ErrImagePull* ]]; then
      echo ""
      echo -e "${RED} Issue: ImagePullBackOff${NC}"
      print_fix "Image doesn't exist in registry - check CI build step"
      print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A5 'Events:'"
    fi
    if [[ "$pods" == *CrashLoopBackOff* ]]; then
      echo ""
      echo -e "${RED} Issue: CrashLoopBackOff${NC}"
      print_fix "Container is crashing - check application logs"
      print_cmd "kubectl logs -n projects -l app=$project_id --tail=50"
    fi
    if [[ "$pods" == *Pending* ]]; then
      echo ""
      echo -e "${RED} Issue: Pod stuck in Pending${NC}"
      print_fix "Likely resource constraints or scheduling issues"
      print_cmd "kubectl describe pod -n projects -l app=$project_id | grep -A10 'Events:'"
    fi
    if [[ "$pods" == *0/1* || "$pods" == *0/2* ]]; then
      echo ""
      echo -e "${YELLOW} Issue: Container not ready${NC}"
      print_fix "Container may still be starting or failing health checks"
      print_cmd "kubectl logs -n projects -l app=$project_id --tail=20"
    fi
  fi

  # Check services
  echo ""
  echo " Checking services:"
  svc=$(kubectl get svc -n projects -l "app=$project_id" --no-headers 2>/dev/null || echo "")
  if [[ -z "$svc" ]]; then
    echo -e "${RED} No service found for app=$project_id${NC}"
    print_fix "Service needs to be created along with deployment"
  else
    echo "$svc" | sed 's/^/ /'
  fi

  # Check ingress
  echo ""
  echo " Checking ingress:"
  # Fixed-string matching: the dots in a domain must not act as regex
  # wildcards, otherwise unrelated ingress rows can match.
  ingress=$(kubectl get ingress -n projects --no-headers 2>/dev/null \
    | grep -F -e "$project_id" -e "$domain") || true
  if [[ -z "$ingress" ]]; then
    echo -e "${YELLOW} No ingress found matching $project_id or $domain${NC}"
  else
    echo "$ingress" | sed 's/^/ /'
  fi

  # Recent events
  echo ""
  echo " Recent events:"
  # Capture first and test for emptiness, so the fallback message does not
  # depend on pipefail being enabled in the calling shell.
  events=$(kubectl get events -n projects --sort-by='.lastTimestamp' 2>/dev/null \
    | grep -F -- "$project_id" | tail -5) || true
  if [[ -n "$events" ]]; then
    echo "$events" | sed 's/^/ /'
  else
    echo " No recent events"
  fi

  echo ""
  echo " Manual investigation commands:"
  print_cmd "kubectl logs -n projects -l app=$project_id -f"
  print_cmd "kubectl describe pod -n projects -l app=$project_id"
  print_cmd "kubectl get events -n projects --sort-by='.lastTimestamp' | tail -20"
}