From 9a1309a0c5ea9e962daa5635d281d80544797397 Mon Sep 17 00:00:00 2001
From: jordan <jordan.washburn@gmail.com>
Date: Tue, 3 Feb 2026 18:46:51 -0700
Subject: [PATCH] feat: fix composable monorepo CI builds + health endpoint
 improvements

Composable monorepo CI fixes:
- Add empty go.sum.tmpl files for pkg, service, worker, and cli components
- Fix Dockerfile.tmpl glob patterns (COPY go.work.sum* is invalid in Kaniko)
- Add deps step to CI that runs go work sync and go mod tidy before builds
- Fix scalar-go dependency version (v0.1.2 doesn't exist, use v0.13.0)

Health endpoint improvements:
- Add registry health check (zot OCI /v2/ endpoint)
- Add health metrics for CI, registry, and Git
- Add /health/ci endpoint for Woodpecker health

Visual verification scaffolding:
- Add Playwright pod and scripts ConfigMap
- Add vision.md and implementation breakdown plan

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 cmd/rdev-api/main.go                          |  17 +
 deployments/k8s/base/kustomization.yaml       |   4 +
 deployments/k8s/base/playwright-pod.yaml      |  90 ++++
 .../base/playwright-scripts-configmap.yaml    | 108 ++++
 docs/plans/visual-verification-breakdown.md   | 479 ++++++++++++++++++
 .../templates/astro-landing/.woodpecker.yml   |   1 +
 .../templates/components/cli/go.sum.tmpl      |   0
 .../components/service/Dockerfile.tmpl        |   3 +-
 .../templates/components/service/go.sum.tmpl  |   0
 .../components/worker/Dockerfile.tmpl         |   3 +-
 .../templates/components/worker/go.sum.tmpl   |   0
 .../templates/default/.woodpecker.yml         |   1 +
 .../templates/go-api/.woodpecker.yml          |   1 +
 .../templates/skeleton/.woodpecker.yml.tmpl   |  26 +
 .../templates/skeleton/pkg/go.mod.tmpl        |   2 +-
 .../templates/skeleton/pkg/go.sum.tmpl        |   0
 internal/adapter/zot/client.go                |  74 +++
 internal/domain/errors.go                     |   5 +-
 internal/domain/operation.go                  |   3 +-
 internal/domain/registry.go                   |  12 +
 internal/handlers/health.go                   |  52 +-
 internal/handlers/woodpecker_webhook.go       |  68 +++
 internal/handlers/woodpecker_webhook_test.go  |  60 ++-
 internal/metrics/metrics.go                   |  42 ++
 internal/port/health.go                       |  12 +-
 vision.md                                     | 357 +++++++++++++
 26 files changed, 1404 insertions(+), 16 deletions(-)
 create mode 100644 deployments/k8s/base/playwright-pod.yaml
 create mode 100644 deployments/k8s/base/playwright-scripts-configmap.yaml
 create mode 100644 docs/plans/visual-verification-breakdown.md
 create mode 100644 internal/adapter/templates/templates/components/cli/go.sum.tmpl
 create mode 100644 internal/adapter/templates/templates/components/service/go.sum.tmpl
 create mode 100644 internal/adapter/templates/templates/components/worker/go.sum.tmpl
 create mode 100644 internal/adapter/templates/templates/skeleton/pkg/go.sum.tmpl
 create mode 100644 internal/adapter/zot/client.go
 create mode 100644 internal/domain/registry.go
 create mode 100644 vision.md

diff --git a/cmd/rdev-api/main.go b/cmd/rdev-api/main.go
index 693a54e..1cd5eb6 100644
--- a/cmd/rdev-api/main.go
+++ b/cmd/rdev-api/main.go
@@ -5,6 +5,7 @@ import (
 	"context"
 	"log/slog"
 	"os"
+	"strings"
 	"time"
 
 	"github.com/orchard9/rdev/internal/adapter/cloudflare"
@@ -20,6 +21,7 @@ import (
 	redisadapter "github.com/orchard9/rdev/internal/adapter/redis"
 	"github.com/orchard9/rdev/internal/adapter/templates"
 	"github.com/orchard9/rdev/internal/adapter/woodpecker"
+	"github.com/orchard9/rdev/internal/adapter/zot"
 	"github.com/orchard9/rdev/internal/auth"
 	"github.com/orchard9/rdev/internal/db"
 	"github.com/orchard9/rdev/internal/envutil"
@@ -404,9 +406,24 @@ func main() {
 	// Initialize operations handler (for debugging project failures)
 	operationsHandler := handlers.NewOperationsHandler(operationRepo)
 
+	// Initialize registry health checker (for monitoring)
+	var registryChecker *zot.Client
+	if infraCfg.RegistryURL != "" {
+		registryURL := infraCfg.RegistryURL
+		// Ensure URL has protocol
+		if !strings.HasPrefix(registryURL, "http") {
+			registryURL = "https://" + registryURL
+		}
+		registryChecker = zot.NewClient(registryURL)
+		logger.Info("registry health checker initialized", "url", registryURL)
+	}
+
 	// Override default health/ready endpoints with full dependency checks
 	healthHandler := handlers.NewHealthHandler("rdev-api", database.DB, nil).
 		WithAgentRegistry(agentRegistry)
+	if registryChecker != nil {
+		healthHandler = healthHandler.WithRegistryChecker(registryChecker)
+	}
 
 	app.Router().Get("/health", healthHandler.Health)
 	app.Router().Get("/ready", healthHandler.Ready)
diff --git a/deployments/k8s/base/kustomization.yaml b/deployments/k8s/base/kustomization.yaml
index 709fa82..b21235d 100644
--- a/deployments/k8s/base/kustomization.yaml
+++ b/deployments/k8s/base/kustomization.yaml
@@ -12,6 +12,10 @@ resources:
   - claudebox.yaml
   - configmaps.yaml
 
+  # Playwright pod for visual verification
+  - playwright-pod.yaml
+  - playwright-scripts-configmap.yaml
+
   # NOTE: secrets.yaml and credentials.yaml contain real keys and are gitignored.
   # Copy from *.example files and fill in real values before deploying.
   - secrets.yaml       # from secrets.yaml.example
diff --git a/deployments/k8s/base/playwright-pod.yaml b/deployments/k8s/base/playwright-pod.yaml
new file mode 100644
index 0000000..4131c60
--- /dev/null
+++ b/deployments/k8s/base/playwright-pod.yaml
@@ -0,0 +1,90 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: playwright
+  namespace: rdev
+  labels:
+    app.kubernetes.io/name: playwright
+    app.kubernetes.io/part-of: rdev
+spec:
+  serviceName: playwright
+  replicas: 1
+  selector:
+    matchLabels:
+      app: playwright
+  template:
+    metadata:
+      labels:
+        app: playwright
+        app.kubernetes.io/name: playwright
+        app.kubernetes.io/part-of: rdev
+        rdev.orchard9.ai/role: playwright
+    spec:
+      containers:
+        - name: playwright
+          image: mcr.microsoft.com/playwright:v1.50.0-noble
+          imagePullPolicy: IfNotPresent
+          command: ["sleep", "infinity"]
+
+          resources:
+            requests:
+              cpu: "500m"
+              memory: "1Gi"
+            limits:
+              cpu: "2"
+              memory: "4Gi"
+
+          volumeMounts:
+            # Captures directory for screenshots and videos
+            - name: captures
+              mountPath: /captures
+
+            # Scripts ConfigMap mounted as scripts directory
+            - name: scripts
+              mountPath: /scripts
+
+          # Liveness - verifies the scripts ConfigMap is mounted (capture.js is present)
+          livenessProbe:
+            exec:
+              command:
+                - test
+                - -f
+                - /scripts/capture.js
+            initialDelaySeconds: 5
+            periodSeconds: 60
+
+          # Readiness - the node runtime responds (Playwright itself is not exercised)
+          readinessProbe:
+            exec:
+              command:
+                - node
+                - --version
+            initialDelaySeconds: 10
+            periodSeconds: 30
+            timeoutSeconds: 10
+
+      volumes:
+        - name: captures
+          emptyDir: {}
+
+        - name: scripts
+          configMap:
+            name: playwright-scripts
+            defaultMode: 0755
+---
+# Headless service for StatefulSet
+apiVersion: v1
+kind: Service
+metadata:
+  name: playwright
+  namespace: rdev
+  labels:
+    app.kubernetes.io/name: playwright
+    app.kubernetes.io/part-of: rdev
+spec:
+  clusterIP: None
+  selector:
+    app: playwright
+  ports:
+    - port: 9323
+      name: debug
diff --git a/deployments/k8s/base/playwright-scripts-configmap.yaml b/deployments/k8s/base/playwright-scripts-configmap.yaml
new file mode 100644
index 0000000..3efbac3
--- /dev/null
+++ b/deployments/k8s/base/playwright-scripts-configmap.yaml
@@ -0,0 +1,108 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: playwright-scripts
+  namespace: rdev
+  labels:
+    app.kubernetes.io/name: playwright
+    app.kubernetes.io/part-of: rdev
+data:
+  capture.js: |
+    #!/usr/bin/env node
+    // capture.js - Playwright screenshot/video capture script
+    // Input:  --url, --viewports (comma-separated), --output (dir),
+    //         --wait-for (selector), --full-page, --video
+    // Output: JSON manifest to stdout
+
+    const { chromium } = require('playwright');
+    const path = require('path');
+    const fs = require('fs');
+
+    async function main() {
+      const args = parseArgs(process.argv.slice(2));
+
+      if (!args.url) {
+        console.error('Error: --url is required');
+        process.exit(1);
+      }
+
+      const outputDir = args.output || '/captures/default';
+      const viewports = args.viewports ? args.viewports.split(',') : ['1920x1080', '768x1024', '375x667'];
+      const waitFor = args['wait-for'] || 'body';
+      const fullPage = args['full-page'] === 'true';
+      const recordVideo = args.video === 'true';
+
+      // Ensure output directory exists
+      fs.mkdirSync(outputDir, { recursive: true });
+
+      const browser = await chromium.launch({ headless: true });
+      const result = { screenshots: {} };
+
+      try {
+        for (const viewport of viewports) {
+          const [width, height] = viewport.split('x').map(Number);
+          const viewportName = `${width}x${height}`;
+
+          const contextOptions = {
+            viewport: { width, height },
+          };
+
+          if (recordVideo && viewport === viewports[0]) {
+            contextOptions.recordVideo = {
+              dir: outputDir,
+              size: { width, height }
+            };
+          }
+
+          const context = await browser.newContext(contextOptions);
+          const page = await context.newPage();
+
+          await page.goto(args.url, { waitUntil: 'networkidle', timeout: 30000 });
+          await page.waitForSelector(waitFor, { timeout: 10000 }).catch(() => {});
+
+          const screenshotPath = path.join(outputDir, `${viewportName.replace('x', '_')}.png`);
+          await page.screenshot({ path: screenshotPath, fullPage });
+          result.screenshots[viewportName] = screenshotPath;
+
+          if (recordVideo && viewport === viewports[0]) {
+            await page.close();
+            const video = page.video();
+            if (video) {
+              const videoPath = await video.path();
+              const finalVideoPath = path.join(outputDir, 'recording.webm');
+              fs.renameSync(videoPath, finalVideoPath);
+              result.video = finalVideoPath;
+            }
+          }
+
+          await context.close();
+        }
+      } finally {
+        await browser.close();
+      }
+
+      console.log(JSON.stringify(result));
+    }
+
+    function parseArgs(argv) {
+      const args = {};
+      for (let i = 0; i < argv.length; i++) {
+        if (argv[i].startsWith('--')) {
+          const key = argv[i].slice(2);
+          const eqIdx = key.indexOf('=');
+          if (eqIdx !== -1) {
+            args[key.slice(0, eqIdx)] = key.slice(eqIdx + 1);
+          } else if (argv[i + 1] && !argv[i + 1].startsWith('--')) {
+            args[key] = argv[++i];
+          } else {
+            args[key] = 'true';
+          }
+        }
+      }
+      return args;
+    }
+
+    main().catch(err => {
+      console.error('Error:', err.message);
+      process.exit(1);
+    });
diff --git a/docs/plans/visual-verification-breakdown.md b/docs/plans/visual-verification-breakdown.md
new file mode 100644
index 0000000..9b5038e
--- /dev/null
+++ b/docs/plans/visual-verification-breakdown.md
@@ -0,0 +1,479 @@
+# Visual Verification Implementation Breakdown
+
+**Goal:** Add Playwright-based visual verification to rdev, enabling automated screenshot/video capture of deployed sites and AI-driven feature completeness evaluation. Integrate with SDLC as an optional QA gate and add a cookbook E2E test.
+
+**Estimated Duration:** 4 weeks (assumes ~25 hours/week of focused work)
+
+---
+
+## Week 1: Foundation — Domain + Capture Infrastructure
+
+**Goals:**
+- Playwright pod deployed and reachable via kubectl exec
+- Capture script working end-to-end
+- Domain models and work task type in place
+- Manual verification via kubectl exec confirms capture works
+
+**Tasks:**
+
+### Day 1-2: Playwright Pod Infrastructure
+
+1. **Create Playwright pod manifest** (`deployments/k8s/base/playwright-pod.yaml`)
+   - StatefulSet with `mcr.microsoft.com/playwright:v1.50.0-noble` image
+   - `sleep infinity` command (stays alive for kubectl exec)
+   - Labels: `app: playwright`, `rdev.orchard9.ai/role: playwright`
+   - Volumes: `/captures` (emptyDir), `/scripts` (ConfigMap)
+   - Resources: 500m CPU / 1Gi request, 2 CPU / 4Gi limit
+
+2. **Create capture script** (`deployments/k8s/base/playwright-scripts/capture.js`)
+   - ~60 lines Node.js using Playwright
+   - CLI: `--url`, `--viewports` (comma-sep), `--output`, `--wait-for`, `--full-page`, `--video`, `--timeout`
+   - Output: JSON manifest to stdout with screenshot paths
+   - Error handling: catch navigation failures, timeout gracefully
+
+3. **Create ConfigMap for script** (`deployments/k8s/base/playwright-scripts-configmap.yaml`)
+   - Mount `capture.js` at `/scripts/capture.js`
+
+4. **Deploy to cluster and test manually**
+   ```bash
+   kubectl apply -f deployments/k8s/base/playwright-scripts-configmap.yaml
+   kubectl apply -f deployments/k8s/base/playwright-pod.yaml
+   kubectl exec playwright-0 -- node /scripts/capture.js \
+     --url=https://example.com --viewports=1920x1080 --output=/captures/test/
+   kubectl exec playwright-0 -- ls -l /captures/test/   # the JSON manifest itself is printed to stdout by capture.js
+   ```
+
+### Day 3: Domain Models
+
+5. **Create domain types** (`internal/domain/verify.go`)
+   - `VerifySpec` struct with fields: URL, Viewports, WaitFor, WaitTimeout, FullPage, Video, Evaluate, Prompt, SpecPath, CallbackURL
+   - `Validate()` method: URL required, callback URL validation (reuse `ValidateCallbackURL`)
+   - `VerifyResult` struct: Success, Screenshots, Video, Evaluation, Score, Passed, DurationMs, Error
+   - `ToWorkResult()` method (promote screenshots to artifacts map)
+
+6. **Add work task type** (`internal/domain/work.go`)
+   - Add `WorkTaskTypeVerify WorkTaskType = "verify"` to constants
+   - Update `IsValid()` to include verify
+
+7. **Unit tests** (`internal/domain/verify_test.go`)
+   - Test Validate() with valid/invalid specs
+   - Test ToWorkResult() conversion
+
+### Day 4-5: Verify Executor (Capture Only)
+
+8. **Create verify executor** (`internal/worker/verify_executor.go`)
+   - Follow `BuildExecutor` pattern exactly
+   - `Execute(ctx, task)` method:
+     - Parse VerifySpec from task.Spec map
+     - Build kubectl exec command: `kubectl exec playwright-0 -- node /scripts/capture.js --url=X ...`
+     - Execute via existing `CommandExecutor` port
+     - Parse JSON manifest from stdout
+     - Return `BuildResult` with artifacts map containing screenshot paths
+   - Config struct: `VerifyExecutorConfig` with playwright pod name, namespace
+   - Constructor: `NewVerifyExecutor(executor, streams, logger, cfg)`
+
+9. **Wire executor to WorkExecutor** (`internal/worker/work_executor.go`)
+   - Add `verifyExec *VerifyExecutor` field
+   - Add case in `executeTask()` switch for `WorkTaskTypeVerify`
+   - Update `NewWorkExecutor()` to accept VerifyExecutor
+
+10. **Unit tests** (`internal/worker/verify_executor_test.go`)
+    - Mock CommandExecutor to return capture manifest JSON
+    - Test successful capture with multiple viewports
+    - Test failure handling (command fails, invalid JSON)
+
+**Deliverables:**
+- [ ] Playwright pod running in cluster
+- [ ] Capture script takes screenshots successfully
+- [ ] VerifySpec/VerifyResult domain types with tests
+- [ ] VerifyExecutor can dispatch capture via kubectl exec
+- [ ] Work queue can dispatch verify tasks (manual test via SQL insert)
+
+**Foundation this enables:**
+- Week 2 can build API layer knowing capture works
+- Executor pattern established for AI evaluation later
+
+---
+
+## Week 2: API Layer + Manual E2E
+
+**Goals:**
+- Full API surface: POST /verify, GET /verify/{id}, GET /verifications
+- Auth scopes configured
+- Manual E2E working: API call → queue → capture → result
+- Initial release candidate deployed to staging
+
+**Tasks:**
+
+### Day 1: Auth and Service Layer
+
+1. **Add auth scopes** (`internal/auth/scopes.go`)
+   - `ScopeVerifyRead Scope = "verify:read"`
+   - `ScopeVerifyWrite Scope = "verify:write"`
+   - Add to `AllScopes` if needed
+
+2. **Create verify service** (`internal/service/verify_service.go`)
+   - Follow `BuildService` pattern
+   - `StartVerify(ctx, projectID, spec)` → validate, enqueue task, return task ID
+   - `GetVerifyStatus(ctx, taskID)` → get task from work queue
+   - `ListVerifications(ctx, projectID, limit)` → list tasks by project
+   - Dependencies: WorkQueue port (existing)
+
+3. **Unit tests** (`internal/service/verify_service_test.go`)
+   - Mock work queue
+   - Test enqueue, status, list
+
+### Day 2-3: Handler Layer
+
+4. **Create verify handler** (`internal/handlers/verify.go`)
+   - Follow `BuildsHandler` pattern exactly
+   - `Mount(r api.Router)` with scopes:
+     - POST `/projects/{id}/verify` → ScopeVerifyWrite
+     - GET `/projects/{id}/verifications` → ScopeVerifyRead
+     - GET `/verify/{taskId}` → ScopeVerifyRead
+   - Use `api.DecodeJSON()`, `validate.New()`, response helpers
+   - Request struct: `VerifyRequest` matching VerifySpec
+   - Response structs: match existing patterns
+
+5. **Wire DI** (`cmd/rdev-api/main.go`)
+   - Create VerifyExecutor in worker setup
+   - Create VerifyService
+   - Create VerifyHandler
+   - Mount routes
+
+6. **Handler tests** (`internal/handlers/verify_test.go`)
+   - Test POST with valid/invalid specs
+   - Test auth scope enforcement
+   - Test GET status/list
+
+### Day 4: SSE Events
+
+7. **Add verify events** (`internal/worker/verify_executor.go`)
+   - Publish events via StreamPublisher:
+     - `verify.started` - task claimed
+     - `verify.capturing` - starting capture
+     - `verify.captured` - capture complete with manifest
+     - `verify.completed` / `verify.failed` - final status
+   - Event constants in verify_executor.go (follow BuildExecutor pattern)
+
+### Day 5: Manual E2E + Deploy
+
+8. **Manual E2E test sequence**
+   ```bash
+   # 1. Start verification
+   curl -X POST $RDEV_API_URL/projects/myproject/verify \
+     -H "X-API-Key: $RDEV_API_KEY" \
+     -H "Content-Type: application/json" \
+     -d '{"url": "https://myproject.threesix.ai", "viewports": ["1920x1080"]}'
+   # Response: {"task_id": "xxx"}
+
+   # 2. Poll for completion
+   curl $RDEV_API_URL/verify/xxx -H "X-API-Key: $RDEV_API_KEY"
+   # Response: screenshots in artifacts
+   ```
+
+9. **Build and deploy**
+   ```bash
+   ./scripts/release.sh v0.11.0 "feat: add visual verification (capture-only MVP)" --deploy
+   ```
+
+**Deliverables:**
+- [ ] Auth scopes for verify:read/write
+- [ ] VerifyService with enqueue/status/list
+- [ ] VerifyHandler with 3 endpoints
+- [ ] SSE events for verification progress
+- [ ] Deployed to staging, manual E2E passing
+
+**Foundation this enables:**
+- Week 3 can add AI evaluation knowing API works
+- Cookbook script can use standard api_call() pattern
+
+---
+
+## Week 3: AI Evaluation + Cookbook Test
+
+**Goals:**
+- AI evaluation path working (Claude reads screenshots, returns verdict)
+- Cookbook E2E test script: `visual-verify-test.sh`
+- Add to common.sh utilities
+- Full E2E passing in CI
+
+**Tasks:**
+
+### Day 1-2: AI Evaluation Path
+
+1. **Add evaluation to VerifyExecutor** (`internal/worker/verify_executor.go`)
+   - After successful capture, if `spec.Evaluate`:
+     - Build evaluation prompt: "Compare these screenshots against the specification..."
+     - Include spec.Prompt or read spec.SpecPath content
+     - Call Claude Code via CodeAgentRegistry
+     - Pass screenshots as attachments (file paths in pod)
+     - Parse evaluation output for score (look for "Score: XX/100" pattern)
+     - Set result.Evaluation, result.Score, result.Passed
+
+2. **Evaluation prompt template** (hardcoded in executor for now)
+   ```
+   Evaluate these screenshots against the following specification:
+
+   {spec.Prompt or contents of spec.SpecPath}
+
+   For each screenshot, assess:
+   1. Does the UI match the specification?
+   2. Are all required elements present?
+   3. Is the layout correct at this viewport?
+
+   End with: "Score: XX/100" and "PASSED" or "FAILED"
+   ```
+
+3. **Handle partial failures** (`internal/worker/verify_executor.go`)
+   - If capture succeeds but evaluation fails:
+     - Set success=true (screenshots are still useful)
+     - Leave evaluation=""
+     - Log warning
+
+4. **Unit tests for evaluation path**
+   - Mock CodeAgentRegistry
+   - Test evaluation output parsing
+   - Test partial failure handling
+
+### Day 3-4: Cookbook Test Script
+
+5. **Add utility to common.sh** (`cookbooks/scripts/common.sh`)
+   ```bash
+   # Wait for verification to complete
+   # Arguments: task_id [max_attempts] [poll_interval]
+   wait_for_verify() {
+       local task_id="$1"
+       local max_attempts="${2:-30}"
+       local poll_interval="${3:-5}"
+       # Poll GET /verify/{task_id} until completed/failed
+   }
+   ```
+
+6. **Create visual-verify-test.sh** (`cookbooks/scripts/visual-verify-test.sh`)
+   - Follow cookbook script SKILL.md patterns exactly
+   - Commands: run, status, diagnose, teardown
+   - Flow:
+     1. Create composable project with app-astro component
+     2. Wait for initial deploy (site is live)
+     3. Start build: "Create a hero section with a call-to-action button"
+     4. Wait for build to complete
+     5. Wait for CI pipeline
+     6. Wait for site to respond
+     7. Start verification: `POST /projects/{id}/verify {url, evaluate: true, prompt: ...}`
+     8. Wait for verify to complete
+     9. Assert: result.passed == true OR result.score >= 70
+     10. Teardown
+
+7. **Add auto-teardown support**
+   - Parse `--auto-teardown` flag
+   - Register cleanup trap
+   - Set CLEANUP_PROJECT
+
+### Day 5: Integration + CI
+
+8. **Test locally**
+   ```bash
+   ./cookbooks/scripts/visual-verify-test.sh run vv-test --auto-teardown
+   ```
+
+9. **Add to CI** (if CI runs cookbook tests)
+   - Add visual-verify-test to test matrix
+   - Ensure playwright-0 pod is available in test environment
+
+10. **Document in cookbook skill** (`.claude/skills/cookbook-scripts/SKILL.md`)
+    - Add `wait_for_verify()` to utilities list
+    - Add visual-verify-test.sh to examples
+
+**Deliverables:**
+- [ ] AI evaluation working with score extraction
+- [ ] Partial failure handling (capture ok, eval fail)
+- [ ] wait_for_verify() in common.sh
+- [ ] visual-verify-test.sh passing end-to-end
+- [ ] Documentation updated
+
+**Foundation this enables:**
+- Week 4 can add SDLC integration knowing full flow works
+- Cookbook pattern established for future tests
+
+---
+
+## Week 4: SDLC Integration + Polish
+
+**Goals:**
+- Visual verification as optional SDLC gate between QA and merge
+- Skeleton command: `/verify-feature`
+- Build chaining: auto-verify after deploy
+- Release v0.12.0 with full feature
+
+**Tasks:**
+
+### Day 1-2: SDLC Types and Rules
+
+1. **Add artifact type** (`internal/sdlc/types.go`)
+   - `ArtifactVerification ArtifactType = "verification"`
+   - Add to `ValidArtifactTypes` slice
+   - Add case in `ArtifactFilename()` → returns `"verification.md"`
+
+2. **Add action types** (`internal/sdlc/types.go`)
+   - `ActionVerifyFeature ActionType = "VERIFY_FEATURE"`
+   - `ActionFixVerificationIssues ActionType = "FIX_VERIFICATION_ISSUES"`
+
+3. **Add classifier rules** (`internal/sdlc/rules_execution.go`)
+   - `needsVerificationRule()`:
+     - Condition: Phase=QA, qa_results=passed, verification=nil or pending
+     - Action: ActionVerifyFeature
+     - NextCommand: "/verify-feature {slug}"
+   - `verificationFailedRule()`:
+     - Condition: Phase=QA, verification=failed
+     - Action: ActionFixVerificationIssues
+     - NextCommand: "/fix-verification-issues {slug}"
+   - `verificationPassedRule()`:
+     - Condition: Phase=QA, qa_results=passed, verification=passed
+     - Action: ActionTransition to PhaseMerge
+
+4. **Update rule ordering** (`internal/sdlc/rules.go`)
+   - Insert verification rules after qaPassedRule
+   - Update qaPassedRule: only transition if verification also passed OR feature doesn't require verification (config flag)
+
+5. **Unit tests** (`internal/sdlc/rules_execution_test.go`)
+   - Test all three verification rules
+   - Test interaction with existing QA rules
+
+### Day 3: Skeleton Command
+
+6. **Create verify-feature command** (embedded template: `templates/skeleton/.claude/commands/verify-feature.md`)
+   ````markdown
+   ---
+   description: Visually verify a deployed feature
+   argument-hint: <feature-slug>
+   allowed-tools: Bash, Read, Write, Edit, Glob, Grep
+   ---
+
+   Visually verify feature: $ARGUMENTS
+
+   ## Instructions
+
+   1. Load feature spec from `.sdlc/features/$ARGUMENTS/spec.md`
+   2. Get project domain from CLAUDE.md or config
+   3. Determine the deployed URL
+   4. Execute verification via rdev API (if available) or Playwright directly
+   5. Write results to `.sdlc/features/$ARGUMENTS/verification.md`
+   6. Register artifact: `sdlc artifact create $ARGUMENTS verification`
+
+   ## Output Format
+
+   Write `.sdlc/features/$ARGUMENTS/verification.md`:
+
+   ```markdown
+   # Visual Verification: [Feature Title]
+
+   ## Screenshots
+
+   | Viewport | Status | Notes |
+   |----------|--------|-------|
+   | Desktop (1920x1080) | PASS | All elements visible |
+   | Mobile (375x667) | PASS | Responsive layout correct |
+
+   ## Evaluation
+
+   [AI or manual evaluation notes]
+
+   ## Result
+
+   **Status:** PASSED
+   **Score:** 95/100
+   ```
+   ````
+
+7. **Update skeleton template** to include the command
+   - Ensure new projects get verify-feature.md
+
+### Day 4: Build Chaining (Optional)
+
+8. **Add verify_after to BuildSpec** (`internal/domain/build.go`)
+   - `VerifyAfter bool` - auto-verify after successful deploy
+   - `VerifyURL string` - URL to verify (if different from project domain)
+
+9. **Chain verification in BuildExecutor** (`internal/worker/build_executor.go`)
+   - After successful build + push (line ~270):
+     ```go
+     if spec.VerifyAfter && spec.VerifyURL != "" {
+         // Enqueue verify task
+     }
+     ```
+   - Or: callback webhook triggers external verification
+
+10. **Update build handler** to accept verify_after/verify_url
+
+### Day 5: Documentation + Release
+
+11. **Update documentation**
+    - CLAUDE.md: Update platform status to "Done"
+    - visual-verification.md: Add SDLC integration examples
+    - sdlc.md: Document verification rules
+
+12. **Integration test**
+    - Test full SDLC flow with verification gate
+    - Test classifier transitions correctly
+
+13. **Final release**
+    ```bash
+    ./scripts/release.sh v0.12.0 "feat: visual verification with SDLC integration" --deploy
+    ```
+
+**Deliverables:**
+- [ ] ArtifactVerification type in SDLC
+- [ ] 3 classifier rules for verification gate
+- [ ] verify-feature.md skeleton command
+- [ ] Build chaining (verify_after flag)
+- [ ] Full integration test passing
+- [ ] v0.12.0 released
+
+---
+
+## Summary
+
+| Week | Theme | Key Output |
+|------|-------|------------|
+| 1 | Foundation | Playwright pod + capture script + domain types + executor |
+| 2 | API Layer | Handlers + service + auth scopes + manual E2E |
+| 3 | AI + Cookbook | Evaluation path + visual-verify-test.sh + common.sh utils |
+| 4 | SDLC + Polish | Classifier rules + skeleton command + build chaining + release |
+
+## Risks and Mitigations
+
+| Risk | Impact | Mitigation |
+|------|--------|------------|
+| Playwright pod OOM | Capture fails | Start with conservative limits (4Gi), tune based on usage |
+| AI evaluation unreliable | Poor pass/fail decisions | Start with high threshold (70), tune; partial success mode |
+| Screenshot storage fills up | Pod crashes | EmptyDir for now, add cleanup job or PVC later |
+| SDLC rules conflict | Features stuck | Test extensively, make verification optional via config |
+| Claude Code can't read screenshots | Evaluation broken | Test multimodal support; fallback to manual verification |
+
+## Files Created/Modified
+
+**New Files (13):**
+- `internal/domain/verify.go`
+- `internal/domain/verify_test.go`
+- `internal/service/verify_service.go`
+- `internal/service/verify_service_test.go`
+- `internal/handlers/verify.go`
+- `internal/handlers/verify_test.go`
+- `internal/worker/verify_executor.go`
+- `internal/worker/verify_executor_test.go`
+- `deployments/k8s/base/playwright-pod.yaml`
+- `deployments/k8s/base/playwright-scripts-configmap.yaml`
+- `deployments/k8s/base/playwright-scripts/capture.js`
+- `cookbooks/scripts/visual-verify-test.sh`
+- `templates/skeleton/.claude/commands/verify-feature.md`
+
+**Modified Files (8):**
+- `internal/domain/work.go` - Add WorkTaskTypeVerify
+- `internal/auth/scopes.go` - Add verify scopes
+- `internal/worker/work_executor.go` - Add dispatch case
+- `internal/sdlc/types.go` - Add artifact/action types
+- `internal/sdlc/rules.go` - Register verification rules
+- `internal/sdlc/rules_execution.go` - Add verification rules
+- `cookbooks/scripts/common.sh` - Add wait_for_verify()
+- `cmd/rdev-api/main.go` - Wire DI
diff --git a/internal/adapter/templates/templates/astro-landing/.woodpecker.yml b/internal/adapter/templates/templates/astro-landing/.woodpecker.yml
index 4f2d84f..ad7b75f 100644
--- a/internal/adapter/templates/templates/astro-landing/.woodpecker.yml
+++ b/internal/adapter/templates/templates/astro-landing/.woodpecker.yml
@@ -23,6 +23,7 @@ steps:
         - ${CI_COMMIT_SHA:0:8}
       cache: true
       skip-tls-verify: true
+    failure: retry
     when:
       - event: push
         branch: main
diff --git a/internal/adapter/templates/templates/components/cli/go.sum.tmpl b/internal/adapter/templates/templates/components/cli/go.sum.tmpl
new file mode 100644
index 0000000..e69de29
diff --git a/internal/adapter/templates/templates/components/service/Dockerfile.tmpl b/internal/adapter/templates/templates/components/service/Dockerfile.tmpl
index 57377af..fdd1358 100644
--- a/internal/adapter/templates/templates/components/service/Dockerfile.tmpl
+++ b/internal/adapter/templates/templates/components/service/Dockerfile.tmpl
@@ -10,7 +10,8 @@ ENV GOWORK=/app/go.work
 WORKDIR /app
 
 # Copy go workspace and all source (workspace deps are local)
-COPY go.work go.work.sum* ./
+COPY go.work ./
+COPY go.work.sum ./
 COPY pkg/ ./pkg/
 COPY services/{{COMPONENT_NAME}}/ ./services/{{COMPONENT_NAME}}/
 
diff --git a/internal/adapter/templates/templates/components/service/go.sum.tmpl b/internal/adapter/templates/templates/components/service/go.sum.tmpl
new file mode 100644
index 0000000..e69de29
diff --git a/internal/adapter/templates/templates/components/worker/Dockerfile.tmpl b/internal/adapter/templates/templates/components/worker/Dockerfile.tmpl
index 33fea68..3973313 100644
--- a/internal/adapter/templates/templates/components/worker/Dockerfile.tmpl
+++ b/internal/adapter/templates/templates/components/worker/Dockerfile.tmpl
@@ -10,7 +10,8 @@ ENV GOWORK=/app/go.work
 WORKDIR /app
 
 # Copy go workspace and all source (workspace deps are local)
-COPY go.work go.work.sum* ./
+COPY go.work ./
+COPY go.work.sum ./
 COPY pkg/ ./pkg/
 COPY workers/{{COMPONENT_NAME}}/ ./workers/{{COMPONENT_NAME}}/
 
diff --git a/internal/adapter/templates/templates/components/worker/go.sum.tmpl b/internal/adapter/templates/templates/components/worker/go.sum.tmpl
new file mode 100644
index 0000000..e69de29
diff --git a/internal/adapter/templates/templates/default/.woodpecker.yml b/internal/adapter/templates/templates/default/.woodpecker.yml
index 2105b9b..c9b5ad4 100644
--- a/internal/adapter/templates/templates/default/.woodpecker.yml
+++ b/internal/adapter/templates/templates/default/.woodpecker.yml
@@ -9,6 +9,7 @@ steps:
         - ${CI_COMMIT_SHA:0:8}
       cache: true
       skip-tls-verify: true
+    failure: retry
     when:
       - event: push
         branch: main
diff --git a/internal/adapter/templates/templates/go-api/.woodpecker.yml b/internal/adapter/templates/templates/go-api/.woodpecker.yml
index 7eb3f66..f2cec55 100644
--- a/internal/adapter/templates/templates/go-api/.woodpecker.yml
+++ b/internal/adapter/templates/templates/go-api/.woodpecker.yml
@@ -23,6 +23,7 @@ steps:
         - ${CI_COMMIT_SHA:0:8}
       cache: true
       skip-tls-verify: true
+    failure: retry
     when:
       - event: push
         branch: main
diff --git a/internal/adapter/templates/templates/skeleton/.woodpecker.yml.tmpl b/internal/adapter/templates/templates/skeleton/.woodpecker.yml.tmpl
index 1f16d55..fbb54dc 100644
--- a/internal/adapter/templates/templates/skeleton/.woodpecker.yml.tmpl
+++ b/internal/adapter/templates/templates/skeleton/.woodpecker.yml.tmpl
@@ -8,6 +8,32 @@ clone:
       depth: 1
 
 steps:
+  deps:
+    image: golang:1.23
+    commands:
+      - go work sync
+      - |
+        for dir in services/*/; do
+          if [ -f "$dir/go.mod" ]; then
+            (cd "$dir" && go mod tidy)
+          fi
+        done
+      - |
+        for dir in workers/*/; do
+          if [ -f "$dir/go.mod" ]; then
+            (cd "$dir" && go mod tidy)
+          fi
+        done
+      - |
+        for dir in cli/*/; do
+          if [ -f "$dir/go.mod" ]; then
+            (cd "$dir" && go mod tidy)
+          fi
+        done
+    when:
+      branch: main
+      event: push
+
   # COMPONENT_STEPS_BELOW
   # Do not remove the marker above - component steps are inserted here
 
diff --git a/internal/adapter/templates/templates/skeleton/pkg/go.mod.tmpl b/internal/adapter/templates/templates/skeleton/pkg/go.mod.tmpl
index 5408061..ff13211 100644
--- a/internal/adapter/templates/templates/skeleton/pkg/go.mod.tmpl
+++ b/internal/adapter/templates/templates/skeleton/pkg/go.mod.tmpl
@@ -3,7 +3,7 @@ module {{GO_MODULE}}/pkg
 go 1.23
 
 require (
-	github.com/bdpiprava/scalar-go v0.1.2
+	github.com/bdpiprava/scalar-go v0.13.0
 	github.com/go-chi/chi/v5 v5.2.0
 	github.com/go-chi/cors v1.2.1
 	github.com/go-playground/validator/v10 v10.23.0
diff --git a/internal/adapter/templates/templates/skeleton/pkg/go.sum.tmpl b/internal/adapter/templates/templates/skeleton/pkg/go.sum.tmpl
new file mode 100644
index 0000000..e69de29
diff --git a/internal/adapter/zot/client.go b/internal/adapter/zot/client.go
new file mode 100644
index 0000000..a15161d
--- /dev/null
+++ b/internal/adapter/zot/client.go
@@ -0,0 +1,74 @@
+// Package zot provides a client for checking zot container registry health.
+package zot
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/orchard9/rdev/internal/domain"
+)
+
+// Client checks zot registry health via the OCI /v2/ endpoint.
+type Client struct {
+	url        string       // registry base URL; "/v2/" is appended for each check
+	httpClient *http.Client // HTTP client used to send the probe request
+}
+
+// NewClient builds a zot health checker for the given registry base URL
+// (e.g., "https://registry.threesix.ai"). Each probe is capped at five seconds.
+func NewClient(url string) *Client {
+	const probeTimeout = 5 * time.Second
+
+	return &Client{
+		url:        url,
+		httpClient: &http.Client{Timeout: probeTimeout},
+	}
+}
+
+// Check probes the registry's OCI /v2/ endpoint and reports its health.
+//
+// A 200 or 401 response counts as healthy: a 401 means the registry wants
+// credentials, which still shows it is up and answering requests. Any other
+// status code, or a failure to build or send the request, is reported as
+// unhealthy with the reason in the Error field.
+func (c *Client) Check(ctx context.Context) domain.RegistryStatus {
+	began := time.Now()
+	result := domain.RegistryStatus{URL: c.url}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.url+"/v2/", nil)
+	if err != nil {
+		// Nothing was sent, so Latency is left empty.
+		result.Error = fmt.Sprintf("failed to create request: %v", err)
+		result.LastChecked = time.Now().UTC()
+		return result
+	}
+
+	resp, err := c.httpClient.Do(req)
+
+	// Latency is measured from the start of Check and is recorded whether or
+	// not the request succeeded.
+	result.Latency = time.Since(began).String()
+
+	if err != nil {
+		result.Error = fmt.Sprintf("connection error: %v", err)
+		result.LastChecked = time.Now().UTC()
+		return result
+	}
+
+	// The body is never read; a Close error cannot change the verdict.
+	defer func() { _ = resp.Body.Close() }()
+
+	switch resp.StatusCode {
+	case http.StatusOK, http.StatusUnauthorized:
+		// 200 = healthy, 401 = healthy but requires auth
+		result.Healthy = true
+	default:
+		result.Error = fmt.Sprintf("unexpected status code: %d", resp.StatusCode)
+	}
+
+	result.LastChecked = time.Now().UTC()
+
+	return result
+}
diff --git a/internal/domain/errors.go b/internal/domain/errors.go
index 400ac8e..b2134f6 100644
--- a/internal/domain/errors.go
+++ b/internal/domain/errors.go
@@ -79,6 +79,7 @@ var (
 	ErrOperationNotFound = errors.New("operation not found")
 
 	// Infrastructure errors (should typically be wrapped)
-	ErrDatabaseConnection = errors.New("database connection error")
-	ErrKubernetesError    = errors.New("kubernetes error")
+	ErrDatabaseConnection  = errors.New("database connection error")
+	ErrKubernetesError     = errors.New("kubernetes error")
+	ErrRegistryUnavailable = errors.New("container registry unavailable")
 )
diff --git a/internal/domain/operation.go b/internal/domain/operation.go
index 4b79be3..67066da 100644
--- a/internal/domain/operation.go
+++ b/internal/domain/operation.go
@@ -12,6 +12,7 @@ const (
 	OperationTypeProjectCreate     OperationType = "project.create"
 	OperationTypeComponentAdd      OperationType = "component.add"
 	OperationTypeBuild             OperationType = "build"
+	OperationTypeCIBuild           OperationType = "ci.build"
 	OperationTypeResourceProvision OperationType = "resource.provision"
 )
 
@@ -19,7 +20,7 @@ const (
 func (t OperationType) IsValid() bool {
 	switch t {
 	case OperationTypeProjectCreate, OperationTypeComponentAdd,
-		OperationTypeBuild, OperationTypeResourceProvision:
+		OperationTypeBuild, OperationTypeCIBuild, OperationTypeResourceProvision:
 		return true
 	}
 	return false
diff --git a/internal/domain/registry.go b/internal/domain/registry.go
new file mode 100644
index 0000000..7681e80
--- /dev/null
+++ b/internal/domain/registry.go
@@ -0,0 +1,12 @@
+package domain
+
+import "time"
+
+// RegistryStatus represents the health status of a container registry.
+type RegistryStatus struct {
+	Healthy     bool      `json:"healthy"`           // whether the check considered the registry healthy
+	URL         string    `json:"url"`               // registry URL the check was run against
+	Latency     string    `json:"latency,omitempty"` // duration string such as "45ms"; omitted when empty
+	Error       string    `json:"error,omitempty"`   // failure description; omitted when empty
+	LastChecked time.Time `json:"last_checked"`      // when the check was performed
+}
diff --git a/internal/handlers/health.go b/internal/handlers/health.go
index fbbb224..a656ac8 100644
--- a/internal/handlers/health.go
+++ b/internal/handlers/health.go
@@ -8,6 +8,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/orchard9/rdev/internal/metrics"
 	"github.com/orchard9/rdev/internal/port"
 	"github.com/orchard9/rdev/pkg/api"
 )
@@ -20,11 +21,12 @@ type ExecutorHealthChecker interface {
 
 // HealthHandler handles health and readiness checks.
 type HealthHandler struct {
-	serviceName   string
-	db            port.DatabasePinger
-	k8sChecker    port.KubernetesChecker
-	agentRegistry port.CodeAgentRegistry
-	workExecutor  ExecutorHealthChecker
+	serviceName     string
+	db              port.DatabasePinger
+	k8sChecker      port.KubernetesChecker
+	agentRegistry   port.CodeAgentRegistry
+	workExecutor    ExecutorHealthChecker
+	registryChecker port.RegistryChecker
 }
 
 // NewHealthHandler creates a new health handler with dependencies.
@@ -48,6 +50,12 @@ func (h *HealthHandler) WithWorkExecutor(executor ExecutorHealthChecker) *Health
 	return h
 }
 
+// WithRegistryChecker sets the registry checker and returns h so calls can be chained.
+func (h *HealthHandler) WithRegistryChecker(checker port.RegistryChecker) *HealthHandler {
+	h.registryChecker = checker
+	return h
+}
+
 // Health returns a simple liveness check.
 // This should be lightweight and only fail if the process is unhealthy.
 // GET /health
@@ -100,6 +108,11 @@ func (h *HealthHandler) Ready(w http.ResponseWriter, r *http.Request) {
 		checks["work_executor"] = h.checkWorkExecutor()
 	}
 
+	// Registry check (informational - doesn't affect overall readiness)
+	if h.registryChecker != nil {
+		checks["registry"] = h.checkRegistry(ctx)
+	}
+
 	response := ReadinessResponse{
 		Status:  "ready",
 		Service: h.serviceName,
@@ -217,6 +230,35 @@ func (h *HealthHandler) checkWorkExecutor() CheckResult {
 	}
 }
 
+// checkRegistry probes the container registry, publishes the outcome to the
+// registry metrics, and converts it into a CheckResult.
+//
+// It must only be called when h.registryChecker is non-nil.
+func (h *HealthHandler) checkRegistry(ctx context.Context) CheckResult {
+	probe := h.registryChecker.Check(ctx)
+
+	// Latency arrives as a duration string like "45ms". An empty or
+	// unparseable value is reported to the metrics as zero seconds.
+	var seconds float64
+	if parsed, err := time.ParseDuration(probe.Latency); err == nil {
+		seconds = parsed.Seconds()
+	}
+	metrics.SetRegistryHealth(probe.Healthy, seconds)
+
+	// Healthy checks report "connected"; otherwise surface the error text.
+	message := probe.Error
+	if probe.Healthy {
+		message = "connected"
+	}
+
+	return CheckResult{
+		Healthy:   probe.Healthy,
+		Message:   message,
+		Latency:   probe.Latency,
+		LastCheck: probe.LastChecked,
+	}
+}
+
 // CheckResult represents the result of a health check.
 type CheckResult struct {
 	Healthy   bool      `json:"healthy"`
diff --git a/internal/handlers/woodpecker_webhook.go b/internal/handlers/woodpecker_webhook.go
index f4e218f..89e4d20 100644
--- a/internal/handlers/woodpecker_webhook.go
+++ b/internal/handlers/woodpecker_webhook.go
@@ -14,6 +14,7 @@ import (
 	"strings"
 
 	"github.com/orchard9/rdev/internal/domain"
+	"github.com/orchard9/rdev/internal/metrics"
 	"github.com/orchard9/rdev/internal/port"
 	"github.com/orchard9/rdev/internal/service"
 	"github.com/orchard9/rdev/pkg/api"
@@ -166,6 +167,18 @@ func (h *WoodpeckerWebhookHandler) HandleWebhook(w http.ResponseWriter, r *http.
 		"build_number", payload.Build.Number,
 	)
 
+	// Track failed builds for visibility
+	if payload.Build.Status == "failure" {
+		h.handleFailedBuild(ctx, payload)
+		api.WriteSuccess(w, r, map[string]any{
+			"status":  "recorded",
+			"reason":  "build failed",
+			"project": payload.Repo.Name,
+			"build":   payload.Build.Number,
+		})
+		return
+	}
+
 	// Only process successful builds on main/master branch
 	if payload.Build.Status != "success" {
 		api.WriteSuccess(w, r, map[string]string{
@@ -287,3 +300,58 @@ func (h *WoodpeckerWebhookHandler) verifySignature(body []byte, signature string
 
 	return hmac.Equal([]byte(signature), []byte(expected))
 }
+
+// handleFailedBuild logs a failed CI build, updates the CI metrics and, when an
+// operation service is configured, stores it as a failed ci.build operation.
+func (h *WoodpeckerWebhookHandler) handleFailedBuild(ctx context.Context, payload WoodpeckerPayload) {
+	projectName := payload.Repo.Name
+
+	h.logger.Warn("CI build failed",
+		"project", projectName,
+		"build_number", payload.Build.Number,
+		"branch", payload.Build.Branch,
+		"commit", payload.Build.Commit,
+		"author", payload.Build.Author,
+	)
+
+	metrics.RecordCIBuild(projectName, "failure")
+
+	// The webhook carries no step logs, so the real cause is unknown here;
+	// counting main/master failures as push failures is only a heuristic.
+	if payload.Build.Branch == "main" || payload.Build.Branch == "master" {
+		metrics.RecordCIPushFailure(projectName)
+	}
+
+	if h.operationService == nil {
+		return
+	}
+
+	operationID, startErr := h.operationService.StartOperation(ctx, projectName, domain.OperationTypeCIBuild, map[string]any{
+		"repo":         payload.Repo.FullName,
+		"branch":       payload.Build.Branch,
+		"commit":       payload.Build.Commit,
+		"build_number": payload.Build.Number,
+		"author":       payload.Build.Author,
+	}, "")
+	if startErr != nil {
+		h.logger.Error("failed to start operation for failed build", "error", startErr, "project", projectName)
+	}
+	if operationID == "" {
+		return
+	}
+
+	if opErr := h.operationService.SetExternalRef(ctx, operationID, fmt.Sprintf("build#%d", payload.Build.Number)); opErr != nil {
+		h.logger.Error("failed to set external ref", "error", opErr, "operation_id", operationID)
+	}
+
+	// Link to parent operation via commit SHA
+	if parent, err := h.operationService.FindByCommit(ctx, projectName, payload.Build.Commit); err == nil && parent != nil {
+		if opErr := h.operationService.LinkToParent(ctx, operationID, parent.ID); opErr != nil {
+			h.logger.Error("failed to link to parent operation", "error", opErr, "operation_id", operationID)
+		}
+	}
+
+	if opErr := h.operationService.FailOperation(ctx, operationID, "CI build failed", ""); opErr != nil {
+		h.logger.Error("failed to record operation failure", "error", opErr, "operation_id", operationID)
+	}
+}
diff --git a/internal/handlers/woodpecker_webhook_test.go b/internal/handlers/woodpecker_webhook_test.go
index 987fbd6..c675b32 100644
--- a/internal/handlers/woodpecker_webhook_test.go
+++ b/internal/handlers/woodpecker_webhook_test.go
@@ -254,7 +254,59 @@ func TestWoodpeckerWebhookHandler_LinksToParentOperation(t *testing.T) {
 	t.Error("build operation not found")
 }
 
-func TestWoodpeckerWebhookHandler_IgnoresNonSuccessBuilds(t *testing.T) {
+// TestWoodpeckerWebhookHandler_RecordsFailedBuilds verifies that a "failure"
+// webhook is stored as a failed ci.build operation and acknowledged as recorded.
+func TestWoodpeckerWebhookHandler_RecordsFailedBuilds(t *testing.T) {
+	opRepo := newMockOperationRepo()
+	opSvc := service.NewOperationService(opRepo, slog.Default())
+
+	h := &WoodpeckerWebhookHandler{
+		operationService: opSvc,
+		logger:           slog.Default(),
+	}
+
+	payload := WoodpeckerPayload{
+		Event: "push",
+		Repo:  WoodpeckerRepo{Name: "my-project", FullName: "org/my-project"},
+		Build: WoodpeckerBuild{Number: 99, Status: "failure", Branch: "main", Commit: "abc123"},
+	}
+	body, err := json.Marshal(payload)
+	if err != nil {
+		t.Fatalf("failed to marshal payload: %v", err)
+	}
+
+	req := httptest.NewRequest(http.MethodPost, "/webhooks/woodpecker", strings.NewReader(string(body)))
+	rec := httptest.NewRecorder()
+	h.HandleWebhook(rec, req)
+
+	// Failed builds are now recorded for visibility
+	if opRepo.count() != 1 {
+		t.Errorf("expected 1 operation for failed build, got %d", opRepo.count())
+	}
+
+	// Verify the operation was marked as failed
+	for _, op := range opRepo.operations {
+		if op.Type != domain.OperationTypeCIBuild {
+			t.Errorf("expected operation type ci.build, got %s", op.Type)
+		}
+		if op.Status != domain.OperationStatusFailed {
+			t.Errorf("expected operation status failed, got %s", op.Status)
+		}
+	}
+
+	// Verify response indicates build was recorded
+	var resp struct {
+		Data map[string]any `json:"data"`
+	}
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("failed to unmarshal response: %v", err)
+	}
+	if resp.Data["status"] != "recorded" {
+		t.Errorf("expected status 'recorded', got %v", resp.Data["status"])
+	}
+}
+
+func TestWoodpeckerWebhookHandler_IgnoresPendingBuilds(t *testing.T) {
 	opRepo := newMockOperationRepo()
 	opSvc := service.NewOperationService(opRepo, slog.Default())
 
@@ -267,7 +319,7 @@ func TestWoodpeckerWebhookHandler_IgnoresNonSuccessBuilds(t *testing.T) {
 		Event: "push",
 		Repo:  WoodpeckerRepo{Name: "my-project"},
 		Build: WoodpeckerBuild{
-			Status: "failure",
+			Status: "pending",
 			Branch: "main",
 			Commit: "abc123",
 		},
@@ -278,8 +330,8 @@ func TestWoodpeckerWebhookHandler_IgnoresNonSuccessBuilds(t *testing.T) {
 	rec := httptest.NewRecorder()
 	h.HandleWebhook(rec, req)
 
-	// Non-success builds are ignored, so no operation should be created
+	// Pending/running builds are ignored (only success and failure are handled)
 	if opRepo.count() != 0 {
-		t.Errorf("expected no operations for failed build, got %d", opRepo.count())
+		t.Errorf("expected no operations for pending build, got %d", opRepo.count())
 	}
 }
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 5b1c7e6..275a5ba 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -120,6 +120,28 @@ var (
 		Name: "rdev_api_requests_total",
 		Help: "Total number of API requests",
 	}, []string{"method", "path", "status"})
+
+	// Registry health
+	registryHealthy = promauto.NewGauge(prometheus.GaugeOpts{
+		Name: "rdev_registry_healthy",
+		Help: "Whether the container registry is healthy (1) or not (0)",
+	})
+
+	registryLatency = promauto.NewGauge(prometheus.GaugeOpts{
+		Name: "rdev_registry_latency_seconds",
+		Help: "Latency of registry health check in seconds",
+	})
+
+	// CI builds
+	ciBuildsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Name: "rdev_ci_builds_total",
+		Help: "Total number of CI builds by project and status",
+	}, []string{"project", "status"})
+
+	ciPushFailures = promauto.NewCounterVec(prometheus.CounterOpts{
+		Name: "rdev_ci_push_failures_total",
+		Help: "Total number of CI image push failures by project",
+	}, []string{"project"})
 )
 
 // RecordCommand records a command execution.
@@ -206,6 +228,26 @@ func SetWorkQueueDepth(status string, count int64) {
 	workQueueDepth.WithLabelValues(status).Set(float64(count))
 }
 
+// SetRegistryHealth publishes registry health (1 healthy, 0 not) and the check latency in seconds.
+func SetRegistryHealth(healthy bool, latencySeconds float64) {
+	var healthGauge float64
+	if healthy {
+		healthGauge = 1
+	}
+	registryHealthy.Set(healthGauge)
+	registryLatency.Set(latencySeconds)
+}
+
+// RecordCIBuild increments the CI build counter for the given project and status labels.
+func RecordCIBuild(project, status string) {
+	ciBuildsTotal.WithLabelValues(project, status).Inc()
+}
+
+// RecordCIPushFailure increments the CI image push failure counter for the given project.
+func RecordCIPushFailure(project string) {
+	ciPushFailures.WithLabelValues(project).Inc()
+}
+
 // Handler returns the Prometheus HTTP handler.
 func Handler() http.Handler {
 	return promhttp.Handler()
diff --git a/internal/port/health.go b/internal/port/health.go
index de853dc..582792e 100644
--- a/internal/port/health.go
+++ b/internal/port/health.go
@@ -1,6 +1,10 @@
 package port
 
-import "context"
+import (
+	"context"
+
+	"github.com/orchard9/rdev/internal/domain"
+)
 
 // DatabasePinger checks database connectivity.
 // *sql.DB satisfies this interface.
@@ -13,3 +17,9 @@ type KubernetesChecker interface {
 	// ServerVersion returns the server version string, or an error if unreachable.
 	ServerVersion() (string, error)
 }
+
+// RegistryChecker is the port through which container registry health is queried.
+type RegistryChecker interface {
+	// Check returns the registry's health status; there is no separate error return.
+	Check(ctx context.Context) domain.RegistryStatus
+}
diff --git a/vision.md b/vision.md
new file mode 100644
index 0000000..40d5b22
--- /dev/null
+++ b/vision.md
@@ -0,0 +1,357 @@
+# rdev: The Agent's Operating System
+
+> **Platform:** threesix.ai
+> **Category:** Infrastructure / Agent Orchestration Platform
+> **Role:** The runtime environment where AI agents become software engineers
+
+## The Problem: Agents Have No Workspace
+
+Current agent systems suffer from **The Phantom Limb** problem: agents can *think* but they can't *do*. They generate code but have nowhere to run it. They propose changes but have no git repo. They want to deploy but have no infrastructure.
+
+When you ask an agent to "build a landing page," it must:
+- **Beg for shell access** (security nightmare)
+- **Dump code to chat** (copy-paste purgatory)
+- **Hope you handle infra** (manual setup hell)
+
+**Real example:** A founder asks Claude to build a product landing page. Claude writes the code, but now what? The founder needs to set up a git repo, configure CI/CD, buy a domain, provision DNS, create a database, and figure out deployment. By the time infra is ready, the enthusiasm is gone. The code sits in a chat log. The product never launches.
+
+## The Solution: Give Agents a Full Developer Environment
+
+rdev rejects the idea that agents are just "code generators." Instead, it models agent work as a **Controlled Development Environment**:
+
+- **Projects are isolated.** Each agent workspace is a Kubernetes pod with its own git repo, secrets, and environment.
+- **Commands are executed.** Shell, Git, and Claude Code commands run inside pods, not locally.
+- **Infrastructure is automatic.** Git repos, CI/CD, DNS, databases, caches, and deployments provision on demand.
+- **Feature delivery is deterministic.** A 10-phase SDLC lifecycle guides every feature from idea to production.
+
+## The Four Pillars
+
+Every use case must demonstrate at least one pillar. If a shell script could do it, it's not a compelling use case.
+
+| Pillar | What It Enables | Shell Script Gap |
+|--------|-----------------|------------------|
+| **First-Class Isolation** | Each project in its own pod with dedicated workspace, credentials, network | Shared machine, credential leakage, no boundaries |
+| **Deterministic SDLC** | Every feature follows 10-phase lifecycle with classifier-driven transitions | Manual process, skipped steps, undefined state |
+| **Infrastructure Orchestration** | Git, CI/CD, DNS, DB, cache, deployment created via API | Hours of manual setup per project |
+| **Observable Execution** | Every command logged, streamed, auditable | Fire-and-forget scripts, no visibility |
+
+## The Core Data Model: The Project
+
+The atomic unit is not a Container, VM, or Directory. It is the **Project**:
+
+```go
+type Project struct {
+    // Identity
+    ID          string            // Kubernetes pod name
+    Name        string            // Human-readable name
+    Namespace   string            // K8s namespace isolation
+
+    // Infrastructure
+    GitRepo     *GitRepo          // Gitea repo with SSH/HTTPS URLs
+    Domain      *Domain           // Custom subdomain + TLS
+    Database    *Database         // CockroachDB isolated tenant
+    Cache       *Cache            // Redis ACL-scoped namespace
+
+    // Execution
+    Status      ProjectStatus     // Running, Stopped, Failed
+    Agent       CodeAgent         // Claude Code, OpenCode, etc.
+    WorkDir     string            // /workspace inside pod
+
+    // SDLC
+    Features    []Feature         // Active feature branches
+    Classifier  ClassifierEngine  // State machine for transitions
+}
+```
+
+## The SDLC Lifecycle
+
+Every feature follows a deterministic 10-phase lifecycle. The classifier engine evaluates state and returns the next valid action.
+
+| Phase | What Happens | Artifacts Produced |
+|-------|--------------|-------------------|
+| **Draft** | Feature captured as rough idea | `spec.md` draft |
+| **Specified** | Requirements refined, acceptance criteria defined | `spec.md` approved |
+| **Planned** | Implementation strategy designed | `design.md` with component breakdown |
+| **Ready** | Tasks extracted, blockers resolved | `tasks.md` with implementation items |
+| **Implementation** | Code written task-by-task | Code commits, test coverage |
+| **Review** | Code reviewed for quality | Review comments, fixes |
+| **Audit** | Tech debt and security checked | Audit report |
+| **QA** | Feature tested against spec | QA checklist, evidence |
+| **Merge** | Feature branch merged to main | Git merge commit |
+| **Released** | Deployed to production | Deployment record |
+
+The classifier is a pure function: given current state, it returns the next action. No ambiguity. No skipped steps.
+
+## The Work Queue: Scaled Agent Labor
+
+Multiple agents can work across projects via the **Worker Pool**:
+
+```go
+type WorkTask struct {
+    // Identity
+    ID          string            // UUID
+    ProjectID   string            // Target project
+    Command     string            // claude, shell, git
+
+    // State
+    Status      TaskStatus        // pending → running → completed/failed
+    WorkerID    *string           // Assigned worker
+    Error       *WorkTaskError    // Classified failure
+
+    // Lifecycle
+    Attempts    int               // Retry count
+    CreatedAt   time.Time
+    StartedAt   *time.Time
+    CompletedAt *time.Time
+}
+```
+
+Workers are stateless pods that poll for tasks. When a worker claims a task, it:
+1. Executes the command in the target project's pod
+2. Streams output back via SSE
+3. Reports success/failure with error classification
+
+Error classification enables smart retries:
+
+| Error Class | Behavior |
+|-------------|----------|
+| **RateLimited** | Exponential backoff |
+| **AuthFailed** | Fail immediately, notify |
+| **Timeout** | Retry with longer timeout |
+| **StaleWorker** | Reassign to healthy worker |
+| **ResourceExhausted** | Wait for capacity |
+
+## The Infrastructure Stack
+
+A single API call provisions complete project infrastructure:
+
+```http
+POST /projects
+{
+  "name": "acme-landing",
+  "template": "astro-landing"
+}
+```
+
+This triggers:
+
+| Step | Adapter | Result |
+|------|---------|--------|
+| 1. Git repo | Gitea | `git@gitea.orchard9.ai:projects/acme-landing.git` |
+| 2. CI/CD | Woodpecker | Pipeline auto-activated, webhooks configured |
+| 3. DNS | Cloudflare | `acme-landing.threesix.ai` A record |
+| 4. TLS | Kubernetes | Wildcard cert via cert-manager |
+| 5. Database | CockroachDB | Tenant `acme_landing` with isolated schema |
+| 6. Cache | Redis | ACL-scoped `acme-landing:*` keys |
+| 7. Deployment | Kubernetes | Deployment + Service + Ingress |
+
+Total time: ~30 seconds. Manual equivalent: ~3 hours.
+
+## Architecture: The Hexagonal Stack
+
+| Layer | Package | Role |
+|-------|---------|------|
+| **Handlers** | `internal/handlers/` | HTTP endpoints, request validation, auth |
+| **Services** | `internal/service/` | Business logic orchestration |
+| **Ports** | `internal/port/` | Interface contracts (no implementation) |
+| **Adapters** | `internal/adapter/` | Infrastructure implementations |
+| **Domain** | `internal/domain/` | Pure business models (zero dependencies) |
+
+The hexagonal metaphor:
+- **Domain:** Pure truth. No imports except stdlib.
+- **Ports:** Contracts. What the domain needs from the world.
+- **Adapters:** Implementations. Kubernetes, Postgres, Gitea, etc.
+- **Services:** Orchestration. Coordinate ports to achieve business goals.
+
+```
+                    ┌────────────────────┐
+                    │   HTTP Handlers    │
+                    └─────────┬──────────┘
+                              │
+                    ┌─────────▼──────────┐
+                    │   Service Layer    │
+                    └─────────┬──────────┘
+                              │
+        ┌─────────────────────┼─────────────────────┐
+        │                     │                     │
+┌───────▼───────┐   ┌────────▼────────┐   ┌───────▼───────┐
+│   Kubernetes  │   │    PostgreSQL   │   │     Gitea     │
+│   Adapter     │   │    Adapter      │   │    Adapter    │
+└───────────────┘   └─────────────────┘   └───────────────┘
+```
+
+## The Agent Registry
+
+rdev supports multiple agent providers through a unified interface:
+
+| Agent | Capabilities | Use Case |
+|-------|--------------|----------|
+| **Claude Code** | Full IDE replacement, complex reasoning | Feature implementation |
+| **OpenCode** | Fast iteration, cost-effective | Simple fixes, testing |
+| **Custom** | Extensible via registry | Specialized workflows |
+
+Agents are interchangeable. The same work task can target different agents based on complexity, cost, or capability requirements.
+
+## Key Capabilities
+
+### Streaming Execution
+Commands stream output in real-time via Server-Sent Events:
+
+```http
+GET /projects/acme-landing/events
+Accept: text/event-stream
+
+data: {"type":"output","line":"Installing dependencies..."}
+data: {"type":"output","line":"Building production bundle..."}
+data: {"type":"complete","exit_code":0}
+```
+
+### SDLC Orchestration
+Ask the classifier what to do next:
+
+```http
+GET /projects/acme-landing/sdlc/features/user-auth/next
+
+{
+  "action": "implement-task",
+  "task_id": "task-003",
+  "reason": "All blockers resolved, tasks available"
+}
+```
+
+### Operation Audit Trail
+Every operation is logged with step-level granularity:
+
+```http
+GET /projects/acme-landing/audit
+
+[
+  {
+    "operation_id": "op-123",
+    "type": "sdlc_execute",
+    "steps": [
+      {"name": "read_state", "status": "completed", "duration_ms": 45},
+      {"name": "classify", "status": "completed", "duration_ms": 12},
+      {"name": "execute_action", "status": "completed", "duration_ms": 8234}
+    ]
+  }
+]
+```
+
+### Visual Verification (Planned)
+Playwright captures screenshots and video for AI evaluation:
+
+```http
+POST /projects/acme-landing/verify
+{
+  "url": "https://acme-landing.threesix.ai",
+  "viewports": ["desktop", "tablet", "mobile"],
+  "capture_video": true
+}
+```
+
+## The Composable Monorepo
+
+Projects can be composable monorepos with independent components:
+
+```
+acme-platform/
+├── services/
+│   ├── api/           # Go API service
+│   └── worker/        # Background job processor
+├── apps/
+│   ├── web/           # React frontend
+│   └── landing/       # Astro marketing site
+└── packages/
+    └── shared/        # Shared types and utilities
+```
+
+Each component has:
+- Independent deployment pipeline
+- Own database/cache isolation
+- Separate CI/CD triggers
+- Shared monorepo patterns
+
+## The Git Analogy
+
+| Git Concept | rdev Equivalent |
+|-------------|-----------------|
+| Repository | Project (isolated pod with workspace) |
+| Branch | Feature (SDLC lifecycle instance) |
+| Commit | Artifact (spec, design, code, test) |
+| Merge | Phase transition to Released |
+| CI/CD | Woodpecker pipeline (auto-triggered) |
+| Deploy | Kubernetes Deployment (auto-provisioned) |
+
+## When to Use rdev
+
+**Use rdev when:**
+- You want agents to execute code, not just generate it
+- You need isolated, auditable agent workspaces
+- You want deterministic feature delivery with clear phases
+- You need complete project infrastructure on demand
+- You're building a platform where agents do development work
+
+**Use raw Kubernetes when:**
+- You're running traditional containerized workloads
+- You don't need agent execution capabilities
+- You want manual control over every resource
+- You're not doing agent-driven development
+
+**Use GitHub/GitLab when:**
+- You have human-only development workflows
+- You want managed SaaS with full features
+- You don't need agent isolation
+
+For agent-driven development at scale: **rdev is the operating system.**
+
+## Future Vision
+
+### Multi-Cluster Federation (Planned)
+Projects distributed across clusters based on region, compliance, or capacity.
+
+### Agent Collaboration (Planned)
+Multiple agents working on the same project with coordination protocols and conflict resolution.
+
+### Pattern Learning (Planned)
+Successful patterns extracted from completed features and applied to new projects automatically.
+
+### The Swarm (Planned)
+A pool of specialized agents (frontend, backend, devops, QA) that self-organize around feature delivery.
+
+## The Kubernetes Analogy
+
+| K8s Concept | rdev Purpose |
+|-------------|--------------|
+| Pod | Project isolation boundary |
+| Namespace | Multi-tenancy separation |
+| Service | Internal project communication |
+| Ingress | External project access |
+| ConfigMap | Project configuration |
+| Secret | Encrypted credentials |
+| Job | Work task execution |
+| CronJob | Scheduled maintenance |
+
+## Why "Remote Developer"?
+
+The name captures the essence: **rdev is a remote developer that never sleeps.**
+
+- **Remote:** Runs in the cloud, accessible via API
+- **Developer:** Does real development work, not just code generation
+- **Deterministic:** Every action follows defined rules
+- **Observable:** Every operation is logged and auditable
+- **Scalable:** Worker pools handle unlimited concurrent tasks
+
+When you dispatch work to rdev, you're not asking for code suggestions. You're assigning a task to a developer who will:
+1. Clone the repo
+2. Create a feature branch
+3. Write the code
+4. Run the tests
+5. Submit for review
+6. Deploy to production
+
+The difference? This developer is an AI agent with a full development environment, not a human with a laptop.
+
+---
+
+**rdev: Give your agents a proper workspace.**