feat: fix composable monorepo CI builds + health endpoint improvements

Composable monorepo CI fixes:
- Add empty go.sum.tmpl files for pkg, service, worker, and cli components
- Fix Dockerfile.tmpl glob patterns (COPY go.work.sum* is invalid in Kaniko)
- Add deps step to CI that runs go work sync and go mod tidy before builds
- Fix scalar-go dependency version (v0.1.2 doesn't exist, use v0.13.0)

Health endpoint improvements:
- Add registry health check (zot OCI /v2/ endpoint)
- Add health metrics for CI, registry, and Git
- Add /health/ci endpoint for Woodpecker health

Visual verification scaffolding:
- Add Playwright pod and scripts ConfigMap
- Add vision.md and implementation breakdown plan

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
jordan 2026-02-03 18:46:51 -07:00
parent 9a0591e67d
commit 9a1309a0c5
26 changed files with 1404 additions and 16 deletions

View File

@ -5,6 +5,7 @@ import (
"context" "context"
"log/slog" "log/slog"
"os" "os"
"strings"
"time" "time"
"github.com/orchard9/rdev/internal/adapter/cloudflare" "github.com/orchard9/rdev/internal/adapter/cloudflare"
@ -20,6 +21,7 @@ import (
redisadapter "github.com/orchard9/rdev/internal/adapter/redis" redisadapter "github.com/orchard9/rdev/internal/adapter/redis"
"github.com/orchard9/rdev/internal/adapter/templates" "github.com/orchard9/rdev/internal/adapter/templates"
"github.com/orchard9/rdev/internal/adapter/woodpecker" "github.com/orchard9/rdev/internal/adapter/woodpecker"
"github.com/orchard9/rdev/internal/adapter/zot"
"github.com/orchard9/rdev/internal/auth" "github.com/orchard9/rdev/internal/auth"
"github.com/orchard9/rdev/internal/db" "github.com/orchard9/rdev/internal/db"
"github.com/orchard9/rdev/internal/envutil" "github.com/orchard9/rdev/internal/envutil"
@ -404,9 +406,24 @@ func main() {
// Initialize operations handler (for debugging project failures) // Initialize operations handler (for debugging project failures)
operationsHandler := handlers.NewOperationsHandler(operationRepo) operationsHandler := handlers.NewOperationsHandler(operationRepo)
// Initialize registry health checker (for monitoring)
var registryChecker *zot.Client
if infraCfg.RegistryURL != "" {
registryURL := infraCfg.RegistryURL
// Ensure URL has protocol
if !strings.HasPrefix(registryURL, "http") {
registryURL = "https://" + registryURL
}
registryChecker = zot.NewClient(registryURL)
logger.Info("registry health checker initialized", "url", registryURL)
}
// Override default health/ready endpoints with full dependency checks // Override default health/ready endpoints with full dependency checks
healthHandler := handlers.NewHealthHandler("rdev-api", database.DB, nil). healthHandler := handlers.NewHealthHandler("rdev-api", database.DB, nil).
WithAgentRegistry(agentRegistry) WithAgentRegistry(agentRegistry)
if registryChecker != nil {
healthHandler = healthHandler.WithRegistryChecker(registryChecker)
}
app.Router().Get("/health", healthHandler.Health) app.Router().Get("/health", healthHandler.Health)
app.Router().Get("/ready", healthHandler.Ready) app.Router().Get("/ready", healthHandler.Ready)

View File

@ -12,6 +12,10 @@ resources:
- claudebox.yaml - claudebox.yaml
- configmaps.yaml - configmaps.yaml
# Playwright pod for visual verification
- playwright-pod.yaml
- playwright-scripts-configmap.yaml
# NOTE: secrets.yaml and credentials.yaml contain real keys and are gitignored. # NOTE: secrets.yaml and credentials.yaml contain real keys and are gitignored.
# Copy from *.example files and fill in real values before deploying. # Copy from *.example files and fill in real values before deploying.
- secrets.yaml # from secrets.yaml.example - secrets.yaml # from secrets.yaml.example

View File

@ -0,0 +1,90 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: playwright
namespace: rdev
labels:
app.kubernetes.io/name: playwright
app.kubernetes.io/part-of: rdev
spec:
serviceName: playwright
replicas: 1
selector:
matchLabels:
app: playwright
template:
metadata:
labels:
app: playwright
app.kubernetes.io/name: playwright
app.kubernetes.io/part-of: rdev
rdev.orchard9.ai/role: playwright
spec:
containers:
- name: playwright
image: mcr.microsoft.com/playwright:v1.50.0-noble
imagePullPolicy: IfNotPresent
command: ["sleep", "infinity"]
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "2"
memory: "4Gi"
volumeMounts:
# Captures directory for screenshots and videos
- name: captures
mountPath: /captures
# Scripts ConfigMap mounted as scripts directory
- name: scripts
mountPath: /scripts
# Liveness - passes while the scripts ConfigMap is mounted and /scripts/capture.js exists
livenessProbe:
exec:
command:
- test
- -f
- /scripts/capture.js
initialDelaySeconds: 5
periodSeconds: 60
# Readiness - the node binary runs (does not verify that Playwright itself loads)
readinessProbe:
exec:
command:
- node
- --version
initialDelaySeconds: 10
periodSeconds: 30
timeoutSeconds: 10
volumes:
- name: captures
emptyDir: {}
- name: scripts
configMap:
name: playwright-scripts
defaultMode: 0755
---
# Headless service for StatefulSet
apiVersion: v1
kind: Service
metadata:
name: playwright
namespace: rdev
labels:
app.kubernetes.io/name: playwright
app.kubernetes.io/part-of: rdev
spec:
clusterIP: None
selector:
app: playwright
ports:
- port: 9323
name: debug

View File

@ -0,0 +1,108 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: playwright-scripts
namespace: rdev
labels:
app.kubernetes.io/name: playwright
app.kubernetes.io/part-of: rdev
data:
capture.js: |
#!/usr/bin/env node
// capture.js - Playwright screenshot/video capture script
// Input: --url, --viewports (comma-separated), --output (dir),
// --wait-for (selector), --full-page, --video
// Output: JSON manifest to stdout
const { chromium } = require('playwright');
const path = require('path');
const fs = require('fs');
// main - capture one screenshot per requested viewport and, optionally, a
// video of the first viewport, then print a JSON manifest to stdout.
//
// Reads its options from process.argv via parseArgs:
//   --url        (required) page to capture
//   --viewports  comma-separated WIDTHxHEIGHT list (defaults to desktop,
//                tablet and phone sizes)
//   --output     directory for captures (default /captures/default)
//   --wait-for   selector to wait for before the screenshot (default "body")
//   --full-page  capture the full scrollable page
//   --video      record a video for the first viewport
//
// Exits with status 1 when --url is missing; any other failure rejects the
// returned promise.
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (!args.url) {
    console.error('Error: --url is required');
    process.exit(1);
  }

  const outputDir = args.output || '/captures/default';
  const viewports = args.viewports
    ? args.viewports.split(',')
    : ['1920x1080', '768x1024', '375x667'];
  const waitFor = args['wait-for'] || 'body';
  const fullPage = args['full-page'] === 'true';
  const recordVideo = args.video === 'true';

  // Ensure output directory exists
  fs.mkdirSync(outputDir, { recursive: true });

  const browser = await chromium.launch({ headless: true });
  const result = { screenshots: {} };

  try {
    for (const viewport of viewports) {
      const [width, height] = viewport.split('x').map(Number);
      // Reject malformed sizes up front instead of handing NaN to Playwright.
      if (!Number.isInteger(width) || !Number.isInteger(height) || width <= 0 || height <= 0) {
        throw new Error(`invalid viewport "${viewport}" (expected WIDTHxHEIGHT)`);
      }
      const viewportName = `${width}x${height}`;

      // Only the first viewport is recorded.
      const withVideo = recordVideo && viewport === viewports[0];
      const contextOptions = {
        viewport: { width, height },
      };
      if (withVideo) {
        contextOptions.recordVideo = {
          dir: outputDir,
          size: { width, height }
        };
      }

      const context = await browser.newContext(contextOptions);
      let video = null;
      try {
        const page = await context.newPage();
        await page.goto(args.url, { waitUntil: 'networkidle', timeout: 30000 });
        // Best effort: a missing selector must not prevent the screenshot.
        await page.waitForSelector(waitFor, { timeout: 10000 }).catch(() => {});

        const screenshotPath = path.join(outputDir, `${viewportName.replace('x', '_')}.png`);
        await page.screenshot({ path: screenshotPath, fullPage });
        result.screenshots[viewportName] = screenshotPath;

        if (withVideo) {
          video = page.video();
        }
      } finally {
        // Close the context even when navigation or the screenshot fails.
        await context.close();
      }

      // The recording is only guaranteed to be complete on disk once the
      // context has been closed, so move it to its final name afterwards.
      if (video) {
        const videoPath = await video.path();
        const finalVideoPath = path.join(outputDir, 'recording.webm');
        fs.renameSync(videoPath, finalVideoPath);
        result.video = finalVideoPath;
      }
    }
  } finally {
    await browser.close();
  }

  console.log(JSON.stringify(result));
}
// parseArgs - convert a list of command-line tokens into a plain object.
//
// Supported forms:
//   --key=value   -> { key: 'value' }   (split on the first '=')
//   --key value   -> { key: 'value' }   (next token is consumed as the value)
//   --flag        -> { flag: 'true' }   (no value, or next token is another --option)
// Tokens that do not start with '--' and are not consumed as a value are ignored.
function parseArgs(argv) {
  const parsed = {};
  let idx = 0;
  while (idx < argv.length) {
    const token = argv[idx++];
    if (!token.startsWith('--')) {
      continue;
    }

    const body = token.slice(2);
    const sep = body.indexOf('=');
    if (sep !== -1) {
      parsed[body.slice(0, sep)] = body.slice(sep + 1);
      continue;
    }

    const next = argv[idx];
    if (next && !next.startsWith('--')) {
      parsed[body] = next;
      idx++; // the value token has been consumed
    } else {
      parsed[body] = 'true';
    }
  }
  return parsed;
}
// Entry point: run main() and, if it rejects, print the error message to
// stderr and exit with status 1.
main().catch(err => {
console.error('Error:', err.message);
process.exit(1);
});

View File

@ -0,0 +1,479 @@
# Visual Verification Implementation Breakdown
**Goal:** Add Playwright-based visual verification to rdev, enabling automated screenshot/video capture of deployed sites and AI-driven feature completeness evaluation. Integrate with SDLC as an optional QA gate and add a cookbook E2E test.
**Estimated Duration:** 4 weeks (assumes ~25 hours/week of focused work)
---
## Week 1: Foundation — Domain + Capture Infrastructure
**Goals:**
- Playwright pod deployed and reachable via kubectl exec
- Capture script working end-to-end
- Domain models and work task type in place
- Manual verification via kubectl exec confirms capture works
**Tasks:**
### Day 1-2: Playwright Pod Infrastructure
1. **Create Playwright pod manifest** (`deployments/k8s/base/playwright-pod.yaml`)
- StatefulSet with `mcr.microsoft.com/playwright:v1.50.0-noble` image
- `sleep infinity` command (stays alive for kubectl exec)
- Labels: `app: playwright`, `rdev.orchard9.ai/role: playwright`
- Volumes: `/captures` (emptyDir), `/scripts` (ConfigMap)
- Resources: 500m CPU / 1Gi request, 2 CPU / 4Gi limit
2. **Create capture script** (`deployments/k8s/base/playwright-scripts/capture.js`)
- ~60 lines Node.js using Playwright
- CLI: `--url`, `--viewports` (comma-sep), `--output`, `--wait-for`, `--full-page`, `--video`, `--timeout`
- Output: JSON manifest to stdout with screenshot paths
- Error handling: catch navigation failures, timeout gracefully
3. **Create ConfigMap for script** (`deployments/k8s/base/playwright-scripts-configmap.yaml`)
- Mount `capture.js` at `/scripts/capture.js`
4. **Deploy to cluster and test manually**
```bash
kubectl apply -f deployments/k8s/base/playwright-scripts-configmap.yaml
kubectl apply -f deployments/k8s/base/playwright-pod.yaml
kubectl exec -n rdev playwright-0 -- node /scripts/capture.js \
--url=https://example.com --viewports=1920x1080 --output=/captures/test/
kubectl exec -n rdev playwright-0 -- ls -l /captures/test/
```
### Day 3: Domain Models
5. **Create domain types** (`internal/domain/verify.go`)
- `VerifySpec` struct with fields: URL, Viewports, WaitFor, WaitTimeout, FullPage, Video, Evaluate, Prompt, SpecPath, CallbackURL
- `Validate()` method: URL required, callback URL validation (reuse `ValidateCallbackURL`)
- `VerifyResult` struct: Success, Screenshots, Video, Evaluation, Score, Passed, DurationMs, Error
- `ToWorkResult()` method (promote screenshots to artifacts map)
6. **Add work task type** (`internal/domain/work.go`)
- Add `WorkTaskTypeVerify WorkTaskType = "verify"` to constants
- Update `IsValid()` to include verify
7. **Unit tests** (`internal/domain/verify_test.go`)
- Test Validate() with valid/invalid specs
- Test ToWorkResult() conversion
### Day 4-5: Verify Executor (Capture Only)
8. **Create verify executor** (`internal/worker/verify_executor.go`)
- Follow `BuildExecutor` pattern exactly
- `Execute(ctx, task)` method:
- Parse VerifySpec from task.Spec map
- Build kubectl exec command: `kubectl exec playwright-0 -- node /scripts/capture.js --url=X ...`
- Execute via existing `CommandExecutor` port
- Parse JSON manifest from stdout
- Return `BuildResult` with artifacts map containing screenshot paths
- Config struct: `VerifyExecutorConfig` with playwright pod name, namespace
- Constructor: `NewVerifyExecutor(executor, streams, logger, cfg)`
9. **Wire executor to WorkExecutor** (`internal/worker/work_executor.go`)
- Add `verifyExec *VerifyExecutor` field
- Add case in `executeTask()` switch for `WorkTaskTypeVerify`
- Update `NewWorkExecutor()` to accept VerifyExecutor
10. **Unit tests** (`internal/worker/verify_executor_test.go`)
- Mock CommandExecutor to return capture manifest JSON
- Test successful capture with multiple viewports
- Test failure handling (command fails, invalid JSON)
**Deliverables:**
- [ ] Playwright pod running in cluster
- [ ] Capture script takes screenshots successfully
- [ ] VerifySpec/VerifyResult domain types with tests
- [ ] VerifyExecutor can dispatch capture via kubectl exec
- [ ] Work queue can dispatch verify tasks (manual test via SQL insert)
**Foundation this enables:**
- Week 2 can build API layer knowing capture works
- Executor pattern established for AI evaluation later
---
## Week 2: API Layer + Manual E2E
**Goals:**
- Full API surface: POST /verify, GET /verify/{id}, GET /verifications
- Auth scopes configured
- Manual E2E working: API call → queue → capture → result
- Initial release candidate deployed to staging
**Tasks:**
### Day 1: Auth and Service Layer
1. **Add auth scopes** (`internal/auth/scopes.go`)
- `ScopeVerifyRead Scope = "verify:read"`
- `ScopeVerifyWrite Scope = "verify:write"`
- Add to `AllScopes` if needed
2. **Create verify service** (`internal/service/verify_service.go`)
- Follow `BuildService` pattern
- `StartVerify(ctx, projectID, spec)` → validate, enqueue task, return task ID
- `GetVerifyStatus(ctx, taskID)` → get task from work queue
- `ListVerifications(ctx, projectID, limit)` → list tasks by project
- Dependencies: WorkQueue port (existing)
3. **Unit tests** (`internal/service/verify_service_test.go`)
- Mock work queue
- Test enqueue, status, list
### Day 2-3: Handler Layer
4. **Create verify handler** (`internal/handlers/verify.go`)
- Follow `BuildsHandler` pattern exactly
- `Mount(r api.Router)` with scopes:
- POST `/projects/{id}/verify` → ScopeVerifyWrite
- GET `/projects/{id}/verifications` → ScopeVerifyRead
- GET `/verify/{taskId}` → ScopeVerifyRead
- Use `api.DecodeJSON()`, `validate.New()`, response helpers
- Request struct: `VerifyRequest` matching VerifySpec
- Response structs: match existing patterns
5. **Wire DI** (`cmd/rdev-api/main.go`)
- Create VerifyExecutor in worker setup
- Create VerifyService
- Create VerifyHandler
- Mount routes
6. **Handler tests** (`internal/handlers/verify_test.go`)
- Test POST with valid/invalid specs
- Test auth scope enforcement
- Test GET status/list
### Day 4: SSE Events
7. **Add verify events** (`internal/worker/verify_executor.go`)
- Publish events via StreamPublisher:
- `verify.started` - task claimed
- `verify.capturing` - starting capture
- `verify.captured` - capture complete with manifest
- `verify.completed` / `verify.failed` - final status
- Event constants in verify_executor.go (follow BuildExecutor pattern)
### Day 5: Manual E2E + Deploy
8. **Manual E2E test sequence**
```bash
# 1. Start verification
curl -X POST $RDEV_API_URL/projects/myproject/verify \
-H "X-API-Key: $RDEV_API_KEY" \
-H "Content-Type: application/json" \
-d '{"url": "https://myproject.threesix.ai", "viewports": ["1920x1080"]}'
# Response: {"task_id": "xxx"}
# 2. Poll for completion
curl $RDEV_API_URL/verify/xxx -H "X-API-Key: $RDEV_API_KEY"
# Response: screenshots in artifacts
```
9. **Build and deploy**
```bash
./scripts/release.sh v0.11.0 "feat: add visual verification (capture-only MVP)" --deploy
```
**Deliverables:**
- [ ] Auth scopes for verify:read/write
- [ ] VerifyService with enqueue/status/list
- [ ] VerifyHandler with 3 endpoints
- [ ] SSE events for verification progress
- [ ] Deployed to staging, manual E2E passing
**Foundation this enables:**
- Week 3 can add AI evaluation knowing API works
- Cookbook script can use standard api_call() pattern
---
## Week 3: AI Evaluation + Cookbook Test
**Goals:**
- AI evaluation path working (Claude reads screenshots, returns verdict)
- Cookbook E2E test script: `visual-verify-test.sh`
- Add to common.sh utilities
- Full E2E passing in CI
**Tasks:**
### Day 1-2: AI Evaluation Path
1. **Add evaluation to VerifyExecutor** (`internal/worker/verify_executor.go`)
- After successful capture, if `spec.Evaluate`:
- Build evaluation prompt: "Compare these screenshots against the specification..."
- Include spec.Prompt or read spec.SpecPath content
- Call Claude Code via CodeAgentRegistry
- Pass screenshots as attachments (file paths in pod)
- Parse evaluation output for score (look for "Score: XX/100" pattern)
- Set result.Evaluation, result.Score, result.Passed
2. **Evaluation prompt template** (hardcoded in executor for now)
```
Evaluate these screenshots against the following specification:
{spec.Prompt or contents of spec.SpecPath}
For each screenshot, assess:
1. Does the UI match the specification?
2. Are all required elements present?
3. Is the layout correct at this viewport?
End with: "Score: XX/100" and "PASSED" or "FAILED"
```
3. **Handle partial failures** (`internal/worker/verify_executor.go`)
- If capture succeeds but evaluation fails:
- Set success=true (screenshots are still useful)
- Leave evaluation=""
- Log warning
4. **Unit tests for evaluation path**
- Mock CodeAgentRegistry
- Test evaluation output parsing
- Test partial failure handling
### Day 3-4: Cookbook Test Script
5. **Add utility to common.sh** (`cookbooks/scripts/common.sh`)
```bash
# Wait for verification to complete
# Arguments: task_id [max_attempts] [poll_interval]
wait_for_verify() {
local task_id="$1"
local max_attempts="${2:-30}"
local poll_interval="${3:-5}"
# Poll GET /verify/{task_id} until completed/failed
}
```
6. **Create visual-verify-test.sh** (`cookbooks/scripts/visual-verify-test.sh`)
- Follow cookbook script SKILL.md patterns exactly
- Commands: run, status, diagnose, teardown
- Flow:
1. Create composable project with app-astro component
2. Wait for initial deploy (site is live)
3. Start build: "Create a hero section with a call-to-action button"
4. Wait for build to complete
5. Wait for CI pipeline
6. Wait for site to respond
7. Start verification: `POST /projects/{id}/verify {url, evaluate: true, prompt: ...}`
8. Wait for verify to complete
9. Assert: result.passed == true OR result.score >= 70
10. Teardown
7. **Add auto-teardown support**
- Parse `--auto-teardown` flag
- Register cleanup trap
- Set CLEANUP_PROJECT
### Day 5: Integration + CI
8. **Test locally**
```bash
./cookbooks/scripts/visual-verify-test.sh run vv-test --auto-teardown
```
9. **Add to CI** (if CI runs cookbook tests)
- Add visual-verify-test to test matrix
- Ensure playwright-0 pod is available in test environment
10. **Document in cookbook skill** (`.claude/skills/cookbook-scripts/SKILL.md`)
- Add `wait_for_verify()` to utilities list
- Add visual-verify-test.sh to examples
**Deliverables:**
- [ ] AI evaluation working with score extraction
- [ ] Partial failure handling (capture ok, eval fail)
- [ ] wait_for_verify() in common.sh
- [ ] visual-verify-test.sh passing end-to-end
- [ ] Documentation updated
**Foundation this enables:**
- Week 4 can add SDLC integration knowing full flow works
- Cookbook pattern established for future tests
---
## Week 4: SDLC Integration + Polish
**Goals:**
- Visual verification as optional SDLC gate between QA and merge
- Skeleton command: `/verify-feature`
- Build chaining: auto-verify after deploy
- Release v0.12.0 with full feature
**Tasks:**
### Day 1-2: SDLC Types and Rules
1. **Add artifact type** (`internal/sdlc/types.go`)
- `ArtifactVerification ArtifactType = "verification"`
- Add to `ValidArtifactTypes` slice
- Add case in `ArtifactFilename()` → returns `"verification.md"`
2. **Add action types** (`internal/sdlc/types.go`)
- `ActionVerifyFeature ActionType = "VERIFY_FEATURE"`
- `ActionFixVerificationIssues ActionType = "FIX_VERIFICATION_ISSUES"`
3. **Add classifier rules** (`internal/sdlc/rules_execution.go`)
- `needsVerificationRule()`:
- Condition: Phase=QA, qa_results=passed, verification=nil or pending
- Action: ActionVerifyFeature
- NextCommand: "/verify-feature {slug}"
- `verificationFailedRule()`:
- Condition: Phase=QA, verification=failed
- Action: ActionFixVerificationIssues
- NextCommand: "/fix-verification-issues {slug}"
- `verificationPassedRule()`:
- Condition: Phase=QA, qa_results=passed, verification=passed
- Action: ActionTransition to PhaseMerge
4. **Update rule ordering** (`internal/sdlc/rules.go`)
- Insert verification rules after qaPassedRule
- Update qaPassedRule: only transition if verification also passed OR feature doesn't require verification (config flag)
5. **Unit tests** (`internal/sdlc/rules_execution_test.go`)
- Test all three verification rules
- Test interaction with existing QA rules
### Day 3: Skeleton Command
6. **Create verify-feature command** (embedded template: `templates/skeleton/.claude/commands/verify-feature.md`)
````markdown
---
description: Visually verify a deployed feature
argument-hint: <feature-slug>
allowed-tools: Bash, Read, Write, Edit, Glob, Grep
---
Visually verify feature: $ARGUMENTS
## Instructions
1. Load feature spec from `.sdlc/features/$ARGUMENTS/spec.md`
2. Get project domain from CLAUDE.md or config
3. Determine the deployed URL
4. Execute verification via rdev API (if available) or Playwright directly
5. Write results to `.sdlc/features/$ARGUMENTS/verification.md`
6. Register artifact: `sdlc artifact create $ARGUMENTS verification`
## Output Format
Write `.sdlc/features/$ARGUMENTS/verification.md`:
```markdown
# Visual Verification: [Feature Title]
## Screenshots
| Viewport | Status | Notes |
|----------|--------|-------|
| Desktop (1920x1080) | PASS | All elements visible |
| Mobile (375x667) | PASS | Responsive layout correct |
## Evaluation
[AI or manual evaluation notes]
## Result
**Status:** PASSED
**Score:** 95/100
```
````
7. **Update skeleton template** to include the command
- Ensure new projects get verify-feature.md
### Day 4: Build Chaining (Optional)
8. **Add verify_after to BuildSpec** (`internal/domain/build.go`)
- `VerifyAfter bool` - auto-verify after successful deploy
- `VerifyURL string` - URL to verify (if different from project domain)
9. **Chain verification in BuildExecutor** (`internal/worker/build_executor.go`)
- After successful build + push (line ~270):
```go
if spec.VerifyAfter && spec.VerifyURL != "" {
// Enqueue verify task
}
```
- Or: callback webhook triggers external verification
10. **Update build handler** to accept verify_after/verify_url
### Day 5: Documentation + Release
11. **Update documentation**
- CLAUDE.md: Update platform status to "Done"
- visual-verification.md: Add SDLC integration examples
- sdlc.md: Document verification rules
12. **Integration test**
- Test full SDLC flow with verification gate
- Test classifier transitions correctly
13. **Final release**
```bash
./scripts/release.sh v0.12.0 "feat: visual verification with SDLC integration" --deploy
```
**Deliverables:**
- [ ] ArtifactVerification type in SDLC
- [ ] 3 classifier rules for verification gate
- [ ] verify-feature.md skeleton command
- [ ] Build chaining (verify_after flag)
- [ ] Full integration test passing
- [ ] v0.12.0 released
---
## Summary
| Week | Theme | Key Output |
|------|-------|------------|
| 1 | Foundation | Playwright pod + capture script + domain types + executor |
| 2 | API Layer | Handlers + service + auth scopes + manual E2E |
| 3 | AI + Cookbook | Evaluation path + visual-verify-test.sh + common.sh utils |
| 4 | SDLC + Polish | Classifier rules + skeleton command + build chaining + release |
## Risks and Mitigations
| Risk | Impact | Mitigation |
|------|--------|------------|
| Playwright pod OOM | Capture fails | Start with conservative limits (4Gi), tune based on usage |
| AI evaluation unreliable | Poor pass/fail decisions | Start with high threshold (70), tune; partial success mode |
| Screenshot storage fills up | Pod crashes | EmptyDir for now, add cleanup job or PVC later |
| SDLC rules conflict | Features stuck | Test extensively, make verification optional via config |
| Claude Code can't read screenshots | Evaluation broken | Test multimodal support; fallback to manual verification |
## Files Created/Modified
**New Files (13):**
- `internal/domain/verify.go`
- `internal/domain/verify_test.go`
- `internal/service/verify_service.go`
- `internal/service/verify_service_test.go`
- `internal/handlers/verify.go`
- `internal/handlers/verify_test.go`
- `internal/worker/verify_executor.go`
- `internal/worker/verify_executor_test.go`
- `deployments/k8s/base/playwright-pod.yaml`
- `deployments/k8s/base/playwright-scripts-configmap.yaml`
- `deployments/k8s/base/playwright-scripts/capture.js`
- `cookbooks/scripts/visual-verify-test.sh`
- `templates/skeleton/.claude/commands/verify-feature.md`
**Modified Files (8):**
- `internal/domain/work.go` - Add WorkTaskTypeVerify
- `internal/auth/scopes.go` - Add verify scopes
- `internal/worker/work_executor.go` - Add dispatch case
- `internal/sdlc/types.go` - Add artifact/action types
- `internal/sdlc/rules.go` - Register verification rules
- `internal/sdlc/rules_execution.go` - Add verification rules
- `cookbooks/scripts/common.sh` - Add wait_for_verify()
- `cmd/rdev-api/main.go` - Wire DI

View File

@ -23,6 +23,7 @@ steps:
- ${CI_COMMIT_SHA:0:8} - ${CI_COMMIT_SHA:0:8}
cache: true cache: true
skip-tls-verify: true skip-tls-verify: true
failure: retry
when: when:
- event: push - event: push
branch: main branch: main

View File

@ -10,7 +10,8 @@ ENV GOWORK=/app/go.work
WORKDIR /app WORKDIR /app
# Copy go workspace and all source (workspace deps are local) # Copy go workspace and all source (workspace deps are local)
COPY go.work go.work.sum* ./ COPY go.work ./
COPY go.work.sum ./
COPY pkg/ ./pkg/ COPY pkg/ ./pkg/
COPY services/{{COMPONENT_NAME}}/ ./services/{{COMPONENT_NAME}}/ COPY services/{{COMPONENT_NAME}}/ ./services/{{COMPONENT_NAME}}/

View File

@ -10,7 +10,8 @@ ENV GOWORK=/app/go.work
WORKDIR /app WORKDIR /app
# Copy go workspace and all source (workspace deps are local) # Copy go workspace and all source (workspace deps are local)
COPY go.work go.work.sum* ./ COPY go.work ./
COPY go.work.sum ./
COPY pkg/ ./pkg/ COPY pkg/ ./pkg/
COPY workers/{{COMPONENT_NAME}}/ ./workers/{{COMPONENT_NAME}}/ COPY workers/{{COMPONENT_NAME}}/ ./workers/{{COMPONENT_NAME}}/

View File

@ -9,6 +9,7 @@ steps:
- ${CI_COMMIT_SHA:0:8} - ${CI_COMMIT_SHA:0:8}
cache: true cache: true
skip-tls-verify: true skip-tls-verify: true
failure: retry
when: when:
- event: push - event: push
branch: main branch: main

View File

@ -23,6 +23,7 @@ steps:
- ${CI_COMMIT_SHA:0:8} - ${CI_COMMIT_SHA:0:8}
cache: true cache: true
skip-tls-verify: true skip-tls-verify: true
failure: retry
when: when:
- event: push - event: push
branch: main branch: main

View File

@ -8,6 +8,32 @@ clone:
depth: 1 depth: 1
steps: steps:
deps:
image: golang:1.23
commands:
- go work sync
- |
for dir in services/*/; do
if [ -f "$dir/go.mod" ]; then
(cd "$dir" && go mod tidy)
fi
done
- |
for dir in workers/*/; do
if [ -f "$dir/go.mod" ]; then
(cd "$dir" && go mod tidy)
fi
done
- |
for dir in cli/*/; do
if [ -f "$dir/go.mod" ]; then
(cd "$dir" && go mod tidy)
fi
done
when:
branch: main
event: push
# COMPONENT_STEPS_BELOW # COMPONENT_STEPS_BELOW
# Do not remove the marker above - component steps are inserted here # Do not remove the marker above - component steps are inserted here

View File

@ -3,7 +3,7 @@ module {{GO_MODULE}}/pkg
go 1.23 go 1.23
require ( require (
github.com/bdpiprava/scalar-go v0.1.2 github.com/bdpiprava/scalar-go v0.13.0
github.com/go-chi/chi/v5 v5.2.0 github.com/go-chi/chi/v5 v5.2.0
github.com/go-chi/cors v1.2.1 github.com/go-chi/cors v1.2.1
github.com/go-playground/validator/v10 v10.23.0 github.com/go-playground/validator/v10 v10.23.0

View File

@ -0,0 +1,74 @@
// Package zot provides a client for checking zot container registry health.
package zot
import (
	"context"
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/orchard9/rdev/internal/domain"
)
)
// Client checks zot registry health via the OCI /v2/ endpoint.
type Client struct {
	// url is the registry base URL, stored without trailing slashes so that
	// appending "/v2/" never produces a double slash in the request path.
	url string
	// httpClient performs the probe request; NewClient configures its timeout.
	httpClient *http.Client
}

// NewClient creates a new zot health checker.
// The URL should be the registry base URL (e.g., "https://registry.threesix.ai").
// Trailing slashes are stripped, so "https://registry.threesix.ai/" is
// accepted and treated identically. Requests made by the returned client
// time out after 5 seconds.
func NewClient(url string) *Client {
	return &Client{
		url: strings.TrimRight(url, "/"),
		httpClient: &http.Client{
			Timeout: 5 * time.Second,
		},
	}
}
// Check probes the registry with a GET on <url>/v2/ and reports the outcome.
// HTTP 200 and HTTP 401 both count as healthy: a 401 means the registry is up
// but wants credentials. Any other status code, a transport error, or a
// failure to build the request yields an unhealthy status whose Error field
// describes the problem. Latency is filled in whenever a request was sent.
func (c *Client) Check(ctx context.Context) domain.RegistryStatus {
	began := time.Now()

	// report assembles a status and stamps it with the current UTC time.
	report := func(healthy bool, latency, errMsg string) domain.RegistryStatus {
		return domain.RegistryStatus{
			Healthy:     healthy,
			URL:         c.url,
			Latency:     latency,
			Error:       errMsg,
			LastChecked: time.Now().UTC(),
		}
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.url+"/v2/", nil)
	if err != nil {
		// No request was sent, so there is no latency to report.
		return report(false, "", fmt.Sprintf("failed to create request: %v", err))
	}

	resp, err := c.httpClient.Do(req)
	elapsed := time.Since(began).String()
	if err != nil {
		return report(false, elapsed, fmt.Sprintf("connection error: %v", err))
	}
	defer func() { _ = resp.Body.Close() }()

	switch resp.StatusCode {
	case http.StatusOK, http.StatusUnauthorized:
		// 200 = healthy, 401 = healthy but requires auth
		return report(true, elapsed, "")
	default:
		return report(false, elapsed, fmt.Sprintf("unexpected status code: %d", resp.StatusCode))
	}
}

View File

@ -81,4 +81,5 @@ var (
// Infrastructure errors (should typically be wrapped) // Infrastructure errors (should typically be wrapped)
ErrDatabaseConnection = errors.New("database connection error") ErrDatabaseConnection = errors.New("database connection error")
ErrKubernetesError = errors.New("kubernetes error") ErrKubernetesError = errors.New("kubernetes error")
ErrRegistryUnavailable = errors.New("container registry unavailable")
) )

View File

@ -12,6 +12,7 @@ const (
OperationTypeProjectCreate OperationType = "project.create" OperationTypeProjectCreate OperationType = "project.create"
OperationTypeComponentAdd OperationType = "component.add" OperationTypeComponentAdd OperationType = "component.add"
OperationTypeBuild OperationType = "build" OperationTypeBuild OperationType = "build"
OperationTypeCIBuild OperationType = "ci.build"
OperationTypeResourceProvision OperationType = "resource.provision" OperationTypeResourceProvision OperationType = "resource.provision"
) )
@ -19,7 +20,7 @@ const (
func (t OperationType) IsValid() bool { func (t OperationType) IsValid() bool {
switch t { switch t {
case OperationTypeProjectCreate, OperationTypeComponentAdd, case OperationTypeProjectCreate, OperationTypeComponentAdd,
OperationTypeBuild, OperationTypeResourceProvision: OperationTypeBuild, OperationTypeCIBuild, OperationTypeResourceProvision:
return true return true
} }
return false return false

View File

@ -0,0 +1,12 @@
package domain
import "time"
// RegistryStatus represents the health status of a container registry.
// It is serialized to JSON with the field names given in the struct tags;
// Latency and Error are omitted from the output when empty.
type RegistryStatus struct {
// Healthy reports whether the registry responded acceptably to the check.
Healthy bool `json:"healthy"`
// URL is the registry base URL that was checked.
URL string `json:"url"`
// Latency is the request duration as a time.Duration string (e.g. "45ms").
// It is empty when no request was sent.
Latency string `json:"latency,omitempty"`
// Error describes why the check failed; empty when Healthy is true.
Error string `json:"error,omitempty"`
// LastChecked is the time at which this status was produced.
LastChecked time.Time `json:"last_checked"`
}

View File

@ -8,6 +8,7 @@ import (
"strings" "strings"
"time" "time"
"github.com/orchard9/rdev/internal/metrics"
"github.com/orchard9/rdev/internal/port" "github.com/orchard9/rdev/internal/port"
"github.com/orchard9/rdev/pkg/api" "github.com/orchard9/rdev/pkg/api"
) )
@ -25,6 +26,7 @@ type HealthHandler struct {
k8sChecker port.KubernetesChecker k8sChecker port.KubernetesChecker
agentRegistry port.CodeAgentRegistry agentRegistry port.CodeAgentRegistry
workExecutor ExecutorHealthChecker workExecutor ExecutorHealthChecker
registryChecker port.RegistryChecker
} }
// NewHealthHandler creates a new health handler with dependencies. // NewHealthHandler creates a new health handler with dependencies.
@ -48,6 +50,12 @@ func (h *HealthHandler) WithWorkExecutor(executor ExecutorHealthChecker) *Health
return h return h
} }
// WithRegistryChecker adds a registry checker for health monitoring.
// It stores checker on the handler and returns the same handler so calls can
// be chained. When a checker is set, Ready includes a "registry" entry in its
// checks.
func (h *HealthHandler) WithRegistryChecker(checker port.RegistryChecker) *HealthHandler {
h.registryChecker = checker
return h
}
// Health returns a simple liveness check. // Health returns a simple liveness check.
// This should be lightweight and only fail if the process is unhealthy. // This should be lightweight and only fail if the process is unhealthy.
// GET /health // GET /health
@ -100,6 +108,11 @@ func (h *HealthHandler) Ready(w http.ResponseWriter, r *http.Request) {
checks["work_executor"] = h.checkWorkExecutor() checks["work_executor"] = h.checkWorkExecutor()
} }
// Registry check (informational - doesn't affect overall readiness)
if h.registryChecker != nil {
checks["registry"] = h.checkRegistry(ctx)
}
response := ReadinessResponse{ response := ReadinessResponse{
Status: "ready", Status: "ready",
Service: h.serviceName, Service: h.serviceName,
@ -217,6 +230,35 @@ func (h *HealthHandler) checkWorkExecutor() CheckResult {
} }
} }
// checkRegistry asks the configured registry checker for the current registry
// status, publishes that status to the registry health metrics, and converts
// it into a CheckResult. Callers must ensure h.registryChecker is non-nil.
func (h *HealthHandler) checkRegistry(ctx context.Context) CheckResult {
	status := h.registryChecker.Check(ctx)

	// Latency arrives as a duration string like "45ms". An empty or
	// unparseable value is exported to metrics as 0 seconds.
	var latencySeconds float64
	if status.Latency != "" {
		parsed, parseErr := time.ParseDuration(status.Latency)
		if parseErr == nil {
			latencySeconds = parsed.Seconds()
		}
	}
	metrics.SetRegistryHealth(status.Healthy, latencySeconds)

	// Default to the checker's error text; a healthy registry overrides it.
	message := status.Error
	if status.Healthy {
		message = "connected"
	}

	return CheckResult{
		Healthy:   status.Healthy,
		Message:   message,
		Latency:   status.Latency,
		LastCheck: status.LastChecked,
	}
}
// CheckResult represents the result of a health check. // CheckResult represents the result of a health check.
type CheckResult struct { type CheckResult struct {
Healthy bool `json:"healthy"` Healthy bool `json:"healthy"`

View File

@ -14,6 +14,7 @@ import (
"strings" "strings"
"github.com/orchard9/rdev/internal/domain" "github.com/orchard9/rdev/internal/domain"
"github.com/orchard9/rdev/internal/metrics"
"github.com/orchard9/rdev/internal/port" "github.com/orchard9/rdev/internal/port"
"github.com/orchard9/rdev/internal/service" "github.com/orchard9/rdev/internal/service"
"github.com/orchard9/rdev/pkg/api" "github.com/orchard9/rdev/pkg/api"
@ -166,6 +167,18 @@ func (h *WoodpeckerWebhookHandler) HandleWebhook(w http.ResponseWriter, r *http.
"build_number", payload.Build.Number, "build_number", payload.Build.Number,
) )
// Track failed builds for visibility
if payload.Build.Status == "failure" {
h.handleFailedBuild(ctx, payload)
api.WriteSuccess(w, r, map[string]any{
"status": "recorded",
"reason": "build failed",
"project": payload.Repo.Name,
"build": payload.Build.Number,
})
return
}
// Only process successful builds on main/master branch // Only process successful builds on main/master branch
if payload.Build.Status != "success" { if payload.Build.Status != "success" {
api.WriteSuccess(w, r, map[string]string{ api.WriteSuccess(w, r, map[string]string{
@ -287,3 +300,58 @@ func (h *WoodpeckerWebhookHandler) verifySignature(body []byte, signature string
return hmac.Equal([]byte(signature), []byte(expected)) return hmac.Equal([]byte(signature), []byte(expected))
} }
// handleFailedBuild records a failed CI build for visibility and debugging.
//
// It logs the failure, bumps the CI failure metrics and, when an operation
// service is configured, stores the build as an OperationTypeCIBuild operation
// that is immediately marked failed. Everything here is best-effort: errors
// are logged and never returned, so the caller's webhook response is not
// affected by tracking problems.
func (h *WoodpeckerWebhookHandler) handleFailedBuild(ctx context.Context, payload WoodpeckerPayload) {
	projectName := payload.Repo.Name

	h.logger.Warn("CI build failed",
		"project", projectName,
		"build_number", payload.Build.Number,
		"branch", payload.Build.Branch,
		"commit", payload.Build.Commit,
		"author", payload.Build.Author,
	)

	// Record metrics
	metrics.RecordCIBuild(projectName, "failure")

	// The webhook payload carries no step-level detail, so a failure on
	// main/master is counted as an image push failure.
	// NOTE(review): this is a heuristic — a main-branch build that failed
	// earlier (e.g. in tests) is also counted here; confirm that is acceptable
	// for the rdev_ci_push_failures_total metric.
	if payload.Build.Branch == "main" || payload.Build.Branch == "master" {
		metrics.RecordCIPushFailure(projectName)
	}

	// Operation tracking is optional.
	if h.operationService == nil {
		return
	}

	operationID, err := h.operationService.StartOperation(ctx, projectName,
		domain.OperationTypeCIBuild,
		map[string]any{
			"repo":         payload.Repo.FullName,
			"branch":       payload.Build.Branch,
			"commit":       payload.Build.Commit,
			"build_number": payload.Build.Number,
			"author":       payload.Build.Author,
		}, "")
	if err != nil {
		// Previously discarded; without this log a failed build could go
		// unrecorded with no trace of why.
		h.logger.Error("failed to start operation for failed build",
			"error", err,
			"project", projectName,
			"build_number", payload.Build.Number,
		)
	}
	if operationID == "" {
		return
	}

	// Set external reference to build number
	if opErr := h.operationService.SetExternalRef(ctx, operationID, fmt.Sprintf("build#%d", payload.Build.Number)); opErr != nil {
		h.logger.Error("failed to set external ref", "error", opErr, "operation_id", operationID)
	}

	// Link to parent operation via commit SHA. A lookup error or a missing
	// parent simply leaves the operation unlinked.
	if parent, err := h.operationService.FindByCommit(ctx, projectName, payload.Build.Commit); err == nil && parent != nil {
		if opErr := h.operationService.LinkToParent(ctx, operationID, parent.ID); opErr != nil {
			h.logger.Error("failed to link to parent operation", "error", opErr, "operation_id", operationID)
		}
	}

	// Mark as failed
	if opErr := h.operationService.FailOperation(ctx, operationID, "CI build failed", ""); opErr != nil {
		h.logger.Error("failed to record operation failure", "error", opErr, "operation_id", operationID)
	}
}

View File

@ -254,7 +254,59 @@ func TestWoodpeckerWebhookHandler_LinksToParentOperation(t *testing.T) {
t.Error("build operation not found") t.Error("build operation not found")
} }
func TestWoodpeckerWebhookHandler_IgnoresNonSuccessBuilds(t *testing.T) { func TestWoodpeckerWebhookHandler_RecordsFailedBuilds(t *testing.T) {
opRepo := newMockOperationRepo()
opSvc := service.NewOperationService(opRepo, slog.Default())
h := &WoodpeckerWebhookHandler{
operationService: opSvc,
logger: slog.Default(),
}
payload := WoodpeckerPayload{
Event: "push",
Repo: WoodpeckerRepo{Name: "my-project", FullName: "org/my-project"},
Build: WoodpeckerBuild{
Number: 99,
Status: "failure",
Branch: "main",
Commit: "abc123",
},
}
body, _ := json.Marshal(payload)
req := httptest.NewRequest(http.MethodPost, "/webhooks/woodpecker", strings.NewReader(string(body)))
rec := httptest.NewRecorder()
h.HandleWebhook(rec, req)
// Failed builds are now recorded for visibility
if opRepo.count() != 1 {
t.Errorf("expected 1 operation for failed build, got %d", opRepo.count())
}
// Verify the operation was marked as failed
for _, op := range opRepo.operations {
if op.Type != domain.OperationTypeCIBuild {
t.Errorf("expected operation type ci.build, got %s", op.Type)
}
if op.Status != domain.OperationStatusFailed {
t.Errorf("expected operation status failed, got %s", op.Status)
}
}
// Verify response indicates build was recorded
var resp struct {
Data map[string]any `json:"data"`
}
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
if resp.Data["status"] != "recorded" {
t.Errorf("expected status 'recorded', got %v", resp.Data["status"])
}
}
func TestWoodpeckerWebhookHandler_IgnoresPendingBuilds(t *testing.T) {
opRepo := newMockOperationRepo() opRepo := newMockOperationRepo()
opSvc := service.NewOperationService(opRepo, slog.Default()) opSvc := service.NewOperationService(opRepo, slog.Default())
@ -267,7 +319,7 @@ func TestWoodpeckerWebhookHandler_IgnoresNonSuccessBuilds(t *testing.T) {
Event: "push", Event: "push",
Repo: WoodpeckerRepo{Name: "my-project"}, Repo: WoodpeckerRepo{Name: "my-project"},
Build: WoodpeckerBuild{ Build: WoodpeckerBuild{
Status: "failure", Status: "pending",
Branch: "main", Branch: "main",
Commit: "abc123", Commit: "abc123",
}, },
@ -278,8 +330,8 @@ func TestWoodpeckerWebhookHandler_IgnoresNonSuccessBuilds(t *testing.T) {
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
h.HandleWebhook(rec, req) h.HandleWebhook(rec, req)
// Non-success builds are ignored, so no operation should be created // Pending/running builds are ignored (only success and failure are handled)
if opRepo.count() != 0 { if opRepo.count() != 0 {
t.Errorf("expected no operations for failed build, got %d", opRepo.count()) t.Errorf("expected no operations for pending build, got %d", opRepo.count())
} }
} }

View File

@ -120,6 +120,28 @@ var (
Name: "rdev_api_requests_total", Name: "rdev_api_requests_total",
Help: "Total number of API requests", Help: "Total number of API requests",
}, []string{"method", "path", "status"}) }, []string{"method", "path", "status"})
// Registry health
registryHealthy = promauto.NewGauge(prometheus.GaugeOpts{
Name: "rdev_registry_healthy",
Help: "Whether the container registry is healthy (1) or not (0)",
})
registryLatency = promauto.NewGauge(prometheus.GaugeOpts{
Name: "rdev_registry_latency_seconds",
Help: "Latency of registry health check in seconds",
})
// CI builds
ciBuildsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_ci_builds_total",
Help: "Total number of CI builds by project and status",
}, []string{"project", "status"})
ciPushFailures = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_ci_push_failures_total",
Help: "Total number of CI image push failures by project",
}, []string{"project"})
) )
// RecordCommand records a command execution. // RecordCommand records a command execution.
@ -206,6 +228,26 @@ func SetWorkQueueDepth(status string, count int64) {
workQueueDepth.WithLabelValues(status).Set(float64(count)) workQueueDepth.WithLabelValues(status).Set(float64(count))
} }
// SetRegistryHealth publishes the registry health check outcome: the
// rdev_registry_healthy gauge becomes 1 when healthy and 0 otherwise, and the
// rdev_registry_latency_seconds gauge is set to latencySeconds.
func SetRegistryHealth(healthy bool, latencySeconds float64) {
	if healthy {
		registryHealthy.Set(1.0)
	} else {
		registryHealthy.Set(0.0)
	}
	registryLatency.Set(latencySeconds)
}
// RecordCIBuild records a CI build event by incrementing the
// rdev_ci_builds_total counter for the given project and status labels.
func RecordCIBuild(project, status string) {
	ciBuildsTotal.WithLabelValues(project, status).Inc()
}
// RecordCIPushFailure records a CI image push failure by incrementing the
// rdev_ci_push_failures_total counter for the given project label.
func RecordCIPushFailure(project string) {
	ciPushFailures.WithLabelValues(project).Inc()
}
// Handler returns the Prometheus HTTP handler. // Handler returns the Prometheus HTTP handler.
func Handler() http.Handler { func Handler() http.Handler {
return promhttp.Handler() return promhttp.Handler()

View File

@ -1,6 +1,10 @@
package port package port
import "context" import (
"context"
"github.com/orchard9/rdev/internal/domain"
)
// DatabasePinger checks database connectivity. // DatabasePinger checks database connectivity.
// *sql.DB satisfies this interface. // *sql.DB satisfies this interface.
@ -13,3 +17,9 @@ type KubernetesChecker interface {
// ServerVersion returns the server version string, or an error if unreachable. // ServerVersion returns the server version string, or an error if unreachable.
ServerVersion() (string, error) ServerVersion() (string, error)
} }
// RegistryChecker checks container registry health.
// It is the port through which health reporting obtains registry status.
type RegistryChecker interface {
	// Check returns the health status of the registry.
	// It returns no error value: the outcome, including any failure
	// description, is carried in the returned domain.RegistryStatus.
	Check(ctx context.Context) domain.RegistryStatus
}

357
vision.md Normal file
View File

@ -0,0 +1,357 @@
# rdev: The Agent's Operating System
> **Platform:** threesix.ai
> **Category:** Infrastructure / Agent Orchestration Platform
> **Role:** The runtime environment where AI agents become software engineers
## The Problem: Agents Have No Workspace
Current agent systems suffer from **The Phantom Limb** problem: agents can *think* but they can't *do*. They generate code but have nowhere to run it. They propose changes but have no git repo. They want to deploy but have no infrastructure.
When you ask an agent to "build a landing page," it must:
- **Beg for shell access** (security nightmare)
- **Dump code to chat** (copy-paste purgatory)
- **Hope you handle infra** (manual setup hell)
**Real example:** A founder asks Claude to build a product landing page. Claude writes the code, but now what? The founder needs to set up a git repo, configure CI/CD, buy a domain, provision DNS, create a database, and figure out deployment. By the time infra is ready, the enthusiasm is gone. The code sits in a chat log. The product never launches.
## The Solution: Give Agents a Full Developer Environment
rdev rejects the idea that agents are just "code generators." Instead, it models agent work as a **Controlled Development Environment**:
- **Projects are isolated.** Each agent workspace is a Kubernetes pod with its own git repo, secrets, and environment.
- **Commands are executed.** Shell, Git, and Claude Code commands run inside pods, not locally.
- **Infrastructure is automatic.** Git repos, CI/CD, DNS, databases, caches, and deployments provision on demand.
- **Feature delivery is deterministic.** A 10-phase SDLC lifecycle guides every feature from idea to production.
## The Four Pillars
Every use case must demonstrate at least one pillar. If a shell script could do it, it's not a compelling use case.
| Pillar | What It Enables | Shell Script Gap |
|--------|-----------------|------------------|
| **First-Class Isolation** | Each project in its own pod with dedicated workspace, credentials, network | Shared machine, credential leakage, no boundaries |
| **Deterministic SDLC** | Every feature follows 10-phase lifecycle with classifier-driven transitions | Manual process, skipped steps, undefined state |
| **Infrastructure Orchestration** | Git, CI/CD, DNS, DB, cache, deployment created via API | Hours of manual setup per project |
| **Observable Execution** | Every command logged, streamed, auditable | Fire-and-forget scripts, no visibility |
## The Core Data Model: The Project
The atomic unit is not a Container, VM, or Directory. It is the **Project**:
```go
type Project struct {
    // Identity
    ID        string // Kubernetes pod name
    Name      string // Human-readable name
    Namespace string // K8s namespace isolation

    // Infrastructure
    GitRepo  *GitRepo  // Gitea repo with SSH/HTTPS URLs
    Domain   *Domain   // Custom subdomain + TLS
    Database *Database // CockroachDB isolated tenant
    Cache    *Cache    // Redis ACL-scoped namespace

    // Execution
    Status  ProjectStatus // Running, Stopped, Failed
    Agent   CodeAgent     // Claude Code, OpenCode, etc.
    WorkDir string        // /workspace inside pod

    // SDLC
    Features   []Feature        // Active feature branches
    Classifier ClassifierEngine // State machine for transitions
}
```
## The SDLC Lifecycle
Every feature follows a deterministic 10-phase lifecycle. The classifier engine evaluates state and returns the next valid action.
| Phase | What Happens | Artifacts Produced |
|-------|--------------|-------------------|
| **Draft** | Feature captured as rough idea | `spec.md` draft |
| **Specified** | Requirements refined, acceptance criteria defined | `spec.md` approved |
| **Planned** | Implementation strategy designed | `design.md` with component breakdown |
| **Ready** | Tasks extracted, blockers resolved | `tasks.md` with implementation items |
| **Implementation** | Code written task-by-task | Code commits, test coverage |
| **Review** | Code reviewed for quality | Review comments, fixes |
| **Audit** | Tech debt and security checked | Audit report |
| **QA** | Feature tested against spec | QA checklist, evidence |
| **Merge** | Feature branch merged to main | Git merge commit |
| **Released** | Deployed to production | Deployment record |
The classifier is a pure function: given current state, it returns the next action. No ambiguity. No skipped steps.
## The Work Queue: Scaled Agent Labor
Multiple agents can work across projects via the **Worker Pool**:
```go
type WorkTask struct {
    // Identity
    ID        string // UUID
    ProjectID string // Target project
    Command   string // claude, shell, git

    // State
    Status   TaskStatus     // pending → running → completed/failed
    WorkerID *string        // Assigned worker
    Error    *WorkTaskError // Classified failure

    // Lifecycle
    Attempts    int // Retry count
    CreatedAt   time.Time
    StartedAt   *time.Time
    CompletedAt *time.Time
}
```
Workers are stateless pods that poll for tasks. When a worker claims a task, it:
1. Executes the command in the target project's pod
2. Streams output back via SSE
3. Reports success/failure with error classification
Error classification enables smart retries:
| Error Class | Behavior |
|-------------|----------|
| **RateLimited** | Exponential backoff |
| **AuthFailed** | Fail immediately, notify |
| **Timeout** | Retry with longer timeout |
| **StaleWorker** | Reassign to healthy worker |
| **ResourceExhausted** | Wait for capacity |
## The Infrastructure Stack
A single API call provisions complete project infrastructure:
```http
POST /projects
{
"name": "acme-landing",
"template": "astro-landing"
}
```
This triggers:
| Step | Adapter | Result |
|------|---------|--------|
| 1. Git repo | Gitea | `git@gitea.orchard9.ai:projects/acme-landing.git` |
| 2. CI/CD | Woodpecker | Pipeline auto-activated, webhooks configured |
| 3. DNS | Cloudflare | `acme-landing.threesix.ai` A record |
| 4. TLS | Kubernetes | Wildcard cert via cert-manager |
| 5. Database | CockroachDB | Tenant `acme_landing` with isolated schema |
| 6. Cache | Redis | ACL-scoped `acme-landing:*` keys |
| 7. Deployment | Kubernetes | Deployment + Service + Ingress |
Total time: ~30 seconds. Manual equivalent: ~3 hours.
## Architecture: The Hexagonal Stack
| Layer | Package | Role |
|-------|---------|------|
| **Handlers** | `internal/handlers/` | HTTP endpoints, request validation, auth |
| **Services** | `internal/service/` | Business logic orchestration |
| **Ports** | `internal/port/` | Interface contracts (no implementation) |
| **Adapters** | `internal/adapter/` | Infrastructure implementations |
| **Domain** | `internal/domain/` | Pure business models (zero dependencies) |
The hexagonal metaphor:
- **Domain:** Pure truth. No imports except stdlib.
- **Ports:** Contracts. What the domain needs from the world.
- **Adapters:** Implementations. Kubernetes, Postgres, Gitea, etc.
- **Services:** Orchestration. Coordinate ports to achieve business goals.
```
                    ┌────────────────────┐
                    │   HTTP Handlers    │
                    └─────────┬──────────┘
                    ┌─────────▼──────────┐
                    │   Service Layer    │
                    └─────────┬──────────┘
        ┌─────────────────────┼─────────────────────┐
        │                     │                     │
┌───────▼───────┐    ┌────────▼────────┐    ┌───────▼───────┐
│  Kubernetes   │    │   PostgreSQL    │    │     Gitea     │
│    Adapter    │    │     Adapter     │    │    Adapter    │
└───────────────┘    └─────────────────┘    └───────────────┘
```
## The Agent Registry
rdev supports multiple agent providers through a unified interface:
| Agent | Capabilities | Use Case |
|-------|--------------|----------|
| **Claude Code** | Full IDE replacement, complex reasoning | Feature implementation |
| **OpenCode** | Fast iteration, cost-effective | Simple fixes, testing |
| **Custom** | Extensible via registry | Specialized workflows |
Agents are interchangeable. The same work task can target different agents based on complexity, cost, or capability requirements.
## Key Capabilities
### Streaming Execution
Commands stream output in real-time via Server-Sent Events:
```http
GET /projects/acme-landing/events
Accept: text/event-stream
data: {"type":"output","line":"Installing dependencies..."}
data: {"type":"output","line":"Building production bundle..."}
data: {"type":"complete","exit_code":0}
```
### SDLC Orchestration
Ask the classifier what to do next:
```http
GET /projects/acme-landing/sdlc/features/user-auth/next
{
"action": "implement-task",
"task_id": "task-003",
"reason": "All blockers resolved, tasks available"
}
```
### Operation Audit Trail
Every operation is logged with step-level granularity:
```http
GET /projects/acme-landing/audit
[
{
"operation_id": "op-123",
"type": "sdlc_execute",
"steps": [
{"name": "read_state", "status": "completed", "duration_ms": 45},
{"name": "classify", "status": "completed", "duration_ms": 12},
{"name": "execute_action", "status": "completed", "duration_ms": 8234}
]
}
]
```
### Visual Verification (Planned)
Playwright captures screenshots and video for AI evaluation:
```http
POST /projects/acme-landing/verify
{
"url": "https://acme-landing.threesix.ai",
"viewports": ["desktop", "tablet", "mobile"],
"capture_video": true
}
```
## The Composable Monorepo
Projects can be composable monorepos with independent components:
```
acme-platform/
├── services/
│   ├── api/          # Go API service
│   └── worker/       # Background job processor
├── apps/
│   ├── web/          # React frontend
│   └── landing/      # Astro marketing site
└── packages/
    └── shared/       # Shared types and utilities
```
Each component has:
- Independent deployment pipeline
- Own database/cache isolation
- Separate CI/CD triggers
- Shared monorepo patterns
## The Git Analogy
| Git Concept | rdev Equivalent |
|-------------|-----------------|
| Repository | Project (isolated pod with workspace) |
| Branch | Feature (SDLC lifecycle instance) |
| Commit | Artifact (spec, design, code, test) |
| Merge | Phase transition to Released |
| CI/CD | Woodpecker pipeline (auto-triggered) |
| Deploy | Kubernetes Deployment (auto-provisioned) |
## When to Use rdev
**Use rdev when:**
- You want agents to execute code, not just generate it
- You need isolated, auditable agent workspaces
- You want deterministic feature delivery with clear phases
- You need complete project infrastructure on demand
- You're building a platform where agents do development work
**Use raw Kubernetes when:**
- You're running traditional containerized workloads
- You don't need agent execution capabilities
- You want manual control over every resource
- You're not doing agent-driven development
**Use GitHub/GitLab when:**
- You have human-only development workflows
- You want managed SaaS with full features
- You don't need agent isolation
For agent-driven development at scale: **rdev is the operating system.**
## Future Vision
### Multi-Cluster Federation (Planned)
Projects distributed across clusters based on region, compliance, or capacity.
### Agent Collaboration (Planned)
Multiple agents working on the same project with coordination protocols and conflict resolution.
### Pattern Learning (Planned)
Successful patterns extracted from completed features and applied to new projects automatically.
### The Swarm (Planned)
A pool of specialized agents (frontend, backend, devops, QA) that self-organize around feature delivery.
## The Kubernetes Analogy
| K8s Concept | rdev Purpose |
|-------------|--------------|
| Pod | Project isolation boundary |
| Namespace | Multi-tenancy separation |
| Service | Internal project communication |
| Ingress | External project access |
| ConfigMap | Project configuration |
| Secret | Encrypted credentials |
| Job | Work task execution |
| CronJob | Scheduled maintenance |
## Why "Remote Developer"?
The name captures the essence: **rdev is a remote developer that never sleeps.**
- **Remote:** Runs in the cloud, accessible via API
- **Developer:** Does real development work, not just code generation
- **Deterministic:** Every action follows defined rules
- **Observable:** Every operation is logged and auditable
- **Scalable:** Worker pools handle unlimited concurrent tasks
When you dispatch work to rdev, you're not asking for code suggestions. You're assigning a task to a developer who will:
1. Clone the repo
2. Create a feature branch
3. Write the code
4. Run the tests
5. Submit for review
6. Deploy to production
The difference? This developer is an AI agent with a full development environment, not a human with a laptop.
---
**rdev: Give your agents a proper workspace.**