feat: add work task error classification and user-facing error codes

- Add WorkErrorCode type with RATE_LIMITED, AUTH_FAILED, TIMEOUT, STALE_WORKER, AGENT_ERROR, INVALID_SPEC
- Add ClassifyAgentError function to detect error patterns from stderr
- Add error_code column to work_queue table (migration 016)
- Add FailWithCode method to WorkQueue interface and implementations
- Update RequeueStaleWithIDs to mark permanently failed tasks with STALE_WORKER
- Add ErrorCode to BuildResult for API responses
- Update work executor to classify errors before failing tasks

This enables users to see actual failure reasons (e.g., "RATE_LIMITED") instead of builds stuck in "running" state forever when Claude hits rate limits.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 00:07:34 -07:00 · 2026-02-03 00:07:34 -07:00 · cfba724f8a
commit cfba724f8a
parent cce4314a39
15 changed files with 420 additions and 17 deletions
--- a/internal/adapter/postgres/work_queue.go
+++ b/internal/adapter/postgres/work_queue.go
@ -167,7 +167,22 @@ func (r *WorkQueueRepository) Complete(ctx context.Context, taskID string, resul
 // Fail marks a task as failed with an error message.
 // Uses a single atomic UPDATE to avoid race conditions between SELECT and UPDATE.
 func (r *WorkQueueRepository) Fail(ctx context.Context, taskID string, errMsg string) error {
-	// Use a single atomic query that handles both retry and permanent failure cases
+	return r.FailWithCode(ctx, taskID, errMsg, domain.WorkErrorCodeNone)
 }
 // FailWithCode marks a task as failed with an error message and categorized error code.
 // The error code enables clients to distinguish failure types (rate limit, auth, timeout).
 // If retry_count < max_retries, the task will be re-queued as pending (error_code cleared).
 func (r *WorkQueueRepository) FailWithCode(ctx context.Context, taskID string, errMsg string, code domain.WorkErrorCode) error {
 	// Use a single atomic query that handles both retry and permanent failure cases.
 	// When retrying, clear error_code so the task gets a fresh start.
 	// Only set error_code on permanent failure.
 	var errorCode *string
 	if code != domain.WorkErrorCodeNone {
 		codeStr := string(code)
 		errorCode = &codeStr
 	}
 	result, err := r.db.ExecContext(ctx, `
 		UPDATE work_queue
 		SET
@ -191,9 +206,13 @@ func (r *WorkQueueRepository) Fail(ctx context.Context, taskID string, errMsg st
 				WHEN retry_count < max_retries THEN retry_count + 1
 				ELSE retry_count
 			END,
-			error = $1
+			error = $1,
 			error_code = CASE
 				WHEN retry_count >= max_retries THEN $3
 				ELSE NULL
 			END
 		WHERE id = $2
-	`, errMsg, taskID)
+	`, errMsg, taskID, errorCode)
 	if err != nil {
 		return fmt.Errorf("fail work task: %w", err)
--- a/internal/adapter/postgres/work_queue_queries.go
+++ b/internal/adapter/postgres/work_queue_queries.go
@ -23,11 +23,12 @@ func (r *WorkQueueRepository) GetTask(ctx context.Context, taskID string) (*doma
 	var completedAt sql.NullTime
 	var resultJSON []byte
 	var errorMsg sql.NullString
 	var errorCode sql.NullString
 	err := r.db.QueryRowContext(ctx, `
 		SELECT id, project_id, task_type, task_spec, status, priority, worker_id,
 		       callback_url, created_at, started_at, completed_at, result, error,
-		       retry_count, max_retries
+		       retry_count, max_retries, error_code
 		FROM work_queue
 		WHERE id = $1
 	`, taskID).Scan(
@ -46,6 +47,7 @@ func (r *WorkQueueRepository) GetTask(ctx context.Context, taskID string) (*doma
 		&errorMsg,
 		&task.RetryCount,
 		&task.MaxRetries,
 		&errorCode,
 	)
 	if errors.Is(err, sql.ErrNoRows) {
@ -73,6 +75,9 @@ func (r *WorkQueueRepository) GetTask(ctx context.Context, taskID string) (*doma
 	if errorMsg.Valid {
 		task.Error = errorMsg.String
 	}
 	if errorCode.Valid {
 		task.ErrorCode = domain.WorkErrorCode(errorCode.String)
 	}
 	// Parse task spec
 	if len(specJSON) > 0 {
@ -119,7 +124,7 @@ func (r *WorkQueueRepository) ListByProject(ctx context.Context, projectID strin
 	query := fmt.Sprintf(`
 		SELECT id, project_id, task_type, task_spec, status, priority, worker_id,
 		       callback_url, created_at, started_at, completed_at, result, error,
-		       retry_count, max_retries
+		       retry_count, max_retries, error_code
 		FROM work_queue
 		%s
 		ORDER BY created_at DESC
@ -214,12 +219,30 @@ func (r *WorkQueueRepository) RequeueStale(ctx context.Context, timeout time.Dur
 }
 // RequeueStaleWithIDs re-queues stale tasks and returns their IDs.
 // Tasks that have exceeded max_retries are marked as failed with STALE_WORKER error code.
 func (r *WorkQueueRepository) RequeueStaleWithIDs(ctx context.Context, timeout time.Duration) ([]string, error) {
 	cutoff := time.Now().Add(-timeout)
 	// First, mark tasks that have exceeded max_retries as permanently failed
 	_, err := r.db.ExecContext(ctx, `
 		UPDATE work_queue
 		SET status = 'failed', completed_at = NOW(),
 		    error = 'Worker timeout - max retries exceeded',
 		    error_code = 'STALE_WORKER'
 		WHERE status = 'running'
 		AND started_at < $1
 		AND retry_count >= max_retries
 	`, cutoff)
 	if err != nil {
 		return nil, fmt.Errorf("fail stale tasks: %w", err)
 	}
 	// Then, requeue tasks that can still be retried
 	rows, err := r.db.QueryContext(ctx, `
 		UPDATE work_queue
 		SET status = 'pending', worker_id = NULL, started_at = NULL,
-		    retry_count = retry_count + 1, error = 'Worker timeout - task requeued'
+		    retry_count = retry_count + 1, error = 'Worker timeout - task requeued',
 		    error_code = NULL
 		WHERE status = 'running'
 		AND started_at < $1
 		AND retry_count < max_retries
@ -253,6 +276,7 @@ func (r *WorkQueueRepository) scanTask(rows *sql.Rows) (*domain.WorkTask, error)
 	var completedAt sql.NullTime
 	var resultJSON []byte
 	var errorMsg sql.NullString
 	var errorCode sql.NullString
 	err := rows.Scan(
 		&task.ID,
@ -270,6 +294,7 @@ func (r *WorkQueueRepository) scanTask(rows *sql.Rows) (*domain.WorkTask, error)
 		&errorMsg,
 		&task.RetryCount,
 		&task.MaxRetries,
 		&errorCode,
 	)
 	if err != nil {
 		return nil, fmt.Errorf("scan task: %w", err)
@ -293,6 +318,9 @@ func (r *WorkQueueRepository) scanTask(rows *sql.Rows) (*domain.WorkTask, error)
 	if errorMsg.Valid {
 		task.Error = errorMsg.String
 	}
 	if errorCode.Valid {
 		task.ErrorCode = domain.WorkErrorCode(errorCode.String)
 	}
 	// Parse task spec
 	if len(specJSON) > 0 {
--- a/internal/db/migrations/016_work_queue_error_code.sql
+++ b/internal/db/migrations/016_work_queue_error_code.sql
@ -0,0 +1,10 @@
 -- Add error_code column to work_queue for categorized failure handling.
 -- This enables clients to distinguish between different failure modes
 -- (rate limit, auth failure, timeout, stale worker) and respond appropriately.
 -- The column is nullable: rows without a categorized failure keep NULL.
 -- IF NOT EXISTS keeps the statement safe to re-run.
 ALTER TABLE work_queue ADD COLUMN IF NOT EXISTS error_code VARCHAR(50);
 -- Index for querying tasks by error code (useful for metrics and debugging).
 -- Partial index: rows whose error_code is NULL are left out of it.
 CREATE INDEX IF NOT EXISTS idx_work_queue_error_code ON work_queue(error_code) WHERE error_code IS NOT NULL;
 -- Record the set of expected values in the catalog alongside the column.
 COMMENT ON COLUMN work_queue.error_code IS 'Categorized error type: RATE_LIMITED, AUTH_FAILED, TIMEOUT, STALE_WORKER, AGENT_ERROR, INVALID_SPEC';
--- a/internal/domain/build.go
+++ b/internal/domain/build.go
@ -101,6 +101,10 @@ type BuildResult struct {
 	// Error contains the error message if the build failed.
 	Error string `json:"error,omitempty"`
 	// ErrorCode categorizes the failure type for programmatic handling.
 	// Values: RATE_LIMITED, AUTH_FAILED, TIMEOUT, STALE_WORKER, AGENT_ERROR, INVALID_SPEC
 	ErrorCode WorkErrorCode `json:"error_code,omitempty"`
 	// CommitSHA is the git commit hash if auto-commit was enabled.
 	CommitSHA string `json:"commit_sha,omitempty"`
--- a/internal/domain/work.go
+++ b/internal/domain/work.go
@ -13,6 +13,138 @@ const (
 	WorkTaskStatusCancelled WorkTaskStatus = "cancelled"
 )
 // WorkErrorCode represents a categorized error type for failed tasks.
 // This enables clients to distinguish between different failure modes
 // and take appropriate action (e.g., retry vs wait vs report).
 //
 // It is a plain string type, so a value converts directly to and from its
 // textual form; the empty string means "no code".
 type WorkErrorCode string
 const (
 	// WorkErrorCodeNone indicates no error code: the task succeeded, is still
 	// running, or failed without a categorized reason (this is what Fail passes).
 	WorkErrorCodeNone WorkErrorCode = ""
 	// WorkErrorCodeRateLimited indicates the agent hit its rate limit.
 	// Client should wait for the limit to reset before retrying.
 	WorkErrorCodeRateLimited WorkErrorCode = "RATE_LIMITED"
 	// WorkErrorCodeAuthFailed indicates authentication/authorization failure.
 	// Requires manual intervention to re-authenticate the agent.
 	WorkErrorCodeAuthFailed WorkErrorCode = "AUTH_FAILED"
 	// WorkErrorCodeTimeout indicates the task exceeded its time limit.
 	// May be retried, possibly with a longer timeout or simpler prompt.
 	// ClassifyAgentError also reports "context canceled" under this code.
 	WorkErrorCodeTimeout WorkErrorCode = "TIMEOUT"
 	// WorkErrorCodeStaleWorker indicates the worker stopped responding.
 	// Queue maintenance sets it on a stale running task that has already used
 	// up its retries and is therefore marked as failed; stale tasks that can
 	// still be retried are re-queued with no error code.
 	WorkErrorCodeStaleWorker WorkErrorCode = "STALE_WORKER"
 	// WorkErrorCodeAgentError indicates a generic agent execution error.
 	// The error message contains details. ClassifyAgentError returns it when
 	// no more specific pattern matches.
 	WorkErrorCodeAgentError WorkErrorCode = "AGENT_ERROR"
 	// WorkErrorCodeInvalidSpec indicates the task specification was invalid.
 	// Should not be retried without fixing the spec.
 	// ClassifyAgentError never returns this code; callers must set it explicitly.
 	WorkErrorCodeInvalidSpec WorkErrorCode = "INVALID_SPEC"
 )
 // ClassifyAgentError picks a WorkErrorCode for a failed agent run by scanning
 // errMsg and stderr (joined with a newline) for known phrases. Matching is
 // case-insensitive. Categories are tried in a fixed order - rate limit, then
 // authentication, then timeout - and the first category with a matching phrase
 // wins. When nothing matches, including when both inputs are empty, the result
 // is WorkErrorCodeAgentError.
 func ClassifyAgentError(errMsg, stderr string) WorkErrorCode {
 	// Ordered lookup table: earlier entries take precedence over later ones.
 	categories := []struct {
 		code    WorkErrorCode
 		phrases []string
 	}{
 		{
 			// Rate limit detection - Claude Code specific messages
 			code: WorkErrorCodeRateLimited,
 			phrases: []string{
 				"You've hit your limit",
 				"rate limit",
 				"Rate limit",
 				"too many requests",
 				"Too many requests",
 				"quota exceeded",
 				"Quota exceeded",
 			},
 		},
 		{
 			// Authentication failure detection
 			code: WorkErrorCodeAuthFailed,
 			phrases: []string{
 				"not authenticated",
 				"authentication failed",
 				"unauthorized",
 				"Unauthorized",
 				"invalid api key",
 				"Invalid API key",
 				"please log in",
 				"Please log in",
 				"claude login",
 			},
 		},
 		{
 			// Timeout detection
 			code: WorkErrorCodeTimeout,
 			phrases: []string{
 				"context deadline exceeded",
 				"context canceled",
 				"timeout",
 				"Timeout",
 				"timed out",
 			},
 		},
 	}
 	haystack := errMsg + "\n" + stderr
 	for _, category := range categories {
 		for _, phrase := range category.phrases {
 			if containsIgnoreCase(haystack, phrase) {
 				return category.code
 			}
 		}
 	}
 	// Nothing recognized: fall back to the generic agent error.
 	return WorkErrorCodeAgentError
 }
 // containsIgnoreCase reports whether substr occurs in s when ASCII letter case
 // is ignored. An empty substr is contained in every string, and a substr
 // longer than s is never contained; both cases are already handled by
 // findIgnoreCase, so its result alone decides the answer.
 func containsIgnoreCase(s, substr string) bool {
 	return findIgnoreCase(s, substr) >= 0
 }
 // findIgnoreCase returns the byte offset of the first occurrence of substr in
 // s, comparing bytes with ASCII letters folded to lower case, or -1 when there
 // is no occurrence. An empty substr matches at offset 0. Only the bytes 'A'
 // through 'Z' are folded; all other bytes must match exactly.
 func findIgnoreCase(s, substr string) int {
 	needleLen := len(substr)
 	switch {
 	case needleLen == 0:
 		return 0
 	case len(s) < needleLen:
 		return -1
 	}
 	// fold maps an upper-case ASCII letter to lower case and leaves every
 	// other byte untouched.
 	fold := func(b byte) byte {
 		if 'A' <= b && b <= 'Z' {
 			return b + ('a' - 'A')
 		}
 		return b
 	}
 	lastStart := len(s) - needleLen
 candidates:
 	for start := 0; start <= lastStart; start++ {
 		for k := 0; k < needleLen; k++ {
 			if fold(s[start+k]) != fold(substr[k]) {
 				// Mismatch at this offset: move on to the next start position.
 				continue candidates
 			}
 		}
 		return start
 	}
 	return -1
 }
 // IsValid returns true if the status is a known valid status.
 func (s WorkTaskStatus) IsValid() bool {
 	switch s {
@ -31,12 +163,13 @@ const (
 	WorkTaskTypeTest   WorkTaskType = "test"
 	WorkTaskTypeDeploy WorkTaskType = "deploy"
 	WorkTaskTypeCustom WorkTaskType = "custom"
 	WorkTaskTypeVerify WorkTaskType = "verify"
 )
 // IsValid returns true if the task type is a known valid type.
 func (t WorkTaskType) IsValid() bool {
 	switch t {
-	case WorkTaskTypeBuild, WorkTaskTypeTest, WorkTaskTypeDeploy, WorkTaskTypeCustom:
+	case WorkTaskTypeBuild, WorkTaskTypeTest, WorkTaskTypeDeploy, WorkTaskTypeCustom, WorkTaskTypeVerify:
 		return true
 	}
 	return false
@ -86,6 +219,10 @@ type WorkTask struct {
 	// Error contains the error message (if failed).
 	Error string
 	// ErrorCode categorizes the failure type for programmatic handling.
 	// Only set when Status is WorkTaskStatusFailed.
 	ErrorCode WorkErrorCode
 	// RetryCount is the number of retry attempts.
 	RetryCount int
--- a/internal/domain/work_test.go
+++ b/internal/domain/work_test.go
@ -0,0 +1,103 @@
 package domain
 import "testing"
 // TestClassifyAgentError runs ClassifyAgentError over representative error
 // message / stderr pairs, one subtest per scenario, and checks the returned code.
 func TestClassifyAgentError(t *testing.T) {
 	type scenario struct {
 		name     string
 		errMsg   string
 		stderr   string
 		expected WorkErrorCode
 	}
 	scenarios := []scenario{
 		// Rate limiting, reported through either input.
 		{name: "rate limit in stderr", errMsg: "command failed", stderr: "You've hit your limit · resets 7am (UTC)", expected: WorkErrorCodeRateLimited},
 		{name: "rate limit in error message", errMsg: "rate limit exceeded, try again later", expected: WorkErrorCodeRateLimited},
 		{name: "quota exceeded", errMsg: "Quota exceeded for today", expected: WorkErrorCodeRateLimited},
 		// Authentication problems.
 		{name: "auth failed - not authenticated", errMsg: "not authenticated, please log in", expected: WorkErrorCodeAuthFailed},
 		{name: "auth failed - invalid api key", errMsg: "Invalid API key provided", expected: WorkErrorCodeAuthFailed},
 		{name: "auth failed - claude login hint", stderr: "Run claude login to authenticate", expected: WorkErrorCodeAuthFailed},
 		// Timeouts.
 		{name: "context timeout", errMsg: "context deadline exceeded", expected: WorkErrorCodeTimeout},
 		{name: "operation timed out", errMsg: "operation timed out after 10 minutes", expected: WorkErrorCodeTimeout},
 		// Anything unrecognized, including no text at all, is a generic agent error.
 		{name: "generic error", errMsg: "something went wrong", stderr: "error: file not found", expected: WorkErrorCodeAgentError},
 		{name: "empty error", expected: WorkErrorCodeAgentError},
 	}
 	for i := range scenarios {
 		sc := scenarios[i]
 		t.Run(sc.name, func(t *testing.T) {
 			if got := ClassifyAgentError(sc.errMsg, sc.stderr); got != sc.expected {
 				t.Errorf("ClassifyAgentError(%q, %q) = %q, want %q",
 					sc.errMsg, sc.stderr, got, sc.expected)
 			}
 		})
 	}
 }
 // TestWorkErrorCode_Constants pins the string value of every WorkErrorCode
 // constant so an accidental rename of a value is caught.
 func TestWorkErrorCode_Constants(t *testing.T) {
 	// Keyed by the constants themselves: a map literal with duplicate constant
 	// keys does not compile, so two codes cannot silently share a value.
 	wantByCode := map[WorkErrorCode]string{
 		WorkErrorCodeNone:        "",
 		WorkErrorCodeRateLimited: "RATE_LIMITED",
 		WorkErrorCodeAuthFailed:  "AUTH_FAILED",
 		WorkErrorCodeTimeout:     "TIMEOUT",
 		WorkErrorCodeStaleWorker: "STALE_WORKER",
 		WorkErrorCodeAgentError:  "AGENT_ERROR",
 		WorkErrorCodeInvalidSpec: "INVALID_SPEC",
 	}
 	for code, want := range wantByCode {
 		got := string(code)
 		if got == want {
 			continue
 		}
 		t.Errorf("WorkErrorCode constant %q has value %q, want %q",
 			want, got, want)
 	}
 }
--- a/internal/handlers/builds.go
+++ b/internal/handlers/builds.go
@ -82,6 +82,7 @@ type BuildResultDTO struct {
 	Success      bool              `json:"success"`
 	Output       string            `json:"output,omitempty"`
 	Error        string            `json:"error,omitempty"`
 	ErrorCode    string            `json:"error_code,omitempty"` // Categorized error type for programmatic handling
 	CommitSHA    string            `json:"commit_sha,omitempty"`
 	FilesChanged []string          `json:"files_changed,omitempty"`
 	DurationMs   int64             `json:"duration_ms"`
@ -112,6 +113,7 @@ func toBuildAuditDTO(e *domain.BuildAuditEntry) *BuildAuditDTO {
 			Success:      e.Result.Success,
 			Output:       e.Result.Output,
 			Error:        e.Result.Error,
 			ErrorCode:    string(e.Result.ErrorCode),
 			CommitSHA:    e.Result.CommitSHA,
 			FilesChanged: e.Result.FilesChanged,
 			DurationMs:   e.Result.DurationMs,
--- a/internal/handlers/work_test.go
+++ b/internal/handlers/work_test.go
@ -70,6 +70,10 @@ func (m *mockWorkQueue) Complete(ctx context.Context, taskID string, result *dom
 }
 // Fail records a failure that has no categorized error code. It delegates to
 // FailWithCode with domain.WorkErrorCodeNone.
 func (m *mockWorkQueue) Fail(ctx context.Context, taskID string, errMsg string) error {
 	return m.FailWithCode(ctx, taskID, errMsg, domain.WorkErrorCodeNone)
 }
 func (m *mockWorkQueue) FailWithCode(ctx context.Context, taskID string, errMsg string, code domain.WorkErrorCode) error {
 	if m.err != nil {
 		return m.err
 	}
@ -84,6 +88,7 @@ func (m *mockWorkQueue) Fail(ctx context.Context, taskID string, errMsg string)
 	} else {
 		task.Status = domain.WorkTaskStatusFailed
 		task.Error = errMsg
 		task.ErrorCode = code
 		now := time.Now()
 		task.CompletedAt = &now
 	}
--- a/internal/port/work_queue.go
+++ b/internal/port/work_queue.go
@ -28,6 +28,11 @@ type WorkQueue interface {
 	// If retry_count < max_retries, the task will be re-queued as pending.
 	Fail(ctx context.Context, taskID string, errMsg string) error
 	// FailWithCode marks a task as failed with an error message and categorized error code.
 	// The error code enables clients to distinguish failure types (rate limit, auth, timeout).
 	// If retry_count < max_retries, the task will be re-queued as pending (error_code cleared).
 	FailWithCode(ctx context.Context, taskID string, errMsg string, code domain.WorkErrorCode) error
 	// Cancel marks a pending task as cancelled.
 	// Returns an error if the task is not in pending status.
 	Cancel(ctx context.Context, taskID string) error
--- a/internal/service/mock_test.go
+++ b/internal/service/mock_test.go
@ -62,6 +62,20 @@ func (m *mockWorkQueue) Complete(ctx context.Context, taskID string, result *dom
 }
 // Fail records a failure that has no categorized error code. It delegates to
 // FailWithCode with domain.WorkErrorCodeNone.
 func (m *mockWorkQueue) Fail(ctx context.Context, taskID string, errMsg string) error {
 	return m.FailWithCode(ctx, taskID, errMsg, domain.WorkErrorCodeNone)
 }
 // FailWithCode marks the stored task as failed and records errMsg and code on
 // it. The injected error m.err, when set, is returned before anything else
 // happens; an unknown taskID yields domain.ErrWorkTaskNotFound. Retry counts
 // are not consulted here: the task always ends up in the failed status.
 func (m *mockWorkQueue) FailWithCode(ctx context.Context, taskID string, errMsg string, code domain.WorkErrorCode) error {
 	if injected := m.err; injected != nil {
 		return injected
 	}
 	if task, found := m.tasks[taskID]; found {
 		task.Status, task.Error, task.ErrorCode = domain.WorkTaskStatusFailed, errMsg, code
 		return nil
 	}
 	return domain.ErrWorkTaskNotFound
 }
--- a/internal/service/work_service.go
+++ b/internal/service/work_service.go
@ -136,13 +136,19 @@ func (s *WorkService) CompleteTask(ctx context.Context, taskID string, result *d
 // FailTask marks a task as failed without a categorized error code.
 // It delegates to FailTaskWithCode with domain.WorkErrorCodeNone and returns
 // whatever that call returns.
 func (s *WorkService) FailTask(ctx context.Context, taskID string, errMsg string) error {
 	return s.FailTaskWithCode(ctx, taskID, errMsg, domain.WorkErrorCodeNone)
 }
 // FailTaskWithCode marks a task as failed with a categorized error code.
 // The error code enables clients to distinguish failure types and respond appropriately.
 func (s *WorkService) FailTaskWithCode(ctx context.Context, taskID string, errMsg string, code domain.WorkErrorCode) error {
 	// Get task for callback URL before failing
 	task, err := s.queue.GetTask(ctx, taskID)
 	if err != nil {
 		return fmt.Errorf("get task: %w", err)
 	}
-	if err := s.queue.Fail(ctx, taskID, errMsg); err != nil {
+	if err := s.queue.FailWithCode(ctx, taskID, errMsg, code); err != nil {
 		return fmt.Errorf("fail task: %w", err)
 	}
@ -154,6 +160,7 @@ func (s *WorkService) FailTask(ctx context.Context, taskID string, errMsg string
 			"project", task.ProjectID,
 			"type", task.Type,
 			"error", errMsg,
 			"error_code", code,
 			"retry_count", task.RetryCount,
 		)
--- a/internal/worker/mock_test.go
+++ b/internal/worker/mock_test.go
@ -72,6 +72,10 @@ func (m *mockWorkQueue) Complete(_ context.Context, taskID string, result *domai
 }
 // Fail records a failure that has no categorized error code. It delegates to
 // FailWithCode with domain.WorkErrorCodeNone, passing the caller's context
 // through instead of substituting a fresh background context.
 func (m *mockWorkQueue) Fail(ctx context.Context, taskID string, errMsg string) error {
 	return m.FailWithCode(ctx, taskID, errMsg, domain.WorkErrorCodeNone)
 }
 func (m *mockWorkQueue) FailWithCode(_ context.Context, taskID string, errMsg string, code domain.WorkErrorCode) error {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	task, ok := m.tasks[taskID]
@ -82,6 +86,7 @@ func (m *mockWorkQueue) Fail(_ context.Context, taskID string, errMsg string) er
 	if task.RetryCount >= task.MaxRetries {
 		task.Status = domain.WorkTaskStatusFailed
 		task.Error = errMsg
 		task.ErrorCode = code
 	} else {
 		task.Status = domain.WorkTaskStatusPending
 		task.WorkerID = ""
@ -293,6 +298,51 @@ func (m *mockCodeAgentRegistry) AvailableAgents(_ context.Context) []port.CodeAg
 }
 // Count always reports a registry size of 1.
 func (m *mockCodeAgentRegistry) Count() int { return 1 }
 // =============================================================================
 // Mock CommandExecutor for verify tests
 // =============================================================================
 // mockCommandExecutor is a test double whose method results are taken
 // directly from its fields.
 type mockCommandExecutor struct {
 	// result is what Execute returns when err is nil.
 	result      *domain.CommandResult
 	// err, when non-nil, makes Execute return it with a nil result.
 	err         error
 	// output holds the lines Execute passes to the handler, in order.
 	output      []domain.OutputLine
 	// podExists is the boolean returned by PodExists.
 	podExists   bool
 	// podExistErr is the error returned by PodExists.
 	podExistErr error
 }
 // newMockCommandExecutor builds a mock preset for the success path: Execute
 // yields a result with exit code 0 and a 100 ms duration, and PodExists
 // reports true. No error and no output lines are configured.
 func newMockCommandExecutor() *mockCommandExecutor {
 	exec := new(mockCommandExecutor)
 	exec.podExists = true
 	exec.result = &domain.CommandResult{ExitCode: 0, DurationMs: 100}
 	return exec
 }
 // Execute simulates a command run. A configured error short-circuits the call
 // and is returned with a nil result; otherwise each configured output line is
 // passed to handler in order and the configured result is returned. The
 // context, command and string arguments are ignored.
 func (m *mockCommandExecutor) Execute(_ context.Context, _ *domain.Command, _ string, handler domain.OutputHandler) (*domain.CommandResult, error) {
 	if failure := m.err; failure != nil {
 		return nil, failure
 	}
 	// Snapshot the slice header so iteration covers exactly the lines
 	// configured at call time.
 	lines := m.output
 	for i := range lines {
 		handler(lines[i])
 	}
 	return m.result, nil
 }
 // Cancel is a no-op: it ignores its arguments and always returns nil.
 func (m *mockCommandExecutor) Cancel(_ context.Context, _ domain.CommandID) error {
 	return nil
 }
 // PodExists returns the configured podExists and podExistErr values,
 // regardless of the arguments passed in.
 func (m *mockCommandExecutor) PodExists(_ context.Context, _ string) (bool, error) {
 	return m.podExists, m.podExistErr
 }
 // CheckConnection is a no-op that always returns nil.
 func (m *mockCommandExecutor) CheckConnection(_ context.Context) error {
 	return nil
 }
 // =============================================================================
 // Helper to build test dependencies
 // =============================================================================
--- a/internal/worker/queue_maintenance_test.go
+++ b/internal/worker/queue_maintenance_test.go
@ -52,6 +52,10 @@ func (m *mockMaintenanceQueue) Fail(_ context.Context, _ string, _ string) error
 	return nil
 }
 // FailWithCode is a no-op stub: it ignores every argument and returns nil.
 func (m *mockMaintenanceQueue) FailWithCode(_ context.Context, _ string, _ string, _ domain.WorkErrorCode) error {
 	return nil
 }
 // Cancel is a no-op stub: it ignores every argument and returns nil.
 func (m *mockMaintenanceQueue) Cancel(_ context.Context, _ string) error {
 	return nil
 }
--- a/internal/worker/work_executor.go
+++ b/internal/worker/work_executor.go
@ -22,6 +22,7 @@ type WorkExecutor struct {
 	workerSvc  *service.WorkerService
 	workSvc    *service.WorkService
 	buildExec  *BuildExecutor
 	verifyExec *VerifyExecutor
 	logger     *slog.Logger
 	workerID     string
@ -84,6 +85,7 @@ func NewWorkExecutor(
 	workerSvc *service.WorkerService,
 	workSvc *service.WorkService,
 	buildExec *BuildExecutor,
 	verifyExec *VerifyExecutor,
 	cfg *WorkExecutorConfig,
 ) *WorkExecutor {
 	if cfg == nil {
@ -111,6 +113,7 @@ func NewWorkExecutor(
 		workerSvc:    workerSvc,
 		workSvc:      workSvc,
 		buildExec:    buildExec,
 		verifyExec:   verifyExec,
 		logger:       cfg.Logger.With("component", "work-executor"),
 		workerID:     cfg.WorkerID,
 		hostname:     hostname,
@ -262,7 +265,11 @@ func (e *WorkExecutor) tryClaimAndExecute() {
 		if errMsg == "" {
 			errMsg = "execution failed"
 		}
-		if err := e.workSvc.FailTask(e.ctx, task.ID, errMsg); err != nil {
+
 		// Classify the error to enable appropriate client handling
 		errorCode := domain.ClassifyAgentError(errMsg, result.Output)
 		if err := e.workSvc.FailTaskWithCode(e.ctx, task.ID, errMsg, errorCode); err != nil {
 			e.logger.Error("failed to record task failure",
 				"task_id", task.ID,
 				"error", err,
@ -280,6 +287,14 @@ func (e *WorkExecutor) executeTask(ctx context.Context, task *domain.WorkTask) *
 	switch task.Type {
 	case domain.WorkTaskTypeBuild:
 		return e.buildExec.Execute(ctx, task)
 	case domain.WorkTaskTypeVerify:
 		if e.verifyExec == nil {
 			return &domain.BuildResult{
 				Success: false,
 				Error:   "verify executor not configured",
 			}
 		}
 		return e.verifyExec.Execute(ctx, task)
 	default:
 		return &domain.BuildResult{
 			Success: false,
--- a/internal/worker/work_executor_test.go
+++ b/internal/worker/work_executor_test.go
@ -21,7 +21,7 @@ func testLogger() *slog.Logger {
 func TestWorkExecutor_StartAndStop(t *testing.T) {
 	deps := newTestDeps()
-	executor := NewWorkExecutor(deps.workerSvc, deps.workSvc, deps.buildExec, &WorkExecutorConfig{
+	executor := NewWorkExecutor(deps.workerSvc, deps.workSvc, deps.buildExec, nil, &WorkExecutorConfig{
 		WorkerID:        "test-worker-1",
 		PollPeriod:      100 * time.Millisecond,
 		HeartbeatPeriod: 100 * time.Millisecond,
@ -75,7 +75,7 @@ func TestWorkExecutor_ClaimsAndExecutesTask(t *testing.T) {
 	}
 	deps.queue.mu.Unlock()
-	executor := NewWorkExecutor(deps.workerSvc, deps.workSvc, deps.buildExec, &WorkExecutorConfig{
+	executor := NewWorkExecutor(deps.workerSvc, deps.workSvc, deps.buildExec, nil, &WorkExecutorConfig{
 		WorkerID:        "test-worker-2",
 		PollPeriod:      50 * time.Millisecond,
 		HeartbeatPeriod: 5 * time.Second,
@ -118,7 +118,7 @@ func TestWorkExecutor_FailsTaskOnAgentError(t *testing.T) {
 	}
 	deps.queue.mu.Unlock()
-	executor := NewWorkExecutor(deps.workerSvc, deps.workSvc, deps.buildExec, &WorkExecutorConfig{
+	executor := NewWorkExecutor(deps.workerSvc, deps.workSvc, deps.buildExec, nil, &WorkExecutorConfig{
 		WorkerID:        "test-worker-3",
 		PollPeriod:      50 * time.Millisecond,
 		HeartbeatPeriod: 5 * time.Second,
@ -164,7 +164,7 @@ func TestWorkExecutor_UnsupportedTaskType(t *testing.T) {
 	}
 	deps.queue.mu.Unlock()
-	executor := NewWorkExecutor(deps.workerSvc, deps.workSvc, deps.buildExec, &WorkExecutorConfig{
+	executor := NewWorkExecutor(deps.workerSvc, deps.workSvc, deps.buildExec, nil, &WorkExecutorConfig{
 		WorkerID:        "test-worker-4",
 		PollPeriod:      50 * time.Millisecond,
 		HeartbeatPeriod: 5 * time.Second,