rdev/internal/domain/work.go
jordan cfba724f8a feat: add work task error classification and user-facing error codes
- Add WorkErrorCode type with RATE_LIMITED, AUTH_FAILED, TIMEOUT, STALE_WORKER, AGENT_ERROR, INVALID_SPEC
- Add ClassifyAgentError function to detect error patterns from stderr
- Add error_code column to work_queue table (migration 016)
- Add FailWithCode method to WorkQueue interface and implementations
- Update RequeueStaleWithIDs to mark permanently failed tasks with STALE_WORKER
- Add ErrorCode to BuildResult for API responses
- Update work executor to classify errors before failing tasks

This enables users to see actual failure reasons (e.g., "RATE_LIMITED") instead of
builds stuck in "running" state forever when Claude hits rate limits.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 00:07:34 -07:00

308 lines
8.1 KiB
Go

package domain
import "time"
// WorkTaskStatus is the string-typed status of a task in the work queue.
type WorkTaskStatus string

// The task statuses recognised by WorkTaskStatus.IsValid.
const (
	WorkTaskStatusPending   WorkTaskStatus = "pending"
	WorkTaskStatusRunning   WorkTaskStatus = "running"
	WorkTaskStatusCompleted WorkTaskStatus = "completed"
	WorkTaskStatusFailed    WorkTaskStatus = "failed"
	WorkTaskStatusCancelled WorkTaskStatus = "cancelled"
)
// WorkErrorCode is a coarse category attached to a failed task so that
// clients can tell failure modes apart and react accordingly (retry, wait,
// or report).
type WorkErrorCode string

const (
	// WorkErrorCodeNone is the empty code: the task succeeded or is still
	// running, so there is no error to categorise.
	WorkErrorCodeNone WorkErrorCode = ""

	// WorkErrorCodeRateLimited means the agent hit its rate limit. The
	// client should wait for the limit to reset before retrying.
	WorkErrorCodeRateLimited WorkErrorCode = "RATE_LIMITED"

	// WorkErrorCodeAuthFailed means authentication or authorization failed.
	// Manual intervention is needed to re-authenticate the agent.
	WorkErrorCodeAuthFailed WorkErrorCode = "AUTH_FAILED"

	// WorkErrorCodeTimeout means the task exceeded its time limit. It may be
	// retried, possibly with a longer timeout or a simpler prompt.
	WorkErrorCodeTimeout WorkErrorCode = "TIMEOUT"

	// WorkErrorCodeStaleWorker means the worker stopped responding. The task
	// was recovered by maintenance and can be retried.
	WorkErrorCodeStaleWorker WorkErrorCode = "STALE_WORKER"

	// WorkErrorCodeAgentError is the generic agent execution failure; the
	// accompanying error message carries the details.
	WorkErrorCodeAgentError WorkErrorCode = "AGENT_ERROR"

	// WorkErrorCodeInvalidSpec means the task specification was invalid. It
	// should not be retried until the spec is fixed.
	WorkErrorCodeInvalidSpec WorkErrorCode = "INVALID_SPEC"
)
// ClassifyAgentError maps a failed agent run onto a WorkErrorCode by scanning
// errMsg and stderr for known failure phrases.
//
// Both inputs are searched together and matching is case-insensitive.
// Categories are checked in a fixed order (rate limit, then authentication,
// then timeout) and the first category with a matching phrase wins. When no
// phrase matches, including when both inputs are empty, the result is
// WorkErrorCodeAgentError; WorkErrorCodeNone is never returned.
func ClassifyAgentError(errMsg, stderr string) WorkErrorCode {
	// Search the error message and stderr as one text so that a phrase in
	// either of them is detected.
	combined := errMsg + "\n" + stderr

	// Rules are listed in precedence order. Each phrase appears once, in
	// lowercase: containsIgnoreCase already folds case, so capitalised
	// variants of the same phrase could never change the outcome.
	rules := []struct {
		code     WorkErrorCode
		patterns []string
	}{
		{
			// Rate limiting, including Claude Code specific messages.
			code: WorkErrorCodeRateLimited,
			patterns: []string{
				"you've hit your limit",
				"rate limit",
				"too many requests",
				"quota exceeded",
			},
		},
		{
			// Authentication / authorization failures.
			code: WorkErrorCodeAuthFailed,
			patterns: []string{
				"not authenticated",
				"authentication failed",
				"unauthorized",
				"invalid api key",
				"please log in",
				"claude login",
			},
		},
		{
			// Timeouts.
			// NOTE(review): "context canceled" is reported as TIMEOUT even
			// though cancellation is not necessarily a deadline expiry;
			// confirm this is the intended user-facing code.
			code: WorkErrorCodeTimeout,
			patterns: []string{
				"context deadline exceeded",
				"context canceled",
				"timeout",
				"timed out",
			},
		},
	}

	for _, rule := range rules {
		for _, pattern := range rule.patterns {
			if containsIgnoreCase(combined, pattern) {
				return rule.code
			}
		}
	}

	// Nothing recognisable: fall back to the generic agent error.
	return WorkErrorCodeAgentError
}
// containsIgnoreCase reports whether substr occurs anywhere in s, ignoring
// case. An empty substr is considered to be contained in every string.
//
// The work is done by findIgnoreCase, which already returns 0 for an empty
// substr and -1 when substr is longer than s, so no extra guards are needed.
func containsIgnoreCase(s, substr string) bool {
	return findIgnoreCase(s, substr) >= 0
}
// findIgnoreCase returns the byte index of the first occurrence of substr in
// s, comparing case-insensitively, or -1 if there is none. An empty substr
// matches at index 0.
//
// Only the ASCII letters A-Z are case-folded; every other byte (including the
// bytes of multi-byte UTF-8 sequences) must match exactly.
func findIgnoreCase(s, substr string) int {
	n := len(substr)
	switch {
	case n == 0:
		return 0
	case n > len(s):
		return -1
	}

	// fold lowercases a single ASCII uppercase byte and passes anything else
	// through unchanged.
	fold := func(b byte) byte {
		if 'A' <= b && b <= 'Z' {
			return b + ('a' - 'A')
		}
		return b
	}

	// Plain linear scan: try every start position at which substr still fits.
	for start, last := 0, len(s)-n; start <= last; start++ {
		// Count how many leading bytes of substr match at this position.
		matched := 0
		for matched < n && fold(s[start+matched]) == fold(substr[matched]) {
			matched++
		}
		if matched == n {
			return start
		}
	}
	return -1
}
// IsValid reports whether s is one of the declared WorkTaskStatus constants.
// Any other value, including the empty string, is rejected.
func (s WorkTaskStatus) IsValid() bool {
	switch s {
	case WorkTaskStatusPending,
		WorkTaskStatusRunning,
		WorkTaskStatusCompleted,
		WorkTaskStatusFailed,
		WorkTaskStatusCancelled:
		return true
	default:
		return false
	}
}
// WorkTaskType is the string-typed kind of a work task.
type WorkTaskType string

// The task types recognised by WorkTaskType.IsValid.
const (
	WorkTaskTypeBuild  WorkTaskType = "build"
	WorkTaskTypeTest   WorkTaskType = "test"
	WorkTaskTypeDeploy WorkTaskType = "deploy"
	WorkTaskTypeCustom WorkTaskType = "custom"
	WorkTaskTypeVerify WorkTaskType = "verify"
)

// IsValid reports whether t is one of the declared WorkTaskType constants.
// Any other value, including the empty string, is rejected.
func (t WorkTaskType) IsValid() bool {
	switch t {
	case WorkTaskTypeBuild,
		WorkTaskTypeTest,
		WorkTaskTypeDeploy,
		WorkTaskTypeCustom,
		WorkTaskTypeVerify:
		return true
	default:
		return false
	}
}
// WorkTask is a single unit of work held in the work queue.
type WorkTask struct {
	// ID uniquely identifies the task.
	ID string

	// ProjectID identifies the project the task belongs to.
	ProjectID string

	// Type is the kind of task (build, test, deploy, custom, verify).
	Type WorkTaskType

	// Spec holds the task-specific parameters:
	//   - build tasks:  template, prompt, variables, auto_deploy, git_url
	//   - test tasks:   test_command, git_url
	//   - deploy tasks: image, replicas, env
	Spec map[string]any

	// Status is the task's current status.
	Status WorkTaskStatus

	// Priority controls execution order; a higher value is more urgent.
	Priority int

	// WorkerID is the ID of the worker that claimed the task.
	WorkerID string

	// CallbackURL is the webhook URL notified on completion.
	CallbackURL string

	// CreatedAt is the time the task was created.
	CreatedAt time.Time

	// StartedAt is the time a worker started executing the task.
	StartedAt *time.Time

	// CompletedAt is the time the task finished, whether it succeeded or failed.
	CompletedAt *time.Time

	// Result holds the task output, if the task completed.
	Result *WorkResult

	// Error holds the error message, if the task failed.
	Error string

	// ErrorCode categorises the failure for programmatic handling.
	// It is only set when Status is WorkTaskStatusFailed.
	ErrorCode WorkErrorCode

	// RetryCount is the number of retry attempts made so far.
	RetryCount int

	// MaxRetries is the maximum number of retry attempts allowed.
	MaxRetries int
}
// WorkResult carries what a completed task produced.
type WorkResult struct {
	// Output is the primary output of the task's execution.
	Output string `json:"output,omitempty"`

	// Artifacts maps artifact names to values. Build tasks use keys such as
	// commit_sha and deploy_url.
	Artifacts map[string]string `json:"artifacts,omitempty"`
}
// WorkQueueStats is a snapshot of work queue counters.
type WorkQueueStats struct {
	// Pending is the number of pending tasks.
	Pending int64 `json:"pending"`

	// Running is the number of running tasks.
	Running int64 `json:"running"`

	// Completed is the number of tasks completed in the last 24h.
	Completed int64 `json:"completed"`

	// Failed is the number of tasks that failed in the last 24h.
	Failed int64 `json:"failed"`

	// Cancelled is the number of tasks cancelled in the last 24h.
	Cancelled int64 `json:"cancelled"`

	// OldestPending is the age of the oldest pending task. It is a pointer
	// and is omitted from JSON when nil.
	OldestPending *time.Duration `json:"oldest_pending,omitempty"`
}
// WorkListOptions holds the pagination settings used when listing tasks.
type WorkListOptions struct {
	// Limit caps how many tasks are returned (default: 50, max: 100).
	Limit int

	// Offset is how many tasks to skip before the first returned one.
	Offset int
}

// DefaultWorkListOptions returns the default pagination settings: a limit of
// 50 starting at offset 0.
func DefaultWorkListOptions() WorkListOptions {
	// Offset is left at its zero value.
	return WorkListOptions{Limit: 50}
}

// Normalize rewrites o in place so that it holds usable values: a
// non-positive Limit becomes the default of 50, a Limit above 100 is clamped
// to 100, and a negative Offset becomes 0.
func (o *WorkListOptions) Normalize() {
	switch {
	case o.Limit <= 0:
		o.Limit = 50
	case o.Limit > 100:
		o.Limit = 100
	}

	if o.Offset < 0 {
		o.Offset = 0
	}
}
// WorkListResult is one page of tasks together with its pagination metadata.
type WorkListResult struct {
	// Tasks is the list of tasks on this page.
	Tasks []*WorkTask

	// Total is the total number of matching tasks, for pagination metadata.
	Total int64

	// Limit is the limit that was applied.
	Limit int

	// Offset is the offset that was applied.
	Offset int
}