diff --git a/cmd/claudebox-sidecar/main.go b/cmd/claudebox-sidecar/main.go index 2f522f5..a28c0ba 100644 --- a/cmd/claudebox-sidecar/main.go +++ b/cmd/claudebox-sidecar/main.go @@ -67,7 +67,7 @@ func main() { Logger: log, })) r.Use(middleware.Recoverer) - r.Use(middleware.Timeout(10 * time.Minute)) + r.Use(middleware.Timeout(50 * time.Minute)) // Mount server routes server.Mount(r) @@ -78,7 +78,7 @@ func main() { Addr: addr, Handler: r, ReadTimeout: 30 * time.Second, - WriteTimeout: 15 * time.Minute, // Long timeout for streaming responses + WriteTimeout: 55 * time.Minute, // Long timeout for streaming responses IdleTimeout: 60 * time.Second, } diff --git a/cmd/rdev-api/main.go b/cmd/rdev-api/main.go index a301c66..f4135d9 100644 --- a/cmd/rdev-api/main.go +++ b/cmd/rdev-api/main.go @@ -381,6 +381,8 @@ func main() { agentRegistry, gitCommitter, projectRepo, + buildService, + database.DB, ) // Create app @@ -700,7 +702,7 @@ func main() { workQueueRepo, workerRegistryRepo, &worker.QueueMaintenanceConfig{ - StaleTaskTimeout: 30 * time.Minute, + StaleTaskTimeout: 60 * time.Minute, StaleWorkerTimeout: 2 * time.Minute, CleanupAge: 7 * 24 * time.Hour, MaintenancePeriod: 1 * time.Minute, diff --git a/internal/adapter/claudebox/client.go b/internal/adapter/claudebox/client.go index d8af85d..b6bc4e8 100644 --- a/internal/adapter/claudebox/client.go +++ b/internal/adapter/claudebox/client.go @@ -34,7 +34,7 @@ type ClientConfig struct { // NewClient creates a new claudebox client. 
func NewClient(cfg ClientConfig) *Client { if cfg.Timeout == 0 { - cfg.Timeout = 10 * time.Minute + cfg.Timeout = 50 * time.Minute // Safety net; per-request context cancellation provides real timeout } return &Client{ baseURL: strings.TrimSuffix(cfg.BaseURL, "/"), diff --git a/internal/adapter/claudebox/client_test.go b/internal/adapter/claudebox/client_test.go index 995aa21..75077df 100644 --- a/internal/adapter/claudebox/client_test.go +++ b/internal/adapter/claudebox/client_test.go @@ -18,8 +18,8 @@ func TestNewClient_DefaultTimeout(t *testing.T) { BaseURL: "http://localhost:8080", }) - if client.httpClient.Timeout != 10*time.Minute { - t.Errorf("expected default timeout 10m, got %v", client.httpClient.Timeout) + if client.httpClient.Timeout != 50*time.Minute { + t.Errorf("expected default timeout 50m, got %v", client.httpClient.Timeout) } } diff --git a/internal/adapter/sdlc/worker_executor.go b/internal/adapter/sdlc/worker_executor.go index e2d9c09..1c8f831 100644 --- a/internal/adapter/sdlc/worker_executor.go +++ b/internal/adapter/sdlc/worker_executor.go @@ -32,7 +32,7 @@ type WorkerSDLCExecutorConfig struct { // DB for fetching project git clone URLs. DB *sql.DB - // Timeout is the maximum wait time for task completion (default: 2 minutes). + // Timeout is the maximum wait time for task completion (default: 10 minutes). Timeout time.Duration Logger *slog.Logger @@ -42,7 +42,7 @@ type WorkerSDLCExecutorConfig struct { func NewWorkerSDLCExecutor(cfg WorkerSDLCExecutorConfig) *WorkerSDLCExecutor { timeout := cfg.Timeout if timeout == 0 { - timeout = 2 * time.Minute + timeout = 10 * time.Minute } logger := cfg.Logger if logger == nil { diff --git a/internal/domain/build.go b/internal/domain/build.go index 3024f8a..24c960a 100644 --- a/internal/domain/build.go +++ b/internal/domain/build.go @@ -34,6 +34,10 @@ type BuildSpec struct { // GitCloneURL is the HTTPS URL for cloning the project repository. 
// Required for builds that use AutoCommit/AutoPush on shared worker pods. GitCloneURL string `json:"git_clone_url,omitempty"` + + // TimeoutSeconds overrides the default agent execution timeout. + // 0 means use the default (10 minutes). Valid range: 60-5400 (1m to 90m). + TimeoutSeconds int `json:"timeout_seconds,omitempty"` } // Validate checks that the BuildSpec has required fields. @@ -41,6 +45,9 @@ func (s *BuildSpec) Validate() error { if s.Prompt == "" { return ErrPromptRequired } + if s.TimeoutSeconds != 0 && (s.TimeoutSeconds < 60 || s.TimeoutSeconds > 5400) { + return fmt.Errorf("timeout_seconds must be between 60 and 5400 (got %d)", s.TimeoutSeconds) + } if s.CallbackURL != "" { if err := ValidateCallbackURL(s.CallbackURL); err != nil { return err diff --git a/internal/handlers/builds.go b/internal/handlers/builds.go index 27676ab..5591d47 100644 --- a/internal/handlers/builds.go +++ b/internal/handlers/builds.go @@ -43,13 +43,14 @@ func (h *BuildsHandler) Mount(r api.Router) { // StartBuildRequest is the request body for POST /projects/{id}/builds. 
type StartBuildRequest struct { - Prompt string `json:"prompt"` - Template string `json:"template,omitempty"` - Variables map[string]string `json:"variables,omitempty"` - AutoCommit bool `json:"auto_commit"` - AutoPush bool `json:"auto_push"` - CallbackURL string `json:"callback_url,omitempty"` - GitCloneURL string `json:"git_clone_url,omitempty"` // Required when auto_commit or auto_push is true + Prompt string `json:"prompt"` + Template string `json:"template,omitempty"` + Variables map[string]string `json:"variables,omitempty"` + AutoCommit bool `json:"auto_commit"` + AutoPush bool `json:"auto_push"` + CallbackURL string `json:"callback_url,omitempty"` + GitCloneURL string `json:"git_clone_url,omitempty"` // Required when auto_commit or auto_push is true + TimeoutSeconds int `json:"timeout_seconds,omitempty"` // 0 = default (10m), valid range: 60-5400 } // StartBuildResponse is the response for POST /projects/{id}/builds. @@ -153,13 +154,14 @@ func (h *BuildsHandler) StartBuild(w http.ResponseWriter, r *http.Request) { } spec := domain.BuildSpec{ - Prompt: req.Prompt, - Template: req.Template, - Variables: req.Variables, - AutoCommit: req.AutoCommit, - AutoPush: req.AutoPush, - CallbackURL: req.CallbackURL, - GitCloneURL: req.GitCloneURL, + Prompt: req.Prompt, + Template: req.Template, + Variables: req.Variables, + AutoCommit: req.AutoCommit, + AutoPush: req.AutoPush, + CallbackURL: req.CallbackURL, + GitCloneURL: req.GitCloneURL, + TimeoutSeconds: req.TimeoutSeconds, } // Validate git_clone_url is provided when auto_commit or auto_push is enabled diff --git a/internal/handlers/sdlc_orchestrator.go b/internal/handlers/sdlc_orchestrator.go index 987f034..d2ac00e 100644 --- a/internal/handlers/sdlc_orchestrator.go +++ b/internal/handlers/sdlc_orchestrator.go @@ -50,7 +50,7 @@ func (h *SDLCOrchestratorHandler) Execute(w http.ResponseWriter, r *http.Request return } - ctx, cancel := context.WithTimeout(r.Context(), TimeoutLongRunning) + ctx, cancel := 
context.WithTimeout(r.Context(), TimeoutAgentExecution) defer cancel() result, err := h.orchestrator.ExecuteAction(ctx, projectID, &req) diff --git a/internal/handlers/sdlc_orchestrator_test.go b/internal/handlers/sdlc_orchestrator_test.go index 7bdaba6..408c67d 100644 --- a/internal/handlers/sdlc_orchestrator_test.go +++ b/internal/handlers/sdlc_orchestrator_test.go @@ -24,6 +24,8 @@ func setupOrchestratorHandler(exec *testSDLCExecutor) (*SDLCOrchestratorHandler, nil, // no agent registry for handler tests nil, // no git committer for handler tests repo, + nil, // no build service for handler tests + nil, // no db for handler tests ) handler := NewSDLCOrchestratorHandler(orchestrator) diff --git a/internal/handlers/timeouts.go b/internal/handlers/timeouts.go index 3195822..ada6309 100644 --- a/internal/handlers/timeouts.go +++ b/internal/handlers/timeouts.go @@ -37,4 +37,9 @@ const ( // TimeoutLongRunning is for agent/command execution that streams output. // 10 minutes. Claude Code commands can run extended operations. TimeoutLongRunning = 10 * time.Minute + + // TimeoutAgentExecution is for synchronous agent execution via /sdlc/execute. + // Accommodates medium-tier actions (20m) plus headroom for classify + post-processing. + // Heavy actions are dispatched async and return immediately. + TimeoutAgentExecution = 22 * time.Minute ) diff --git a/internal/sdlc/types.go b/internal/sdlc/types.go index 865f4f7..78a444f 100644 --- a/internal/sdlc/types.go +++ b/internal/sdlc/types.go @@ -138,6 +138,40 @@ const ( ActionIdle ActionType = "IDLE" ) +// Action timeout tiers. Each action maps to one of these based on expected duration. +const ( + // ActionTimeoutQuick is for fast actions: spec, design, tasks, qa-plan, branch, merge, archive. + ActionTimeoutQuick = 10 * time.Minute + + // ActionTimeoutMedium is for actions requiring codebase analysis: review, audit. 
+ ActionTimeoutMedium = 20 * time.Minute + + // ActionTimeoutHeavy is for long-running actions: implement, fix-review, remediate, run-qa, fix-qa. + ActionTimeoutHeavy = 45 * time.Minute +) +
+// ActionTimeout returns the timeout duration for a given action type.
+// Heavy membership is delegated to IsHeavyAction so that set is defined in exactly one place.
+func ActionTimeout(action ActionType) time.Duration {
+	if IsHeavyAction(action) {
+		return ActionTimeoutHeavy
+	}
+	if action == ActionReviewCode || action == ActionAuditCode {
+		return ActionTimeoutMedium
+	}
+	return ActionTimeoutQuick
+}
+
+// IsHeavyAction returns true for actions that should be dispatched asynchronously.
+func IsHeavyAction(action ActionType) bool {
+	switch action {
+	case ActionImplementTask, ActionFixReviewIssues, ActionRemediateAudit, ActionRunQA, ActionFixQAFailures:
+		return true
+	default:
+		return false
+	}
+}
+
 // TaskStatus tracks the state of an implementation task. type TaskStatus string diff --git a/internal/service/build_service.go b/internal/service/build_service.go index 05beb81..64c5448 100644 --- a/internal/service/build_service.go +++ b/internal/service/build_service.go @@ -55,6 +55,9 @@ func (s *BuildService) StartBuild(ctx context.Context, projectID string, spec do if spec.GitCloneURL != "" { taskSpec["git_clone_url"] = spec.GitCloneURL } + if spec.TimeoutSeconds > 0 { + taskSpec["timeout_seconds"] = spec.TimeoutSeconds + } // Create work task task := &domain.WorkTask{ @@ -156,6 +159,9 @@ func (s *BuildService) StartBuildWithSDLCContext(ctx context.Context, projectID if spec.GitCloneURL != "" { taskSpec["git_clone_url"] = spec.GitCloneURL } + if spec.TimeoutSeconds > 0 { + taskSpec["timeout_seconds"] = spec.TimeoutSeconds + } // Add SDLC context for callback routing if sdlcCtx != nil { taskSpec["sdlc_context"] = sdlcCtx diff --git a/internal/service/sdlc_orchestrator.go b/internal/service/sdlc_orchestrator.go index 76c6d62..63e8356 100644 --- a/internal/service/sdlc_orchestrator.go
+++ b/internal/service/sdlc_orchestrator.go @@ -2,8 +2,8 @@ package service import ( "context" + "database/sql" "fmt" - "time" "github.com/orchard9/rdev/internal/domain" "github.com/orchard9/rdev/internal/logging" @@ -32,6 +32,8 @@ type SDLCOrchestratorService struct { agentRegistry port.CodeAgentRegistry gitCommitter PodGitCommitter projectRepo port.ProjectRepository + buildService *BuildService // For async dispatch of heavy actions + db *sql.DB // For git URL lookup } // NewSDLCOrchestratorService creates a new orchestrator service. @@ -40,12 +42,16 @@ func NewSDLCOrchestratorService( agentRegistry port.CodeAgentRegistry, gitCommitter PodGitCommitter, projectRepo port.ProjectRepository, + buildService *BuildService, + db *sql.DB, ) *SDLCOrchestratorService { return &SDLCOrchestratorService{ sdlcService: sdlcService, agentRegistry: agentRegistry, gitCommitter: gitCommitter, projectRepo: projectRepo, + buildService: buildService, + db: db, } } @@ -62,6 +68,8 @@ type ExecutionResult struct { Output string `json:"output,omitempty"` Next *sdlc.Classification `json:"next,omitempty"` Error string `json:"error,omitempty"` + TaskID string `json:"task_id,omitempty"` // Set when dispatched async + Async bool `json:"async,omitempty"` // True when enqueued to work queue } // ResolveRequest describes a blocker resolution. 
@@ -94,12 +102,14 @@ func (s *SDLCOrchestratorService) ExecuteAction(ctx context.Context, projectID s Action: cl.Action, } - switch cl.Action { - case sdlc.ActionTransition: + switch { + case cl.Action == sdlc.ActionTransition: err = s.executeTransition(ctx, projectID, cl) - case sdlc.ActionIdle, sdlc.ActionBlocked, sdlc.ActionAwaitApproval: + case cl.Action == sdlc.ActionIdle || cl.Action == sdlc.ActionBlocked || cl.Action == sdlc.ActionAwaitApproval: result.Output = cl.Message result.Success = true + case sdlc.IsHeavyAction(cl.Action) && s.buildService != nil: + err = s.executeAgentActionAsync(ctx, projectID, cl, req, result) default: err = s.executeAgentAction(ctx, projectID, cl, req, result) } @@ -160,7 +170,7 @@ func (s *SDLCOrchestratorService) executeAgentAction(ctx context.Context, projec agentReq := &domain.AgentRequest{ Prompt: prompt, ProjectID: project.ID, - Timeout: 10 * time.Minute, + Timeout: sdlc.ActionTimeout(cl.Action), Metadata: map[string]string{ "pod_name": project.PodName, "namespace": "rdev", @@ -191,6 +201,75 @@ func (s *SDLCOrchestratorService) executeAgentAction(ctx context.Context, projec return nil } +// executeAgentActionAsync dispatches a heavy SDLC action through the work queue. +// Returns immediately with task_id and async=true. 
+func (s *SDLCOrchestratorService) executeAgentActionAsync(ctx context.Context, projectID string, cl *sdlc.Classification, req *ExecuteRequest, result *ExecutionResult) error {
+	logger := logging.FromContext(ctx).WithService("sdlc_orchestrator")
+
+	// The build is enqueued with auto-commit and auto-push, so a clone URL is resolved first.
+	cloneURL, err := s.getProjectGitURL(ctx, projectID)
+	if err != nil {
+		return fmt.Errorf("resolve git URL for async dispatch: %w", err)
+	}
+
+	// The action's timeout tier travels with the build spec, in whole seconds.
+	timeoutSec := int(sdlc.ActionTimeout(cl.Action).Seconds())
+	action := string(cl.Action)
+
+	id, err := s.buildService.StartBuildWithSDLCContext(ctx, projectID,
+		domain.BuildSpec{
+			Prompt:         cl.NextCommand,
+			AutoCommit:     true,
+			AutoPush:       true,
+			GitCloneURL:    cloneURL,
+			TimeoutSeconds: timeoutSec,
+		},
+		map[string]any{"feature": cl.Feature, "action": action},
+	)
+	if err != nil {
+		return fmt.Errorf("enqueue async action: %w", err)
+	}
+
+	logger.Info("dispatched heavy action async",
+		logging.FieldProjectID, projectID,
+		"feature", cl.Feature,
+		"action", action,
+		"task_id", id,
+		"timeout_seconds", timeoutSec,
+	)
+
+	// Hand the queued task's ID back to the caller and mark the result as async.
+	result.TaskID = id
+	result.Async, result.Success = true, true
+	result.Output = fmt.Sprintf("Action %s dispatched asynchronously (task_id: %s)", cl.Action, id)
+	return nil
+}
+
+// getProjectGitURL retrieves the git clone URL for a project from the database.
+func (s *SDLCOrchestratorService) getProjectGitURL(ctx context.Context, projectID string) (string, error) {
+	if s.db == nil {
+		return "", fmt.Errorf("database not configured")
+	}
+
+	// Scan into a NullString so a NULL column does not make the scan itself fail.
+	var cloneURL sql.NullString
+	row := s.db.QueryRowContext(ctx, `SELECT git_clone_http FROM projects WHERE id = $1`, projectID)
+	switch err := row.Scan(&cloneURL); {
+	case err == sql.ErrNoRows:
+		// No matching row: surface the domain-level not-found error.
+		return "", domain.ErrProjectNotFound
+	case err != nil:
+		return "", fmt.Errorf("query project: %w", err)
+	}
+
+	// A row exists; a NULL or empty value means no URL has been recorded for it.
+	if cloneURL.Valid && cloneURL.String != "" {
+		return cloneURL.String, nil
+	}
+
+	return "", fmt.Errorf("project %s has no git URL configured", projectID)
+}
+
 // ResolveBlocker unblocks a feature and re-classifies. func (s *SDLCOrchestratorService) ResolveBlocker(ctx context.Context, projectID string, req *ResolveRequest) (*ExecutionResult, error) { if err := s.sdlcService.UnblockFeature(ctx, projectID, req.Feature); err != nil { diff --git a/internal/service/sdlc_orchestrator_test.go b/internal/service/sdlc_orchestrator_test.go index 4223d55..0c39c71 100644 --- a/internal/service/sdlc_orchestrator_test.go +++ b/internal/service/sdlc_orchestrator_test.go @@ -70,7 +70,7 @@ func (a *mockCodeAgent) Execute(_ context.Context, _ *domain.AgentRequest, handl func newTestOrchestrator(exec *mockSDLCExecutor, repo *mockProjectRepo, registry port.CodeAgentRegistry, committer PodGitCommitter) *SDLCOrchestratorService { sdlcSvc := NewSDLCService(exec, repo) - return NewSDLCOrchestratorService(sdlcSvc, registry, committer, repo) + return NewSDLCOrchestratorService(sdlcSvc, registry, committer, repo, nil, nil) } func TestOrchestrator_ExecuteAction_Idle(t *testing.T) { diff --git a/internal/worker/build_executor.go b/internal/worker/build_executor.go index 6c32647..ea7e52d 100644 --- a/internal/worker/build_executor.go +++ b/internal/worker/build_executor.go @@ -156,12 +156,18 @@ func (b *BuildExecutor) Execute(ctx context.Context, task *domain.WorkTask) *dom } } + // Derive agent
timeout from spec or use default + agentTimeout := TimeoutWorkExecution + if timeoutSec, ok := task.Spec["timeout_seconds"].(float64); ok && timeoutSec > 0 { + agentTimeout = time.Duration(timeoutSec) * time.Second + } + // Build the agent request with pod metadata for Claude Code adapter agentReq := &domain.AgentRequest{ Prompt: spec.Prompt, ProjectID: domain.ProjectID(task.ProjectID), WorkingDir: workDir, - Timeout: 10 * time.Minute, + Timeout: agentTimeout, Metadata: map[string]string{ "pod_name": podName, "namespace": b.namespace, diff --git a/internal/worker/http_build_executor.go b/internal/worker/http_build_executor.go index 0b549a4..aa9fe8e 100644 --- a/internal/worker/http_build_executor.go +++ b/internal/worker/http_build_executor.go @@ -125,11 +125,17 @@ func (e *HTTPBuildExecutor) Execute(ctx context.Context, task *domain.WorkTask) var output strings.Builder const maxOutputSize = 1 << 20 // 1MB + // Derive timeout from spec or use default (10 minutes) + timeoutSec := 600 + if ts, ok := task.Spec["timeout_seconds"].(float64); ok && ts > 0 { + timeoutSec = int(ts) + } + // Use streaming execution execErr := e.client.ExecuteStream(ctx, &claudeboxclient.ExecuteRequest{ Prompt: spec.Prompt, WorkingDir: e.workDir, - Timeout: 600, // 10 minutes + Timeout: timeoutSec, }, func(evt claudeboxclient.StreamEvent) { // Map event types eventType := BuildEventOutput diff --git a/internal/worker/timeouts.go b/internal/worker/timeouts.go index 2be1d04..3fa378f 100644 --- a/internal/worker/timeouts.go +++ b/internal/worker/timeouts.go @@ -24,7 +24,20 @@ const ( // 30 seconds. These may involve multiple DB operations. TimeoutMaintenance = 30 * time.Second - // TimeoutWorkExecution is for executing work items (commands, builds, agent tasks). + // TimeoutWorkExecution is the default timeout for executing work items + // (commands, builds, agent tasks). Used when no spec-level timeout is provided. // 10 minutes. Long-running operations that stream output. 
TimeoutWorkExecution = 10 * time.Minute + + // TimeoutAgentDefault is for standard agent tasks (artifact generation). + // 12 minutes. Slightly above TimeoutWorkExecution to account for overhead. + TimeoutAgentDefault = 12 * time.Minute + + // TimeoutAgentMedium is for agent tasks requiring codebase analysis (review, audit). + // 22 minutes. Matches medium SDLC action tier plus overhead. + TimeoutAgentMedium = 22 * time.Minute + + // TimeoutAgentHeavy is for long-running agent tasks (implementation, fixes, QA). + // 47 minutes. Matches heavy SDLC action tier plus overhead. + TimeoutAgentHeavy = 47 * time.Minute ) diff --git a/internal/worker/work_executor.go b/internal/worker/work_executor.go index 492e427..5cf0d06 100644 --- a/internal/worker/work_executor.go +++ b/internal/worker/work_executor.go @@ -73,7 +73,7 @@ func DefaultWorkExecutorConfig() *WorkExecutorConfig { Capabilities: []string{"build"}, PollPeriod: 5 * time.Second, HeartbeatPeriod: 30 * time.Second, - TaskTimeout: 15 * time.Minute, + TaskTimeout: 50 * time.Minute, } } @@ -244,7 +244,7 @@ func (e *WorkExecutor) tryClaimAndExecute() { "type", task.Type, ) - taskCtx, taskCancel := context.WithTimeout(e.ctx, e.taskTimeout) + taskCtx, taskCancel := context.WithTimeout(e.ctx, e.taskTimeoutFor(task)) defer taskCancel() result := e.executeTask(taskCtx, task) @@ -278,6 +278,16 @@ func (e *WorkExecutor) tryClaimAndExecute() { } }
+// taskTimeoutFor returns the spec's timeout_seconds plus headroom, or the configured default.
+// NOTE(review): only a float64 value is honored; confirm the spec always arrives JSON-decoded.
+func (e *WorkExecutor) taskTimeoutFor(task *domain.WorkTask) time.Duration {
+	secs, ok := task.Spec["timeout_seconds"].(float64)
+	if usable := ok && secs > 0; !usable {
+		return e.taskTimeout
+	}
+	return time.Duration(secs)*time.Second + 2*time.Minute // 2m headroom for git clone/push around the agent run
+}
+
 // executeTask routes a task to the appropriate handler based on its type.
func (e *WorkExecutor) executeTask(ctx context.Context, task *domain.WorkTask) *domain.BuildResult { switch task.Type {