fix: worker graceful shutdown and RWO PVC compatibility
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed

- Add WaitGroup for graceful shutdown of in-flight tasks
- Change replicas to 1 with Recreate strategy (RWO PVC limitation)
- Optimize Dockerfile: combine RUN commands for smaller layers
- Add compiled binaries to .gitignore

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
jordan 2026-02-06 00:35:00 -07:00
parent bc3b9b9e42
commit d7a6f37593
6 changed files with 84 additions and 20 deletions

6
.gitignore vendored
View File

@ -37,3 +37,9 @@ tmp/
*-deploy-key.pub *-deploy-key.pub
*-deploy-key.b64 *-deploy-key.b64
.agentive-remediation/ .agentive-remediation/
# Compiled binaries
/rdev-worker
/rdev-api
/claudebox-sidecar
/sdlc

View File

@ -7,8 +7,8 @@ WORKDIR /build
COPY go.mod go.sum ./ COPY go.mod go.sum ./
RUN go mod download RUN go mod download
COPY . . COPY . .
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o sdlc ./cmd/sdlc RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o sdlc ./cmd/sdlc && \
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o claudebox-sidecar ./cmd/claudebox-sidecar CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o claudebox-sidecar ./cmd/claudebox-sidecar
# Runtime stage # Runtime stage
FROM ubuntu:22.04 FROM ubuntu:22.04

View File

@ -8,6 +8,7 @@ import (
"os" "os"
"os/signal" "os/signal"
"strings" "strings"
"sync"
"syscall" "syscall"
"time" "time"
@ -83,11 +84,14 @@ func main() {
WorkDir: "/workspace", WorkDir: "/workspace",
}) })
// WaitGroup to track in-flight tasks for graceful shutdown
var wg sync.WaitGroup
// Start heartbeat loop // Start heartbeat loop
go runHeartbeat(ctx, apiClient, cfg.WorkerID, cfg.HeartbeatInterval, log) go runHeartbeat(ctx, apiClient, cfg.WorkerID, cfg.HeartbeatInterval, log)
// Start work loop // Start work loop
go runWorkLoop(ctx, apiClient, buildExecutor, sdlcExecutor, cfg, log) go runWorkLoop(ctx, apiClient, buildExecutor, sdlcExecutor, cfg, log, &wg)
// Wait for shutdown signal // Wait for shutdown signal
quit := make(chan os.Signal, 1) quit := make(chan os.Signal, 1)
@ -97,9 +101,18 @@ func main() {
log.Info("shutting down worker") log.Info("shutting down worker")
cancel() cancel()
// Give ongoing work a chance to complete // Wait for in-flight tasks to complete with timeout
time.Sleep(5 * time.Second) done := make(chan struct{})
log.Info("worker stopped") go func() {
wg.Wait()
close(done)
}()
select {
case <-done:
log.Info("all tasks completed, worker stopped")
case <-time.After(cfg.TaskTimeout):
log.Warn("shutdown timeout, some tasks may be incomplete")
}
} }
// Config holds worker configuration. // Config holds worker configuration.
@ -180,6 +193,7 @@ func runWorkLoop(
sdlcExecutor *worker.HTTPSDLCTaskExecutor, sdlcExecutor *worker.HTTPSDLCTaskExecutor,
cfg *Config, cfg *Config,
log *logging.Logger, log *logging.Logger,
wg *sync.WaitGroup,
) { ) {
ticker := time.NewTicker(cfg.PollInterval) ticker := time.NewTicker(cfg.PollInterval)
defer ticker.Stop() defer ticker.Stop()
@ -206,8 +220,12 @@ func runWorkLoop(
"type", task.Type, "type", task.Type,
) )
// Execute the task // Execute the task with WaitGroup tracking
executeTask(ctx, client, buildExecutor, sdlcExecutor, task, cfg, log) wg.Add(1)
go func(t *domain.WorkTask) {
defer wg.Done()
executeTask(ctx, client, buildExecutor, sdlcExecutor, t, cfg, log)
}(task)
} }
} }
} }

View File

@ -9,7 +9,11 @@ metadata:
app.kubernetes.io/name: rdev-worker app.kubernetes.io/name: rdev-worker
app.kubernetes.io/part-of: rdev app.kubernetes.io/part-of: rdev
spec: spec:
replicas: 2 replicas: 1
# Recreate strategy required: claudebox-claude-config PVC is RWO (ReadWriteOnce)
# and cannot be attached to multiple pods simultaneously
strategy:
type: Recreate
selector: selector:
matchLabels: matchLabels:
app: rdev-worker app: rdev-worker

View File

@ -117,7 +117,10 @@ func (c *Client) Execute(ctx context.Context, req *ExecuteRequest) (*ExecuteResp
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return nil, fmt.Errorf("execute returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return nil, fmt.Errorf("execute returned status %d: %s", resp.StatusCode, string(bodyBytes)) return nil, fmt.Errorf("execute returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }
@ -163,7 +166,10 @@ func (c *Client) ExecuteStream(ctx context.Context, req *ExecuteRequest, handler
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return fmt.Errorf("execute stream returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return fmt.Errorf("execute stream returned status %d: %s", resp.StatusCode, string(bodyBytes)) return fmt.Errorf("execute stream returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }
@ -233,7 +239,10 @@ func (c *Client) GitClone(ctx context.Context, cloneURL, workDir string) (*GitCl
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return nil, fmt.Errorf("git clone returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return nil, fmt.Errorf("git clone returned status %d: %s", resp.StatusCode, string(bodyBytes)) return nil, fmt.Errorf("git clone returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }
@ -288,7 +297,10 @@ func (c *Client) GitCommitAndPush(ctx context.Context, message string, push bool
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return nil, fmt.Errorf("git commit returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return nil, fmt.Errorf("git commit returned status %d: %s", resp.StatusCode, string(bodyBytes)) return nil, fmt.Errorf("git commit returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }
@ -328,7 +340,10 @@ func (c *Client) GitStatus(ctx context.Context, workDir string) (*GitStatusRespo
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return nil, fmt.Errorf("git status returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return nil, fmt.Errorf("git status returned status %d: %s", resp.StatusCode, string(bodyBytes)) return nil, fmt.Errorf("git status returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }
@ -381,7 +396,10 @@ func (c *Client) RunSDLC(ctx context.Context, command string, args []string, wor
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return nil, fmt.Errorf("sdlc returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return nil, fmt.Errorf("sdlc returned status %d: %s", resp.StatusCode, string(bodyBytes)) return nil, fmt.Errorf("sdlc returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }

View File

@ -88,7 +88,10 @@ func (c *APIClient) Register(ctx context.Context, req *RegisterRequest) error {
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusCreated { if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusCreated {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return fmt.Errorf("register returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return fmt.Errorf("register returned status %d: %s", resp.StatusCode, string(bodyBytes)) return fmt.Errorf("register returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }
@ -111,7 +114,10 @@ func (c *APIClient) Heartbeat(ctx context.Context, workerID string) error {
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return fmt.Errorf("heartbeat returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return fmt.Errorf("heartbeat returned status %d: %s", resp.StatusCode, string(bodyBytes)) return fmt.Errorf("heartbeat returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }
@ -193,7 +199,10 @@ func (c *APIClient) ClaimTask(ctx context.Context, workerID string) (*domain.Wor
} }
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return nil, fmt.Errorf("claim task returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return nil, fmt.Errorf("claim task returned status %d: %s", resp.StatusCode, string(bodyBytes)) return nil, fmt.Errorf("claim task returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }
@ -202,6 +211,9 @@ func (c *APIClient) ClaimTask(ctx context.Context, workerID string) (*domain.Wor
return nil, fmt.Errorf("decode response: %w", err) return nil, fmt.Errorf("decode response: %w", err)
} }
if result.Data.Task == nil {
return nil, fmt.Errorf("API returned success but no task data")
}
return result.Data.Task.ToWorkTask(), nil return result.Data.Task.ToWorkTask(), nil
} }
@ -251,7 +263,10 @@ func (c *APIClient) CompleteTask(ctx context.Context, workerID, taskID string, r
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return fmt.Errorf("complete task returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return fmt.Errorf("complete task returned status %d: %s", resp.StatusCode, string(bodyBytes)) return fmt.Errorf("complete task returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }
@ -292,7 +307,10 @@ func (c *APIClient) FailTask(ctx context.Context, workerID, taskID string, errMs
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
bodyBytes, _ := io.ReadAll(resp.Body) bodyBytes, readErr := io.ReadAll(resp.Body)
if readErr != nil {
return fmt.Errorf("fail task returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
}
return fmt.Errorf("fail task returned status %d: %s", resp.StatusCode, string(bodyBytes)) return fmt.Errorf("fail task returned status %d: %s", resp.StatusCode, string(bodyBytes))
} }