fix: worker graceful shutdown and RWO PVC compatibility
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
- Add WaitGroup for graceful shutdown of in-flight tasks - Change replicas to 1 with Recreate strategy (RWO PVC limitation) - Optimize Dockerfile: combine RUN commands for smaller layers - Add compiled binaries to .gitignore Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
bc3b9b9e42
commit
d7a6f37593
6
.gitignore
vendored
6
.gitignore
vendored
@ -37,3 +37,9 @@ tmp/
|
||||
*-deploy-key.pub
|
||||
*-deploy-key.b64
|
||||
.agentive-remediation/
|
||||
|
||||
# Compiled binaries
|
||||
/rdev-worker
|
||||
/rdev-api
|
||||
/claudebox-sidecar
|
||||
/sdlc
|
||||
|
||||
@ -7,8 +7,8 @@ WORKDIR /build
|
||||
COPY go.mod go.sum ./
|
||||
RUN go mod download
|
||||
COPY . .
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o sdlc ./cmd/sdlc
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o claudebox-sidecar ./cmd/claudebox-sidecar
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o sdlc ./cmd/sdlc && \
|
||||
CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o claudebox-sidecar ./cmd/claudebox-sidecar
|
||||
|
||||
# Runtime stage
|
||||
FROM ubuntu:22.04
|
||||
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"os"
|
||||
"os/signal"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
@ -83,11 +84,14 @@ func main() {
|
||||
WorkDir: "/workspace",
|
||||
})
|
||||
|
||||
// WaitGroup to track in-flight tasks for graceful shutdown
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// Start heartbeat loop
|
||||
go runHeartbeat(ctx, apiClient, cfg.WorkerID, cfg.HeartbeatInterval, log)
|
||||
|
||||
// Start work loop
|
||||
go runWorkLoop(ctx, apiClient, buildExecutor, sdlcExecutor, cfg, log)
|
||||
go runWorkLoop(ctx, apiClient, buildExecutor, sdlcExecutor, cfg, log, &wg)
|
||||
|
||||
// Wait for shutdown signal
|
||||
quit := make(chan os.Signal, 1)
|
||||
@ -97,9 +101,18 @@ func main() {
|
||||
log.Info("shutting down worker")
|
||||
cancel()
|
||||
|
||||
// Give ongoing work a chance to complete
|
||||
time.Sleep(5 * time.Second)
|
||||
log.Info("worker stopped")
|
||||
// Wait for in-flight tasks to complete with timeout
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(done)
|
||||
}()
|
||||
select {
|
||||
case <-done:
|
||||
log.Info("all tasks completed, worker stopped")
|
||||
case <-time.After(cfg.TaskTimeout):
|
||||
log.Warn("shutdown timeout, some tasks may be incomplete")
|
||||
}
|
||||
}
|
||||
|
||||
// Config holds worker configuration.
|
||||
@ -180,6 +193,7 @@ func runWorkLoop(
|
||||
sdlcExecutor *worker.HTTPSDLCTaskExecutor,
|
||||
cfg *Config,
|
||||
log *logging.Logger,
|
||||
wg *sync.WaitGroup,
|
||||
) {
|
||||
ticker := time.NewTicker(cfg.PollInterval)
|
||||
defer ticker.Stop()
|
||||
@ -206,8 +220,12 @@ func runWorkLoop(
|
||||
"type", task.Type,
|
||||
)
|
||||
|
||||
// Execute the task
|
||||
executeTask(ctx, client, buildExecutor, sdlcExecutor, task, cfg, log)
|
||||
// Execute the task with WaitGroup tracking
|
||||
wg.Add(1)
|
||||
go func(t *domain.WorkTask) {
|
||||
defer wg.Done()
|
||||
executeTask(ctx, client, buildExecutor, sdlcExecutor, t, cfg, log)
|
||||
}(task)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -9,7 +9,11 @@ metadata:
|
||||
app.kubernetes.io/name: rdev-worker
|
||||
app.kubernetes.io/part-of: rdev
|
||||
spec:
|
||||
replicas: 2
|
||||
replicas: 1
|
||||
# Recreate strategy required: claudebox-claude-config PVC is RWO (ReadWriteOnce)
|
||||
# and cannot be attached to multiple pods simultaneously
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app: rdev-worker
|
||||
|
||||
@ -117,7 +117,10 @@ func (c *Client) Execute(ctx context.Context, req *ExecuteRequest) (*ExecuteResp
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return nil, fmt.Errorf("execute returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return nil, fmt.Errorf("execute returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
@ -163,7 +166,10 @@ func (c *Client) ExecuteStream(ctx context.Context, req *ExecuteRequest, handler
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return fmt.Errorf("execute stream returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return fmt.Errorf("execute stream returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
@ -233,7 +239,10 @@ func (c *Client) GitClone(ctx context.Context, cloneURL, workDir string) (*GitCl
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return nil, fmt.Errorf("git clone returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return nil, fmt.Errorf("git clone returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
@ -288,7 +297,10 @@ func (c *Client) GitCommitAndPush(ctx context.Context, message string, push bool
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return nil, fmt.Errorf("git commit returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return nil, fmt.Errorf("git commit returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
@ -328,7 +340,10 @@ func (c *Client) GitStatus(ctx context.Context, workDir string) (*GitStatusRespo
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return nil, fmt.Errorf("git status returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return nil, fmt.Errorf("git status returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
@ -381,7 +396,10 @@ func (c *Client) RunSDLC(ctx context.Context, command string, args []string, wor
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return nil, fmt.Errorf("sdlc returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return nil, fmt.Errorf("sdlc returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
|
||||
@ -88,7 +88,10 @@ func (c *APIClient) Register(ctx context.Context, req *RegisterRequest) error {
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusCreated {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return fmt.Errorf("register returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return fmt.Errorf("register returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
@ -111,7 +114,10 @@ func (c *APIClient) Heartbeat(ctx context.Context, workerID string) error {
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return fmt.Errorf("heartbeat returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return fmt.Errorf("heartbeat returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
@ -193,7 +199,10 @@ func (c *APIClient) ClaimTask(ctx context.Context, workerID string) (*domain.Wor
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return nil, fmt.Errorf("claim task returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return nil, fmt.Errorf("claim task returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
@ -202,6 +211,9 @@ func (c *APIClient) ClaimTask(ctx context.Context, workerID string) (*domain.Wor
|
||||
return nil, fmt.Errorf("decode response: %w", err)
|
||||
}
|
||||
|
||||
if result.Data.Task == nil {
|
||||
return nil, fmt.Errorf("API returned success but no task data")
|
||||
}
|
||||
return result.Data.Task.ToWorkTask(), nil
|
||||
}
|
||||
|
||||
@ -251,7 +263,10 @@ func (c *APIClient) CompleteTask(ctx context.Context, workerID, taskID string, r
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return fmt.Errorf("complete task returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return fmt.Errorf("complete task returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
@ -292,7 +307,10 @@ func (c *APIClient) FailTask(ctx context.Context, workerID, taskID string, errMs
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
bodyBytes, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return fmt.Errorf("fail task returned status %d (failed to read body: %w)", resp.StatusCode, readErr)
|
||||
}
|
||||
return fmt.Errorf("fail task returned status %d: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user