feat: add diagnostics endpoint and external health monitoring

- Add /diagnostics endpoint for system health overview
- Add external health worker for monitoring Gitea, Woodpecker, Registry
- Add health check methods to Gitea and Woodpecker clients
- Remove hardcoded fallback projects (pantheon, aeries)
- Add diagnostics domain types and service layer
- Add comprehensive tests for diagnostics handler and service
- Fix tests to use registered test project instead of hardcoded one

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
jordan 2026-02-03 19:10:56 -07:00
parent 9128dd23b5
commit 210064d490
17 changed files with 1724 additions and 42 deletions

View File

@ -0,0 +1,49 @@
# External Health Checker
**Last Updated:** 2026-02-03
**Confidence:** High
## Summary
Background worker that continuously monitors external systems (registry, CI, git) and surfaces issues proactively via metrics, logs, and the `/ready` endpoint. Runs every 30s, caches results for instant lookups, and logs state transitions.
**Key Facts:**
- Monitors: `registry` (zot), `ci` (woodpecker), `git` (gitea)
- Check interval: 30 seconds (configurable)
- Caches results for `/ready` endpoint (no blocking network calls)
- Logs only on state changes (healthy→unhealthy, unhealthy→healthy)
- Preserves `LastHealthy` timestamp through unhealthy periods
**File Pointers:**
- Domain types: `internal/domain/external_health.go`
- Worker implementation: `internal/worker/external_health.go`
- Port interface: `internal/port/health.go:ExternalHealthChecker`
- Handler integration: `internal/handlers/health.go:WithExternalHealthChecker`
- Wiring: `cmd/rdev-api/main.go:433-455`
## How It Works
1. Background goroutine polls all configured external systems every 30s
2. Checks run in parallel with 10s timeout per system
3. Results cached in thread-safe map
4. `/ready` reads cached statuses (no network calls)
5. Prometheus metrics updated on each check cycle
**Adapter implementations:**
- Registry: `internal/adapter/zot/client.go:Check()` - calls `/v2/` endpoint
- CI: `internal/adapter/woodpecker/client.go:Check()` - calls `Self()` API
- Git: `internal/adapter/gitea/client.go:Check()` - calls `ListMyOrgs()`
## Prometheus Metrics
| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `rdev_external_system_healthy` | Gauge | `system` | 1=healthy, 0=unhealthy |
| `rdev_external_system_latency_seconds` | Gauge | `system` | Check latency |
| `rdev_external_system_last_check_timestamp` | Gauge | `system` | Unix timestamp of last check |
## Related Topics
- [Work Queue](./work-queue.md) - Uses similar background worker pattern
- [CI Provider](./ci-provider.md) - Woodpecker adapter details
- [Worker Pool](./worker-pool.md) - Another background worker example

View File

@ -418,12 +418,43 @@ func main() {
logger.Info("registry health checker initialized", "url", registryURL) logger.Info("registry health checker initialized", "url", registryURL)
} }
// Initialize diagnostics service (aggregates health data for debugging)
diagnosticsService := service.NewDiagnosticsService(
operationRepo,
registryChecker,
woodpeckerClient,
service.DiagnosticsServiceConfig{
DefaultGitOwner: infraCfg.GiteaDefaultOrg,
Logger: logger,
},
)
diagnosticsHandler := handlers.NewDiagnosticsHandler(diagnosticsService, projectRepo, logger)
// Initialize external health checker (background monitoring of registry, CI, git)
var externalHealthChecker *worker.ExternalHealthChecker
if registryChecker != nil || woodpeckerClient != nil || giteaClient != nil {
externalHealthChecker = worker.NewExternalHealthChecker(
registryChecker,
woodpeckerClient,
giteaClient,
worker.ExternalHealthConfig{
CheckInterval: 30 * time.Second,
Logger: logger,
},
)
externalHealthChecker.Start()
logger.Info("external health checker started")
}
// Override default health/ready endpoints with full dependency checks // Override default health/ready endpoints with full dependency checks
healthHandler := handlers.NewHealthHandler("rdev-api", database.DB, nil). healthHandler := handlers.NewHealthHandler("rdev-api", database.DB, nil).
WithAgentRegistry(agentRegistry) WithAgentRegistry(agentRegistry)
if registryChecker != nil { if registryChecker != nil {
healthHandler = healthHandler.WithRegistryChecker(registryChecker) healthHandler = healthHandler.WithRegistryChecker(registryChecker)
} }
if externalHealthChecker != nil {
healthHandler = healthHandler.WithExternalHealthChecker(externalHealthChecker)
}
app.Router().Get("/health", healthHandler.Health) app.Router().Get("/health", healthHandler.Health)
app.Router().Get("/ready", healthHandler.Ready) app.Router().Get("/ready", healthHandler.Ready)
@ -448,6 +479,7 @@ func main() {
buildsHandler.Mount(app.Router()) buildsHandler.Mount(app.Router())
createAndBuildHandler.Mount(app.Router()) createAndBuildHandler.Mount(app.Router())
operationsHandler.Mount(app.Router()) operationsHandler.Mount(app.Router())
diagnosticsHandler.Mount(app.Router())
sdlcHandler.Mount(app.Router()) sdlcHandler.Mount(app.Router())
sdlcOrchestratorHandler.Mount(app.Router()) sdlcOrchestratorHandler.Mount(app.Router())
@ -514,6 +546,9 @@ func main() {
app.EnableDocs(buildOpenAPISpec()) app.EnableDocs(buildOpenAPISpec())
app.OnShutdown(func(ctx context.Context) error { app.OnShutdown(func(ctx context.Context) error {
if externalHealthChecker != nil {
externalHealthChecker.Stop()
}
workExecutor.Stop() workExecutor.Stop()
queueMaintenance.Stop() queueMaintenance.Stop()
operationCleanup.Stop() operationCleanup.Stop()

View File

@ -11,18 +11,21 @@ package gitea
import ( import (
"context" "context"
"fmt" "fmt"
"time"
"code.gitea.io/sdk/gitea" "code.gitea.io/sdk/gitea"
"github.com/orchard9/rdev/internal/domain" "github.com/orchard9/rdev/internal/domain"
"github.com/orchard9/rdev/internal/port" "github.com/orchard9/rdev/internal/port"
) )
// Ensure Client implements GitRepository. // Ensure Client implements GitRepository and ExternalHealthChecker.
var _ port.GitRepository = (*Client)(nil) var _ port.GitRepository = (*Client)(nil)
var _ port.ExternalHealthChecker = (*Client)(nil)
// Client is a Gitea API client adapter. // Client is a Gitea API client adapter.
type Client struct { type Client struct {
client *gitea.Client client *gitea.Client
url string // Gitea server URL for health checks
defaultOwner string // default organization/user for new repos defaultOwner string // default organization/user for new repos
} }
@ -43,6 +46,7 @@ func NewClient(url, token, defaultOwner string) (*Client, error) {
} }
return &Client{ return &Client{
client: client, client: client,
url: url,
defaultOwner: defaultOwner, defaultOwner: defaultOwner,
}, nil }, nil
} }
@ -208,6 +212,45 @@ func (c *Client) DeleteWebhook(ctx context.Context, owner, repo string, webhookI
return nil return nil
} }
// Check returns the health status of the Gitea server.
// Implements port.ExternalHealthChecker.
func (c *Client) Check(ctx context.Context) domain.ExternalSystemStatus {
start := time.Now()
status := domain.ExternalSystemStatus{
System: domain.ExternalSystemGit,
URL: c.url,
}
// Gitea SDK doesn't support context propagation for HTTP requests,
// but check for cancellation before making the call.
select {
case <-ctx.Done():
status.Latency = time.Since(start)
status.LastChecked = time.Now().UTC()
status.Healthy = false
status.Error = ctx.Err().Error()
return status
default:
}
// Call ListMyOrgs (lightweight, tests auth)
_, _, err := c.client.ListMyOrgs(gitea.ListOrgsOptions{
ListOptions: gitea.ListOptions{PageSize: 1},
})
status.Latency = time.Since(start)
status.LastChecked = time.Now().UTC()
if err != nil {
status.Healthy = false
status.Error = err.Error()
} else {
status.Healthy = true
status.LastHealthy = status.LastChecked
}
return status
}
// repoFromGitea converts a gitea.Repository to domain.Repo. // repoFromGitea converts a gitea.Repository to domain.Repo.
func repoFromGitea(r *gitea.Repository) *domain.Repo { func repoFromGitea(r *gitea.Repository) *domain.Repo {
return &domain.Repo{ return &domain.Repo{

View File

@ -61,23 +61,9 @@ func NewProjectRepositoryWithClient(namespace string, client *kubernetes.Clients
} }
// initFallbackProjects adds hardcoded projects for when K8s client is unavailable. // initFallbackProjects adds hardcoded projects for when K8s client is unavailable.
// Currently empty - projects are discovered dynamically from K8s or stored in the database.
func (r *ProjectRepository) initFallbackProjects() { func (r *ProjectRepository) initFallbackProjects() {
r.projects["pantheon"] = &domain.Project{ // No hardcoded fallback projects
ID: "pantheon",
Name: "Pantheon",
Description: "Go API backend",
PodName: "claudebox-pantheon-0",
Status: domain.ProjectStatusUnknown,
Workspace: "/workspace",
}
r.projects["aeries"] = &domain.Project{
ID: "aeries",
Name: "Aeries",
Description: "Note community platform",
PodName: "claudebox-aeries-0",
Status: domain.ProjectStatusUnknown,
Workspace: "/workspace",
}
} }
// Ensure ProjectRepository implements port.ProjectRepository at compile time. // Ensure ProjectRepository implements port.ProjectRepository at compile time.

View File

@ -28,8 +28,9 @@ import (
"github.com/orchard9/rdev/internal/port" "github.com/orchard9/rdev/internal/port"
) )
// Ensure Client implements CIProvider. // Ensure Client implements CIProvider and ExternalHealthChecker.
var _ port.CIProvider = (*Client)(nil) var _ port.CIProvider = (*Client)(nil)
var _ port.ExternalHealthChecker = (*Client)(nil)
// tokenTransport is an http.RoundTripper that adds bearer token auth. // tokenTransport is an http.RoundTripper that adds bearer token auth.
type tokenTransport struct { type tokenTransport struct {
@ -331,6 +332,42 @@ func (c *Client) DeleteSecret(ctx context.Context, owner, repo, secretName strin
return nil return nil
} }
// Check returns the health status of the Woodpecker CI system.
// Implements port.ExternalHealthChecker.
func (c *Client) Check(ctx context.Context) domain.ExternalSystemStatus {
start := time.Now()
status := domain.ExternalSystemStatus{
System: domain.ExternalSystemCI,
URL: c.url,
}
// Check context cancellation
select {
case <-ctx.Done():
status.Latency = time.Since(start)
status.LastChecked = time.Now().UTC()
status.Healthy = false
status.Error = ctx.Err().Error()
return status
default:
}
// Call Self() to get current user info (lightweight, tests auth)
_, err := c.client.Self()
status.Latency = time.Since(start)
status.LastChecked = time.Now().UTC()
if err != nil {
status.Healthy = false
status.Error = err.Error()
} else {
status.Healthy = true
status.LastHealthy = status.LastChecked
}
return status
}
// repoFromWoodpecker converts a woodpecker.Repo to domain.CIRepo. // repoFromWoodpecker converts a woodpecker.Repo to domain.CIRepo.
func repoFromWoodpecker(r *woodpecker.Repo) *domain.CIRepo { func repoFromWoodpecker(r *woodpecker.Repo) *domain.CIRepo {
// Parse forge remote ID (string in SDK, int64 in our domain) // Parse forge remote ID (string in SDK, int64 in our domain)

View File

@ -0,0 +1,110 @@
package domain
import "time"
// ProjectDiagnostics provides a unified view of project health for debugging.
// It aggregates data from operations, CI pipelines, and registry health.
type ProjectDiagnostics struct {
// ProjectID is the project being diagnosed.
ProjectID string `json:"project_id"`
// GeneratedAt is when this diagnostic was generated.
GeneratedAt time.Time `json:"generated_at"`
// Summary is a one-line status: "healthy", "degraded", or "unhealthy".
Summary string `json:"summary"`
// Issues is a list of detected problems (empty if healthy).
Issues []DiagnosticIssue `json:"issues,omitempty"`
// RecentOperations are the last N operations for this project.
RecentOperations []OperationSummary `json:"recent_operations,omitempty"`
// CI contains CI/CD pipeline status.
CI *CIDiagnostics `json:"ci,omitempty"`
// Registry contains container registry health.
Registry *RegistryStatus `json:"registry,omitempty"`
}
// DiagnosticIssue represents a detected problem.
type DiagnosticIssue struct {
// Severity: "error", "warning", "info"
Severity string `json:"severity"`
// Source: "operation", "ci", "registry"
Source string `json:"source"`
// Message is a human-readable description.
Message string `json:"message"`
// Details provides additional context (e.g., error output).
Details string `json:"details,omitempty"`
// Timestamp is when the issue occurred.
Timestamp time.Time `json:"timestamp,omitempty"`
}
// OperationSummary is a condensed view of an operation for diagnostics.
type OperationSummary struct {
ID string `json:"id"`
Type OperationType `json:"type"`
Status OperationStatus `json:"status"`
StartedAt time.Time `json:"started_at"`
DurationMs int64 `json:"duration_ms,omitempty"`
Error string `json:"error,omitempty"`
ExternalRef string `json:"external_ref,omitempty"`
}
// CIDiagnostics contains CI pipeline health information.
type CIDiagnostics struct {
// Available indicates if CI is configured for this project.
Available bool `json:"available"`
// RecentPipelines are the last N pipeline executions.
RecentPipelines []CIPipelineSummary `json:"recent_pipelines,omitempty"`
// LastFailure contains details of the most recent failed pipeline.
LastFailure *CIPipelineFailure `json:"last_failure,omitempty"`
}
// CIPipelineSummary is a condensed view of a pipeline for diagnostics.
type CIPipelineSummary struct {
Number int64 `json:"number"`
Status string `json:"status"`
Branch string `json:"branch"`
Commit string `json:"commit"`
StartedAt time.Time `json:"started_at"`
Duration string `json:"duration,omitempty"`
}
// CIPipelineFailure contains details about a failed pipeline.
type CIPipelineFailure struct {
Number int64 `json:"number"`
FailedStep string `json:"failed_step"`
Error string `json:"error,omitempty"`
LogTail string `json:"log_tail,omitempty"`
URL string `json:"url,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
// DiagnosticsSummary constants.
const (
DiagnosticsSummaryHealthy = "healthy"
DiagnosticsSummaryDegraded = "degraded"
DiagnosticsSummaryUnhealthy = "unhealthy"
)
// DiagnosticSeverity constants.
const (
DiagnosticSeverityError = "error"
DiagnosticSeverityWarning = "warning"
DiagnosticSeverityInfo = "info"
)
// DiagnosticSource constants.
const (
DiagnosticSourceOperation = "operation"
DiagnosticSourceCI = "ci"
DiagnosticSourceRegistry = "registry"
)

View File

@ -0,0 +1,23 @@
package domain
import "time"
// ExternalSystem identifies an external dependency.
type ExternalSystem string
const (
ExternalSystemRegistry ExternalSystem = "registry"
ExternalSystemCI ExternalSystem = "ci"
ExternalSystemGit ExternalSystem = "git"
)
// ExternalSystemStatus represents the health of an external system.
type ExternalSystemStatus struct {
System ExternalSystem `json:"system"`
Healthy bool `json:"healthy"`
URL string `json:"url"`
Latency time.Duration `json:"latency"`
Error string `json:"error,omitempty"`
LastChecked time.Time `json:"last_checked"`
LastHealthy time.Time `json:"last_healthy,omitempty"`
}

View File

@ -0,0 +1,87 @@
// Package handlers provides HTTP handlers for the rdev API.
package handlers
import (
"context"
"log/slog"
"net/http"
"github.com/go-chi/chi/v5"
"github.com/orchard9/rdev/internal/auth"
"github.com/orchard9/rdev/internal/domain"
"github.com/orchard9/rdev/internal/port"
"github.com/orchard9/rdev/pkg/api"
)
// DiagnosticsGetter retrieves project diagnostics.
type DiagnosticsGetter interface {
GetDiagnostics(ctx context.Context, projectID string) (*domain.ProjectDiagnostics, error)
}
// DiagnosticsHandler handles project diagnostics requests.
type DiagnosticsHandler struct {
diagnostics DiagnosticsGetter
projects port.ProjectRepository
logger *slog.Logger
}
// NewDiagnosticsHandler creates a new diagnostics handler.
func NewDiagnosticsHandler(
diagnostics DiagnosticsGetter,
projects port.ProjectRepository,
logger *slog.Logger,
) *DiagnosticsHandler {
if logger == nil {
logger = slog.Default()
}
return &DiagnosticsHandler{
diagnostics: diagnostics,
projects: projects,
logger: logger,
}
}
// Mount registers the diagnostics routes.
func (h *DiagnosticsHandler) Mount(r api.Router) {
r.Route("/projects/{projectId}/diagnostics", func(r chi.Router) {
r.With(auth.RequireScope(auth.ScopeProjectsRead, auth.ScopeAdmin)).Get("/", h.GetDiagnostics)
})
}
// GetDiagnostics returns comprehensive health information for a project.
// GET /projects/{projectId}/diagnostics
func (h *DiagnosticsHandler) GetDiagnostics(w http.ResponseWriter, r *http.Request) {
ctx, cancel := context.WithTimeout(r.Context(), TimeoutStandard)
defer cancel()
projectID := chi.URLParam(r, "projectId")
if projectID == "" {
api.WriteBadRequest(w, r, "project ID is required")
return
}
// Verify project exists (optional - diagnostics can still be useful for non-k8s projects)
if h.projects != nil {
if _, err := h.projects.Get(ctx, domain.ProjectID(projectID)); err != nil {
if err == domain.ErrProjectNotFound {
// Log but continue - the project might exist in git/CI but not as a k8s pod
h.logger.Debug("project not found in k8s, continuing with diagnostics",
"project_id", projectID,
)
}
}
}
diag, err := h.diagnostics.GetDiagnostics(ctx, projectID)
if err != nil {
h.logger.Error("failed to get diagnostics",
"error", err,
"project_id", projectID,
)
api.WriteInternalError(w, r, "failed to retrieve diagnostics")
return
}
api.WriteSuccess(w, r, diag)
}

View File

@ -0,0 +1,148 @@
package handlers
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/go-chi/chi/v5"
"github.com/orchard9/rdev/internal/domain"
)
// mockDiagnosticsGetter implements DiagnosticsGetter for testing.
type mockDiagnosticsGetter struct {
diagnostics *domain.ProjectDiagnostics
err error
}
func (m *mockDiagnosticsGetter) GetDiagnostics(_ context.Context, projectID string) (*domain.ProjectDiagnostics, error) {
if m.err != nil {
return nil, m.err
}
if m.diagnostics != nil {
return m.diagnostics, nil
}
// Return default healthy diagnostics
return &domain.ProjectDiagnostics{
ProjectID: projectID,
GeneratedAt: time.Now().UTC(),
Summary: domain.DiagnosticsSummaryHealthy,
Issues: []domain.DiagnosticIssue{},
}, nil
}
func TestDiagnosticsHandler_GetDiagnostics_Success(t *testing.T) {
getter := &mockDiagnosticsGetter{}
projects := newMockProjectRepo()
h := NewDiagnosticsHandler(getter, projects, nil)
// Create router with chi to handle URL params
r := chi.NewRouter()
r.Use(testAdminAuth) // Add auth context for tests
h.Mount(r)
req := httptest.NewRequest("GET", "/projects/test-project/diagnostics/", nil)
rec := httptest.NewRecorder()
r.ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Errorf("GetDiagnostics() status = %d, want %d; body = %s", rec.Code, http.StatusOK, rec.Body.String())
}
var resp map[string]any
if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
data, ok := resp["data"].(map[string]any)
if !ok {
t.Fatalf("response missing data field")
}
if data["project_id"] != "test-project" {
t.Errorf("project_id = %q, want %q", data["project_id"], "test-project")
}
if data["summary"] != domain.DiagnosticsSummaryHealthy {
t.Errorf("summary = %q, want %q", data["summary"], domain.DiagnosticsSummaryHealthy)
}
}
func TestDiagnosticsHandler_GetDiagnostics_WithIssues(t *testing.T) {
getter := &mockDiagnosticsGetter{
diagnostics: &domain.ProjectDiagnostics{
ProjectID: "unhealthy-project",
GeneratedAt: time.Now().UTC(),
Summary: domain.DiagnosticsSummaryUnhealthy,
Issues: []domain.DiagnosticIssue{
{
Severity: domain.DiagnosticSeverityError,
Source: domain.DiagnosticSourceCI,
Message: "CI build #42 failed",
},
{
Severity: domain.DiagnosticSeverityWarning,
Source: domain.DiagnosticSourceRegistry,
Message: "Container registry slow",
},
},
},
}
projects := newMockProjectRepo()
h := NewDiagnosticsHandler(getter, projects, nil)
r := chi.NewRouter()
r.Use(testAdminAuth) // Add auth context for tests
h.Mount(r)
req := httptest.NewRequest("GET", "/projects/unhealthy-project/diagnostics/", nil)
rec := httptest.NewRecorder()
r.ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Errorf("GetDiagnostics() status = %d, want %d", rec.Code, http.StatusOK)
}
var resp map[string]any
if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
data, ok := resp["data"].(map[string]any)
if !ok {
t.Fatalf("response missing data field")
}
if data["summary"] != domain.DiagnosticsSummaryUnhealthy {
t.Errorf("summary = %q, want %q", data["summary"], domain.DiagnosticsSummaryUnhealthy)
}
issues, ok := data["issues"].([]any)
if !ok {
t.Fatalf("response missing issues field")
}
if len(issues) != 2 {
t.Errorf("issues count = %d, want %d", len(issues), 2)
}
}
func TestDiagnosticsHandler_GetDiagnostics_MissingProjectID(t *testing.T) {
getter := &mockDiagnosticsGetter{}
projects := newMockProjectRepo()
h := NewDiagnosticsHandler(getter, projects, nil)
// Direct call without chi router to test missing projectId
req := httptest.NewRequest("GET", "/projects//diagnostics/", nil)
rec := httptest.NewRecorder()
h.GetDiagnostics(rec, req)
if rec.Code != http.StatusBadRequest {
t.Errorf("GetDiagnostics() status = %d, want %d", rec.Code, http.StatusBadRequest)
}
}

View File

@ -8,6 +8,7 @@ import (
"strings" "strings"
"time" "time"
"github.com/orchard9/rdev/internal/domain"
"github.com/orchard9/rdev/internal/metrics" "github.com/orchard9/rdev/internal/metrics"
"github.com/orchard9/rdev/internal/port" "github.com/orchard9/rdev/internal/port"
"github.com/orchard9/rdev/pkg/api" "github.com/orchard9/rdev/pkg/api"
@ -19,6 +20,11 @@ type ExecutorHealthChecker interface {
WorkerID() string WorkerID() string
} }
// ExternalHealthStatusProvider provides cached external system health statuses.
type ExternalHealthStatusProvider interface {
GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus
}
// HealthHandler handles health and readiness checks. // HealthHandler handles health and readiness checks.
type HealthHandler struct { type HealthHandler struct {
serviceName string serviceName string
@ -27,6 +33,7 @@ type HealthHandler struct {
agentRegistry port.CodeAgentRegistry agentRegistry port.CodeAgentRegistry
workExecutor ExecutorHealthChecker workExecutor ExecutorHealthChecker
registryChecker port.RegistryChecker registryChecker port.RegistryChecker
externalChecker ExternalHealthStatusProvider
} }
// NewHealthHandler creates a new health handler with dependencies. // NewHealthHandler creates a new health handler with dependencies.
@ -56,6 +63,12 @@ func (h *HealthHandler) WithRegistryChecker(checker port.RegistryChecker) *Healt
return h return h
} }
// WithExternalHealthChecker adds a cached external health checker for monitoring.
func (h *HealthHandler) WithExternalHealthChecker(checker ExternalHealthStatusProvider) *HealthHandler {
h.externalChecker = checker
return h
}
// Health returns a simple liveness check. // Health returns a simple liveness check.
// This should be lightweight and only fail if the process is unhealthy. // This should be lightweight and only fail if the process is unhealthy.
// GET /health // GET /health
@ -113,6 +126,26 @@ func (h *HealthHandler) Ready(w http.ResponseWriter, r *http.Request) {
checks["registry"] = h.checkRegistry(ctx) checks["registry"] = h.checkRegistry(ctx)
} }
// External system checks (cached, from background worker)
if h.externalChecker != nil {
for system, status := range h.externalChecker.GetAllStatuses() {
checks["external:"+string(system)] = CheckResult{
Healthy: status.Healthy,
Message: status.Error,
Latency: status.Latency.String(),
LastCheck: status.LastChecked,
}
if status.Healthy {
checks["external:"+string(system)] = CheckResult{
Healthy: true,
Message: "connected",
Latency: status.Latency.String(),
LastCheck: status.LastChecked,
}
}
}
}
response := ReadinessResponse{ response := ReadinessResponse{
Status: "ready", Status: "ready",
Service: h.serviceName, Service: h.serviceName,

View File

@ -2,6 +2,7 @@ package handlers
import ( import (
"bytes" "bytes"
"context"
"encoding/json" "encoding/json"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
@ -10,11 +11,22 @@ import (
"github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5"
"github.com/orchard9/rdev/internal/adapter/kubernetes" "github.com/orchard9/rdev/internal/adapter/kubernetes"
"github.com/orchard9/rdev/internal/domain"
) )
// newTestProjectsHandler creates a ProjectsHandler for testing. // newTestProjectsHandler creates a ProjectsHandler for testing.
// It registers a test project "test-project" for use in tests.
func newTestProjectsHandler() *ProjectsHandler { func newTestProjectsHandler() *ProjectsHandler {
repo := kubernetes.NewProjectRepository("test-namespace") repo := kubernetes.NewProjectRepository("test-namespace")
// Register a test project for tests to use
_ = repo.Register(context.Background(), &domain.Project{
ID: "test-project",
Name: "Test Project",
Description: "Test project for unit tests",
PodName: "test-project-pod-0",
Status: domain.ProjectStatusRunning,
Workspace: "/workspace",
})
exec := kubernetes.NewExecutor("test-namespace") exec := kubernetes.NewExecutor("test-namespace")
return NewProjectsHandler(repo, exec) return NewProjectsHandler(repo, exec)
} }
@ -61,7 +73,7 @@ func TestProjectsHandler_Get(t *testing.T) {
projectID string projectID string
wantStatus int wantStatus int
}{ }{
{"existing project", "pantheon", http.StatusOK}, {"existing project", "test-project", http.StatusOK},
{"non-existent project", "nonexistent", http.StatusNotFound}, {"non-existent project", "nonexistent", http.StatusNotFound},
} }
@ -95,7 +107,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
}{ }{
{ {
name: "valid request", name: "valid request",
projectID: "pantheon", projectID: "test-project",
body: ClaudeRequest{ body: ClaudeRequest{
Prompt: "Hello, world!", Prompt: "Hello, world!",
}, },
@ -103,7 +115,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
}, },
{ {
name: "missing prompt", name: "missing prompt",
projectID: "pantheon", projectID: "test-project",
body: ClaudeRequest{ body: ClaudeRequest{
Prompt: "", Prompt: "",
}, },
@ -118,7 +130,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
}, },
{ {
name: "null byte in prompt", name: "null byte in prompt",
projectID: "pantheon", projectID: "test-project",
body: ClaudeRequest{ body: ClaudeRequest{
Prompt: "Hello\x00World", Prompt: "Hello\x00World",
}, },
@ -127,7 +139,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
}, },
{ {
name: "invalid stream ID", name: "invalid stream ID",
projectID: "pantheon", projectID: "test-project",
body: ClaudeRequest{ body: ClaudeRequest{
Prompt: "Hello", Prompt: "Hello",
StreamID: "invalid stream id with spaces", StreamID: "invalid stream id with spaces",
@ -175,7 +187,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
}{ }{
{ {
name: "valid command", name: "valid command",
projectID: "pantheon", projectID: "test-project",
body: ShellRequest{ body: ShellRequest{
Command: "ls -la", Command: "ls -la",
}, },
@ -183,7 +195,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
}, },
{ {
name: "missing command", name: "missing command",
projectID: "pantheon", projectID: "test-project",
body: ShellRequest{ body: ShellRequest{
Command: "", Command: "",
}, },
@ -192,7 +204,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
}, },
{ {
name: "dangerous command with semicolon", name: "dangerous command with semicolon",
projectID: "pantheon", projectID: "test-project",
body: ShellRequest{ body: ShellRequest{
Command: "ls; rm -rf /", Command: "ls; rm -rf /",
}, },
@ -201,7 +213,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
}, },
{ {
name: "dangerous command with pipe", name: "dangerous command with pipe",
projectID: "pantheon", projectID: "test-project",
body: ShellRequest{ body: ShellRequest{
Command: "cat /etc/passwd | grep root", Command: "cat /etc/passwd | grep root",
}, },
@ -210,7 +222,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
}, },
{ {
name: "command substitution", name: "command substitution",
projectID: "pantheon", projectID: "test-project",
body: ShellRequest{ body: ShellRequest{
Command: "echo $(whoami)", Command: "echo $(whoami)",
}, },
@ -219,7 +231,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
}, },
{ {
name: "redirect", name: "redirect",
projectID: "pantheon", projectID: "test-project",
body: ShellRequest{ body: ShellRequest{
Command: "ls > /tmp/out.txt", Command: "ls > /tmp/out.txt",
}, },
@ -228,7 +240,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
}, },
{ {
name: "rm rf root", name: "rm rf root",
projectID: "pantheon", projectID: "test-project",
body: ShellRequest{ body: ShellRequest{
Command: "rm -rf /", Command: "rm -rf /",
}, },
@ -281,7 +293,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
}{ }{
{ {
name: "valid git status", name: "valid git status",
projectID: "pantheon", projectID: "test-project",
body: GitRequest{ body: GitRequest{
Args: []string{"status"}, Args: []string{"status"},
}, },
@ -289,7 +301,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
}, },
{ {
name: "valid git log", name: "valid git log",
projectID: "pantheon", projectID: "test-project",
body: GitRequest{ body: GitRequest{
Args: []string{"log", "--oneline", "-10"}, Args: []string{"log", "--oneline", "-10"},
}, },
@ -297,7 +309,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
}, },
{ {
name: "missing args", name: "missing args",
projectID: "pantheon", projectID: "test-project",
body: GitRequest{ body: GitRequest{
Args: []string{}, Args: []string{},
}, },
@ -306,7 +318,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
}, },
{ {
name: "git config blocked", name: "git config blocked",
projectID: "pantheon", projectID: "test-project",
body: GitRequest{ body: GitRequest{
Args: []string{"config", "--global", "user.name", "attacker"}, Args: []string{"config", "--global", "user.name", "attacker"},
}, },
@ -315,7 +327,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
}, },
{ {
name: "git remote blocked", name: "git remote blocked",
projectID: "pantheon", projectID: "test-project",
body: GitRequest{ body: GitRequest{
Args: []string{"remote", "add", "evil", "https://evil.com/repo"}, Args: []string{"remote", "add", "evil", "https://evil.com/repo"},
}, },
@ -324,7 +336,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
}, },
{ {
name: "force push blocked", name: "force push blocked",
projectID: "pantheon", projectID: "test-project",
body: GitRequest{ body: GitRequest{
Args: []string{"push", "-f", "origin", "main"}, Args: []string{"push", "-f", "origin", "main"},
}, },
@ -394,9 +406,9 @@ func TestProjectsHandler_InvalidJSON(t *testing.T) {
method string method string
path string path string
}{ }{
{"POST", "/projects/pantheon/claude"}, {"POST", "/projects/test-project/claude"},
{"POST", "/projects/pantheon/shell"}, {"POST", "/projects/test-project/shell"},
{"POST", "/projects/pantheon/git"}, {"POST", "/projects/test-project/git"},
} }
for _, ep := range endpoints { for _, ep := range endpoints {
@ -429,12 +441,12 @@ func TestCommandIDGeneration(t *testing.T) {
body := ClaudeRequest{Prompt: "test"} body := ClaudeRequest{Prompt: "test"}
bodyBytes, _ := json.Marshal(body) bodyBytes, _ := json.Marshal(body)
req1 := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes)) req1 := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes))
req1.Header.Set("Content-Type", "application/json") req1.Header.Set("Content-Type", "application/json")
rec1 := httptest.NewRecorder() rec1 := httptest.NewRecorder()
router.ServeHTTP(rec1, req1) router.ServeHTTP(rec1, req1)
req2 := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes)) req2 := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes))
req2.Header.Set("Content-Type", "application/json") req2.Header.Set("Content-Type", "application/json")
rec2 := httptest.NewRecorder() rec2 := httptest.NewRecorder()
router.ServeHTTP(rec2, req2) router.ServeHTTP(rec2, req2)
@ -465,7 +477,7 @@ func TestCustomStreamID(t *testing.T) {
} }
bodyBytes, _ := json.Marshal(body) bodyBytes, _ := json.Marshal(body)
req := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes)) req := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json") req.Header.Set("Content-Type", "application/json")
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
router.ServeHTTP(rec, req) router.ServeHTTP(rec, req)

View File

@ -142,6 +142,22 @@ var (
Name: "rdev_ci_push_failures_total", Name: "rdev_ci_push_failures_total",
Help: "Total number of CI image push failures by project", Help: "Total number of CI image push failures by project",
}, []string{"project"}) }, []string{"project"})
// External system health
externalSystemHealthy = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "rdev_external_system_healthy",
Help: "Whether external system is healthy (1) or not (0)",
}, []string{"system"})
externalSystemLatency = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "rdev_external_system_latency_seconds",
Help: "Latency of external system health check in seconds",
}, []string{"system"})
externalSystemLastCheck = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "rdev_external_system_last_check_timestamp",
Help: "Unix timestamp of last health check",
}, []string{"system"})
) )
// RecordCommand records a command execution. // RecordCommand records a command execution.
@ -248,6 +264,17 @@ func RecordCIPushFailure(project string) {
ciPushFailures.WithLabelValues(project).Inc() ciPushFailures.WithLabelValues(project).Inc()
} }
// SetExternalSystemHealth updates the health metrics for an external system.
func SetExternalSystemHealth(system string, healthy bool, latencySeconds float64) {
val := 0.0
if healthy {
val = 1.0
}
externalSystemHealthy.WithLabelValues(system).Set(val)
externalSystemLatency.WithLabelValues(system).Set(latencySeconds)
externalSystemLastCheck.WithLabelValues(system).Set(float64(time.Now().Unix()))
}
// Handler returns the Prometheus HTTP handler. // Handler returns the Prometheus HTTP handler.
func Handler() http.Handler { func Handler() http.Handler {
return promhttp.Handler() return promhttp.Handler()

View File

@ -23,3 +23,9 @@ type RegistryChecker interface {
// Check returns the health status of the registry. // Check returns the health status of the registry.
Check(ctx context.Context) domain.RegistryStatus Check(ctx context.Context) domain.RegistryStatus
} }
// ExternalHealthChecker checks an external system's health.
type ExternalHealthChecker interface {
// Check returns the health status of the external system.
Check(ctx context.Context) domain.ExternalSystemStatus
}

View File

@ -0,0 +1,295 @@
// Package service provides business logic services.
package service
import (
"context"
"fmt"
"log/slog"
"time"
"github.com/orchard9/rdev/internal/domain"
"github.com/orchard9/rdev/internal/port"
)
// DiagnosticsServiceConfig configures the diagnostics service.
type DiagnosticsServiceConfig struct {
// DefaultGitOwner is the git organization for CI lookups.
DefaultGitOwner string
// MaxRecentOperations is how many operations to include.
MaxRecentOperations int
// MaxRecentPipelines is how many pipelines to include.
MaxRecentPipelines int
Logger *slog.Logger
}
// DiagnosticsService aggregates project health information from multiple sources.
type DiagnosticsService struct {
operationRepo port.OperationRepository
registryChecker port.RegistryChecker
ciProvider port.CIProvider
defaultGitOwner string
maxRecentOperations int
maxRecentPipelines int
logger *slog.Logger
}
// NewDiagnosticsService creates a new diagnostics service.
func NewDiagnosticsService(
operationRepo port.OperationRepository,
registryChecker port.RegistryChecker,
ciProvider port.CIProvider,
cfg DiagnosticsServiceConfig,
) *DiagnosticsService {
logger := cfg.Logger
if logger == nil {
logger = slog.Default()
}
maxOps := cfg.MaxRecentOperations
if maxOps <= 0 {
maxOps = 10
}
maxPipelines := cfg.MaxRecentPipelines
if maxPipelines <= 0 {
maxPipelines = 5
}
return &DiagnosticsService{
operationRepo: operationRepo,
registryChecker: registryChecker,
ciProvider: ciProvider,
defaultGitOwner: cfg.DefaultGitOwner,
maxRecentOperations: maxOps,
maxRecentPipelines: maxPipelines,
logger: logger.With("service", "diagnostics"),
}
}
// GetDiagnostics returns comprehensive health information for a project.
func (s *DiagnosticsService) GetDiagnostics(ctx context.Context, projectID string) (*domain.ProjectDiagnostics, error) {
diag := &domain.ProjectDiagnostics{
ProjectID: projectID,
GeneratedAt: time.Now().UTC(),
Summary: domain.DiagnosticsSummaryHealthy,
Issues: []domain.DiagnosticIssue{},
}
// Collect data from each source (don't fail if one source fails)
s.collectOperations(ctx, projectID, diag)
s.collectRegistryHealth(ctx, diag)
s.collectCIStatus(ctx, projectID, diag)
// Determine overall summary
s.calculateSummary(diag)
return diag, nil
}
// collectOperations fetches recent operations and extracts issues.
func (s *DiagnosticsService) collectOperations(ctx context.Context, projectID string, diag *domain.ProjectDiagnostics) {
filter := domain.OperationFilters{
ProjectID: projectID,
Limit: s.maxRecentOperations,
}
ops, err := s.operationRepo.List(ctx, filter)
if err != nil {
s.logger.Warn("failed to fetch operations for diagnostics",
"error", err,
"project_id", projectID,
)
diag.Issues = append(diag.Issues, domain.DiagnosticIssue{
Severity: domain.DiagnosticSeverityWarning,
Source: domain.DiagnosticSourceOperation,
Message: "Unable to fetch operation history",
Details: err.Error(),
})
return
}
// Convert to summaries
for _, op := range ops {
summary := domain.OperationSummary{
ID: op.ID,
Type: op.Type,
Status: op.Status,
StartedAt: op.StartedAt,
DurationMs: op.DurationMs,
Error: op.Error,
ExternalRef: op.ExternalRef,
}
diag.RecentOperations = append(diag.RecentOperations, summary)
// Extract issues from failed operations
if op.Status == domain.OperationStatusFailed {
issue := domain.DiagnosticIssue{
Severity: domain.DiagnosticSeverityError,
Source: domain.DiagnosticSourceOperation,
Message: fmt.Sprintf("%s operation failed", op.Type),
Timestamp: op.StartedAt,
}
if op.Error != "" {
issue.Details = op.Error
}
if op.ExternalRef != "" {
issue.Message += fmt.Sprintf(" (%s)", op.ExternalRef)
}
diag.Issues = append(diag.Issues, issue)
}
}
}
// collectRegistryHealth checks registry status.
func (s *DiagnosticsService) collectRegistryHealth(ctx context.Context, diag *domain.ProjectDiagnostics) {
if s.registryChecker == nil {
return
}
status := s.registryChecker.Check(ctx)
diag.Registry = &status
if !status.Healthy {
diag.Issues = append(diag.Issues, domain.DiagnosticIssue{
Severity: domain.DiagnosticSeverityError,
Source: domain.DiagnosticSourceRegistry,
Message: "Container registry unhealthy",
Details: status.Error,
Timestamp: status.LastChecked,
})
}
}
// collectCIStatus fetches CI pipeline information.
func (s *DiagnosticsService) collectCIStatus(ctx context.Context, projectID string, diag *domain.ProjectDiagnostics) {
if s.ciProvider == nil {
diag.CI = &domain.CIDiagnostics{Available: false}
return
}
owner := s.defaultGitOwner
if owner == "" {
owner = "jordan" // fallback
}
ciDiag := &domain.CIDiagnostics{Available: true}
pipelines, err := s.ciProvider.ListPipelines(ctx, owner, projectID)
if err != nil {
s.logger.Warn("failed to fetch pipelines for diagnostics",
"error", err,
"project_id", projectID,
)
ciDiag.Available = false
diag.CI = ciDiag
return
}
// Convert to summaries and find failures
var lastFailure *domain.CIPipeline
for i, p := range pipelines {
if i >= s.maxRecentPipelines {
break
}
summary := domain.CIPipelineSummary{
Number: p.Number,
Status: p.Status,
Branch: p.Branch,
Commit: p.Commit,
StartedAt: p.Started,
}
if p.Finished.After(p.Started) {
summary.Duration = p.Finished.Sub(p.Started).Round(time.Second).String()
}
ciDiag.RecentPipelines = append(ciDiag.RecentPipelines, summary)
// Track the most recent failure
if p.Status == "failure" && lastFailure == nil {
lastFailure = p
}
}
// Get details on the last failure
if lastFailure != nil {
failure := s.getFailureDetails(ctx, owner, projectID, lastFailure)
ciDiag.LastFailure = failure
// Add as issue
issue := domain.DiagnosticIssue{
Severity: domain.DiagnosticSeverityError,
Source: domain.DiagnosticSourceCI,
Message: fmt.Sprintf("CI build #%d failed", lastFailure.Number),
Timestamp: lastFailure.Finished,
}
if failure != nil && failure.FailedStep != "" {
issue.Message += fmt.Sprintf(" at step '%s'", failure.FailedStep)
if failure.Error != "" {
issue.Details = failure.Error
}
}
diag.Issues = append(diag.Issues, issue)
}
diag.CI = ciDiag
}
// getFailureDetails fetches step-level details for a failed pipeline.
func (s *DiagnosticsService) getFailureDetails(ctx context.Context, owner, repo string, pipeline *domain.CIPipeline) *domain.CIPipelineFailure {
failure := &domain.CIPipelineFailure{
Number: pipeline.Number,
Timestamp: pipeline.Finished,
}
steps, err := s.ciProvider.GetPipelineSteps(ctx, owner, repo, pipeline.Number)
if err != nil {
s.logger.Warn("failed to fetch pipeline steps",
"error", err,
"pipeline", pipeline.Number,
)
return failure
}
failure.URL = steps.URL
// Find the failed step
for _, step := range steps.Steps {
if step.Status == "failure" || step.Status == "error" {
failure.FailedStep = step.Name
failure.Error = step.Error
if step.Log != "" {
failure.LogTail = step.Log
}
break
}
}
return failure
}
// calculateSummary determines the overall health status.
func (s *DiagnosticsService) calculateSummary(diag *domain.ProjectDiagnostics) {
errorCount := 0
warningCount := 0
for _, issue := range diag.Issues {
switch issue.Severity {
case domain.DiagnosticSeverityError:
errorCount++
case domain.DiagnosticSeverityWarning:
warningCount++
}
}
if errorCount > 0 {
diag.Summary = domain.DiagnosticsSummaryUnhealthy
} else if warningCount > 0 {
diag.Summary = domain.DiagnosticsSummaryDegraded
} else {
diag.Summary = domain.DiagnosticsSummaryHealthy
}
}

View File

@ -0,0 +1,302 @@
package service
import (
"context"
"testing"
"time"
"github.com/orchard9/rdev/internal/domain"
)
// mockOperationRepo implements port.OperationRepository for testing.
type mockOperationRepo struct {
operations []*domain.Operation
err error
}
func (m *mockOperationRepo) Create(_ context.Context, _ *domain.Operation) error {
return nil
}
func (m *mockOperationRepo) Update(_ context.Context, _ *domain.Operation) error {
return nil
}
func (m *mockOperationRepo) Get(_ context.Context, _ string) (*domain.Operation, error) {
return nil, domain.ErrOperationNotFound
}
func (m *mockOperationRepo) GetByCommitSHA(_ context.Context, _, _ string) (*domain.Operation, error) {
return nil, domain.ErrOperationNotFound
}
func (m *mockOperationRepo) List(_ context.Context, filter domain.OperationFilters) ([]*domain.Operation, error) {
if m.err != nil {
return nil, m.err
}
var result []*domain.Operation
for _, op := range m.operations {
if filter.ProjectID != "" && op.ProjectID != filter.ProjectID {
continue
}
result = append(result, op)
}
return result, nil
}
func (m *mockOperationRepo) AddStep(_ context.Context, _ string, _ domain.OperationStep) error {
return nil
}
func (m *mockOperationRepo) UpdateStep(_ context.Context, _ string, _ domain.OperationStep) error {
return nil
}
func (m *mockOperationRepo) Complete(_ context.Context, _ string, _ domain.OperationStatus, _ map[string]any, _, _ string) error {
return nil
}
func (m *mockOperationRepo) SetCommitSHA(_ context.Context, _, _ string) error {
return nil
}
func (m *mockOperationRepo) SetTriggeredBy(_ context.Context, _, _ string) error {
return nil
}
func (m *mockOperationRepo) DeleteOlderThan(_ context.Context, _ time.Time) (int64, error) {
return 0, nil
}
// mockRegistryChecker implements port.RegistryChecker for testing.
type mockRegistryChecker struct {
status domain.RegistryStatus
}
func (m *mockRegistryChecker) Check(_ context.Context) domain.RegistryStatus {
return m.status
}
// mockCIProvider implements port.CIProvider for testing.
type mockCIProvider struct {
pipelines []*domain.CIPipeline
steps *domain.CIPipelineSteps
err error
}
func (m *mockCIProvider) ActivateRepo(_ context.Context, _, _, _ string) (*domain.CIRepo, error) {
return nil, nil
}
func (m *mockCIProvider) DeactivateRepo(_ context.Context, _, _ string) error {
return nil
}
func (m *mockCIProvider) GetRepo(_ context.Context, _, _ string) (*domain.CIRepo, error) {
return nil, nil
}
func (m *mockCIProvider) ListRepos(_ context.Context) ([]*domain.CIRepo, error) {
return nil, nil
}
func (m *mockCIProvider) AddSecret(_ context.Context, _, _ string, _ domain.CISecret) error {
return nil
}
func (m *mockCIProvider) DeleteSecret(_ context.Context, _, _, _ string) error {
return nil
}
func (m *mockCIProvider) ListPipelines(_ context.Context, _, _ string) ([]*domain.CIPipeline, error) {
if m.err != nil {
return nil, m.err
}
return m.pipelines, nil
}
func (m *mockCIProvider) GetPipeline(_ context.Context, _, _ string, _ int64) (*domain.CIPipeline, error) {
return nil, nil
}
func (m *mockCIProvider) GetPipelineSteps(_ context.Context, _, _ string, _ int64) (*domain.CIPipelineSteps, error) {
if m.steps != nil {
return m.steps, nil
}
return nil, nil
}
func (m *mockCIProvider) TriggerBuild(_ context.Context, _, _, _ string) (int64, error) {
return 0, nil
}
func TestDiagnosticsService_GetDiagnostics_Healthy(t *testing.T) {
opRepo := &mockOperationRepo{
operations: []*domain.Operation{
{
ID: "op-1",
ProjectID: "test-project",
Type: domain.OperationTypeBuild,
Status: domain.OperationStatusCompleted,
StartedAt: time.Now().Add(-1 * time.Hour),
},
},
}
registry := &mockRegistryChecker{
status: domain.RegistryStatus{
Healthy: true,
URL: "https://registry.example.com",
Latency: "10ms",
LastChecked: time.Now(),
},
}
ci := &mockCIProvider{
pipelines: []*domain.CIPipeline{
{
Number: 42,
Status: "success",
Branch: "main",
Commit: "abc123",
},
},
}
svc := NewDiagnosticsService(opRepo, registry, ci, DiagnosticsServiceConfig{
DefaultGitOwner: "test-org",
})
diag, err := svc.GetDiagnostics(context.Background(), "test-project")
if err != nil {
t.Fatalf("GetDiagnostics() error = %v", err)
}
if diag.ProjectID != "test-project" {
t.Errorf("ProjectID = %q, want %q", diag.ProjectID, "test-project")
}
if diag.Summary != domain.DiagnosticsSummaryHealthy {
t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryHealthy)
}
if len(diag.Issues) != 0 {
t.Errorf("Issues count = %d, want 0", len(diag.Issues))
}
if len(diag.RecentOperations) != 1 {
t.Errorf("RecentOperations count = %d, want 1", len(diag.RecentOperations))
}
if diag.Registry == nil {
t.Error("Registry is nil, want non-nil")
}
if diag.CI == nil || !diag.CI.Available {
t.Error("CI not available")
}
}
func TestDiagnosticsService_GetDiagnostics_Unhealthy(t *testing.T) {
opRepo := &mockOperationRepo{
operations: []*domain.Operation{
{
ID: "op-1",
ProjectID: "test-project",
Type: domain.OperationTypeBuild,
Status: domain.OperationStatusFailed,
StartedAt: time.Now().Add(-1 * time.Hour),
Error: "deployment failed",
},
},
}
registry := &mockRegistryChecker{
status: domain.RegistryStatus{
Healthy: false,
URL: "https://registry.example.com",
Error: "connection refused",
LastChecked: time.Now(),
},
}
ci := &mockCIProvider{
pipelines: []*domain.CIPipeline{
{
Number: 43,
Status: "failure",
Branch: "main",
Commit: "def456",
Finished: time.Now().Add(-30 * time.Minute),
},
},
steps: &domain.CIPipelineSteps{
PipelineNumber: 43,
URL: "https://ci.example.com/build/43",
Steps: []domain.CIPipelineStep{
{
Name: "test",
Status: "failure",
Error: "tests failed",
Log: "FAIL: TestSomething",
},
},
},
}
svc := NewDiagnosticsService(opRepo, registry, ci, DiagnosticsServiceConfig{
DefaultGitOwner: "test-org",
})
diag, err := svc.GetDiagnostics(context.Background(), "test-project")
if err != nil {
t.Fatalf("GetDiagnostics() error = %v", err)
}
if diag.Summary != domain.DiagnosticsSummaryUnhealthy {
t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryUnhealthy)
}
// Should have 3 issues: failed operation, unhealthy registry, failed CI
if len(diag.Issues) < 3 {
t.Errorf("Issues count = %d, want at least 3", len(diag.Issues))
}
// Check CI failure details
if diag.CI == nil || diag.CI.LastFailure == nil {
t.Fatal("CI.LastFailure is nil")
}
if diag.CI.LastFailure.FailedStep != "test" {
t.Errorf("FailedStep = %q, want %q", diag.CI.LastFailure.FailedStep, "test")
}
}
func TestDiagnosticsService_GetDiagnostics_Degraded(t *testing.T) {
opRepo := &mockOperationRepo{
operations: []*domain.Operation{
{
ID: "op-1",
ProjectID: "test-project",
Type: domain.OperationTypeBuild,
Status: domain.OperationStatusCompleted,
StartedAt: time.Now().Add(-1 * time.Hour),
},
},
}
registry := &mockRegistryChecker{
status: domain.RegistryStatus{
Healthy: true,
URL: "https://registry.example.com",
LastChecked: time.Now(),
},
}
// No CI provider - should produce a warning but not error
svc := NewDiagnosticsService(opRepo, registry, nil, DiagnosticsServiceConfig{
DefaultGitOwner: "test-org",
})
diag, err := svc.GetDiagnostics(context.Background(), "test-project")
if err != nil {
t.Fatalf("GetDiagnostics() error = %v", err)
}
// Without CI, status should still be healthy (CI unavailable is not an issue)
if diag.Summary != domain.DiagnosticsSummaryHealthy {
t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryHealthy)
}
if diag.CI == nil || diag.CI.Available {
t.Error("CI should be not available when no provider")
}
}

View File

@ -0,0 +1,248 @@
package worker
import (
"context"
"log/slog"
"sync"
"time"
"github.com/orchard9/rdev/internal/domain"
"github.com/orchard9/rdev/internal/metrics"
"github.com/orchard9/rdev/internal/port"
)
// ExternalHealthChecker runs periodic health checks on external systems
// (registry, CI, git) and caches the results for the /ready endpoint.
type ExternalHealthChecker struct {
registry port.RegistryChecker // zot
ci port.ExternalHealthChecker // woodpecker
git port.ExternalHealthChecker // gitea
interval time.Duration
logger *slog.Logger
// Internal state (thread-safe)
mu sync.RWMutex
statuses map[domain.ExternalSystem]domain.ExternalSystemStatus
// Lifecycle
ctx context.Context
cancel context.CancelFunc
wg sync.WaitGroup
}
// ExternalHealthConfig configures the health checker.
type ExternalHealthConfig struct {
// CheckInterval is how often to check external systems. Default: 30s.
CheckInterval time.Duration
Logger *slog.Logger
}
// DefaultExternalHealthConfig returns sensible defaults.
func DefaultExternalHealthConfig() ExternalHealthConfig {
return ExternalHealthConfig{
CheckInterval: 30 * time.Second,
Logger: slog.Default(),
}
}
// NewExternalHealthChecker creates a new external health checker.
// All checker parameters are optional (nil means skip that system).
func NewExternalHealthChecker(
registry port.RegistryChecker,
ci port.ExternalHealthChecker,
git port.ExternalHealthChecker,
cfg ExternalHealthConfig,
) *ExternalHealthChecker {
if cfg.CheckInterval == 0 {
cfg.CheckInterval = 30 * time.Second
}
if cfg.Logger == nil {
cfg.Logger = slog.Default()
}
ctx, cancel := context.WithCancel(context.Background())
return &ExternalHealthChecker{
registry: registry,
ci: ci,
git: git,
interval: cfg.CheckInterval,
logger: cfg.Logger.With("component", "external-health"),
statuses: make(map[domain.ExternalSystem]domain.ExternalSystemStatus),
ctx: ctx,
cancel: cancel,
}
}
// Start begins the background check loop.
func (c *ExternalHealthChecker) Start() {
c.logger.Info("external health checker started", "interval", c.interval)
c.wg.Add(1)
go c.checkLoop()
}
// Stop gracefully shuts down the checker.
func (c *ExternalHealthChecker) Stop() {
c.logger.Info("external health checker stopping")
c.cancel()
c.wg.Wait()
c.logger.Info("external health checker stopped")
}
// GetStatus returns the cached status for a specific system.
func (c *ExternalHealthChecker) GetStatus(system domain.ExternalSystem) (domain.ExternalSystemStatus, bool) {
c.mu.RLock()
defer c.mu.RUnlock()
status, ok := c.statuses[system]
return status, ok
}
// GetAllStatuses returns a copy of all cached statuses.
func (c *ExternalHealthChecker) GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus {
c.mu.RLock()
defer c.mu.RUnlock()
result := make(map[domain.ExternalSystem]domain.ExternalSystemStatus, len(c.statuses))
for k, v := range c.statuses {
result[k] = v
}
return result
}
// checkLoop runs periodic health checks.
func (c *ExternalHealthChecker) checkLoop() {
defer c.wg.Done()
// Run immediately on start
c.runChecks()
ticker := time.NewTicker(c.interval)
defer ticker.Stop()
for {
select {
case <-c.ctx.Done():
return
case <-ticker.C:
c.runChecks()
}
}
}
// runChecks performs health checks on all configured systems in parallel.
func (c *ExternalHealthChecker) runChecks() {
ctx, cancel := context.WithTimeout(c.ctx, 10*time.Second)
defer cancel()
var wg sync.WaitGroup
results := make(chan domain.ExternalSystemStatus, 3)
// Check registry (zot)
if c.registry != nil {
wg.Add(1)
go func() {
defer wg.Done()
regStatus := c.registry.Check(ctx)
// Convert domain.RegistryStatus to domain.ExternalSystemStatus
status := domain.ExternalSystemStatus{
System: domain.ExternalSystemRegistry,
Healthy: regStatus.Healthy,
URL: regStatus.URL,
Error: regStatus.Error,
LastChecked: regStatus.LastChecked,
}
// Parse latency string (e.g., "45ms") to duration
if regStatus.Latency != "" {
if d, err := time.ParseDuration(regStatus.Latency); err == nil {
status.Latency = d
}
}
if status.Healthy {
status.LastHealthy = status.LastChecked
}
results <- status
}()
}
// Check CI (woodpecker)
if c.ci != nil {
wg.Add(1)
go func() {
defer wg.Done()
results <- c.ci.Check(ctx)
}()
}
// Check git (gitea)
if c.git != nil {
wg.Add(1)
go func() {
defer wg.Done()
results <- c.git.Check(ctx)
}()
}
// Wait for all checks to complete, then close results channel
go func() {
wg.Wait()
close(results)
}()
// Collect results and update state
for status := range results {
c.updateStatus(status)
}
}
// updateStatus updates cached status and logs/metrics on state changes.
func (c *ExternalHealthChecker) updateStatus(status domain.ExternalSystemStatus) {
c.mu.Lock()
defer c.mu.Unlock()
prev, existed := c.statuses[status.System]
// Preserve LastHealthy from previous status if current is unhealthy
if !status.Healthy && existed && !prev.LastHealthy.IsZero() {
status.LastHealthy = prev.LastHealthy
}
c.statuses[status.System] = status
// Log state transitions
if !existed {
// First check
if status.Healthy {
c.logger.Info("external system healthy",
"system", status.System,
"url", status.URL,
"latency", status.Latency,
)
} else {
c.logger.Warn("external system unhealthy",
"system", status.System,
"url", status.URL,
"error", status.Error,
)
}
} else if prev.Healthy != status.Healthy {
// State changed
if status.Healthy {
c.logger.Info("external system recovered",
"system", status.System,
"url", status.URL,
"latency", status.Latency,
)
} else {
c.logger.Warn("external system became unhealthy",
"system", status.System,
"url", status.URL,
"error", status.Error,
)
}
}
// Update Prometheus metrics
metrics.SetExternalSystemHealth(string(status.System), status.Healthy, status.Latency.Seconds())
}

View File

@ -0,0 +1,241 @@
package worker
import (
"context"
"testing"
"time"
"github.com/orchard9/rdev/internal/domain"
)
// mockRegistryChecker is a mock implementation of port.RegistryChecker.
type mockRegistryChecker struct {
healthy bool
err string
latency time.Duration
}
func (m *mockRegistryChecker) Check(_ context.Context) domain.RegistryStatus {
status := domain.RegistryStatus{
Healthy: m.healthy,
URL: "https://registry.test",
Latency: m.latency.String(),
LastChecked: time.Now().UTC(),
}
if !m.healthy {
status.Error = m.err
}
return status
}
// mockExternalHealthChecker is a mock implementation of port.ExternalHealthChecker.
type mockExternalHealthChecker struct {
system domain.ExternalSystem
healthy bool
err string
latency time.Duration
}
func (m *mockExternalHealthChecker) Check(_ context.Context) domain.ExternalSystemStatus {
status := domain.ExternalSystemStatus{
System: m.system,
Healthy: m.healthy,
URL: "https://test.system",
Latency: m.latency,
LastChecked: time.Now().UTC(),
}
if m.healthy {
status.LastHealthy = status.LastChecked
} else {
status.Error = m.err
}
return status
}
func TestExternalHealthChecker_GetStatus(t *testing.T) {
registry := &mockRegistryChecker{healthy: true, latency: 50 * time.Millisecond}
ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: true, latency: 100 * time.Millisecond}
git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true, latency: 75 * time.Millisecond}
checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{
CheckInterval: 100 * time.Millisecond,
})
// Run checks synchronously
checker.runChecks()
// Verify registry status
regStatus, ok := checker.GetStatus(domain.ExternalSystemRegistry)
if !ok {
t.Fatal("expected registry status to exist")
}
if !regStatus.Healthy {
t.Error("expected registry to be healthy")
}
// Verify CI status
ciStatus, ok := checker.GetStatus(domain.ExternalSystemCI)
if !ok {
t.Fatal("expected CI status to exist")
}
if !ciStatus.Healthy {
t.Error("expected CI to be healthy")
}
// Verify Git status
gitStatus, ok := checker.GetStatus(domain.ExternalSystemGit)
if !ok {
t.Fatal("expected Git status to exist")
}
if !gitStatus.Healthy {
t.Error("expected Git to be healthy")
}
}
func TestExternalHealthChecker_GetAllStatuses(t *testing.T) {
registry := &mockRegistryChecker{healthy: true}
ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: false, err: "connection refused"}
git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true}
checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{
CheckInterval: 100 * time.Millisecond,
})
checker.runChecks()
statuses := checker.GetAllStatuses()
if len(statuses) != 3 {
t.Fatalf("expected 3 statuses, got %d", len(statuses))
}
if statuses[domain.ExternalSystemRegistry].Healthy != true {
t.Error("expected registry to be healthy")
}
if statuses[domain.ExternalSystemCI].Healthy != false {
t.Error("expected CI to be unhealthy")
}
if statuses[domain.ExternalSystemCI].Error != "connection refused" {
t.Errorf("expected CI error 'connection refused', got %q", statuses[domain.ExternalSystemCI].Error)
}
if statuses[domain.ExternalSystemGit].Healthy != true {
t.Error("expected Git to be healthy")
}
}
func TestExternalHealthChecker_NilCheckers(t *testing.T) {
// All nil checkers should result in empty statuses
checker := NewExternalHealthChecker(nil, nil, nil, ExternalHealthConfig{
CheckInterval: 100 * time.Millisecond,
})
checker.runChecks()
statuses := checker.GetAllStatuses()
if len(statuses) != 0 {
t.Fatalf("expected 0 statuses with nil checkers, got %d", len(statuses))
}
}
func TestExternalHealthChecker_StartStop(t *testing.T) {
registry := &mockRegistryChecker{healthy: true}
checker := NewExternalHealthChecker(registry, nil, nil, ExternalHealthConfig{
CheckInterval: 50 * time.Millisecond,
})
checker.Start()
// Wait for a couple of check cycles
time.Sleep(120 * time.Millisecond)
// Verify status was populated
status, ok := checker.GetStatus(domain.ExternalSystemRegistry)
if !ok {
t.Fatal("expected registry status after start")
}
if !status.Healthy {
t.Error("expected registry to be healthy")
}
checker.Stop()
// After stop, statuses should still be available (cached)
status, ok = checker.GetStatus(domain.ExternalSystemRegistry)
if !ok {
t.Fatal("expected registry status after stop")
}
}
func TestExternalHealthChecker_StateTransition(t *testing.T) {
registry := &mockRegistryChecker{healthy: true}
checker := NewExternalHealthChecker(registry, nil, nil, ExternalHealthConfig{
CheckInterval: 100 * time.Millisecond,
})
// Initial check - healthy
checker.runChecks()
status, _ := checker.GetStatus(domain.ExternalSystemRegistry)
if !status.Healthy {
t.Error("expected initial status to be healthy")
}
firstHealthy := status.LastHealthy
// Change to unhealthy
registry.healthy = false
registry.err = "connection refused"
checker.runChecks()
status, _ = checker.GetStatus(domain.ExternalSystemRegistry)
if status.Healthy {
t.Error("expected status to be unhealthy after state change")
}
// LastHealthy should be preserved from when it was healthy
if status.LastHealthy.IsZero() {
t.Error("expected LastHealthy to be preserved")
}
if !status.LastHealthy.Equal(firstHealthy) {
t.Error("expected LastHealthy to remain from healthy period")
}
// Recover to healthy
registry.healthy = true
registry.err = ""
checker.runChecks()
status, _ = checker.GetStatus(domain.ExternalSystemRegistry)
if !status.Healthy {
t.Error("expected status to be healthy after recovery")
}
// LastHealthy should be updated
if status.LastHealthy.Before(firstHealthy) {
t.Error("expected LastHealthy to be updated on recovery")
}
}
func TestExternalHealthChecker_PartialFailure(t *testing.T) {
// Registry healthy, CI unhealthy, Git healthy
registry := &mockRegistryChecker{healthy: true}
ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: false, err: "timeout"}
git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true}
checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{
CheckInterval: 100 * time.Millisecond,
})
checker.runChecks()
statuses := checker.GetAllStatuses()
// Partial failure should not affect other systems
if !statuses[domain.ExternalSystemRegistry].Healthy {
t.Error("registry should be healthy despite CI failure")
}
if statuses[domain.ExternalSystemCI].Healthy {
t.Error("CI should be unhealthy")
}
if !statuses[domain.ExternalSystemGit].Healthy {
t.Error("git should be healthy despite CI failure")
}
}