From 210064d490febe9e66bae8ebd4c3b93c01340d36 Mon Sep 17 00:00:00 2001 From: jordan Date: Tue, 3 Feb 2026 19:10:56 -0700 Subject: [PATCH] feat: add diagnostics endpoint and external health monitoring - Add /diagnostics endpoint for system health overview - Add external health worker for monitoring Gitea, Woodpecker, Registry - Add health check methods to Gitea and Woodpecker clients - Remove hardcoded fallback projects (pantheon, aeries) - Add diagnostics domain types and service layer - Add comprehensive tests for diagnostics handler and service - Fix tests to use registered test project instead of hardcoded one Co-Authored-By: Claude Opus 4.5 --- ai-lookup/services/external-health.md | 49 +++ cmd/rdev-api/main.go | 35 ++ internal/adapter/gitea/client.go | 45 ++- .../adapter/kubernetes/project_repository.go | 18 +- internal/adapter/woodpecker/client.go | 39 ++- internal/domain/diagnostics.go | 110 +++++++ internal/domain/external_health.go | 23 ++ internal/handlers/diagnostics.go | 87 +++++ internal/handlers/diagnostics_test.go | 148 +++++++++ internal/handlers/health.go | 33 ++ internal/handlers/projects_test.go | 60 ++-- internal/metrics/metrics.go | 27 ++ internal/port/health.go | 6 + internal/service/diagnostics_service.go | 295 +++++++++++++++++ internal/service/diagnostics_service_test.go | 302 ++++++++++++++++++ internal/worker/external_health.go | 248 ++++++++++++++ internal/worker/external_health_test.go | 241 ++++++++++++++ 17 files changed, 1724 insertions(+), 42 deletions(-) create mode 100644 ai-lookup/services/external-health.md create mode 100644 internal/domain/diagnostics.go create mode 100644 internal/domain/external_health.go create mode 100644 internal/handlers/diagnostics.go create mode 100644 internal/handlers/diagnostics_test.go create mode 100644 internal/service/diagnostics_service.go create mode 100644 internal/service/diagnostics_service_test.go create mode 100644 internal/worker/external_health.go create mode 100644 internal/worker/external_health_test.go diff --git a/ai-lookup/services/external-health.md b/ai-lookup/services/external-health.md new file mode 100644 index 0000000..be1deb4 --- /dev/null +++ b/ai-lookup/services/external-health.md @@ -0,0 +1,49 @@ +# External Health Checker + +**Last Updated:** 2026-02-03 +**Confidence:** High + +## Summary + +Background worker that continuously monitors external systems (registry, CI, git) and surfaces issues proactively via metrics, logs, and the `/ready` endpoint. Runs every 30s, caches results for instant lookups, and logs state transitions. + +**Key Facts:** +- Monitors: `registry` (zot), `ci` (woodpecker), `git` (gitea) +- Check interval: 30 seconds (configurable) +- Caches results for `/ready` endpoint (no blocking network calls) +- Logs only on state changes (healthy→unhealthy, unhealthy→healthy) +- Preserves `LastHealthy` timestamp through unhealthy periods + +**File Pointers:** +- Domain types: `internal/domain/external_health.go` +- Worker implementation: `internal/worker/external_health.go` +- Port interface: `internal/port/health.go:ExternalHealthChecker` +- Handler integration: `internal/handlers/health.go:WithExternalHealthChecker` +- Wiring: `cmd/rdev-api/main.go:433-455` + +## How It Works + +1. Background goroutine polls all configured external systems every 30s +2. Checks run in parallel with 10s timeout per system +3. Results cached in thread-safe map +4. `/ready` reads cached statuses (no network calls) +5. Prometheus metrics updated on each check cycle + +**Adapter implementations:** +- Registry: `internal/adapter/zot/client.go:Check()` - calls `/v2/` endpoint +- CI: `internal/adapter/woodpecker/client.go:Check()` - calls `Self()` API +- Git: `internal/adapter/gitea/client.go:Check()` - calls `ListMyOrgs()` + +## Prometheus Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `rdev_external_system_healthy` | Gauge | `system` | 1=healthy, 0=unhealthy | +| `rdev_external_system_latency_seconds` | Gauge | `system` | Check latency | +| `rdev_external_system_last_check_timestamp` | Gauge | `system` | Unix timestamp of last check | + +## Related Topics + +- [Work Queue](./work-queue.md) - Uses similar background worker pattern +- [CI Provider](./ci-provider.md) - Woodpecker adapter details +- [Worker Pool](./worker-pool.md) - Another background worker example diff --git a/cmd/rdev-api/main.go b/cmd/rdev-api/main.go index 1cd5eb6..bf3a85f 100644 --- a/cmd/rdev-api/main.go +++ b/cmd/rdev-api/main.go @@ -418,12 +418,43 @@ func main() { logger.Info("registry health checker initialized", "url", registryURL) } + // Initialize diagnostics service (aggregates health data for debugging) + diagnosticsService := service.NewDiagnosticsService( + operationRepo, + registryChecker, + woodpeckerClient, + service.DiagnosticsServiceConfig{ + DefaultGitOwner: infraCfg.GiteaDefaultOrg, + Logger: logger, + }, + ) + diagnosticsHandler := handlers.NewDiagnosticsHandler(diagnosticsService, projectRepo, logger) + + // Initialize external health checker (background monitoring of registry, CI, git) + var externalHealthChecker *worker.ExternalHealthChecker + if registryChecker != nil || woodpeckerClient != nil || giteaClient != nil { + externalHealthChecker = worker.NewExternalHealthChecker( + registryChecker, + woodpeckerClient, + giteaClient, + worker.ExternalHealthConfig{ + CheckInterval: 30 * time.Second, + Logger: logger, + }, + ) + externalHealthChecker.Start() + logger.Info("external health checker started") + } + // Override default health/ready endpoints with full dependency checks healthHandler := handlers.NewHealthHandler("rdev-api", database.DB, nil). WithAgentRegistry(agentRegistry) if registryChecker != nil { healthHandler = healthHandler.WithRegistryChecker(registryChecker) } + if externalHealthChecker != nil { + healthHandler = healthHandler.WithExternalHealthChecker(externalHealthChecker) + } app.Router().Get("/health", healthHandler.Health) app.Router().Get("/ready", healthHandler.Ready) @@ -448,6 +479,7 @@ func main() { buildsHandler.Mount(app.Router()) createAndBuildHandler.Mount(app.Router()) operationsHandler.Mount(app.Router()) + diagnosticsHandler.Mount(app.Router()) sdlcHandler.Mount(app.Router()) sdlcOrchestratorHandler.Mount(app.Router()) @@ -514,6 +546,9 @@ func main() { app.EnableDocs(buildOpenAPISpec()) app.OnShutdown(func(ctx context.Context) error { + if externalHealthChecker != nil { + externalHealthChecker.Stop() + } workExecutor.Stop() queueMaintenance.Stop() operationCleanup.Stop() diff --git a/internal/adapter/gitea/client.go b/internal/adapter/gitea/client.go index 1e3801c..dcfaf42 100644 --- a/internal/adapter/gitea/client.go +++ b/internal/adapter/gitea/client.go @@ -11,18 +11,21 @@ package gitea import ( "context" "fmt" + "time" "code.gitea.io/sdk/gitea" "github.com/orchard9/rdev/internal/domain" "github.com/orchard9/rdev/internal/port" ) -// Ensure Client implements GitRepository. +// Ensure Client implements GitRepository and ExternalHealthChecker. var _ port.GitRepository = (*Client)(nil) +var _ port.ExternalHealthChecker = (*Client)(nil) // Client is a Gitea API client adapter. type Client struct { client *gitea.Client + url string // Gitea server URL for health checks defaultOwner string // default organization/user for new repos } @@ -43,6 +46,7 @@ func NewClient(url, token, defaultOwner string) (*Client, error) { } return &Client{ client: client, + url: url, defaultOwner: defaultOwner, }, nil } @@ -208,6 +212,45 @@ func (c *Client) DeleteWebhook(ctx context.Context, owner, repo string, webhookI return nil } +// Check returns the health status of the Gitea server. +// Implements port.ExternalHealthChecker. +func (c *Client) Check(ctx context.Context) domain.ExternalSystemStatus { + start := time.Now() + status := domain.ExternalSystemStatus{ + System: domain.ExternalSystemGit, + URL: c.url, + } + + // Gitea SDK doesn't support context propagation for HTTP requests, + // but check for cancellation before making the call. + select { + case <-ctx.Done(): + status.Latency = time.Since(start) + status.LastChecked = time.Now().UTC() + status.Healthy = false + status.Error = ctx.Err().Error() + return status + default: + } + + // Call ListMyOrgs (lightweight, tests auth) + _, _, err := c.client.ListMyOrgs(gitea.ListOrgsOptions{ + ListOptions: gitea.ListOptions{PageSize: 1}, + }) + status.Latency = time.Since(start) + status.LastChecked = time.Now().UTC() + + if err != nil { + status.Healthy = false + status.Error = err.Error() + } else { + status.Healthy = true + status.LastHealthy = status.LastChecked + } + + return status +} + // repoFromGitea converts a gitea.Repository to domain.Repo. func repoFromGitea(r *gitea.Repository) *domain.Repo { return &domain.Repo{ diff --git a/internal/adapter/kubernetes/project_repository.go b/internal/adapter/kubernetes/project_repository.go index 9795b5b..0bc8f43 100644 --- a/internal/adapter/kubernetes/project_repository.go +++ b/internal/adapter/kubernetes/project_repository.go @@ -61,23 +61,9 @@ func NewProjectRepositoryWithClient(namespace string, client *kubernetes.Clients } // initFallbackProjects adds hardcoded projects for when K8s client is unavailable. +// Currently empty - projects are discovered dynamically from K8s or stored in the database. func (r *ProjectRepository) initFallbackProjects() { - r.projects["pantheon"] = &domain.Project{ - ID: "pantheon", - Name: "Pantheon", - Description: "Go API backend", - PodName: "claudebox-pantheon-0", - Status: domain.ProjectStatusUnknown, - Workspace: "/workspace", - } - r.projects["aeries"] = &domain.Project{ - ID: "aeries", - Name: "Aeries", - Description: "Note community platform", - PodName: "claudebox-aeries-0", - Status: domain.ProjectStatusUnknown, - Workspace: "/workspace", - } + // No hardcoded fallback projects } // Ensure ProjectRepository implements port.ProjectRepository at compile time. diff --git a/internal/adapter/woodpecker/client.go b/internal/adapter/woodpecker/client.go index 8f90b74..57af3f6 100644 --- a/internal/adapter/woodpecker/client.go +++ b/internal/adapter/woodpecker/client.go @@ -28,8 +28,9 @@ import ( "github.com/orchard9/rdev/internal/port" ) -// Ensure Client implements CIProvider. +// Ensure Client implements CIProvider and ExternalHealthChecker. var _ port.CIProvider = (*Client)(nil) +var _ port.ExternalHealthChecker = (*Client)(nil) // tokenTransport is an http.RoundTripper that adds bearer token auth. type tokenTransport struct { @@ -331,6 +332,42 @@ func (c *Client) DeleteSecret(ctx context.Context, owner, repo, secretName strin return nil } +// Check returns the health status of the Woodpecker CI system. +// Implements port.ExternalHealthChecker. +func (c *Client) Check(ctx context.Context) domain.ExternalSystemStatus { + start := time.Now() + status := domain.ExternalSystemStatus{ + System: domain.ExternalSystemCI, + URL: c.url, + } + + // Check context cancellation + select { + case <-ctx.Done(): + status.Latency = time.Since(start) + status.LastChecked = time.Now().UTC() + status.Healthy = false + status.Error = ctx.Err().Error() + return status + default: + } + + // Call Self() to get current user info (lightweight, tests auth) + _, err := c.client.Self() + status.Latency = time.Since(start) + status.LastChecked = time.Now().UTC() + + if err != nil { + status.Healthy = false + status.Error = err.Error() + } else { + status.Healthy = true + status.LastHealthy = status.LastChecked + } + + return status +} + // repoFromWoodpecker converts a woodpecker.Repo to domain.CIRepo. func repoFromWoodpecker(r *woodpecker.Repo) *domain.CIRepo { // Parse forge remote ID (string in SDK, int64 in our domain) diff --git a/internal/domain/diagnostics.go b/internal/domain/diagnostics.go new file mode 100644 index 0000000..b3f545d --- /dev/null +++ b/internal/domain/diagnostics.go @@ -0,0 +1,110 @@ +package domain + +import "time" + +// ProjectDiagnostics provides a unified view of project health for debugging. +// It aggregates data from operations, CI pipelines, and registry health. +type ProjectDiagnostics struct { + // ProjectID is the project being diagnosed. + ProjectID string `json:"project_id"` + + // GeneratedAt is when this diagnostic was generated. + GeneratedAt time.Time `json:"generated_at"` + + // Summary is a one-line status: "healthy", "degraded", or "unhealthy". + Summary string `json:"summary"` + + // Issues is a list of detected problems (empty if healthy). + Issues []DiagnosticIssue `json:"issues,omitempty"` + + // RecentOperations are the last N operations for this project. + RecentOperations []OperationSummary `json:"recent_operations,omitempty"` + + // CI contains CI/CD pipeline status. + CI *CIDiagnostics `json:"ci,omitempty"` + + // Registry contains container registry health. + Registry *RegistryStatus `json:"registry,omitempty"` +} + +// DiagnosticIssue represents a detected problem. +type DiagnosticIssue struct { + // Severity: "error", "warning", "info" + Severity string `json:"severity"` + + // Source: "operation", "ci", "registry" + Source string `json:"source"` + + // Message is a human-readable description. + Message string `json:"message"` + + // Details provides additional context (e.g., error output). + Details string `json:"details,omitempty"` + + // Timestamp is when the issue occurred. + Timestamp time.Time `json:"timestamp,omitempty"` +} + +// OperationSummary is a condensed view of an operation for diagnostics. +type OperationSummary struct { + ID string `json:"id"` + Type OperationType `json:"type"` + Status OperationStatus `json:"status"` + StartedAt time.Time `json:"started_at"` + DurationMs int64 `json:"duration_ms,omitempty"` + Error string `json:"error,omitempty"` + ExternalRef string `json:"external_ref,omitempty"` +} + +// CIDiagnostics contains CI pipeline health information. +type CIDiagnostics struct { + // Available indicates if CI is configured for this project. + Available bool `json:"available"` + + // RecentPipelines are the last N pipeline executions. + RecentPipelines []CIPipelineSummary `json:"recent_pipelines,omitempty"` + + // LastFailure contains details of the most recent failed pipeline. + LastFailure *CIPipelineFailure `json:"last_failure,omitempty"` +} + +// CIPipelineSummary is a condensed view of a pipeline for diagnostics. +type CIPipelineSummary struct { + Number int64 `json:"number"` + Status string `json:"status"` + Branch string `json:"branch"` + Commit string `json:"commit"` + StartedAt time.Time `json:"started_at"` + Duration string `json:"duration,omitempty"` +} + +// CIPipelineFailure contains details about a failed pipeline. +type CIPipelineFailure struct { + Number int64 `json:"number"` + FailedStep string `json:"failed_step"` + Error string `json:"error,omitempty"` + LogTail string `json:"log_tail,omitempty"` + URL string `json:"url,omitempty"` + Timestamp time.Time `json:"timestamp"` +} + +// DiagnosticsSummary constants. +const ( + DiagnosticsSummaryHealthy = "healthy" + DiagnosticsSummaryDegraded = "degraded" + DiagnosticsSummaryUnhealthy = "unhealthy" +) + +// DiagnosticSeverity constants. +const ( + DiagnosticSeverityError = "error" + DiagnosticSeverityWarning = "warning" + DiagnosticSeverityInfo = "info" +) + +// DiagnosticSource constants. +const ( + DiagnosticSourceOperation = "operation" + DiagnosticSourceCI = "ci" + DiagnosticSourceRegistry = "registry" +) diff --git a/internal/domain/external_health.go b/internal/domain/external_health.go new file mode 100644 index 0000000..013281d --- /dev/null +++ b/internal/domain/external_health.go @@ -0,0 +1,23 @@ +package domain + +import "time" + +// ExternalSystem identifies an external dependency. +type ExternalSystem string + +const ( + ExternalSystemRegistry ExternalSystem = "registry" + ExternalSystemCI ExternalSystem = "ci" + ExternalSystemGit ExternalSystem = "git" +) + +// ExternalSystemStatus represents the health of an external system. +type ExternalSystemStatus struct { + System ExternalSystem `json:"system"` + Healthy bool `json:"healthy"` + URL string `json:"url"` + Latency time.Duration `json:"latency"` + Error string `json:"error,omitempty"` + LastChecked time.Time `json:"last_checked"` + LastHealthy time.Time `json:"last_healthy,omitempty"` +} diff --git a/internal/handlers/diagnostics.go b/internal/handlers/diagnostics.go new file mode 100644 index 0000000..f941874 --- /dev/null +++ b/internal/handlers/diagnostics.go @@ -0,0 +1,87 @@ +// Package handlers provides HTTP handlers for the rdev API. +package handlers + +import ( + "context" + "log/slog" + "net/http" + + "github.com/go-chi/chi/v5" + + "github.com/orchard9/rdev/internal/auth" + "github.com/orchard9/rdev/internal/domain" + "github.com/orchard9/rdev/internal/port" + "github.com/orchard9/rdev/pkg/api" +) + +// DiagnosticsGetter retrieves project diagnostics. +type DiagnosticsGetter interface { + GetDiagnostics(ctx context.Context, projectID string) (*domain.ProjectDiagnostics, error) +} + +// DiagnosticsHandler handles project diagnostics requests. +type DiagnosticsHandler struct { + diagnostics DiagnosticsGetter + projects port.ProjectRepository + logger *slog.Logger +} + +// NewDiagnosticsHandler creates a new diagnostics handler. +func NewDiagnosticsHandler( + diagnostics DiagnosticsGetter, + projects port.ProjectRepository, + logger *slog.Logger, +) *DiagnosticsHandler { + if logger == nil { + logger = slog.Default() + } + return &DiagnosticsHandler{ + diagnostics: diagnostics, + projects: projects, + logger: logger, + } +} + +// Mount registers the diagnostics routes. +func (h *DiagnosticsHandler) Mount(r api.Router) { + r.Route("/projects/{projectId}/diagnostics", func(r chi.Router) { + r.With(auth.RequireScope(auth.ScopeProjectsRead, auth.ScopeAdmin)).Get("/", h.GetDiagnostics) + }) +} + +// GetDiagnostics returns comprehensive health information for a project. +// GET /projects/{projectId}/diagnostics +func (h *DiagnosticsHandler) GetDiagnostics(w http.ResponseWriter, r *http.Request) { + ctx, cancel := context.WithTimeout(r.Context(), TimeoutStandard) + defer cancel() + + projectID := chi.URLParam(r, "projectId") + if projectID == "" { + api.WriteBadRequest(w, r, "project ID is required") + return + } + + // Verify project exists (optional - diagnostics can still be useful for non-k8s projects) + if h.projects != nil { + if _, err := h.projects.Get(ctx, domain.ProjectID(projectID)); err != nil { + if err == domain.ErrProjectNotFound { + // Log but continue - the project might exist in git/CI but not as a k8s pod + h.logger.Debug("project not found in k8s, continuing with diagnostics", + "project_id", projectID, + ) + } + } + } + + diag, err := h.diagnostics.GetDiagnostics(ctx, projectID) + if err != nil { + h.logger.Error("failed to get diagnostics", + "error", err, + "project_id", projectID, + ) + api.WriteInternalError(w, r, "failed to retrieve diagnostics") + return + } + + api.WriteSuccess(w, r, diag) +} diff --git a/internal/handlers/diagnostics_test.go b/internal/handlers/diagnostics_test.go new file mode 100644 index 0000000..09c16f4 --- /dev/null +++ b/internal/handlers/diagnostics_test.go @@ -0,0 +1,148 @@ +package handlers + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/go-chi/chi/v5" + + "github.com/orchard9/rdev/internal/domain" +) + +// mockDiagnosticsGetter implements DiagnosticsGetter for testing. +type mockDiagnosticsGetter struct { + diagnostics *domain.ProjectDiagnostics + err error +} + +func (m *mockDiagnosticsGetter) GetDiagnostics(_ context.Context, projectID string) (*domain.ProjectDiagnostics, error) { + if m.err != nil { + return nil, m.err + } + if m.diagnostics != nil { + return m.diagnostics, nil + } + // Return default healthy diagnostics + return &domain.ProjectDiagnostics{ + ProjectID: projectID, + GeneratedAt: time.Now().UTC(), + Summary: domain.DiagnosticsSummaryHealthy, + Issues: []domain.DiagnosticIssue{}, + }, nil +} + +func TestDiagnosticsHandler_GetDiagnostics_Success(t *testing.T) { + getter := &mockDiagnosticsGetter{} + projects := newMockProjectRepo() + h := NewDiagnosticsHandler(getter, projects, nil) + + // Create router with chi to handle URL params + r := chi.NewRouter() + r.Use(testAdminAuth) // Add auth context for tests + h.Mount(r) + + req := httptest.NewRequest("GET", "/projects/test-project/diagnostics/", nil) + rec := httptest.NewRecorder() + + r.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("GetDiagnostics() status = %d, want %d; body = %s", rec.Code, http.StatusOK, rec.Body.String()) + } + + var resp map[string]any + if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + data, ok := resp["data"].(map[string]any) + if !ok { + t.Fatalf("response missing data field") + } + + if data["project_id"] != "test-project" { + t.Errorf("project_id = %q, want %q", data["project_id"], "test-project") + } + if data["summary"] != domain.DiagnosticsSummaryHealthy { + t.Errorf("summary = %q, want %q", data["summary"], domain.DiagnosticsSummaryHealthy) + } +} + +func TestDiagnosticsHandler_GetDiagnostics_WithIssues(t *testing.T) { + getter := &mockDiagnosticsGetter{ + diagnostics: &domain.ProjectDiagnostics{ + ProjectID: "unhealthy-project", + GeneratedAt: time.Now().UTC(), + Summary: domain.DiagnosticsSummaryUnhealthy, + Issues: []domain.DiagnosticIssue{ + { + Severity: domain.DiagnosticSeverityError, + Source: domain.DiagnosticSourceCI, + Message: "CI build #42 failed", + }, + { + Severity: domain.DiagnosticSeverityWarning, + Source: domain.DiagnosticSourceRegistry, + Message: "Container registry slow", + }, + }, + }, + } + projects := newMockProjectRepo() + h := NewDiagnosticsHandler(getter, projects, nil) + + r := chi.NewRouter() + r.Use(testAdminAuth) // Add auth context for tests + h.Mount(r) + + req := httptest.NewRequest("GET", "/projects/unhealthy-project/diagnostics/", nil) + rec := httptest.NewRecorder() + + r.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("GetDiagnostics() status = %d, want %d", rec.Code, http.StatusOK) + } + + var resp map[string]any + if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + data, ok := resp["data"].(map[string]any) + if !ok { + t.Fatalf("response missing data field") + } + + if data["summary"] != domain.DiagnosticsSummaryUnhealthy { + t.Errorf("summary = %q, want %q", data["summary"], domain.DiagnosticsSummaryUnhealthy) + } + + issues, ok := data["issues"].([]any) + if !ok { + t.Fatalf("response missing issues field") + } + if len(issues) != 2 { + t.Errorf("issues count = %d, want %d", len(issues), 2) + } +} + +func TestDiagnosticsHandler_GetDiagnostics_MissingProjectID(t *testing.T) { + getter := &mockDiagnosticsGetter{} + projects := newMockProjectRepo() + h := NewDiagnosticsHandler(getter, projects, nil) + + // Direct call without chi router to test missing projectId + req := httptest.NewRequest("GET", "/projects//diagnostics/", nil) + rec := httptest.NewRecorder() + + h.GetDiagnostics(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("GetDiagnostics() status = %d, want %d", rec.Code, http.StatusBadRequest) + } +} diff --git a/internal/handlers/health.go b/internal/handlers/health.go index a656ac8..e8bfa5e 100644 --- a/internal/handlers/health.go +++ b/internal/handlers/health.go @@ -8,6 +8,7 @@ import ( "strings" "time" + "github.com/orchard9/rdev/internal/domain" "github.com/orchard9/rdev/internal/metrics" "github.com/orchard9/rdev/internal/port" "github.com/orchard9/rdev/pkg/api" @@ -19,6 +20,11 @@ type ExecutorHealthChecker interface { WorkerID() string } +// ExternalHealthStatusProvider provides cached external system health statuses. +type ExternalHealthStatusProvider interface { + GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus +} + // HealthHandler handles health and readiness checks. type HealthHandler struct { serviceName string @@ -27,6 +33,7 @@ type HealthHandler struct { agentRegistry port.CodeAgentRegistry workExecutor ExecutorHealthChecker registryChecker port.RegistryChecker + externalChecker ExternalHealthStatusProvider } // NewHealthHandler creates a new health handler with dependencies. @@ -56,6 +63,12 @@ func (h *HealthHandler) WithRegistryChecker(checker port.RegistryChecker) *Healt return h } +// WithExternalHealthChecker adds a cached external health checker for monitoring. +func (h *HealthHandler) WithExternalHealthChecker(checker ExternalHealthStatusProvider) *HealthHandler { + h.externalChecker = checker + return h +} + // Health returns a simple liveness check. // This should be lightweight and only fail if the process is unhealthy. // GET /health @@ -113,6 +126,26 @@ func (h *HealthHandler) Ready(w http.ResponseWriter, r *http.Request) { checks["registry"] = h.checkRegistry(ctx) } + // External system checks (cached, from background worker) + if h.externalChecker != nil { + for system, status := range h.externalChecker.GetAllStatuses() { + checks["external:"+string(system)] = CheckResult{ + Healthy: status.Healthy, + Message: status.Error, + Latency: status.Latency.String(), + LastCheck: status.LastChecked, + } + if status.Healthy { + checks["external:"+string(system)] = CheckResult{ + Healthy: true, + Message: "connected", + Latency: status.Latency.String(), + LastCheck: status.LastChecked, + } + } + } + } + response := ReadinessResponse{ Status: "ready", Service: h.serviceName, diff --git a/internal/handlers/projects_test.go b/internal/handlers/projects_test.go index 5fa35db..ead6039 100644 --- a/internal/handlers/projects_test.go +++ b/internal/handlers/projects_test.go @@ -2,6 +2,7 @@ package handlers import ( "bytes" + "context" "encoding/json" "net/http" "net/http/httptest" @@ -10,11 +11,22 @@ import ( "github.com/go-chi/chi/v5" "github.com/orchard9/rdev/internal/adapter/kubernetes" + "github.com/orchard9/rdev/internal/domain" ) // newTestProjectsHandler creates a ProjectsHandler for testing. +// It registers a test project "test-project" for use in tests. func newTestProjectsHandler() *ProjectsHandler { repo := kubernetes.NewProjectRepository("test-namespace") + // Register a test project for tests to use + _ = repo.Register(context.Background(), &domain.Project{ + ID: "test-project", + Name: "Test Project", + Description: "Test project for unit tests", + PodName: "test-project-pod-0", + Status: domain.ProjectStatusRunning, + Workspace: "/workspace", + }) exec := kubernetes.NewExecutor("test-namespace") return NewProjectsHandler(repo, exec) } @@ -61,7 +73,7 @@ func TestProjectsHandler_Get(t *testing.T) { projectID string wantStatus int }{ - {"existing project", "pantheon", http.StatusOK}, + {"existing project", "test-project", http.StatusOK}, {"non-existent project", "nonexistent", http.StatusNotFound}, } @@ -95,7 +107,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) { }{ { name: "valid request", - projectID: "pantheon", + projectID: "test-project", body: ClaudeRequest{ Prompt: "Hello, world!", }, @@ -103,7 +115,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) { }, { name: "missing prompt", - projectID: "pantheon", + projectID: "test-project", body: ClaudeRequest{ Prompt: "", }, @@ -118,7 +130,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) { }, { name: "null byte in prompt", - projectID: "pantheon", + projectID: "test-project", body: ClaudeRequest{ Prompt: "Hello\x00World", }, @@ -127,7 +139,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) { }, { name: "invalid stream ID", - projectID: "pantheon", + projectID: "test-project", body: ClaudeRequest{ Prompt: "Hello", StreamID: "invalid stream id with spaces", @@ -175,7 +187,7 @@ func TestProjectsHandler_RunShell(t *testing.T) { }{ { name: "valid command", - projectID: "pantheon", + projectID: "test-project", body: ShellRequest{ Command: "ls -la", }, @@ -183,7 +195,7 @@ func TestProjectsHandler_RunShell(t *testing.T) { }, { name: "missing command", - projectID: "pantheon", + projectID: "test-project", body: ShellRequest{ Command: "", }, @@ -192,7 +204,7 @@ func TestProjectsHandler_RunShell(t *testing.T) { }, { name: "dangerous command with semicolon", - projectID: "pantheon", + projectID: "test-project", body: ShellRequest{ Command: "ls; rm -rf /", }, @@ -201,7 +213,7 @@ func TestProjectsHandler_RunShell(t *testing.T) { }, { name: "dangerous command with pipe", - projectID: "pantheon", + projectID: "test-project", body: ShellRequest{ Command: "cat /etc/passwd | grep root", }, @@ -210,7 +222,7 @@ func TestProjectsHandler_RunShell(t *testing.T) { }, { name: "command substitution", - projectID: "pantheon", + projectID: "test-project", body: ShellRequest{ Command: "echo $(whoami)", }, @@ -219,7 +231,7 @@ func TestProjectsHandler_RunShell(t *testing.T) { }, { name: "redirect", - projectID: "pantheon", + projectID: "test-project", body: ShellRequest{ Command: "ls > /tmp/out.txt", }, @@ -228,7 +240,7 @@ func TestProjectsHandler_RunShell(t *testing.T) { }, { name: "rm rf root", - projectID: "pantheon", + projectID: "test-project", body: ShellRequest{ Command: "rm -rf /", }, @@ -281,7 +293,7 @@ func TestProjectsHandler_RunGit(t *testing.T) { }{ { name: "valid git status", - projectID: "pantheon", + projectID: "test-project", body: GitRequest{ Args: []string{"status"}, }, @@ -289,7 +301,7 @@ func TestProjectsHandler_RunGit(t *testing.T) { }, { name: "valid git log", - projectID: "pantheon", + projectID: "test-project", body: GitRequest{ Args: []string{"log", "--oneline", "-10"}, }, @@ -297,7 +309,7 @@ func TestProjectsHandler_RunGit(t *testing.T) { }, { name: "missing args", - projectID: "pantheon", + projectID: "test-project", body: GitRequest{ Args: []string{}, }, @@ -306,7 +318,7 @@ func TestProjectsHandler_RunGit(t *testing.T) { }, { name: "git config blocked", - projectID: "pantheon", + projectID: "test-project", body: GitRequest{ Args: []string{"config", "--global", "user.name", "attacker"}, }, @@ -315,7 +327,7 @@ func TestProjectsHandler_RunGit(t *testing.T) { }, { name: "git remote blocked", - projectID: "pantheon", + projectID: "test-project", body: GitRequest{ Args: []string{"remote", "add", "evil", "https://evil.com/repo"}, }, @@ -324,7 +336,7 @@ func TestProjectsHandler_RunGit(t *testing.T) { }, { name: "force push blocked", - projectID: "pantheon", + projectID: "test-project", body: GitRequest{ Args: []string{"push", "-f", "origin", "main"}, }, @@ -394,9 +406,9 @@ func TestProjectsHandler_InvalidJSON(t *testing.T) { method string path string }{ - {"POST", "/projects/pantheon/claude"}, - {"POST", "/projects/pantheon/shell"}, - {"POST", "/projects/pantheon/git"}, + {"POST", "/projects/test-project/claude"}, + {"POST", "/projects/test-project/shell"}, + {"POST", "/projects/test-project/git"}, } for _, ep := range endpoints { @@ -429,12 +441,12 @@ func TestCommandIDGeneration(t *testing.T) { body := ClaudeRequest{Prompt: "test"} bodyBytes, _ := json.Marshal(body) - req1 := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes)) + req1 := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes)) req1.Header.Set("Content-Type", "application/json") rec1 := httptest.NewRecorder() router.ServeHTTP(rec1, req1) - req2 := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes)) + req2 := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes)) req2.Header.Set("Content-Type", "application/json") rec2 := httptest.NewRecorder() router.ServeHTTP(rec2, req2) @@ -465,7 +477,7 @@ func TestCustomStreamID(t *testing.T) { } bodyBytes, _ := json.Marshal(body) - req := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes)) + req := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes)) req.Header.Set("Content-Type", "application/json") rec := httptest.NewRecorder() router.ServeHTTP(rec, req) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 275a5ba..4456487 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -142,6 +142,22 @@ var ( Name: "rdev_ci_push_failures_total", Help: "Total number of CI image push failures by project", }, []string{"project"}) + + // External system health + externalSystemHealthy = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "rdev_external_system_healthy", + Help: "Whether external system is healthy (1) or not (0)", + }, []string{"system"}) + + externalSystemLatency = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "rdev_external_system_latency_seconds", + Help: "Latency of external system health check in seconds", + }, []string{"system"}) + + externalSystemLastCheck = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "rdev_external_system_last_check_timestamp", + Help: "Unix timestamp of last health check", + }, []string{"system"}) ) // RecordCommand records a command execution. @@ -248,6 +264,17 @@ func RecordCIPushFailure(project string) { ciPushFailures.WithLabelValues(project).Inc() } +// SetExternalSystemHealth updates the health metrics for an external system. +func SetExternalSystemHealth(system string, healthy bool, latencySeconds float64) { + val := 0.0 + if healthy { + val = 1.0 + } + externalSystemHealthy.WithLabelValues(system).Set(val) + externalSystemLatency.WithLabelValues(system).Set(latencySeconds) + externalSystemLastCheck.WithLabelValues(system).Set(float64(time.Now().Unix())) +} + // Handler returns the Prometheus HTTP handler. func Handler() http.Handler { return promhttp.Handler() diff --git a/internal/port/health.go b/internal/port/health.go index 582792e..c07dc42 100644 --- a/internal/port/health.go +++ b/internal/port/health.go @@ -23,3 +23,9 @@ type RegistryChecker interface { // Check returns the health status of the registry. Check(ctx context.Context) domain.RegistryStatus } + +// ExternalHealthChecker checks an external system's health. +type ExternalHealthChecker interface { + // Check returns the health status of the external system. + Check(ctx context.Context) domain.ExternalSystemStatus +} diff --git a/internal/service/diagnostics_service.go b/internal/service/diagnostics_service.go new file mode 100644 index 0000000..66e2b44 --- /dev/null +++ b/internal/service/diagnostics_service.go @@ -0,0 +1,295 @@ +// Package service provides business logic services. +package service + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/orchard9/rdev/internal/domain" + "github.com/orchard9/rdev/internal/port" +) + +// DiagnosticsServiceConfig configures the diagnostics service. +type DiagnosticsServiceConfig struct { + // DefaultGitOwner is the git organization for CI lookups. + DefaultGitOwner string + + // MaxRecentOperations is how many operations to include. + MaxRecentOperations int + + // MaxRecentPipelines is how many pipelines to include. + MaxRecentPipelines int + + Logger *slog.Logger +} + +// DiagnosticsService aggregates project health information from multiple sources. +type DiagnosticsService struct { + operationRepo port.OperationRepository + registryChecker port.RegistryChecker + ciProvider port.CIProvider + + defaultGitOwner string + maxRecentOperations int + maxRecentPipelines int + logger *slog.Logger +} + +// NewDiagnosticsService creates a new diagnostics service. +func NewDiagnosticsService( + operationRepo port.OperationRepository, + registryChecker port.RegistryChecker, + ciProvider port.CIProvider, + cfg DiagnosticsServiceConfig, +) *DiagnosticsService { + logger := cfg.Logger + if logger == nil { + logger = slog.Default() + } + + maxOps := cfg.MaxRecentOperations + if maxOps <= 0 { + maxOps = 10 + } + + maxPipelines := cfg.MaxRecentPipelines + if maxPipelines <= 0 { + maxPipelines = 5 + } + + return &DiagnosticsService{ + operationRepo: operationRepo, + registryChecker: registryChecker, + ciProvider: ciProvider, + defaultGitOwner: cfg.DefaultGitOwner, + maxRecentOperations: maxOps, + maxRecentPipelines: maxPipelines, + logger: logger.With("service", "diagnostics"), + } +} + +// GetDiagnostics returns comprehensive health information for a project. +func (s *DiagnosticsService) GetDiagnostics(ctx context.Context, projectID string) (*domain.ProjectDiagnostics, error) { + diag := &domain.ProjectDiagnostics{ + ProjectID: projectID, + GeneratedAt: time.Now().UTC(), + Summary: domain.DiagnosticsSummaryHealthy, + Issues: []domain.DiagnosticIssue{}, + } + + // Collect data from each source (don't fail if one source fails) + s.collectOperations(ctx, projectID, diag) + s.collectRegistryHealth(ctx, diag) + s.collectCIStatus(ctx, projectID, diag) + + // Determine overall summary + s.calculateSummary(diag) + + return diag, nil +} + +// collectOperations fetches recent operations and extracts issues. +func (s *DiagnosticsService) collectOperations(ctx context.Context, projectID string, diag *domain.ProjectDiagnostics) { + filter := domain.OperationFilters{ + ProjectID: projectID, + Limit: s.maxRecentOperations, + } + + ops, err := s.operationRepo.List(ctx, filter) + if err != nil { + s.logger.Warn("failed to fetch operations for diagnostics", + "error", err, + "project_id", projectID, + ) + diag.Issues = append(diag.Issues, domain.DiagnosticIssue{ + Severity: domain.DiagnosticSeverityWarning, + Source: domain.DiagnosticSourceOperation, + Message: "Unable to fetch operation history", + Details: err.Error(), + }) + return + } + + // Convert to summaries + for _, op := range ops { + summary := domain.OperationSummary{ + ID: op.ID, + Type: op.Type, + Status: op.Status, + StartedAt: op.StartedAt, + DurationMs: op.DurationMs, + Error: op.Error, + ExternalRef: op.ExternalRef, + } + diag.RecentOperations = append(diag.RecentOperations, summary) + + // Extract issues from failed operations + if op.Status == domain.OperationStatusFailed { + issue := domain.DiagnosticIssue{ + Severity: domain.DiagnosticSeverityError, + Source: domain.DiagnosticSourceOperation, + Message: fmt.Sprintf("%s operation failed", op.Type), + Timestamp: op.StartedAt, + } + if op.Error != "" { + issue.Details = op.Error + } + if op.ExternalRef != "" { + issue.Message += fmt.Sprintf(" (%s)", op.ExternalRef) + } + diag.Issues = append(diag.Issues, issue) + } + } +} + +// collectRegistryHealth checks registry status. +func (s *DiagnosticsService) collectRegistryHealth(ctx context.Context, diag *domain.ProjectDiagnostics) { + if s.registryChecker == nil { + return + } + + status := s.registryChecker.Check(ctx) + diag.Registry = &status + + if !status.Healthy { + diag.Issues = append(diag.Issues, domain.DiagnosticIssue{ + Severity: domain.DiagnosticSeverityError, + Source: domain.DiagnosticSourceRegistry, + Message: "Container registry unhealthy", + Details: status.Error, + Timestamp: status.LastChecked, + }) + } +} + +// collectCIStatus fetches CI pipeline information. +func (s *DiagnosticsService) collectCIStatus(ctx context.Context, projectID string, diag *domain.ProjectDiagnostics) { + if s.ciProvider == nil { + diag.CI = &domain.CIDiagnostics{Available: false} + return + } + + owner := s.defaultGitOwner + if owner == "" { + owner = "jordan" // fallback + } + + ciDiag := &domain.CIDiagnostics{Available: true} + + pipelines, err := s.ciProvider.ListPipelines(ctx, owner, projectID) + if err != nil { + s.logger.Warn("failed to fetch pipelines for diagnostics", + "error", err, + "project_id", projectID, + ) + ciDiag.Available = false + diag.CI = ciDiag + return + } + + // Convert to summaries and find failures + var lastFailure *domain.CIPipeline + for i, p := range pipelines { + if i >= s.maxRecentPipelines { + break + } + + summary := domain.CIPipelineSummary{ + Number: p.Number, + Status: p.Status, + Branch: p.Branch, + Commit: p.Commit, + StartedAt: p.Started, + } + if p.Finished.After(p.Started) { + summary.Duration = p.Finished.Sub(p.Started).Round(time.Second).String() + } + ciDiag.RecentPipelines = append(ciDiag.RecentPipelines, summary) + + // Track the most recent failure + if p.Status == "failure" && lastFailure == nil { + lastFailure = p + } + } + + // Get details on the last failure + if lastFailure != nil { + failure := s.getFailureDetails(ctx, owner, projectID, lastFailure) + ciDiag.LastFailure = failure + + // Add as issue + issue := domain.DiagnosticIssue{ + Severity: domain.DiagnosticSeverityError, + Source: domain.DiagnosticSourceCI, + Message: fmt.Sprintf("CI build #%d failed", lastFailure.Number), + Timestamp: lastFailure.Finished, + } + if failure != nil && failure.FailedStep != "" { + issue.Message += fmt.Sprintf(" at step '%s'", failure.FailedStep) + if failure.Error != "" { + issue.Details = failure.Error + } + } + diag.Issues = append(diag.Issues, issue) + } + + diag.CI = ciDiag +} + +// getFailureDetails fetches step-level details for a failed pipeline. +func (s *DiagnosticsService) getFailureDetails(ctx context.Context, owner, repo string, pipeline *domain.CIPipeline) *domain.CIPipelineFailure { + failure := &domain.CIPipelineFailure{ + Number: pipeline.Number, + Timestamp: pipeline.Finished, + } + + steps, err := s.ciProvider.GetPipelineSteps(ctx, owner, repo, pipeline.Number) + if err != nil { + s.logger.Warn("failed to fetch pipeline steps", + "error", err, + "pipeline", pipeline.Number, + ) + return failure + } + + failure.URL = steps.URL + + // Find the failed step + for _, step := range steps.Steps { + if step.Status == "failure" || step.Status == "error" { + failure.FailedStep = step.Name + failure.Error = step.Error + if step.Log != "" { + failure.LogTail = step.Log + } + break + } + } + + return failure +} + +// calculateSummary determines the overall health status. +func (s *DiagnosticsService) calculateSummary(diag *domain.ProjectDiagnostics) { + errorCount := 0 + warningCount := 0 + + for _, issue := range diag.Issues { + switch issue.Severity { + case domain.DiagnosticSeverityError: + errorCount++ + case domain.DiagnosticSeverityWarning: + warningCount++ + } + } + + if errorCount > 0 { + diag.Summary = domain.DiagnosticsSummaryUnhealthy + } else if warningCount > 0 { + diag.Summary = domain.DiagnosticsSummaryDegraded + } else { + diag.Summary = domain.DiagnosticsSummaryHealthy + } +} diff --git a/internal/service/diagnostics_service_test.go b/internal/service/diagnostics_service_test.go new file mode 100644 index 0000000..fcf2fb7 --- /dev/null +++ b/internal/service/diagnostics_service_test.go @@ -0,0 +1,302 @@ +package service + +import ( + "context" + "testing" + "time" + + "github.com/orchard9/rdev/internal/domain" +) + +// mockOperationRepo implements port.OperationRepository for testing. +type mockOperationRepo struct { + operations []*domain.Operation + err error +} + +func (m *mockOperationRepo) Create(_ context.Context, _ *domain.Operation) error { + return nil +} + +func (m *mockOperationRepo) Update(_ context.Context, _ *domain.Operation) error { + return nil +} + +func (m *mockOperationRepo) Get(_ context.Context, _ string) (*domain.Operation, error) { + return nil, domain.ErrOperationNotFound +} + +func (m *mockOperationRepo) GetByCommitSHA(_ context.Context, _, _ string) (*domain.Operation, error) { + return nil, domain.ErrOperationNotFound +} + +func (m *mockOperationRepo) List(_ context.Context, filter domain.OperationFilters) ([]*domain.Operation, error) { + if m.err != nil { + return nil, m.err + } + var result []*domain.Operation + for _, op := range m.operations { + if filter.ProjectID != "" && op.ProjectID != filter.ProjectID { + continue + } + result = append(result, op) + } + return result, nil +} + +func (m *mockOperationRepo) AddStep(_ context.Context, _ string, _ domain.OperationStep) error { + return nil +} + +func (m *mockOperationRepo) UpdateStep(_ context.Context, _ string, _ domain.OperationStep) error { + return nil +} + +func (m *mockOperationRepo) Complete(_ context.Context, _ string, _ domain.OperationStatus, _ map[string]any, _, _ string) error { + return nil +} + +func (m *mockOperationRepo) SetCommitSHA(_ context.Context, _, _ string) error { + return nil +} + +func (m *mockOperationRepo) SetTriggeredBy(_ context.Context, _, _ string) error { + return nil +} + +func (m *mockOperationRepo) DeleteOlderThan(_ context.Context, _ time.Time) (int64, error) { + return 0, nil +} + +// mockRegistryChecker implements port.RegistryChecker for testing. +type mockRegistryChecker struct { + status domain.RegistryStatus +} + +func (m *mockRegistryChecker) Check(_ context.Context) domain.RegistryStatus { + return m.status +} + +// mockCIProvider implements port.CIProvider for testing. +type mockCIProvider struct { + pipelines []*domain.CIPipeline + steps *domain.CIPipelineSteps + err error +} + +func (m *mockCIProvider) ActivateRepo(_ context.Context, _, _, _ string) (*domain.CIRepo, error) { + return nil, nil +} + +func (m *mockCIProvider) DeactivateRepo(_ context.Context, _, _ string) error { + return nil +} + +func (m *mockCIProvider) GetRepo(_ context.Context, _, _ string) (*domain.CIRepo, error) { + return nil, nil +} + +func (m *mockCIProvider) ListRepos(_ context.Context) ([]*domain.CIRepo, error) { + return nil, nil +} + +func (m *mockCIProvider) AddSecret(_ context.Context, _, _ string, _ domain.CISecret) error { + return nil +} + +func (m *mockCIProvider) DeleteSecret(_ context.Context, _, _, _ string) error { + return nil +} + +func (m *mockCIProvider) ListPipelines(_ context.Context, _, _ string) ([]*domain.CIPipeline, error) { + if m.err != nil { + return nil, m.err + } + return m.pipelines, nil +} + +func (m *mockCIProvider) GetPipeline(_ context.Context, _, _ string, _ int64) (*domain.CIPipeline, error) { + return nil, nil +} + +func (m *mockCIProvider) GetPipelineSteps(_ context.Context, _, _ string, _ int64) (*domain.CIPipelineSteps, error) { + if m.steps != nil { + return m.steps, nil + } + return nil, nil +} + +func (m *mockCIProvider) TriggerBuild(_ context.Context, _, _, _ string) (int64, error) { + return 0, nil +} + +func TestDiagnosticsService_GetDiagnostics_Healthy(t *testing.T) { + opRepo := &mockOperationRepo{ + operations: []*domain.Operation{ + { + ID: "op-1", + ProjectID: "test-project", + Type: domain.OperationTypeBuild, + Status: domain.OperationStatusCompleted, + StartedAt: time.Now().Add(-1 * time.Hour), + }, + }, + } + registry := &mockRegistryChecker{ + status: domain.RegistryStatus{ + Healthy: true, + URL: "https://registry.example.com", + Latency: "10ms", + LastChecked: time.Now(), + }, + } + ci := &mockCIProvider{ + pipelines: []*domain.CIPipeline{ + { + Number: 42, + Status: "success", + Branch: "main", + Commit: "abc123", + }, + }, + } + + svc := NewDiagnosticsService(opRepo, registry, ci, DiagnosticsServiceConfig{ + DefaultGitOwner: "test-org", + }) + + diag, err := svc.GetDiagnostics(context.Background(), "test-project") + if err != nil { + t.Fatalf("GetDiagnostics() error = %v", err) + } + + if diag.ProjectID != "test-project" { + t.Errorf("ProjectID = %q, want %q", diag.ProjectID, "test-project") + } + if diag.Summary != domain.DiagnosticsSummaryHealthy { + t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryHealthy) + } + if len(diag.Issues) != 0 { + t.Errorf("Issues count = %d, want 0", len(diag.Issues)) + } + if len(diag.RecentOperations) != 1 { + t.Errorf("RecentOperations count = %d, want 1", len(diag.RecentOperations)) + } + if diag.Registry == nil { + t.Error("Registry is nil, want non-nil") + } + if diag.CI == nil || !diag.CI.Available { + t.Error("CI not available") + } +} + +func TestDiagnosticsService_GetDiagnostics_Unhealthy(t *testing.T) { + opRepo := &mockOperationRepo{ + operations: []*domain.Operation{ + { + ID: "op-1", + ProjectID: "test-project", + Type: domain.OperationTypeBuild, + Status: domain.OperationStatusFailed, + StartedAt: time.Now().Add(-1 * time.Hour), + Error: "deployment failed", + }, + }, + } + registry := &mockRegistryChecker{ + status: domain.RegistryStatus{ + Healthy: false, + URL: "https://registry.example.com", + Error: "connection refused", + LastChecked: time.Now(), + }, + } + ci := &mockCIProvider{ + pipelines: []*domain.CIPipeline{ + { + Number: 43, + Status: "failure", + Branch: "main", + Commit: "def456", + Finished: time.Now().Add(-30 * time.Minute), + }, + }, + steps: &domain.CIPipelineSteps{ + PipelineNumber: 43, + URL: "https://ci.example.com/build/43", + Steps: []domain.CIPipelineStep{ + { + Name: "test", + Status: "failure", + Error: "tests failed", + Log: "FAIL: TestSomething", + }, + }, + }, + } + + svc := NewDiagnosticsService(opRepo, registry, ci, DiagnosticsServiceConfig{ + DefaultGitOwner: "test-org", + }) + + diag, err := svc.GetDiagnostics(context.Background(), "test-project") + if err != nil { + t.Fatalf("GetDiagnostics() error = %v", err) + } + + if diag.Summary != domain.DiagnosticsSummaryUnhealthy { + t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryUnhealthy) + } + + // Should have 3 issues: failed operation, unhealthy registry, failed CI + if len(diag.Issues) < 3 { + t.Errorf("Issues count = %d, want at least 3", len(diag.Issues)) + } + + // Check CI failure details + if diag.CI == nil || diag.CI.LastFailure == nil { + t.Fatal("CI.LastFailure is nil") + } + if diag.CI.LastFailure.FailedStep != "test" { + t.Errorf("FailedStep = %q, want %q", diag.CI.LastFailure.FailedStep, "test") + } +} + +func TestDiagnosticsService_GetDiagnostics_Degraded(t *testing.T) { + opRepo := &mockOperationRepo{ + operations: []*domain.Operation{ + { + ID: "op-1", + ProjectID: "test-project", + Type: domain.OperationTypeBuild, + Status: domain.OperationStatusCompleted, + StartedAt: time.Now().Add(-1 * time.Hour), + }, + }, + } + registry := &mockRegistryChecker{ + status: domain.RegistryStatus{ + Healthy: true, + URL: "https://registry.example.com", + LastChecked: time.Now(), + }, + } + + // No CI provider - should produce a warning but not error + svc := NewDiagnosticsService(opRepo, registry, nil, DiagnosticsServiceConfig{ + DefaultGitOwner: "test-org", + }) + + diag, err := svc.GetDiagnostics(context.Background(), "test-project") + if err != nil { + t.Fatalf("GetDiagnostics() error = %v", err) + } + + // Without CI, status should still be healthy (CI unavailable is not an issue) + if diag.Summary != domain.DiagnosticsSummaryHealthy { + t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryHealthy) + } + if diag.CI == nil || diag.CI.Available { + t.Error("CI should be not available when no provider") + } +} diff --git a/internal/worker/external_health.go b/internal/worker/external_health.go new file mode 100644 index 0000000..e57ef86 --- /dev/null +++ b/internal/worker/external_health.go @@ -0,0 +1,248 @@ +package worker + +import ( + "context" + "log/slog" + "sync" + "time" + + "github.com/orchard9/rdev/internal/domain" + "github.com/orchard9/rdev/internal/metrics" + "github.com/orchard9/rdev/internal/port" +) + +// ExternalHealthChecker runs periodic health checks on external systems +// (registry, CI, git) and caches the results for the /ready endpoint. +type ExternalHealthChecker struct { + registry port.RegistryChecker // zot + ci port.ExternalHealthChecker // woodpecker + git port.ExternalHealthChecker // gitea + + interval time.Duration + logger *slog.Logger + + // Internal state (thread-safe) + mu sync.RWMutex + statuses map[domain.ExternalSystem]domain.ExternalSystemStatus + + // Lifecycle + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup +} + +// ExternalHealthConfig configures the health checker. +type ExternalHealthConfig struct { + // CheckInterval is how often to check external systems. Default: 30s. + CheckInterval time.Duration + Logger *slog.Logger +} + +// DefaultExternalHealthConfig returns sensible defaults. +func DefaultExternalHealthConfig() ExternalHealthConfig { + return ExternalHealthConfig{ + CheckInterval: 30 * time.Second, + Logger: slog.Default(), + } +} + +// NewExternalHealthChecker creates a new external health checker. +// All checker parameters are optional (nil means skip that system). +func NewExternalHealthChecker( + registry port.RegistryChecker, + ci port.ExternalHealthChecker, + git port.ExternalHealthChecker, + cfg ExternalHealthConfig, +) *ExternalHealthChecker { + if cfg.CheckInterval == 0 { + cfg.CheckInterval = 30 * time.Second + } + if cfg.Logger == nil { + cfg.Logger = slog.Default() + } + + ctx, cancel := context.WithCancel(context.Background()) + + return &ExternalHealthChecker{ + registry: registry, + ci: ci, + git: git, + interval: cfg.CheckInterval, + logger: cfg.Logger.With("component", "external-health"), + statuses: make(map[domain.ExternalSystem]domain.ExternalSystemStatus), + ctx: ctx, + cancel: cancel, + } +} + +// Start begins the background check loop. +func (c *ExternalHealthChecker) Start() { + c.logger.Info("external health checker started", "interval", c.interval) + + c.wg.Add(1) + go c.checkLoop() +} + +// Stop gracefully shuts down the checker. +func (c *ExternalHealthChecker) Stop() { + c.logger.Info("external health checker stopping") + c.cancel() + c.wg.Wait() + c.logger.Info("external health checker stopped") +} + +// GetStatus returns the cached status for a specific system. +func (c *ExternalHealthChecker) GetStatus(system domain.ExternalSystem) (domain.ExternalSystemStatus, bool) { + c.mu.RLock() + defer c.mu.RUnlock() + status, ok := c.statuses[system] + return status, ok +} + +// GetAllStatuses returns a copy of all cached statuses. +func (c *ExternalHealthChecker) GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus { + c.mu.RLock() + defer c.mu.RUnlock() + + result := make(map[domain.ExternalSystem]domain.ExternalSystemStatus, len(c.statuses)) + for k, v := range c.statuses { + result[k] = v + } + return result +} + +// checkLoop runs periodic health checks. +func (c *ExternalHealthChecker) checkLoop() { + defer c.wg.Done() + + // Run immediately on start + c.runChecks() + + ticker := time.NewTicker(c.interval) + defer ticker.Stop() + + for { + select { + case <-c.ctx.Done(): + return + case <-ticker.C: + c.runChecks() + } + } +} + +// runChecks performs health checks on all configured systems in parallel. +func (c *ExternalHealthChecker) runChecks() { + ctx, cancel := context.WithTimeout(c.ctx, 10*time.Second) + defer cancel() + + var wg sync.WaitGroup + results := make(chan domain.ExternalSystemStatus, 3) + + // Check registry (zot) + if c.registry != nil { + wg.Add(1) + go func() { + defer wg.Done() + regStatus := c.registry.Check(ctx) + // Convert domain.RegistryStatus to domain.ExternalSystemStatus + status := domain.ExternalSystemStatus{ + System: domain.ExternalSystemRegistry, + Healthy: regStatus.Healthy, + URL: regStatus.URL, + Error: regStatus.Error, + LastChecked: regStatus.LastChecked, + } + // Parse latency string (e.g., "45ms") to duration + if regStatus.Latency != "" { + if d, err := time.ParseDuration(regStatus.Latency); err == nil { + status.Latency = d + } + } + if status.Healthy { + status.LastHealthy = status.LastChecked + } + results <- status + }() + } + + // Check CI (woodpecker) + if c.ci != nil { + wg.Add(1) + go func() { + defer wg.Done() + results <- c.ci.Check(ctx) + }() + } + + // Check git (gitea) + if c.git != nil { + wg.Add(1) + go func() { + defer wg.Done() + results <- c.git.Check(ctx) + }() + } + + // Wait for all checks to complete, then close results channel + go func() { + wg.Wait() + close(results) + }() + + // Collect results and update state + for status := range results { + c.updateStatus(status) + } +} + +// updateStatus updates cached status and logs/metrics on state changes. +func (c *ExternalHealthChecker) updateStatus(status domain.ExternalSystemStatus) { + c.mu.Lock() + defer c.mu.Unlock() + + prev, existed := c.statuses[status.System] + + // Preserve LastHealthy from previous status if current is unhealthy + if !status.Healthy && existed && !prev.LastHealthy.IsZero() { + status.LastHealthy = prev.LastHealthy + } + + c.statuses[status.System] = status + + // Log state transitions + if !existed { + // First check + if status.Healthy { + c.logger.Info("external system healthy", + "system", status.System, + "url", status.URL, + "latency", status.Latency, + ) + } else { + c.logger.Warn("external system unhealthy", + "system", status.System, + "url", status.URL, + "error", status.Error, + ) + } + } else if prev.Healthy != status.Healthy { + // State changed + if status.Healthy { + c.logger.Info("external system recovered", + "system", status.System, + "url", status.URL, + "latency", status.Latency, + ) + } else { + c.logger.Warn("external system became unhealthy", + "system", status.System, + "url", status.URL, + "error", status.Error, + ) + } + } + + // Update Prometheus metrics + metrics.SetExternalSystemHealth(string(status.System), status.Healthy, status.Latency.Seconds()) +} diff --git a/internal/worker/external_health_test.go b/internal/worker/external_health_test.go new file mode 100644 index 0000000..45ca7c0 --- /dev/null +++ b/internal/worker/external_health_test.go @@ -0,0 +1,241 @@ +package worker + +import ( + "context" + "testing" + "time" + + "github.com/orchard9/rdev/internal/domain" +) + +// mockRegistryChecker is a mock implementation of port.RegistryChecker. +type mockRegistryChecker struct { + healthy bool + err string + latency time.Duration +} + +func (m *mockRegistryChecker) Check(_ context.Context) domain.RegistryStatus { + status := domain.RegistryStatus{ + Healthy: m.healthy, + URL: "https://registry.test", + Latency: m.latency.String(), + LastChecked: time.Now().UTC(), + } + if !m.healthy { + status.Error = m.err + } + return status +} + +// mockExternalHealthChecker is a mock implementation of port.ExternalHealthChecker. +type mockExternalHealthChecker struct { + system domain.ExternalSystem + healthy bool + err string + latency time.Duration +} + +func (m *mockExternalHealthChecker) Check(_ context.Context) domain.ExternalSystemStatus { + status := domain.ExternalSystemStatus{ + System: m.system, + Healthy: m.healthy, + URL: "https://test.system", + Latency: m.latency, + LastChecked: time.Now().UTC(), + } + if m.healthy { + status.LastHealthy = status.LastChecked + } else { + status.Error = m.err + } + return status +} + +func TestExternalHealthChecker_GetStatus(t *testing.T) { + registry := &mockRegistryChecker{healthy: true, latency: 50 * time.Millisecond} + ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: true, latency: 100 * time.Millisecond} + git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true, latency: 75 * time.Millisecond} + + checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{ + CheckInterval: 100 * time.Millisecond, + }) + + // Run checks synchronously + checker.runChecks() + + // Verify registry status + regStatus, ok := checker.GetStatus(domain.ExternalSystemRegistry) + if !ok { + t.Fatal("expected registry status to exist") + } + if !regStatus.Healthy { + t.Error("expected registry to be healthy") + } + + // Verify CI status + ciStatus, ok := checker.GetStatus(domain.ExternalSystemCI) + if !ok { + t.Fatal("expected CI status to exist") + } + if !ciStatus.Healthy { + t.Error("expected CI to be healthy") + } + + // Verify Git status + gitStatus, ok := checker.GetStatus(domain.ExternalSystemGit) + if !ok { + t.Fatal("expected Git status to exist") + } + if !gitStatus.Healthy { + t.Error("expected Git to be healthy") + } +} + +func TestExternalHealthChecker_GetAllStatuses(t *testing.T) { + registry := &mockRegistryChecker{healthy: true} + ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: false, err: "connection refused"} + git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true} + + checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{ + CheckInterval: 100 * time.Millisecond, + }) + + checker.runChecks() + + statuses := checker.GetAllStatuses() + + if len(statuses) != 3 { + t.Fatalf("expected 3 statuses, got %d", len(statuses)) + } + + if statuses[domain.ExternalSystemRegistry].Healthy != true { + t.Error("expected registry to be healthy") + } + if statuses[domain.ExternalSystemCI].Healthy != false { + t.Error("expected CI to be unhealthy") + } + if statuses[domain.ExternalSystemCI].Error != "connection refused" { + t.Errorf("expected CI error 'connection refused', got %q", statuses[domain.ExternalSystemCI].Error) + } + if statuses[domain.ExternalSystemGit].Healthy != true { + t.Error("expected Git to be healthy") + } +} + +func TestExternalHealthChecker_NilCheckers(t *testing.T) { + // All nil checkers should result in empty statuses + checker := NewExternalHealthChecker(nil, nil, nil, ExternalHealthConfig{ + CheckInterval: 100 * time.Millisecond, + }) + + checker.runChecks() + + statuses := checker.GetAllStatuses() + if len(statuses) != 0 { + t.Fatalf("expected 0 statuses with nil checkers, got %d", len(statuses)) + } +} + +func TestExternalHealthChecker_StartStop(t *testing.T) { + registry := &mockRegistryChecker{healthy: true} + + checker := NewExternalHealthChecker(registry, nil, nil, ExternalHealthConfig{ + CheckInterval: 50 * time.Millisecond, + }) + + checker.Start() + + // Wait for a couple of check cycles + time.Sleep(120 * time.Millisecond) + + // Verify status was populated + status, ok := checker.GetStatus(domain.ExternalSystemRegistry) + if !ok { + t.Fatal("expected registry status after start") + } + if !status.Healthy { + t.Error("expected registry to be healthy") + } + + checker.Stop() + + // After stop, statuses should still be available (cached) + status, ok = checker.GetStatus(domain.ExternalSystemRegistry) + if !ok { + t.Fatal("expected registry status after stop") + } +} + +func TestExternalHealthChecker_StateTransition(t *testing.T) { + registry := &mockRegistryChecker{healthy: true} + + checker := NewExternalHealthChecker(registry, nil, nil, ExternalHealthConfig{ + CheckInterval: 100 * time.Millisecond, + }) + + // Initial check - healthy + checker.runChecks() + status, _ := checker.GetStatus(domain.ExternalSystemRegistry) + if !status.Healthy { + t.Error("expected initial status to be healthy") + } + firstHealthy := status.LastHealthy + + // Change to unhealthy + registry.healthy = false + registry.err = "connection refused" + checker.runChecks() + + status, _ = checker.GetStatus(domain.ExternalSystemRegistry) + if status.Healthy { + t.Error("expected status to be unhealthy after state change") + } + // LastHealthy should be preserved from when it was healthy + if status.LastHealthy.IsZero() { + t.Error("expected LastHealthy to be preserved") + } + if !status.LastHealthy.Equal(firstHealthy) { + t.Error("expected LastHealthy to remain from healthy period") + } + + // Recover to healthy + registry.healthy = true + registry.err = "" + checker.runChecks() + + status, _ = checker.GetStatus(domain.ExternalSystemRegistry) + if !status.Healthy { + t.Error("expected status to be healthy after recovery") + } + // LastHealthy should be updated + if status.LastHealthy.Before(firstHealthy) { + t.Error("expected LastHealthy to be updated on recovery") + } +} + +func TestExternalHealthChecker_PartialFailure(t *testing.T) { + // Registry healthy, CI unhealthy, Git healthy + registry := &mockRegistryChecker{healthy: true} + ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: false, err: "timeout"} + git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true} + + checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{ + CheckInterval: 100 * time.Millisecond, + }) + + checker.runChecks() + + statuses := checker.GetAllStatuses() + + // Partial failure should not affect other systems + if !statuses[domain.ExternalSystemRegistry].Healthy { + t.Error("registry should be healthy despite CI failure") + } + if statuses[domain.ExternalSystemCI].Healthy { + t.Error("CI should be unhealthy") + } + if !statuses[domain.ExternalSystemGit].Healthy { + t.Error("git should be healthy despite CI failure") + } +}