feat: add diagnostics endpoint and external health monitoring
- Add /diagnostics endpoint for system health overview - Add external health worker for monitoring Gitea, Woodpecker, Registry - Add health check methods to Gitea and Woodpecker clients - Remove hardcoded fallback projects (pantheon, aeries) - Add diagnostics domain types and service layer - Add comprehensive tests for diagnostics handler and service - Fix tests to use registered test project instead of hardcoded one Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
9128dd23b5
commit
210064d490
49
ai-lookup/services/external-health.md
Normal file
49
ai-lookup/services/external-health.md
Normal file
@ -0,0 +1,49 @@
|
||||
# External Health Checker
|
||||
|
||||
**Last Updated:** 2026-02-03
|
||||
**Confidence:** High
|
||||
|
||||
## Summary
|
||||
|
||||
Background worker that continuously monitors external systems (registry, CI, git) and surfaces issues proactively via metrics, logs, and the `/ready` endpoint. Runs every 30s, caches results for instant lookups, and logs state transitions.
|
||||
|
||||
**Key Facts:**
|
||||
- Monitors: `registry` (zot), `ci` (woodpecker), `git` (gitea)
|
||||
- Check interval: 30 seconds (configurable)
|
||||
- Caches results for `/ready` endpoint (no blocking network calls)
|
||||
- Logs only on state changes (healthy→unhealthy, unhealthy→healthy)
|
||||
- Preserves `LastHealthy` timestamp through unhealthy periods
|
||||
|
||||
**File Pointers:**
|
||||
- Domain types: `internal/domain/external_health.go`
|
||||
- Worker implementation: `internal/worker/external_health.go`
|
||||
- Port interface: `internal/port/health.go:ExternalHealthChecker`
|
||||
- Handler integration: `internal/handlers/health.go:WithExternalHealthChecker`
|
||||
- Wiring: `cmd/rdev-api/main.go:433-455`
|
||||
|
||||
## How It Works
|
||||
|
||||
1. Background goroutine polls all configured external systems every 30s
|
||||
2. Checks run in parallel with 10s timeout per system
|
||||
3. Results cached in thread-safe map
|
||||
4. `/ready` reads cached statuses (no network calls)
|
||||
5. Prometheus metrics updated on each check cycle
|
||||
|
||||
**Adapter implementations:**
|
||||
- Registry: `internal/adapter/zot/client.go:Check()` - calls `/v2/` endpoint
|
||||
- CI: `internal/adapter/woodpecker/client.go:Check()` - calls `Self()` API
|
||||
- Git: `internal/adapter/gitea/client.go:Check()` - calls `ListMyOrgs()`
|
||||
|
||||
## Prometheus Metrics
|
||||
|
||||
| Metric | Type | Labels | Description |
|
||||
|--------|------|--------|-------------|
|
||||
| `rdev_external_system_healthy` | Gauge | `system` | 1=healthy, 0=unhealthy |
|
||||
| `rdev_external_system_latency_seconds` | Gauge | `system` | Check latency |
|
||||
| `rdev_external_system_last_check_timestamp` | Gauge | `system` | Unix timestamp of last check |
|
||||
|
||||
## Related Topics
|
||||
|
||||
- [Work Queue](./work-queue.md) - Uses similar background worker pattern
|
||||
- [CI Provider](./ci-provider.md) - Woodpecker adapter details
|
||||
- [Worker Pool](./worker-pool.md) - Another background worker example
|
||||
@ -418,12 +418,43 @@ func main() {
|
||||
logger.Info("registry health checker initialized", "url", registryURL)
|
||||
}
|
||||
|
||||
// Initialize diagnostics service (aggregates health data for debugging)
|
||||
diagnosticsService := service.NewDiagnosticsService(
|
||||
operationRepo,
|
||||
registryChecker,
|
||||
woodpeckerClient,
|
||||
service.DiagnosticsServiceConfig{
|
||||
DefaultGitOwner: infraCfg.GiteaDefaultOrg,
|
||||
Logger: logger,
|
||||
},
|
||||
)
|
||||
diagnosticsHandler := handlers.NewDiagnosticsHandler(diagnosticsService, projectRepo, logger)
|
||||
|
||||
// Initialize external health checker (background monitoring of registry, CI, git)
|
||||
var externalHealthChecker *worker.ExternalHealthChecker
|
||||
if registryChecker != nil || woodpeckerClient != nil || giteaClient != nil {
|
||||
externalHealthChecker = worker.NewExternalHealthChecker(
|
||||
registryChecker,
|
||||
woodpeckerClient,
|
||||
giteaClient,
|
||||
worker.ExternalHealthConfig{
|
||||
CheckInterval: 30 * time.Second,
|
||||
Logger: logger,
|
||||
},
|
||||
)
|
||||
externalHealthChecker.Start()
|
||||
logger.Info("external health checker started")
|
||||
}
|
||||
|
||||
// Override default health/ready endpoints with full dependency checks
|
||||
healthHandler := handlers.NewHealthHandler("rdev-api", database.DB, nil).
|
||||
WithAgentRegistry(agentRegistry)
|
||||
if registryChecker != nil {
|
||||
healthHandler = healthHandler.WithRegistryChecker(registryChecker)
|
||||
}
|
||||
if externalHealthChecker != nil {
|
||||
healthHandler = healthHandler.WithExternalHealthChecker(externalHealthChecker)
|
||||
}
|
||||
|
||||
app.Router().Get("/health", healthHandler.Health)
|
||||
app.Router().Get("/ready", healthHandler.Ready)
|
||||
@ -448,6 +479,7 @@ func main() {
|
||||
buildsHandler.Mount(app.Router())
|
||||
createAndBuildHandler.Mount(app.Router())
|
||||
operationsHandler.Mount(app.Router())
|
||||
diagnosticsHandler.Mount(app.Router())
|
||||
sdlcHandler.Mount(app.Router())
|
||||
sdlcOrchestratorHandler.Mount(app.Router())
|
||||
|
||||
@ -514,6 +546,9 @@ func main() {
|
||||
app.EnableDocs(buildOpenAPISpec())
|
||||
|
||||
app.OnShutdown(func(ctx context.Context) error {
|
||||
if externalHealthChecker != nil {
|
||||
externalHealthChecker.Stop()
|
||||
}
|
||||
workExecutor.Stop()
|
||||
queueMaintenance.Stop()
|
||||
operationCleanup.Stop()
|
||||
|
||||
@ -11,18 +11,21 @@ package gitea
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"code.gitea.io/sdk/gitea"
|
||||
"github.com/orchard9/rdev/internal/domain"
|
||||
"github.com/orchard9/rdev/internal/port"
|
||||
)
|
||||
|
||||
// Ensure Client implements GitRepository.
|
||||
// Ensure Client implements GitRepository and ExternalHealthChecker.
|
||||
var _ port.GitRepository = (*Client)(nil)
|
||||
var _ port.ExternalHealthChecker = (*Client)(nil)
|
||||
|
||||
// Client is a Gitea API client adapter.
|
||||
type Client struct {
|
||||
client *gitea.Client
|
||||
url string // Gitea server URL for health checks
|
||||
defaultOwner string // default organization/user for new repos
|
||||
}
|
||||
|
||||
@ -43,6 +46,7 @@ func NewClient(url, token, defaultOwner string) (*Client, error) {
|
||||
}
|
||||
return &Client{
|
||||
client: client,
|
||||
url: url,
|
||||
defaultOwner: defaultOwner,
|
||||
}, nil
|
||||
}
|
||||
@ -208,6 +212,45 @@ func (c *Client) DeleteWebhook(ctx context.Context, owner, repo string, webhookI
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check returns the health status of the Gitea server.
|
||||
// Implements port.ExternalHealthChecker.
|
||||
func (c *Client) Check(ctx context.Context) domain.ExternalSystemStatus {
|
||||
start := time.Now()
|
||||
status := domain.ExternalSystemStatus{
|
||||
System: domain.ExternalSystemGit,
|
||||
URL: c.url,
|
||||
}
|
||||
|
||||
// Gitea SDK doesn't support context propagation for HTTP requests,
|
||||
// but check for cancellation before making the call.
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
status.Latency = time.Since(start)
|
||||
status.LastChecked = time.Now().UTC()
|
||||
status.Healthy = false
|
||||
status.Error = ctx.Err().Error()
|
||||
return status
|
||||
default:
|
||||
}
|
||||
|
||||
// Call ListMyOrgs (lightweight, tests auth)
|
||||
_, _, err := c.client.ListMyOrgs(gitea.ListOrgsOptions{
|
||||
ListOptions: gitea.ListOptions{PageSize: 1},
|
||||
})
|
||||
status.Latency = time.Since(start)
|
||||
status.LastChecked = time.Now().UTC()
|
||||
|
||||
if err != nil {
|
||||
status.Healthy = false
|
||||
status.Error = err.Error()
|
||||
} else {
|
||||
status.Healthy = true
|
||||
status.LastHealthy = status.LastChecked
|
||||
}
|
||||
|
||||
return status
|
||||
}
|
||||
|
||||
// repoFromGitea converts a gitea.Repository to domain.Repo.
|
||||
func repoFromGitea(r *gitea.Repository) *domain.Repo {
|
||||
return &domain.Repo{
|
||||
|
||||
@ -61,23 +61,9 @@ func NewProjectRepositoryWithClient(namespace string, client *kubernetes.Clients
|
||||
}
|
||||
|
||||
// initFallbackProjects adds hardcoded projects for when K8s client is unavailable.
|
||||
// Currently empty - projects are discovered dynamically from K8s or stored in the database.
|
||||
func (r *ProjectRepository) initFallbackProjects() {
|
||||
r.projects["pantheon"] = &domain.Project{
|
||||
ID: "pantheon",
|
||||
Name: "Pantheon",
|
||||
Description: "Go API backend",
|
||||
PodName: "claudebox-pantheon-0",
|
||||
Status: domain.ProjectStatusUnknown,
|
||||
Workspace: "/workspace",
|
||||
}
|
||||
r.projects["aeries"] = &domain.Project{
|
||||
ID: "aeries",
|
||||
Name: "Aeries",
|
||||
Description: "Note community platform",
|
||||
PodName: "claudebox-aeries-0",
|
||||
Status: domain.ProjectStatusUnknown,
|
||||
Workspace: "/workspace",
|
||||
}
|
||||
// No hardcoded fallback projects
|
||||
}
|
||||
|
||||
// Ensure ProjectRepository implements port.ProjectRepository at compile time.
|
||||
|
||||
@ -28,8 +28,9 @@ import (
|
||||
"github.com/orchard9/rdev/internal/port"
|
||||
)
|
||||
|
||||
// Ensure Client implements CIProvider.
|
||||
// Ensure Client implements CIProvider and ExternalHealthChecker.
|
||||
var _ port.CIProvider = (*Client)(nil)
|
||||
var _ port.ExternalHealthChecker = (*Client)(nil)
|
||||
|
||||
// tokenTransport is an http.RoundTripper that adds bearer token auth.
|
||||
type tokenTransport struct {
|
||||
@ -331,6 +332,42 @@ func (c *Client) DeleteSecret(ctx context.Context, owner, repo, secretName strin
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check returns the health status of the Woodpecker CI system.
|
||||
// Implements port.ExternalHealthChecker.
|
||||
func (c *Client) Check(ctx context.Context) domain.ExternalSystemStatus {
|
||||
start := time.Now()
|
||||
status := domain.ExternalSystemStatus{
|
||||
System: domain.ExternalSystemCI,
|
||||
URL: c.url,
|
||||
}
|
||||
|
||||
// Check context cancellation
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
status.Latency = time.Since(start)
|
||||
status.LastChecked = time.Now().UTC()
|
||||
status.Healthy = false
|
||||
status.Error = ctx.Err().Error()
|
||||
return status
|
||||
default:
|
||||
}
|
||||
|
||||
// Call Self() to get current user info (lightweight, tests auth)
|
||||
_, err := c.client.Self()
|
||||
status.Latency = time.Since(start)
|
||||
status.LastChecked = time.Now().UTC()
|
||||
|
||||
if err != nil {
|
||||
status.Healthy = false
|
||||
status.Error = err.Error()
|
||||
} else {
|
||||
status.Healthy = true
|
||||
status.LastHealthy = status.LastChecked
|
||||
}
|
||||
|
||||
return status
|
||||
}
|
||||
|
||||
// repoFromWoodpecker converts a woodpecker.Repo to domain.CIRepo.
|
||||
func repoFromWoodpecker(r *woodpecker.Repo) *domain.CIRepo {
|
||||
// Parse forge remote ID (string in SDK, int64 in our domain)
|
||||
|
||||
110
internal/domain/diagnostics.go
Normal file
110
internal/domain/diagnostics.go
Normal file
@ -0,0 +1,110 @@
|
||||
package domain
|
||||
|
||||
import "time"
|
||||
|
||||
// ProjectDiagnostics provides a unified view of project health for debugging.
|
||||
// It aggregates data from operations, CI pipelines, and registry health.
|
||||
type ProjectDiagnostics struct {
|
||||
// ProjectID is the project being diagnosed.
|
||||
ProjectID string `json:"project_id"`
|
||||
|
||||
// GeneratedAt is when this diagnostic was generated.
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
|
||||
// Summary is a one-line status: "healthy", "degraded", or "unhealthy".
|
||||
Summary string `json:"summary"`
|
||||
|
||||
// Issues is a list of detected problems (empty if healthy).
|
||||
Issues []DiagnosticIssue `json:"issues,omitempty"`
|
||||
|
||||
// RecentOperations are the last N operations for this project.
|
||||
RecentOperations []OperationSummary `json:"recent_operations,omitempty"`
|
||||
|
||||
// CI contains CI/CD pipeline status.
|
||||
CI *CIDiagnostics `json:"ci,omitempty"`
|
||||
|
||||
// Registry contains container registry health.
|
||||
Registry *RegistryStatus `json:"registry,omitempty"`
|
||||
}
|
||||
|
||||
// DiagnosticIssue represents a detected problem.
|
||||
type DiagnosticIssue struct {
|
||||
// Severity: "error", "warning", "info"
|
||||
Severity string `json:"severity"`
|
||||
|
||||
// Source: "operation", "ci", "registry"
|
||||
Source string `json:"source"`
|
||||
|
||||
// Message is a human-readable description.
|
||||
Message string `json:"message"`
|
||||
|
||||
// Details provides additional context (e.g., error output).
|
||||
Details string `json:"details,omitempty"`
|
||||
|
||||
// Timestamp is when the issue occurred.
|
||||
Timestamp time.Time `json:"timestamp,omitempty"`
|
||||
}
|
||||
|
||||
// OperationSummary is a condensed view of an operation for diagnostics.
|
||||
type OperationSummary struct {
|
||||
ID string `json:"id"`
|
||||
Type OperationType `json:"type"`
|
||||
Status OperationStatus `json:"status"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
DurationMs int64 `json:"duration_ms,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
ExternalRef string `json:"external_ref,omitempty"`
|
||||
}
|
||||
|
||||
// CIDiagnostics contains CI pipeline health information.
|
||||
type CIDiagnostics struct {
|
||||
// Available indicates if CI is configured for this project.
|
||||
Available bool `json:"available"`
|
||||
|
||||
// RecentPipelines are the last N pipeline executions.
|
||||
RecentPipelines []CIPipelineSummary `json:"recent_pipelines,omitempty"`
|
||||
|
||||
// LastFailure contains details of the most recent failed pipeline.
|
||||
LastFailure *CIPipelineFailure `json:"last_failure,omitempty"`
|
||||
}
|
||||
|
||||
// CIPipelineSummary is a condensed view of a pipeline for diagnostics.
|
||||
type CIPipelineSummary struct {
|
||||
Number int64 `json:"number"`
|
||||
Status string `json:"status"`
|
||||
Branch string `json:"branch"`
|
||||
Commit string `json:"commit"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
Duration string `json:"duration,omitempty"`
|
||||
}
|
||||
|
||||
// CIPipelineFailure contains details about a failed pipeline.
|
||||
type CIPipelineFailure struct {
|
||||
Number int64 `json:"number"`
|
||||
FailedStep string `json:"failed_step"`
|
||||
Error string `json:"error,omitempty"`
|
||||
LogTail string `json:"log_tail,omitempty"`
|
||||
URL string `json:"url,omitempty"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
}
|
||||
|
||||
// DiagnosticsSummary constants.
|
||||
const (
|
||||
DiagnosticsSummaryHealthy = "healthy"
|
||||
DiagnosticsSummaryDegraded = "degraded"
|
||||
DiagnosticsSummaryUnhealthy = "unhealthy"
|
||||
)
|
||||
|
||||
// DiagnosticSeverity constants.
|
||||
const (
|
||||
DiagnosticSeverityError = "error"
|
||||
DiagnosticSeverityWarning = "warning"
|
||||
DiagnosticSeverityInfo = "info"
|
||||
)
|
||||
|
||||
// DiagnosticSource constants.
|
||||
const (
|
||||
DiagnosticSourceOperation = "operation"
|
||||
DiagnosticSourceCI = "ci"
|
||||
DiagnosticSourceRegistry = "registry"
|
||||
)
|
||||
23
internal/domain/external_health.go
Normal file
23
internal/domain/external_health.go
Normal file
@ -0,0 +1,23 @@
|
||||
package domain
|
||||
|
||||
import "time"
|
||||
|
||||
// ExternalSystem identifies an external dependency.
|
||||
type ExternalSystem string
|
||||
|
||||
const (
|
||||
ExternalSystemRegistry ExternalSystem = "registry"
|
||||
ExternalSystemCI ExternalSystem = "ci"
|
||||
ExternalSystemGit ExternalSystem = "git"
|
||||
)
|
||||
|
||||
// ExternalSystemStatus represents the health of an external system.
|
||||
type ExternalSystemStatus struct {
|
||||
System ExternalSystem `json:"system"`
|
||||
Healthy bool `json:"healthy"`
|
||||
URL string `json:"url"`
|
||||
Latency time.Duration `json:"latency"`
|
||||
Error string `json:"error,omitempty"`
|
||||
LastChecked time.Time `json:"last_checked"`
|
||||
LastHealthy time.Time `json:"last_healthy,omitempty"`
|
||||
}
|
||||
87
internal/handlers/diagnostics.go
Normal file
87
internal/handlers/diagnostics.go
Normal file
@ -0,0 +1,87 @@
|
||||
// Package handlers provides HTTP handlers for the rdev API.
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/orchard9/rdev/internal/auth"
|
||||
"github.com/orchard9/rdev/internal/domain"
|
||||
"github.com/orchard9/rdev/internal/port"
|
||||
"github.com/orchard9/rdev/pkg/api"
|
||||
)
|
||||
|
||||
// DiagnosticsGetter retrieves project diagnostics.
|
||||
type DiagnosticsGetter interface {
|
||||
GetDiagnostics(ctx context.Context, projectID string) (*domain.ProjectDiagnostics, error)
|
||||
}
|
||||
|
||||
// DiagnosticsHandler handles project diagnostics requests.
|
||||
type DiagnosticsHandler struct {
|
||||
diagnostics DiagnosticsGetter
|
||||
projects port.ProjectRepository
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// NewDiagnosticsHandler creates a new diagnostics handler.
|
||||
func NewDiagnosticsHandler(
|
||||
diagnostics DiagnosticsGetter,
|
||||
projects port.ProjectRepository,
|
||||
logger *slog.Logger,
|
||||
) *DiagnosticsHandler {
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
return &DiagnosticsHandler{
|
||||
diagnostics: diagnostics,
|
||||
projects: projects,
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
|
||||
// Mount registers the diagnostics routes.
|
||||
func (h *DiagnosticsHandler) Mount(r api.Router) {
|
||||
r.Route("/projects/{projectId}/diagnostics", func(r chi.Router) {
|
||||
r.With(auth.RequireScope(auth.ScopeProjectsRead, auth.ScopeAdmin)).Get("/", h.GetDiagnostics)
|
||||
})
|
||||
}
|
||||
|
||||
// GetDiagnostics returns comprehensive health information for a project.
|
||||
// GET /projects/{projectId}/diagnostics
|
||||
func (h *DiagnosticsHandler) GetDiagnostics(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), TimeoutStandard)
|
||||
defer cancel()
|
||||
|
||||
projectID := chi.URLParam(r, "projectId")
|
||||
if projectID == "" {
|
||||
api.WriteBadRequest(w, r, "project ID is required")
|
||||
return
|
||||
}
|
||||
|
||||
// Verify project exists (optional - diagnostics can still be useful for non-k8s projects)
|
||||
if h.projects != nil {
|
||||
if _, err := h.projects.Get(ctx, domain.ProjectID(projectID)); err != nil {
|
||||
if err == domain.ErrProjectNotFound {
|
||||
// Log but continue - the project might exist in git/CI but not as a k8s pod
|
||||
h.logger.Debug("project not found in k8s, continuing with diagnostics",
|
||||
"project_id", projectID,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
diag, err := h.diagnostics.GetDiagnostics(ctx, projectID)
|
||||
if err != nil {
|
||||
h.logger.Error("failed to get diagnostics",
|
||||
"error", err,
|
||||
"project_id", projectID,
|
||||
)
|
||||
api.WriteInternalError(w, r, "failed to retrieve diagnostics")
|
||||
return
|
||||
}
|
||||
|
||||
api.WriteSuccess(w, r, diag)
|
||||
}
|
||||
148
internal/handlers/diagnostics_test.go
Normal file
148
internal/handlers/diagnostics_test.go
Normal file
@ -0,0 +1,148 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/orchard9/rdev/internal/domain"
|
||||
)
|
||||
|
||||
// mockDiagnosticsGetter implements DiagnosticsGetter for testing.
|
||||
type mockDiagnosticsGetter struct {
|
||||
diagnostics *domain.ProjectDiagnostics
|
||||
err error
|
||||
}
|
||||
|
||||
func (m *mockDiagnosticsGetter) GetDiagnostics(_ context.Context, projectID string) (*domain.ProjectDiagnostics, error) {
|
||||
if m.err != nil {
|
||||
return nil, m.err
|
||||
}
|
||||
if m.diagnostics != nil {
|
||||
return m.diagnostics, nil
|
||||
}
|
||||
// Return default healthy diagnostics
|
||||
return &domain.ProjectDiagnostics{
|
||||
ProjectID: projectID,
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Summary: domain.DiagnosticsSummaryHealthy,
|
||||
Issues: []domain.DiagnosticIssue{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func TestDiagnosticsHandler_GetDiagnostics_Success(t *testing.T) {
|
||||
getter := &mockDiagnosticsGetter{}
|
||||
projects := newMockProjectRepo()
|
||||
h := NewDiagnosticsHandler(getter, projects, nil)
|
||||
|
||||
// Create router with chi to handle URL params
|
||||
r := chi.NewRouter()
|
||||
r.Use(testAdminAuth) // Add auth context for tests
|
||||
h.Mount(r)
|
||||
|
||||
req := httptest.NewRequest("GET", "/projects/test-project/diagnostics/", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Errorf("GetDiagnostics() status = %d, want %d; body = %s", rec.Code, http.StatusOK, rec.Body.String())
|
||||
}
|
||||
|
||||
var resp map[string]any
|
||||
if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
|
||||
t.Fatalf("failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
data, ok := resp["data"].(map[string]any)
|
||||
if !ok {
|
||||
t.Fatalf("response missing data field")
|
||||
}
|
||||
|
||||
if data["project_id"] != "test-project" {
|
||||
t.Errorf("project_id = %q, want %q", data["project_id"], "test-project")
|
||||
}
|
||||
if data["summary"] != domain.DiagnosticsSummaryHealthy {
|
||||
t.Errorf("summary = %q, want %q", data["summary"], domain.DiagnosticsSummaryHealthy)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiagnosticsHandler_GetDiagnostics_WithIssues(t *testing.T) {
|
||||
getter := &mockDiagnosticsGetter{
|
||||
diagnostics: &domain.ProjectDiagnostics{
|
||||
ProjectID: "unhealthy-project",
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Summary: domain.DiagnosticsSummaryUnhealthy,
|
||||
Issues: []domain.DiagnosticIssue{
|
||||
{
|
||||
Severity: domain.DiagnosticSeverityError,
|
||||
Source: domain.DiagnosticSourceCI,
|
||||
Message: "CI build #42 failed",
|
||||
},
|
||||
{
|
||||
Severity: domain.DiagnosticSeverityWarning,
|
||||
Source: domain.DiagnosticSourceRegistry,
|
||||
Message: "Container registry slow",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
projects := newMockProjectRepo()
|
||||
h := NewDiagnosticsHandler(getter, projects, nil)
|
||||
|
||||
r := chi.NewRouter()
|
||||
r.Use(testAdminAuth) // Add auth context for tests
|
||||
h.Mount(r)
|
||||
|
||||
req := httptest.NewRequest("GET", "/projects/unhealthy-project/diagnostics/", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
r.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Errorf("GetDiagnostics() status = %d, want %d", rec.Code, http.StatusOK)
|
||||
}
|
||||
|
||||
var resp map[string]any
|
||||
if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
|
||||
t.Fatalf("failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
data, ok := resp["data"].(map[string]any)
|
||||
if !ok {
|
||||
t.Fatalf("response missing data field")
|
||||
}
|
||||
|
||||
if data["summary"] != domain.DiagnosticsSummaryUnhealthy {
|
||||
t.Errorf("summary = %q, want %q", data["summary"], domain.DiagnosticsSummaryUnhealthy)
|
||||
}
|
||||
|
||||
issues, ok := data["issues"].([]any)
|
||||
if !ok {
|
||||
t.Fatalf("response missing issues field")
|
||||
}
|
||||
if len(issues) != 2 {
|
||||
t.Errorf("issues count = %d, want %d", len(issues), 2)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiagnosticsHandler_GetDiagnostics_MissingProjectID(t *testing.T) {
|
||||
getter := &mockDiagnosticsGetter{}
|
||||
projects := newMockProjectRepo()
|
||||
h := NewDiagnosticsHandler(getter, projects, nil)
|
||||
|
||||
// Direct call without chi router to test missing projectId
|
||||
req := httptest.NewRequest("GET", "/projects//diagnostics/", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.GetDiagnostics(rec, req)
|
||||
|
||||
if rec.Code != http.StatusBadRequest {
|
||||
t.Errorf("GetDiagnostics() status = %d, want %d", rec.Code, http.StatusBadRequest)
|
||||
}
|
||||
}
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/orchard9/rdev/internal/domain"
|
||||
"github.com/orchard9/rdev/internal/metrics"
|
||||
"github.com/orchard9/rdev/internal/port"
|
||||
"github.com/orchard9/rdev/pkg/api"
|
||||
@ -19,6 +20,11 @@ type ExecutorHealthChecker interface {
|
||||
WorkerID() string
|
||||
}
|
||||
|
||||
// ExternalHealthStatusProvider provides cached external system health statuses.
|
||||
type ExternalHealthStatusProvider interface {
|
||||
GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus
|
||||
}
|
||||
|
||||
// HealthHandler handles health and readiness checks.
|
||||
type HealthHandler struct {
|
||||
serviceName string
|
||||
@ -27,6 +33,7 @@ type HealthHandler struct {
|
||||
agentRegistry port.CodeAgentRegistry
|
||||
workExecutor ExecutorHealthChecker
|
||||
registryChecker port.RegistryChecker
|
||||
externalChecker ExternalHealthStatusProvider
|
||||
}
|
||||
|
||||
// NewHealthHandler creates a new health handler with dependencies.
|
||||
@ -56,6 +63,12 @@ func (h *HealthHandler) WithRegistryChecker(checker port.RegistryChecker) *Healt
|
||||
return h
|
||||
}
|
||||
|
||||
// WithExternalHealthChecker adds a cached external health checker for monitoring.
|
||||
func (h *HealthHandler) WithExternalHealthChecker(checker ExternalHealthStatusProvider) *HealthHandler {
|
||||
h.externalChecker = checker
|
||||
return h
|
||||
}
|
||||
|
||||
// Health returns a simple liveness check.
|
||||
// This should be lightweight and only fail if the process is unhealthy.
|
||||
// GET /health
|
||||
@ -113,6 +126,26 @@ func (h *HealthHandler) Ready(w http.ResponseWriter, r *http.Request) {
|
||||
checks["registry"] = h.checkRegistry(ctx)
|
||||
}
|
||||
|
||||
// External system checks (cached, from background worker)
|
||||
if h.externalChecker != nil {
|
||||
for system, status := range h.externalChecker.GetAllStatuses() {
|
||||
checks["external:"+string(system)] = CheckResult{
|
||||
Healthy: status.Healthy,
|
||||
Message: status.Error,
|
||||
Latency: status.Latency.String(),
|
||||
LastCheck: status.LastChecked,
|
||||
}
|
||||
if status.Healthy {
|
||||
checks["external:"+string(system)] = CheckResult{
|
||||
Healthy: true,
|
||||
Message: "connected",
|
||||
Latency: status.Latency.String(),
|
||||
LastCheck: status.LastChecked,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
response := ReadinessResponse{
|
||||
Status: "ready",
|
||||
Service: h.serviceName,
|
||||
|
||||
@ -2,6 +2,7 @@ package handlers
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
@ -10,11 +11,22 @@ import (
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
"github.com/orchard9/rdev/internal/adapter/kubernetes"
|
||||
"github.com/orchard9/rdev/internal/domain"
|
||||
)
|
||||
|
||||
// newTestProjectsHandler creates a ProjectsHandler for testing.
|
||||
// It registers a test project "test-project" for use in tests.
|
||||
func newTestProjectsHandler() *ProjectsHandler {
|
||||
repo := kubernetes.NewProjectRepository("test-namespace")
|
||||
// Register a test project for tests to use
|
||||
_ = repo.Register(context.Background(), &domain.Project{
|
||||
ID: "test-project",
|
||||
Name: "Test Project",
|
||||
Description: "Test project for unit tests",
|
||||
PodName: "test-project-pod-0",
|
||||
Status: domain.ProjectStatusRunning,
|
||||
Workspace: "/workspace",
|
||||
})
|
||||
exec := kubernetes.NewExecutor("test-namespace")
|
||||
return NewProjectsHandler(repo, exec)
|
||||
}
|
||||
@ -61,7 +73,7 @@ func TestProjectsHandler_Get(t *testing.T) {
|
||||
projectID string
|
||||
wantStatus int
|
||||
}{
|
||||
{"existing project", "pantheon", http.StatusOK},
|
||||
{"existing project", "test-project", http.StatusOK},
|
||||
{"non-existent project", "nonexistent", http.StatusNotFound},
|
||||
}
|
||||
|
||||
@ -95,7 +107,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
|
||||
}{
|
||||
{
|
||||
name: "valid request",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ClaudeRequest{
|
||||
Prompt: "Hello, world!",
|
||||
},
|
||||
@ -103,7 +115,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "missing prompt",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ClaudeRequest{
|
||||
Prompt: "",
|
||||
},
|
||||
@ -118,7 +130,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "null byte in prompt",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ClaudeRequest{
|
||||
Prompt: "Hello\x00World",
|
||||
},
|
||||
@ -127,7 +139,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "invalid stream ID",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ClaudeRequest{
|
||||
Prompt: "Hello",
|
||||
StreamID: "invalid stream id with spaces",
|
||||
@ -175,7 +187,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
||||
}{
|
||||
{
|
||||
name: "valid command",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ShellRequest{
|
||||
Command: "ls -la",
|
||||
},
|
||||
@ -183,7 +195,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "missing command",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ShellRequest{
|
||||
Command: "",
|
||||
},
|
||||
@ -192,7 +204,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "dangerous command with semicolon",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ShellRequest{
|
||||
Command: "ls; rm -rf /",
|
||||
},
|
||||
@ -201,7 +213,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "dangerous command with pipe",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ShellRequest{
|
||||
Command: "cat /etc/passwd | grep root",
|
||||
},
|
||||
@ -210,7 +222,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "command substitution",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ShellRequest{
|
||||
Command: "echo $(whoami)",
|
||||
},
|
||||
@ -219,7 +231,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "redirect",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ShellRequest{
|
||||
Command: "ls > /tmp/out.txt",
|
||||
},
|
||||
@ -228,7 +240,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "rm rf root",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: ShellRequest{
|
||||
Command: "rm -rf /",
|
||||
},
|
||||
@ -281,7 +293,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
||||
}{
|
||||
{
|
||||
name: "valid git status",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: GitRequest{
|
||||
Args: []string{"status"},
|
||||
},
|
||||
@ -289,7 +301,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "valid git log",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: GitRequest{
|
||||
Args: []string{"log", "--oneline", "-10"},
|
||||
},
|
||||
@ -297,7 +309,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "missing args",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: GitRequest{
|
||||
Args: []string{},
|
||||
},
|
||||
@ -306,7 +318,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "git config blocked",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: GitRequest{
|
||||
Args: []string{"config", "--global", "user.name", "attacker"},
|
||||
},
|
||||
@ -315,7 +327,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "git remote blocked",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: GitRequest{
|
||||
Args: []string{"remote", "add", "evil", "https://evil.com/repo"},
|
||||
},
|
||||
@ -324,7 +336,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "force push blocked",
|
||||
projectID: "pantheon",
|
||||
projectID: "test-project",
|
||||
body: GitRequest{
|
||||
Args: []string{"push", "-f", "origin", "main"},
|
||||
},
|
||||
@ -394,9 +406,9 @@ func TestProjectsHandler_InvalidJSON(t *testing.T) {
|
||||
method string
|
||||
path string
|
||||
}{
|
||||
{"POST", "/projects/pantheon/claude"},
|
||||
{"POST", "/projects/pantheon/shell"},
|
||||
{"POST", "/projects/pantheon/git"},
|
||||
{"POST", "/projects/test-project/claude"},
|
||||
{"POST", "/projects/test-project/shell"},
|
||||
{"POST", "/projects/test-project/git"},
|
||||
}
|
||||
|
||||
for _, ep := range endpoints {
|
||||
@ -429,12 +441,12 @@ func TestCommandIDGeneration(t *testing.T) {
|
||||
body := ClaudeRequest{Prompt: "test"}
|
||||
bodyBytes, _ := json.Marshal(body)
|
||||
|
||||
req1 := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes))
|
||||
req1 := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes))
|
||||
req1.Header.Set("Content-Type", "application/json")
|
||||
rec1 := httptest.NewRecorder()
|
||||
router.ServeHTTP(rec1, req1)
|
||||
|
||||
req2 := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes))
|
||||
req2 := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes))
|
||||
req2.Header.Set("Content-Type", "application/json")
|
||||
rec2 := httptest.NewRecorder()
|
||||
router.ServeHTTP(rec2, req2)
|
||||
@ -465,7 +477,7 @@ func TestCustomStreamID(t *testing.T) {
|
||||
}
|
||||
bodyBytes, _ := json.Marshal(body)
|
||||
|
||||
req := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes))
|
||||
req := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes))
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
rec := httptest.NewRecorder()
|
||||
router.ServeHTTP(rec, req)
|
||||
|
||||
@ -142,6 +142,22 @@ var (
|
||||
Name: "rdev_ci_push_failures_total",
|
||||
Help: "Total number of CI image push failures by project",
|
||||
}, []string{"project"})
|
||||
|
||||
// External system health
|
||||
externalSystemHealthy = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Name: "rdev_external_system_healthy",
|
||||
Help: "Whether external system is healthy (1) or not (0)",
|
||||
}, []string{"system"})
|
||||
|
||||
externalSystemLatency = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Name: "rdev_external_system_latency_seconds",
|
||||
Help: "Latency of external system health check in seconds",
|
||||
}, []string{"system"})
|
||||
|
||||
externalSystemLastCheck = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Name: "rdev_external_system_last_check_timestamp",
|
||||
Help: "Unix timestamp of last health check",
|
||||
}, []string{"system"})
|
||||
)
|
||||
|
||||
// RecordCommand records a command execution.
|
||||
@ -248,6 +264,17 @@ func RecordCIPushFailure(project string) {
|
||||
ciPushFailures.WithLabelValues(project).Inc()
|
||||
}
|
||||
|
||||
// SetExternalSystemHealth updates the health metrics for an external system.
|
||||
func SetExternalSystemHealth(system string, healthy bool, latencySeconds float64) {
|
||||
val := 0.0
|
||||
if healthy {
|
||||
val = 1.0
|
||||
}
|
||||
externalSystemHealthy.WithLabelValues(system).Set(val)
|
||||
externalSystemLatency.WithLabelValues(system).Set(latencySeconds)
|
||||
externalSystemLastCheck.WithLabelValues(system).Set(float64(time.Now().Unix()))
|
||||
}
|
||||
|
||||
// Handler returns the Prometheus HTTP handler.
|
||||
func Handler() http.Handler {
|
||||
return promhttp.Handler()
|
||||
|
||||
@ -23,3 +23,9 @@ type RegistryChecker interface {
|
||||
// Check returns the health status of the registry.
|
||||
Check(ctx context.Context) domain.RegistryStatus
|
||||
}
|
||||
|
||||
// ExternalHealthChecker checks an external system's health.
|
||||
type ExternalHealthChecker interface {
|
||||
// Check returns the health status of the external system.
|
||||
Check(ctx context.Context) domain.ExternalSystemStatus
|
||||
}
|
||||
|
||||
295
internal/service/diagnostics_service.go
Normal file
295
internal/service/diagnostics_service.go
Normal file
@ -0,0 +1,295 @@
|
||||
// Package service provides business logic services.
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/orchard9/rdev/internal/domain"
|
||||
"github.com/orchard9/rdev/internal/port"
|
||||
)
|
||||
|
||||
// DiagnosticsServiceConfig configures the diagnostics service.
|
||||
type DiagnosticsServiceConfig struct {
|
||||
// DefaultGitOwner is the git organization for CI lookups.
|
||||
DefaultGitOwner string
|
||||
|
||||
// MaxRecentOperations is how many operations to include.
|
||||
MaxRecentOperations int
|
||||
|
||||
// MaxRecentPipelines is how many pipelines to include.
|
||||
MaxRecentPipelines int
|
||||
|
||||
Logger *slog.Logger
|
||||
}
|
||||
|
||||
// DiagnosticsService aggregates project health information from multiple sources.
|
||||
type DiagnosticsService struct {
|
||||
operationRepo port.OperationRepository
|
||||
registryChecker port.RegistryChecker
|
||||
ciProvider port.CIProvider
|
||||
|
||||
defaultGitOwner string
|
||||
maxRecentOperations int
|
||||
maxRecentPipelines int
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// NewDiagnosticsService creates a new diagnostics service.
|
||||
func NewDiagnosticsService(
|
||||
operationRepo port.OperationRepository,
|
||||
registryChecker port.RegistryChecker,
|
||||
ciProvider port.CIProvider,
|
||||
cfg DiagnosticsServiceConfig,
|
||||
) *DiagnosticsService {
|
||||
logger := cfg.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
maxOps := cfg.MaxRecentOperations
|
||||
if maxOps <= 0 {
|
||||
maxOps = 10
|
||||
}
|
||||
|
||||
maxPipelines := cfg.MaxRecentPipelines
|
||||
if maxPipelines <= 0 {
|
||||
maxPipelines = 5
|
||||
}
|
||||
|
||||
return &DiagnosticsService{
|
||||
operationRepo: operationRepo,
|
||||
registryChecker: registryChecker,
|
||||
ciProvider: ciProvider,
|
||||
defaultGitOwner: cfg.DefaultGitOwner,
|
||||
maxRecentOperations: maxOps,
|
||||
maxRecentPipelines: maxPipelines,
|
||||
logger: logger.With("service", "diagnostics"),
|
||||
}
|
||||
}
|
||||
|
||||
// GetDiagnostics returns comprehensive health information for a project.
|
||||
func (s *DiagnosticsService) GetDiagnostics(ctx context.Context, projectID string) (*domain.ProjectDiagnostics, error) {
|
||||
diag := &domain.ProjectDiagnostics{
|
||||
ProjectID: projectID,
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Summary: domain.DiagnosticsSummaryHealthy,
|
||||
Issues: []domain.DiagnosticIssue{},
|
||||
}
|
||||
|
||||
// Collect data from each source (don't fail if one source fails)
|
||||
s.collectOperations(ctx, projectID, diag)
|
||||
s.collectRegistryHealth(ctx, diag)
|
||||
s.collectCIStatus(ctx, projectID, diag)
|
||||
|
||||
// Determine overall summary
|
||||
s.calculateSummary(diag)
|
||||
|
||||
return diag, nil
|
||||
}
|
||||
|
||||
// collectOperations fetches recent operations and extracts issues.
|
||||
func (s *DiagnosticsService) collectOperations(ctx context.Context, projectID string, diag *domain.ProjectDiagnostics) {
|
||||
filter := domain.OperationFilters{
|
||||
ProjectID: projectID,
|
||||
Limit: s.maxRecentOperations,
|
||||
}
|
||||
|
||||
ops, err := s.operationRepo.List(ctx, filter)
|
||||
if err != nil {
|
||||
s.logger.Warn("failed to fetch operations for diagnostics",
|
||||
"error", err,
|
||||
"project_id", projectID,
|
||||
)
|
||||
diag.Issues = append(diag.Issues, domain.DiagnosticIssue{
|
||||
Severity: domain.DiagnosticSeverityWarning,
|
||||
Source: domain.DiagnosticSourceOperation,
|
||||
Message: "Unable to fetch operation history",
|
||||
Details: err.Error(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Convert to summaries
|
||||
for _, op := range ops {
|
||||
summary := domain.OperationSummary{
|
||||
ID: op.ID,
|
||||
Type: op.Type,
|
||||
Status: op.Status,
|
||||
StartedAt: op.StartedAt,
|
||||
DurationMs: op.DurationMs,
|
||||
Error: op.Error,
|
||||
ExternalRef: op.ExternalRef,
|
||||
}
|
||||
diag.RecentOperations = append(diag.RecentOperations, summary)
|
||||
|
||||
// Extract issues from failed operations
|
||||
if op.Status == domain.OperationStatusFailed {
|
||||
issue := domain.DiagnosticIssue{
|
||||
Severity: domain.DiagnosticSeverityError,
|
||||
Source: domain.DiagnosticSourceOperation,
|
||||
Message: fmt.Sprintf("%s operation failed", op.Type),
|
||||
Timestamp: op.StartedAt,
|
||||
}
|
||||
if op.Error != "" {
|
||||
issue.Details = op.Error
|
||||
}
|
||||
if op.ExternalRef != "" {
|
||||
issue.Message += fmt.Sprintf(" (%s)", op.ExternalRef)
|
||||
}
|
||||
diag.Issues = append(diag.Issues, issue)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// collectRegistryHealth checks registry status.
|
||||
func (s *DiagnosticsService) collectRegistryHealth(ctx context.Context, diag *domain.ProjectDiagnostics) {
|
||||
if s.registryChecker == nil {
|
||||
return
|
||||
}
|
||||
|
||||
status := s.registryChecker.Check(ctx)
|
||||
diag.Registry = &status
|
||||
|
||||
if !status.Healthy {
|
||||
diag.Issues = append(diag.Issues, domain.DiagnosticIssue{
|
||||
Severity: domain.DiagnosticSeverityError,
|
||||
Source: domain.DiagnosticSourceRegistry,
|
||||
Message: "Container registry unhealthy",
|
||||
Details: status.Error,
|
||||
Timestamp: status.LastChecked,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// collectCIStatus fetches CI pipeline information.
|
||||
func (s *DiagnosticsService) collectCIStatus(ctx context.Context, projectID string, diag *domain.ProjectDiagnostics) {
|
||||
if s.ciProvider == nil {
|
||||
diag.CI = &domain.CIDiagnostics{Available: false}
|
||||
return
|
||||
}
|
||||
|
||||
owner := s.defaultGitOwner
|
||||
if owner == "" {
|
||||
owner = "jordan" // fallback
|
||||
}
|
||||
|
||||
ciDiag := &domain.CIDiagnostics{Available: true}
|
||||
|
||||
pipelines, err := s.ciProvider.ListPipelines(ctx, owner, projectID)
|
||||
if err != nil {
|
||||
s.logger.Warn("failed to fetch pipelines for diagnostics",
|
||||
"error", err,
|
||||
"project_id", projectID,
|
||||
)
|
||||
ciDiag.Available = false
|
||||
diag.CI = ciDiag
|
||||
return
|
||||
}
|
||||
|
||||
// Convert to summaries and find failures
|
||||
var lastFailure *domain.CIPipeline
|
||||
for i, p := range pipelines {
|
||||
if i >= s.maxRecentPipelines {
|
||||
break
|
||||
}
|
||||
|
||||
summary := domain.CIPipelineSummary{
|
||||
Number: p.Number,
|
||||
Status: p.Status,
|
||||
Branch: p.Branch,
|
||||
Commit: p.Commit,
|
||||
StartedAt: p.Started,
|
||||
}
|
||||
if p.Finished.After(p.Started) {
|
||||
summary.Duration = p.Finished.Sub(p.Started).Round(time.Second).String()
|
||||
}
|
||||
ciDiag.RecentPipelines = append(ciDiag.RecentPipelines, summary)
|
||||
|
||||
// Track the most recent failure
|
||||
if p.Status == "failure" && lastFailure == nil {
|
||||
lastFailure = p
|
||||
}
|
||||
}
|
||||
|
||||
// Get details on the last failure
|
||||
if lastFailure != nil {
|
||||
failure := s.getFailureDetails(ctx, owner, projectID, lastFailure)
|
||||
ciDiag.LastFailure = failure
|
||||
|
||||
// Add as issue
|
||||
issue := domain.DiagnosticIssue{
|
||||
Severity: domain.DiagnosticSeverityError,
|
||||
Source: domain.DiagnosticSourceCI,
|
||||
Message: fmt.Sprintf("CI build #%d failed", lastFailure.Number),
|
||||
Timestamp: lastFailure.Finished,
|
||||
}
|
||||
if failure != nil && failure.FailedStep != "" {
|
||||
issue.Message += fmt.Sprintf(" at step '%s'", failure.FailedStep)
|
||||
if failure.Error != "" {
|
||||
issue.Details = failure.Error
|
||||
}
|
||||
}
|
||||
diag.Issues = append(diag.Issues, issue)
|
||||
}
|
||||
|
||||
diag.CI = ciDiag
|
||||
}
|
||||
|
||||
// getFailureDetails fetches step-level details for a failed pipeline.
|
||||
func (s *DiagnosticsService) getFailureDetails(ctx context.Context, owner, repo string, pipeline *domain.CIPipeline) *domain.CIPipelineFailure {
|
||||
failure := &domain.CIPipelineFailure{
|
||||
Number: pipeline.Number,
|
||||
Timestamp: pipeline.Finished,
|
||||
}
|
||||
|
||||
steps, err := s.ciProvider.GetPipelineSteps(ctx, owner, repo, pipeline.Number)
|
||||
if err != nil {
|
||||
s.logger.Warn("failed to fetch pipeline steps",
|
||||
"error", err,
|
||||
"pipeline", pipeline.Number,
|
||||
)
|
||||
return failure
|
||||
}
|
||||
|
||||
failure.URL = steps.URL
|
||||
|
||||
// Find the failed step
|
||||
for _, step := range steps.Steps {
|
||||
if step.Status == "failure" || step.Status == "error" {
|
||||
failure.FailedStep = step.Name
|
||||
failure.Error = step.Error
|
||||
if step.Log != "" {
|
||||
failure.LogTail = step.Log
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return failure
|
||||
}
|
||||
|
||||
// calculateSummary determines the overall health status.
|
||||
func (s *DiagnosticsService) calculateSummary(diag *domain.ProjectDiagnostics) {
|
||||
errorCount := 0
|
||||
warningCount := 0
|
||||
|
||||
for _, issue := range diag.Issues {
|
||||
switch issue.Severity {
|
||||
case domain.DiagnosticSeverityError:
|
||||
errorCount++
|
||||
case domain.DiagnosticSeverityWarning:
|
||||
warningCount++
|
||||
}
|
||||
}
|
||||
|
||||
if errorCount > 0 {
|
||||
diag.Summary = domain.DiagnosticsSummaryUnhealthy
|
||||
} else if warningCount > 0 {
|
||||
diag.Summary = domain.DiagnosticsSummaryDegraded
|
||||
} else {
|
||||
diag.Summary = domain.DiagnosticsSummaryHealthy
|
||||
}
|
||||
}
|
||||
302
internal/service/diagnostics_service_test.go
Normal file
302
internal/service/diagnostics_service_test.go
Normal file
@ -0,0 +1,302 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/orchard9/rdev/internal/domain"
|
||||
)
|
||||
|
||||
// mockOperationRepo implements port.OperationRepository for testing.
|
||||
type mockOperationRepo struct {
|
||||
operations []*domain.Operation
|
||||
err error
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) Create(_ context.Context, _ *domain.Operation) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) Update(_ context.Context, _ *domain.Operation) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) Get(_ context.Context, _ string) (*domain.Operation, error) {
|
||||
return nil, domain.ErrOperationNotFound
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) GetByCommitSHA(_ context.Context, _, _ string) (*domain.Operation, error) {
|
||||
return nil, domain.ErrOperationNotFound
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) List(_ context.Context, filter domain.OperationFilters) ([]*domain.Operation, error) {
|
||||
if m.err != nil {
|
||||
return nil, m.err
|
||||
}
|
||||
var result []*domain.Operation
|
||||
for _, op := range m.operations {
|
||||
if filter.ProjectID != "" && op.ProjectID != filter.ProjectID {
|
||||
continue
|
||||
}
|
||||
result = append(result, op)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) AddStep(_ context.Context, _ string, _ domain.OperationStep) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) UpdateStep(_ context.Context, _ string, _ domain.OperationStep) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) Complete(_ context.Context, _ string, _ domain.OperationStatus, _ map[string]any, _, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) SetCommitSHA(_ context.Context, _, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) SetTriggeredBy(_ context.Context, _, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockOperationRepo) DeleteOlderThan(_ context.Context, _ time.Time) (int64, error) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// mockRegistryChecker implements port.RegistryChecker for testing.
|
||||
type mockRegistryChecker struct {
|
||||
status domain.RegistryStatus
|
||||
}
|
||||
|
||||
func (m *mockRegistryChecker) Check(_ context.Context) domain.RegistryStatus {
|
||||
return m.status
|
||||
}
|
||||
|
||||
// mockCIProvider implements port.CIProvider for testing.
|
||||
type mockCIProvider struct {
|
||||
pipelines []*domain.CIPipeline
|
||||
steps *domain.CIPipelineSteps
|
||||
err error
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) ActivateRepo(_ context.Context, _, _, _ string) (*domain.CIRepo, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) DeactivateRepo(_ context.Context, _, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) GetRepo(_ context.Context, _, _ string) (*domain.CIRepo, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) ListRepos(_ context.Context) ([]*domain.CIRepo, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) AddSecret(_ context.Context, _, _ string, _ domain.CISecret) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) DeleteSecret(_ context.Context, _, _, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) ListPipelines(_ context.Context, _, _ string) ([]*domain.CIPipeline, error) {
|
||||
if m.err != nil {
|
||||
return nil, m.err
|
||||
}
|
||||
return m.pipelines, nil
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) GetPipeline(_ context.Context, _, _ string, _ int64) (*domain.CIPipeline, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) GetPipelineSteps(_ context.Context, _, _ string, _ int64) (*domain.CIPipelineSteps, error) {
|
||||
if m.steps != nil {
|
||||
return m.steps, nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (m *mockCIProvider) TriggerBuild(_ context.Context, _, _, _ string) (int64, error) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
func TestDiagnosticsService_GetDiagnostics_Healthy(t *testing.T) {
|
||||
opRepo := &mockOperationRepo{
|
||||
operations: []*domain.Operation{
|
||||
{
|
||||
ID: "op-1",
|
||||
ProjectID: "test-project",
|
||||
Type: domain.OperationTypeBuild,
|
||||
Status: domain.OperationStatusCompleted,
|
||||
StartedAt: time.Now().Add(-1 * time.Hour),
|
||||
},
|
||||
},
|
||||
}
|
||||
registry := &mockRegistryChecker{
|
||||
status: domain.RegistryStatus{
|
||||
Healthy: true,
|
||||
URL: "https://registry.example.com",
|
||||
Latency: "10ms",
|
||||
LastChecked: time.Now(),
|
||||
},
|
||||
}
|
||||
ci := &mockCIProvider{
|
||||
pipelines: []*domain.CIPipeline{
|
||||
{
|
||||
Number: 42,
|
||||
Status: "success",
|
||||
Branch: "main",
|
||||
Commit: "abc123",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
svc := NewDiagnosticsService(opRepo, registry, ci, DiagnosticsServiceConfig{
|
||||
DefaultGitOwner: "test-org",
|
||||
})
|
||||
|
||||
diag, err := svc.GetDiagnostics(context.Background(), "test-project")
|
||||
if err != nil {
|
||||
t.Fatalf("GetDiagnostics() error = %v", err)
|
||||
}
|
||||
|
||||
if diag.ProjectID != "test-project" {
|
||||
t.Errorf("ProjectID = %q, want %q", diag.ProjectID, "test-project")
|
||||
}
|
||||
if diag.Summary != domain.DiagnosticsSummaryHealthy {
|
||||
t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryHealthy)
|
||||
}
|
||||
if len(diag.Issues) != 0 {
|
||||
t.Errorf("Issues count = %d, want 0", len(diag.Issues))
|
||||
}
|
||||
if len(diag.RecentOperations) != 1 {
|
||||
t.Errorf("RecentOperations count = %d, want 1", len(diag.RecentOperations))
|
||||
}
|
||||
if diag.Registry == nil {
|
||||
t.Error("Registry is nil, want non-nil")
|
||||
}
|
||||
if diag.CI == nil || !diag.CI.Available {
|
||||
t.Error("CI not available")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiagnosticsService_GetDiagnostics_Unhealthy(t *testing.T) {
|
||||
opRepo := &mockOperationRepo{
|
||||
operations: []*domain.Operation{
|
||||
{
|
||||
ID: "op-1",
|
||||
ProjectID: "test-project",
|
||||
Type: domain.OperationTypeBuild,
|
||||
Status: domain.OperationStatusFailed,
|
||||
StartedAt: time.Now().Add(-1 * time.Hour),
|
||||
Error: "deployment failed",
|
||||
},
|
||||
},
|
||||
}
|
||||
registry := &mockRegistryChecker{
|
||||
status: domain.RegistryStatus{
|
||||
Healthy: false,
|
||||
URL: "https://registry.example.com",
|
||||
Error: "connection refused",
|
||||
LastChecked: time.Now(),
|
||||
},
|
||||
}
|
||||
ci := &mockCIProvider{
|
||||
pipelines: []*domain.CIPipeline{
|
||||
{
|
||||
Number: 43,
|
||||
Status: "failure",
|
||||
Branch: "main",
|
||||
Commit: "def456",
|
||||
Finished: time.Now().Add(-30 * time.Minute),
|
||||
},
|
||||
},
|
||||
steps: &domain.CIPipelineSteps{
|
||||
PipelineNumber: 43,
|
||||
URL: "https://ci.example.com/build/43",
|
||||
Steps: []domain.CIPipelineStep{
|
||||
{
|
||||
Name: "test",
|
||||
Status: "failure",
|
||||
Error: "tests failed",
|
||||
Log: "FAIL: TestSomething",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
svc := NewDiagnosticsService(opRepo, registry, ci, DiagnosticsServiceConfig{
|
||||
DefaultGitOwner: "test-org",
|
||||
})
|
||||
|
||||
diag, err := svc.GetDiagnostics(context.Background(), "test-project")
|
||||
if err != nil {
|
||||
t.Fatalf("GetDiagnostics() error = %v", err)
|
||||
}
|
||||
|
||||
if diag.Summary != domain.DiagnosticsSummaryUnhealthy {
|
||||
t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryUnhealthy)
|
||||
}
|
||||
|
||||
// Should have 3 issues: failed operation, unhealthy registry, failed CI
|
||||
if len(diag.Issues) < 3 {
|
||||
t.Errorf("Issues count = %d, want at least 3", len(diag.Issues))
|
||||
}
|
||||
|
||||
// Check CI failure details
|
||||
if diag.CI == nil || diag.CI.LastFailure == nil {
|
||||
t.Fatal("CI.LastFailure is nil")
|
||||
}
|
||||
if diag.CI.LastFailure.FailedStep != "test" {
|
||||
t.Errorf("FailedStep = %q, want %q", diag.CI.LastFailure.FailedStep, "test")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiagnosticsService_GetDiagnostics_Degraded(t *testing.T) {
|
||||
opRepo := &mockOperationRepo{
|
||||
operations: []*domain.Operation{
|
||||
{
|
||||
ID: "op-1",
|
||||
ProjectID: "test-project",
|
||||
Type: domain.OperationTypeBuild,
|
||||
Status: domain.OperationStatusCompleted,
|
||||
StartedAt: time.Now().Add(-1 * time.Hour),
|
||||
},
|
||||
},
|
||||
}
|
||||
registry := &mockRegistryChecker{
|
||||
status: domain.RegistryStatus{
|
||||
Healthy: true,
|
||||
URL: "https://registry.example.com",
|
||||
LastChecked: time.Now(),
|
||||
},
|
||||
}
|
||||
|
||||
// No CI provider - should produce a warning but not error
|
||||
svc := NewDiagnosticsService(opRepo, registry, nil, DiagnosticsServiceConfig{
|
||||
DefaultGitOwner: "test-org",
|
||||
})
|
||||
|
||||
diag, err := svc.GetDiagnostics(context.Background(), "test-project")
|
||||
if err != nil {
|
||||
t.Fatalf("GetDiagnostics() error = %v", err)
|
||||
}
|
||||
|
||||
// Without CI, status should still be healthy (CI unavailable is not an issue)
|
||||
if diag.Summary != domain.DiagnosticsSummaryHealthy {
|
||||
t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryHealthy)
|
||||
}
|
||||
if diag.CI == nil || diag.CI.Available {
|
||||
t.Error("CI should be not available when no provider")
|
||||
}
|
||||
}
|
||||
248
internal/worker/external_health.go
Normal file
248
internal/worker/external_health.go
Normal file
@ -0,0 +1,248 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/orchard9/rdev/internal/domain"
|
||||
"github.com/orchard9/rdev/internal/metrics"
|
||||
"github.com/orchard9/rdev/internal/port"
|
||||
)
|
||||
|
||||
// ExternalHealthChecker runs periodic health checks on external systems
|
||||
// (registry, CI, git) and caches the results for the /ready endpoint.
|
||||
type ExternalHealthChecker struct {
|
||||
registry port.RegistryChecker // zot
|
||||
ci port.ExternalHealthChecker // woodpecker
|
||||
git port.ExternalHealthChecker // gitea
|
||||
|
||||
interval time.Duration
|
||||
logger *slog.Logger
|
||||
|
||||
// Internal state (thread-safe)
|
||||
mu sync.RWMutex
|
||||
statuses map[domain.ExternalSystem]domain.ExternalSystemStatus
|
||||
|
||||
// Lifecycle
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// ExternalHealthConfig configures the health checker.
|
||||
type ExternalHealthConfig struct {
|
||||
// CheckInterval is how often to check external systems. Default: 30s.
|
||||
CheckInterval time.Duration
|
||||
Logger *slog.Logger
|
||||
}
|
||||
|
||||
// DefaultExternalHealthConfig returns sensible defaults.
|
||||
func DefaultExternalHealthConfig() ExternalHealthConfig {
|
||||
return ExternalHealthConfig{
|
||||
CheckInterval: 30 * time.Second,
|
||||
Logger: slog.Default(),
|
||||
}
|
||||
}
|
||||
|
||||
// NewExternalHealthChecker creates a new external health checker.
|
||||
// All checker parameters are optional (nil means skip that system).
|
||||
func NewExternalHealthChecker(
|
||||
registry port.RegistryChecker,
|
||||
ci port.ExternalHealthChecker,
|
||||
git port.ExternalHealthChecker,
|
||||
cfg ExternalHealthConfig,
|
||||
) *ExternalHealthChecker {
|
||||
if cfg.CheckInterval == 0 {
|
||||
cfg.CheckInterval = 30 * time.Second
|
||||
}
|
||||
if cfg.Logger == nil {
|
||||
cfg.Logger = slog.Default()
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
return &ExternalHealthChecker{
|
||||
registry: registry,
|
||||
ci: ci,
|
||||
git: git,
|
||||
interval: cfg.CheckInterval,
|
||||
logger: cfg.Logger.With("component", "external-health"),
|
||||
statuses: make(map[domain.ExternalSystem]domain.ExternalSystemStatus),
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
}
|
||||
}
|
||||
|
||||
// Start begins the background check loop.
|
||||
func (c *ExternalHealthChecker) Start() {
|
||||
c.logger.Info("external health checker started", "interval", c.interval)
|
||||
|
||||
c.wg.Add(1)
|
||||
go c.checkLoop()
|
||||
}
|
||||
|
||||
// Stop gracefully shuts down the checker.
|
||||
func (c *ExternalHealthChecker) Stop() {
|
||||
c.logger.Info("external health checker stopping")
|
||||
c.cancel()
|
||||
c.wg.Wait()
|
||||
c.logger.Info("external health checker stopped")
|
||||
}
|
||||
|
||||
// GetStatus returns the cached status for a specific system.
|
||||
func (c *ExternalHealthChecker) GetStatus(system domain.ExternalSystem) (domain.ExternalSystemStatus, bool) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
status, ok := c.statuses[system]
|
||||
return status, ok
|
||||
}
|
||||
|
||||
// GetAllStatuses returns a copy of all cached statuses.
|
||||
func (c *ExternalHealthChecker) GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
|
||||
result := make(map[domain.ExternalSystem]domain.ExternalSystemStatus, len(c.statuses))
|
||||
for k, v := range c.statuses {
|
||||
result[k] = v
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// checkLoop runs periodic health checks.
|
||||
func (c *ExternalHealthChecker) checkLoop() {
|
||||
defer c.wg.Done()
|
||||
|
||||
// Run immediately on start
|
||||
c.runChecks()
|
||||
|
||||
ticker := time.NewTicker(c.interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-c.ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
c.runChecks()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runChecks performs health checks on all configured systems in parallel.
|
||||
func (c *ExternalHealthChecker) runChecks() {
|
||||
ctx, cancel := context.WithTimeout(c.ctx, 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
results := make(chan domain.ExternalSystemStatus, 3)
|
||||
|
||||
// Check registry (zot)
|
||||
if c.registry != nil {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
regStatus := c.registry.Check(ctx)
|
||||
// Convert domain.RegistryStatus to domain.ExternalSystemStatus
|
||||
status := domain.ExternalSystemStatus{
|
||||
System: domain.ExternalSystemRegistry,
|
||||
Healthy: regStatus.Healthy,
|
||||
URL: regStatus.URL,
|
||||
Error: regStatus.Error,
|
||||
LastChecked: regStatus.LastChecked,
|
||||
}
|
||||
// Parse latency string (e.g., "45ms") to duration
|
||||
if regStatus.Latency != "" {
|
||||
if d, err := time.ParseDuration(regStatus.Latency); err == nil {
|
||||
status.Latency = d
|
||||
}
|
||||
}
|
||||
if status.Healthy {
|
||||
status.LastHealthy = status.LastChecked
|
||||
}
|
||||
results <- status
|
||||
}()
|
||||
}
|
||||
|
||||
// Check CI (woodpecker)
|
||||
if c.ci != nil {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- c.ci.Check(ctx)
|
||||
}()
|
||||
}
|
||||
|
||||
// Check git (gitea)
|
||||
if c.git != nil {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
results <- c.git.Check(ctx)
|
||||
}()
|
||||
}
|
||||
|
||||
// Wait for all checks to complete, then close results channel
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(results)
|
||||
}()
|
||||
|
||||
// Collect results and update state
|
||||
for status := range results {
|
||||
c.updateStatus(status)
|
||||
}
|
||||
}
|
||||
|
||||
// updateStatus updates cached status and logs/metrics on state changes.
|
||||
func (c *ExternalHealthChecker) updateStatus(status domain.ExternalSystemStatus) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
prev, existed := c.statuses[status.System]
|
||||
|
||||
// Preserve LastHealthy from previous status if current is unhealthy
|
||||
if !status.Healthy && existed && !prev.LastHealthy.IsZero() {
|
||||
status.LastHealthy = prev.LastHealthy
|
||||
}
|
||||
|
||||
c.statuses[status.System] = status
|
||||
|
||||
// Log state transitions
|
||||
if !existed {
|
||||
// First check
|
||||
if status.Healthy {
|
||||
c.logger.Info("external system healthy",
|
||||
"system", status.System,
|
||||
"url", status.URL,
|
||||
"latency", status.Latency,
|
||||
)
|
||||
} else {
|
||||
c.logger.Warn("external system unhealthy",
|
||||
"system", status.System,
|
||||
"url", status.URL,
|
||||
"error", status.Error,
|
||||
)
|
||||
}
|
||||
} else if prev.Healthy != status.Healthy {
|
||||
// State changed
|
||||
if status.Healthy {
|
||||
c.logger.Info("external system recovered",
|
||||
"system", status.System,
|
||||
"url", status.URL,
|
||||
"latency", status.Latency,
|
||||
)
|
||||
} else {
|
||||
c.logger.Warn("external system became unhealthy",
|
||||
"system", status.System,
|
||||
"url", status.URL,
|
||||
"error", status.Error,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Update Prometheus metrics
|
||||
metrics.SetExternalSystemHealth(string(status.System), status.Healthy, status.Latency.Seconds())
|
||||
}
|
||||
241
internal/worker/external_health_test.go
Normal file
241
internal/worker/external_health_test.go
Normal file
@ -0,0 +1,241 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/orchard9/rdev/internal/domain"
|
||||
)
|
||||
|
||||
// mockRegistryChecker is a mock implementation of port.RegistryChecker.
|
||||
type mockRegistryChecker struct {
|
||||
healthy bool
|
||||
err string
|
||||
latency time.Duration
|
||||
}
|
||||
|
||||
func (m *mockRegistryChecker) Check(_ context.Context) domain.RegistryStatus {
|
||||
status := domain.RegistryStatus{
|
||||
Healthy: m.healthy,
|
||||
URL: "https://registry.test",
|
||||
Latency: m.latency.String(),
|
||||
LastChecked: time.Now().UTC(),
|
||||
}
|
||||
if !m.healthy {
|
||||
status.Error = m.err
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
// mockExternalHealthChecker is a mock implementation of port.ExternalHealthChecker.
|
||||
type mockExternalHealthChecker struct {
|
||||
system domain.ExternalSystem
|
||||
healthy bool
|
||||
err string
|
||||
latency time.Duration
|
||||
}
|
||||
|
||||
func (m *mockExternalHealthChecker) Check(_ context.Context) domain.ExternalSystemStatus {
|
||||
status := domain.ExternalSystemStatus{
|
||||
System: m.system,
|
||||
Healthy: m.healthy,
|
||||
URL: "https://test.system",
|
||||
Latency: m.latency,
|
||||
LastChecked: time.Now().UTC(),
|
||||
}
|
||||
if m.healthy {
|
||||
status.LastHealthy = status.LastChecked
|
||||
} else {
|
||||
status.Error = m.err
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
func TestExternalHealthChecker_GetStatus(t *testing.T) {
|
||||
registry := &mockRegistryChecker{healthy: true, latency: 50 * time.Millisecond}
|
||||
ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: true, latency: 100 * time.Millisecond}
|
||||
git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true, latency: 75 * time.Millisecond}
|
||||
|
||||
checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{
|
||||
CheckInterval: 100 * time.Millisecond,
|
||||
})
|
||||
|
||||
// Run checks synchronously
|
||||
checker.runChecks()
|
||||
|
||||
// Verify registry status
|
||||
regStatus, ok := checker.GetStatus(domain.ExternalSystemRegistry)
|
||||
if !ok {
|
||||
t.Fatal("expected registry status to exist")
|
||||
}
|
||||
if !regStatus.Healthy {
|
||||
t.Error("expected registry to be healthy")
|
||||
}
|
||||
|
||||
// Verify CI status
|
||||
ciStatus, ok := checker.GetStatus(domain.ExternalSystemCI)
|
||||
if !ok {
|
||||
t.Fatal("expected CI status to exist")
|
||||
}
|
||||
if !ciStatus.Healthy {
|
||||
t.Error("expected CI to be healthy")
|
||||
}
|
||||
|
||||
// Verify Git status
|
||||
gitStatus, ok := checker.GetStatus(domain.ExternalSystemGit)
|
||||
if !ok {
|
||||
t.Fatal("expected Git status to exist")
|
||||
}
|
||||
if !gitStatus.Healthy {
|
||||
t.Error("expected Git to be healthy")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExternalHealthChecker_GetAllStatuses(t *testing.T) {
|
||||
registry := &mockRegistryChecker{healthy: true}
|
||||
ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: false, err: "connection refused"}
|
||||
git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true}
|
||||
|
||||
checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{
|
||||
CheckInterval: 100 * time.Millisecond,
|
||||
})
|
||||
|
||||
checker.runChecks()
|
||||
|
||||
statuses := checker.GetAllStatuses()
|
||||
|
||||
if len(statuses) != 3 {
|
||||
t.Fatalf("expected 3 statuses, got %d", len(statuses))
|
||||
}
|
||||
|
||||
if statuses[domain.ExternalSystemRegistry].Healthy != true {
|
||||
t.Error("expected registry to be healthy")
|
||||
}
|
||||
if statuses[domain.ExternalSystemCI].Healthy != false {
|
||||
t.Error("expected CI to be unhealthy")
|
||||
}
|
||||
if statuses[domain.ExternalSystemCI].Error != "connection refused" {
|
||||
t.Errorf("expected CI error 'connection refused', got %q", statuses[domain.ExternalSystemCI].Error)
|
||||
}
|
||||
if statuses[domain.ExternalSystemGit].Healthy != true {
|
||||
t.Error("expected Git to be healthy")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExternalHealthChecker_NilCheckers(t *testing.T) {
|
||||
// All nil checkers should result in empty statuses
|
||||
checker := NewExternalHealthChecker(nil, nil, nil, ExternalHealthConfig{
|
||||
CheckInterval: 100 * time.Millisecond,
|
||||
})
|
||||
|
||||
checker.runChecks()
|
||||
|
||||
statuses := checker.GetAllStatuses()
|
||||
if len(statuses) != 0 {
|
||||
t.Fatalf("expected 0 statuses with nil checkers, got %d", len(statuses))
|
||||
}
|
||||
}
|
||||
|
||||
func TestExternalHealthChecker_StartStop(t *testing.T) {
|
||||
registry := &mockRegistryChecker{healthy: true}
|
||||
|
||||
checker := NewExternalHealthChecker(registry, nil, nil, ExternalHealthConfig{
|
||||
CheckInterval: 50 * time.Millisecond,
|
||||
})
|
||||
|
||||
checker.Start()
|
||||
|
||||
// Wait for a couple of check cycles
|
||||
time.Sleep(120 * time.Millisecond)
|
||||
|
||||
// Verify status was populated
|
||||
status, ok := checker.GetStatus(domain.ExternalSystemRegistry)
|
||||
if !ok {
|
||||
t.Fatal("expected registry status after start")
|
||||
}
|
||||
if !status.Healthy {
|
||||
t.Error("expected registry to be healthy")
|
||||
}
|
||||
|
||||
checker.Stop()
|
||||
|
||||
// After stop, statuses should still be available (cached)
|
||||
status, ok = checker.GetStatus(domain.ExternalSystemRegistry)
|
||||
if !ok {
|
||||
t.Fatal("expected registry status after stop")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExternalHealthChecker_StateTransition(t *testing.T) {
|
||||
registry := &mockRegistryChecker{healthy: true}
|
||||
|
||||
checker := NewExternalHealthChecker(registry, nil, nil, ExternalHealthConfig{
|
||||
CheckInterval: 100 * time.Millisecond,
|
||||
})
|
||||
|
||||
// Initial check - healthy
|
||||
checker.runChecks()
|
||||
status, _ := checker.GetStatus(domain.ExternalSystemRegistry)
|
||||
if !status.Healthy {
|
||||
t.Error("expected initial status to be healthy")
|
||||
}
|
||||
firstHealthy := status.LastHealthy
|
||||
|
||||
// Change to unhealthy
|
||||
registry.healthy = false
|
||||
registry.err = "connection refused"
|
||||
checker.runChecks()
|
||||
|
||||
status, _ = checker.GetStatus(domain.ExternalSystemRegistry)
|
||||
if status.Healthy {
|
||||
t.Error("expected status to be unhealthy after state change")
|
||||
}
|
||||
// LastHealthy should be preserved from when it was healthy
|
||||
if status.LastHealthy.IsZero() {
|
||||
t.Error("expected LastHealthy to be preserved")
|
||||
}
|
||||
if !status.LastHealthy.Equal(firstHealthy) {
|
||||
t.Error("expected LastHealthy to remain from healthy period")
|
||||
}
|
||||
|
||||
// Recover to healthy
|
||||
registry.healthy = true
|
||||
registry.err = ""
|
||||
checker.runChecks()
|
||||
|
||||
status, _ = checker.GetStatus(domain.ExternalSystemRegistry)
|
||||
if !status.Healthy {
|
||||
t.Error("expected status to be healthy after recovery")
|
||||
}
|
||||
// LastHealthy should be updated
|
||||
if status.LastHealthy.Before(firstHealthy) {
|
||||
t.Error("expected LastHealthy to be updated on recovery")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExternalHealthChecker_PartialFailure(t *testing.T) {
|
||||
// Registry healthy, CI unhealthy, Git healthy
|
||||
registry := &mockRegistryChecker{healthy: true}
|
||||
ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: false, err: "timeout"}
|
||||
git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true}
|
||||
|
||||
checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{
|
||||
CheckInterval: 100 * time.Millisecond,
|
||||
})
|
||||
|
||||
checker.runChecks()
|
||||
|
||||
statuses := checker.GetAllStatuses()
|
||||
|
||||
// Partial failure should not affect other systems
|
||||
if !statuses[domain.ExternalSystemRegistry].Healthy {
|
||||
t.Error("registry should be healthy despite CI failure")
|
||||
}
|
||||
if statuses[domain.ExternalSystemCI].Healthy {
|
||||
t.Error("CI should be unhealthy")
|
||||
}
|
||||
if !statuses[domain.ExternalSystemGit].Healthy {
|
||||
t.Error("git should be healthy despite CI failure")
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user