feat: add diagnostics endpoint and external health monitoring
- Add /diagnostics endpoint for system health overview
- Add external health worker for monitoring Gitea, Woodpecker, Registry
- Add health check methods to Gitea and Woodpecker clients
- Remove hardcoded fallback projects (pantheon, aeries)
- Add diagnostics domain types and service layer
- Add comprehensive tests for diagnostics handler and service
- Fix tests to use registered test project instead of hardcoded one

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
9128dd23b5
commit
210064d490
49
ai-lookup/services/external-health.md
Normal file
49
ai-lookup/services/external-health.md
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# External Health Checker
|
||||||
|
|
||||||
|
**Last Updated:** 2026-02-03
|
||||||
|
**Confidence:** High
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Background worker that continuously monitors external systems (registry, CI, git) and surfaces issues proactively via metrics, logs, and the `/ready` endpoint. Runs every 30s, caches results for instant lookups, and logs state transitions.
|
||||||
|
|
||||||
|
**Key Facts:**
|
||||||
|
- Monitors: `registry` (zot), `ci` (woodpecker), `git` (gitea)
|
||||||
|
- Check interval: 30 seconds (configurable)
|
||||||
|
- Caches results for `/ready` endpoint (no blocking network calls)
|
||||||
|
- Logs only on state changes (healthy→unhealthy, unhealthy→healthy)
|
||||||
|
- Preserves `LastHealthy` timestamp through unhealthy periods
|
||||||
|
|
||||||
|
**File Pointers:**
|
||||||
|
- Domain types: `internal/domain/external_health.go`
|
||||||
|
- Worker implementation: `internal/worker/external_health.go`
|
||||||
|
- Port interface: `internal/port/health.go:ExternalHealthChecker`
|
||||||
|
- Handler integration: `internal/handlers/health.go:WithExternalHealthChecker`
|
||||||
|
- Wiring: `cmd/rdev-api/main.go:433-455`
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
1. Background goroutine polls all configured external systems every 30s
|
||||||
|
2. Checks run in parallel with 10s timeout per system
|
||||||
|
3. Results cached in thread-safe map
|
||||||
|
4. `/ready` reads cached statuses (no network calls)
|
||||||
|
5. Prometheus metrics updated on each check cycle
|
||||||
|
|
||||||
|
**Adapter implementations:**
|
||||||
|
- Registry: `internal/adapter/zot/client.go:Check()` - calls `/v2/` endpoint
|
||||||
|
- CI: `internal/adapter/woodpecker/client.go:Check()` - calls `Self()` API
|
||||||
|
- Git: `internal/adapter/gitea/client.go:Check()` - calls `ListMyOrgs()`
|
||||||
|
|
||||||
|
## Prometheus Metrics
|
||||||
|
|
||||||
|
| Metric | Type | Labels | Description |
|
||||||
|
|--------|------|--------|-------------|
|
||||||
|
| `rdev_external_system_healthy` | Gauge | `system` | 1=healthy, 0=unhealthy |
|
||||||
|
| `rdev_external_system_latency_seconds` | Gauge | `system` | Check latency |
|
||||||
|
| `rdev_external_system_last_check_timestamp` | Gauge | `system` | Unix timestamp of last check |
|
||||||
|
|
||||||
|
## Related Topics
|
||||||
|
|
||||||
|
- [Work Queue](./work-queue.md) - Uses similar background worker pattern
|
||||||
|
- [CI Provider](./ci-provider.md) - Woodpecker adapter details
|
||||||
|
- [Worker Pool](./worker-pool.md) - Another background worker example
|
||||||
@ -418,12 +418,43 @@ func main() {
|
|||||||
logger.Info("registry health checker initialized", "url", registryURL)
|
logger.Info("registry health checker initialized", "url", registryURL)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Initialize diagnostics service (aggregates health data for debugging)
|
||||||
|
diagnosticsService := service.NewDiagnosticsService(
|
||||||
|
operationRepo,
|
||||||
|
registryChecker,
|
||||||
|
woodpeckerClient,
|
||||||
|
service.DiagnosticsServiceConfig{
|
||||||
|
DefaultGitOwner: infraCfg.GiteaDefaultOrg,
|
||||||
|
Logger: logger,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
diagnosticsHandler := handlers.NewDiagnosticsHandler(diagnosticsService, projectRepo, logger)
|
||||||
|
|
||||||
|
// Initialize external health checker (background monitoring of registry, CI, git)
|
||||||
|
var externalHealthChecker *worker.ExternalHealthChecker
|
||||||
|
if registryChecker != nil || woodpeckerClient != nil || giteaClient != nil {
|
||||||
|
externalHealthChecker = worker.NewExternalHealthChecker(
|
||||||
|
registryChecker,
|
||||||
|
woodpeckerClient,
|
||||||
|
giteaClient,
|
||||||
|
worker.ExternalHealthConfig{
|
||||||
|
CheckInterval: 30 * time.Second,
|
||||||
|
Logger: logger,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
externalHealthChecker.Start()
|
||||||
|
logger.Info("external health checker started")
|
||||||
|
}
|
||||||
|
|
||||||
// Override default health/ready endpoints with full dependency checks
|
// Override default health/ready endpoints with full dependency checks
|
||||||
healthHandler := handlers.NewHealthHandler("rdev-api", database.DB, nil).
|
healthHandler := handlers.NewHealthHandler("rdev-api", database.DB, nil).
|
||||||
WithAgentRegistry(agentRegistry)
|
WithAgentRegistry(agentRegistry)
|
||||||
if registryChecker != nil {
|
if registryChecker != nil {
|
||||||
healthHandler = healthHandler.WithRegistryChecker(registryChecker)
|
healthHandler = healthHandler.WithRegistryChecker(registryChecker)
|
||||||
}
|
}
|
||||||
|
if externalHealthChecker != nil {
|
||||||
|
healthHandler = healthHandler.WithExternalHealthChecker(externalHealthChecker)
|
||||||
|
}
|
||||||
|
|
||||||
app.Router().Get("/health", healthHandler.Health)
|
app.Router().Get("/health", healthHandler.Health)
|
||||||
app.Router().Get("/ready", healthHandler.Ready)
|
app.Router().Get("/ready", healthHandler.Ready)
|
||||||
@ -448,6 +479,7 @@ func main() {
|
|||||||
buildsHandler.Mount(app.Router())
|
buildsHandler.Mount(app.Router())
|
||||||
createAndBuildHandler.Mount(app.Router())
|
createAndBuildHandler.Mount(app.Router())
|
||||||
operationsHandler.Mount(app.Router())
|
operationsHandler.Mount(app.Router())
|
||||||
|
diagnosticsHandler.Mount(app.Router())
|
||||||
sdlcHandler.Mount(app.Router())
|
sdlcHandler.Mount(app.Router())
|
||||||
sdlcOrchestratorHandler.Mount(app.Router())
|
sdlcOrchestratorHandler.Mount(app.Router())
|
||||||
|
|
||||||
@ -514,6 +546,9 @@ func main() {
|
|||||||
app.EnableDocs(buildOpenAPISpec())
|
app.EnableDocs(buildOpenAPISpec())
|
||||||
|
|
||||||
app.OnShutdown(func(ctx context.Context) error {
|
app.OnShutdown(func(ctx context.Context) error {
|
||||||
|
if externalHealthChecker != nil {
|
||||||
|
externalHealthChecker.Stop()
|
||||||
|
}
|
||||||
workExecutor.Stop()
|
workExecutor.Stop()
|
||||||
queueMaintenance.Stop()
|
queueMaintenance.Stop()
|
||||||
operationCleanup.Stop()
|
operationCleanup.Stop()
|
||||||
|
|||||||
@ -11,18 +11,21 @@ package gitea
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
"code.gitea.io/sdk/gitea"
|
"code.gitea.io/sdk/gitea"
|
||||||
"github.com/orchard9/rdev/internal/domain"
|
"github.com/orchard9/rdev/internal/domain"
|
||||||
"github.com/orchard9/rdev/internal/port"
|
"github.com/orchard9/rdev/internal/port"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Ensure Client implements GitRepository.
|
// Ensure Client implements GitRepository and ExternalHealthChecker.
|
||||||
var _ port.GitRepository = (*Client)(nil)
|
var _ port.GitRepository = (*Client)(nil)
|
||||||
|
var _ port.ExternalHealthChecker = (*Client)(nil)
|
||||||
|
|
||||||
// Client is a Gitea API client adapter.
|
// Client is a Gitea API client adapter.
|
||||||
type Client struct {
|
type Client struct {
|
||||||
client *gitea.Client
|
client *gitea.Client
|
||||||
|
url string // Gitea server URL for health checks
|
||||||
defaultOwner string // default organization/user for new repos
|
defaultOwner string // default organization/user for new repos
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,6 +46,7 @@ func NewClient(url, token, defaultOwner string) (*Client, error) {
|
|||||||
}
|
}
|
||||||
return &Client{
|
return &Client{
|
||||||
client: client,
|
client: client,
|
||||||
|
url: url,
|
||||||
defaultOwner: defaultOwner,
|
defaultOwner: defaultOwner,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
@ -208,6 +212,45 @@ func (c *Client) DeleteWebhook(ctx context.Context, owner, repo string, webhookI
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check returns the health status of the Gitea server.
|
||||||
|
// Implements port.ExternalHealthChecker.
|
||||||
|
func (c *Client) Check(ctx context.Context) domain.ExternalSystemStatus {
|
||||||
|
start := time.Now()
|
||||||
|
status := domain.ExternalSystemStatus{
|
||||||
|
System: domain.ExternalSystemGit,
|
||||||
|
URL: c.url,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Gitea SDK doesn't support context propagation for HTTP requests,
|
||||||
|
// but check for cancellation before making the call.
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
status.Latency = time.Since(start)
|
||||||
|
status.LastChecked = time.Now().UTC()
|
||||||
|
status.Healthy = false
|
||||||
|
status.Error = ctx.Err().Error()
|
||||||
|
return status
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call ListMyOrgs (lightweight, tests auth)
|
||||||
|
_, _, err := c.client.ListMyOrgs(gitea.ListOrgsOptions{
|
||||||
|
ListOptions: gitea.ListOptions{PageSize: 1},
|
||||||
|
})
|
||||||
|
status.Latency = time.Since(start)
|
||||||
|
status.LastChecked = time.Now().UTC()
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
status.Healthy = false
|
||||||
|
status.Error = err.Error()
|
||||||
|
} else {
|
||||||
|
status.Healthy = true
|
||||||
|
status.LastHealthy = status.LastChecked
|
||||||
|
}
|
||||||
|
|
||||||
|
return status
|
||||||
|
}
|
||||||
|
|
||||||
// repoFromGitea converts a gitea.Repository to domain.Repo.
|
// repoFromGitea converts a gitea.Repository to domain.Repo.
|
||||||
func repoFromGitea(r *gitea.Repository) *domain.Repo {
|
func repoFromGitea(r *gitea.Repository) *domain.Repo {
|
||||||
return &domain.Repo{
|
return &domain.Repo{
|
||||||
|
|||||||
@ -61,23 +61,9 @@ func NewProjectRepositoryWithClient(namespace string, client *kubernetes.Clients
|
|||||||
}
|
}
|
||||||
|
|
||||||
// initFallbackProjects adds hardcoded projects for when K8s client is unavailable.
|
// initFallbackProjects adds hardcoded projects for when K8s client is unavailable.
|
||||||
|
// Currently empty - projects are discovered dynamically from K8s or stored in the database.
|
||||||
func (r *ProjectRepository) initFallbackProjects() {
|
func (r *ProjectRepository) initFallbackProjects() {
|
||||||
r.projects["pantheon"] = &domain.Project{
|
// No hardcoded fallback projects
|
||||||
ID: "pantheon",
|
|
||||||
Name: "Pantheon",
|
|
||||||
Description: "Go API backend",
|
|
||||||
PodName: "claudebox-pantheon-0",
|
|
||||||
Status: domain.ProjectStatusUnknown,
|
|
||||||
Workspace: "/workspace",
|
|
||||||
}
|
|
||||||
r.projects["aeries"] = &domain.Project{
|
|
||||||
ID: "aeries",
|
|
||||||
Name: "Aeries",
|
|
||||||
Description: "Note community platform",
|
|
||||||
PodName: "claudebox-aeries-0",
|
|
||||||
Status: domain.ProjectStatusUnknown,
|
|
||||||
Workspace: "/workspace",
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure ProjectRepository implements port.ProjectRepository at compile time.
|
// Ensure ProjectRepository implements port.ProjectRepository at compile time.
|
||||||
|
|||||||
@ -28,8 +28,9 @@ import (
|
|||||||
"github.com/orchard9/rdev/internal/port"
|
"github.com/orchard9/rdev/internal/port"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Ensure Client implements CIProvider.
|
// Ensure Client implements CIProvider and ExternalHealthChecker.
|
||||||
var _ port.CIProvider = (*Client)(nil)
|
var _ port.CIProvider = (*Client)(nil)
|
||||||
|
var _ port.ExternalHealthChecker = (*Client)(nil)
|
||||||
|
|
||||||
// tokenTransport is an http.RoundTripper that adds bearer token auth.
|
// tokenTransport is an http.RoundTripper that adds bearer token auth.
|
||||||
type tokenTransport struct {
|
type tokenTransport struct {
|
||||||
@ -331,6 +332,42 @@ func (c *Client) DeleteSecret(ctx context.Context, owner, repo, secretName strin
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check returns the health status of the Woodpecker CI system.
|
||||||
|
// Implements port.ExternalHealthChecker.
|
||||||
|
func (c *Client) Check(ctx context.Context) domain.ExternalSystemStatus {
|
||||||
|
start := time.Now()
|
||||||
|
status := domain.ExternalSystemStatus{
|
||||||
|
System: domain.ExternalSystemCI,
|
||||||
|
URL: c.url,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check context cancellation
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
status.Latency = time.Since(start)
|
||||||
|
status.LastChecked = time.Now().UTC()
|
||||||
|
status.Healthy = false
|
||||||
|
status.Error = ctx.Err().Error()
|
||||||
|
return status
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call Self() to get current user info (lightweight, tests auth)
|
||||||
|
_, err := c.client.Self()
|
||||||
|
status.Latency = time.Since(start)
|
||||||
|
status.LastChecked = time.Now().UTC()
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
status.Healthy = false
|
||||||
|
status.Error = err.Error()
|
||||||
|
} else {
|
||||||
|
status.Healthy = true
|
||||||
|
status.LastHealthy = status.LastChecked
|
||||||
|
}
|
||||||
|
|
||||||
|
return status
|
||||||
|
}
|
||||||
|
|
||||||
// repoFromWoodpecker converts a woodpecker.Repo to domain.CIRepo.
|
// repoFromWoodpecker converts a woodpecker.Repo to domain.CIRepo.
|
||||||
func repoFromWoodpecker(r *woodpecker.Repo) *domain.CIRepo {
|
func repoFromWoodpecker(r *woodpecker.Repo) *domain.CIRepo {
|
||||||
// Parse forge remote ID (string in SDK, int64 in our domain)
|
// Parse forge remote ID (string in SDK, int64 in our domain)
|
||||||
|
|||||||
110
internal/domain/diagnostics.go
Normal file
110
internal/domain/diagnostics.go
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
package domain
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// ProjectDiagnostics provides a unified view of project health for debugging.
|
||||||
|
// It aggregates data from operations, CI pipelines, and registry health.
|
||||||
|
type ProjectDiagnostics struct {
|
||||||
|
// ProjectID is the project being diagnosed.
|
||||||
|
ProjectID string `json:"project_id"`
|
||||||
|
|
||||||
|
// GeneratedAt is when this diagnostic was generated.
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
|
||||||
|
// Summary is a one-line status: "healthy", "degraded", or "unhealthy".
|
||||||
|
Summary string `json:"summary"`
|
||||||
|
|
||||||
|
// Issues is a list of detected problems (empty if healthy).
|
||||||
|
Issues []DiagnosticIssue `json:"issues,omitempty"`
|
||||||
|
|
||||||
|
// RecentOperations are the last N operations for this project.
|
||||||
|
RecentOperations []OperationSummary `json:"recent_operations,omitempty"`
|
||||||
|
|
||||||
|
// CI contains CI/CD pipeline status.
|
||||||
|
CI *CIDiagnostics `json:"ci,omitempty"`
|
||||||
|
|
||||||
|
// Registry contains container registry health.
|
||||||
|
Registry *RegistryStatus `json:"registry,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// DiagnosticIssue describes one problem surfaced by a diagnostics run.
type DiagnosticIssue struct {
	// Severity is one of "error", "warning" or "info".
	Severity string `json:"severity"`

	// Source names where the issue came from: "operation", "ci" or
	// "registry".
	Source string `json:"source"`

	// Message is the human-readable description of the issue.
	Message string `json:"message"`

	// Details carries extra context such as error output; omitted when
	// empty.
	Details string `json:"details,omitempty"`

	// Timestamp is when the issue occurred.
	// NOTE(review): encoding/json never treats a struct value as empty, so
	// omitempty has no effect here and a zero time is still emitted.
	Timestamp time.Time `json:"timestamp,omitempty"`
}
|
||||||
|
|
||||||
|
// OperationSummary is a condensed view of an operation for diagnostics.
|
||||||
|
type OperationSummary struct {
|
||||||
|
ID string `json:"id"`
|
||||||
|
Type OperationType `json:"type"`
|
||||||
|
Status OperationStatus `json:"status"`
|
||||||
|
StartedAt time.Time `json:"started_at"`
|
||||||
|
DurationMs int64 `json:"duration_ms,omitempty"`
|
||||||
|
Error string `json:"error,omitempty"`
|
||||||
|
ExternalRef string `json:"external_ref,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// CIDiagnostics contains CI pipeline health information.
|
||||||
|
type CIDiagnostics struct {
|
||||||
|
// Available indicates if CI is configured for this project.
|
||||||
|
Available bool `json:"available"`
|
||||||
|
|
||||||
|
// RecentPipelines are the last N pipeline executions.
|
||||||
|
RecentPipelines []CIPipelineSummary `json:"recent_pipelines,omitempty"`
|
||||||
|
|
||||||
|
// LastFailure contains details of the most recent failed pipeline.
|
||||||
|
LastFailure *CIPipelineFailure `json:"last_failure,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// CIPipelineSummary is the trimmed-down pipeline record included in a
// diagnostics report.
type CIPipelineSummary struct {
	// Number is the pipeline number.
	Number int64 `json:"number"`
	// Status is the pipeline status as a string.
	Status string `json:"status"`
	// Branch is the branch associated with the pipeline.
	Branch string `json:"branch"`
	// Commit is the commit associated with the pipeline.
	Commit string `json:"commit"`
	// StartedAt is when the pipeline started.
	StartedAt time.Time `json:"started_at"`
	// Duration is the run time rendered as a string; omitted when empty.
	Duration string `json:"duration,omitempty"`
}
|
||||||
|
|
||||||
|
// CIPipelineFailure captures the details of one failed pipeline.
type CIPipelineFailure struct {
	// Number is the number of the failed pipeline.
	Number int64 `json:"number"`
	// FailedStep names the step that failed.
	FailedStep string `json:"failed_step"`
	// Error is the failure message; omitted when empty.
	Error string `json:"error,omitempty"`
	// LogTail is the tail of the log output; omitted when empty.
	LogTail string `json:"log_tail,omitempty"`
	// URL is a link for the failure; omitted when empty.
	URL string `json:"url,omitempty"`
	// Timestamp is the time associated with the failure.
	Timestamp time.Time `json:"timestamp"`
}
|
||||||
|
|
||||||
|
// Values for ProjectDiagnostics.Summary.
const (
	DiagnosticsSummaryHealthy   = "healthy"
	DiagnosticsSummaryDegraded  = "degraded"
	DiagnosticsSummaryUnhealthy = "unhealthy"
)

// Values for DiagnosticIssue.Severity.
const (
	DiagnosticSeverityError   = "error"
	DiagnosticSeverityWarning = "warning"
	DiagnosticSeverityInfo    = "info"
)

// Values for DiagnosticIssue.Source.
const (
	DiagnosticSourceOperation = "operation"
	DiagnosticSourceCI        = "ci"
	DiagnosticSourceRegistry  = "registry"
)
|
||||||
23
internal/domain/external_health.go
Normal file
23
internal/domain/external_health.go
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
package domain
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// ExternalSystem names an external dependency whose health is tracked.
type ExternalSystem string

// The external systems that can be identified.
const (
	ExternalSystemRegistry ExternalSystem = "registry"
	ExternalSystemCI       ExternalSystem = "ci"
	ExternalSystemGit      ExternalSystem = "git"
)
|
||||||
|
|
||||||
|
// ExternalSystemStatus represents the health of an external system.
|
||||||
|
type ExternalSystemStatus struct {
|
||||||
|
System ExternalSystem `json:"system"`
|
||||||
|
Healthy bool `json:"healthy"`
|
||||||
|
URL string `json:"url"`
|
||||||
|
Latency time.Duration `json:"latency"`
|
||||||
|
Error string `json:"error,omitempty"`
|
||||||
|
LastChecked time.Time `json:"last_checked"`
|
||||||
|
LastHealthy time.Time `json:"last_healthy,omitempty"`
|
||||||
|
}
|
||||||
87
internal/handlers/diagnostics.go
Normal file
87
internal/handlers/diagnostics.go
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
// Package handlers provides HTTP handlers for the rdev API.
|
||||||
|
package handlers
|
||||||
|
|
||||||
|
import (
	"context"
	"errors"
	"log/slog"
	"net/http"

	"github.com/go-chi/chi/v5"

	"github.com/orchard9/rdev/internal/auth"
	"github.com/orchard9/rdev/internal/domain"
	"github.com/orchard9/rdev/internal/port"
	"github.com/orchard9/rdev/pkg/api"
)
|
||||||
|
|
||||||
|
// DiagnosticsGetter retrieves project diagnostics.
|
||||||
|
type DiagnosticsGetter interface {
|
||||||
|
GetDiagnostics(ctx context.Context, projectID string) (*domain.ProjectDiagnostics, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// DiagnosticsHandler handles project diagnostics requests.
|
||||||
|
type DiagnosticsHandler struct {
|
||||||
|
diagnostics DiagnosticsGetter
|
||||||
|
projects port.ProjectRepository
|
||||||
|
logger *slog.Logger
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDiagnosticsHandler creates a new diagnostics handler.
|
||||||
|
func NewDiagnosticsHandler(
|
||||||
|
diagnostics DiagnosticsGetter,
|
||||||
|
projects port.ProjectRepository,
|
||||||
|
logger *slog.Logger,
|
||||||
|
) *DiagnosticsHandler {
|
||||||
|
if logger == nil {
|
||||||
|
logger = slog.Default()
|
||||||
|
}
|
||||||
|
return &DiagnosticsHandler{
|
||||||
|
diagnostics: diagnostics,
|
||||||
|
projects: projects,
|
||||||
|
logger: logger,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mount registers the diagnostics routes.
|
||||||
|
func (h *DiagnosticsHandler) Mount(r api.Router) {
|
||||||
|
r.Route("/projects/{projectId}/diagnostics", func(r chi.Router) {
|
||||||
|
r.With(auth.RequireScope(auth.ScopeProjectsRead, auth.ScopeAdmin)).Get("/", h.GetDiagnostics)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDiagnostics returns comprehensive health information for a project.
|
||||||
|
// GET /projects/{projectId}/diagnostics
|
||||||
|
func (h *DiagnosticsHandler) GetDiagnostics(w http.ResponseWriter, r *http.Request) {
|
||||||
|
ctx, cancel := context.WithTimeout(r.Context(), TimeoutStandard)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
projectID := chi.URLParam(r, "projectId")
|
||||||
|
if projectID == "" {
|
||||||
|
api.WriteBadRequest(w, r, "project ID is required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify project exists (optional - diagnostics can still be useful for non-k8s projects)
|
||||||
|
if h.projects != nil {
|
||||||
|
if _, err := h.projects.Get(ctx, domain.ProjectID(projectID)); err != nil {
|
||||||
|
if err == domain.ErrProjectNotFound {
|
||||||
|
// Log but continue - the project might exist in git/CI but not as a k8s pod
|
||||||
|
h.logger.Debug("project not found in k8s, continuing with diagnostics",
|
||||||
|
"project_id", projectID,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
diag, err := h.diagnostics.GetDiagnostics(ctx, projectID)
|
||||||
|
if err != nil {
|
||||||
|
h.logger.Error("failed to get diagnostics",
|
||||||
|
"error", err,
|
||||||
|
"project_id", projectID,
|
||||||
|
)
|
||||||
|
api.WriteInternalError(w, r, "failed to retrieve diagnostics")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
api.WriteSuccess(w, r, diag)
|
||||||
|
}
|
||||||
148
internal/handlers/diagnostics_test.go
Normal file
148
internal/handlers/diagnostics_test.go
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
package handlers
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/go-chi/chi/v5"
|
||||||
|
|
||||||
|
"github.com/orchard9/rdev/internal/domain"
|
||||||
|
)
|
||||||
|
|
||||||
|
// mockDiagnosticsGetter implements DiagnosticsGetter for testing.
|
||||||
|
type mockDiagnosticsGetter struct {
|
||||||
|
diagnostics *domain.ProjectDiagnostics
|
||||||
|
err error
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockDiagnosticsGetter) GetDiagnostics(_ context.Context, projectID string) (*domain.ProjectDiagnostics, error) {
|
||||||
|
if m.err != nil {
|
||||||
|
return nil, m.err
|
||||||
|
}
|
||||||
|
if m.diagnostics != nil {
|
||||||
|
return m.diagnostics, nil
|
||||||
|
}
|
||||||
|
// Return default healthy diagnostics
|
||||||
|
return &domain.ProjectDiagnostics{
|
||||||
|
ProjectID: projectID,
|
||||||
|
GeneratedAt: time.Now().UTC(),
|
||||||
|
Summary: domain.DiagnosticsSummaryHealthy,
|
||||||
|
Issues: []domain.DiagnosticIssue{},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiagnosticsHandler_GetDiagnostics_Success(t *testing.T) {
|
||||||
|
getter := &mockDiagnosticsGetter{}
|
||||||
|
projects := newMockProjectRepo()
|
||||||
|
h := NewDiagnosticsHandler(getter, projects, nil)
|
||||||
|
|
||||||
|
// Create router with chi to handle URL params
|
||||||
|
r := chi.NewRouter()
|
||||||
|
r.Use(testAdminAuth) // Add auth context for tests
|
||||||
|
h.Mount(r)
|
||||||
|
|
||||||
|
req := httptest.NewRequest("GET", "/projects/test-project/diagnostics/", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
r.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Errorf("GetDiagnostics() status = %d, want %d; body = %s", rec.Code, http.StatusOK, rec.Body.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
var resp map[string]any
|
||||||
|
if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
|
||||||
|
t.Fatalf("failed to decode response: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, ok := resp["data"].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("response missing data field")
|
||||||
|
}
|
||||||
|
|
||||||
|
if data["project_id"] != "test-project" {
|
||||||
|
t.Errorf("project_id = %q, want %q", data["project_id"], "test-project")
|
||||||
|
}
|
||||||
|
if data["summary"] != domain.DiagnosticsSummaryHealthy {
|
||||||
|
t.Errorf("summary = %q, want %q", data["summary"], domain.DiagnosticsSummaryHealthy)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiagnosticsHandler_GetDiagnostics_WithIssues(t *testing.T) {
|
||||||
|
getter := &mockDiagnosticsGetter{
|
||||||
|
diagnostics: &domain.ProjectDiagnostics{
|
||||||
|
ProjectID: "unhealthy-project",
|
||||||
|
GeneratedAt: time.Now().UTC(),
|
||||||
|
Summary: domain.DiagnosticsSummaryUnhealthy,
|
||||||
|
Issues: []domain.DiagnosticIssue{
|
||||||
|
{
|
||||||
|
Severity: domain.DiagnosticSeverityError,
|
||||||
|
Source: domain.DiagnosticSourceCI,
|
||||||
|
Message: "CI build #42 failed",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Severity: domain.DiagnosticSeverityWarning,
|
||||||
|
Source: domain.DiagnosticSourceRegistry,
|
||||||
|
Message: "Container registry slow",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
projects := newMockProjectRepo()
|
||||||
|
h := NewDiagnosticsHandler(getter, projects, nil)
|
||||||
|
|
||||||
|
r := chi.NewRouter()
|
||||||
|
r.Use(testAdminAuth) // Add auth context for tests
|
||||||
|
h.Mount(r)
|
||||||
|
|
||||||
|
req := httptest.NewRequest("GET", "/projects/unhealthy-project/diagnostics/", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
r.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Errorf("GetDiagnostics() status = %d, want %d", rec.Code, http.StatusOK)
|
||||||
|
}
|
||||||
|
|
||||||
|
var resp map[string]any
|
||||||
|
if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil {
|
||||||
|
t.Fatalf("failed to decode response: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, ok := resp["data"].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("response missing data field")
|
||||||
|
}
|
||||||
|
|
||||||
|
if data["summary"] != domain.DiagnosticsSummaryUnhealthy {
|
||||||
|
t.Errorf("summary = %q, want %q", data["summary"], domain.DiagnosticsSummaryUnhealthy)
|
||||||
|
}
|
||||||
|
|
||||||
|
issues, ok := data["issues"].([]any)
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("response missing issues field")
|
||||||
|
}
|
||||||
|
if len(issues) != 2 {
|
||||||
|
t.Errorf("issues count = %d, want %d", len(issues), 2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiagnosticsHandler_GetDiagnostics_MissingProjectID(t *testing.T) {
|
||||||
|
getter := &mockDiagnosticsGetter{}
|
||||||
|
projects := newMockProjectRepo()
|
||||||
|
h := NewDiagnosticsHandler(getter, projects, nil)
|
||||||
|
|
||||||
|
// Direct call without chi router to test missing projectId
|
||||||
|
req := httptest.NewRequest("GET", "/projects//diagnostics/", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.GetDiagnostics(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusBadRequest {
|
||||||
|
t.Errorf("GetDiagnostics() status = %d, want %d", rec.Code, http.StatusBadRequest)
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -8,6 +8,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/orchard9/rdev/internal/domain"
|
||||||
"github.com/orchard9/rdev/internal/metrics"
|
"github.com/orchard9/rdev/internal/metrics"
|
||||||
"github.com/orchard9/rdev/internal/port"
|
"github.com/orchard9/rdev/internal/port"
|
||||||
"github.com/orchard9/rdev/pkg/api"
|
"github.com/orchard9/rdev/pkg/api"
|
||||||
@ -19,6 +20,11 @@ type ExecutorHealthChecker interface {
|
|||||||
WorkerID() string
|
WorkerID() string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ExternalHealthStatusProvider provides cached external system health statuses.
|
||||||
|
type ExternalHealthStatusProvider interface {
|
||||||
|
GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus
|
||||||
|
}
|
||||||
|
|
||||||
// HealthHandler handles health and readiness checks.
|
// HealthHandler handles health and readiness checks.
|
||||||
type HealthHandler struct {
|
type HealthHandler struct {
|
||||||
serviceName string
|
serviceName string
|
||||||
@ -27,6 +33,7 @@ type HealthHandler struct {
|
|||||||
agentRegistry port.CodeAgentRegistry
|
agentRegistry port.CodeAgentRegistry
|
||||||
workExecutor ExecutorHealthChecker
|
workExecutor ExecutorHealthChecker
|
||||||
registryChecker port.RegistryChecker
|
registryChecker port.RegistryChecker
|
||||||
|
externalChecker ExternalHealthStatusProvider
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewHealthHandler creates a new health handler with dependencies.
|
// NewHealthHandler creates a new health handler with dependencies.
|
||||||
@ -56,6 +63,12 @@ func (h *HealthHandler) WithRegistryChecker(checker port.RegistryChecker) *Healt
|
|||||||
return h
|
return h
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WithExternalHealthChecker adds a cached external health checker for monitoring.
|
||||||
|
func (h *HealthHandler) WithExternalHealthChecker(checker ExternalHealthStatusProvider) *HealthHandler {
|
||||||
|
h.externalChecker = checker
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
|
||||||
// Health returns a simple liveness check.
|
// Health returns a simple liveness check.
|
||||||
// This should be lightweight and only fail if the process is unhealthy.
|
// This should be lightweight and only fail if the process is unhealthy.
|
||||||
// GET /health
|
// GET /health
|
||||||
@ -113,6 +126,26 @@ func (h *HealthHandler) Ready(w http.ResponseWriter, r *http.Request) {
|
|||||||
checks["registry"] = h.checkRegistry(ctx)
|
checks["registry"] = h.checkRegistry(ctx)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// External system checks (cached, from background worker)
|
||||||
|
if h.externalChecker != nil {
|
||||||
|
for system, status := range h.externalChecker.GetAllStatuses() {
|
||||||
|
checks["external:"+string(system)] = CheckResult{
|
||||||
|
Healthy: status.Healthy,
|
||||||
|
Message: status.Error,
|
||||||
|
Latency: status.Latency.String(),
|
||||||
|
LastCheck: status.LastChecked,
|
||||||
|
}
|
||||||
|
if status.Healthy {
|
||||||
|
checks["external:"+string(system)] = CheckResult{
|
||||||
|
Healthy: true,
|
||||||
|
Message: "connected",
|
||||||
|
Latency: status.Latency.String(),
|
||||||
|
LastCheck: status.LastChecked,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
response := ReadinessResponse{
|
response := ReadinessResponse{
|
||||||
Status: "ready",
|
Status: "ready",
|
||||||
Service: h.serviceName,
|
Service: h.serviceName,
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package handlers
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
@ -10,11 +11,22 @@ import (
|
|||||||
|
|
||||||
"github.com/go-chi/chi/v5"
|
"github.com/go-chi/chi/v5"
|
||||||
"github.com/orchard9/rdev/internal/adapter/kubernetes"
|
"github.com/orchard9/rdev/internal/adapter/kubernetes"
|
||||||
|
"github.com/orchard9/rdev/internal/domain"
|
||||||
)
|
)
|
||||||
|
|
||||||
// newTestProjectsHandler creates a ProjectsHandler for testing.
|
// newTestProjectsHandler creates a ProjectsHandler for testing.
|
||||||
|
// It registers a test project "test-project" for use in tests.
|
||||||
func newTestProjectsHandler() *ProjectsHandler {
|
func newTestProjectsHandler() *ProjectsHandler {
|
||||||
repo := kubernetes.NewProjectRepository("test-namespace")
|
repo := kubernetes.NewProjectRepository("test-namespace")
|
||||||
|
// Register a test project for tests to use
|
||||||
|
_ = repo.Register(context.Background(), &domain.Project{
|
||||||
|
ID: "test-project",
|
||||||
|
Name: "Test Project",
|
||||||
|
Description: "Test project for unit tests",
|
||||||
|
PodName: "test-project-pod-0",
|
||||||
|
Status: domain.ProjectStatusRunning,
|
||||||
|
Workspace: "/workspace",
|
||||||
|
})
|
||||||
exec := kubernetes.NewExecutor("test-namespace")
|
exec := kubernetes.NewExecutor("test-namespace")
|
||||||
return NewProjectsHandler(repo, exec)
|
return NewProjectsHandler(repo, exec)
|
||||||
}
|
}
|
||||||
@ -61,7 +73,7 @@ func TestProjectsHandler_Get(t *testing.T) {
|
|||||||
projectID string
|
projectID string
|
||||||
wantStatus int
|
wantStatus int
|
||||||
}{
|
}{
|
||||||
{"existing project", "pantheon", http.StatusOK},
|
{"existing project", "test-project", http.StatusOK},
|
||||||
{"non-existent project", "nonexistent", http.StatusNotFound},
|
{"non-existent project", "nonexistent", http.StatusNotFound},
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -95,7 +107,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "valid request",
|
name: "valid request",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ClaudeRequest{
|
body: ClaudeRequest{
|
||||||
Prompt: "Hello, world!",
|
Prompt: "Hello, world!",
|
||||||
},
|
},
|
||||||
@ -103,7 +115,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "missing prompt",
|
name: "missing prompt",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ClaudeRequest{
|
body: ClaudeRequest{
|
||||||
Prompt: "",
|
Prompt: "",
|
||||||
},
|
},
|
||||||
@ -118,7 +130,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "null byte in prompt",
|
name: "null byte in prompt",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ClaudeRequest{
|
body: ClaudeRequest{
|
||||||
Prompt: "Hello\x00World",
|
Prompt: "Hello\x00World",
|
||||||
},
|
},
|
||||||
@ -127,7 +139,7 @@ func TestProjectsHandler_RunClaude(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "invalid stream ID",
|
name: "invalid stream ID",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ClaudeRequest{
|
body: ClaudeRequest{
|
||||||
Prompt: "Hello",
|
Prompt: "Hello",
|
||||||
StreamID: "invalid stream id with spaces",
|
StreamID: "invalid stream id with spaces",
|
||||||
@ -175,7 +187,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "valid command",
|
name: "valid command",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ShellRequest{
|
body: ShellRequest{
|
||||||
Command: "ls -la",
|
Command: "ls -la",
|
||||||
},
|
},
|
||||||
@ -183,7 +195,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "missing command",
|
name: "missing command",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ShellRequest{
|
body: ShellRequest{
|
||||||
Command: "",
|
Command: "",
|
||||||
},
|
},
|
||||||
@ -192,7 +204,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "dangerous command with semicolon",
|
name: "dangerous command with semicolon",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ShellRequest{
|
body: ShellRequest{
|
||||||
Command: "ls; rm -rf /",
|
Command: "ls; rm -rf /",
|
||||||
},
|
},
|
||||||
@ -201,7 +213,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "dangerous command with pipe",
|
name: "dangerous command with pipe",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ShellRequest{
|
body: ShellRequest{
|
||||||
Command: "cat /etc/passwd | grep root",
|
Command: "cat /etc/passwd | grep root",
|
||||||
},
|
},
|
||||||
@ -210,7 +222,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "command substitution",
|
name: "command substitution",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ShellRequest{
|
body: ShellRequest{
|
||||||
Command: "echo $(whoami)",
|
Command: "echo $(whoami)",
|
||||||
},
|
},
|
||||||
@ -219,7 +231,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "redirect",
|
name: "redirect",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ShellRequest{
|
body: ShellRequest{
|
||||||
Command: "ls > /tmp/out.txt",
|
Command: "ls > /tmp/out.txt",
|
||||||
},
|
},
|
||||||
@ -228,7 +240,7 @@ func TestProjectsHandler_RunShell(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "rm rf root",
|
name: "rm rf root",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: ShellRequest{
|
body: ShellRequest{
|
||||||
Command: "rm -rf /",
|
Command: "rm -rf /",
|
||||||
},
|
},
|
||||||
@ -281,7 +293,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "valid git status",
|
name: "valid git status",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: GitRequest{
|
body: GitRequest{
|
||||||
Args: []string{"status"},
|
Args: []string{"status"},
|
||||||
},
|
},
|
||||||
@ -289,7 +301,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "valid git log",
|
name: "valid git log",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: GitRequest{
|
body: GitRequest{
|
||||||
Args: []string{"log", "--oneline", "-10"},
|
Args: []string{"log", "--oneline", "-10"},
|
||||||
},
|
},
|
||||||
@ -297,7 +309,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "missing args",
|
name: "missing args",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: GitRequest{
|
body: GitRequest{
|
||||||
Args: []string{},
|
Args: []string{},
|
||||||
},
|
},
|
||||||
@ -306,7 +318,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "git config blocked",
|
name: "git config blocked",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: GitRequest{
|
body: GitRequest{
|
||||||
Args: []string{"config", "--global", "user.name", "attacker"},
|
Args: []string{"config", "--global", "user.name", "attacker"},
|
||||||
},
|
},
|
||||||
@ -315,7 +327,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "git remote blocked",
|
name: "git remote blocked",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: GitRequest{
|
body: GitRequest{
|
||||||
Args: []string{"remote", "add", "evil", "https://evil.com/repo"},
|
Args: []string{"remote", "add", "evil", "https://evil.com/repo"},
|
||||||
},
|
},
|
||||||
@ -324,7 +336,7 @@ func TestProjectsHandler_RunGit(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "force push blocked",
|
name: "force push blocked",
|
||||||
projectID: "pantheon",
|
projectID: "test-project",
|
||||||
body: GitRequest{
|
body: GitRequest{
|
||||||
Args: []string{"push", "-f", "origin", "main"},
|
Args: []string{"push", "-f", "origin", "main"},
|
||||||
},
|
},
|
||||||
@ -394,9 +406,9 @@ func TestProjectsHandler_InvalidJSON(t *testing.T) {
|
|||||||
method string
|
method string
|
||||||
path string
|
path string
|
||||||
}{
|
}{
|
||||||
{"POST", "/projects/pantheon/claude"},
|
{"POST", "/projects/test-project/claude"},
|
||||||
{"POST", "/projects/pantheon/shell"},
|
{"POST", "/projects/test-project/shell"},
|
||||||
{"POST", "/projects/pantheon/git"},
|
{"POST", "/projects/test-project/git"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, ep := range endpoints {
|
for _, ep := range endpoints {
|
||||||
@ -429,12 +441,12 @@ func TestCommandIDGeneration(t *testing.T) {
|
|||||||
body := ClaudeRequest{Prompt: "test"}
|
body := ClaudeRequest{Prompt: "test"}
|
||||||
bodyBytes, _ := json.Marshal(body)
|
bodyBytes, _ := json.Marshal(body)
|
||||||
|
|
||||||
req1 := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes))
|
req1 := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes))
|
||||||
req1.Header.Set("Content-Type", "application/json")
|
req1.Header.Set("Content-Type", "application/json")
|
||||||
rec1 := httptest.NewRecorder()
|
rec1 := httptest.NewRecorder()
|
||||||
router.ServeHTTP(rec1, req1)
|
router.ServeHTTP(rec1, req1)
|
||||||
|
|
||||||
req2 := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes))
|
req2 := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes))
|
||||||
req2.Header.Set("Content-Type", "application/json")
|
req2.Header.Set("Content-Type", "application/json")
|
||||||
rec2 := httptest.NewRecorder()
|
rec2 := httptest.NewRecorder()
|
||||||
router.ServeHTTP(rec2, req2)
|
router.ServeHTTP(rec2, req2)
|
||||||
@ -465,7 +477,7 @@ func TestCustomStreamID(t *testing.T) {
|
|||||||
}
|
}
|
||||||
bodyBytes, _ := json.Marshal(body)
|
bodyBytes, _ := json.Marshal(body)
|
||||||
|
|
||||||
req := httptest.NewRequest("POST", "/projects/pantheon/claude", bytes.NewReader(bodyBytes))
|
req := httptest.NewRequest("POST", "/projects/test-project/claude", bytes.NewReader(bodyBytes))
|
||||||
req.Header.Set("Content-Type", "application/json")
|
req.Header.Set("Content-Type", "application/json")
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
router.ServeHTTP(rec, req)
|
router.ServeHTTP(rec, req)
|
||||||
|
|||||||
@ -142,6 +142,22 @@ var (
|
|||||||
Name: "rdev_ci_push_failures_total",
|
Name: "rdev_ci_push_failures_total",
|
||||||
Help: "Total number of CI image push failures by project",
|
Help: "Total number of CI image push failures by project",
|
||||||
}, []string{"project"})
|
}, []string{"project"})
|
||||||
|
|
||||||
|
// External system health
|
||||||
|
externalSystemHealthy = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Name: "rdev_external_system_healthy",
|
||||||
|
Help: "Whether external system is healthy (1) or not (0)",
|
||||||
|
}, []string{"system"})
|
||||||
|
|
||||||
|
externalSystemLatency = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Name: "rdev_external_system_latency_seconds",
|
||||||
|
Help: "Latency of external system health check in seconds",
|
||||||
|
}, []string{"system"})
|
||||||
|
|
||||||
|
externalSystemLastCheck = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Name: "rdev_external_system_last_check_timestamp",
|
||||||
|
Help: "Unix timestamp of last health check",
|
||||||
|
}, []string{"system"})
|
||||||
)
|
)
|
||||||
|
|
||||||
// RecordCommand records a command execution.
|
// RecordCommand records a command execution.
|
||||||
@ -248,6 +264,17 @@ func RecordCIPushFailure(project string) {
|
|||||||
ciPushFailures.WithLabelValues(project).Inc()
|
ciPushFailures.WithLabelValues(project).Inc()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SetExternalSystemHealth updates the health metrics for an external system.
|
||||||
|
func SetExternalSystemHealth(system string, healthy bool, latencySeconds float64) {
|
||||||
|
val := 0.0
|
||||||
|
if healthy {
|
||||||
|
val = 1.0
|
||||||
|
}
|
||||||
|
externalSystemHealthy.WithLabelValues(system).Set(val)
|
||||||
|
externalSystemLatency.WithLabelValues(system).Set(latencySeconds)
|
||||||
|
externalSystemLastCheck.WithLabelValues(system).Set(float64(time.Now().Unix()))
|
||||||
|
}
|
||||||
|
|
||||||
// Handler returns the Prometheus HTTP handler.
|
// Handler returns the Prometheus HTTP handler.
|
||||||
func Handler() http.Handler {
|
func Handler() http.Handler {
|
||||||
return promhttp.Handler()
|
return promhttp.Handler()
|
||||||
|
|||||||
@ -23,3 +23,9 @@ type RegistryChecker interface {
|
|||||||
// Check returns the health status of the registry.
|
// Check returns the health status of the registry.
|
||||||
Check(ctx context.Context) domain.RegistryStatus
|
Check(ctx context.Context) domain.RegistryStatus
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ExternalHealthChecker checks an external system's health.
|
||||||
|
type ExternalHealthChecker interface {
|
||||||
|
// Check returns the health status of the external system.
|
||||||
|
Check(ctx context.Context) domain.ExternalSystemStatus
|
||||||
|
}
|
||||||
|
|||||||
295
internal/service/diagnostics_service.go
Normal file
295
internal/service/diagnostics_service.go
Normal file
@ -0,0 +1,295 @@
|
|||||||
|
// Package service provides business logic services.
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/orchard9/rdev/internal/domain"
|
||||||
|
"github.com/orchard9/rdev/internal/port"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DiagnosticsServiceConfig holds the tunable settings of DiagnosticsService.
type DiagnosticsServiceConfig struct {
	// DefaultGitOwner is the git owner (organization) passed to the CI
	// provider when pipelines are looked up.
	DefaultGitOwner string

	// MaxRecentOperations is the number of operations requested from the
	// operation repository. Values <= 0 select the default of 10.
	MaxRecentOperations int

	// MaxRecentPipelines is the number of pipelines included in the report.
	// Values <= 0 select the default of 5.
	MaxRecentPipelines int

	// Logger is the base logger for the service. When nil, slog.Default()
	// is used.
	Logger *slog.Logger
}
|
||||||
|
|
||||||
|
// DiagnosticsService aggregates project health information from multiple sources.
|
||||||
|
type DiagnosticsService struct {
|
||||||
|
operationRepo port.OperationRepository
|
||||||
|
registryChecker port.RegistryChecker
|
||||||
|
ciProvider port.CIProvider
|
||||||
|
|
||||||
|
defaultGitOwner string
|
||||||
|
maxRecentOperations int
|
||||||
|
maxRecentPipelines int
|
||||||
|
logger *slog.Logger
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDiagnosticsService creates a new diagnostics service.
|
||||||
|
func NewDiagnosticsService(
|
||||||
|
operationRepo port.OperationRepository,
|
||||||
|
registryChecker port.RegistryChecker,
|
||||||
|
ciProvider port.CIProvider,
|
||||||
|
cfg DiagnosticsServiceConfig,
|
||||||
|
) *DiagnosticsService {
|
||||||
|
logger := cfg.Logger
|
||||||
|
if logger == nil {
|
||||||
|
logger = slog.Default()
|
||||||
|
}
|
||||||
|
|
||||||
|
maxOps := cfg.MaxRecentOperations
|
||||||
|
if maxOps <= 0 {
|
||||||
|
maxOps = 10
|
||||||
|
}
|
||||||
|
|
||||||
|
maxPipelines := cfg.MaxRecentPipelines
|
||||||
|
if maxPipelines <= 0 {
|
||||||
|
maxPipelines = 5
|
||||||
|
}
|
||||||
|
|
||||||
|
return &DiagnosticsService{
|
||||||
|
operationRepo: operationRepo,
|
||||||
|
registryChecker: registryChecker,
|
||||||
|
ciProvider: ciProvider,
|
||||||
|
defaultGitOwner: cfg.DefaultGitOwner,
|
||||||
|
maxRecentOperations: maxOps,
|
||||||
|
maxRecentPipelines: maxPipelines,
|
||||||
|
logger: logger.With("service", "diagnostics"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDiagnostics returns comprehensive health information for a project.
|
||||||
|
func (s *DiagnosticsService) GetDiagnostics(ctx context.Context, projectID string) (*domain.ProjectDiagnostics, error) {
|
||||||
|
diag := &domain.ProjectDiagnostics{
|
||||||
|
ProjectID: projectID,
|
||||||
|
GeneratedAt: time.Now().UTC(),
|
||||||
|
Summary: domain.DiagnosticsSummaryHealthy,
|
||||||
|
Issues: []domain.DiagnosticIssue{},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect data from each source (don't fail if one source fails)
|
||||||
|
s.collectOperations(ctx, projectID, diag)
|
||||||
|
s.collectRegistryHealth(ctx, diag)
|
||||||
|
s.collectCIStatus(ctx, projectID, diag)
|
||||||
|
|
||||||
|
// Determine overall summary
|
||||||
|
s.calculateSummary(diag)
|
||||||
|
|
||||||
|
return diag, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectOperations fetches recent operations and extracts issues.
|
||||||
|
func (s *DiagnosticsService) collectOperations(ctx context.Context, projectID string, diag *domain.ProjectDiagnostics) {
|
||||||
|
filter := domain.OperationFilters{
|
||||||
|
ProjectID: projectID,
|
||||||
|
Limit: s.maxRecentOperations,
|
||||||
|
}
|
||||||
|
|
||||||
|
ops, err := s.operationRepo.List(ctx, filter)
|
||||||
|
if err != nil {
|
||||||
|
s.logger.Warn("failed to fetch operations for diagnostics",
|
||||||
|
"error", err,
|
||||||
|
"project_id", projectID,
|
||||||
|
)
|
||||||
|
diag.Issues = append(diag.Issues, domain.DiagnosticIssue{
|
||||||
|
Severity: domain.DiagnosticSeverityWarning,
|
||||||
|
Source: domain.DiagnosticSourceOperation,
|
||||||
|
Message: "Unable to fetch operation history",
|
||||||
|
Details: err.Error(),
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to summaries
|
||||||
|
for _, op := range ops {
|
||||||
|
summary := domain.OperationSummary{
|
||||||
|
ID: op.ID,
|
||||||
|
Type: op.Type,
|
||||||
|
Status: op.Status,
|
||||||
|
StartedAt: op.StartedAt,
|
||||||
|
DurationMs: op.DurationMs,
|
||||||
|
Error: op.Error,
|
||||||
|
ExternalRef: op.ExternalRef,
|
||||||
|
}
|
||||||
|
diag.RecentOperations = append(diag.RecentOperations, summary)
|
||||||
|
|
||||||
|
// Extract issues from failed operations
|
||||||
|
if op.Status == domain.OperationStatusFailed {
|
||||||
|
issue := domain.DiagnosticIssue{
|
||||||
|
Severity: domain.DiagnosticSeverityError,
|
||||||
|
Source: domain.DiagnosticSourceOperation,
|
||||||
|
Message: fmt.Sprintf("%s operation failed", op.Type),
|
||||||
|
Timestamp: op.StartedAt,
|
||||||
|
}
|
||||||
|
if op.Error != "" {
|
||||||
|
issue.Details = op.Error
|
||||||
|
}
|
||||||
|
if op.ExternalRef != "" {
|
||||||
|
issue.Message += fmt.Sprintf(" (%s)", op.ExternalRef)
|
||||||
|
}
|
||||||
|
diag.Issues = append(diag.Issues, issue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectRegistryHealth checks registry status.
|
||||||
|
func (s *DiagnosticsService) collectRegistryHealth(ctx context.Context, diag *domain.ProjectDiagnostics) {
|
||||||
|
if s.registryChecker == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
status := s.registryChecker.Check(ctx)
|
||||||
|
diag.Registry = &status
|
||||||
|
|
||||||
|
if !status.Healthy {
|
||||||
|
diag.Issues = append(diag.Issues, domain.DiagnosticIssue{
|
||||||
|
Severity: domain.DiagnosticSeverityError,
|
||||||
|
Source: domain.DiagnosticSourceRegistry,
|
||||||
|
Message: "Container registry unhealthy",
|
||||||
|
Details: status.Error,
|
||||||
|
Timestamp: status.LastChecked,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectCIStatus fetches CI pipeline information.
|
||||||
|
func (s *DiagnosticsService) collectCIStatus(ctx context.Context, projectID string, diag *domain.ProjectDiagnostics) {
|
||||||
|
if s.ciProvider == nil {
|
||||||
|
diag.CI = &domain.CIDiagnostics{Available: false}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
owner := s.defaultGitOwner
|
||||||
|
if owner == "" {
|
||||||
|
owner = "jordan" // fallback
|
||||||
|
}
|
||||||
|
|
||||||
|
ciDiag := &domain.CIDiagnostics{Available: true}
|
||||||
|
|
||||||
|
pipelines, err := s.ciProvider.ListPipelines(ctx, owner, projectID)
|
||||||
|
if err != nil {
|
||||||
|
s.logger.Warn("failed to fetch pipelines for diagnostics",
|
||||||
|
"error", err,
|
||||||
|
"project_id", projectID,
|
||||||
|
)
|
||||||
|
ciDiag.Available = false
|
||||||
|
diag.CI = ciDiag
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to summaries and find failures
|
||||||
|
var lastFailure *domain.CIPipeline
|
||||||
|
for i, p := range pipelines {
|
||||||
|
if i >= s.maxRecentPipelines {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
summary := domain.CIPipelineSummary{
|
||||||
|
Number: p.Number,
|
||||||
|
Status: p.Status,
|
||||||
|
Branch: p.Branch,
|
||||||
|
Commit: p.Commit,
|
||||||
|
StartedAt: p.Started,
|
||||||
|
}
|
||||||
|
if p.Finished.After(p.Started) {
|
||||||
|
summary.Duration = p.Finished.Sub(p.Started).Round(time.Second).String()
|
||||||
|
}
|
||||||
|
ciDiag.RecentPipelines = append(ciDiag.RecentPipelines, summary)
|
||||||
|
|
||||||
|
// Track the most recent failure
|
||||||
|
if p.Status == "failure" && lastFailure == nil {
|
||||||
|
lastFailure = p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get details on the last failure
|
||||||
|
if lastFailure != nil {
|
||||||
|
failure := s.getFailureDetails(ctx, owner, projectID, lastFailure)
|
||||||
|
ciDiag.LastFailure = failure
|
||||||
|
|
||||||
|
// Add as issue
|
||||||
|
issue := domain.DiagnosticIssue{
|
||||||
|
Severity: domain.DiagnosticSeverityError,
|
||||||
|
Source: domain.DiagnosticSourceCI,
|
||||||
|
Message: fmt.Sprintf("CI build #%d failed", lastFailure.Number),
|
||||||
|
Timestamp: lastFailure.Finished,
|
||||||
|
}
|
||||||
|
if failure != nil && failure.FailedStep != "" {
|
||||||
|
issue.Message += fmt.Sprintf(" at step '%s'", failure.FailedStep)
|
||||||
|
if failure.Error != "" {
|
||||||
|
issue.Details = failure.Error
|
||||||
|
}
|
||||||
|
}
|
||||||
|
diag.Issues = append(diag.Issues, issue)
|
||||||
|
}
|
||||||
|
|
||||||
|
diag.CI = ciDiag
|
||||||
|
}
|
||||||
|
|
||||||
|
// getFailureDetails fetches step-level details for a failed pipeline.
|
||||||
|
func (s *DiagnosticsService) getFailureDetails(ctx context.Context, owner, repo string, pipeline *domain.CIPipeline) *domain.CIPipelineFailure {
|
||||||
|
failure := &domain.CIPipelineFailure{
|
||||||
|
Number: pipeline.Number,
|
||||||
|
Timestamp: pipeline.Finished,
|
||||||
|
}
|
||||||
|
|
||||||
|
steps, err := s.ciProvider.GetPipelineSteps(ctx, owner, repo, pipeline.Number)
|
||||||
|
if err != nil {
|
||||||
|
s.logger.Warn("failed to fetch pipeline steps",
|
||||||
|
"error", err,
|
||||||
|
"pipeline", pipeline.Number,
|
||||||
|
)
|
||||||
|
return failure
|
||||||
|
}
|
||||||
|
|
||||||
|
failure.URL = steps.URL
|
||||||
|
|
||||||
|
// Find the failed step
|
||||||
|
for _, step := range steps.Steps {
|
||||||
|
if step.Status == "failure" || step.Status == "error" {
|
||||||
|
failure.FailedStep = step.Name
|
||||||
|
failure.Error = step.Error
|
||||||
|
if step.Log != "" {
|
||||||
|
failure.LogTail = step.Log
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return failure
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculateSummary determines the overall health status.
|
||||||
|
func (s *DiagnosticsService) calculateSummary(diag *domain.ProjectDiagnostics) {
|
||||||
|
errorCount := 0
|
||||||
|
warningCount := 0
|
||||||
|
|
||||||
|
for _, issue := range diag.Issues {
|
||||||
|
switch issue.Severity {
|
||||||
|
case domain.DiagnosticSeverityError:
|
||||||
|
errorCount++
|
||||||
|
case domain.DiagnosticSeverityWarning:
|
||||||
|
warningCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if errorCount > 0 {
|
||||||
|
diag.Summary = domain.DiagnosticsSummaryUnhealthy
|
||||||
|
} else if warningCount > 0 {
|
||||||
|
diag.Summary = domain.DiagnosticsSummaryDegraded
|
||||||
|
} else {
|
||||||
|
diag.Summary = domain.DiagnosticsSummaryHealthy
|
||||||
|
}
|
||||||
|
}
|
||||||
302
internal/service/diagnostics_service_test.go
Normal file
302
internal/service/diagnostics_service_test.go
Normal file
@ -0,0 +1,302 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/orchard9/rdev/internal/domain"
|
||||||
|
)
|
||||||
|
|
||||||
|
// mockOperationRepo implements port.OperationRepository for testing.
|
||||||
|
type mockOperationRepo struct {
|
||||||
|
operations []*domain.Operation
|
||||||
|
err error
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) Create(_ context.Context, _ *domain.Operation) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) Update(_ context.Context, _ *domain.Operation) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) Get(_ context.Context, _ string) (*domain.Operation, error) {
|
||||||
|
return nil, domain.ErrOperationNotFound
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) GetByCommitSHA(_ context.Context, _, _ string) (*domain.Operation, error) {
|
||||||
|
return nil, domain.ErrOperationNotFound
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) List(_ context.Context, filter domain.OperationFilters) ([]*domain.Operation, error) {
|
||||||
|
if m.err != nil {
|
||||||
|
return nil, m.err
|
||||||
|
}
|
||||||
|
var result []*domain.Operation
|
||||||
|
for _, op := range m.operations {
|
||||||
|
if filter.ProjectID != "" && op.ProjectID != filter.ProjectID {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result = append(result, op)
|
||||||
|
}
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) AddStep(_ context.Context, _ string, _ domain.OperationStep) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) UpdateStep(_ context.Context, _ string, _ domain.OperationStep) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) Complete(_ context.Context, _ string, _ domain.OperationStatus, _ map[string]any, _, _ string) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) SetCommitSHA(_ context.Context, _, _ string) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) SetTriggeredBy(_ context.Context, _, _ string) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockOperationRepo) DeleteOlderThan(_ context.Context, _ time.Time) (int64, error) {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// mockRegistryChecker implements port.RegistryChecker for testing.
|
||||||
|
type mockRegistryChecker struct {
|
||||||
|
status domain.RegistryStatus
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockRegistryChecker) Check(_ context.Context) domain.RegistryStatus {
|
||||||
|
return m.status
|
||||||
|
}
|
||||||
|
|
||||||
|
// mockCIProvider implements port.CIProvider for testing.
|
||||||
|
type mockCIProvider struct {
|
||||||
|
pipelines []*domain.CIPipeline
|
||||||
|
steps *domain.CIPipelineSteps
|
||||||
|
err error
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) ActivateRepo(_ context.Context, _, _, _ string) (*domain.CIRepo, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) DeactivateRepo(_ context.Context, _, _ string) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) GetRepo(_ context.Context, _, _ string) (*domain.CIRepo, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) ListRepos(_ context.Context) ([]*domain.CIRepo, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) AddSecret(_ context.Context, _, _ string, _ domain.CISecret) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) DeleteSecret(_ context.Context, _, _, _ string) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) ListPipelines(_ context.Context, _, _ string) ([]*domain.CIPipeline, error) {
|
||||||
|
if m.err != nil {
|
||||||
|
return nil, m.err
|
||||||
|
}
|
||||||
|
return m.pipelines, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) GetPipeline(_ context.Context, _, _ string, _ int64) (*domain.CIPipeline, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) GetPipelineSteps(_ context.Context, _, _ string, _ int64) (*domain.CIPipelineSteps, error) {
|
||||||
|
if m.steps != nil {
|
||||||
|
return m.steps, nil
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockCIProvider) TriggerBuild(_ context.Context, _, _, _ string) (int64, error) {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiagnosticsService_GetDiagnostics_Healthy(t *testing.T) {
|
||||||
|
opRepo := &mockOperationRepo{
|
||||||
|
operations: []*domain.Operation{
|
||||||
|
{
|
||||||
|
ID: "op-1",
|
||||||
|
ProjectID: "test-project",
|
||||||
|
Type: domain.OperationTypeBuild,
|
||||||
|
Status: domain.OperationStatusCompleted,
|
||||||
|
StartedAt: time.Now().Add(-1 * time.Hour),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
registry := &mockRegistryChecker{
|
||||||
|
status: domain.RegistryStatus{
|
||||||
|
Healthy: true,
|
||||||
|
URL: "https://registry.example.com",
|
||||||
|
Latency: "10ms",
|
||||||
|
LastChecked: time.Now(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
ci := &mockCIProvider{
|
||||||
|
pipelines: []*domain.CIPipeline{
|
||||||
|
{
|
||||||
|
Number: 42,
|
||||||
|
Status: "success",
|
||||||
|
Branch: "main",
|
||||||
|
Commit: "abc123",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
svc := NewDiagnosticsService(opRepo, registry, ci, DiagnosticsServiceConfig{
|
||||||
|
DefaultGitOwner: "test-org",
|
||||||
|
})
|
||||||
|
|
||||||
|
diag, err := svc.GetDiagnostics(context.Background(), "test-project")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetDiagnostics() error = %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if diag.ProjectID != "test-project" {
|
||||||
|
t.Errorf("ProjectID = %q, want %q", diag.ProjectID, "test-project")
|
||||||
|
}
|
||||||
|
if diag.Summary != domain.DiagnosticsSummaryHealthy {
|
||||||
|
t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryHealthy)
|
||||||
|
}
|
||||||
|
if len(diag.Issues) != 0 {
|
||||||
|
t.Errorf("Issues count = %d, want 0", len(diag.Issues))
|
||||||
|
}
|
||||||
|
if len(diag.RecentOperations) != 1 {
|
||||||
|
t.Errorf("RecentOperations count = %d, want 1", len(diag.RecentOperations))
|
||||||
|
}
|
||||||
|
if diag.Registry == nil {
|
||||||
|
t.Error("Registry is nil, want non-nil")
|
||||||
|
}
|
||||||
|
if diag.CI == nil || !diag.CI.Available {
|
||||||
|
t.Error("CI not available")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiagnosticsService_GetDiagnostics_Unhealthy(t *testing.T) {
|
||||||
|
opRepo := &mockOperationRepo{
|
||||||
|
operations: []*domain.Operation{
|
||||||
|
{
|
||||||
|
ID: "op-1",
|
||||||
|
ProjectID: "test-project",
|
||||||
|
Type: domain.OperationTypeBuild,
|
||||||
|
Status: domain.OperationStatusFailed,
|
||||||
|
StartedAt: time.Now().Add(-1 * time.Hour),
|
||||||
|
Error: "deployment failed",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
registry := &mockRegistryChecker{
|
||||||
|
status: domain.RegistryStatus{
|
||||||
|
Healthy: false,
|
||||||
|
URL: "https://registry.example.com",
|
||||||
|
Error: "connection refused",
|
||||||
|
LastChecked: time.Now(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
ci := &mockCIProvider{
|
||||||
|
pipelines: []*domain.CIPipeline{
|
||||||
|
{
|
||||||
|
Number: 43,
|
||||||
|
Status: "failure",
|
||||||
|
Branch: "main",
|
||||||
|
Commit: "def456",
|
||||||
|
Finished: time.Now().Add(-30 * time.Minute),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
steps: &domain.CIPipelineSteps{
|
||||||
|
PipelineNumber: 43,
|
||||||
|
URL: "https://ci.example.com/build/43",
|
||||||
|
Steps: []domain.CIPipelineStep{
|
||||||
|
{
|
||||||
|
Name: "test",
|
||||||
|
Status: "failure",
|
||||||
|
Error: "tests failed",
|
||||||
|
Log: "FAIL: TestSomething",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
svc := NewDiagnosticsService(opRepo, registry, ci, DiagnosticsServiceConfig{
|
||||||
|
DefaultGitOwner: "test-org",
|
||||||
|
})
|
||||||
|
|
||||||
|
diag, err := svc.GetDiagnostics(context.Background(), "test-project")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetDiagnostics() error = %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if diag.Summary != domain.DiagnosticsSummaryUnhealthy {
|
||||||
|
t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryUnhealthy)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should have 3 issues: failed operation, unhealthy registry, failed CI
|
||||||
|
if len(diag.Issues) < 3 {
|
||||||
|
t.Errorf("Issues count = %d, want at least 3", len(diag.Issues))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check CI failure details
|
||||||
|
if diag.CI == nil || diag.CI.LastFailure == nil {
|
||||||
|
t.Fatal("CI.LastFailure is nil")
|
||||||
|
}
|
||||||
|
if diag.CI.LastFailure.FailedStep != "test" {
|
||||||
|
t.Errorf("FailedStep = %q, want %q", diag.CI.LastFailure.FailedStep, "test")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDiagnosticsService_GetDiagnostics_Degraded(t *testing.T) {
|
||||||
|
opRepo := &mockOperationRepo{
|
||||||
|
operations: []*domain.Operation{
|
||||||
|
{
|
||||||
|
ID: "op-1",
|
||||||
|
ProjectID: "test-project",
|
||||||
|
Type: domain.OperationTypeBuild,
|
||||||
|
Status: domain.OperationStatusCompleted,
|
||||||
|
StartedAt: time.Now().Add(-1 * time.Hour),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
registry := &mockRegistryChecker{
|
||||||
|
status: domain.RegistryStatus{
|
||||||
|
Healthy: true,
|
||||||
|
URL: "https://registry.example.com",
|
||||||
|
LastChecked: time.Now(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// No CI provider - should produce a warning but not error
|
||||||
|
svc := NewDiagnosticsService(opRepo, registry, nil, DiagnosticsServiceConfig{
|
||||||
|
DefaultGitOwner: "test-org",
|
||||||
|
})
|
||||||
|
|
||||||
|
diag, err := svc.GetDiagnostics(context.Background(), "test-project")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetDiagnostics() error = %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Without CI, status should still be healthy (CI unavailable is not an issue)
|
||||||
|
if diag.Summary != domain.DiagnosticsSummaryHealthy {
|
||||||
|
t.Errorf("Summary = %q, want %q", diag.Summary, domain.DiagnosticsSummaryHealthy)
|
||||||
|
}
|
||||||
|
if diag.CI == nil || diag.CI.Available {
|
||||||
|
t.Error("CI should be not available when no provider")
|
||||||
|
}
|
||||||
|
}
|
||||||
248
internal/worker/external_health.go
Normal file
248
internal/worker/external_health.go
Normal file
@ -0,0 +1,248 @@
|
|||||||
|
package worker
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"log/slog"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/orchard9/rdev/internal/domain"
|
||||||
|
"github.com/orchard9/rdev/internal/metrics"
|
||||||
|
"github.com/orchard9/rdev/internal/port"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ExternalHealthChecker runs periodic health checks on external systems
|
||||||
|
// (registry, CI, git) and caches the results for the /ready endpoint.
|
||||||
|
type ExternalHealthChecker struct {
|
||||||
|
registry port.RegistryChecker // zot
|
||||||
|
ci port.ExternalHealthChecker // woodpecker
|
||||||
|
git port.ExternalHealthChecker // gitea
|
||||||
|
|
||||||
|
interval time.Duration
|
||||||
|
logger *slog.Logger
|
||||||
|
|
||||||
|
// Internal state (thread-safe)
|
||||||
|
mu sync.RWMutex
|
||||||
|
statuses map[domain.ExternalSystem]domain.ExternalSystemStatus
|
||||||
|
|
||||||
|
// Lifecycle
|
||||||
|
ctx context.Context
|
||||||
|
cancel context.CancelFunc
|
||||||
|
wg sync.WaitGroup
|
||||||
|
}
|
||||||
|
|
||||||
|
// ExternalHealthConfig holds the tunables for an ExternalHealthChecker.
type ExternalHealthConfig struct {
	// CheckInterval is the period between check rounds. Default: 30s.
	CheckInterval time.Duration
	// Logger receives the checker's log output.
	Logger *slog.Logger
}
|
||||||
|
|
||||||
|
// DefaultExternalHealthConfig returns sensible defaults.
|
||||||
|
func DefaultExternalHealthConfig() ExternalHealthConfig {
|
||||||
|
return ExternalHealthConfig{
|
||||||
|
CheckInterval: 30 * time.Second,
|
||||||
|
Logger: slog.Default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewExternalHealthChecker creates a new external health checker.
|
||||||
|
// All checker parameters are optional (nil means skip that system).
|
||||||
|
func NewExternalHealthChecker(
|
||||||
|
registry port.RegistryChecker,
|
||||||
|
ci port.ExternalHealthChecker,
|
||||||
|
git port.ExternalHealthChecker,
|
||||||
|
cfg ExternalHealthConfig,
|
||||||
|
) *ExternalHealthChecker {
|
||||||
|
if cfg.CheckInterval == 0 {
|
||||||
|
cfg.CheckInterval = 30 * time.Second
|
||||||
|
}
|
||||||
|
if cfg.Logger == nil {
|
||||||
|
cfg.Logger = slog.Default()
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
|
||||||
|
return &ExternalHealthChecker{
|
||||||
|
registry: registry,
|
||||||
|
ci: ci,
|
||||||
|
git: git,
|
||||||
|
interval: cfg.CheckInterval,
|
||||||
|
logger: cfg.Logger.With("component", "external-health"),
|
||||||
|
statuses: make(map[domain.ExternalSystem]domain.ExternalSystemStatus),
|
||||||
|
ctx: ctx,
|
||||||
|
cancel: cancel,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start begins the background check loop.
|
||||||
|
func (c *ExternalHealthChecker) Start() {
|
||||||
|
c.logger.Info("external health checker started", "interval", c.interval)
|
||||||
|
|
||||||
|
c.wg.Add(1)
|
||||||
|
go c.checkLoop()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop gracefully shuts down the checker.
|
||||||
|
func (c *ExternalHealthChecker) Stop() {
|
||||||
|
c.logger.Info("external health checker stopping")
|
||||||
|
c.cancel()
|
||||||
|
c.wg.Wait()
|
||||||
|
c.logger.Info("external health checker stopped")
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetStatus returns the cached status for a specific system.
|
||||||
|
func (c *ExternalHealthChecker) GetStatus(system domain.ExternalSystem) (domain.ExternalSystemStatus, bool) {
|
||||||
|
c.mu.RLock()
|
||||||
|
defer c.mu.RUnlock()
|
||||||
|
status, ok := c.statuses[system]
|
||||||
|
return status, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAllStatuses returns a copy of all cached statuses.
|
||||||
|
func (c *ExternalHealthChecker) GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus {
|
||||||
|
c.mu.RLock()
|
||||||
|
defer c.mu.RUnlock()
|
||||||
|
|
||||||
|
result := make(map[domain.ExternalSystem]domain.ExternalSystemStatus, len(c.statuses))
|
||||||
|
for k, v := range c.statuses {
|
||||||
|
result[k] = v
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// checkLoop runs periodic health checks.
|
||||||
|
func (c *ExternalHealthChecker) checkLoop() {
|
||||||
|
defer c.wg.Done()
|
||||||
|
|
||||||
|
// Run immediately on start
|
||||||
|
c.runChecks()
|
||||||
|
|
||||||
|
ticker := time.NewTicker(c.interval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-c.ctx.Done():
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
c.runChecks()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// runChecks performs health checks on all configured systems in parallel.
|
||||||
|
func (c *ExternalHealthChecker) runChecks() {
|
||||||
|
ctx, cancel := context.WithTimeout(c.ctx, 10*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
results := make(chan domain.ExternalSystemStatus, 3)
|
||||||
|
|
||||||
|
// Check registry (zot)
|
||||||
|
if c.registry != nil {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
regStatus := c.registry.Check(ctx)
|
||||||
|
// Convert domain.RegistryStatus to domain.ExternalSystemStatus
|
||||||
|
status := domain.ExternalSystemStatus{
|
||||||
|
System: domain.ExternalSystemRegistry,
|
||||||
|
Healthy: regStatus.Healthy,
|
||||||
|
URL: regStatus.URL,
|
||||||
|
Error: regStatus.Error,
|
||||||
|
LastChecked: regStatus.LastChecked,
|
||||||
|
}
|
||||||
|
// Parse latency string (e.g., "45ms") to duration
|
||||||
|
if regStatus.Latency != "" {
|
||||||
|
if d, err := time.ParseDuration(regStatus.Latency); err == nil {
|
||||||
|
status.Latency = d
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if status.Healthy {
|
||||||
|
status.LastHealthy = status.LastChecked
|
||||||
|
}
|
||||||
|
results <- status
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check CI (woodpecker)
|
||||||
|
if c.ci != nil {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
results <- c.ci.Check(ctx)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check git (gitea)
|
||||||
|
if c.git != nil {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
results <- c.git.Check(ctx)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for all checks to complete, then close results channel
|
||||||
|
go func() {
|
||||||
|
wg.Wait()
|
||||||
|
close(results)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Collect results and update state
|
||||||
|
for status := range results {
|
||||||
|
c.updateStatus(status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateStatus updates cached status and logs/metrics on state changes.
|
||||||
|
func (c *ExternalHealthChecker) updateStatus(status domain.ExternalSystemStatus) {
|
||||||
|
c.mu.Lock()
|
||||||
|
defer c.mu.Unlock()
|
||||||
|
|
||||||
|
prev, existed := c.statuses[status.System]
|
||||||
|
|
||||||
|
// Preserve LastHealthy from previous status if current is unhealthy
|
||||||
|
if !status.Healthy && existed && !prev.LastHealthy.IsZero() {
|
||||||
|
status.LastHealthy = prev.LastHealthy
|
||||||
|
}
|
||||||
|
|
||||||
|
c.statuses[status.System] = status
|
||||||
|
|
||||||
|
// Log state transitions
|
||||||
|
if !existed {
|
||||||
|
// First check
|
||||||
|
if status.Healthy {
|
||||||
|
c.logger.Info("external system healthy",
|
||||||
|
"system", status.System,
|
||||||
|
"url", status.URL,
|
||||||
|
"latency", status.Latency,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
c.logger.Warn("external system unhealthy",
|
||||||
|
"system", status.System,
|
||||||
|
"url", status.URL,
|
||||||
|
"error", status.Error,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
} else if prev.Healthy != status.Healthy {
|
||||||
|
// State changed
|
||||||
|
if status.Healthy {
|
||||||
|
c.logger.Info("external system recovered",
|
||||||
|
"system", status.System,
|
||||||
|
"url", status.URL,
|
||||||
|
"latency", status.Latency,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
c.logger.Warn("external system became unhealthy",
|
||||||
|
"system", status.System,
|
||||||
|
"url", status.URL,
|
||||||
|
"error", status.Error,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update Prometheus metrics
|
||||||
|
metrics.SetExternalSystemHealth(string(status.System), status.Healthy, status.Latency.Seconds())
|
||||||
|
}
|
||||||
241
internal/worker/external_health_test.go
Normal file
241
internal/worker/external_health_test.go
Normal file
@ -0,0 +1,241 @@
|
|||||||
|
package worker
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/orchard9/rdev/internal/domain"
|
||||||
|
)
|
||||||
|
|
||||||
|
// mockRegistryChecker is a test double for port.RegistryChecker whose result
// is driven by its fields.
type mockRegistryChecker struct {
	healthy bool          // reported health
	err     string        // error text, reported only when unhealthy
	latency time.Duration // reported latency (stringified in Check)
}
|
||||||
|
|
||||||
|
func (m *mockRegistryChecker) Check(_ context.Context) domain.RegistryStatus {
|
||||||
|
status := domain.RegistryStatus{
|
||||||
|
Healthy: m.healthy,
|
||||||
|
URL: "https://registry.test",
|
||||||
|
Latency: m.latency.String(),
|
||||||
|
LastChecked: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
if !m.healthy {
|
||||||
|
status.Error = m.err
|
||||||
|
}
|
||||||
|
return status
|
||||||
|
}
|
||||||
|
|
||||||
|
// mockExternalHealthChecker is a mock implementation of port.ExternalHealthChecker.
|
||||||
|
type mockExternalHealthChecker struct {
|
||||||
|
system domain.ExternalSystem
|
||||||
|
healthy bool
|
||||||
|
err string
|
||||||
|
latency time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mockExternalHealthChecker) Check(_ context.Context) domain.ExternalSystemStatus {
|
||||||
|
status := domain.ExternalSystemStatus{
|
||||||
|
System: m.system,
|
||||||
|
Healthy: m.healthy,
|
||||||
|
URL: "https://test.system",
|
||||||
|
Latency: m.latency,
|
||||||
|
LastChecked: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
if m.healthy {
|
||||||
|
status.LastHealthy = status.LastChecked
|
||||||
|
} else {
|
||||||
|
status.Error = m.err
|
||||||
|
}
|
||||||
|
return status
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExternalHealthChecker_GetStatus(t *testing.T) {
|
||||||
|
registry := &mockRegistryChecker{healthy: true, latency: 50 * time.Millisecond}
|
||||||
|
ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: true, latency: 100 * time.Millisecond}
|
||||||
|
git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true, latency: 75 * time.Millisecond}
|
||||||
|
|
||||||
|
checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{
|
||||||
|
CheckInterval: 100 * time.Millisecond,
|
||||||
|
})
|
||||||
|
|
||||||
|
// Run checks synchronously
|
||||||
|
checker.runChecks()
|
||||||
|
|
||||||
|
// Verify registry status
|
||||||
|
regStatus, ok := checker.GetStatus(domain.ExternalSystemRegistry)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected registry status to exist")
|
||||||
|
}
|
||||||
|
if !regStatus.Healthy {
|
||||||
|
t.Error("expected registry to be healthy")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify CI status
|
||||||
|
ciStatus, ok := checker.GetStatus(domain.ExternalSystemCI)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected CI status to exist")
|
||||||
|
}
|
||||||
|
if !ciStatus.Healthy {
|
||||||
|
t.Error("expected CI to be healthy")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify Git status
|
||||||
|
gitStatus, ok := checker.GetStatus(domain.ExternalSystemGit)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected Git status to exist")
|
||||||
|
}
|
||||||
|
if !gitStatus.Healthy {
|
||||||
|
t.Error("expected Git to be healthy")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExternalHealthChecker_GetAllStatuses(t *testing.T) {
|
||||||
|
registry := &mockRegistryChecker{healthy: true}
|
||||||
|
ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: false, err: "connection refused"}
|
||||||
|
git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true}
|
||||||
|
|
||||||
|
checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{
|
||||||
|
CheckInterval: 100 * time.Millisecond,
|
||||||
|
})
|
||||||
|
|
||||||
|
checker.runChecks()
|
||||||
|
|
||||||
|
statuses := checker.GetAllStatuses()
|
||||||
|
|
||||||
|
if len(statuses) != 3 {
|
||||||
|
t.Fatalf("expected 3 statuses, got %d", len(statuses))
|
||||||
|
}
|
||||||
|
|
||||||
|
if statuses[domain.ExternalSystemRegistry].Healthy != true {
|
||||||
|
t.Error("expected registry to be healthy")
|
||||||
|
}
|
||||||
|
if statuses[domain.ExternalSystemCI].Healthy != false {
|
||||||
|
t.Error("expected CI to be unhealthy")
|
||||||
|
}
|
||||||
|
if statuses[domain.ExternalSystemCI].Error != "connection refused" {
|
||||||
|
t.Errorf("expected CI error 'connection refused', got %q", statuses[domain.ExternalSystemCI].Error)
|
||||||
|
}
|
||||||
|
if statuses[domain.ExternalSystemGit].Healthy != true {
|
||||||
|
t.Error("expected Git to be healthy")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExternalHealthChecker_NilCheckers(t *testing.T) {
|
||||||
|
// All nil checkers should result in empty statuses
|
||||||
|
checker := NewExternalHealthChecker(nil, nil, nil, ExternalHealthConfig{
|
||||||
|
CheckInterval: 100 * time.Millisecond,
|
||||||
|
})
|
||||||
|
|
||||||
|
checker.runChecks()
|
||||||
|
|
||||||
|
statuses := checker.GetAllStatuses()
|
||||||
|
if len(statuses) != 0 {
|
||||||
|
t.Fatalf("expected 0 statuses with nil checkers, got %d", len(statuses))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExternalHealthChecker_StartStop(t *testing.T) {
|
||||||
|
registry := &mockRegistryChecker{healthy: true}
|
||||||
|
|
||||||
|
checker := NewExternalHealthChecker(registry, nil, nil, ExternalHealthConfig{
|
||||||
|
CheckInterval: 50 * time.Millisecond,
|
||||||
|
})
|
||||||
|
|
||||||
|
checker.Start()
|
||||||
|
|
||||||
|
// Wait for a couple of check cycles
|
||||||
|
time.Sleep(120 * time.Millisecond)
|
||||||
|
|
||||||
|
// Verify status was populated
|
||||||
|
status, ok := checker.GetStatus(domain.ExternalSystemRegistry)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected registry status after start")
|
||||||
|
}
|
||||||
|
if !status.Healthy {
|
||||||
|
t.Error("expected registry to be healthy")
|
||||||
|
}
|
||||||
|
|
||||||
|
checker.Stop()
|
||||||
|
|
||||||
|
// After stop, statuses should still be available (cached)
|
||||||
|
status, ok = checker.GetStatus(domain.ExternalSystemRegistry)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected registry status after stop")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExternalHealthChecker_StateTransition(t *testing.T) {
|
||||||
|
registry := &mockRegistryChecker{healthy: true}
|
||||||
|
|
||||||
|
checker := NewExternalHealthChecker(registry, nil, nil, ExternalHealthConfig{
|
||||||
|
CheckInterval: 100 * time.Millisecond,
|
||||||
|
})
|
||||||
|
|
||||||
|
// Initial check - healthy
|
||||||
|
checker.runChecks()
|
||||||
|
status, _ := checker.GetStatus(domain.ExternalSystemRegistry)
|
||||||
|
if !status.Healthy {
|
||||||
|
t.Error("expected initial status to be healthy")
|
||||||
|
}
|
||||||
|
firstHealthy := status.LastHealthy
|
||||||
|
|
||||||
|
// Change to unhealthy
|
||||||
|
registry.healthy = false
|
||||||
|
registry.err = "connection refused"
|
||||||
|
checker.runChecks()
|
||||||
|
|
||||||
|
status, _ = checker.GetStatus(domain.ExternalSystemRegistry)
|
||||||
|
if status.Healthy {
|
||||||
|
t.Error("expected status to be unhealthy after state change")
|
||||||
|
}
|
||||||
|
// LastHealthy should be preserved from when it was healthy
|
||||||
|
if status.LastHealthy.IsZero() {
|
||||||
|
t.Error("expected LastHealthy to be preserved")
|
||||||
|
}
|
||||||
|
if !status.LastHealthy.Equal(firstHealthy) {
|
||||||
|
t.Error("expected LastHealthy to remain from healthy period")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recover to healthy
|
||||||
|
registry.healthy = true
|
||||||
|
registry.err = ""
|
||||||
|
checker.runChecks()
|
||||||
|
|
||||||
|
status, _ = checker.GetStatus(domain.ExternalSystemRegistry)
|
||||||
|
if !status.Healthy {
|
||||||
|
t.Error("expected status to be healthy after recovery")
|
||||||
|
}
|
||||||
|
// LastHealthy should be updated
|
||||||
|
if status.LastHealthy.Before(firstHealthy) {
|
||||||
|
t.Error("expected LastHealthy to be updated on recovery")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExternalHealthChecker_PartialFailure(t *testing.T) {
|
||||||
|
// Registry healthy, CI unhealthy, Git healthy
|
||||||
|
registry := &mockRegistryChecker{healthy: true}
|
||||||
|
ci := &mockExternalHealthChecker{system: domain.ExternalSystemCI, healthy: false, err: "timeout"}
|
||||||
|
git := &mockExternalHealthChecker{system: domain.ExternalSystemGit, healthy: true}
|
||||||
|
|
||||||
|
checker := NewExternalHealthChecker(registry, ci, git, ExternalHealthConfig{
|
||||||
|
CheckInterval: 100 * time.Millisecond,
|
||||||
|
})
|
||||||
|
|
||||||
|
checker.runChecks()
|
||||||
|
|
||||||
|
statuses := checker.GetAllStatuses()
|
||||||
|
|
||||||
|
// Partial failure should not affect other systems
|
||||||
|
if !statuses[domain.ExternalSystemRegistry].Healthy {
|
||||||
|
t.Error("registry should be healthy despite CI failure")
|
||||||
|
}
|
||||||
|
if statuses[domain.ExternalSystemCI].Healthy {
|
||||||
|
t.Error("CI should be unhealthy")
|
||||||
|
}
|
||||||
|
if !statuses[domain.ExternalSystemGit].Healthy {
|
||||||
|
t.Error("git should be healthy despite CI failure")
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user