rdev/internal/worker/external_health.go
jordan 210064d490 feat: add diagnostics endpoint and external health monitoring
- Add /diagnostics endpoint for system health overview
- Add external health worker for monitoring Gitea, Woodpecker, Registry
- Add health check methods to Gitea and Woodpecker clients
- Remove hardcoded fallback projects (pantheon, aeries)
- Add diagnostics domain types and service layer
- Add comprehensive tests for diagnostics handler and service
- Fix tests to use registered test project instead of hardcoded one

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 19:10:56 -07:00

249 lines
6.0 KiB
Go

package worker
import (
"context"
"log/slog"
"sync"
"time"
"github.com/orchard9/rdev/internal/domain"
"github.com/orchard9/rdev/internal/metrics"
"github.com/orchard9/rdev/internal/port"
)
// ExternalHealthChecker runs periodic health checks on external systems
// (registry, CI, git) and caches the results for the /ready endpoint.
//
// Construct it with NewExternalHealthChecker, call Start to launch the
// background loop, and Stop to shut it down. It contains a mutex and a
// WaitGroup, so it must not be copied after first use.
type ExternalHealthChecker struct {
// Checkers for each external system. A nil checker means that system is
// skipped by runChecks.
registry port.RegistryChecker // zot
ci port.ExternalHealthChecker // woodpecker
git port.ExternalHealthChecker // gitea
// interval is the period of the ticker that drives checkLoop.
interval time.Duration
// logger is tagged with component=external-health by the constructor.
logger *slog.Logger
// Internal state (thread-safe): mu guards statuses, which holds the most
// recently stored status for each system that has been checked.
mu sync.RWMutex
statuses map[domain.ExternalSystem]domain.ExternalSystemStatus
// Lifecycle: ctx is cancelled by Stop (via cancel) to end the loop and is
// the parent of every check's context; wg tracks the checkLoop goroutine.
ctx context.Context
cancel context.CancelFunc
wg sync.WaitGroup
}
// ExternalHealthConfig configures the health checker. Zero-valued fields are
// replaced with defaults by NewExternalHealthChecker.
type ExternalHealthConfig struct {
// CheckInterval is how often to check external systems. Default: 30s.
CheckInterval time.Duration
// Logger receives the checker's log output. When nil,
// NewExternalHealthChecker falls back to slog.Default().
Logger *slog.Logger
}
// DefaultExternalHealthConfig returns the stock configuration: a 30-second
// check interval and the process-wide default slog logger.
func DefaultExternalHealthConfig() ExternalHealthConfig {
	var cfg ExternalHealthConfig
	cfg.CheckInterval = 30 * time.Second
	cfg.Logger = slog.Default()
	return cfg
}
// NewExternalHealthChecker creates a new external health checker.
//
// All checker parameters are optional (nil means skip that system). A
// non-positive cfg.CheckInterval falls back to 30s and a nil cfg.Logger falls
// back to slog.Default(). The returned checker does nothing until Start is
// called.
func NewExternalHealthChecker(
	registry port.RegistryChecker,
	ci port.ExternalHealthChecker,
	git port.ExternalHealthChecker,
	cfg ExternalHealthConfig,
) *ExternalHealthChecker {
	// time.NewTicker panics on a non-positive duration, so negative values
	// must be defaulted as well as the zero value.
	if cfg.CheckInterval <= 0 {
		cfg.CheckInterval = 30 * time.Second
	}
	if cfg.Logger == nil {
		cfg.Logger = slog.Default()
	}

	// The lifecycle context is created here and cancelled by Stop.
	ctx, cancel := context.WithCancel(context.Background())

	return &ExternalHealthChecker{
		registry: registry,
		ci:       ci,
		git:      git,
		interval: cfg.CheckInterval,
		logger:   cfg.Logger.With("component", "external-health"),
		statuses: make(map[domain.ExternalSystem]domain.ExternalSystemStatus),
		ctx:      ctx,
		cancel:   cancel,
	}
}
// Start launches the background check loop in its own goroutine and returns
// immediately. Use Stop to end the loop and wait for it to exit.
func (c *ExternalHealthChecker) Start() {
	// Register the goroutine with the WaitGroup before it is spawned so that
	// Stop's Wait accounts for it.
	c.wg.Add(1)
	c.logger.Info("external health checker started", "interval", c.interval)
	go c.checkLoop()
}
// Stop cancels the checker's lifecycle context and blocks until the
// background loop goroutine has exited.
func (c *ExternalHealthChecker) Stop() {
	log := c.logger

	log.Info("external health checker stopping")
	c.cancel()  // signal the loop (and any in-flight checks) to stop
	c.wg.Wait() // wait for checkLoop to return
	log.Info("external health checker stopped")
}
// GetStatus looks up the cached status for one system. The boolean result is
// false when no status has been stored for that system yet.
func (c *ExternalHealthChecker) GetStatus(system domain.ExternalSystem) (domain.ExternalSystemStatus, bool) {
	c.mu.RLock()
	cached, found := c.statuses[system]
	c.mu.RUnlock()

	return cached, found
}
// GetAllStatuses returns a snapshot of every cached status. The returned map
// is a fresh copy, so callers may modify it without affecting the checker.
func (c *ExternalHealthChecker) GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus {
	c.mu.RLock()
	defer c.mu.RUnlock()

	snapshot := make(map[domain.ExternalSystem]domain.ExternalSystemStatus, len(c.statuses))
	for system := range c.statuses {
		snapshot[system] = c.statuses[system]
	}
	return snapshot
}
// checkLoop is the body of the background goroutine: it performs one round of
// checks right away, then one per ticker interval, until the lifecycle
// context is cancelled. It marks the WaitGroup done on exit.
func (c *ExternalHealthChecker) checkLoop() {
	defer c.wg.Done()

	// First round runs immediately rather than waiting a full interval.
	c.runChecks()

	tick := time.NewTicker(c.interval)
	defer tick.Stop()

	stopped := c.ctx.Done()
	for {
		select {
		case <-tick.C:
			c.runChecks()
		case <-stopped:
			return
		}
	}
}
// runChecks performs one round of health checks on all configured systems in
// parallel and stores each result through updateStatus as it arrives.
//
// The round is bounded by a 10-second timeout derived from the checker's
// lifecycle context, so Stop also cancels in-flight checks. Results collected
// after the lifecycle context has been cancelled are discarded: a check
// interrupted by shutdown presumably reports failure (verify against the
// Check implementations), and recording it would log a false
// healthy-to-unhealthy transition and flip the metric while stopping.
// Results from checks that merely hit the 10-second timeout are still
// recorded.
func (c *ExternalHealthChecker) runChecks() {
	const checkTimeout = 10 * time.Second

	ctx, cancel := context.WithTimeout(c.ctx, checkTimeout)
	defer cancel()

	var wg sync.WaitGroup
	// One buffer slot per possible check, so a sender never blocks.
	results := make(chan domain.ExternalSystemStatus, 3)

	// Check registry (zot)
	if c.registry != nil {
		wg.Add(1)
		go func() {
			defer wg.Done()
			results <- c.checkRegistry(ctx)
		}()
	}

	// Check CI (woodpecker)
	if c.ci != nil {
		wg.Add(1)
		go func() {
			defer wg.Done()
			results <- c.ci.Check(ctx)
		}()
	}

	// Check git (gitea)
	if c.git != nil {
		wg.Add(1)
		go func() {
			defer wg.Done()
			results <- c.git.Check(ctx)
		}()
	}

	// Close the channel once every check has reported so the collection
	// loop below terminates.
	go func() {
		wg.Wait()
		close(results)
	}()

	// Collect results and update state as they arrive.
	for status := range results {
		if c.ctx.Err() != nil {
			// Checker is shutting down; keep draining but do not record.
			continue
		}
		c.updateStatus(status)
	}
}

// checkRegistry runs the registry check and converts its
// domain.RegistryStatus result into a domain.ExternalSystemStatus.
func (c *ExternalHealthChecker) checkRegistry(ctx context.Context) domain.ExternalSystemStatus {
	regStatus := c.registry.Check(ctx)

	status := domain.ExternalSystemStatus{
		System:      domain.ExternalSystemRegistry,
		Healthy:     regStatus.Healthy,
		URL:         regStatus.URL,
		Error:       regStatus.Error,
		LastChecked: regStatus.LastChecked,
	}

	// The registry reports latency as a string (e.g. "45ms"). Parsing is
	// best-effort: an unparsable value leaves Latency at zero.
	if regStatus.Latency != "" {
		if d, err := time.ParseDuration(regStatus.Latency); err == nil {
			status.Latency = d
		}
	}

	if status.Healthy {
		status.LastHealthy = status.LastChecked
	}
	return status
}
// updateStatus stores status as the cached entry for its system, logs when
// the system is seen for the first time or its health flips, and publishes
// the health and latency to the metrics package. The write lock is held for
// the whole call.
func (c *ExternalHealthChecker) updateStatus(status domain.ExternalSystemStatus) {
	c.mu.Lock()
	defer c.mu.Unlock()

	previous, seen := c.statuses[status.System]

	// When the new result is unhealthy, carry over the previous non-zero
	// LastHealthy so the time of the last success is not lost.
	if seen && !status.Healthy && !previous.LastHealthy.IsZero() {
		status.LastHealthy = previous.LastHealthy
	}
	c.statuses[status.System] = status

	// Log only the first observation and genuine state transitions.
	switch {
	case !seen && status.Healthy:
		c.logger.Info("external system healthy",
			"system", status.System,
			"url", status.URL,
			"latency", status.Latency,
		)
	case !seen:
		c.logger.Warn("external system unhealthy",
			"system", status.System,
			"url", status.URL,
			"error", status.Error,
		)
	case previous.Healthy == status.Healthy:
		// Health unchanged since the last check: nothing to log.
	case status.Healthy:
		c.logger.Info("external system recovered",
			"system", status.System,
			"url", status.URL,
			"latency", status.Latency,
		)
	default:
		c.logger.Warn("external system became unhealthy",
			"system", status.System,
			"url", status.URL,
			"error", status.Error,
		)
	}

	// Update Prometheus metrics on every check, not just on transitions.
	metrics.SetExternalSystemHealth(string(status.System), status.Healthy, status.Latency.Seconds())
}