- Add /diagnostics endpoint for system health overview - Add external health worker for monitoring Gitea, Woodpecker, Registry - Add health check methods to Gitea and Woodpecker clients - Remove hardcoded fallback projects (pantheon, aeries) - Add diagnostics domain types and service layer - Add comprehensive tests for diagnostics handler and service - Fix tests to use registered test project instead of hardcoded one Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
249 lines
6.0 KiB
Go
249 lines
6.0 KiB
Go
package worker
|
|
|
|
import (
|
|
"context"
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/orchard9/rdev/internal/domain"
|
|
"github.com/orchard9/rdev/internal/metrics"
|
|
"github.com/orchard9/rdev/internal/port"
|
|
)
|
|
|
|
// ExternalHealthChecker runs periodic health checks on external systems
|
|
// (registry, CI, git) and caches the results for the /ready endpoint.
|
|
type ExternalHealthChecker struct {
|
|
registry port.RegistryChecker // zot
|
|
ci port.ExternalHealthChecker // woodpecker
|
|
git port.ExternalHealthChecker // gitea
|
|
|
|
interval time.Duration
|
|
logger *slog.Logger
|
|
|
|
// Internal state (thread-safe)
|
|
mu sync.RWMutex
|
|
statuses map[domain.ExternalSystem]domain.ExternalSystemStatus
|
|
|
|
// Lifecycle
|
|
ctx context.Context
|
|
cancel context.CancelFunc
|
|
wg sync.WaitGroup
|
|
}
|
|
|
|
// ExternalHealthConfig holds the tunable settings of an ExternalHealthChecker.
type ExternalHealthConfig struct {
	// CheckInterval is the period between rounds of external checks.
	// Default: 30s.
	CheckInterval time.Duration

	// Logger receives the checker's log output. When nil,
	// NewExternalHealthChecker substitutes slog.Default().
	Logger *slog.Logger
}
|
|
|
|
// DefaultExternalHealthConfig returns sensible defaults.
|
|
func DefaultExternalHealthConfig() ExternalHealthConfig {
|
|
return ExternalHealthConfig{
|
|
CheckInterval: 30 * time.Second,
|
|
Logger: slog.Default(),
|
|
}
|
|
}
|
|
|
|
// NewExternalHealthChecker creates a new external health checker.
|
|
// All checker parameters are optional (nil means skip that system).
|
|
func NewExternalHealthChecker(
|
|
registry port.RegistryChecker,
|
|
ci port.ExternalHealthChecker,
|
|
git port.ExternalHealthChecker,
|
|
cfg ExternalHealthConfig,
|
|
) *ExternalHealthChecker {
|
|
if cfg.CheckInterval == 0 {
|
|
cfg.CheckInterval = 30 * time.Second
|
|
}
|
|
if cfg.Logger == nil {
|
|
cfg.Logger = slog.Default()
|
|
}
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
return &ExternalHealthChecker{
|
|
registry: registry,
|
|
ci: ci,
|
|
git: git,
|
|
interval: cfg.CheckInterval,
|
|
logger: cfg.Logger.With("component", "external-health"),
|
|
statuses: make(map[domain.ExternalSystem]domain.ExternalSystemStatus),
|
|
ctx: ctx,
|
|
cancel: cancel,
|
|
}
|
|
}
|
|
|
|
// Start begins the background check loop.
|
|
func (c *ExternalHealthChecker) Start() {
|
|
c.logger.Info("external health checker started", "interval", c.interval)
|
|
|
|
c.wg.Add(1)
|
|
go c.checkLoop()
|
|
}
|
|
|
|
// Stop gracefully shuts down the checker.
|
|
func (c *ExternalHealthChecker) Stop() {
|
|
c.logger.Info("external health checker stopping")
|
|
c.cancel()
|
|
c.wg.Wait()
|
|
c.logger.Info("external health checker stopped")
|
|
}
|
|
|
|
// GetStatus returns the cached status for a specific system.
|
|
func (c *ExternalHealthChecker) GetStatus(system domain.ExternalSystem) (domain.ExternalSystemStatus, bool) {
|
|
c.mu.RLock()
|
|
defer c.mu.RUnlock()
|
|
status, ok := c.statuses[system]
|
|
return status, ok
|
|
}
|
|
|
|
// GetAllStatuses returns a copy of all cached statuses.
|
|
func (c *ExternalHealthChecker) GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus {
|
|
c.mu.RLock()
|
|
defer c.mu.RUnlock()
|
|
|
|
result := make(map[domain.ExternalSystem]domain.ExternalSystemStatus, len(c.statuses))
|
|
for k, v := range c.statuses {
|
|
result[k] = v
|
|
}
|
|
return result
|
|
}
|
|
|
|
// checkLoop runs periodic health checks.
|
|
func (c *ExternalHealthChecker) checkLoop() {
|
|
defer c.wg.Done()
|
|
|
|
// Run immediately on start
|
|
c.runChecks()
|
|
|
|
ticker := time.NewTicker(c.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-c.ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
c.runChecks()
|
|
}
|
|
}
|
|
}
|
|
|
|
// runChecks performs health checks on all configured systems in parallel.
|
|
func (c *ExternalHealthChecker) runChecks() {
|
|
ctx, cancel := context.WithTimeout(c.ctx, 10*time.Second)
|
|
defer cancel()
|
|
|
|
var wg sync.WaitGroup
|
|
results := make(chan domain.ExternalSystemStatus, 3)
|
|
|
|
// Check registry (zot)
|
|
if c.registry != nil {
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
regStatus := c.registry.Check(ctx)
|
|
// Convert domain.RegistryStatus to domain.ExternalSystemStatus
|
|
status := domain.ExternalSystemStatus{
|
|
System: domain.ExternalSystemRegistry,
|
|
Healthy: regStatus.Healthy,
|
|
URL: regStatus.URL,
|
|
Error: regStatus.Error,
|
|
LastChecked: regStatus.LastChecked,
|
|
}
|
|
// Parse latency string (e.g., "45ms") to duration
|
|
if regStatus.Latency != "" {
|
|
if d, err := time.ParseDuration(regStatus.Latency); err == nil {
|
|
status.Latency = d
|
|
}
|
|
}
|
|
if status.Healthy {
|
|
status.LastHealthy = status.LastChecked
|
|
}
|
|
results <- status
|
|
}()
|
|
}
|
|
|
|
// Check CI (woodpecker)
|
|
if c.ci != nil {
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
results <- c.ci.Check(ctx)
|
|
}()
|
|
}
|
|
|
|
// Check git (gitea)
|
|
if c.git != nil {
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
results <- c.git.Check(ctx)
|
|
}()
|
|
}
|
|
|
|
// Wait for all checks to complete, then close results channel
|
|
go func() {
|
|
wg.Wait()
|
|
close(results)
|
|
}()
|
|
|
|
// Collect results and update state
|
|
for status := range results {
|
|
c.updateStatus(status)
|
|
}
|
|
}
|
|
|
|
// updateStatus updates cached status and logs/metrics on state changes.
|
|
func (c *ExternalHealthChecker) updateStatus(status domain.ExternalSystemStatus) {
|
|
c.mu.Lock()
|
|
defer c.mu.Unlock()
|
|
|
|
prev, existed := c.statuses[status.System]
|
|
|
|
// Preserve LastHealthy from previous status if current is unhealthy
|
|
if !status.Healthy && existed && !prev.LastHealthy.IsZero() {
|
|
status.LastHealthy = prev.LastHealthy
|
|
}
|
|
|
|
c.statuses[status.System] = status
|
|
|
|
// Log state transitions
|
|
if !existed {
|
|
// First check
|
|
if status.Healthy {
|
|
c.logger.Info("external system healthy",
|
|
"system", status.System,
|
|
"url", status.URL,
|
|
"latency", status.Latency,
|
|
)
|
|
} else {
|
|
c.logger.Warn("external system unhealthy",
|
|
"system", status.System,
|
|
"url", status.URL,
|
|
"error", status.Error,
|
|
)
|
|
}
|
|
} else if prev.Healthy != status.Healthy {
|
|
// State changed
|
|
if status.Healthy {
|
|
c.logger.Info("external system recovered",
|
|
"system", status.System,
|
|
"url", status.URL,
|
|
"latency", status.Latency,
|
|
)
|
|
} else {
|
|
c.logger.Warn("external system became unhealthy",
|
|
"system", status.System,
|
|
"url", status.URL,
|
|
"error", status.Error,
|
|
)
|
|
}
|
|
}
|
|
|
|
// Update Prometheus metrics
|
|
metrics.SetExternalSystemHealth(string(status.System), status.Healthy, status.Latency.Seconds())
|
|
}
|