package worker import ( "context" "sync" "time" "github.com/orchard9/rdev/internal/domain" "github.com/orchard9/rdev/internal/logging" "github.com/orchard9/rdev/internal/metrics" "github.com/orchard9/rdev/internal/port" ) // ExternalHealthChecker runs periodic health checks on external systems // (registry, CI, git) and caches the results for the /ready endpoint. type ExternalHealthChecker struct { registry port.RegistryChecker // zot ci port.ExternalHealthChecker // woodpecker git port.ExternalHealthChecker // gitea interval time.Duration // Internal state (thread-safe) mu sync.RWMutex statuses map[domain.ExternalSystem]domain.ExternalSystemStatus // Lifecycle ctx context.Context cancel context.CancelFunc wg sync.WaitGroup } // ExternalHealthConfig configures the health checker. type ExternalHealthConfig struct { // CheckInterval is how often to check external systems. Default: 30s. CheckInterval time.Duration } // DefaultExternalHealthConfig returns sensible defaults. func DefaultExternalHealthConfig() ExternalHealthConfig { return ExternalHealthConfig{ CheckInterval: 30 * time.Second, } } // NewExternalHealthChecker creates a new external health checker. // All checker parameters are optional (nil means skip that system). func NewExternalHealthChecker( registry port.RegistryChecker, ci port.ExternalHealthChecker, git port.ExternalHealthChecker, cfg ExternalHealthConfig, ) *ExternalHealthChecker { if cfg.CheckInterval == 0 { cfg.CheckInterval = 30 * time.Second } ctx, cancel := context.WithCancel(context.Background()) return &ExternalHealthChecker{ registry: registry, ci: ci, git: git, interval: cfg.CheckInterval, statuses: make(map[domain.ExternalSystem]domain.ExternalSystemStatus), ctx: ctx, cancel: cancel, } } // Start begins the background check loop. func (c *ExternalHealthChecker) Start() { log := logging.FromContext(c.ctx).WithWorker("external-health") log.Info("external health checker started", "interval", c.interval) c.wg.Add(1) go c.checkLoop() } // Stop gracefully shuts down the checker. func (c *ExternalHealthChecker) Stop() { log := logging.FromContext(c.ctx).WithWorker("external-health") log.Info("external health checker stopping") c.cancel() c.wg.Wait() log.Info("external health checker stopped") } // GetStatus returns the cached status for a specific system. func (c *ExternalHealthChecker) GetStatus(system domain.ExternalSystem) (domain.ExternalSystemStatus, bool) { c.mu.RLock() defer c.mu.RUnlock() status, ok := c.statuses[system] return status, ok } // GetAllStatuses returns a copy of all cached statuses. func (c *ExternalHealthChecker) GetAllStatuses() map[domain.ExternalSystem]domain.ExternalSystemStatus { c.mu.RLock() defer c.mu.RUnlock() result := make(map[domain.ExternalSystem]domain.ExternalSystemStatus, len(c.statuses)) for k, v := range c.statuses { result[k] = v } return result } // checkLoop runs periodic health checks. func (c *ExternalHealthChecker) checkLoop() { defer c.wg.Done() // Run immediately on start c.runChecks() ticker := time.NewTicker(c.interval) defer ticker.Stop() for { select { case <-c.ctx.Done(): return case <-ticker.C: c.runChecks() } } } // runChecks performs health checks on all configured systems in parallel. func (c *ExternalHealthChecker) runChecks() { ctx, cancel := context.WithTimeout(c.ctx, TimeoutHealthCheck) defer cancel() var wg sync.WaitGroup results := make(chan domain.ExternalSystemStatus, 3) // Check registry (zot) if c.registry != nil { wg.Add(1) go func() { defer wg.Done() regStatus := c.registry.Check(ctx) // Convert domain.RegistryStatus to domain.ExternalSystemStatus status := domain.ExternalSystemStatus{ System: domain.ExternalSystemRegistry, Healthy: regStatus.Healthy, URL: regStatus.URL, Error: regStatus.Error, LastChecked: regStatus.LastChecked, } // Parse latency string (e.g., "45ms") to duration if regStatus.Latency != "" { if d, err := time.ParseDuration(regStatus.Latency); err == nil { status.Latency = d } } if status.Healthy { status.LastHealthy = status.LastChecked } results <- status }() } // Check CI (woodpecker) if c.ci != nil { wg.Add(1) go func() { defer wg.Done() results <- c.ci.Check(ctx) }() } // Check git (gitea) if c.git != nil { wg.Add(1) go func() { defer wg.Done() results <- c.git.Check(ctx) }() } // Wait for all checks to complete, then close results channel go func() { wg.Wait() close(results) }() // Collect results and update state for status := range results { c.updateStatus(status) } } // updateStatus updates cached status and logs/metrics on state changes. func (c *ExternalHealthChecker) updateStatus(status domain.ExternalSystemStatus) { log := logging.FromContext(c.ctx).WithWorker("external-health") c.mu.Lock() defer c.mu.Unlock() prev, existed := c.statuses[status.System] // Preserve LastHealthy from previous status if current is unhealthy if !status.Healthy && existed && !prev.LastHealthy.IsZero() { status.LastHealthy = prev.LastHealthy } c.statuses[status.System] = status // Log state transitions if !existed { // First check if status.Healthy { log.Info("external system healthy", "system", status.System, "url", status.URL, "latency", status.Latency, ) } else { log.Warn("external system unhealthy", "system", status.System, "url", status.URL, logging.FieldError, status.Error, ) } } else if prev.Healthy != status.Healthy { // State changed if status.Healthy { log.Info("external system recovered", "system", status.System, "url", status.URL, "latency", status.Latency, ) } else { log.Warn("external system became unhealthy", "system", status.System, "url", status.URL, logging.FieldError, status.Error, ) } } // Update Prometheus metrics metrics.SetExternalSystemHealth(string(status.System), status.Healthy, status.Latency.Seconds()) }