rdev/internal/circuitbreaker/registry.go
jordan f20fc6c51c
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
feat(saga): implement enterprise-grade resilience architecture
Fixes issues from code review of resilience implementation:

- Wire saga system in main.go (SagaRepository, SagaExecutor, SagaHandler)
- Fix CompletedSteps() to include skipped steps for dependency resolution
- Fix reverse loop bug in saga compensation (use standard swap pattern)
- Add circuit breaker state change callbacks for Prometheus metrics

Phase 1 (Build Resilience):
- Add failure:retry to all component Kaniko build steps
- Add preflight registry health check before builds
- Add services-deployed sync point to decouple docs from critical path

Phase 2 (API Resilience):
- Add pipeline retry endpoint (POST /projects/{id}/pipelines/{number}/retry)
- Wire circuit breakers with metrics callbacks
- Add /health/circuits endpoint for circuit breaker status

Phase 3 (Saga Engine):
- Full domain model (Saga, SagaStep, RetryPolicy, BackoffType)
- PostgreSQL saga repository with CRUD and step management
- Saga executor with retry, compensation, skip step support
- Saga API handlers with CRUD and control operations

Phase 4 (Observability):
- Add saga metrics (total, step_duration, retry, circuit_breaker_state)
- Add logging fields (saga_id, saga_name, step_name)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-08 01:58:02 -07:00

161 lines
3.7 KiB
Go

// Package circuitbreaker provides protection against cascading failures.
package circuitbreaker
import (
	"sort"
	"sync"
	"time"

	"github.com/orchard9/rdev/internal/metrics"
)
// Registry manages named circuit breakers.
//
// It keeps two maps keyed by breaker name: the breakers created so far and the
// configurations registered for them. The methods in this file access both
// maps while holding mu.
type Registry struct {
	// mu guards breakers and configs.
	mu sync.RWMutex
	// breakers holds the circuit breakers created so far, keyed by name.
	breakers map[string]*CircuitBreaker
	// configs holds the configurations stored by Configure, keyed by name.
	configs map[string]Config
}
// NewRegistry returns an empty Registry whose breaker and configuration maps
// are allocated and ready for use.
func NewRegistry() *Registry {
	reg := new(Registry)
	reg.breakers = map[string]*CircuitBreaker{}
	reg.configs = map[string]Config{}
	return reg
}
// Get looks up the circuit breaker registered under name, creating it on
// first use.
//
// A new breaker is built from the configuration stored for name, or from
// DefaultConfig() when none was stored. The configuration's Name field is
// always overwritten with name, and defaultStateChangeCallback is installed
// when the configuration carries no OnStateChange callback.
func (r *Registry) Get(name string) *CircuitBreaker {
	// Fast path: an already-created breaker only needs the read lock.
	r.mu.RLock()
	existing, found := r.breakers[name]
	r.mu.RUnlock()
	if found {
		return existing
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	// Look again under the write lock: another goroutine may have created the
	// breaker after the read lock was released.
	if existing, found = r.breakers[name]; found {
		return existing
	}

	// Prefer the stored configuration; fall back to the defaults otherwise.
	settings, configured := r.configs[name]
	if !configured {
		settings = DefaultConfig()
	}

	// Make sure the name is set so callbacks can identify the breaker.
	settings.Name = name

	// Only install the metrics callback when the caller did not supply one.
	if settings.OnStateChange == nil {
		settings.OnStateChange = defaultStateChangeCallback
	}

	created := New(settings)
	r.breakers[name] = created
	return created
}
// Configure stores cfg as the configuration to use for the circuit breaker
// named name, replacing any configuration stored earlier for that name.
//
// Get only reads the stored configuration when it creates a breaker, so
// Configure must be called before the first Get(name) to have any effect.
func (r *Registry) Configure(name string, cfg Config) {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.configs[name] = cfg
}
// StatusEntry contains the status of a single circuit breaker.
//
// It is the element type returned by Registry.AllStatus and carries JSON tags
// for serialization.
type StatusEntry struct {
	// Name is the name the breaker is registered under.
	Name string `json:"name"`
	// State is the string form of the breaker's state (State.String()).
	State string `json:"state"`
	// Failures is the failure count taken from the breaker's Stats().
	Failures int `json:"failures"`
	// LastFailure is the breaker's last-failure time converted to UTC. It is
	// nil, and omitted from JSON, when the breaker reports a zero time.
	LastFailure *time.Time `json:"last_failure,omitempty"`
}
// AllStatus returns a snapshot of every circuit breaker created so far,
// sorted by name.
//
// Go randomizes map iteration order, so without the sort the same set of
// breakers would be reported in a different order on every call; sorting keeps
// the output stable for clients and tests. The returned slice is never nil: it
// is empty when no breaker has been created yet.
func (r *Registry) AllStatus() []StatusEntry {
	r.mu.RLock()
	defer r.mu.RUnlock()

	entries := make([]StatusEntry, 0, len(r.breakers))
	for name, cb := range r.breakers {
		stats := cb.Stats()
		entry := StatusEntry{
			Name:     name,
			State:    stats.State.String(),
			Failures: stats.Failures,
		}
		// Leave LastFailure nil (omitted from JSON) when the breaker reports a
		// zero last-failure time; otherwise report it in UTC.
		if !stats.LastFailure.IsZero() {
			t := stats.LastFailure.UTC()
			entry.LastFailure = &t
		}
		entries = append(entries, entry)
	}

	// Names are unique map keys, so ordering by name alone is deterministic.
	sort.Slice(entries, func(i, j int) bool {
		return entries[i].Name < entries[j].Name
	})
	return entries
}
// Reset resets the circuit breaker registered under name.
//
// It reports whether such a breaker exists; when it does not, nothing is
// reset and false is returned. The registry lock is held only for the lookup
// and is released before the breaker itself is reset.
func (r *Registry) Reset(name string) bool {
	r.mu.RLock()
	target, found := r.breakers[name]
	r.mu.RUnlock()

	if found {
		target.Reset()
	}
	return found
}
// ResetAll resets every circuit breaker currently in the registry.
//
// The set of breakers is copied while holding the read lock, and the lock is
// released before any breaker is reset, matching what Reset does for a single
// breaker. This avoids holding the registry lock across CircuitBreaker.Reset.
// NOTE(review): Reset presumably may fire the OnStateChange callback; confirm
// against the CircuitBreaker implementation. If it does, a callback that
// re-enters the registry while the lock is held could deadlock, because
// sync.RWMutex does not support recursive read locking.
//
// Breakers created after the snapshot is taken are not reset by this call.
func (r *Registry) ResetAll() {
	r.mu.RLock()
	snapshot := make([]*CircuitBreaker, 0, len(r.breakers))
	for _, cb := range r.breakers {
		snapshot = append(snapshot, cb)
	}
	r.mu.RUnlock()

	for _, cb := range snapshot {
		cb.Reset()
	}
}
// GlobalRegistry is the default global circuit breaker registry.
// It is created with NewRegistry during package initialization, and this
// file's init function stores configurations for the known breaker names on it.
var GlobalRegistry = NewRegistry()
// defaultStateChangeCallback is the OnStateChange hook that Registry.Get
// installs when a configuration does not provide its own. It passes the
// breaker name and the integer value of the new state to
// metrics.SetCircuitBreakerState so the Prometheus metric follows the breaker;
// the previous state is ignored.
func defaultStateChangeCallback(name string, _, to State) {
	// NOTE(review): the expected mapping is 0=closed, 1=half-open, 2=open —
	// confirm against the State constants.
	newState := int(to)
	metrics.SetCircuitBreakerState(name, newState)
}
// Known circuit breaker names for external systems.
// Each name is the key under which this file's init function stores a
// configuration on GlobalRegistry.
const (
	// NameWoodpecker is the breaker name for Woodpecker.
	NameWoodpecker = "woodpecker"
	// NameGitea is the breaker name for Gitea.
	NameGitea = "gitea"
	// NameRegistry is the breaker name for the registry.
	NameRegistry = "registry"
	// NameCloudflare is the breaker name for Cloudflare.
	NameCloudflare = "cloudflare"
)
func init() {
// Configure default circuit breakers with appropriate settings
GlobalRegistry.Configure(NameWoodpecker, Config{
FailureThreshold: 3,
ResetTimeout: 30 * time.Second,
HalfOpenRequests: 1,
})
GlobalRegistry.Configure(NameGitea, Config{
FailureThreshold: 3,
ResetTimeout: 30 * time.Second,
HalfOpenRequests: 1,
})
GlobalRegistry.Configure(NameRegistry, Config{
FailureThreshold: 5,
ResetTimeout: 60 * time.Second,
HalfOpenRequests: 1,
})
GlobalRegistry.Configure(NameCloudflare, Config{
FailureThreshold: 5,
ResetTimeout: 60 * time.Second,
HalfOpenRequests: 1,
})
}