Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
Fixes issues from code review of resilience implementation:
- Wire saga system in main.go (SagaRepository, SagaExecutor, SagaHandler)
- Fix CompletedSteps() to include skipped steps for dependency resolution
- Fix reverse loop bug in saga compensation (use standard swap pattern)
- Add circuit breaker state change callbacks for Prometheus metrics
Phase 1 (Build Resilience):
- Add failure:retry to all component Kaniko build steps
- Add preflight registry health check before builds
- Add services-deployed sync point to decouple docs from critical path
Phase 2 (API Resilience):
- Add pipeline retry endpoint (POST /projects/{id}/pipelines/{number}/retry)
- Wire circuit breakers with metrics callbacks
- Add /health/circuits endpoint for circuit breaker status
Phase 3 (Saga Engine):
- Full domain model (Saga, SagaStep, RetryPolicy, BackoffType)
- PostgreSQL saga repository with CRUD and step management
- Saga executor with retry, compensation, skip step support
- Saga API handlers with CRUD and control operations
Phase 4 (Observability):
- Add saga metrics (total, step_duration, retry, circuit_breaker_state)
- Add logging fields (saga_id, saga_name, step_name)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
161 lines
3.7 KiB
Go
161 lines
3.7 KiB
Go
// Package circuitbreaker provides protection against cascading failures.
|
|
package circuitbreaker
|
|
|
|
import (
	"sort"
	"sync"
	"time"

	"github.com/orchard9/rdev/internal/metrics"
)
|
|
|
|
// Registry manages named circuit breakers.
|
|
type Registry struct {
|
|
mu sync.RWMutex
|
|
breakers map[string]*CircuitBreaker
|
|
configs map[string]Config
|
|
}
|
|
|
|
// NewRegistry creates a new circuit breaker registry.
|
|
func NewRegistry() *Registry {
|
|
return &Registry{
|
|
breakers: make(map[string]*CircuitBreaker),
|
|
configs: make(map[string]Config),
|
|
}
|
|
}
|
|
|
|
// Get returns the circuit breaker for the given name, creating one if needed.
|
|
func (r *Registry) Get(name string) *CircuitBreaker {
|
|
r.mu.RLock()
|
|
cb, ok := r.breakers[name]
|
|
r.mu.RUnlock()
|
|
if ok {
|
|
return cb
|
|
}
|
|
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
|
|
// Double-check after acquiring write lock
|
|
if cb, ok = r.breakers[name]; ok {
|
|
return cb
|
|
}
|
|
|
|
// Use custom config if set, otherwise use defaults
|
|
cfg, ok := r.configs[name]
|
|
if !ok {
|
|
cfg = DefaultConfig()
|
|
}
|
|
|
|
// Ensure name is set for callbacks
|
|
cfg.Name = name
|
|
|
|
// Set up metrics callback if not already configured
|
|
if cfg.OnStateChange == nil {
|
|
cfg.OnStateChange = defaultStateChangeCallback
|
|
}
|
|
|
|
cb = New(cfg)
|
|
r.breakers[name] = cb
|
|
return cb
|
|
}
|
|
|
|
// Configure sets the configuration for a named circuit breaker.
|
|
// Must be called before Get() for the configuration to take effect.
|
|
func (r *Registry) Configure(name string, cfg Config) {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
r.configs[name] = cfg
|
|
}
|
|
|
|
// StatusEntry is the JSON-serializable status report for one circuit breaker.
type StatusEntry struct {
	// Name identifies the breaker.
	Name string `json:"name"`
	// State is the breaker state in textual form.
	State string `json:"state"`
	// Failures is the breaker's reported failure count.
	Failures int `json:"failures"`
	// LastFailure is the time of the most recent failure; it is nil, and
	// omitted from the JSON output, when there is none to report.
	LastFailure *time.Time `json:"last_failure,omitempty"`
}
|
|
|
|
// AllStatus returns the status of all registered circuit breakers.
|
|
func (r *Registry) AllStatus() []StatusEntry {
|
|
r.mu.RLock()
|
|
defer r.mu.RUnlock()
|
|
|
|
entries := make([]StatusEntry, 0, len(r.breakers))
|
|
for name, cb := range r.breakers {
|
|
stats := cb.Stats()
|
|
entry := StatusEntry{
|
|
Name: name,
|
|
State: stats.State.String(),
|
|
Failures: stats.Failures,
|
|
}
|
|
if !stats.LastFailure.IsZero() {
|
|
t := stats.LastFailure.UTC()
|
|
entry.LastFailure = &t
|
|
}
|
|
entries = append(entries, entry)
|
|
}
|
|
return entries
|
|
}
|
|
|
|
// Reset resets a specific circuit breaker by name.
|
|
func (r *Registry) Reset(name string) bool {
|
|
r.mu.RLock()
|
|
cb, ok := r.breakers[name]
|
|
r.mu.RUnlock()
|
|
if !ok {
|
|
return false
|
|
}
|
|
cb.Reset()
|
|
return true
|
|
}
|
|
|
|
// ResetAll resets all circuit breakers.
|
|
func (r *Registry) ResetAll() {
|
|
r.mu.RLock()
|
|
defer r.mu.RUnlock()
|
|
for _, cb := range r.breakers {
|
|
cb.Reset()
|
|
}
|
|
}
|
|
|
|
// GlobalRegistry is the default global circuit breaker registry.
|
|
var GlobalRegistry = NewRegistry()
|
|
|
|
// defaultStateChangeCallback updates Prometheus metrics when circuit breaker state changes.
|
|
func defaultStateChangeCallback(name string, from, to State) {
|
|
// State values: 0=closed, 1=half-open, 2=open
|
|
metrics.SetCircuitBreakerState(name, int(to))
|
|
}
|
|
|
|
// Registry names of the circuit breakers for the known external systems.
const (
	// NameWoodpecker is the breaker name for Woodpecker.
	NameWoodpecker = "woodpecker"
	// NameGitea is the breaker name for Gitea.
	NameGitea = "gitea"
	// NameRegistry is the breaker name for the registry.
	NameRegistry = "registry"
	// NameCloudflare is the breaker name for Cloudflare.
	NameCloudflare = "cloudflare"
)
|
|
|
|
func init() {
|
|
// Configure default circuit breakers with appropriate settings
|
|
GlobalRegistry.Configure(NameWoodpecker, Config{
|
|
FailureThreshold: 3,
|
|
ResetTimeout: 30 * time.Second,
|
|
HalfOpenRequests: 1,
|
|
})
|
|
GlobalRegistry.Configure(NameGitea, Config{
|
|
FailureThreshold: 3,
|
|
ResetTimeout: 30 * time.Second,
|
|
HalfOpenRequests: 1,
|
|
})
|
|
GlobalRegistry.Configure(NameRegistry, Config{
|
|
FailureThreshold: 5,
|
|
ResetTimeout: 60 * time.Second,
|
|
HalfOpenRequests: 1,
|
|
})
|
|
GlobalRegistry.Configure(NameCloudflare, Config{
|
|
FailureThreshold: 5,
|
|
ResetTimeout: 60 * time.Second,
|
|
HalfOpenRequests: 1,
|
|
})
|
|
}
|