rdev/internal/db/migrations/018_sagas.sql
jordan f20fc6c51c
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
feat(saga): implement enterprise-grade resilience architecture
Fixes issues from code review of resilience implementation:

- Wire saga system in main.go (SagaRepository, SagaExecutor, SagaHandler)
- Fix CompletedSteps() to include skipped steps for dependency resolution
- Fix reverse loop bug in saga compensation (use standard swap pattern)
- Add circuit breaker state change callbacks for Prometheus metrics

Phase 1 (Build Resilience):
- Add failure:retry to all component Kaniko build steps
- Add preflight registry health check before builds
- Add services-deployed sync point to decouple docs from critical path

Phase 2 (API Resilience):
- Add pipeline retry endpoint (POST /projects/{id}/pipelines/{number}/retry)
- Wire circuit breakers with metrics callbacks
- Add /health/circuits endpoint for circuit breaker status

Phase 3 (Saga Engine):
- Full domain model (Saga, SagaStep, RetryPolicy, BackoffType)
- PostgreSQL saga repository with CRUD and step management
- Saga executor with retry, compensation, skip step support
- Saga API handlers with CRUD and control operations

Phase 4 (Observability):
- Add saga metrics (total, step_duration, retry, circuit_breaker_state)
- Add logging fields (saga_id, saga_name, step_name)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-08 01:58:02 -07:00

79 lines
2.6 KiB
PL/PgSQL

-- Saga pattern for resilient multi-step workflows
-- Sagas track multi-step operations with retry and compensation support
-- +goose Up
-- sagas: one row per saga. Each row gets a generated UUID key, starts in
-- status 'pending', and carries two JSONB documents (vars, outputs) that
-- default to the empty object.
-- NOTE(review): gen_random_uuid() is built in from PostgreSQL 13 onward,
-- older servers need the pgcrypto extension. Confirm the target version.
CREATE TABLE sagas (
    id           UUID        DEFAULT gen_random_uuid() PRIMARY KEY,
    name         TEXT        NOT NULL,
    status       TEXT        NOT NULL DEFAULT 'pending',
    definition   TEXT,
    vars         JSONB       NOT NULL DEFAULT '{}',
    outputs      JSONB       NOT NULL DEFAULT '{}',
    -- Nullable. Presumably holds a saga_steps.name value (TODO confirm),
    -- no foreign key enforces that here.
    current_step TEXT,
    retry_count  INTEGER     NOT NULL DEFAULT 0,
    max_retries  INTEGER     NOT NULL DEFAULT 3,
    error        TEXT,
    created_at   TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Only defaulted at insert time by this statement.
    updated_at   TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    completed_at TIMESTAMPTZ,

    -- The only values status may take.
    CONSTRAINT valid_status CHECK (
        status IN (
            'pending',
            'running',
            'completed',
            'failed',
            'compensating',
            'compensated'
        )
    )
);
-- saga_steps: the steps belonging to a saga. Rows are removed automatically
-- when their parent saga is deleted (ON DELETE CASCADE), and a step name can
-- appear only once per saga.
CREATE TABLE saga_steps (
    id           UUID        DEFAULT gen_random_uuid() PRIMARY KEY,
    saga_id      UUID        NOT NULL REFERENCES sagas (id) ON DELETE CASCADE,
    name         TEXT        NOT NULL,
    status       TEXT        NOT NULL DEFAULT 'pending',
    action       TEXT        NOT NULL,
    -- Text array, empty by default. Presumably names of sibling steps that
    -- must finish first (TODO confirm against the executor).
    depends_on   TEXT[]      NOT NULL DEFAULT '{}',
    -- Per-step retry settings. The default is 3 attempts with exponential
    -- backoff between "5s" and "60s".
    retry_policy JSONB       NOT NULL DEFAULT '{"max_attempts": 3, "backoff_type": "exponential", "initial_delay": "5s", "max_delay": "60s"}',
    compensate   TEXT,
    config       JSONB       NOT NULL DEFAULT '{}',
    output       JSONB,
    error        TEXT,
    retry_count  INTEGER     NOT NULL DEFAULT 0,
    started_at   TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,

    -- A step name is unique within its saga.
    CONSTRAINT unique_step_per_saga UNIQUE (saga_id, name),

    -- The only values status may take.
    CONSTRAINT valid_step_status CHECK (
        status IN (
            'pending',
            'running',
            'completed',
            'failed',
            'skipped'
        )
    )
);
-- Indexes for common queries.

-- Partial index: only rows whose status is 'pending', 'running' or
-- 'compensating' are indexed, so rows in any other status add no index
-- entries.
CREATE INDEX idx_sagas_status
    ON sagas (status)
    WHERE status IN ('pending', 'running', 'compensating');

CREATE INDEX idx_sagas_name
    ON sagas (name);

-- Descending order to match newest-first listings.
CREATE INDEX idx_sagas_created_at
    ON sagas (created_at DESC);

-- Covers both "steps of a saga" and "steps of a saga in a given status".
-- A separate single-column index on saga_steps (saga_id) is deliberately not
-- created: this index and the one backing UNIQUE (saga_id, name) both lead
-- with saga_id, so a third index would only add write and storage overhead.
CREATE INDEX idx_saga_steps_status
    ON saga_steps (saga_id, status);
-- Update timestamp trigger function: sets updated_at to NOW() on the row
-- being written and returns that row.
--
-- The PL/pgSQL body contains semicolons, so the statement is fenced with the
-- goose StatementBegin / StatementEnd annotations. Without the fence goose
-- ends the statement at the first semicolon inside the $$ body and sends a
-- truncated CREATE FUNCTION to the server, which fails the migration.
-- +goose StatementBegin
CREATE OR REPLACE FUNCTION update_saga_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- +goose StatementEnd
-- Runs update_saga_updated_at() once per row before every UPDATE on sagas.
CREATE TRIGGER trigger_saga_updated_at
    BEFORE UPDATE
    ON sagas
    FOR EACH ROW
    EXECUTE FUNCTION update_saga_updated_at();
-- +goose Down
-- Rollback: drops the trigger, function, indexes and tables that the Up
-- section creates. IF EXISTS makes each statement a no-op when its object
-- is already gone.
-- The trigger goes before the function it executes.
DROP TRIGGER IF EXISTS trigger_saga_updated_at ON sagas;
DROP FUNCTION IF EXISTS update_saga_updated_at();
-- Explicit index drops (dropping the tables below would remove them too).
DROP INDEX IF EXISTS idx_saga_steps_status;
DROP INDEX IF EXISTS idx_saga_steps_saga_id;
DROP INDEX IF EXISTS idx_sagas_created_at;
DROP INDEX IF EXISTS idx_sagas_name;
DROP INDEX IF EXISTS idx_sagas_status;
-- saga_steps references sagas(id), so the child table is dropped first.
DROP TABLE IF EXISTS saga_steps;
DROP TABLE IF EXISTS sagas;