rdev/internal/metrics/metrics.go
jordan 9a1309a0c5 feat: fix composable monorepo CI builds + health endpoint improvements
Composable monorepo CI fixes:
- Add empty go.sum.tmpl files for pkg, service, worker, and cli components
- Fix Dockerfile.tmpl glob patterns (COPY go.work.sum* is invalid in Kaniko)
- Add deps step to CI that runs go work sync and go mod tidy before builds
- Fix scalar-go dependency version (v0.1.2 doesn't exist, use v0.13.0)

Health endpoint improvements:
- Add registry health check (zot OCI /v2/ endpoint)
- Add health metrics for CI, registry, and Git
- Add /health/ci endpoint for Woodpecker health

Visual verification scaffolding:
- Add Playwright pod and scripts ConfigMap
- Add vision.md and implementation breakdown plan

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 18:46:51 -07:00

314 lines
10 KiB
Go

// Package metrics provides Prometheus metrics for the rdev API.
package metrics
import (
"net/http"
"regexp"
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Package-level Prometheus collectors for the rdev API.
//
// Every collector below is created through promauto, which registers it with
// the default Prometheus registerer as part of package initialization and
// panics if registration fails (for example on a duplicate metric name).
// The metric names and label sets are what dashboards and alerts query, so
// treat them as an external contract.
var (
// Commands
//
// commandsTotal counts executed commands by project, command type and status.
commandsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_commands_total",
Help: "Total number of commands executed",
}, []string{"project", "type", "status"})
// commandDuration is a histogram of command run time in seconds by project
// and command type. 15 buckets, each twice the previous, starting at 0.1s.
commandDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Name: "rdev_command_duration_seconds",
Help: "Duration of command execution in seconds",
Buckets: prometheus.ExponentialBuckets(0.1, 2, 15), // 0.1s to ~27min
}, []string{"project", "type"})
// Code Agents
//
// agentRequestsTotal counts code agent requests by provider and status.
agentRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_agent_requests_total",
Help: "Total number of code agent requests",
}, []string{"provider", "status"})
// agentRequestDuration is a histogram of code agent request time in seconds
// by provider; it uses the same bucket layout as commandDuration.
agentRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Name: "rdev_agent_request_duration_seconds",
Help: "Duration of code agent requests in seconds",
Buckets: prometheus.ExponentialBuckets(0.1, 2, 15), // 0.1s to ~27min
}, []string{"provider"})
// agentToolUse counts tool invocations by provider and tool name.
agentToolUse = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_agent_tool_use_total",
Help: "Total number of tool invocations by code agents",
}, []string{"provider", "tool"})
// agentAvailability is a 0/1 gauge per provider (see SetAgentAvailability).
agentAvailability = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "rdev_agent_available",
Help: "Whether the code agent is available (1) or not (0)",
}, []string{"provider"})
// Worker Pool
//
// workersTotal is a gauge of registered workers, one series per status.
workersTotal = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "rdev_workers_total",
Help: "Number of registered workers by status",
}, []string{"status"})
// workerHeartbeatAge is a gauge of heartbeat age in seconds, one series per worker ID.
// NOTE(review): nothing in this file deletes series for workers that have gone
// away, so the series count grows with the number of distinct worker IDs —
// confirm whether cleanup happens elsewhere.
workerHeartbeatAge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "rdev_worker_heartbeat_age_seconds",
Help: "Age of the most recent worker heartbeat in seconds",
}, []string{"worker_id"})
// Builds
//
// buildsTotal counts build tasks by project and status.
buildsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_builds_total",
Help: "Total number of build tasks by status",
}, []string{"project", "status"})
// buildDuration is a histogram of build time in seconds by project.
// 12 buckets, each twice the previous, starting at 1s.
buildDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Name: "rdev_build_duration_seconds",
Help: "Duration of build executions in seconds",
Buckets: prometheus.ExponentialBuckets(1, 2, 12), // 1s to ~34min
}, []string{"project"})
// Work Queue
//
// workQueueDepth is a gauge of queued tasks, one series per status.
workQueueDepth = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "rdev_work_queue_depth",
Help: "Number of tasks in the work queue by status",
}, []string{"status"})
// Streams
//
// activeStreams is a gauge of currently open SSE streams per project; it is
// moved by IncActiveStreams and DecActiveStreams.
activeStreams = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "rdev_active_streams",
Help: "Number of active SSE streams",
}, []string{"project"})
// streamReconnects counts SSE stream reconnections per project.
streamReconnects = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_stream_reconnects_total",
Help: "Total number of SSE stream reconnections",
}, []string{"project"})
// Build Events (SSE streaming)
//
// buildEventsTotal counts published build events by event type.
buildEventsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_build_events_total",
Help: "Total number of build events published",
}, []string{"type"})
// buildEventSubscribers is a gauge of active subscribers, one series per task ID.
// NOTE(review): one series per task ID and no deletion in this file — the
// series count grows with the number of distinct tasks; confirm this is
// bounded or cleaned up elsewhere.
buildEventSubscribers = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "rdev_build_event_subscribers",
Help: "Number of active build event subscribers",
}, []string{"task_id"})
// buildEventBufferSize is a single unlabeled gauge of buffered replay events.
buildEventBufferSize = promauto.NewGauge(prometheus.GaugeOpts{
Name: "rdev_build_event_buffer_size",
Help: "Total number of events in replay buffers",
})
// Authentication
//
// authFailures counts authentication failures by reason.
authFailures = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_auth_failures_total",
Help: "Total number of authentication failures",
}, []string{"reason"})
// API Requests
//
// requestDuration and requestsTotal are fed by Middleware. The "path" label
// receives the output of normalizePath, and "status" is the numeric HTTP
// status rendered as a string. requestDuration uses the client library's
// default bucket layout.
requestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Name: "rdev_api_request_duration_seconds",
Help: "Duration of API requests in seconds",
Buckets: prometheus.DefBuckets,
}, []string{"method", "path", "status"})
requestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_api_requests_total",
Help: "Total number of API requests",
}, []string{"method", "path", "status"})
// Registry health
//
// registryHealthy is a 0/1 gauge and registryLatency holds the latency of the
// health check in seconds; both are written together by SetRegistryHealth.
registryHealthy = promauto.NewGauge(prometheus.GaugeOpts{
Name: "rdev_registry_healthy",
Help: "Whether the container registry is healthy (1) or not (0)",
})
registryLatency = promauto.NewGauge(prometheus.GaugeOpts{
Name: "rdev_registry_latency_seconds",
Help: "Latency of registry health check in seconds",
})
// CI builds
//
// ciBuildsTotal counts CI builds by project and status.
ciBuildsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_ci_builds_total",
Help: "Total number of CI builds by project and status",
}, []string{"project", "status"})
// ciPushFailures counts CI image push failures per project.
ciPushFailures = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "rdev_ci_push_failures_total",
Help: "Total number of CI image push failures by project",
}, []string{"project"})
)
// RecordCommand counts one executed command under the given project, command
// type and status, and adds its run time to the command duration histogram.
// durationMs is in milliseconds and is converted to seconds before it is observed.
func RecordCommand(project, cmdType, status string, durationMs int64) {
	seconds := float64(durationMs) / 1000.0

	counter := commandsTotal.WithLabelValues(project, cmdType, status)
	counter.Inc()

	histogram := commandDuration.WithLabelValues(project, cmdType)
	histogram.Observe(seconds)
}
// IncActiveStreams adds one to the active stream gauge of the given project.
func IncActiveStreams(project string) {
	gauge := activeStreams.WithLabelValues(project)
	gauge.Inc()
}
// DecActiveStreams subtracts one from the active stream gauge of the given project.
func DecActiveStreams(project string) {
	gauge := activeStreams.WithLabelValues(project)
	gauge.Dec()
}
// RecordStreamReconnect counts one stream reconnection for the given project.
func RecordStreamReconnect(project string) {
	counter := streamReconnects.WithLabelValues(project)
	counter.Inc()
}
// RecordBuildEvent counts one published build event of the given type.
func RecordBuildEvent(eventType string) {
	counter := buildEventsTotal.WithLabelValues(eventType)
	counter.Inc()
}
// SetBuildEventSubscribers stores count as the current number of subscribers
// on the build stream identified by taskID.
func SetBuildEventSubscribers(taskID string, count int) {
	subscribers := float64(count)
	buildEventSubscribers.WithLabelValues(taskID).Set(subscribers)
}
// SetBuildEventBufferSize stores size as the total number of events currently
// held in the event replay buffers.
func SetBuildEventBufferSize(size int64) {
	buffered := float64(size)
	buildEventBufferSize.Set(buffered)
}
// RecordAuthFailure counts one authentication failure under the given reason.
func RecordAuthFailure(reason string) {
	counter := authFailures.WithLabelValues(reason)
	counter.Inc()
}
// RecordAgentRequest counts one code agent request for the provider and status,
// and adds its duration to the per-provider request histogram. durationMs is in
// milliseconds and is converted to seconds before it is observed.
func RecordAgentRequest(provider, status string, durationMs int64) {
	seconds := float64(durationMs) / 1000.0

	counter := agentRequestsTotal.WithLabelValues(provider, status)
	counter.Inc()

	histogram := agentRequestDuration.WithLabelValues(provider)
	histogram.Observe(seconds)
}
// RecordAgentToolUse counts one invocation of tool by the given provider's code agent.
func RecordAgentToolUse(provider, tool string) {
	counter := agentToolUse.WithLabelValues(provider, tool)
	counter.Inc()
}
// SetAgentAvailability publishes whether the provider's code agent is
// available: the gauge is set to 1 when available is true and to 0 otherwise.
func SetAgentAvailability(provider string, available bool) {
	var flag float64 // zero value means "not available"
	if available {
		flag = 1.0
	}
	gauge := agentAvailability.WithLabelValues(provider)
	gauge.Set(flag)
}
// SetWorkerCount stores count as the number of workers currently in the given status.
func SetWorkerCount(status string, count int) {
	workers := float64(count)
	workersTotal.WithLabelValues(status).Set(workers)
}
// RecordWorkerHeartbeat stores ageSeconds as the age of the latest heartbeat
// received from the worker identified by workerID.
func RecordWorkerHeartbeat(workerID string, ageSeconds float64) {
	gauge := workerHeartbeatAge.WithLabelValues(workerID)
	gauge.Set(ageSeconds)
}
// RecordBuild counts one finished build task for the project and status.
// The duration histogram is only updated when durationMs is positive; a zero
// or negative duration still increments the counter but is not observed.
// durationMs is in milliseconds and is converted to seconds.
func RecordBuild(project, status string, durationMs int64) {
	buildsTotal.WithLabelValues(project, status).Inc()

	if durationMs <= 0 {
		return
	}
	seconds := float64(durationMs) / 1000.0
	buildDuration.WithLabelValues(project).Observe(seconds)
}
// SetWorkQueueDepth stores count as the number of work queue tasks currently
// in the given status.
func SetWorkQueueDepth(status string, count int64) {
	depth := float64(count)
	workQueueDepth.WithLabelValues(status).Set(depth)
}
// SetRegistryHealth publishes the result of a registry health check: the
// health gauge becomes 1 when healthy is true and 0 otherwise, and the latency
// gauge is set to latencySeconds regardless of the outcome.
func SetRegistryHealth(healthy bool, latencySeconds float64) {
	var flag float64 // zero value means "unhealthy"
	if healthy {
		flag = 1.0
	}
	registryHealthy.Set(flag)
	registryLatency.Set(latencySeconds)
}
// RecordCIBuild counts one CI build for the given project and status.
func RecordCIBuild(project, status string) {
	counter := ciBuildsTotal.WithLabelValues(project, status)
	counter.Inc()
}
// RecordCIPushFailure counts one failed CI image push for the given project.
func RecordCIPushFailure(project string) {
	counter := ciPushFailures.WithLabelValues(project)
	counter.Inc()
}
// Handler returns the HTTP handler produced by promhttp.Handler, which is what
// a metrics scrape endpoint should be mounted on.
func Handler() http.Handler {
	scrape := promhttp.Handler()
	return scrape
}
// Middleware wraps next so that every request it serves is recorded in the API
// request counter and duration histogram, labeled by HTTP method, normalized
// URL path and response status code.
func Middleware(next http.Handler) http.Handler {
	instrumented := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()

		// The wrapper remembers the status code the handler sends; 200 is the
		// default for handlers that never call WriteHeader.
		recorder := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
		next.ServeHTTP(recorder, r)

		elapsed := time.Since(began).Seconds()
		code := strconv.Itoa(recorder.statusCode)
		// Normalized so that per-resource IDs do not each create a new series.
		route := normalizePath(r.URL.Path)

		requestDuration.WithLabelValues(r.Method, route, code).Observe(elapsed)
		requestsTotal.WithLabelValues(r.Method, route, code).Inc()
	}
	return http.HandlerFunc(instrumented)
}
// responseWriter wraps an http.ResponseWriter so that Middleware can read back
// the status code a handler sent.
//
// statusCode is the status to report; callers initialize it to the status that
// applies when the handler never calls WriteHeader. wroteHeader becomes true
// once the final response header has been committed, after which statusCode
// no longer changes.
type responseWriter struct {
	http.ResponseWriter
	statusCode  int
	wroteHeader bool
}

// WriteHeader remembers the first final status code and forwards the call.
//
// Informational 1xx codes (other than 101 Switching Protocols) may be followed
// by the real status, so they are forwarded but not recorded. Later calls are
// still forwarded to the underlying writer, but they no longer overwrite the
// recorded status, because only the first final header reaches the client.
func (rw *responseWriter) WriteHeader(code int) {
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !rw.wroteHeader && !informational {
		rw.statusCode = code
		rw.wroteHeader = true
	}
	rw.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the underlying writer. Writing a body commits the
// response header, so the recorded status is frozen from this point on.
func (rw *responseWriter) Write(b []byte) (int, error) {
	rw.wroteHeader = true
	return rw.ResponseWriter.Write(b)
}

// Flush implements http.Flusher by delegating to the underlying writer when it
// supports flushing, and is a no-op otherwise. Without this method, handlers
// that stream (for example server-sent events) would see a writer that cannot
// flush as soon as they run behind Middleware.
func (rw *responseWriter) Flush() {
	if flusher, ok := rw.ResponseWriter.(http.Flusher); ok {
		// Flushing sends the header if it has not been sent yet.
		rw.wroteHeader = true
		flusher.Flush()
	}
}

// Unwrap returns the wrapped writer so that http.ResponseController can reach
// optional interfaces of the original http.ResponseWriter.
func (rw *responseWriter) Unwrap() http.ResponseWriter {
	return rw.ResponseWriter
}
// pathNormalizers lists the rewrite rules used by normalizePath. Each rule is
// an anchored pattern that must match the whole path, plus a replacement
// template in which "$1" stands for the rule's first capture group.
//
// Rules are tried top to bottom and the first match wins, so a more specific
// rule has to appear before a more general one that would also match.
var pathNormalizers = []struct {
	pattern *regexp.Regexp
	replace string
}{
	// A single segment below /keys is treated as the key ID.
	{
		pattern: regexp.MustCompile(`^/keys/[^/]+$`),
		replace: "/keys/{id}",
	},
	// The segment after /workers is the worker ID; any remaining sub-path is
	// kept as-is.
	{
		pattern: regexp.MustCompile(`^/workers/[^/]+(/.*)?$`),
		replace: "/workers/{id}$1",
	},
	// A single segment below /builds is treated as the build ID. Deeper paths
	// under /builds are not covered by this rule.
	{
		pattern: regexp.MustCompile(`^/builds/[^/]+$`),
		replace: "/builds/{id}",
	},
	// claude-config entries: the project ID and the trailing entry name are
	// replaced, while the entry kind (commands, skills or agents) is kept.
	{
		pattern: regexp.MustCompile(`^/projects/[^/]+/claude-config/(commands|skills|agents)/[^/]+$`),
		replace: "/projects/{id}/claude-config/$1/{name}",
	},
	// Everything else under /projects: only the project ID is replaced and the
	// sub-path is kept as-is. Most general rule, so it comes last.
	{
		pattern: regexp.MustCompile(`^/projects/[^/]+(/.*)?$`),
		replace: "/projects/{id}$1",
	},
}

// normalizePath maps a request path to the value used for the "path" metric
// label, replacing variable segments with placeholders so that individual
// resource IDs do not each produce their own time series.
//
// The first rule in pathNormalizers whose pattern matches decides the result.
// A path that matches no rule is returned unchanged.
func normalizePath(path string) string {
	for i := range pathNormalizers {
		rule := &pathNormalizers[i]
		groups := rule.pattern.FindStringSubmatchIndex(path)
		if groups == nil {
			continue
		}
		// Every pattern is anchored at both ends, so a match spans the whole
		// path and the expanded template is the complete result.
		return string(rule.pattern.ExpandString(nil, rule.replace, path, groups))
	}
	return path
}