// Package metrics provides Prometheus metrics for the rdev API. package metrics import ( "net/http" "regexp" "strconv" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/client_golang/prometheus/promhttp" ) var ( // Commands commandsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_commands_total", Help: "Total number of commands executed", }, []string{"project", "type", "status"}) commandDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "rdev_command_duration_seconds", Help: "Duration of command execution in seconds", Buckets: prometheus.ExponentialBuckets(0.1, 2, 15), // 0.1s to ~27min }, []string{"project", "type"}) // Code Agents agentRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_agent_requests_total", Help: "Total number of code agent requests", }, []string{"provider", "status"}) agentRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "rdev_agent_request_duration_seconds", Help: "Duration of code agent requests in seconds", Buckets: prometheus.ExponentialBuckets(0.1, 2, 15), // 0.1s to ~27min }, []string{"provider"}) agentToolUse = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_agent_tool_use_total", Help: "Total number of tool invocations by code agents", }, []string{"provider", "tool"}) agentAvailability = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_agent_available", Help: "Whether the code agent is available (1) or not (0)", }, []string{"provider"}) // Worker Pool workersTotal = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_workers_total", Help: "Number of registered workers by status", }, []string{"status"}) workerHeartbeatAge = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_worker_heartbeat_age_seconds", Help: "Age of the most recent worker heartbeat in seconds", }, []string{"worker_id"}) // Builds buildsTotal = 
promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_builds_total", Help: "Total number of build tasks by status", }, []string{"project", "status"}) buildDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "rdev_build_duration_seconds", Help: "Duration of build executions in seconds", Buckets: prometheus.ExponentialBuckets(1, 2, 12), // 1s to ~34min }, []string{"project"}) // Work Queue workQueueDepth = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_work_queue_depth", Help: "Number of tasks in the work queue by status", }, []string{"status"}) // Streams activeStreams = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_active_streams", Help: "Number of active SSE streams", }, []string{"project"}) streamReconnects = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_stream_reconnects_total", Help: "Total number of SSE stream reconnections", }, []string{"project"}) // Build Events (SSE streaming) buildEventsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_build_events_total", Help: "Total number of build events published", }, []string{"type"}) buildEventSubscribers = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_build_event_subscribers", Help: "Number of active build event subscribers", }, []string{"task_id"}) buildEventBufferSize = promauto.NewGauge(prometheus.GaugeOpts{ Name: "rdev_build_event_buffer_size", Help: "Total number of events in replay buffers", }) // Authentication authFailures = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_auth_failures_total", Help: "Total number of authentication failures", }, []string{"reason"}) // API Requests requestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "rdev_api_request_duration_seconds", Help: "Duration of API requests in seconds", Buckets: prometheus.DefBuckets, }, []string{"method", "path", "status"}) requestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_api_requests_total", Help: "Total number 
of API requests", }, []string{"method", "path", "status"}) // Registry health registryHealthy = promauto.NewGauge(prometheus.GaugeOpts{ Name: "rdev_registry_healthy", Help: "Whether the container registry is healthy (1) or not (0)", }) registryLatency = promauto.NewGauge(prometheus.GaugeOpts{ Name: "rdev_registry_latency_seconds", Help: "Latency of registry health check in seconds", }) // CI builds ciBuildsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_ci_builds_total", Help: "Total number of CI builds by project and status", }, []string{"project", "status"}) ciPushFailures = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_ci_push_failures_total", Help: "Total number of CI image push failures by project", }, []string{"project"}) // External system health externalSystemHealthy = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_external_system_healthy", Help: "Whether external system is healthy (1) or not (0)", }, []string{"system"}) externalSystemLatency = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_external_system_latency_seconds", Help: "Latency of external system health check in seconds", }, []string{"system"}) externalSystemLastCheck = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_external_system_last_check_timestamp", Help: "Unix timestamp of last health check", }, []string{"system"}) // Saga metrics sagaTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_saga_total", Help: "Total number of sagas by name and final status", }, []string{"name", "status"}) sagaStepDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "rdev_saga_step_duration_seconds", Help: "Duration of saga step execution in seconds", Buckets: prometheus.ExponentialBuckets(0.1, 2, 15), // 0.1s to ~27min }, []string{"saga", "step", "action"}) sagaRetryTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "rdev_saga_retry_total", Help: "Total number of saga step retries", }, []string{"saga", "step"}) 
circuitBreakerState = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "rdev_circuit_breaker_state", Help: "Circuit breaker state: 0=closed, 1=half-open, 2=open", }, []string{"name"}) ) // RecordCommand records a command execution. func RecordCommand(project, cmdType, status string, durationMs int64) { commandsTotal.WithLabelValues(project, cmdType, status).Inc() commandDuration.WithLabelValues(project, cmdType).Observe(float64(durationMs) / 1000.0) } // IncActiveStreams increments the active stream count for a project. func IncActiveStreams(project string) { activeStreams.WithLabelValues(project).Inc() } // DecActiveStreams decrements the active stream count for a project. func DecActiveStreams(project string) { activeStreams.WithLabelValues(project).Dec() } // RecordStreamReconnect records a stream reconnection. func RecordStreamReconnect(project string) { streamReconnects.WithLabelValues(project).Inc() } // RecordBuildEvent records a build event publication. func RecordBuildEvent(eventType string) { buildEventsTotal.WithLabelValues(eventType).Inc() } // SetBuildEventSubscribers sets the number of subscribers for a build stream. func SetBuildEventSubscribers(taskID string, count int) { buildEventSubscribers.WithLabelValues(taskID).Set(float64(count)) } // SetBuildEventBufferSize sets the total buffer size for event replay. func SetBuildEventBufferSize(size int64) { buildEventBufferSize.Set(float64(size)) } // RecordAuthFailure records an authentication failure. func RecordAuthFailure(reason string) { authFailures.WithLabelValues(reason).Inc() } // RecordAgentRequest records a code agent request execution. func RecordAgentRequest(provider, status string, durationMs int64) { agentRequestsTotal.WithLabelValues(provider, status).Inc() agentRequestDuration.WithLabelValues(provider).Observe(float64(durationMs) / 1000.0) } // RecordAgentToolUse records a tool invocation by a code agent. 
func RecordAgentToolUse(provider, tool string) {
	invocations := agentToolUse.WithLabelValues(provider, tool)
	invocations.Inc()
}

// SetAgentAvailability publishes whether a code agent provider is currently
// available: the gauge is set to 1 when available and 0 otherwise.
func SetAgentAvailability(provider string, available bool) {
	agentAvailability.WithLabelValues(provider).Set(boolToFloat64(available))
}

// boolToFloat64 converts a flag into the 1/0 value used by boolean gauges.
func boolToFloat64(flag bool) float64 {
	if flag {
		return 1.0
	}
	return 0.0
}

// SetWorkerCount publishes how many workers are currently in the given status.
func SetWorkerCount(status string, count int) {
	gauge := workersTotal.WithLabelValues(status)
	gauge.Set(float64(count))
}

// RecordWorkerHeartbeat publishes, for one worker, the age in seconds of its
// most recent heartbeat.
func RecordWorkerHeartbeat(workerID string, ageSeconds float64) {
	gauge := workerHeartbeatAge.WithLabelValues(workerID)
	gauge.Set(ageSeconds)
}

// RecordBuild counts a finished build task for (project, status).
//
// The duration histogram is only updated when durationMs is positive; the
// value is converted from milliseconds to seconds before being observed.
func RecordBuild(project, status string, durationMs int64) {
	buildsTotal.WithLabelValues(project, status).Inc()
	if durationMs <= 0 {
		return
	}
	seconds := float64(durationMs) / 1000.0
	buildDuration.WithLabelValues(project).Observe(seconds)
}

// SetWorkQueueDepth publishes the number of queued tasks in the given status.
func SetWorkQueueDepth(status string, count int64) {
	gauge := workQueueDepth.WithLabelValues(status)
	gauge.Set(float64(count))
}

// SetRegistryHealth publishes the registry health flag (1 healthy, 0 not)
// together with the latency, in seconds, of the health check.
func SetRegistryHealth(healthy bool, latencySeconds float64) {
	registryHealthy.Set(boolToFloat64(healthy))
	registryLatency.Set(latencySeconds)
}

// RecordCIBuild counts one CI build for (project, status).
func RecordCIBuild(project, status string) {
	builds := ciBuildsTotal.WithLabelValues(project, status)
	builds.Inc()
}

// RecordCIPushFailure counts one failed CI image push for the project.
func RecordCIPushFailure(project string) {
	failures := ciPushFailures.WithLabelValues(project)
	failures.Inc()
}

// SetExternalSystemHealth updates the health metrics for an external system.
func SetExternalSystemHealth(system string, healthy bool, latencySeconds float64) { val := 0.0 if healthy { val = 1.0 } externalSystemHealthy.WithLabelValues(system).Set(val) externalSystemLatency.WithLabelValues(system).Set(latencySeconds) externalSystemLastCheck.WithLabelValues(system).Set(float64(time.Now().Unix())) } // RecordSaga records a saga completion. func RecordSaga(name, status string) { sagaTotal.WithLabelValues(name, status).Inc() } // RecordSagaStepDuration records the duration of a saga step. func RecordSagaStepDuration(saga, step, action string, durationMs int64) { sagaStepDuration.WithLabelValues(saga, step, action).Observe(float64(durationMs) / 1000.0) } // RecordSagaRetry records a saga step retry. func RecordSagaRetry(saga, step string) { sagaRetryTotal.WithLabelValues(saga, step).Inc() } // SetCircuitBreakerState sets the circuit breaker state metric. // state: 0=closed, 1=half-open, 2=open func SetCircuitBreakerState(name string, state int) { circuitBreakerState.WithLabelValues(name).Set(float64(state)) } // Handler returns the Prometheus HTTP handler. func Handler() http.Handler { return promhttp.Handler() } // Middleware returns an HTTP middleware that records request metrics. func Middleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { start := time.Now() // Wrap the response writer to capture status code rw := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK} next.ServeHTTP(rw, r) duration := time.Since(start).Seconds() status := strconv.Itoa(rw.statusCode) path := normalizePath(r.URL.Path) requestDuration.WithLabelValues(r.Method, path, status).Observe(duration) requestsTotal.WithLabelValues(r.Method, path, status).Inc() }) } // responseWriter wraps http.ResponseWriter to capture status code. 
type responseWriter struct {
	http.ResponseWriter

	// statusCode is the status recorded for metrics. Callers initialize it
	// (the middleware uses 200); it is then fixed by the first final status.
	statusCode int

	// wroteHeader reports whether the final status has been committed,
	// either explicitly via WriteHeader or implicitly via Write or Flush.
	wroteHeader bool
}

// WriteHeader records the response status and forwards the call.
//
// Only the first final status is recorded: net/http sends the first one and
// ignores later calls, so recording the last call would mislabel the request.
// Informational 1xx codes (except 101 Switching Protocols) may precede the
// final status and are forwarded without being recorded.
func (rw *responseWriter) WriteHeader(code int) {
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !informational && !rw.wroteHeader {
		rw.statusCode = code
		rw.wroteHeader = true
	}
	rw.ResponseWriter.WriteHeader(code)
}

// Write forwards the body bytes. A write before any WriteHeader call commits
// an implicit 200, so a later WriteHeader no longer changes the recorded code.
func (rw *responseWriter) Write(b []byte) (int, error) {
	rw.commitImplicitOK()
	return rw.ResponseWriter.Write(b)
}

// Flush forwards to the wrapped writer when it supports flushing, so that
// streaming handlers (for example SSE) keep working behind the middleware.
// It is a no-op when the wrapped writer is not an http.Flusher.
func (rw *responseWriter) Flush() {
	flusher, ok := rw.ResponseWriter.(http.Flusher)
	if !ok {
		return
	}
	// Flushing before WriteHeader sends the headers with an implicit 200.
	rw.commitImplicitOK()
	flusher.Flush()
}

// Unwrap returns the wrapped writer so http.ResponseController can reach
// optional interfaces that this wrapper does not forward itself.
func (rw *responseWriter) Unwrap() http.ResponseWriter {
	return rw.ResponseWriter
}

// commitImplicitOK marks the response as started with status 200 when no
// final status has been written yet.
func (rw *responseWriter) commitImplicitOK() {
	if rw.wroteHeader {
		return
	}
	rw.statusCode = http.StatusOK
	rw.wroteHeader = true
}

// pathNormalizers contains patterns to normalize variable path segments.
// Order matters - more specific patterns first. Every pattern is anchored to
// the whole path, so a replacement rewrites the complete path.
var pathNormalizers = []struct {
	pattern *regexp.Regexp
	replace string
}{
	// /keys/<id> -> /keys/{id}
	{regexp.MustCompile(`^/keys/[^/]+$`), "/keys/{id}"},
	// /workers/<id>[/...] -> /workers/{id}[/...]
	{regexp.MustCompile(`^/workers/[^/]+(/.*)?$`), "/workers/{id}$1"},
	// /builds/<id> -> /builds/{id}
	{regexp.MustCompile(`^/builds/[^/]+$`), "/builds/{id}"},
	// /projects/<id>/claude-config/<type>/<name> -> /projects/{id}/claude-config/<type>/{name}
	{regexp.MustCompile(`^/projects/[^/]+/claude-config/(commands|skills|agents)/[^/]+$`), "/projects/{id}/claude-config/$1/{name}"},
	// /projects/<id>[/...] (any sub-path) - must be last as it's most general
	{regexp.MustCompile(`^/projects/[^/]+(/.*)?$`), "/projects/{id}$1"},
}

// normalizePath normalizes the URL path for consistent metric labels.
// Replaces variable path segments with placeholders to prevent cardinality explosion.
//
// The first matching entry of pathNormalizers wins. A path that matches no
// entry is returned unchanged, and for the /workers and /projects rules the
// remainder after the ID is copied through verbatim.
func normalizePath(path string) string {
	for _, n := range pathNormalizers {
		if n.pattern.MatchString(path) {
			return n.pattern.ReplaceAllString(path, n.replace)
		}
	}
	return path
}