- Add /diagnostics endpoint for system health overview - Add external health worker for monitoring Gitea, Woodpecker, Registry - Add health check methods to Gitea and Woodpecker clients - Remove hardcoded fallback projects (pantheon, aeries) - Add diagnostics domain types and service layer - Add comprehensive tests for diagnostics handler and service - Fix tests to use registered test project instead of hardcoded one Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
341 lines
11 KiB
Go
341 lines
11 KiB
Go
// Package metrics provides Prometheus metrics for the rdev API.
|
|
package metrics
|
|
|
|
import (
|
|
"net/http"
|
|
"regexp"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
)
|
|
|
|
var (
|
|
// Commands
|
|
commandsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_commands_total",
|
|
Help: "Total number of commands executed",
|
|
}, []string{"project", "type", "status"})
|
|
|
|
commandDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
|
|
Name: "rdev_command_duration_seconds",
|
|
Help: "Duration of command execution in seconds",
|
|
Buckets: prometheus.ExponentialBuckets(0.1, 2, 15), // 0.1s to ~27min
|
|
}, []string{"project", "type"})
|
|
|
|
// Code Agents
|
|
agentRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_agent_requests_total",
|
|
Help: "Total number of code agent requests",
|
|
}, []string{"provider", "status"})
|
|
|
|
agentRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
|
|
Name: "rdev_agent_request_duration_seconds",
|
|
Help: "Duration of code agent requests in seconds",
|
|
Buckets: prometheus.ExponentialBuckets(0.1, 2, 15), // 0.1s to ~27min
|
|
}, []string{"provider"})
|
|
|
|
agentToolUse = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_agent_tool_use_total",
|
|
Help: "Total number of tool invocations by code agents",
|
|
}, []string{"provider", "tool"})
|
|
|
|
agentAvailability = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "rdev_agent_available",
|
|
Help: "Whether the code agent is available (1) or not (0)",
|
|
}, []string{"provider"})
|
|
|
|
// Worker Pool
|
|
workersTotal = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "rdev_workers_total",
|
|
Help: "Number of registered workers by status",
|
|
}, []string{"status"})
|
|
|
|
workerHeartbeatAge = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "rdev_worker_heartbeat_age_seconds",
|
|
Help: "Age of the most recent worker heartbeat in seconds",
|
|
}, []string{"worker_id"})
|
|
|
|
// Builds
|
|
buildsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_builds_total",
|
|
Help: "Total number of build tasks by status",
|
|
}, []string{"project", "status"})
|
|
|
|
buildDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
|
|
Name: "rdev_build_duration_seconds",
|
|
Help: "Duration of build executions in seconds",
|
|
Buckets: prometheus.ExponentialBuckets(1, 2, 12), // 1s to ~34min
|
|
}, []string{"project"})
|
|
|
|
// Work Queue
|
|
workQueueDepth = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "rdev_work_queue_depth",
|
|
Help: "Number of tasks in the work queue by status",
|
|
}, []string{"status"})
|
|
|
|
// Streams
|
|
activeStreams = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "rdev_active_streams",
|
|
Help: "Number of active SSE streams",
|
|
}, []string{"project"})
|
|
|
|
streamReconnects = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_stream_reconnects_total",
|
|
Help: "Total number of SSE stream reconnections",
|
|
}, []string{"project"})
|
|
|
|
// Build Events (SSE streaming)
|
|
buildEventsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_build_events_total",
|
|
Help: "Total number of build events published",
|
|
}, []string{"type"})
|
|
|
|
buildEventSubscribers = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "rdev_build_event_subscribers",
|
|
Help: "Number of active build event subscribers",
|
|
}, []string{"task_id"})
|
|
|
|
buildEventBufferSize = promauto.NewGauge(prometheus.GaugeOpts{
|
|
Name: "rdev_build_event_buffer_size",
|
|
Help: "Total number of events in replay buffers",
|
|
})
|
|
|
|
// Authentication
|
|
authFailures = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_auth_failures_total",
|
|
Help: "Total number of authentication failures",
|
|
}, []string{"reason"})
|
|
|
|
// API Requests
|
|
requestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
|
|
Name: "rdev_api_request_duration_seconds",
|
|
Help: "Duration of API requests in seconds",
|
|
Buckets: prometheus.DefBuckets,
|
|
}, []string{"method", "path", "status"})
|
|
|
|
requestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_api_requests_total",
|
|
Help: "Total number of API requests",
|
|
}, []string{"method", "path", "status"})
|
|
|
|
// Registry health
|
|
registryHealthy = promauto.NewGauge(prometheus.GaugeOpts{
|
|
Name: "rdev_registry_healthy",
|
|
Help: "Whether the container registry is healthy (1) or not (0)",
|
|
})
|
|
|
|
registryLatency = promauto.NewGauge(prometheus.GaugeOpts{
|
|
Name: "rdev_registry_latency_seconds",
|
|
Help: "Latency of registry health check in seconds",
|
|
})
|
|
|
|
// CI builds
|
|
ciBuildsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_ci_builds_total",
|
|
Help: "Total number of CI builds by project and status",
|
|
}, []string{"project", "status"})
|
|
|
|
ciPushFailures = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "rdev_ci_push_failures_total",
|
|
Help: "Total number of CI image push failures by project",
|
|
}, []string{"project"})
|
|
|
|
// External system health
|
|
externalSystemHealthy = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "rdev_external_system_healthy",
|
|
Help: "Whether external system is healthy (1) or not (0)",
|
|
}, []string{"system"})
|
|
|
|
externalSystemLatency = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "rdev_external_system_latency_seconds",
|
|
Help: "Latency of external system health check in seconds",
|
|
}, []string{"system"})
|
|
|
|
externalSystemLastCheck = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "rdev_external_system_last_check_timestamp",
|
|
Help: "Unix timestamp of last health check",
|
|
}, []string{"system"})
|
|
)
|
|
|
|
// RecordCommand records a command execution.
|
|
func RecordCommand(project, cmdType, status string, durationMs int64) {
|
|
commandsTotal.WithLabelValues(project, cmdType, status).Inc()
|
|
commandDuration.WithLabelValues(project, cmdType).Observe(float64(durationMs) / 1000.0)
|
|
}
|
|
|
|
// IncActiveStreams increments the active stream count for a project.
|
|
func IncActiveStreams(project string) {
|
|
activeStreams.WithLabelValues(project).Inc()
|
|
}
|
|
|
|
// DecActiveStreams decrements the active stream count for a project.
|
|
func DecActiveStreams(project string) {
|
|
activeStreams.WithLabelValues(project).Dec()
|
|
}
|
|
|
|
// RecordStreamReconnect records a stream reconnection.
|
|
func RecordStreamReconnect(project string) {
|
|
streamReconnects.WithLabelValues(project).Inc()
|
|
}
|
|
|
|
// RecordBuildEvent records a build event publication.
|
|
func RecordBuildEvent(eventType string) {
|
|
buildEventsTotal.WithLabelValues(eventType).Inc()
|
|
}
|
|
|
|
// SetBuildEventSubscribers sets the number of subscribers for a build stream.
|
|
func SetBuildEventSubscribers(taskID string, count int) {
|
|
buildEventSubscribers.WithLabelValues(taskID).Set(float64(count))
|
|
}
|
|
|
|
// SetBuildEventBufferSize sets the total buffer size for event replay.
|
|
func SetBuildEventBufferSize(size int64) {
|
|
buildEventBufferSize.Set(float64(size))
|
|
}
|
|
|
|
// RecordAuthFailure records an authentication failure.
|
|
func RecordAuthFailure(reason string) {
|
|
authFailures.WithLabelValues(reason).Inc()
|
|
}
|
|
|
|
// RecordAgentRequest records a code agent request execution.
|
|
func RecordAgentRequest(provider, status string, durationMs int64) {
|
|
agentRequestsTotal.WithLabelValues(provider, status).Inc()
|
|
agentRequestDuration.WithLabelValues(provider).Observe(float64(durationMs) / 1000.0)
|
|
}
|
|
|
|
// RecordAgentToolUse records a tool invocation by a code agent.
|
|
func RecordAgentToolUse(provider, tool string) {
|
|
agentToolUse.WithLabelValues(provider, tool).Inc()
|
|
}
|
|
|
|
// SetAgentAvailability sets the availability status of a code agent.
|
|
func SetAgentAvailability(provider string, available bool) {
|
|
val := 0.0
|
|
if available {
|
|
val = 1.0
|
|
}
|
|
agentAvailability.WithLabelValues(provider).Set(val)
|
|
}
|
|
|
|
// SetWorkerCount sets the number of workers for a given status.
|
|
func SetWorkerCount(status string, count int) {
|
|
workersTotal.WithLabelValues(status).Set(float64(count))
|
|
}
|
|
|
|
// RecordWorkerHeartbeat sets the age of a worker's most recent heartbeat.
|
|
func RecordWorkerHeartbeat(workerID string, ageSeconds float64) {
|
|
workerHeartbeatAge.WithLabelValues(workerID).Set(ageSeconds)
|
|
}
|
|
|
|
// RecordBuild records a build task completion.
|
|
func RecordBuild(project, status string, durationMs int64) {
|
|
buildsTotal.WithLabelValues(project, status).Inc()
|
|
if durationMs > 0 {
|
|
buildDuration.WithLabelValues(project).Observe(float64(durationMs) / 1000.0)
|
|
}
|
|
}
|
|
|
|
// SetWorkQueueDepth sets the current depth of the work queue for a status.
|
|
func SetWorkQueueDepth(status string, count int64) {
|
|
workQueueDepth.WithLabelValues(status).Set(float64(count))
|
|
}
|
|
|
|
// SetRegistryHealth sets the registry health status.
|
|
func SetRegistryHealth(healthy bool, latencySeconds float64) {
|
|
val := 0.0
|
|
if healthy {
|
|
val = 1.0
|
|
}
|
|
registryHealthy.Set(val)
|
|
registryLatency.Set(latencySeconds)
|
|
}
|
|
|
|
// RecordCIBuild records a CI build event.
|
|
func RecordCIBuild(project, status string) {
|
|
ciBuildsTotal.WithLabelValues(project, status).Inc()
|
|
}
|
|
|
|
// RecordCIPushFailure records a CI image push failure.
|
|
func RecordCIPushFailure(project string) {
|
|
ciPushFailures.WithLabelValues(project).Inc()
|
|
}
|
|
|
|
// SetExternalSystemHealth updates the health metrics for an external system.
|
|
func SetExternalSystemHealth(system string, healthy bool, latencySeconds float64) {
|
|
val := 0.0
|
|
if healthy {
|
|
val = 1.0
|
|
}
|
|
externalSystemHealthy.WithLabelValues(system).Set(val)
|
|
externalSystemLatency.WithLabelValues(system).Set(latencySeconds)
|
|
externalSystemLastCheck.WithLabelValues(system).Set(float64(time.Now().Unix()))
|
|
}
|
|
|
|
// Handler returns the Prometheus HTTP handler.
|
|
func Handler() http.Handler {
|
|
return promhttp.Handler()
|
|
}
|
|
|
|
// Middleware returns an HTTP middleware that records request metrics.
|
|
func Middleware(next http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
start := time.Now()
|
|
|
|
// Wrap the response writer to capture status code
|
|
rw := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
|
|
|
|
next.ServeHTTP(rw, r)
|
|
|
|
duration := time.Since(start).Seconds()
|
|
status := strconv.Itoa(rw.statusCode)
|
|
path := normalizePath(r.URL.Path)
|
|
|
|
requestDuration.WithLabelValues(r.Method, path, status).Observe(duration)
|
|
requestsTotal.WithLabelValues(r.Method, path, status).Inc()
|
|
})
|
|
}
|
|
|
|
// responseWriter wraps http.ResponseWriter to capture the status code that is
// actually sent to the client.
//
// statusCode should be initialised by the creator to the implicit default
// (http.StatusOK); it is overwritten by the first final WriteHeader call.
type responseWriter struct {
	http.ResponseWriter
	statusCode int
	// wroteHeader is true once the final status has been committed, either
	// explicitly via WriteHeader or implicitly via Write/Flush.
	wroteHeader bool
}

// WriteHeader forwards code to the wrapped writer and records it as the
// response status if no final status has been committed yet.
//
// net/http honours only the first final WriteHeader call, so later calls must
// not change the recorded status. Informational 1xx codes (other than 101
// Switching Protocols) may precede the final status and are never recorded.
func (rw *responseWriter) WriteHeader(code int) {
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !rw.wroteHeader && !informational {
		rw.statusCode = code
		rw.wroteHeader = true
	}
	rw.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the wrapped writer. The first Write commits the pending
// status implicitly, so a WriteHeader call arriving afterwards is ignored for
// recording purposes.
func (rw *responseWriter) Write(b []byte) (int, error) {
	rw.wroteHeader = true
	return rw.ResponseWriter.Write(b)
}

// Flush implements http.Flusher so that streaming handlers (for example SSE)
// keep working behind the wrapper. It is a no-op when the wrapped writer does
// not support flushing.
func (rw *responseWriter) Flush() {
	f, ok := rw.ResponseWriter.(http.Flusher)
	if !ok {
		return
	}
	// Flushing sends the headers, which commits the status.
	rw.wroteHeader = true
	f.Flush()
}

// Unwrap returns the wrapped writer so http.ResponseController can reach
// optional interfaces of the underlying ResponseWriter.
func (rw *responseWriter) Unwrap() http.ResponseWriter {
	return rw.ResponseWriter
}
|
|
|
|
// pathNormalizers is the ordered rewrite table used by normalizePath.
// Each entry pairs an anchored pattern with the replacement template applied
// to paths it matches. The first matching entry wins, so specific patterns
// must be listed before general ones.
var pathNormalizers = []struct {
	pattern *regexp.Regexp
	replace string
}{
	{
		// A single segment below /keys is replaced by {id}.
		pattern: regexp.MustCompile(`^/keys/[^/]+$`),
		replace: "/keys/{id}",
	},
	{
		// The worker segment is replaced by {id}; any sub-path is kept as is.
		pattern: regexp.MustCompile(`^/workers/[^/]+(/.*)?$`),
		replace: "/workers/{id}$1",
	},
	{
		// A single segment below /builds is replaced by {id}.
		pattern: regexp.MustCompile(`^/builds/[^/]+$`),
		replace: "/builds/{id}",
	},
	{
		// Claude config entries: the project and entry name are replaced,
		// while the kind (commands, skills or agents) is kept.
		pattern: regexp.MustCompile(`^/projects/[^/]+/claude-config/(commands|skills|agents)/[^/]+$`),
		replace: "/projects/{id}/claude-config/$1/{name}",
	},
	{
		// Any other project path: most general rule, so it has to come last.
		pattern: regexp.MustCompile(`^/projects/[^/]+(/.*)?$`),
		replace: "/projects/{id}$1",
	},
}

// normalizePath maps a concrete URL path onto a stable metric label by
// replacing variable segments with placeholders, which prevents a cardinality
// explosion. Paths matching no entry of pathNormalizers are returned unchanged.
func normalizePath(path string) string {
	for i := range pathNormalizers {
		rule := &pathNormalizers[i]
		if !rule.pattern.MatchString(path) {
			continue
		}
		return rule.pattern.ReplaceAllString(path, rule.replace)
	}
	return path
}
|