rdev/internal/telemetry/telemetry.go
jordan bc47e426b0 feat: Add CI pipeline proxy, DNS alias management, and worker executor system
- Add ListPipelines/GetPipeline to CIProvider port with Woodpecker adapter
- Add DNS alias endpoints: GET/POST/DELETE /projects/{id}/domains
- Implement worker executor daemon, build executor, and git operations
- Add build service, worker service, and build audit tracking
- Add worker registry with PostgreSQL adapter and migration
- Add multi-provider code agent interface (Claude Code + OpenCode)
- Add create-and-build combo endpoint
- Update landing-page cookbook to reflect all gaps closed
- Fix tech debt: unified validation, auth scopes, error wrapping, slog patterns

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-27 21:05:28 -07:00

230 lines
6.6 KiB
Go

// Package telemetry provides OpenTelemetry integration for the rdev API.
//
// It initializes a tracer provider with OTLP exporter for distributed tracing.
// Traces are exported to an OpenTelemetry collector (e.g., otel-collector in k8s).
//
// Configuration via environment variables:
// - OTEL_EXPORTER_OTLP_ENDPOINT: Collector endpoint (default: otel-collector.observability.svc:4317)
// - OTEL_SERVICE_NAME: Service name for traces (default: rdev-api)
// - OTEL_SERVICE_VERSION: Service version (default: unknown)
// - OTEL_SERVICE_NAMESPACE: Namespace (default: rdev)
// - OTEL_ENABLED: Enable/disable telemetry (default: true)
// - ENVIRONMENT: Value of the deployment.environment resource attribute (default: production)
package telemetry
import (
"context"
"fmt"
"log/slog"
"os"
"strings"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
"go.opentelemetry.io/otel/trace"
"go.opentelemetry.io/otel/trace/noop"
)
// Config holds telemetry configuration.
//
// The "Default" noted on each field is the value DefaultConfig fills in; a
// zero-value Config has empty strings, false booleans and a nil Logger.
type Config struct {
// Endpoint is the OTLP collector endpoint (gRPC).
// Default: otel-collector.observability.svc:4317
// (overridable through OTEL_EXPORTER_OTLP_ENDPOINT).
Endpoint string
// ServiceName identifies this service in traces.
// Default: rdev-api (overridable through OTEL_SERVICE_NAME).
ServiceName string
// ServiceVersion is the version of this service.
// Default: unknown (overridable through OTEL_SERVICE_VERSION).
ServiceVersion string
// ServiceNamespace groups related services.
// Default: rdev (overridable through OTEL_SERVICE_NAMESPACE).
ServiceNamespace string
// Enabled controls whether telemetry is active. When false, New creates no
// exporter and the resulting Telemetry uses a no-op tracer.
// Default: true (overridable through OTEL_ENABLED).
Enabled bool
// Insecure disables TLS for the gRPC connection.
// Default: true (for internal k8s communication). DefaultConfig always sets
// it to true; no environment variable is consulted for it.
Insecure bool
// Logger for telemetry initialization messages.
// When nil, New substitutes an info-level text logger writing to stdout.
Logger *slog.Logger
}
// DefaultConfig builds a Config from the process environment, substituting the
// built-in default for every variable that is unset or empty.
//
// Insecure is unconditionally true and Logger is left nil.
func DefaultConfig() Config {
	var cfg Config

	cfg.Endpoint = getEnv("OTEL_EXPORTER_OTLP_ENDPOINT", "otel-collector.observability.svc:4317")
	cfg.ServiceName = getEnv("OTEL_SERVICE_NAME", "rdev-api")
	cfg.ServiceVersion = getEnv("OTEL_SERVICE_VERSION", "unknown")
	cfg.ServiceNamespace = getEnv("OTEL_SERVICE_NAMESPACE", "rdev")
	cfg.Enabled = getEnvBool("OTEL_ENABLED", true)
	cfg.Insecure = true

	return cfg
}
// Telemetry manages OpenTelemetry resources.
//
// Instances are created by New; the fields are populated there and read by
// the methods on this type.
type Telemetry struct {
// config is the Config that New was called with.
config Config
// tracerProvider is the SDK provider built by New. It stays nil when
// telemetry is disabled, which Shutdown treats as "nothing to do".
tracerProvider *sdktrace.TracerProvider
// tracer is what Tracer and StartSpan use: obtained from tracerProvider
// when telemetry is enabled, from a no-op provider otherwise.
tracer trace.Tracer
// logger receives the initialization and shutdown messages.
logger *slog.Logger
}
// New creates and initializes a new Telemetry instance.
//
// When cfg.Enabled is false no exporter is created: the returned Telemetry
// uses a no-op tracer and the global OpenTelemetry state is left untouched.
//
// Otherwise New builds an OTLP/gRPC exporter for cfg.Endpoint, wraps it in a
// batching tracer provider that samples every span, registers that provider
// as the global one, and installs W3C Trace Context + Baggage propagation.
//
// cfg.Endpoint may be a bare "host:port" or carry an "http://" / "https://"
// scheme (the form OTEL_EXPORTER_OTLP_ENDPOINT is commonly given in). A scheme
// is stripped before dialing and decides TLS: "http" means plaintext, "https"
// means TLS. Without a scheme, cfg.Insecure decides.
//
// If cfg.Logger is nil, an info-level text logger on stdout is used.
//
// An error is returned only when the exporter cannot be created.
// Call Shutdown() when done to flush pending traces.
func New(ctx context.Context, cfg Config) (*Telemetry, error) {
	logger := cfg.Logger
	if logger == nil {
		logger = slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
			Level: slog.LevelInfo,
		}))
	}

	t := &Telemetry{
		config: cfg,
		logger: logger,
	}

	if !cfg.Enabled {
		logger.Info("telemetry disabled, using noop tracer")
		t.tracer = noop.NewTracerProvider().Tracer(cfg.ServiceName)
		return t, nil
	}

	// WithEndpoint wants host:port, so peel off any URL scheme first and let
	// it pick the transport security.
	endpoint, insecure := splitEndpointScheme(cfg.Endpoint, cfg.Insecure)

	// Create OTLP exporter
	opts := []otlptracegrpc.Option{
		otlptracegrpc.WithEndpoint(endpoint),
	}
	if insecure {
		opts = append(opts, otlptracegrpc.WithInsecure())
	}

	exporter, err := otlptracegrpc.New(ctx, opts...)
	if err != nil {
		return nil, fmt.Errorf("failed to create OTLP exporter: %w", err)
	}

	// Create resource with service information
	// Note: We create a new resource instead of merging with Default() to avoid
	// schema URL conflicts between different semconv versions
	res := resource.NewWithAttributes(
		semconv.SchemaURL,
		semconv.ServiceName(cfg.ServiceName),
		semconv.ServiceVersion(cfg.ServiceVersion),
		semconv.ServiceNamespace(cfg.ServiceNamespace),
		attribute.String("deployment.environment", getEnv("ENVIRONMENT", "production")),
	)

	// Create tracer provider with batch span processor
	tp := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(exporter,
			sdktrace.WithBatchTimeout(5*time.Second),
			sdktrace.WithMaxExportBatchSize(512),
		),
		sdktrace.WithResource(res),
		sdktrace.WithSampler(sdktrace.AlwaysSample()),
	)

	// Set as global tracer provider
	otel.SetTracerProvider(tp)

	// Set up propagation (W3C Trace Context + Baggage)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	t.tracerProvider = tp
	t.tracer = tp.Tracer(cfg.ServiceName)

	logger.Info("telemetry initialized",
		"endpoint", endpoint,
		"service", cfg.ServiceName,
		"version", cfg.ServiceVersion,
		"namespace", cfg.ServiceNamespace,
	)

	return t, nil
}

// splitEndpointScheme turns a collector endpoint into the host:port form and
// reports whether the connection should skip TLS.
//
// A leading "http://" yields insecure=true and "https://" yields
// insecure=false, matched case-insensitively; a single trailing "/" is
// dropped. An endpoint without either scheme is returned trimmed of
// surrounding whitespace together with the caller's insecure value.
func splitEndpointScheme(endpoint string, insecure bool) (string, bool) {
	endpoint = strings.TrimSpace(endpoint)

	const plain, secure = "http://", "https://"
	switch {
	case len(endpoint) >= len(secure) && strings.EqualFold(endpoint[:len(secure)], secure):
		return strings.TrimSuffix(endpoint[len(secure):], "/"), false
	case len(endpoint) >= len(plain) && strings.EqualFold(endpoint[:len(plain)], plain):
		return strings.TrimSuffix(endpoint[len(plain):], "/"), true
	}
	return endpoint, insecure
}
// Tracer hands back the tracer stored on t, for callers that want to create
// spans themselves.
func (t *Telemetry) Tracer() trace.Tracer {
	tracer := t.tracer
	return tracer
}
// Shutdown stops the tracer provider created by New; intended to be called
// while the application is shutting down.
//
// It is a no-op returning nil when no provider exists (telemetry disabled).
// If ctx carries no deadline, the provider shutdown is bounded to 10 seconds.
// A provider failure is returned wrapped.
func (t *Telemetry) Shutdown(ctx context.Context) error {
	provider := t.tracerProvider
	if provider == nil {
		return nil
	}

	t.logger.Info("shutting down telemetry")

	// Only impose our own limit when the caller has not set one.
	shutdownCtx := ctx
	if _, bounded := ctx.Deadline(); !bounded {
		timeoutCtx, stop := context.WithTimeout(ctx, 10*time.Second)
		defer stop()
		shutdownCtx = timeoutCtx
	}

	err := provider.Shutdown(shutdownCtx)
	if err != nil {
		return fmt.Errorf("telemetry shutdown failed: %w", err)
	}

	t.logger.Info("telemetry shutdown complete")
	return nil
}
// StartSpan begins a span called name using the tracer held by t, forwarding
// opts unchanged. It returns the context and span produced by the tracer.
func (t *Telemetry) StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
	spanCtx, span := t.tracer.Start(ctx, name, opts...)
	return spanCtx, span
}
// AddSpanEvent attaches an event called name, carrying attrs, to the span
// found in ctx.
func AddSpanEvent(ctx context.Context, name string, attrs ...attribute.KeyValue) {
	eventAttrs := trace.WithAttributes(attrs...)
	trace.SpanFromContext(ctx).AddEvent(name, eventAttrs)
}
// SetSpanError records err on the span found in ctx via RecordError.
// It makes no other change to the span.
func SetSpanError(ctx context.Context, err error) {
	trace.SpanFromContext(ctx).RecordError(err)
}
// SetSpanAttributes applies attrs to the span found in ctx.
func SetSpanAttributes(ctx context.Context, attrs ...attribute.KeyValue) {
	trace.SpanFromContext(ctx).SetAttributes(attrs...)
}
// getEnv looks up key in the environment and falls back to defaultVal when
// the variable is unset or set to the empty string.
func getEnv(key, defaultVal string) string {
	value := os.Getenv(key)
	if value == "" {
		return defaultVal
	}
	return value
}
// getEnvBool reads the environment variable key as a boolean.
//
// Surrounding whitespace is ignored, so values such as "true\n" or " 1 "
// are read as intended. If the variable is unset, empty, or blank,
// defaultVal is returned. Otherwise the result is true for "true", "1" or
// "yes" (case-insensitive) and false for any other value.
func getEnvBool(key string, defaultVal bool) bool {
	v := strings.TrimSpace(os.Getenv(key))
	if v == "" {
		return defaultVal
	}

	switch strings.ToLower(v) {
	case "true", "1", "yes":
		return true
	}
	return false
}