Major refactoring to hexagonal (ports & adapters) architecture: - Add service layer (apikey_service, project_service) for business logic - Add webhook system with dispatcher and delivery tracking - Add command queue with priority-based processing - Add rate limiting with sliding window algorithm - Add audit logging for command execution - Add OpenTelemetry integration (traces, metrics, spans) - Add circuit breaker for fault tolerance - Add cached repository wrapper for performance - Add comprehensive validation package - Add Kubernetes client integration for pod management - Add database migrations (allowed_ips, audit_log, rate_limiting, queue, webhooks) - Add network policy and PodDisruptionBudget for k8s - Remove legacy executor and projects/registry packages - Untrack secrets.yaml (now managed via envault) - Add coverage.out to .gitignore - Add e2e test infrastructure with docker-compose - Add comprehensive documentation (API, architecture, operations, plans) - Add golangci-lint config and pre-commit hook Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
230 lines
6.6 KiB
Go
230 lines
6.6 KiB
Go
// Package telemetry provides OpenTelemetry integration for the rdev API.
//
// It initializes a tracer provider with an OTLP gRPC exporter for distributed
// tracing. Traces are exported to an OpenTelemetry collector (e.g.,
// otel-collector in k8s).
//
// Configuration via environment variables:
//   - OTEL_EXPORTER_OTLP_ENDPOINT: Collector endpoint (default: otel-collector.observability.svc:4317)
//   - OTEL_SERVICE_NAME: Service name for traces (default: rdev-api)
//   - OTEL_SERVICE_VERSION: Service version (default: unknown)
//   - OTEL_SERVICE_NAMESPACE: Namespace (default: rdev)
//   - OTEL_ENABLED: Enable/disable telemetry; "true", "1" or "yes" (case-insensitive) enable it (default: true)
//   - ENVIRONMENT: Value of the deployment.environment resource attribute (default: production)
|
|
package telemetry
|
|
|
|
import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"os"
	"strings"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
	"go.opentelemetry.io/otel/trace"
	"go.opentelemetry.io/otel/trace/noop"
)
|
|
|
|
// Config carries the settings New uses to set up tracing.
//
// The "default" values mentioned on the fields are the ones DefaultConfig
// fills in; a zero-value Config has none of them applied (in particular
// Enabled and Insecure are false).
type Config struct {
	// Endpoint is the address of the OTLP collector, reached over gRPC.
	// DefaultConfig reads OTEL_EXPORTER_OTLP_ENDPOINT and falls back to
	// otel-collector.observability.svc:4317.
	Endpoint string

	// ServiceName is the name this service reports in its traces.
	// DefaultConfig reads OTEL_SERVICE_NAME and falls back to rdev-api.
	ServiceName string

	// ServiceVersion is the version this service reports in its traces.
	// DefaultConfig reads OTEL_SERVICE_VERSION and falls back to unknown.
	ServiceVersion string

	// ServiceNamespace is the namespace used to group related services.
	// DefaultConfig reads OTEL_SERVICE_NAMESPACE and falls back to rdev.
	ServiceNamespace string

	// Enabled switches telemetry on. When false, New skips the exporter and
	// hands out a no-op tracer.
	// DefaultConfig reads OTEL_ENABLED and falls back to true.
	Enabled bool

	// Insecure disables TLS on the gRPC connection to the collector.
	// DefaultConfig always sets it to true (intended for internal k8s
	// communication).
	Insecure bool

	// Logger receives telemetry initialization and shutdown messages.
	// When nil, New logs to stdout with a text handler at info level.
	Logger *slog.Logger
}
|
|
|
|
// DefaultConfig returns configuration with defaults applied.
|
|
func DefaultConfig() Config {
|
|
return Config{
|
|
Endpoint: getEnv("OTEL_EXPORTER_OTLP_ENDPOINT", "otel-collector.observability.svc:4317"),
|
|
ServiceName: getEnv("OTEL_SERVICE_NAME", "rdev-api"),
|
|
ServiceVersion: getEnv("OTEL_SERVICE_VERSION", "unknown"),
|
|
ServiceNamespace: getEnv("OTEL_SERVICE_NAMESPACE", "rdev"),
|
|
Enabled: getEnvBool("OTEL_ENABLED", true),
|
|
Insecure: true,
|
|
}
|
|
}
|
|
|
|
// Telemetry manages OpenTelemetry resources.
|
|
type Telemetry struct {
|
|
config Config
|
|
tracerProvider *sdktrace.TracerProvider
|
|
tracer trace.Tracer
|
|
logger *slog.Logger
|
|
}
|
|
|
|
// New creates and initializes a new Telemetry instance.
|
|
// Call Shutdown() when done to flush pending traces.
|
|
func New(ctx context.Context, cfg Config) (*Telemetry, error) {
|
|
logger := cfg.Logger
|
|
if logger == nil {
|
|
logger = slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
|
|
Level: slog.LevelInfo,
|
|
}))
|
|
}
|
|
|
|
t := &Telemetry{
|
|
config: cfg,
|
|
logger: logger,
|
|
}
|
|
|
|
if !cfg.Enabled {
|
|
logger.Info("telemetry disabled, using noop tracer")
|
|
t.tracer = noop.NewTracerProvider().Tracer(cfg.ServiceName)
|
|
return t, nil
|
|
}
|
|
|
|
// Create OTLP exporter
|
|
opts := []otlptracegrpc.Option{
|
|
otlptracegrpc.WithEndpoint(cfg.Endpoint),
|
|
}
|
|
if cfg.Insecure {
|
|
opts = append(opts, otlptracegrpc.WithInsecure())
|
|
}
|
|
|
|
exporter, err := otlptracegrpc.New(ctx, opts...)
|
|
if err != nil {
|
|
return nil, errors.New("failed to create OTLP exporter: " + err.Error())
|
|
}
|
|
|
|
// Create resource with service information
|
|
// Note: We create a new resource instead of merging with Default() to avoid
|
|
// schema URL conflicts between different semconv versions
|
|
res := resource.NewWithAttributes(
|
|
semconv.SchemaURL,
|
|
semconv.ServiceName(cfg.ServiceName),
|
|
semconv.ServiceVersion(cfg.ServiceVersion),
|
|
semconv.ServiceNamespace(cfg.ServiceNamespace),
|
|
attribute.String("deployment.environment", getEnv("ENVIRONMENT", "production")),
|
|
)
|
|
|
|
// Create tracer provider with batch span processor
|
|
tp := sdktrace.NewTracerProvider(
|
|
sdktrace.WithBatcher(exporter,
|
|
sdktrace.WithBatchTimeout(5*time.Second),
|
|
sdktrace.WithMaxExportBatchSize(512),
|
|
),
|
|
sdktrace.WithResource(res),
|
|
sdktrace.WithSampler(sdktrace.AlwaysSample()),
|
|
)
|
|
|
|
// Set as global tracer provider
|
|
otel.SetTracerProvider(tp)
|
|
|
|
// Set up propagation (W3C Trace Context + Baggage)
|
|
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
|
|
propagation.TraceContext{},
|
|
propagation.Baggage{},
|
|
))
|
|
|
|
t.tracerProvider = tp
|
|
t.tracer = tp.Tracer(cfg.ServiceName)
|
|
|
|
logger.Info("telemetry initialized",
|
|
"endpoint", cfg.Endpoint,
|
|
"service", cfg.ServiceName,
|
|
"version", cfg.ServiceVersion,
|
|
"namespace", cfg.ServiceNamespace,
|
|
)
|
|
|
|
return t, nil
|
|
}
|
|
|
|
// Tracer returns the tracer for creating spans.
|
|
func (t *Telemetry) Tracer() trace.Tracer {
|
|
return t.tracer
|
|
}
|
|
|
|
// Shutdown gracefully shuts down the telemetry, flushing any pending traces.
|
|
// Should be called during application shutdown.
|
|
func (t *Telemetry) Shutdown(ctx context.Context) error {
|
|
if t.tracerProvider == nil {
|
|
return nil
|
|
}
|
|
|
|
t.logger.Info("shutting down telemetry")
|
|
|
|
// Create a timeout context if none provided
|
|
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
|
|
var cancel context.CancelFunc
|
|
ctx, cancel = context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
}
|
|
|
|
if err := t.tracerProvider.Shutdown(ctx); err != nil {
|
|
return errors.New("telemetry shutdown failed: " + err.Error())
|
|
}
|
|
|
|
t.logger.Info("telemetry shutdown complete")
|
|
return nil
|
|
}
|
|
|
|
// StartSpan starts a new span with the given name.
|
|
// Returns the span and a new context containing the span.
|
|
func (t *Telemetry) StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
|
|
return t.tracer.Start(ctx, name, opts...)
|
|
}
|
|
|
|
// AddSpanEvent adds an event to the current span in the context.
|
|
func AddSpanEvent(ctx context.Context, name string, attrs ...attribute.KeyValue) {
|
|
span := trace.SpanFromContext(ctx)
|
|
span.AddEvent(name, trace.WithAttributes(attrs...))
|
|
}
|
|
|
|
// SetSpanError records an error on the current span.
|
|
func SetSpanError(ctx context.Context, err error) {
|
|
span := trace.SpanFromContext(ctx)
|
|
span.RecordError(err)
|
|
}
|
|
|
|
// SetSpanAttributes sets attributes on the current span.
|
|
func SetSpanAttributes(ctx context.Context, attrs ...attribute.KeyValue) {
|
|
span := trace.SpanFromContext(ctx)
|
|
span.SetAttributes(attrs...)
|
|
}
|
|
|
|
// getEnv looks up the environment variable key and returns its value.
// A variable that is unset or set to the empty string yields defaultVal.
func getEnv(key, defaultVal string) string {
	value, found := os.LookupEnv(key)
	if !found || value == "" {
		return defaultVal
	}
	return value
}
|
|
|
|
// getEnvBool reads the environment variable key as a boolean.
//
// Surrounding whitespace is ignored, so values such as "true " or "1\n"
// (easily produced by env files and templated manifests) are read as intended
// rather than silently becoming false. A variable that is unset, empty, or
// only whitespace yields defaultVal. Otherwise the result is true for "true",
// "1" or "yes" (case-insensitive) and false for any other value.
func getEnvBool(key string, defaultVal bool) bool {
	v := strings.TrimSpace(os.Getenv(key))
	if v == "" {
		return defaultVal
	}

	switch strings.ToLower(v) {
	case "true", "1", "yes":
		return true
	default:
		return false
	}
}
|