diff --git a/cmd/buildkitd/main.go b/cmd/buildkitd/main.go index a73c743cc..4c5980ec2 100644 --- a/cmd/buildkitd/main.go +++ b/cmd/buildkitd/main.go @@ -70,7 +70,9 @@ import ( "github.com/sirupsen/logrus" "github.com/urfave/cli" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/prometheus" + "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" sdkmetric "go.opentelemetry.io/otel/sdk/metric" sdktrace "go.opentelemetry.io/otel/sdk/trace" @@ -400,7 +402,7 @@ func main() { defer db.Close() } - controller, err := newController(ctx, c, &cfg) + controller, err := newController(ctx, c, &cfg, mp) if err != nil { return err } @@ -827,7 +829,7 @@ func serverCredentials(cfg config.TLSConfig) (*tls.Config, error) { return tlsConf, nil } -func newController(ctx context.Context, c *cli.Context, cfg *config.Config) (*control.Controller, error) { +func newController(ctx context.Context, c *cli.Context, cfg *config.Config, mp metric.MeterProvider) (*control.Controller, error) { sessionManager, err := session.NewManager() if err != nil { return nil, err @@ -930,6 +932,7 @@ func newController(ctx context.Context, c *cli.Context, cfg *config.Config) (*co CacheManager: solver.NewCacheManager(context.TODO(), "local", cacheStorage, worker.NewCacheResultStorage(wc)), Entitlements: cfg.Entitlements, TraceCollector: tc, + MeterProvider: mp, HistoryDB: historyDB, CacheStore: cacheStorage, LeaseManager: w.LeaseManager(), @@ -1106,6 +1109,7 @@ func newTracerProvider(ctx context.Context) (*sdktrace.TracerProvider, error) { func newMeterProvider(ctx context.Context) (*sdkmetric.MeterProvider, error) { opts := []sdkmetric.Option{ sdkmetric.WithResource(detect.Resource()), + sdkmetric.WithView(buildDurationView()), } if r, err := prometheus.New(); err != nil { @@ -1126,6 +1130,32 @@ func newMeterProvider(ctx context.Context) (*sdkmetric.MeterProvider, error) { return sdkmetric.NewMeterProvider(opts...), nil } +// buildDurationView routes the buildkit.build.duration histogram to a +// Base2 exponential aggregation so that the OTEL Prometheus exporter +// renders it as a Prometheus native histogram. Native histograms avoid +// the "tens of millions of series" cardinality blow-up reported in +// moby/buildkit#5777 by storing observations in dynamically-sized +// exponential buckets rather than a fixed bucket schedule. +// +// The AttributeFilter is defense in depth: only the status attribute +// is recorded on this histogram, regardless of what the call site +// supplies. +func buildDurationView() sdkmetric.View { + return sdkmetric.NewView( + sdkmetric.Instrument{ + Name: "buildkit.build.duration", + Kind: sdkmetric.InstrumentKindHistogram, + }, + sdkmetric.Stream{ + Aggregation: sdkmetric.AggregationBase2ExponentialHistogram{ + MaxSize: 160, + MaxScale: 20, + }, + AttributeFilter: attribute.NewAllowKeysFilter("status"), + }, + ) +} + func getCDIManager(cfg config.CDIConfig) (*cdidevices.Manager, error) { if cfg.Disabled != nil && *cfg.Disabled { return nil, nil diff --git a/control/control.go b/control/control.go index e4704fb39..5afb62a49 100644 --- a/control/control.go +++ b/control/control.go @@ -51,6 +51,7 @@ import ( "github.com/moby/buildkit/worker" digest "github.com/opencontainers/go-digest" "github.com/pkg/errors" + "go.opentelemetry.io/otel/metric" sdktrace "go.opentelemetry.io/otel/sdk/trace" tracev1 "go.opentelemetry.io/proto/otlp/collector/trace/v1" "golang.org/x/sync/errgroup" @@ -70,6 +71,7 @@ type Opt struct { ResolveCacheImporterFuncs map[string]remotecache.ResolveCacheImporterFunc Entitlements []string TraceCollector sdktrace.SpanExporter + MeterProvider metric.MeterProvider HistoryDB db.DB CacheStore *bboltcachestorage.Store LeaseManager *leaseutil.Manager @@ -118,6 +120,7 @@ func NewController(opt Opt) (*Controller, error) { Entitlements: opt.Entitlements, HistoryQueue: hq, ProvenanceEnv: opt.ProvenanceEnv, + MeterProvider: opt.MeterProvider, }) if err != nil { return nil, errors.Wrap(err, "failed to create solver") diff --git a/go.mod b/go.mod index e6c2524a6..ff6b1b402 100644 --- a/go.mod +++ b/go.mod @@ -99,6 +99,7 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 go.opentelemetry.io/otel/exporters/prometheus v0.60.0 + go.opentelemetry.io/otel/metric v1.43.0 go.opentelemetry.io/otel/sdk v1.43.0 go.opentelemetry.io/otel/sdk/metric v1.43.0 go.opentelemetry.io/otel/trace v1.43.0 @@ -221,7 +222,6 @@ require ( github.com/vishvananda/netns v0.0.5 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/otel/metric v1.43.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/term v0.42.0 // indirect diff --git a/solver/llbsolver/history.go b/solver/llbsolver/history.go index 4acfccf9c..330eb6e24 100644 --- a/solver/llbsolver/history.go +++ b/solver/llbsolver/history.go @@ -262,6 +262,12 @@ func (s *Solver) recordBuildHistory(ctx context.Context, id string, req frontend ready, done := s.history.AcquireFinalizer(rec.Ref) + // Emit build observability metrics before the history record is + // finalized. rec is fully populated at this point — Frontend, + // Error, NumCompletedSteps, CreatedAt, and CompletedAt are set + // regardless of whether the build succeeded or failed. + s.metrics.recordBuildCompletion(ctx, rec) + if err1 := s.history.Update(ctx, &controlapi.BuildHistoryEvent{ Type: controlapi.BuildHistoryEventType_COMPLETE, Record: rec, diff --git a/solver/llbsolver/metrics.go b/solver/llbsolver/metrics.go new file mode 100644 index 000000000..e4bddb421 --- /dev/null +++ b/solver/llbsolver/metrics.go @@ -0,0 +1,132 @@ +package llbsolver + +import ( + "context" + + controlapi "github.com/moby/buildkit/api/services/control" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" + "google.golang.org/grpc/codes" +) + +// instrumentationName is the OTEL instrumentation scope used for every +// build observability instrument exported by the solver. +const instrumentationName = "github.com/moby/buildkit/solver/llbsolver" + +// Attribute keys for build observability metrics. +const ( + statusKey = attribute.Key("status") + errorCodeKey = attribute.Key("error_code") + kindKey = attribute.Key("kind") +) + +// Attribute values for the keys above. +const ( + statusSuccess = "success" + statusFailure = "failure" + + stepKindCompleted = "completed" + stepKindCached = "cached" + stepKindTotal = "total" + stepKindWarnings = "warnings" +) + +// buildMetrics holds the OTEL instruments the solver writes to once per +// solve completion. The struct is constructed once in New and shared by +// every Solve via the *Solver receiver — instruments are concurrency-safe +// per the OTEL metric API contract. +type buildMetrics struct { + builds metric.Int64Counter + steps metric.Int64Counter + duration metric.Float64Histogram +} + +// newBuildMetrics registers the build-completion instruments against mp. +// A nil mp is treated as a request to disable metrics: a no-op provider +// is used so the rest of the solver does not need to special-case the +// "metrics disabled" path. Returning an error here causes daemon startup +// to fail fast, matching the buildkit convention for required wiring. +func newBuildMetrics(mp metric.MeterProvider) (*buildMetrics, error) { + if mp == nil { + mp = noop.NewMeterProvider() + } + meter := mp.Meter(instrumentationName) + + builds, err := meter.Int64Counter( + "buildkit.builds", + metric.WithDescription("Number of builds completed, labeled by frontend, status, and (on failure) gRPC error code."), + ) + if err != nil { + return nil, err + } + + steps, err := meter.Int64Counter( + "buildkit.builds.steps", + metric.WithDescription("Cumulative count of build steps observed, partitioned by kind (completed, cached, total, warnings)."), + ) + if err != nil { + return nil, err + } + + duration, err := meter.Float64Histogram( + "buildkit.build.duration", + metric.WithDescription("Wall-clock duration of build solves from CreatedAt to CompletedAt."), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + + return &buildMetrics{ + builds: builds, + steps: steps, + duration: duration, + }, nil +} + +// recordBuildCompletion observes one finished build onto every relevant +// instrument. It is invoked from the recordBuildHistory finalizer, after +// rec is fully populated and immediately before the COMPLETE history +// event is sent. +// +// Labels are deliberately bounded to keep cardinality finite: +// - status: "success" or "failure". +// - error_code: the gRPC codes.Code string of rec.Error, attached only +// on failure. rec.Error.Message is intentionally not used as a label +// to avoid the cardinality blow-up reported in moby/buildkit#5777. +// +// rec.Frontend is intentionally NOT used as a label: the modern +// gateway-client path used by buildctl and buildx clears Frontend on +// the wire (see client/build.go), so the field is empty for nearly all +// real-world callers and the label would have cardinality 1. A +// follow-up can re-introduce a frontend signal if and when buildkit +// itself starts populating rec.Frontend on the gateway path. +// +// A nil receiver or nil rec is a no-op so callers do not have to guard +// the call site. +func (m *buildMetrics) recordBuildCompletion(ctx context.Context, rec *controlapi.BuildHistoryRecord) { + if m == nil || rec == nil { + return + } + + status := statusSuccess + attrs := []attribute.KeyValue{statusKey.String(status)} + if rec.Error != nil { + status = statusFailure + attrs[0] = statusKey.String(status) + attrs = append(attrs, errorCodeKey.String(codes.Code(rec.Error.Code).String())) + } + + m.builds.Add(ctx, 1, metric.WithAttributes(attrs...)) + + m.steps.Add(ctx, int64(rec.NumCompletedSteps), metric.WithAttributes(kindKey.String(stepKindCompleted))) + m.steps.Add(ctx, int64(rec.NumCachedSteps), metric.WithAttributes(kindKey.String(stepKindCached))) + m.steps.Add(ctx, int64(rec.NumTotalSteps), metric.WithAttributes(kindKey.String(stepKindTotal))) + m.steps.Add(ctx, int64(rec.NumWarnings), metric.WithAttributes(kindKey.String(stepKindWarnings))) + + if rec.CreatedAt != nil && rec.CompletedAt != nil { + seconds := rec.CompletedAt.AsTime().Sub(rec.CreatedAt.AsTime()).Seconds() + m.duration.Record(ctx, seconds, metric.WithAttributes(statusKey.String(status))) + } +} diff --git a/solver/llbsolver/metrics_test.go b/solver/llbsolver/metrics_test.go new file mode 100644 index 000000000..adf6f1e5c --- /dev/null +++ b/solver/llbsolver/metrics_test.go @@ -0,0 +1,189 @@ +package llbsolver + +import ( + "context" + "testing" + "time" + + controlapi "github.com/moby/buildkit/api/services/control" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/attribute" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" + rpcstatus "google.golang.org/genproto/googleapis/rpc/status" + "google.golang.org/grpc/codes" + "google.golang.org/protobuf/types/known/timestamppb" +) + +// newTestMetrics returns a buildMetrics wired to a ManualReader so that +// tests can collect and inspect emitted observations synchronously. +func newTestMetrics(t *testing.T) (*buildMetrics, *sdkmetric.ManualReader) { + t.Helper() + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + bm, err := newBuildMetrics(mp) + require.NoError(t, err) + return bm, reader +} + +// collect collects all metrics from the reader into a flat map keyed by +// instrument name. Each value is the metricdata.Aggregation (Sum, +// Histogram, etc.) for that instrument. +func collect(t *testing.T, reader *sdkmetric.ManualReader) map[string]metricdata.Aggregation { + t.Helper() + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(context.Background(), &rm)) + out := map[string]metricdata.Aggregation{} + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + out[m.Name] = m.Data + } + } + return out +} + +// findCounterPoint returns the int64 sum data point whose attributes +// contain every (key, value) pair in want. Fails the test if no +// matching point is found. +func findCounterPoint(t *testing.T, agg metricdata.Aggregation, want map[string]string) metricdata.DataPoint[int64] { + t.Helper() + sum, ok := agg.(metricdata.Sum[int64]) + require.True(t, ok, "expected Sum[int64], got %T", agg) + for _, dp := range sum.DataPoints { + if attrsContain(dp.Attributes, want) { + return dp + } + } + t.Fatalf("no data point with attrs %v found among %d points", want, len(sum.DataPoints)) + return metricdata.DataPoint[int64]{} +} + +// attrsContain reports whether every (key, value) pair in want is +// present in set. +func attrsContain(set attribute.Set, want map[string]string) bool { + for k, v := range want { + got, ok := set.Value(attribute.Key(k)) + if !ok || got.AsString() != v { + return false + } + } + return true +} + +func TestRecordBuildCompletion_Success(t *testing.T) { + bm, reader := newTestMetrics(t) + + start := time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC) + rec := &controlapi.BuildHistoryRecord{ + CreatedAt: timestamppb.New(start), + CompletedAt: timestamppb.New(start.Add(7 * time.Second)), + NumCompletedSteps: 5, + NumCachedSteps: 2, + NumTotalSteps: 5, + NumWarnings: 1, + } + + bm.recordBuildCompletion(context.Background(), rec) + got := collect(t, reader) + + builds := findCounterPoint(t, got["buildkit.builds"], map[string]string{ + "status": "success", + }) + require.Equal(t, int64(1), builds.Value) + // On success, error_code must not be present at all. + if _, ok := builds.Attributes.Value("error_code"); ok { + t.Fatal("error_code attribute should be absent on success") + } + + hist, ok := got["buildkit.build.duration"].(metricdata.Histogram[float64]) + require.True(t, ok, "expected Histogram[float64] for build.duration, got %T", got["buildkit.build.duration"]) + require.Len(t, hist.DataPoints, 1) + require.Equal(t, uint64(1), hist.DataPoints[0].Count) + require.InDelta(t, 7.0, hist.DataPoints[0].Sum, 0.001) + require.True(t, attrsContain(hist.DataPoints[0].Attributes, map[string]string{ + "status": "success", + })) +} + +func TestRecordBuildCompletion_FailureGRPCCodes(t *testing.T) { + cases := []struct { + name string + code codes.Code + }{ + {"canceled", codes.Canceled}, + {"resource_exhausted", codes.ResourceExhausted}, + {"unknown", codes.Unknown}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + bm, reader := newTestMetrics(t) + start := time.Now() + rec := &controlapi.BuildHistoryRecord{ + CreatedAt: timestamppb.New(start), + CompletedAt: timestamppb.New(start.Add(time.Second)), + Error: &rpcstatus.Status{Code: int32(tc.code), Message: "ignored: must not appear as a label"}, + } + bm.recordBuildCompletion(context.Background(), rec) + + builds := findCounterPoint(t, collect(t, reader)["buildkit.builds"], map[string]string{ + "status": "failure", + "error_code": tc.code.String(), + }) + require.Equal(t, int64(1), builds.Value) + }) + } +} + +func TestRecordBuildCompletion_StepCounters(t *testing.T) { + bm, reader := newTestMetrics(t) + start := time.Now() + rec := &controlapi.BuildHistoryRecord{ + CreatedAt: timestamppb.New(start), + CompletedAt: timestamppb.New(start.Add(time.Second)), + NumCompletedSteps: 8, + NumCachedSteps: 3, + NumTotalSteps: 10, + NumWarnings: 2, + } + bm.recordBuildCompletion(context.Background(), rec) + + steps := collect(t, reader)["buildkit.builds.steps"] + for kind, want := range map[string]int64{ + "completed": 8, + "cached": 3, + "total": 10, + "warnings": 2, + } { + dp := findCounterPoint(t, steps, map[string]string{"kind": kind}) + require.Equal(t, want, dp.Value, "kind=%s", kind) + } +} + +func TestRecordBuildCompletion_NilSafe(t *testing.T) { + // Nil receiver and nil record must both be no-ops so that callers + // at solver/llbsolver/history.go do not have to guard the call. + var bm *buildMetrics + require.NotPanics(t, func() { + bm.recordBuildCompletion(context.Background(), &controlapi.BuildHistoryRecord{}) + }) + + bm, _ = newTestMetrics(t) + require.NotPanics(t, func() { + bm.recordBuildCompletion(context.Background(), nil) + }) +} + +func TestNewBuildMetrics_NilProviderUsesNoop(t *testing.T) { + // Passing a nil MeterProvider must succeed and produce a usable + // metrics struct. This keeps tests and alternative integrations + // from having to wire OTEL just to construct the solver. + bm, err := newBuildMetrics(nil) + require.NoError(t, err) + require.NotNil(t, bm) + require.NotPanics(t, func() { + bm.recordBuildCompletion(context.Background(), &controlapi.BuildHistoryRecord{ + CreatedAt: timestamppb.Now(), + CompletedAt: timestamppb.Now(), + }) + }) +} diff --git a/solver/llbsolver/solver.go b/solver/llbsolver/solver.go index 26e1edbda..a6c794632 100644 --- a/solver/llbsolver/solver.go +++ b/solver/llbsolver/solver.go @@ -30,6 +30,7 @@ import ( "github.com/moby/buildkit/worker" digest "github.com/opencontainers/go-digest" "github.com/pkg/errors" + "go.opentelemetry.io/otel/metric" "golang.org/x/sync/errgroup" ) @@ -60,6 +61,7 @@ type Opt struct { HistoryQueue *history.Queue ResourceMonitor *resources.Monitor ProvenanceEnv map[string]any + MeterProvider metric.MeterProvider } type Solver struct { @@ -76,6 +78,7 @@ type Solver struct { sysSampler *resources.Sampler[*resourcestypes.SysSample] provenanceEnv map[string]any provenanceStore *provenanceStore + metrics *buildMetrics } // Processor defines a processing function to be applied after solving, but @@ -95,6 +98,11 @@ func New(opt Opt) (*Solver, error) { } } + bm, err := newBuildMetrics(opt.MeterProvider) + if err != nil { + return nil, errors.Wrap(err, "failed to register build metrics") + } + s := &Solver{ workerController: opt.WorkerController, resolveWorker: defaultResolver(opt.WorkerController), @@ -107,6 +115,7 @@ func New(opt Opt) (*Solver, error) { history: opt.HistoryQueue, provenanceEnv: opt.ProvenanceEnv, provenanceStore: newProvenanceStore(), + metrics: bm, } sampler, err := resources.NewSysSampler()