Merge pull request #6736 from asbarron/asbarron/buildkit-build-metrics

solver/llbsolver: emit native build-completion metrics
This commit is contained in:
Tõnis Tiigi
2026-06-02 22:30:14 -07:00
committed by GitHub
7 changed files with 372 additions and 3 deletions

View File

@@ -70,7 +70,9 @@ import (
"github.com/sirupsen/logrus"
"github.com/urfave/cli"
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/prometheus"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/propagation"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
@@ -400,7 +402,7 @@ func main() {
defer db.Close()
}
controller, err := newController(ctx, c, &cfg)
controller, err := newController(ctx, c, &cfg, mp)
if err != nil {
return err
}
@@ -827,7 +829,7 @@ func serverCredentials(cfg config.TLSConfig) (*tls.Config, error) {
return tlsConf, nil
}
func newController(ctx context.Context, c *cli.Context, cfg *config.Config) (*control.Controller, error) {
func newController(ctx context.Context, c *cli.Context, cfg *config.Config, mp metric.MeterProvider) (*control.Controller, error) {
sessionManager, err := session.NewManager()
if err != nil {
return nil, err
@@ -930,6 +932,7 @@ func newController(ctx context.Context, c *cli.Context, cfg *config.Config) (*co
CacheManager: solver.NewCacheManager(context.TODO(), "local", cacheStorage, worker.NewCacheResultStorage(wc)),
Entitlements: cfg.Entitlements,
TraceCollector: tc,
MeterProvider: mp,
HistoryDB: historyDB,
CacheStore: cacheStorage,
LeaseManager: w.LeaseManager(),
@@ -1106,6 +1109,7 @@ func newTracerProvider(ctx context.Context) (*sdktrace.TracerProvider, error) {
func newMeterProvider(ctx context.Context) (*sdkmetric.MeterProvider, error) {
opts := []sdkmetric.Option{
sdkmetric.WithResource(detect.Resource()),
sdkmetric.WithView(buildDurationView()),
}
if r, err := prometheus.New(); err != nil {
@@ -1126,6 +1130,32 @@ func newMeterProvider(ctx context.Context) (*sdkmetric.MeterProvider, error) {
return sdkmetric.NewMeterProvider(opts...), nil
}
// buildDurationView routes the buildkit.build.duration histogram to a
// Base2 exponential aggregation so that the OTEL Prometheus exporter
// renders it as a Prometheus native histogram. Native histograms avoid
// the "tens of millions of series" cardinality blow-up reported in
// moby/buildkit#5777 by storing observations in dynamically-sized
// exponential buckets rather than a fixed bucket schedule.
//
// The AttributeFilter is defense in depth: only the status attribute
// is recorded on this histogram, regardless of what the call site
// supplies.
func buildDurationView() sdkmetric.View {
return sdkmetric.NewView(
sdkmetric.Instrument{
Name: "buildkit.build.duration",
Kind: sdkmetric.InstrumentKindHistogram,
},
sdkmetric.Stream{
Aggregation: sdkmetric.AggregationBase2ExponentialHistogram{
MaxSize: 160,
MaxScale: 20,
},
AttributeFilter: attribute.NewAllowKeysFilter("status"),
},
)
}
func getCDIManager(cfg config.CDIConfig) (*cdidevices.Manager, error) {
if cfg.Disabled != nil && *cfg.Disabled {
return nil, nil

View File

@@ -51,6 +51,7 @@ import (
"github.com/moby/buildkit/worker"
digest "github.com/opencontainers/go-digest"
"github.com/pkg/errors"
"go.opentelemetry.io/otel/metric"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
tracev1 "go.opentelemetry.io/proto/otlp/collector/trace/v1"
"golang.org/x/sync/errgroup"
@@ -70,6 +71,7 @@ type Opt struct {
ResolveCacheImporterFuncs map[string]remotecache.ResolveCacheImporterFunc
Entitlements []string
TraceCollector sdktrace.SpanExporter
MeterProvider metric.MeterProvider
HistoryDB db.DB
CacheStore *bboltcachestorage.Store
LeaseManager *leaseutil.Manager
@@ -118,6 +120,7 @@ func NewController(opt Opt) (*Controller, error) {
Entitlements: opt.Entitlements,
HistoryQueue: hq,
ProvenanceEnv: opt.ProvenanceEnv,
MeterProvider: opt.MeterProvider,
})
if err != nil {
return nil, errors.Wrap(err, "failed to create solver")

2
go.mod
View File

@@ -99,6 +99,7 @@ require (
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0
go.opentelemetry.io/otel/exporters/prometheus v0.60.0
go.opentelemetry.io/otel/metric v1.43.0
go.opentelemetry.io/otel/sdk v1.43.0
go.opentelemetry.io/otel/sdk/metric v1.43.0
go.opentelemetry.io/otel/trace v1.43.0
@@ -221,7 +222,6 @@ require (
github.com/vishvananda/netns v0.0.5 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/otel/metric v1.43.0 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/term v0.42.0 // indirect

View File

@@ -262,6 +262,12 @@ func (s *Solver) recordBuildHistory(ctx context.Context, id string, req frontend
ready, done := s.history.AcquireFinalizer(rec.Ref)
// Emit build observability metrics before the history record is
// finalized. rec is fully populated at this point — Frontend,
// Error, NumCompletedSteps, CreatedAt, and CompletedAt are set
// regardless of whether the build succeeded or failed.
s.metrics.recordBuildCompletion(ctx, rec)
if err1 := s.history.Update(ctx, &controlapi.BuildHistoryEvent{
Type: controlapi.BuildHistoryEventType_COMPLETE,
Record: rec,

132
solver/llbsolver/metrics.go Normal file
View File

@@ -0,0 +1,132 @@
package llbsolver
import (
"context"
controlapi "github.com/moby/buildkit/api/services/control"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/metric/noop"
"google.golang.org/grpc/codes"
)
// instrumentationName is the OTEL instrumentation scope used for every
// build observability instrument exported by the solver.
const instrumentationName = "github.com/moby/buildkit/solver/llbsolver"
// Attribute keys for build observability metrics.
const (
statusKey = attribute.Key("status")
errorCodeKey = attribute.Key("error_code")
kindKey = attribute.Key("kind")
)
// Attribute values for the keys above.
const (
statusSuccess = "success"
statusFailure = "failure"
stepKindCompleted = "completed"
stepKindCached = "cached"
stepKindTotal = "total"
stepKindWarnings = "warnings"
)
// buildMetrics holds the OTEL instruments the solver writes to once per
// solve completion. The struct is constructed once in New and shared by
// every Solve via the *Solver receiver — instruments are concurrency-safe
// per the OTEL metric API contract.
type buildMetrics struct {
builds metric.Int64Counter
steps metric.Int64Counter
duration metric.Float64Histogram
}
// newBuildMetrics registers the build-completion instruments against mp.
// A nil mp is treated as a request to disable metrics: a no-op provider
// is used so the rest of the solver does not need to special-case the
// "metrics disabled" path. Returning an error here causes daemon startup
// to fail fast, matching the buildkit convention for required wiring.
func newBuildMetrics(mp metric.MeterProvider) (*buildMetrics, error) {
if mp == nil {
mp = noop.NewMeterProvider()
}
meter := mp.Meter(instrumentationName)
builds, err := meter.Int64Counter(
"buildkit.builds",
metric.WithDescription("Number of builds completed, labeled by frontend, status, and (on failure) gRPC error code."),
)
if err != nil {
return nil, err
}
steps, err := meter.Int64Counter(
"buildkit.builds.steps",
metric.WithDescription("Cumulative count of build steps observed, partitioned by kind (completed, cached, total, warnings)."),
)
if err != nil {
return nil, err
}
duration, err := meter.Float64Histogram(
"buildkit.build.duration",
metric.WithDescription("Wall-clock duration of build solves from CreatedAt to CompletedAt."),
metric.WithUnit("s"),
)
if err != nil {
return nil, err
}
return &buildMetrics{
builds: builds,
steps: steps,
duration: duration,
}, nil
}
// recordBuildCompletion observes one finished build onto every relevant
// instrument. It is invoked from the recordBuildHistory finalizer, after
// rec is fully populated and immediately before the COMPLETE history
// event is sent.
//
// Labels are deliberately bounded to keep cardinality finite:
// - status: "success" or "failure".
// - error_code: the gRPC codes.Code string of rec.Error, attached only
// on failure. rec.Error.Message is intentionally not used as a label
// to avoid the cardinality blow-up reported in moby/buildkit#5777.
//
// rec.Frontend is intentionally NOT used as a label: the modern
// gateway-client path used by buildctl and buildx clears Frontend on
// the wire (see client/build.go), so the field is empty for nearly all
// real-world callers and the label would have cardinality 1. A
// follow-up can re-introduce a frontend signal if and when buildkit
// itself starts populating rec.Frontend on the gateway path.
//
// A nil receiver or nil rec is a no-op so callers do not have to guard
// the call site.
func (m *buildMetrics) recordBuildCompletion(ctx context.Context, rec *controlapi.BuildHistoryRecord) {
if m == nil || rec == nil {
return
}
status := statusSuccess
attrs := []attribute.KeyValue{statusKey.String(status)}
if rec.Error != nil {
status = statusFailure
attrs[0] = statusKey.String(status)
attrs = append(attrs, errorCodeKey.String(codes.Code(rec.Error.Code).String()))
}
m.builds.Add(ctx, 1, metric.WithAttributes(attrs...))
m.steps.Add(ctx, int64(rec.NumCompletedSteps), metric.WithAttributes(kindKey.String(stepKindCompleted)))
m.steps.Add(ctx, int64(rec.NumCachedSteps), metric.WithAttributes(kindKey.String(stepKindCached)))
m.steps.Add(ctx, int64(rec.NumTotalSteps), metric.WithAttributes(kindKey.String(stepKindTotal)))
m.steps.Add(ctx, int64(rec.NumWarnings), metric.WithAttributes(kindKey.String(stepKindWarnings)))
if rec.CreatedAt != nil && rec.CompletedAt != nil {
seconds := rec.CompletedAt.AsTime().Sub(rec.CreatedAt.AsTime()).Seconds()
m.duration.Record(ctx, seconds, metric.WithAttributes(statusKey.String(status)))
}
}

View File

@@ -0,0 +1,189 @@
package llbsolver
import (
"context"
"testing"
"time"
controlapi "github.com/moby/buildkit/api/services/control"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel/attribute"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/metric/metricdata"
rpcstatus "google.golang.org/genproto/googleapis/rpc/status"
"google.golang.org/grpc/codes"
"google.golang.org/protobuf/types/known/timestamppb"
)
// newTestMetrics returns a buildMetrics wired to a ManualReader so that
// tests can collect and inspect emitted observations synchronously.
func newTestMetrics(t *testing.T) (*buildMetrics, *sdkmetric.ManualReader) {
t.Helper()
reader := sdkmetric.NewManualReader()
mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
bm, err := newBuildMetrics(mp)
require.NoError(t, err)
return bm, reader
}
// collect collects all metrics from the reader into a flat map keyed by
// instrument name. Each value is the metricdata.Aggregation (Sum,
// Histogram, etc.) for that instrument.
func collect(t *testing.T, reader *sdkmetric.ManualReader) map[string]metricdata.Aggregation {
t.Helper()
var rm metricdata.ResourceMetrics
require.NoError(t, reader.Collect(context.Background(), &rm))
out := map[string]metricdata.Aggregation{}
for _, sm := range rm.ScopeMetrics {
for _, m := range sm.Metrics {
out[m.Name] = m.Data
}
}
return out
}
// findCounterPoint returns the int64 sum data point whose attributes
// contain every (key, value) pair in want. Fails the test if no
// matching point is found.
func findCounterPoint(t *testing.T, agg metricdata.Aggregation, want map[string]string) metricdata.DataPoint[int64] {
t.Helper()
sum, ok := agg.(metricdata.Sum[int64])
require.True(t, ok, "expected Sum[int64], got %T", agg)
for _, dp := range sum.DataPoints {
if attrsContain(dp.Attributes, want) {
return dp
}
}
t.Fatalf("no data point with attrs %v found among %d points", want, len(sum.DataPoints))
return metricdata.DataPoint[int64]{}
}
// attrsContain reports whether every (key, value) pair in want is
// present in set.
func attrsContain(set attribute.Set, want map[string]string) bool {
for k, v := range want {
got, ok := set.Value(attribute.Key(k))
if !ok || got.AsString() != v {
return false
}
}
return true
}
func TestRecordBuildCompletion_Success(t *testing.T) {
bm, reader := newTestMetrics(t)
start := time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC)
rec := &controlapi.BuildHistoryRecord{
CreatedAt: timestamppb.New(start),
CompletedAt: timestamppb.New(start.Add(7 * time.Second)),
NumCompletedSteps: 5,
NumCachedSteps: 2,
NumTotalSteps: 5,
NumWarnings: 1,
}
bm.recordBuildCompletion(context.Background(), rec)
got := collect(t, reader)
builds := findCounterPoint(t, got["buildkit.builds"], map[string]string{
"status": "success",
})
require.Equal(t, int64(1), builds.Value)
// On success, error_code must not be present at all.
if _, ok := builds.Attributes.Value("error_code"); ok {
t.Fatal("error_code attribute should be absent on success")
}
hist, ok := got["buildkit.build.duration"].(metricdata.Histogram[float64])
require.True(t, ok, "expected Histogram[float64] for build.duration, got %T", got["buildkit.build.duration"])
require.Len(t, hist.DataPoints, 1)
require.Equal(t, uint64(1), hist.DataPoints[0].Count)
require.InDelta(t, 7.0, hist.DataPoints[0].Sum, 0.001)
require.True(t, attrsContain(hist.DataPoints[0].Attributes, map[string]string{
"status": "success",
}))
}
func TestRecordBuildCompletion_FailureGRPCCodes(t *testing.T) {
cases := []struct {
name string
code codes.Code
}{
{"canceled", codes.Canceled},
{"resource_exhausted", codes.ResourceExhausted},
{"unknown", codes.Unknown},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
bm, reader := newTestMetrics(t)
start := time.Now()
rec := &controlapi.BuildHistoryRecord{
CreatedAt: timestamppb.New(start),
CompletedAt: timestamppb.New(start.Add(time.Second)),
Error: &rpcstatus.Status{Code: int32(tc.code), Message: "ignored: must not appear as a label"},
}
bm.recordBuildCompletion(context.Background(), rec)
builds := findCounterPoint(t, collect(t, reader)["buildkit.builds"], map[string]string{
"status": "failure",
"error_code": tc.code.String(),
})
require.Equal(t, int64(1), builds.Value)
})
}
}
func TestRecordBuildCompletion_StepCounters(t *testing.T) {
bm, reader := newTestMetrics(t)
start := time.Now()
rec := &controlapi.BuildHistoryRecord{
CreatedAt: timestamppb.New(start),
CompletedAt: timestamppb.New(start.Add(time.Second)),
NumCompletedSteps: 8,
NumCachedSteps: 3,
NumTotalSteps: 10,
NumWarnings: 2,
}
bm.recordBuildCompletion(context.Background(), rec)
steps := collect(t, reader)["buildkit.builds.steps"]
for kind, want := range map[string]int64{
"completed": 8,
"cached": 3,
"total": 10,
"warnings": 2,
} {
dp := findCounterPoint(t, steps, map[string]string{"kind": kind})
require.Equal(t, want, dp.Value, "kind=%s", kind)
}
}
func TestRecordBuildCompletion_NilSafe(t *testing.T) {
// Nil receiver and nil record must both be no-ops so that callers
// at solver/llbsolver/history.go do not have to guard the call.
var bm *buildMetrics
require.NotPanics(t, func() {
bm.recordBuildCompletion(context.Background(), &controlapi.BuildHistoryRecord{})
})
bm, _ = newTestMetrics(t)
require.NotPanics(t, func() {
bm.recordBuildCompletion(context.Background(), nil)
})
}
func TestNewBuildMetrics_NilProviderUsesNoop(t *testing.T) {
// Passing a nil MeterProvider must succeed and produce a usable
// metrics struct. This keeps tests and alternative integrations
// from having to wire OTEL just to construct the solver.
bm, err := newBuildMetrics(nil)
require.NoError(t, err)
require.NotNil(t, bm)
require.NotPanics(t, func() {
bm.recordBuildCompletion(context.Background(), &controlapi.BuildHistoryRecord{
CreatedAt: timestamppb.Now(),
CompletedAt: timestamppb.Now(),
})
})
}

View File

@@ -30,6 +30,7 @@ import (
"github.com/moby/buildkit/worker"
digest "github.com/opencontainers/go-digest"
"github.com/pkg/errors"
"go.opentelemetry.io/otel/metric"
"golang.org/x/sync/errgroup"
)
@@ -60,6 +61,7 @@ type Opt struct {
HistoryQueue *history.Queue
ResourceMonitor *resources.Monitor
ProvenanceEnv map[string]any
MeterProvider metric.MeterProvider
}
type Solver struct {
@@ -76,6 +78,7 @@ type Solver struct {
sysSampler *resources.Sampler[*resourcestypes.SysSample]
provenanceEnv map[string]any
provenanceStore *provenanceStore
metrics *buildMetrics
}
// Processor defines a processing function to be applied after solving, but
@@ -95,6 +98,11 @@ func New(opt Opt) (*Solver, error) {
}
}
bm, err := newBuildMetrics(opt.MeterProvider)
if err != nil {
return nil, errors.Wrap(err, "failed to register build metrics")
}
s := &Solver{
workerController: opt.WorkerController,
resolveWorker: defaultResolver(opt.WorkerController),
@@ -107,6 +115,7 @@ func New(opt Opt) (*Solver, error) {
history: opt.HistoryQueue,
provenanceEnv: opt.ProvenanceEnv,
provenanceStore: newProvenanceStore(),
metrics: bm,
}
sampler, err := resources.NewSysSampler()