From 9a1bf2a83befc310e9b18209209e9a58b5364aa5 Mon Sep 17 00:00:00 2001 From: Dan Duvall Date: Sat, 15 Nov 2025 23:18:21 -0800 Subject: [PATCH] dockerfile: run buildkitd within a cgroup namespace for cgroup v2 Introduce a new entrypoint script for the Linux image that, if cgroup v2 is in use, creates a new cgroup and mount namespace for buildkitd using `unshare` and remounts `/sys/fs/cgroup` to restrict its view of the unified cgroup hierarchy. This ensures that buildkitd's `init` cgroup and all cgroups managed by the OCI worker are kept beneath the root cgroup of the initial entrypoint process. When buildkitd is run in a managed environment like Kubernetes without its own cgroup namespace (the default behavior of privileged pods in Kubernetes where cgroup v2 is in use; see [cgroup v2 KEP][kep]), the OCI worker will spawn processes in cgroups that are outside of the cgroup hierarchy that was created for the buildkitd container, leading to incorrect resource accounting and enforcement, which in turn can cause OOM errors and CPU contention on the node. 
Example behavior without this change: ```console root@k8s-node:/# cat /proc/$(pgrep -n buildkitd)/cgroup 0::/init root@k8s-node:/# cat /proc/$(pgrep -n some-build-process)/cgroup 0::/buildkit/{runc-container-id} ``` Example behavior with this change: ```console root@k8s-node:/# cat /proc/$(pgrep -n buildkitd)/cgroup 0::/kubepods/burstable/pod{pod-id}/{container-id}/init root@k8s-node:/# cat /proc/$(pgrep -n some-build-process)/cgroup 0::/kubepods/burstable/pod{pod-id}/{container-id}/buildkit/{runc-container-id} ``` Note this was developed as an alternative approach to moby/buildkit#6343 [kep]: https://github.com/kubernetes/enhancements/tree/6d3210f7dd5d547c8f7f6a33af6a09eb45193cd7/keps/sig-node/2254-cgroup-v2#cgroup-namespace Signed-off-by: Dan Duvall --- Dockerfile | 6 ++++-- hack/buildkitd-entrypoint | 22 ++++++++++++++++++++++ hack/with-cgroupfs-remount | 8 ++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) create mode 100755 hack/buildkitd-entrypoint create mode 100755 hack/with-cgroupfs-remount diff --git a/Dockerfile b/Dockerfile index 9b73a5d3c..e85e5061b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -221,7 +221,7 @@ FROM scratch AS release COPY --link --from=releaser /out/ / FROM alpine:${ALPINE_VERSION} AS buildkit-export-alpine -RUN apk add --no-cache fuse3 git openssh openssl pigz xz iptables ip6tables \ +RUN apk add --no-cache fuse3 git openssh openssl pigz xz iptables ip6tables util-linux-misc \ && ln -s fusermount3 /usr/bin/fusermount COPY --link examples/buildctl-daemonless/buildctl-daemonless.sh /usr/bin/ VOLUME /var/lib/buildkit @@ -398,7 +398,9 @@ EOT FROM buildkit-export AS buildkit-linux COPY --link --from=binaries / /usr/bin/ ENV BUILDKIT_SETUP_CGROUPV2_ROOT=1 -ENTRYPOINT ["buildkitd"] +COPY --link hack/buildkitd-entrypoint /usr/bin/buildkitd-entrypoint +COPY --link hack/with-cgroupfs-remount /usr/bin/with-cgroupfs-remount +ENTRYPOINT ["/usr/bin/buildkitd-entrypoint"] FROM buildkit-linux AS buildkit-linux-debug COPY --link --from=dlv 
/out/dlv /usr/bin/dlv
diff --git a/hack/buildkitd-entrypoint b/hack/buildkitd-entrypoint
new file mode 100755
index 000000000..76d5f6d51
--- /dev/null
+++ b/hack/buildkitd-entrypoint
@@ -0,0 +1,26 @@
+#!/bin/sh
+#
+# For cgroup v2, ensure buildkitd has a namespaced view of /sys/fs/cgroup by
+# running in a new cgroup and mount namespace and remounting /sys/fs/cgroup.
+# Assume we are already in our own cgroup ns if the current cgroup path is
+# "/".
+#
+# Note this is a workaround for the lack of cgroupns control in the Kubernetes
+# API. If KEP-5714 is adopted, this can eventually be removed.
+#
+# See https://github.com/kubernetes/enhancements/issues/5714
+
+set -e
+
+# The presence of cgroup.controllers at the mount root indicates a cgroup v2
+# (unified) hierarchy.
+if [ -e /sys/fs/cgroup/cgroup.controllers ]; then
+  if [ "$(cut -d: -f3 /proc/self/cgroup)" != "/" ]; then
+    # Log to stderr so that stdout carries only buildkitd's own output
+    # (e.g. `buildkitd --version`).
+    echo "creating cgroup namespace" >&2
+    exec /usr/bin/unshare --cgroup --mount /usr/bin/with-cgroupfs-remount /usr/bin/buildkitd "$@"
+  fi
+fi
+
+exec /usr/bin/buildkitd "$@"
diff --git a/hack/with-cgroupfs-remount b/hack/with-cgroupfs-remount
new file mode 100755
index 000000000..262ee40d2
--- /dev/null
+++ b/hack/with-cgroupfs-remount
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# Remount /sys/fs/cgroup and then exec the given command. When run inside a
+# new cgroup and mount namespace, the new cgroup2 mount restricts the view of
+# the unified hierarchy to the cgroups beneath the namespace's root cgroup.
+#
+# Usage: with-cgroupfs-remount COMMAND [ARG...]
+
+set -e
+
+# Stacked mounts produce several /proc/self/mounts entries for the same mount
+# point; the last one is the visible (topmost) mount. Only reuse its options,
+# and only if it really is a cgroup2 mount.
+options="$(awk '$2 == "/sys/fs/cgroup" { type = $3; opts = $4 } END { if (type == "cgroup2") print opts }' /proc/self/mounts)"
+if [ -z "$options" ]; then
+  echo "with-cgroupfs-remount: no cgroup2 mount found at /sys/fs/cgroup" >&2
+  exit 1
+fi
+
+umount /sys/fs/cgroup
+mount -t cgroup2 -o "$options" cgroup2 /sys/fs/cgroup
+
+exec "$@"