libct: fix resetting CPU affinity

unix.CPUSet is limited to 1024 CPUs. Calling
unix.SchedSetaffinity(pid, cpuset) removes all CPUs starting from 1024
from the allowed CPUs of pid, even if cpuset is all ones. As a
consequence, when runc tries to reset CPU affinity to "allow all" by
default, it prevents all containers from using CPUs 1024 onwards.

This change uses a huge CPU mask to play it safe and get all possible
CPUs enabled with a single sched_setaffinity call.

Fixes: #5023

Signed-off-by: Antti Kervinen <antti.kervinen@intel.com>
This commit is contained in:
Antti Kervinen
2025-11-18 14:13:20 +02:00
committed by Kir Kolyshkin
parent ffe8b28ff4
commit 700c944c4d
2 changed files with 41 additions and 22 deletions

View File

@@ -2,6 +2,7 @@ package linux
import (
"os"
"unsafe"
"golang.org/x/sys/unix"
)
@@ -65,6 +66,22 @@ func Recvfrom(fd int, p []byte, flags int) (n int, from unix.Sockaddr, err error
return n, from, err
}
// SchedSetaffinity wraps sched_setaffinity syscall without unix.CPUSet size limitation.
func SchedSetaffinity(pid int, buf []byte) error {
err := retryOnEINTR(func() error {
_, _, errno := unix.Syscall(
unix.SYS_SCHED_SETAFFINITY,
uintptr(pid),
uintptr(len(buf)),
uintptr((unsafe.Pointer)(&buf[0])))
if errno != 0 {
return errno
}
return nil
})
return os.NewSyscallError("sched_setaffinity", err)
}
// Sendmsg wraps [unix.Sendmsg].
func Sendmsg(fd int, p, oob []byte, to unix.Sockaddr, flags int) error {
err := retryOnEINTR(func() error {

View File

@@ -1,6 +1,7 @@
package libcontainer
import (
"bytes"
"context"
"encoding/json"
"errors"
@@ -25,6 +26,7 @@ import (
"github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/fs2"
"github.com/opencontainers/runc/internal/linux"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/internal/userns"
@@ -178,33 +180,33 @@ type setnsProcess struct {
// tryResetCPUAffinity tries to reset the CPU affinity of the process
// identified by pid to include all possible CPUs (notwithstanding cgroup
// cpuset restrictions and isolated CPUs).
// cpuset restrictions, isolated CPUs and CPU online status).
func tryResetCPUAffinity(pid int) {
// When resetting the CPU affinity, we want to match the configured cgroup
// cpuset (or the default set of all CPUs, if no cpuset is configured)
// rather than some more restrictive affinity we were spawned in (such as
// one that may have been inherited from systemd). The cpuset cgroup used
// to reconfigure the cpumask automatically for joining processes, but
// kcommit da019032819a ("sched: Enforce user requested affinity") changed
// this behaviour in Linux 6.2.
// When resetting the CPU affinity, we want to allow all
// possible CPUs in the system, including those not in
// cpuset.cpus, online or even present (hot-plugged) at call
// time. Using a cpumask any tighter than that may disallow
// using those CPUs if they are added to cpuset.cpus later.
//
// Parsing cpuset.cpus.effective is quite inefficient (and looking at
// things like /proc/stat would be wrong for most nested containers), but
// luckily sched_setaffinity(2) will implicitly:
// Note that sched_setaffinity(2) will implicitly:
//
// * Clamp the cpumask so that it matches the number of CPUs
// supported by the kernel.
//
// * Clamp the cpumask so that it matches the current number of CPUs on
// the system.
// * Mask out any CPUs that are not a member of the target task's
// configured cgroup cpuset.
// configured cgroup cpuset. This only affects the task's effective
// affinity; the masked-out CPUs are not forgotten, should the cgroup
// cpuset change later.
//
// So we can just pass a very large array of set cpumask bits and the
// kernel will silently convert that to the correct value very cheaply.
var cpuset unix.CPUSet
cpuset.Fill() // set all bits
if err := unix.SchedSetaffinity(pid, &cpuset); err != nil {
logrus.WithError(
os.NewSyscallError("sched_setaffinity", err),
).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
// Therefore, when preparing the cpumask, we can avoid reading
// /sys/devices/system/cpu/possible and kernel_max.
// Instead, we use a huge buffer, similarly to the Go 1.25 runtime in
// getCPUCount().
const maxCPUs = 64 * 1024
buf := bytes.Repeat([]byte{0xff}, maxCPUs/8)
if err := linux.SchedSetaffinity(pid, buf); err != nil {
logrus.WithError(err).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
return
}
}