mirror of
https://github.com/opencontainers/runc.git
synced 2026-06-24 08:48:44 +00:00
libct: fix resetting CPU affinity
unix.CPUSet is limited to 1024 CPUs. Calling unix.SchedSetaffinity(pid, cpuset) removes all CPUs starting from 1024 from the allowed CPUs of pid, even if cpuset is all ones. As a consequence, when runc tries to reset CPU affinity to "allow all" by default, it prevents all containers from using CPUs 1024 onwards. This change uses a huge CPU mask to play it safe and get all possible CPUs enabled with a single sched_setaffinity call. Fixes: #5023 Signed-off-by: Antti Kervinen <antti.kervinen@intel.com>
This commit is contained in:
committed by
Kir Kolyshkin
parent
ffe8b28ff4
commit
700c944c4d
@@ -2,6 +2,7 @@ package linux
|
||||
|
||||
import (
|
||||
"os"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
@@ -65,6 +66,22 @@ func Recvfrom(fd int, p []byte, flags int) (n int, from unix.Sockaddr, err error
|
||||
return n, from, err
|
||||
}
|
||||
|
||||
// SchedSetaffinity wraps sched_setaffinity syscall without unix.CPUSet size limitation.
|
||||
func SchedSetaffinity(pid int, buf []byte) error {
|
||||
err := retryOnEINTR(func() error {
|
||||
_, _, errno := unix.Syscall(
|
||||
unix.SYS_SCHED_SETAFFINITY,
|
||||
uintptr(pid),
|
||||
uintptr(len(buf)),
|
||||
uintptr((unsafe.Pointer)(&buf[0])))
|
||||
if errno != 0 {
|
||||
return errno
|
||||
}
|
||||
return nil
|
||||
})
|
||||
return os.NewSyscallError("sched_setaffinity", err)
|
||||
}
|
||||
|
||||
// Sendmsg wraps [unix.Sendmsg].
|
||||
func Sendmsg(fd int, p, oob []byte, to unix.Sockaddr, flags int) error {
|
||||
err := retryOnEINTR(func() error {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package libcontainer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
@@ -25,6 +26,7 @@ import (
|
||||
|
||||
"github.com/opencontainers/cgroups"
|
||||
"github.com/opencontainers/cgroups/fs2"
|
||||
"github.com/opencontainers/runc/internal/linux"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
||||
"github.com/opencontainers/runc/libcontainer/internal/userns"
|
||||
@@ -178,33 +180,33 @@ type setnsProcess struct {
|
||||
|
||||
// tryResetCPUAffinity tries to reset the CPU affinity of the process
|
||||
// identified by pid to include all possible CPUs (notwithstanding cgroup
|
||||
// cpuset restrictions and isolated CPUs).
|
||||
// cpuset restrictions, isolated CPUs and CPU online status).
|
||||
func tryResetCPUAffinity(pid int) {
|
||||
// When resetting the CPU affinity, we want to match the configured cgroup
|
||||
// cpuset (or the default set of all CPUs, if no cpuset is configured)
|
||||
// rather than some more restrictive affinity we were spawned in (such as
|
||||
// one that may have been inherited from systemd). The cpuset cgroup used
|
||||
// to reconfigure the cpumask automatically for joining processes, but
|
||||
// kcommit da019032819a ("sched: Enforce user requested affinity") changed
|
||||
// this behaviour in Linux 6.2.
|
||||
// When resetting the CPU affinity, we want to allow all
|
||||
// possible CPUs in the system, including those not in
|
||||
// cpuset.cpus, online or even present (hot-plugged) at call
|
||||
// time. Using a cpumask any tighter this that may disallow
|
||||
// using those CPUs if they are added to cpuset.cpus later.
|
||||
//
|
||||
// Parsing cpuset.cpus.effective is quite inefficient (and looking at
|
||||
// things like /proc/stat would be wrong for most nested containers), but
|
||||
// luckily sched_setaffinity(2) will implicitly:
|
||||
// Note that sched_setaffinity(2) will implicitly:
|
||||
//
|
||||
// * Clamp the cpumask so that it matches the number of CPUs
|
||||
// supported by the kernel.
|
||||
//
|
||||
// * Clamp the cpumask so that it matches the current number of CPUs on
|
||||
// the system.
|
||||
// * Mask out any CPUs that are not a member of the target task's
|
||||
// configured cgroup cpuset.
|
||||
// configured cgroup cpuset. This is for task's effective affinity,
|
||||
// without forgetting masked-out CPUs should the cgroup cpuset
|
||||
// change later.
|
||||
//
|
||||
// So we can just pass a very large array of set cpumask bits and the
|
||||
// kernel will silently convert that to the correct value very cheaply.
|
||||
var cpuset unix.CPUSet
|
||||
cpuset.Fill() // set all bits
|
||||
if err := unix.SchedSetaffinity(pid, &cpuset); err != nil {
|
||||
logrus.WithError(
|
||||
os.NewSyscallError("sched_setaffinity", err),
|
||||
).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
|
||||
// Therefore, preparing the cpumask, we can avoid reading
|
||||
// /sys/devices/system/cpu/possible and kernel_max.
|
||||
// Instead, we use a huge buffer similarly to go 1.25 runtime in
|
||||
// getCPUCount().
|
||||
const maxCPUs = 64 * 1024
|
||||
buf := bytes.Repeat([]byte{0xff}, maxCPUs/8)
|
||||
if err := linux.SchedSetaffinity(pid, buf); err != nil {
|
||||
logrus.WithError(err).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user