mirror of
https://github.com/systemd/systemd.git
synced 2026-06-30 19:57:29 +00:00
nspawn: join network namespace before cloning user namespace
When both --private-users and --network-namespace-path are specified, systemd-nspawn fails to start with "Operation not permitted" during the setns() call.

This occurs because of the following execution sequence:

1. The outer child calls raw_clone() with CLONE_NEWUSER to create the new user namespace.
2. The inner child is spawned inside this new user namespace.
3. The inner child then attempts to call setns() to join the external network namespace.

Because the inner child is already running inside the restricted user namespace, the kernel rejects its setns() call: the target network namespace is owned by a different (host/more privileged) user namespace, in which the inner child holds no capabilities.

Fix this by moving the setns() call to the outer child, executing it just before the raw_clone() call. This ensures the network namespace is joined while the process still has the necessary privileges, which also aligns with the inner child's expectation that the network namespace is already set up upon entry.
This commit is contained in:
@@ -4449,6 +4449,13 @@ static int outer_child(
|
||||
if (notify_fd < 0)
|
||||
return notify_fd;
|
||||
|
||||
/* Join the external network namespace first, while we are still in the parent's
|
||||
* user namespace and have CAP_SYS_ADMIN there. Once we clone with CLONE_NEWUSER,
|
||||
* the child will be in a new user namespace, lacking the capabilities in the
|
||||
* parent user namespace required to join its network namespace. */
|
||||
if (arg_network_namespace_path && setns(netns_fd, CLONE_NEWNET) < 0)
|
||||
return log_error_errno(errno, "Failed to join network namespace: %m");
|
||||
|
||||
pid_t pid = raw_clone(SIGCHLD|CLONE_NEWNS|
|
||||
arg_clone_ns_flags |
|
||||
(IN_SET(arg_userns_mode, USER_NAMESPACE_FIXED, USER_NAMESPACE_PICK) ? CLONE_NEWUSER : 0) |
|
||||
@@ -4464,9 +4471,6 @@ static int outer_child(
|
||||
/* The inner child has all namespaces that are requested, so that we all are owned by the
|
||||
* user if user namespaces are turned on. */
|
||||
|
||||
if (arg_network_namespace_path && setns(netns_fd, CLONE_NEWNET) < 0)
|
||||
return log_error_errno(errno, "Failed to join network namespace: %m");
|
||||
|
||||
if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
|
||||
/* In managed usernamespace operation, sysfs + procfs are special, we'll have to
|
||||
* mount them inside the inner namespaces, but before we switch root. Hence do so
|
||||
|
||||
@@ -1185,6 +1185,21 @@ matrix_run_one() {
|
||||
ip a | grep -v -E '^1: lo.*UP'
|
||||
ip netns del nspawn_test
|
||||
|
||||
# test --network-namespace-path works when combined with --private-users=pick
|
||||
ip netns add nspawn_test
|
||||
ip netns exec nspawn_test ip link add foo type dummy
|
||||
|
||||
if [[ "$IS_USERNS_SUPPORTED" == "yes" && "$api_vfs_writable" == "no" ]]; then
|
||||
SYSTEMD_NSPAWN_USE_CGNS="$use_cgns" SYSTEMD_NSPAWN_API_VFS_WRITABLE="$api_vfs_writable" \
|
||||
systemd-nspawn --register=no \
|
||||
--directory="$root" \
|
||||
--private-users=pick \
|
||||
--network-namespace-path=/run/netns/nspawn_test \
|
||||
ip link show dev foo
|
||||
fi
|
||||
|
||||
ip netns del nspawn_test
|
||||
|
||||
rm -fr "$root"
|
||||
|
||||
return 0
|
||||
|
||||
Reference in New Issue
Block a user