nspawn: join network namespace before cloning user namespace

When both --private-users and --network-namespace-path are
specified, systemd-nspawn fails to start with "Operation not
permitted" during the setns() call.

This occurs because of the following execution sequence:
1. The outer child calls raw_clone() with CLONE_NEWUSER to create
   the new user namespace.
2. The inner child is spawned inside this new user namespace.
3. The inner child then attempts to call setns() to join the
   external network namespace.

Because the inner child is already running inside the new,
restricted user namespace, it no longer holds CAP_SYS_ADMIN in the
user namespace that owns the target network namespace (the host or
another more privileged user namespace), so the kernel rejects the
setns() call with EPERM.

Fix this by moving the setns() call to the outer child, executing it
just before the raw_clone() call. This ensures the network namespace
is joined while the process still has the necessary privileges, which
also aligns with the inner child's expectation that the network
namespace is already set up upon entry.
This commit is contained in:
Malformed C
2026-05-27 22:43:35 +03:00
committed by Yu Watanabe
parent 68af89e619
commit d35fa257eb
2 changed files with 22 additions and 3 deletions

View File

@@ -4449,6 +4449,13 @@ static int outer_child(
if (notify_fd < 0)
return notify_fd;
/* Join the external network namespace first, while we are still in the parent's
* user namespace and have CAP_SYS_ADMIN there. Once we clone with CLONE_NEWUSER,
* the child will be in a new user namespace, lacking the capabilities in the
* parent user namespace required to join its network namespace. */
if (arg_network_namespace_path && setns(netns_fd, CLONE_NEWNET) < 0)
return log_error_errno(errno, "Failed to join network namespace: %m");
pid_t pid = raw_clone(SIGCHLD|CLONE_NEWNS|
arg_clone_ns_flags |
(IN_SET(arg_userns_mode, USER_NAMESPACE_FIXED, USER_NAMESPACE_PICK) ? CLONE_NEWUSER : 0) |
@@ -4464,9 +4471,6 @@ static int outer_child(
/* The inner child has all namespaces that are requested, so that we all are owned by the
* user if user namespaces are turned on. */
if (arg_network_namespace_path && setns(netns_fd, CLONE_NEWNET) < 0)
return log_error_errno(errno, "Failed to join network namespace: %m");
if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
/* In managed usernamespace operation, sysfs + procfs are special, we'll have to
* mount them inside the inner namespaces, but before we switch root. Hence do so

View File

@@ -1185,6 +1185,21 @@ matrix_run_one() {
ip a | grep -v -E '^1: lo.*UP'
ip netns del nspawn_test
# test --network-namespace-path works when combined with --private-users=pick
ip netns add nspawn_test
ip netns exec nspawn_test ip link add foo type dummy
if [[ "$IS_USERNS_SUPPORTED" == "yes" && "$api_vfs_writable" == "no" ]]; then
SYSTEMD_NSPAWN_USE_CGNS="$use_cgns" SYSTEMD_NSPAWN_API_VFS_WRITABLE="$api_vfs_writable" \
systemd-nspawn --register=no \
--directory="$root" \
--private-users=pick \
--network-namespace-path=/run/netns/nspawn_test \
ip link show dev foo
fi
ip netns del nspawn_test
rm -fr "$root"
return 0