diff --git a/man/systemd-nsresourced.service.xml b/man/systemd-nsresourced.service.xml index 787312d858f..853fe09fbc9 100644 --- a/man/systemd-nsresourced.service.xml +++ b/man/systemd-nsresourced.service.xml @@ -52,6 +52,16 @@ registered with this service. Moreover, UIDs and GIDs are always allocated together, and symmetrically. + The allocation API supports delegated ranges: additional UID/GID ranges that + are mapped 1:1 into the user namespace rather than being translated to a target UID/GID. These delegated + ranges enable nested user namespace scenarios where a container needs to create child user namespaces + with their own transient UID ranges. Normally, the kernel restricts which UIDs can be mapped into a user + namespace to those that are also mapped in the parent. Delegated ranges solve this by pre-allocating + additional ranges that are visible inside the user namespace and can be used by nested + AllocateUserRange() calls. Up to 16 delegated ranges can be requested per user + namespace, each of size 65536. The ranges are allocated from the container UID ranges as per + Users, Groups, UIDs and GIDs on systemd Systems. + The service provides API calls to allowlist mounts (referenced via their mount file descriptors as per Linux fsmount() API), to pass ownership of a cgroup subtree to the user namespace and to delegate a virtual Ethernet device pair to the user namespace. 
When used in combination diff --git a/src/basic/uid-range.c b/src/basic/uid-range.c index 1aaf760468b..763c421e91e 100644 --- a/src/basic/uid-range.c +++ b/src/basic/uid-range.c @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include #include #include "alloc-util.h" @@ -115,7 +116,7 @@ int uid_range_add_internal(UIDRange **range, uid_t start, uid_t nr, bool coalesc return 0; } -int uid_range_add_str(UIDRange **range, const char *s) { +int uid_range_add_str_full(UIDRange **range, const char *s, bool coalesce) { uid_t start, end; int r; @@ -126,7 +127,7 @@ int uid_range_add_str(UIDRange **range, const char *s) { if (r < 0) return r; - return uid_range_add_internal(range, start, end - start + 1, /* coalesce= */ true); + return uid_range_add_internal(range, start, end - start + 1, coalesce); } int uid_range_next_lower(const UIDRange *range, uid_t *uid) { @@ -230,7 +231,7 @@ bool uid_range_is_empty(const UIDRange *range) { return true; } -int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **ret) { +int uid_range_load_userns_full(const char *path, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret) { _cleanup_(uid_range_freep) UIDRange *range = NULL; _cleanup_fclose_ FILE *f = NULL; int r; @@ -280,13 +281,14 @@ int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange ** return r; } - uid_range_coalesce(range); + if (coalesce) + uid_range_coalesce(range); *ret = TAKE_PTR(range); return 0; } -int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret) { +int uid_range_load_userns_by_fd_full(int userns_fd, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret) { _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL; int r; @@ -299,7 +301,7 @@ int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange if (r < 0) return r; if (r > 0) - return uid_range_load_userns(/* path= */ NULL, mode, ret); + return uid_range_load_userns_full(/* path= */ NULL, 
mode, coalesce, ret); r = userns_enter_and_pin(userns_fd, &pidref); if (r < 0) @@ -309,7 +311,7 @@ int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange pidref.pid, IN_SET(mode, UID_RANGE_USERNS_INSIDE, UID_RANGE_USERNS_OUTSIDE) ? "uid_map" : "gid_map"); - return uid_range_load_userns(p, mode, ret); + return uid_range_load_userns_full(p, mode, coalesce, ret); } bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr) { @@ -332,6 +334,204 @@ bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr) { return false; } +int uid_range_clip(UIDRange *range, uid_t min, uid_t max) { + assert(range); + + if (min > max) + return -EINVAL; + + size_t t = 0; + FOREACH_ARRAY(e, range->entries, range->n_entries) { + uid_t entry_end = e->start + e->nr; /* one past the last UID in entry */ + + /* Skip entries completely outside [min, max] */ + if (entry_end <= min || e->start > max) + continue; + + /* Trim the entry to fit within [min, max] */ + uid_t new_start = MAX(e->start, min); + /* entry_end is exclusive, avoid overflow when max == UINT32_MAX */ + uid_t new_end = entry_end <= max ? entry_end : max + 1; + assert(new_end > new_start); + + range->entries[t++] = (UIDRangeEntry) { + .start = new_start, + .nr = new_end - new_start, + }; + } + + range->n_entries = t; + + return 0; +} + +int uid_range_partition(UIDRange *range, uid_t size) { + assert(range); + assert(size > 0); + + /* Partitions the UID range entries into buckets of the given size. Any entry larger than the given + * size will be partitioned into multiple entries, each of the given size. Any leftover UIDs in the + * entry are dropped. Any entries smaller than the given size are also dropped. 
*/ + + /* Count how many entries we'll need after partitioning */ + size_t n_new_entries = 0; + FOREACH_ARRAY(e, range->entries, range->n_entries) + n_new_entries += e->nr / size; + + if (n_new_entries == 0) { + range->n_entries = 0; + return 0; + } + + if (n_new_entries > range->n_entries && !GREEDY_REALLOC(range->entries, n_new_entries)) + return -ENOMEM; + + /* Work backwards to avoid overwriting entries we still need to read */ + size_t t = n_new_entries; + for (size_t i = range->n_entries; i > 0; i--) { + UIDRangeEntry *e = range->entries + i - 1; + unsigned n_parts = e->nr / size; + + for (unsigned j = n_parts; j > 0; j--) + range->entries[--t] = (UIDRangeEntry) { + .start = e->start + (j - 1) * size, + .nr = size, + }; + } + + range->n_entries = n_new_entries; + + return 0; +} + +int uid_range_copy(const UIDRange *range, UIDRange **ret) { + assert(ret); + + if (!range) { + *ret = NULL; + return 0; + } + + _cleanup_(uid_range_freep) UIDRange *copy = new0(UIDRange, 1); + if (!copy) + return -ENOMEM; + + if (range->n_entries > 0) { + copy->entries = newdup(UIDRangeEntry, range->entries, range->n_entries); + if (!copy->entries) + return -ENOMEM; + + copy->n_entries = range->n_entries; + } + + *ret = TAKE_PTR(copy); + return 0; +} + +int uid_range_remove(UIDRange *range, uid_t start, uid_t size) { + assert(range); + + if (size == 0) + return 0; + + uid_t end = start + size; /* one past the last UID to remove */ + + for (size_t i = 0; i < range->n_entries; i++) { + UIDRangeEntry *e = range->entries + i; + uid_t entry_end = e->start + e->nr; + + /* No overlap */ + if (entry_end <= start || e->start >= end) + continue; + + /* Check if this removal splits the entry into two parts */ + if (e->start < start && entry_end > end) { + /* Need to split: grow the array first */ + if (!GREEDY_REALLOC(range->entries, range->n_entries + 1)) + return -ENOMEM; + + /* Re-fetch pointer after potential realloc */ + e = range->entries + i; + entry_end = e->start + e->nr; + + /* Shift 
everything after this entry to make room */ + memmove(range->entries + i + 2, range->entries + i + 1, + (range->n_entries - i - 1) * sizeof(UIDRangeEntry)); + range->n_entries++; + + /* First part: before the removed range */ + range->entries[i] = (UIDRangeEntry) { + .start = e->start, + .nr = start - e->start, + }; + + /* Second part: after the removed range */ + range->entries[i + 1] = (UIDRangeEntry) { + .start = end, + .nr = entry_end - end, + }; + + /* Skip the newly inserted entry */ + i++; + continue; + } + + /* Removal covers the entire entry */ + if (start <= e->start && end >= entry_end) { + memmove(e, e + 1, (range->n_entries - i - 1) * sizeof(UIDRangeEntry)); + range->n_entries--; + i--; + continue; + } + + /* Removal trims the start of the entry */ + if (start <= e->start && end > e->start) { + e->nr = entry_end - end; + e->start = end; + continue; + } + + /* Removal trims the end of the entry */ + if (start < entry_end && end >= entry_end) { + e->nr = start - e->start; + continue; + } + } + + return 0; +} + +int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t uid, uid_t *ret) { + assert(uid_range_entries(outside) == uid_range_entries(inside)); + assert(ret); + + /* Given two UID ranges that represent the outside UID range of a user namespace (the 2nd and 3rd + * columns in /proc/xxx/uid_map) and the inside UID range of a user namespace (the 1st and 3rd + * columns in /proc/xxx/uid_map), translates the given UID from the outside range to the inside + * range. For example, given the following UID range: + * + * 0 1000 1 + * + * calling uid_range_translate(outside, inside, 1000) will return 0 as the output UID. Alternatively, + * calling uid_range_translate(inside, outside, 0) will return 1000 as the output UID. 
+ */ + + for (size_t i = 0; i < uid_range_entries(outside); i++) + assert(outside->entries[i].nr == inside->entries[i].nr); + + for (size_t i = 0; i < uid_range_entries(outside); i++) { + const UIDRangeEntry *e = outside->entries + i; + + if (uid < e->start || uid >= e->start + e->nr) + continue; + + *ret = inside->entries[i].start + uid - e->start; + return 0; + } + + return -ESRCH; +} + bool uid_range_equal(const UIDRange *a, const UIDRange *b) { if (a == b) return true; diff --git a/src/basic/uid-range.h b/src/basic/uid-range.h index c28b02fa7d1..a15a2a8e4f9 100644 --- a/src/basic/uid-range.h +++ b/src/basic/uid-range.h @@ -19,7 +19,10 @@ int uid_range_add_internal(UIDRange **range, uid_t start, uid_t nr, bool coalesc static inline int uid_range_add(UIDRange **range, uid_t start, uid_t nr) { return uid_range_add_internal(range, start, nr, true); } -int uid_range_add_str(UIDRange **range, const char *s); +int uid_range_add_str_full(UIDRange **range, const char *s, bool coalesce); +static inline int uid_range_add_str(UIDRange **range, const char *s) { + return uid_range_add_str_full(range, s, true); +} int uid_range_next_lower(const UIDRange *range, uid_t *uid); @@ -48,11 +51,23 @@ typedef enum UIDRangeUsernsMode { _UID_RANGE_USERNS_MODE_INVALID = -EINVAL, } UIDRangeUsernsMode; -int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **ret); -int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret); +int uid_range_load_userns_full(const char *path, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret); +static inline int uid_range_load_userns(const char *path, UIDRangeUsernsMode mode, UIDRange **ret) { + return uid_range_load_userns_full(path, mode, true, ret); +} +int uid_range_load_userns_by_fd_full(int userns_fd, UIDRangeUsernsMode mode, bool coalesce, UIDRange **ret); +static inline int uid_range_load_userns_by_fd(int userns_fd, UIDRangeUsernsMode mode, UIDRange **ret) { + return 
uid_range_load_userns_by_fd_full(userns_fd, mode, true, ret); +} bool uid_range_overlaps(const UIDRange *range, uid_t start, uid_t nr); +int uid_range_clip(UIDRange *range, uid_t min, uid_t max); +int uid_range_partition(UIDRange *range, uid_t size); +int uid_range_copy(const UIDRange *range, UIDRange **ret); +int uid_range_remove(UIDRange *range, uid_t start, uid_t size); +int uid_range_translate(const UIDRange *outside, const UIDRange *inside, uid_t uid, uid_t *ret); + int uid_map_search_root(pid_t pid, UIDRangeUsernsMode mode, uid_t *ret); uid_t uid_range_base(const UIDRange *range); diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c index 60d3a01ce06..abb50955081 100644 --- a/src/nsresourced/nsresourcework.c +++ b/src/nsresourced/nsresourcework.c @@ -34,6 +34,7 @@ #include "mountpoint-util.h" #include "namespace-util.h" #include "netlink-util.h" +#include "nsresource.h" #include "pidref.h" #include "process-util.h" #include "random-util.h" @@ -357,16 +358,19 @@ static int vl_method_get_memberships(sd_varlink *link, sd_json_variant *paramete return sd_varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); } -static int uid_is_available( - int registry_dir_fd, - uid_t candidate) { - +static int uid_is_available(int registry_dir_fd, uid_t candidate, int parent_userns_fd) { int r; assert(registry_dir_fd >= 0); log_debug("Checking if UID " UID_FMT " is available.", candidate); + uint64_t parent_userns_inode = 0; + struct stat parent_st; + if (fstat(parent_userns_fd, &parent_st) < 0) + return log_debug_errno(errno, "Failed to fstat parent user namespace: %m"); + parent_userns_inode = parent_st.st_ino; + r = userns_registry_uid_exists(registry_dir_fd, candidate); if (r < 0) return r; @@ -379,17 +383,65 @@ static int uid_is_available( if (r > 0) return false; - r = userdb_by_uid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL); - if (r >= 0) - return false; - if (r != -ESRCH) + /* Also check delegation 
files. If parent_userns_inode is set and matches the delegation's userns + * inode, the UID is available because the parent owns that delegation. */ + r = userns_registry_delegation_uid_exists(registry_dir_fd, candidate); + if (r < 0) return r; + if (r > 0) { + _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo delegation = DELEGATED_USER_NAMESPACE_INFO_NULL; + r = userns_registry_load_delegation_by_uid(registry_dir_fd, candidate, &delegation); + if (r < 0) + return r; - r = groupdb_by_gid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL); - if (r >= 0) - return false; - if (r != -ESRCH) + if (delegation.userns_inode != parent_userns_inode) + return false; + + /* The parent userns owns this delegation, so the UID is available for nested allocation */ + log_debug("UID " UID_FMT " is delegated by parent userns inode %" PRIu64 ", available for nested allocation.", + candidate, parent_userns_inode); + } + + r = userns_registry_delegation_gid_exists(registry_dir_fd, (gid_t) candidate); + if (r < 0) return r; + if (r > 0) { + _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo delegation = DELEGATED_USER_NAMESPACE_INFO_NULL; + r = userns_registry_load_delegation_by_gid(registry_dir_fd, candidate, &delegation); + if (r < 0) + return r; + + if (delegation.userns_inode != parent_userns_inode) + return false; + + /* The parent userns owns this delegation, so the UID is available for nested allocation */ + log_debug("UID " UID_FMT " is delegated by parent userns inode %" PRIu64 ", available for nested allocation.", + candidate, parent_userns_inode); + } + + r = is_our_namespace(parent_userns_fd, NAMESPACE_USER); + if (r < 0) + return log_debug_errno(r, "Failed to check if parent user namespace is our user namespace: %m"); + + if (r > 0) { + /* Only check userdb if we're allocating from our current user namespace. userdb won't be + * to tell us anything on whether UIDs/GIDs in another user namespace are in use or not. 
On + * top of that, for nspawn containers registered with machined's userdb implementation, it + * would tell us that any ranges delegated to the container are in use (which is true in the + * nsresourced user namespace, but not in the nspawn user namespace). */ + + r = userdb_by_uid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + + r = groupdb_by_gid(candidate, /* match= */ NULL, USERDB_AVOID_MULTIPLEXER, /* ret= */ NULL); + if (r >= 0) + return false; + if (r != -ESRCH) + return r; + } log_debug("UID " UID_FMT " is available.", candidate); @@ -433,19 +485,114 @@ static int name_is_available( return true; } -static int allocate_now( +static int allocate_one( int registry_dir_fd, - UserNamespaceInfo *info, - int *ret_lock_fd) { + const char *name, + uint32_t size, + int parent_userns_fd, + UIDRange *candidates, + uid_t *ret_candidate) { static const uint8_t hash_key[16] = { 0xd4, 0xd7, 0x33, 0xa7, 0x4d, 0xd3, 0x42, 0xcd, 0xaa, 0xe9, 0x45, 0xd0, 0xfb, 0xec, 0x79, 0xee, }; - - _cleanup_(uid_range_freep) UIDRange *valid_range = NULL; - uid_t candidate, uidmin, uidmax, uidmask; + _cleanup_(uid_range_freep) UIDRange *copy = NULL; + uid_t candidate, uidmin, uidmax; unsigned n_tries = 100; + size_t idx; + int r; + + assert(registry_dir_fd >= 0); + assert(candidates); + assert(ret_candidate); + + switch (size) { + + case NSRESOURCE_UIDS_64K: + uidmin = CONTAINER_UID_BASE_MIN; + uidmax = CONTAINER_UID_BASE_MAX; + break; + + case NSRESOURCE_UIDS_1: + uidmin = DYNAMIC_UID_MIN; + uidmax = DYNAMIC_UID_MAX; + break; + + default: + assert_not_reached(); + } + + /* Make a copy of candidates that we can modify for the selection algorithm */ + r = uid_range_copy(candidates, ©); + if (r < 0) + return log_debug_errno(r, "Failed to copy UID range: %m"); + + /* Clip the copy with the valid UID range for this allocation size */ + r = uid_range_clip(copy, uidmin, uidmax); + if (r < 0) + return 
log_debug_errno(r, "Failed to intersect UID range: %m"); + + /* Partition entries into entries of exactly the right size */ + r = uid_range_partition(copy, size); + if (r < 0) + return log_debug_errno(r, "Failed to partition UID ranges: %m"); + + if (uid_range_is_empty(copy)) + return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), "Relevant UID range not delegated, can't allocate."); + + log_debug("Partitioned UID range into %zu entries of size %" PRIu32, copy->n_entries, size); + + /* Start from a hash of the input name if we have one, use random values afterwards. */ + idx = name ? siphash24_string(name, hash_key) : random_u32(); + for (;; idx = random_u32()) { + if (uid_range_is_empty(copy)) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "All candidate UIDs already taken."); + + if (--n_tries <= 0) + return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available."); + + idx %= copy->n_entries; + + candidate = copy->entries[idx].start; + + /* We only check the base UID for each range. Pass the parent userns inode so that + * allocating from a delegated range owned by the parent is allowed. 
*/ + r = uid_is_available(registry_dir_fd, candidate, parent_userns_fd); + if (r < 0) + return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate); + if (r > 0) + break; + + log_debug("UID range " UID_FMT " already taken.", candidate); + + /* Remove this unavailable range from candidates so we don't try it again */ + r = uid_range_remove(copy, candidate, size); + if (r < 0) + return log_debug_errno(r, "Failed to remove unavailable range from candidates: %m"); + } + + /* Remove the allocated range from the original candidates */ + r = uid_range_remove(candidates, candidate, size); + if (r < 0) + return log_debug_errno(r, "Failed to remove allocated range from candidates: %m"); + + *ret_candidate = candidate; + + log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + size - 1); + + return 0; +} + +static int allocate_now( + int registry_dir_fd, + int userns_fd, + int parent_userns_fd, + UserNamespaceInfo *info, + int *ret_lock_fd) { + + _cleanup_(uid_range_freep) UIDRange *candidates = NULL; + uid_t candidate; int r; /* Returns the following error codes: @@ -456,33 +603,12 @@ static int allocate_now( */ assert(registry_dir_fd >= 0); + assert(userns_fd >= 0); assert(info); - switch (info->size) { - - case 0x10000U: - uidmin = CONTAINER_UID_BASE_MIN; - uidmax = CONTAINER_UID_BASE_MAX; - uidmask = (uid_t) UINT32_C(0xFFFF0000); - break; - - case 1U: - uidmin = DYNAMIC_UID_MIN; - uidmax = DYNAMIC_UID_MAX; - uidmask = (uid_t) UINT32_C(0xFFFFFFFF); - break; - - default: - assert_not_reached(); - } - - r = uid_range_load_userns(/* path= */ NULL, UID_RANGE_USERNS_INSIDE, &valid_range); + r = uid_range_load_userns_by_fd(parent_userns_fd, UID_RANGE_USERNS_INSIDE, &candidates); if (r < 0) - return r; - - /* Check early whether we have any chance at all given our own uid range */ - if (!uid_range_overlaps(valid_range, uidmin, uidmax)) - return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), "Relevant UID range not 
delegated, can't allocate."); + return log_debug_errno(r, "Failed to read userns UID range: %m"); _cleanup_close_ int lock_fd = -EBADF; lock_fd = userns_registry_lock(registry_dir_fd); @@ -508,45 +634,74 @@ static int allocate_now( if (r == 0) return -EEXIST; - for (candidate = siphash24_string(info->name, hash_key) & UINT32_MAX;; /* Start from a hash of the input name */ - candidate = random_u32()) { /* Use random values afterwards */ + r = allocate_one( + registry_dir_fd, + info->name, info->size, + parent_userns_fd, + candidates, + &candidate); + if (r < 0) + return r; - if (--n_tries <= 0) - return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available."); + info->start_uid = candidate; + info->start_gid = (gid_t) candidate; - candidate = (candidate % (uidmax - uidmin)) + uidmin; - candidate &= uidmask; + /* Now allocate delegated ranges if requested */ + if (info->n_delegates > 0) { + assert(info->delegates); - if (!uid_range_covers(valid_range, candidate, info->size)) - continue; + FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) { + r = allocate_one( + registry_dir_fd, + /* name= */ NULL, + delegate->size, + parent_userns_fd, + candidates, + &candidate); + if (r < 0) + return r; - /* We only check the base UID for each range (!) 
*/ - r = uid_is_available(registry_dir_fd, candidate); - if (r < 0) - return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate); - if (r > 0) { - info->start_uid = candidate; - info->start_gid = (gid_t) candidate; - - log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + info->size - 1); - - if (ret_lock_fd) - *ret_lock_fd = TAKE_FD(lock_fd); - - return 0; + delegate->userns_inode = info->userns_inode; + delegate->start_uid = candidate; + delegate->start_gid = (gid_t) candidate; } - - log_debug("UID range " UID_FMT " already taken.", candidate); } + + if (ret_lock_fd) + *ret_lock_fd = TAKE_FD(lock_fd); + + return 0; } -static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) { +static int write_userns_mappings(PidRef *pidref, const char *uidmap, const char *gidmap) { + const char *pmap; + int r; + + assert(pidref); + assert(uidmap); + assert(gidmap); + + pmap = procfs_file_alloca(pidref->pid, "uid_map"); + r = write_string_file(pmap, uidmap, /* flags= */ 0); + if (r < 0) + return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m"); + + pmap = procfs_file_alloca(pidref->pid, "gid_map"); + r = write_string_file(pmap, gidmap, /* flags= */ 0); + if (r < 0) + return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m"); + + return 0; +} + +static int write_userns(int userns_fd, int parent_userns_fd, const UserNamespaceInfo *userns_info) { _cleanup_(pidref_done_sigkill_wait) PidRef pidref = PIDREF_NULL; _cleanup_close_ int efd = -EBADF; uint64_t u; int r; - assert(usernsfd >= 0); + assert(userns_fd >= 0); + assert(parent_userns_fd >= 0); assert(userns_info); assert(uid_is_valid(userns_info->target_uid)); assert(uid_is_valid(userns_info->start_uid)); @@ -566,7 +721,7 @@ static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) { if (r == 0) { /* child */ - if (setns(usernsfd, CLONE_NEWUSER) < 0) { + if (setns(userns_fd, 
CLONE_NEWUSER) < 0) { log_error_errno(errno, "Failed to join user namespace: %m"); goto child_fail; } @@ -588,22 +743,135 @@ static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) { /* Now write mapping */ - _cleanup_free_ char *pmap = NULL; + _cleanup_(uid_range_freep) UIDRange *outside_range = NULL; + r = uid_range_load_userns_by_fd_full(parent_userns_fd, UID_RANGE_USERNS_OUTSIDE, /* coalesce= */ false, &outside_range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns UID range: %m"); - if (asprintf(&pmap, "/proc/" PID_FMT "/uid_map", pidref.pid) < 0) + _cleanup_(uid_range_freep) UIDRange *inside_range = NULL; + r = uid_range_load_userns_by_fd_full(parent_userns_fd, UID_RANGE_USERNS_INSIDE, /* coalesce= */ false, &inside_range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns UID range: %m"); + + uid_t start_uid; + r = uid_range_translate(outside_range, inside_range, userns_info->start_uid, &start_uid); + if (r < 0) + return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent userns: %m", userns_info->start_uid); + + /* Let's enforce that the transient UID/GID ranges are mapped 1:1 in the parent user namespace, to + * avoid any weird mapping shenanigans that might happen otherwise. 
*/ + + if (start_uid != userns_info->start_uid) + return log_debug_errno( + SYNTHETIC_ERRNO(ERANGE), + "Transient UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")", + userns_info->start_uid, start_uid); + + /* Build uid_map content: primary mapping + delegated mappings (1:1) */ + _cleanup_free_ char *uidmap = NULL; + if (asprintf(&uidmap, UID_FMT " " UID_FMT " %" PRIu32 "\n", + userns_info->target_uid, start_uid, userns_info->size) < 0) return log_oom(); - r = write_string_filef(pmap, 0, UID_FMT " " UID_FMT " %" PRIu32 "\n", userns_info->target_uid, userns_info->start_uid, userns_info->size); - if (r < 0) - return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m"); + log_debug("UID mapping: " UID_FMT " " UID_FMT " %" PRIu32, + userns_info->target_uid, userns_info->start_uid, userns_info->size); - pmap = mfree(pmap); - if (asprintf(&pmap, "/proc/" PID_FMT "/gid_map", pidref.pid) < 0) + FOREACH_ARRAY(delegate, userns_info->delegates, userns_info->n_delegates) { + r = uid_range_translate(outside_range, inside_range, delegate->start_uid, &start_uid); + if (r < 0) + return log_debug_errno(r, "Failed to translate UID "UID_FMT" to parent userns: %m", userns_info->start_uid); + + if (start_uid != delegate->start_uid) + return log_debug_errno( + SYNTHETIC_ERRNO(ERANGE), + "Delegated transient UID range not mapped 1:1 in parent userns ("UID_FMT" -> "UID_FMT")", + delegate->start_uid, start_uid); + + if (strextendf(&uidmap, + UID_FMT " " UID_FMT " %" PRIu32 "\n", + delegate->start_uid, + start_uid, + delegate->size) < 0) + return log_oom(); + + log_debug("UID mapping: " UID_FMT " " UID_FMT " %" PRIu32, + delegate->start_uid, start_uid, delegate->size); + } + + outside_range = uid_range_free(outside_range); + inside_range = uid_range_free(inside_range); + + r = uid_range_load_userns_by_fd_full(parent_userns_fd, GID_RANGE_USERNS_OUTSIDE, /* coalesce= */ false, &outside_range); + if (r < 0) + return log_debug_errno(r, "Failed to read 
userns GID range: %m"); + + r = uid_range_load_userns_by_fd_full(parent_userns_fd, GID_RANGE_USERNS_INSIDE, /* coalesce= */ false, &inside_range); + if (r < 0) + return log_debug_errno(r, "Failed to read userns GID range: %m"); + + gid_t start_gid; + r = uid_range_translate(outside_range, inside_range, userns_info->start_gid, &start_gid); + if (r < 0) + return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", userns_info->start_gid); + + if (start_gid != userns_info->start_gid) + return log_debug_errno( + SYNTHETIC_ERRNO(ERANGE), + "Transient GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")", + userns_info->start_gid, start_gid); + + _cleanup_free_ char *gidmap = NULL; + if (asprintf(&gidmap, GID_FMT " " GID_FMT " %" PRIu32 "\n", + userns_info->target_gid, start_gid, userns_info->size) < 0) return log_oom(); - r = write_string_filef(pmap, 0, GID_FMT " " GID_FMT " %" PRIu32 "\n", userns_info->target_gid, userns_info->start_gid, userns_info->size); + log_debug("GID mapping: " GID_FMT " " GID_FMT " %" PRIu32, + userns_info->target_gid, userns_info->start_gid, userns_info->size); + + FOREACH_ARRAY(delegate, userns_info->delegates, userns_info->n_delegates) { + r = uid_range_translate(outside_range, inside_range, delegate->start_gid, &start_gid); + if (r < 0) + return log_debug_errno(r, "Failed to translate GID "GID_FMT" to parent userns: %m", userns_info->start_gid); + + if (start_gid != delegate->start_gid) + return log_debug_errno( + SYNTHETIC_ERRNO(ERANGE), + "Delegated transient GID range not mapped 1:1 in parent userns ("GID_FMT" -> "GID_FMT")", + delegate->start_gid, start_gid); + + /* Delegated ranges are mapped 1:1 (inside GID == outside GID) */ + if (strextendf(&gidmap, GID_FMT " " GID_FMT " %" PRIu32 "\n", + delegate->start_gid, + start_gid, + delegate->size) < 0) + return log_oom(); + + log_debug("GID mapping: " GID_FMT " " GID_FMT " %" PRIu32, + delegate->start_gid, start_gid, delegate->size); + } + + r = 
is_our_namespace(parent_userns_fd, NAMESPACE_USER); if (r < 0) - return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m"); + return log_debug_errno(r, "Failed to check if parent user namespace refers to our own user namespace: %m"); + if (r > 0) + return write_userns_mappings(&pidref, uidmap, gidmap); + + /* The kernel is paranoid that the uid_map and gid_map files are written either from the user + * namespace itself or its parent user namespace, so we have to join the parent user namespace to + * write the files. */ + + r = pidref_safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT, /* ret= */ NULL); + if (r < 0) + return r; + if (r == 0) { + if (setns(parent_userns_fd, CLONE_NEWUSER) < 0) { + log_error_errno(errno, "Failed to join parent user namespace: %m"); + _exit(EXIT_FAILURE); + } + + r = write_userns_mappings(&pidref, uidmap, gidmap); + _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } /* We are done! */ @@ -840,16 +1108,18 @@ typedef struct AllocateParameters { uid_t target; unsigned userns_fd_idx; bool mangle_name; + uint32_t delegate_container_ranges; } AllocateParameters; static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *parameters, sd_varlink_method_flags_t flags, void *userdata) { static const sd_json_dispatch_field dispatch_table[] = { - { "name", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, offsetof(AllocateParameters, name), SD_JSON_MANDATORY }, - { "size", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, size), SD_JSON_MANDATORY }, - { "target", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid, offsetof(AllocateParameters, target), 0 }, - { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint, offsetof(AllocateParameters, userns_fd_idx), SD_JSON_MANDATORY }, - { "mangleName", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(AllocateParameters, mangle_name), 0 }, + { 
"name", SD_JSON_VARIANT_STRING, sd_json_dispatch_const_string, offsetof(AllocateParameters, name), SD_JSON_MANDATORY }, + { "size", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, size), SD_JSON_MANDATORY }, + { "target", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uid_gid, offsetof(AllocateParameters, target), 0 }, + { "userNamespaceFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint, offsetof(AllocateParameters, userns_fd_idx), SD_JSON_MANDATORY }, + { "mangleName", SD_JSON_VARIANT_BOOLEAN, sd_json_dispatch_stdbool, offsetof(AllocateParameters, mangle_name), 0 }, + { "delegateContainerRanges", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(AllocateParameters, delegate_container_ranges), 0 }, {} }; @@ -883,6 +1153,9 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para if (r != 0) return r; + if (p.delegate_container_ranges > USER_NAMESPACE_DELEGATIONS_MAX) + return sd_varlink_error(link, "io.systemd.NamespaceResource.TooManyDelegations", NULL); + userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx); if (userns_fd < 0) return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m"); @@ -898,6 +1171,10 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para if (fstat(userns_fd, &userns_st) < 0) return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m"); + _cleanup_close_ int parent_userns_fd = ioctl(userns_fd, NS_GET_PARENT); + if (parent_userns_fd < 0) + return log_debug_errno(errno, "Failed to get parent user namespace: %m"); + r = sd_varlink_get_peer_uid(link, &peer_uid); if (r < 0) return r; @@ -942,7 +1219,21 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para userns_info->target_uid = p.target; userns_info->target_gid = (gid_t) p.target; - r = allocate_now(registry_dir_fd, userns_info, &lock_fd); + /* Set up delegation arrays if requested */ + 
if (p.delegate_container_ranges > 0) { + userns_info->delegates = new0(DelegatedUserNamespaceInfo, p.delegate_container_ranges); + if (!userns_info->delegates) + return -ENOMEM; + + FOREACH_ARRAY(delegate, userns_info->delegates, p.delegate_container_ranges) { + *delegate = DELEGATED_USER_NAMESPACE_INFO_NULL; + delegate->size = NSRESOURCE_UIDS_64K; + } + + userns_info->n_delegates = p.delegate_container_ranges; + } + + r = allocate_now(registry_dir_fd, userns_fd, parent_userns_fd, userns_info, &lock_fd); if (r == -EHOSTDOWN) /* The needed UID range is not delegated to us */ return sd_varlink_error(link, "io.systemd.NamespaceResource.DynamicRangeUnavailable", NULL); if (r == -EBUSY) /* All used up */ @@ -968,7 +1259,7 @@ static int vl_method_allocate_user_range(sd_varlink *link, sd_json_variant *para if (r < 0) goto fail; - r = write_userns(userns_fd, userns_info); + r = write_userns(userns_fd, parent_userns_fd, userns_info); if (r < 0) goto fail; diff --git a/src/nsresourced/userns-registry.c b/src/nsresourced/userns-registry.c index 97a222cfc40..a728c7fde9f 100644 --- a/src/nsresourced/userns-registry.c +++ b/src/nsresourced/userns-registry.c @@ -56,6 +56,23 @@ int userns_registry_lock(int dir_fd) { return TAKE_FD(lock_fd); } +void delegated_userns_info_done(DelegatedUserNamespaceInfo *info) { + if (!info) + return; + + info->ancestor_userns = mfree(info->ancestor_userns); + info->n_ancestor_userns = 0; +} + +void delegated_userns_info_done_many(DelegatedUserNamespaceInfo infos[], size_t n) { + assert(infos || n == 0); + + FOREACH_ARRAY(info, infos, n) + delegated_userns_info_done(info); + + free(infos); +} + UserNamespaceInfo* userns_info_new(void) { UserNamespaceInfo *info = new(UserNamespaceInfo, 1); if (!info) @@ -79,6 +96,8 @@ UserNamespaceInfo *userns_info_free(UserNamespaceInfo *userns) { free(userns->cgroups); free(userns->name); + delegated_userns_info_done_many(userns->delegates, userns->n_delegates); + strv_free(userns->netifs); return mfree(userns); @@ 
-128,6 +147,100 @@ static int dispatch_cgroups_array(const char *name, sd_json_variant *variant, sd return 0; } +static int dispatch_delegates_array(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) { + UserNamespaceInfo *info = ASSERT_PTR(userdata); + DelegatedUserNamespaceInfo *delegates = NULL; + size_t n = 0; + int r; + + CLEANUP_ARRAY(delegates, n, delegated_userns_info_done_many); + + if (sd_json_variant_is_null(variant)) { + delegated_userns_info_done_many(info->delegates, info->n_delegates); + info->delegates = NULL; + info->n_delegates = 0; + return 0; + } + + if (!sd_json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name)); + + size_t elements = sd_json_variant_elements(variant); + if (elements > USER_NAMESPACE_DELEGATIONS_MAX) + return json_log(variant, flags, SYNTHETIC_ERRNO(E2BIG), "Too many delegations."); + + delegates = new(DelegatedUserNamespaceInfo, elements); + if (!delegates) + return json_log_oom(variant, flags); + + sd_json_variant *e; + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + static const sd_json_dispatch_field delegate_dispatch_table[] = { + { "userns", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64, offsetof(DelegatedUserNamespaceInfo, userns_inode), 0 }, + { "start", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DelegatedUserNamespaceInfo, start_uid), SD_JSON_MANDATORY }, + { "startGid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DelegatedUserNamespaceInfo, start_gid), SD_JSON_MANDATORY }, + { "size", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint32, offsetof(DelegatedUserNamespaceInfo, size), SD_JSON_MANDATORY }, + {} + }; + + delegates[n] = DELEGATED_USER_NAMESPACE_INFO_NULL; + + r = sd_json_dispatch(e, delegate_dispatch_table, flags, &delegates[n]); + if (r < 0) + return r; + + if (!uid_is_valid(delegates[n].start_uid) || !gid_is_valid(delegates[n].start_gid)) + return 
json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid delegate UID/GID."); + + if (delegates[n].size == 0) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid delegate size."); + + n++; + } + + delegated_userns_info_done_many(info->delegates, info->n_delegates); + info->delegates = TAKE_PTR(delegates); + info->n_delegates = n; + + return 0; +} + +static int dispatch_ancestor_userns_array(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) { + DelegatedUserNamespaceInfo *info = ASSERT_PTR(userdata); + _cleanup_free_ uint64_t *ancestor_userns = NULL; + size_t n = 0; + + if (sd_json_variant_is_null(variant)) { + info->ancestor_userns = mfree(info->ancestor_userns); + info->n_ancestor_userns = 0; + return 0; + } + + if (!sd_json_variant_is_array(variant)) + return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name)); + + ancestor_userns = new(uint64_t, sd_json_variant_elements(variant)); + if (!ancestor_userns) + return json_log_oom(variant, flags); + + sd_json_variant *e; + JSON_VARIANT_ARRAY_FOREACH(e, variant) { + if (!sd_json_variant_is_unsigned(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not an unsigned integer."); + + uint64_t v = sd_json_variant_unsigned(e); + if (v == 0) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid ancestor userns inode 0."); + + ancestor_userns[n++] = v; + } + + free_and_replace(info->ancestor_userns, ancestor_userns); + info->n_ancestor_userns = n; + + return 0; +} + static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **ret) { static const sd_json_dispatch_field dispatch_table[] = { @@ -141,6 +254,7 @@ static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo ** { "targetGid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(UserNamespaceInfo, target_gid), 0 }, { "cgroups", SD_JSON_VARIANT_ARRAY, dispatch_cgroups_array, 0, 0 }, { "netifs", 
SD_JSON_VARIANT_ARRAY, sd_json_dispatch_strv, offsetof(UserNamespaceInfo, netifs), 0 }, + { "delegates", SD_JSON_VARIANT_ARRAY, dispatch_delegates_array, 0, 0 }, {} }; @@ -443,6 +557,18 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) { return r; } + _cleanup_(sd_json_variant_unrefp) sd_json_variant *delegates_array = NULL; + FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) { + r = sd_json_variant_append_arraybo( + &delegates_array, + SD_JSON_BUILD_PAIR_UNSIGNED("userns", delegate->userns_inode), + SD_JSON_BUILD_PAIR_UNSIGNED("start", delegate->start_uid), + SD_JSON_BUILD_PAIR_UNSIGNED("startGid", delegate->start_gid), + SD_JSON_BUILD_PAIR_UNSIGNED("size", delegate->size)); + if (r < 0) + return r; + } + _cleanup_(sd_json_variant_unrefp) sd_json_variant *def = NULL; r = sd_json_buildo( &def, @@ -455,7 +581,8 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) { SD_JSON_BUILD_PAIR_CONDITION(gid_is_valid(info->start_gid), "startGid", SD_JSON_BUILD_UNSIGNED(info->start_gid)), SD_JSON_BUILD_PAIR_CONDITION(gid_is_valid(info->target_gid), "targetGid", SD_JSON_BUILD_UNSIGNED(info->target_gid)), SD_JSON_BUILD_PAIR_CONDITION(!!cgroup_array, "cgroups", SD_JSON_BUILD_VARIANT(cgroup_array)), - JSON_BUILD_PAIR_STRV_NON_EMPTY("netifs", info->netifs)); + JSON_BUILD_PAIR_STRV_NON_EMPTY("netifs", info->netifs), + SD_JSON_BUILD_PAIR_CONDITION(!!delegates_array, "delegates", SD_JSON_BUILD_VARIANT(delegates_array))); if (r < 0) return r; @@ -531,6 +658,82 @@ int userns_registry_store(int dir_fd, UserNamespaceInfo *info) { goto fail; } + /* Store delegation files */ + FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *delegate_def = NULL, *ancestor_array = NULL; + _cleanup_free_ char *delegate_buf = NULL, *delegate_uid_fn = NULL, *delegate_gid_fn = NULL; + + if (asprintf(&delegate_uid_fn, "u" UID_FMT ".delegate", delegate->start_uid) < 0) { + r = log_oom_debug(); + goto fail; + } + 
+ /* Check if this delegation already exists. If so, this is a recursive + * subdelegation: we need to preserve the chain of previous owners so that + * ownership can be restored when the current owner goes away. */ + _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo existing = DELEGATED_USER_NAMESPACE_INFO_NULL; + + r = userns_registry_load_delegation_by_uid(dir_fd, delegate->start_uid, &existing); + if (r >= 0) { + /* Delegation file exists — append old owner to ancestor chain */ + FOREACH_ARRAY(ancestor_userns, existing.ancestor_userns, existing.n_ancestor_userns) { + r = sd_json_variant_append_arrayb( + &ancestor_array, + SD_JSON_BUILD_UNSIGNED(*ancestor_userns)); + if (r < 0) + goto fail; + } + + /* userns_registry_store() is also called to update existing entries in the registry + * in which case we don't need to update the ownership of the delegated UID ranges. */ + if (delegate->userns_inode != existing.userns_inode) { + r = sd_json_variant_append_arrayb( + &ancestor_array, + SD_JSON_BUILD_UNSIGNED(existing.userns_inode)); + if (r < 0) + goto fail; + } + + } else if (r != -ENOENT) { + log_debug_errno(r, "Failed to load existing delegation for UID " UID_FMT ": %m", delegate->start_uid); + goto fail; + } + + r = sd_json_buildo( + &delegate_def, + SD_JSON_BUILD_PAIR_UNSIGNED("userns", delegate->userns_inode), + SD_JSON_BUILD_PAIR_UNSIGNED("start", delegate->start_uid), + SD_JSON_BUILD_PAIR_UNSIGNED("startGid", delegate->start_gid), + SD_JSON_BUILD_PAIR_UNSIGNED("size", delegate->size), + SD_JSON_BUILD_PAIR_CONDITION(!!ancestor_array, "ancestorUserns", SD_JSON_BUILD_VARIANT(ancestor_array))); + if (r < 0) + goto fail; + + r = sd_json_variant_format(delegate_def, /* flags= */ 0, &delegate_buf); + if (r < 0) { + log_debug_errno(r, "Failed to format delegation JSON object: %m"); + goto fail; + } + + r = write_string_file_at(dir_fd, delegate_uid_fn, delegate_buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC); + if (r < 0) { + log_debug_errno(r, 
"Failed to write delegation data to '%s' in registry: %m", delegate_uid_fn); + goto fail; + } + + /* Create GID symlink pointing to the UID file */ + if (asprintf(&delegate_gid_fn, "g" GID_FMT ".delegate", delegate->start_gid) < 0) { + r = log_oom_debug(); + goto fail; + } + + r = linkat_replace(dir_fd, delegate_uid_fn, dir_fd, delegate_gid_fn); + if (r < 0) { + log_debug_errno(r, "Failed to link delegation data to '%s' in registry: %m", delegate_gid_fn); + goto fail; + } + } + return 0; fail: @@ -547,6 +750,17 @@ fail: if (uid_fn) (void) unlinkat(dir_fd, uid_fn, AT_REMOVEDIR); + /* Clean up any delegation files we created */ + FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) { + _cleanup_free_ char *delegate_uid_fn = NULL, *delegate_gid_fn = NULL; + + if (asprintf(&delegate_uid_fn, "u" UID_FMT ".delegate", delegate->start_uid) >= 0) + (void) unlinkat(dir_fd, delegate_uid_fn, /* flags= */ 0); + + if (asprintf(&delegate_gid_fn, "g" GID_FMT ".delegate", delegate->start_gid) >= 0) + (void) unlinkat(dir_fd, delegate_gid_fn, /* flags= */ 0); + } + return r; } @@ -568,14 +782,18 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) { if (asprintf(&reg_fn, "i%" PRIu64 ".userns", info->userns_inode) < 0) return log_oom_debug(); - ret = RET_NERRNO(unlinkat(dir_fd, reg_fn, 0)); + r = RET_NERRNO(unlinkat(dir_fd, reg_fn, 0)); + if (r < 0) + RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", reg_fn)); _cleanup_free_ char *link1_fn = NULL; link1_fn = strjoin("n", info->name, ".userns"); if (!link1_fn) return log_oom_debug(); - RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link1_fn, 0))); + r = RET_NERRNO(unlinkat(dir_fd, link1_fn, 0)); + if (r < 0) + RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link1_fn)); if (uid_is_valid(info->start_uid)) { _cleanup_free_ char *link2_fn = NULL; @@ -583,7 +801,9 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) { if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start_uid) < 0)
return log_oom_debug(); - RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link2_fn, 0))); + r = RET_NERRNO(unlinkat(dir_fd, link2_fn, 0)); + if (r < 0) + RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link2_fn)); } if (uid_is_valid(info->start_gid)) { @@ -592,7 +812,9 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) { if (asprintf(&link3_fn, "g" GID_FMT ".userns", info->start_gid) < 0) return log_oom_debug(); - RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, link3_fn, 0))); + r = RET_NERRNO(unlinkat(dir_fd, link3_fn, 0)); + if (r < 0) + RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", link3_fn)); } _cleanup_free_ char *uid_fn = NULL; @@ -603,11 +825,90 @@ int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) { if (asprintf(&owner_fn, "%s/i%" PRIu64 ".userns", uid_fn, info->userns_inode) < 0) return log_oom_debug(); - RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, owner_fn, 0))); + r = RET_NERRNO(unlinkat(dir_fd, owner_fn, 0)); + if (r < 0) + RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", owner_fn)); r = RET_NERRNO(unlinkat(dir_fd, uid_fn, AT_REMOVEDIR)); - if (r != -ENOTEMPTY) - RET_GATHER(ret, r); + if (r < 0 && r != -ENOTEMPTY) + RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", uid_fn)); + + /* Remove or restore delegation files */ + FOREACH_ARRAY(delegate, info->delegates, info->n_delegates) { + /* Check if this delegation has ancestor user namespaces. If so, restore ownership to + * the last ancestor instead of removing the delegation file entirely. 
*/ + _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo existing = DELEGATED_USER_NAMESPACE_INFO_NULL; + + r = userns_registry_load_delegation_by_uid(dir_fd, delegate->start_uid, &existing); + if (r < 0) { + log_debug_errno(r, + "Failed to load delegated UID range starting at "UID_FMT":"GID_FMT" for userns %"PRIu64": %m", + delegate->start_uid, delegate->start_gid, delegate->userns_inode); + RET_GATHER(ret, r); + continue; + } + + _cleanup_free_ char *delegate_uid_fn = NULL; + if (asprintf(&delegate_uid_fn, "u" UID_FMT ".delegate", delegate->start_uid) < 0) + return log_oom_debug(); + + if (existing.n_ancestor_userns > 0) { + _cleanup_(sd_json_variant_unrefp) sd_json_variant *delegate_def = NULL, *ancestor_array = NULL; + _cleanup_free_ char *delegate_buf = NULL; + + /* Pop the last ancestor userns inode to become the new owner */ + uint64_t new_owner = existing.ancestor_userns[existing.n_ancestor_userns - 1]; + + log_debug("Moving ownership of delegated UID range from %"PRIu64" to %"PRIu64".", + delegate->userns_inode, new_owner); + + /* Rebuild ancestor array without the last entry */ + for (size_t j = 0; j + 1 < existing.n_ancestor_userns; j++) { + r = sd_json_variant_append_arrayb( + &ancestor_array, + SD_JSON_BUILD_UNSIGNED(existing.ancestor_userns[j])); + if (r < 0) + return log_debug_errno(r, "Failed to append to JSON array: %m"); + } + + r = sd_json_buildo( + &delegate_def, + SD_JSON_BUILD_PAIR_UNSIGNED("userns", new_owner), + SD_JSON_BUILD_PAIR_UNSIGNED("start", existing.start_uid), + SD_JSON_BUILD_PAIR_UNSIGNED("startGid", existing.start_gid), + SD_JSON_BUILD_PAIR_UNSIGNED("size", existing.size), + SD_JSON_BUILD_PAIR_CONDITION(!!ancestor_array, "ancestorUserns", SD_JSON_BUILD_VARIANT(ancestor_array))); + if (r < 0) + return log_debug_errno(r, "Failed to build delegate JSON object: %m"); + + r = sd_json_variant_format(delegate_def, /* flags= */ 0, &delegate_buf); + if (r < 0) + return log_debug_errno(r, "Failed to format delegation JSON 
object: %m"); + + r = write_string_file_at(dir_fd, delegate_uid_fn, delegate_buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC); + if (r < 0) + RET_GATHER(ret, log_debug_errno(r, "Failed to write restored delegation data to '%s' in registry: %m", delegate_uid_fn)); + + /* GID link already points to the UID file, no need to update it */ + continue; + } + + log_debug("Removing delegated UID range starting at "UID_FMT":"GID_FMT" for userns %"PRIu64 ".", + delegate->start_uid, delegate->start_gid, delegate->userns_inode); + + /* No ancestor chain — just remove the delegation files */ + r = RET_NERRNO(unlinkat(dir_fd, delegate_uid_fn, 0)); + if (r < 0) + RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", delegate_uid_fn)); + + _cleanup_free_ char *delegate_gid_fn = NULL; + if (asprintf(&delegate_gid_fn, "g" GID_FMT ".delegate", delegate->start_gid) < 0) + return log_oom_debug(); + + r = RET_NERRNO(unlinkat(dir_fd, delegate_gid_fn, 0)); + if (r < 0) + RET_GATHER(ret, log_debug_errno(r, "Failed to remove %s: %m", delegate_gid_fn)); + } return ret; } @@ -822,3 +1123,135 @@ int userns_registry_per_uid(int dir_fd, uid_t owner) { return n; } + +int userns_registry_delegation_uid_exists(int dir_fd, uid_t start) { + _cleanup_free_ char *fn = NULL; + + assert(dir_fd >= 0); + + if (!uid_is_valid(start)) + return -ENOENT; + + if (start == 0) + return true; + + if (asprintf(&fn, "u" UID_FMT ".delegate", start) < 0) + return -ENOMEM; + + if (faccessat(dir_fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) < 0) + return errno == ENOENT ? false : -errno; + + return true; +} + +int userns_registry_delegation_gid_exists(int dir_fd, gid_t start) { + _cleanup_free_ char *fn = NULL; + + assert(dir_fd >= 0); + + if (!gid_is_valid(start)) + return -ENOENT; + + if (start == 0) + return true; + + if (asprintf(&fn, "g" GID_FMT ".delegate", start) < 0) + return -ENOMEM; + + if (faccessat(dir_fd, fn, F_OK, AT_SYMLINK_NOFOLLOW) < 0) + return errno == ENOENT ? 
false : -errno; + + return true; +} + +static int userns_registry_load_delegation(int dir_fd, const char *filename, DelegatedUserNamespaceInfo *ret) { + + static const sd_json_dispatch_field dispatch_table[] = { + { "userns", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint64, offsetof(DelegatedUserNamespaceInfo, userns_inode), SD_JSON_MANDATORY }, + { "start", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DelegatedUserNamespaceInfo, start_uid), SD_JSON_MANDATORY }, + { "startGid", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uid_gid, offsetof(DelegatedUserNamespaceInfo, start_gid), 0 }, + { "size", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint32, offsetof(DelegatedUserNamespaceInfo, size), SD_JSON_MANDATORY }, + { "ancestorUserns", SD_JSON_VARIANT_ARRAY, dispatch_ancestor_userns_array, 0, 0 }, + {} + }; + + _cleanup_(sd_json_variant_unrefp) sd_json_variant *v = NULL; + _cleanup_close_ int registry_fd = -EBADF; + int r; + + if (dir_fd < 0) { + registry_fd = userns_registry_open_fd(); + if (registry_fd < 0) + return registry_fd; + + dir_fd = registry_fd; + } + + r = sd_json_parse_file_at(/* f= */ NULL, dir_fd, filename, /* flags= */ 0, &v, /* reterr_line= */ NULL, /* reterr_column= */ NULL); + if (r < 0) + return r; + + _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo data = DELEGATED_USER_NAMESPACE_INFO_NULL; + + r = sd_json_dispatch(v, dispatch_table, /* flags= */ 0, &data); + if (r < 0) + return r; + + if (data.userns_inode == 0) + return -EBADMSG; + if (data.size == 0) + return -EBADMSG; + + if (ret) + *ret = TAKE_GENERIC(data, DelegatedUserNamespaceInfo, DELEGATED_USER_NAMESPACE_INFO_NULL); + + return 0; +} + +int userns_registry_load_delegation_by_uid(int dir_fd, uid_t start, DelegatedUserNamespaceInfo *ret) { + _cleanup_free_ char *fn = NULL; + int r; + + if (!uid_is_valid(start)) + return -ENOENT; + + if (asprintf(&fn, "u" UID_FMT ".delegate", start) < 0) + return -ENOMEM; + + _cleanup_(delegated_userns_info_done) 
DelegatedUserNamespaceInfo data = DELEGATED_USER_NAMESPACE_INFO_NULL; + r = userns_registry_load_delegation(dir_fd, fn, &data); + if (r < 0) + return r; + + if (data.start_uid != start) + return -EBADMSG; + + if (ret) + *ret = TAKE_GENERIC(data, DelegatedUserNamespaceInfo, DELEGATED_USER_NAMESPACE_INFO_NULL); + + return 0; +} + +int userns_registry_load_delegation_by_gid(int dir_fd, gid_t start, DelegatedUserNamespaceInfo *ret) { + _cleanup_free_ char *fn = NULL; + int r; + + if (!gid_is_valid(start)) + return -ENOENT; + + if (asprintf(&fn, "g" GID_FMT ".delegate", start) < 0) + return -ENOMEM; + + _cleanup_(delegated_userns_info_done) DelegatedUserNamespaceInfo data = DELEGATED_USER_NAMESPACE_INFO_NULL; + r = userns_registry_load_delegation(dir_fd, fn, &data); + if (r < 0) + return r; + + if (data.start_gid != start) + return -EBADMSG; + + if (ret) + *ret = TAKE_GENERIC(data, DelegatedUserNamespaceInfo, DELEGATED_USER_NAMESPACE_INFO_NULL); + + return 0; +} diff --git a/src/nsresourced/userns-registry.h b/src/nsresourced/userns-registry.h index fee2623a3b5..f08b238861a 100644 --- a/src/nsresourced/userns-registry.h +++ b/src/nsresourced/userns-registry.h @@ -5,6 +5,26 @@ #define USER_NAMESPACE_CGROUPS_DELEGATE_MAX 16U #define USER_NAMESPACE_NETIFS_DELEGATE_MAX 16U +#define USER_NAMESPACE_DELEGATIONS_MAX 16U + +typedef struct DelegatedUserNamespaceInfo { + uint64_t userns_inode; + uid_t start_uid; + gid_t start_gid; + uint32_t size; + /* We track all the previous owners of the delegation so we can restore the previous owner of each + * delegated range when a user namespace with delegated ranges is freed.
*/ + uint64_t *ancestor_userns; + size_t n_ancestor_userns; +} DelegatedUserNamespaceInfo; + +#define DELEGATED_USER_NAMESPACE_INFO_NULL (DelegatedUserNamespaceInfo) { \ + .start_uid = UID_INVALID, \ + .start_gid = GID_INVALID, \ +} + +void delegated_userns_info_done(DelegatedUserNamespaceInfo *info); +void delegated_userns_info_done_many(DelegatedUserNamespaceInfo infos[], size_t n); typedef struct UserNamespaceInfo { uid_t owner; @@ -18,6 +38,8 @@ typedef struct UserNamespaceInfo { uint64_t *cgroups; size_t n_cgroups; char **netifs; + DelegatedUserNamespaceInfo *delegates; + size_t n_delegates; } UserNamespaceInfo; UserNamespaceInfo* userns_info_new(void); @@ -51,3 +73,8 @@ int userns_registry_uid_exists(int dir_fd, uid_t start); int userns_registry_gid_exists(int dir_fd, gid_t start); int userns_registry_per_uid(int dir_fd, uid_t owner); + +int userns_registry_delegation_uid_exists(int dir_fd, uid_t start); +int userns_registry_delegation_gid_exists(int dir_fd, gid_t start); +int userns_registry_load_delegation_by_uid(int dir_fd, uid_t start, DelegatedUserNamespaceInfo *ret); +int userns_registry_load_delegation_by_gid(int dir_fd, gid_t start, DelegatedUserNamespaceInfo *ret); diff --git a/src/shared/varlink-io.systemd.NamespaceResource.c b/src/shared/varlink-io.systemd.NamespaceResource.c index 03bfc411347..7d5f5093224 100644 --- a/src/shared/varlink-io.systemd.NamespaceResource.c +++ b/src/shared/varlink-io.systemd.NamespaceResource.c @@ -14,6 +14,8 @@ static SD_VARLINK_DEFINE_METHOD( SD_VARLINK_DEFINE_INPUT(target, SD_VARLINK_INT, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("A file descriptor to an allocated userns with no current UID range assignments"), SD_VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, SD_VARLINK_INT, 0), + SD_VARLINK_FIELD_COMMENT("Number of transient 64K container UID/GID ranges to delegate. These are mapped 1:1 into the user namespace and can be used by nested user namespaces for container workloads. Must be between 0 and 16. 
Defaults to 0."), + SD_VARLINK_DEFINE_INPUT(delegateContainerRanges, SD_VARLINK_INT, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("The name assigned to the user namespace. (This is particularly interesting in case mangleName was enabled)."), SD_VARLINK_DEFINE_OUTPUT(name, SD_VARLINK_STRING, SD_VARLINK_NULLABLE)); @@ -69,6 +71,7 @@ static SD_VARLINK_DEFINE_ERROR(UserNamespaceWithoutUserRange); static SD_VARLINK_DEFINE_ERROR(TooManyControlGroups); static SD_VARLINK_DEFINE_ERROR(ControlGroupAlreadyAdded); static SD_VARLINK_DEFINE_ERROR(TooManyNetworkInterfaces); +static SD_VARLINK_DEFINE_ERROR(TooManyDelegations); SD_VARLINK_DEFINE_INTERFACE( io_systemd_NamespaceResource, @@ -103,4 +106,6 @@ SD_VARLINK_DEFINE_INTERFACE( SD_VARLINK_SYMBOL_COMMENT("The specified cgroup has already been added to the user namespace."), &vl_error_ControlGroupAlreadyAdded, SD_VARLINK_SYMBOL_COMMENT("The per-user namespace limit of network interfaces has been reached."), - &vl_error_TooManyNetworkInterfaces); + &vl_error_TooManyNetworkInterfaces, + SD_VARLINK_SYMBOL_COMMENT("The specified number of delegations exceeds the maximum allowed."), + &vl_error_TooManyDelegations); diff --git a/src/test/test-uid-range.c b/src/test/test-uid-range.c index 69c39b05750..6eef98153ef 100644 --- a/src/test/test-uid-range.c +++ b/src/test/test-uid-range.c @@ -195,4 +195,306 @@ TEST(uid_range_coalesce) { ASSERT_EQ(p->entries[0].nr, 115U); } +TEST(uid_range_clip) { + _cleanup_(uid_range_freep) UIDRange *p = NULL; + + /* Build a range: 100-199, 300-399, 500-599 */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_OK(uid_range_add_str(&p, "300-399")); + ASSERT_OK(uid_range_add_str(&p, "500-599")); + ASSERT_EQ(uid_range_entries(p), 3U); + + /* Intersect with range that covers all entries */ + ASSERT_OK(uid_range_clip(p, 0, 1000)); + ASSERT_EQ(uid_range_entries(p), 3U); + ASSERT_EQ(p->entries[0].start, 100U); + ASSERT_EQ(p->entries[0].nr, 100U); + ASSERT_EQ(p->entries[1].start, 300U); + 
ASSERT_EQ(p->entries[1].nr, 100U); + ASSERT_EQ(p->entries[2].start, 500U); + ASSERT_EQ(p->entries[2].nr, 100U); + + /* Intersect with range that excludes first and last entries */ + ASSERT_OK(uid_range_clip(p, 200, 499)); + ASSERT_EQ(uid_range_entries(p), 1U); + ASSERT_EQ(p->entries[0].start, 300U); + ASSERT_EQ(p->entries[0].nr, 100U); + + p = uid_range_free(p); + + /* Test partial overlap - trimming from both sides */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_OK(uid_range_clip(p, 150, 180)); + ASSERT_EQ(uid_range_entries(p), 1U); + ASSERT_EQ(p->entries[0].start, 150U); + ASSERT_EQ(p->entries[0].nr, 31U); + + p = uid_range_free(p); + + /* Test intersection that removes all entries */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_OK(uid_range_clip(p, 500, 600)); + ASSERT_TRUE(uid_range_is_empty(p)); + + p = uid_range_free(p); + + /* Test invalid min > max */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_ERROR(uid_range_clip(p, 200, 100), EINVAL); + + p = uid_range_free(p); + + /* Test with max == UINT32_MAX (should not overflow) */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_OK(uid_range_clip(p, 0, UINT32_MAX)); + ASSERT_EQ(uid_range_entries(p), 1U); + ASSERT_EQ(p->entries[0].start, 100U); + ASSERT_EQ(p->entries[0].nr, 100U); + + p = uid_range_free(p); + + /* Test with both min and max at extremes */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_OK(uid_range_add_str(&p, "500-599")); + ASSERT_OK(uid_range_clip(p, 150, UINT32_MAX)); + ASSERT_EQ(uid_range_entries(p), 2U); + ASSERT_EQ(p->entries[0].start, 150U); + ASSERT_EQ(p->entries[0].nr, 50U); + ASSERT_EQ(p->entries[1].start, 500U); + ASSERT_EQ(p->entries[1].nr, 100U); +} + +TEST(uid_range_partition) { + _cleanup_(uid_range_freep) UIDRange *p = NULL; + + /* Single entry that divides evenly */ + ASSERT_OK(uid_range_add_str(&p, "0-299")); + ASSERT_EQ(uid_range_entries(p), 1U); + ASSERT_OK(uid_range_partition(p, 100)); + ASSERT_EQ(uid_range_entries(p), 3U); + 
ASSERT_EQ(p->entries[0].start, 0U); + ASSERT_EQ(p->entries[0].nr, 100U); + ASSERT_EQ(p->entries[1].start, 100U); + ASSERT_EQ(p->entries[1].nr, 100U); + ASSERT_EQ(p->entries[2].start, 200U); + ASSERT_EQ(p->entries[2].nr, 100U); + + p = uid_range_free(p); + + /* Entry with remainder (gets truncated) */ + ASSERT_OK(uid_range_add_str(&p, "0-249")); + ASSERT_OK(uid_range_partition(p, 100)); + ASSERT_EQ(uid_range_entries(p), 2U); + ASSERT_EQ(p->entries[0].start, 0U); + ASSERT_EQ(p->entries[0].nr, 100U); + ASSERT_EQ(p->entries[1].start, 100U); + ASSERT_EQ(p->entries[1].nr, 100U); + + p = uid_range_free(p); + + /* Entry smaller than partition size - gets dropped */ + ASSERT_OK(uid_range_add_str(&p, "0-49")); + ASSERT_OK(uid_range_partition(p, 100)); + ASSERT_TRUE(uid_range_is_empty(p)); + + p = uid_range_free(p); + + /* Multiple entries */ + ASSERT_OK(uid_range_add_str(&p, "0-199")); + ASSERT_OK(uid_range_add_str(&p, "1000-1299")); + ASSERT_EQ(uid_range_entries(p), 2U); + ASSERT_OK(uid_range_partition(p, 100)); + ASSERT_EQ(uid_range_entries(p), 5U); + ASSERT_EQ(p->entries[0].start, 0U); + ASSERT_EQ(p->entries[0].nr, 100U); + ASSERT_EQ(p->entries[1].start, 100U); + ASSERT_EQ(p->entries[1].nr, 100U); + ASSERT_EQ(p->entries[2].start, 1000U); + ASSERT_EQ(p->entries[2].nr, 100U); + ASSERT_EQ(p->entries[3].start, 1100U); + ASSERT_EQ(p->entries[3].nr, 100U); + ASSERT_EQ(p->entries[4].start, 1200U); + ASSERT_EQ(p->entries[4].nr, 100U); + + p = uid_range_free(p); + + /* Partition size of 1 */ + ASSERT_OK(uid_range_add_str(&p, "100-102")); + ASSERT_OK(uid_range_partition(p, 1)); + ASSERT_EQ(uid_range_entries(p), 3U); + ASSERT_EQ(p->entries[0].start, 100U); + ASSERT_EQ(p->entries[0].nr, 1U); + ASSERT_EQ(p->entries[1].start, 101U); + ASSERT_EQ(p->entries[1].nr, 1U); + ASSERT_EQ(p->entries[2].start, 102U); + ASSERT_EQ(p->entries[2].nr, 1U); +} + +TEST(uid_range_copy) { + _cleanup_(uid_range_freep) UIDRange *p = NULL, *copy = NULL; + + /* Copy NULL range */ + 
ASSERT_OK(uid_range_copy(NULL, &copy)); + ASSERT_TRUE(uid_range_is_empty(copy)); + + copy = uid_range_free(copy); + + /* Copy empty range */ + p = new0(UIDRange, 1); + ASSERT_NOT_NULL(p); + ASSERT_OK(uid_range_copy(p, &copy)); + ASSERT_NOT_NULL(copy); + ASSERT_TRUE(uid_range_is_empty(copy)); + + p = uid_range_free(p); + copy = uid_range_free(copy); + + /* Copy range with entries */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_OK(uid_range_add_str(&p, "300-399")); + ASSERT_OK(uid_range_copy(p, &copy)); + ASSERT_TRUE(uid_range_equal(p, copy)); + + /* Verify it's a deep copy - modifying original doesn't affect copy */ + ASSERT_OK(uid_range_add_str(&p, "500-599")); + ASSERT_FALSE(uid_range_equal(p, copy)); + ASSERT_EQ(uid_range_entries(copy), 2U); +} + +TEST(uid_range_remove) { + _cleanup_(uid_range_freep) UIDRange *p = NULL; + + /* Build a range: 100-199 */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + + /* Remove with size 0 - no-op */ + ASSERT_OK(uid_range_remove(p, 150, 0)); + ASSERT_EQ(uid_range_entries(p), 1U); + ASSERT_EQ(p->entries[0].start, 100U); + ASSERT_EQ(p->entries[0].nr, 100U); + + /* Remove range that doesn't overlap - no change */ + ASSERT_OK(uid_range_remove(p, 0, 50)); + ASSERT_EQ(uid_range_entries(p), 1U); + ASSERT_EQ(p->entries[0].start, 100U); + ASSERT_EQ(p->entries[0].nr, 100U); + + ASSERT_OK(uid_range_remove(p, 300, 50)); + ASSERT_EQ(uid_range_entries(p), 1U); + ASSERT_EQ(p->entries[0].start, 100U); + ASSERT_EQ(p->entries[0].nr, 100U); + + /* Remove from the start of the entry */ + ASSERT_OK(uid_range_remove(p, 100, 10)); + ASSERT_EQ(uid_range_entries(p), 1U); + ASSERT_EQ(p->entries[0].start, 110U); + ASSERT_EQ(p->entries[0].nr, 90U); + + /* Remove from the end of the entry */ + ASSERT_OK(uid_range_remove(p, 190, 10)); + ASSERT_EQ(uid_range_entries(p), 1U); + ASSERT_EQ(p->entries[0].start, 110U); + ASSERT_EQ(p->entries[0].nr, 80U); + + /* Remove from the middle - splits the entry */ + ASSERT_OK(uid_range_remove(p, 140, 20)); +
ASSERT_EQ(uid_range_entries(p), 2U); + ASSERT_EQ(p->entries[0].start, 110U); + ASSERT_EQ(p->entries[0].nr, 30U); + ASSERT_EQ(p->entries[1].start, 160U); + ASSERT_EQ(p->entries[1].nr, 30U); + + p = uid_range_free(p); + + /* Remove entire entry */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_OK(uid_range_remove(p, 100, 100)); + ASSERT_TRUE(uid_range_is_empty(p)); + + p = uid_range_free(p); + + /* Remove range larger than entry */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_OK(uid_range_remove(p, 50, 200)); + ASSERT_TRUE(uid_range_is_empty(p)); + + p = uid_range_free(p); + + /* Remove affecting multiple entries */ + ASSERT_OK(uid_range_add_str(&p, "100-199")); + ASSERT_OK(uid_range_add_str(&p, "300-399")); + ASSERT_OK(uid_range_add_str(&p, "500-599")); + ASSERT_EQ(uid_range_entries(p), 3U); + + /* Remove range spanning the middle entry completely and trimming others */ + ASSERT_OK(uid_range_remove(p, 150, 400)); + ASSERT_EQ(uid_range_entries(p), 2U); + ASSERT_EQ(p->entries[0].start, 100U); + ASSERT_EQ(p->entries[0].nr, 50U); + ASSERT_EQ(p->entries[1].start, 550U); + ASSERT_EQ(p->entries[1].nr, 50U); +} + +TEST(uid_range_translate) { + _cleanup_(uid_range_freep) UIDRange *o = NULL, *i = NULL; + uid_t uid; + + ASSERT_OK(uid_range_add_str_full(&o, "200-299", /* coalesce= */ false)); + ASSERT_OK(uid_range_add_str_full(&i, "100-199", /* coalesce= */ false)); + ASSERT_OK(uid_range_translate(o, i, 250, &uid)); + ASSERT_EQ(uid, 150U); + ASSERT_OK(uid_range_translate(i, o, 150, &uid)); + ASSERT_EQ(uid, 250U); + + ASSERT_OK(uid_range_add_str_full(&o, "300-399", /* coalesce= */ false)); + ASSERT_OK(uid_range_add_str_full(&i, "350-449", /* coalesce= */ false)); + ASSERT_OK(uid_range_translate(o, i, 350, &uid)); + ASSERT_EQ(uid, 400U); + ASSERT_OK(uid_range_translate(i, o, 400, &uid)); + ASSERT_EQ(uid, 350U); + + /* Test translating at range boundaries */ + ASSERT_OK(uid_range_translate(o, i, 200, &uid)); + ASSERT_EQ(uid, 100U); + 
ASSERT_OK(uid_range_translate(o, i, 299, &uid)); + ASSERT_EQ(uid, 199U); + ASSERT_OK(uid_range_translate(o, i, 300, &uid)); + ASSERT_EQ(uid, 350U); + ASSERT_OK(uid_range_translate(o, i, 399, &uid)); + ASSERT_EQ(uid, 449U); + + /* Test reverse translation at boundaries */ + ASSERT_OK(uid_range_translate(i, o, 100, &uid)); + ASSERT_EQ(uid, 200U); + ASSERT_OK(uid_range_translate(i, o, 199, &uid)); + ASSERT_EQ(uid, 299U); + ASSERT_OK(uid_range_translate(i, o, 350, &uid)); + ASSERT_EQ(uid, 300U); + ASSERT_OK(uid_range_translate(i, o, 449, &uid)); + ASSERT_EQ(uid, 399U); + + /* Test UID not in any range returns ESRCH */ + ASSERT_ERROR(uid_range_translate(o, i, 0, &uid), ESRCH); + ASSERT_ERROR(uid_range_translate(o, i, 199, &uid), ESRCH); + ASSERT_ERROR(uid_range_translate(o, i, 400, &uid), ESRCH); + ASSERT_ERROR(uid_range_translate(i, o, 0, &uid), ESRCH); + ASSERT_ERROR(uid_range_translate(i, o, 99, &uid), ESRCH); + ASSERT_ERROR(uid_range_translate(i, o, 200, &uid), ESRCH); + ASSERT_ERROR(uid_range_translate(i, o, 349, &uid), ESRCH); + ASSERT_ERROR(uid_range_translate(i, o, 450, &uid), ESRCH); + + o = uid_range_free(o); + i = uid_range_free(i); + + /* Test with single-element ranges */ + ASSERT_OK(uid_range_add_str_full(&o, "1000", /* coalesce= */ false)); + ASSERT_OK(uid_range_add_str_full(&i, "5000", /* coalesce= */ false)); + ASSERT_OK(uid_range_translate(o, i, 1000, &uid)); + ASSERT_EQ(uid, 5000U); + ASSERT_OK(uid_range_translate(i, o, 5000, &uid)); + ASSERT_EQ(uid, 1000U); + ASSERT_ERROR(uid_range_translate(o, i, 999, &uid), ESRCH); + ASSERT_ERROR(uid_range_translate(o, i, 1001, &uid), ESRCH); +} + DEFINE_TEST_MAIN(LOG_DEBUG); diff --git a/test/units/TEST-50-DISSECT.mountfsd.sh b/test/units/TEST-50-DISSECT.mountfsd.sh index 94f802e780d..e5092b56868 100755 --- a/test/units/TEST-50-DISSECT.mountfsd.sh +++ b/test/units/TEST-50-DISSECT.mountfsd.sh @@ -60,6 +60,23 @@ if (SYSTEMD_LOG_TARGET=console varlinkctl call \ exit 0 fi +# Test delegated UID ranges +# Verify that 
delegated ranges show up in uid_map (3 lines: 1 primary + 2 delegated ranges) +test "$(run0 -u testuser --pipe unshare --user varlinkctl --exec call \ + --push-fd=/proc/self/ns/user \ + /run/systemd/userdb/io.systemd.NamespaceResource \ + io.systemd.NamespaceResource.AllocateUserRange \ + '{"name":"test-delegate","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":2}' \ + -- cat /proc/self/uid_map | wc -l)" -eq 3 + +# Test that delegateContainerRanges > 16 fails with TooManyDelegations error +(! run0 -u testuser --pipe unshare --user varlinkctl call \ + --push-fd=/proc/self/ns/user \ + /run/systemd/userdb/io.systemd.NamespaceResource \ + io.systemd.NamespaceResource.AllocateUserRange \ + '{"name":"test-fail","size":65536,"userNamespaceFileDescriptor":0,"delegateContainerRanges":17}') |& + grep "io.systemd.NamespaceResource.TooManyDelegations" >/dev/null + # This should work without the key systemd-dissect --image-policy='root=verity:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null systemd-dissect --image-policy='root=verity+signed:=absent+unused' --mtree /var/tmp/unpriv.raw >/dev/null