From 1ada9064e65410fabadd57b281dbc8a42e0e0a6f Mon Sep 17 00:00:00 2001
From: Michael Vogt <michael@amutable.com>
Date: Wed, 17 Jun 2026 09:51:53 +0200
Subject: [PATCH] hostname: improve the algorithm in hostname_pick_word()

Lennart suggested using a more uniform algorithm for
picking the hostname words, one that is not biased by
word length: (predictably) randomly probe offsets until
we land on a word boundary. This is a very nice
suggestion, so this commit implements it with a fallback
to the "old" behavior if we do not find a word boundary
within a reasonable number of attempts.

A small python script shows that 64 iterations plus
fallback is a good number:
```
$ python3 simulate-hostname-pick.py 64
hostname-wordlist/adverbs
  words=261  p_accept=0.1119  avg_bytes/word=1/p=8.94
  max_iterations=64, n_trials=1000000
    fallback rate       :   0.051000%  (510/1_000_000)
    mean seeks per word :        8.93

hostname-wordlist/adjectives
  words=449  p_accept=0.1380  avg_bytes/word=1/p=7.24
  max_iterations=64, n_trials=1000000
    fallback rate       :   0.007500%  (75/1_000_000)
    mean seeks per word :        7.25

hostname-wordlist/nouns
  words=449  p_accept=0.1472  avg_bytes/word=1/p=6.79
  max_iterations=64, n_trials=1000000
    fallback rate       :   0.002700%  (27/1_000_000)
    mean seeks per word :        6.79
```
Combined with the fallback to the previous method if
we can't find anything within the 64 attempts, this seems
to be the best tradeoff and gives us very good uniformity.
---
 hostname-wordlist/README    |  25 +++----
 src/shared/hostname-setup.c | 129 ++++++++++++++++++++++++------------
 2 files changed, 95 insertions(+), 59 deletions(-)

diff --git a/hostname-wordlist/README b/hostname-wordlist/README
index 1cda8ab7cc2..0ef3a3fc5fd 100644
--- a/hostname-wordlist/README
+++ b/hostname-wordlist/README
@@ -24,13 +24,11 @@ the symlinks) while the actual word lists keep meaningful names.
 Files
 -----
 
-Each file is a plain list of words, one per line, with no comment or blank
-lines: a word is picked by hashing the machine ID to a byte offset into the
-file, so comment/blank lines (although skipped) would bias the selection and
-should be avoided. Each word must be a valid single hostname label (lowercase
-letters, digits, hyphens); invalid entries are skipped. The file is used as-is
-from the highest-priority directory that provides it (/etc wins over /run wins
-over /usr/lib); files are not merged across directories.
+Each file is a plain list of words, one per line. Blank lines and lines starting
+with "#" are treated as comments and skipped. Each word must be a valid single
+hostname label (lowercase letters, digits, hyphens); invalid entries are skipped.
+The file is used as-is from the highest-priority directory that provides it (/etc
+-> /run -> /usr/local/lib -> /usr/lib); files are not merged across directories.
 
 Search path (highest priority first):
 
@@ -49,15 +47,10 @@ list. Changing a word list may change the name a machine gets. If a referenced
 list is missing the name is treated as invalid and the built-in fallback
 hostname is used.
 
-Because a word is chosen by byte offset into the file (rather than loading and
-indexing the whole list), the words are not all equally likely: a word's chance
-tracks the length of the word that precedes it in the list (not its own length),
-so a word listed right after a long word is slightly more likely to be picked.
-The effect is small: about a 12% non-uniformity, i.e. the effective name space
-is ~88% of the nominal product for $-$-$. This is an accepted trade for not
-reading the whole list into memory. If exact uniformity is ever needed, pad
-every word to a fixed width (e.g. with trailing '#') and have the loader strip
-the padding.
+Words are picked uniformly without reading the whole list into memory: an offset
+is chosen by hashing and accepted only when it lands on the start of a line
+(otherwise another offset is tried), so a word's chance does not depend on its
+own length or that of its neighbours.
 
 Origin
 ------
diff --git a/src/shared/hostname-setup.c b/src/shared/hostname-setup.c
index aa04a1ceab3..22225e20309 100644
--- a/src/shared/hostname-setup.c
+++ b/src/shared/hostname-setup.c
@@ -321,12 +321,58 @@ static int hostname_open_wordlist(const char *file, FILE **ret) {
         return 0;
 }
 
+static bool normalize_and_validate_word(char *word) {
+        assert(word);
+
+        if (IN_SET(word[0], '\0', '#')) /* empty line or comment */
+                return false;
+
+        ascii_strlower(word);
+        return hostname_is_valid(word, /* flags= */ 0);
+}
+
+static int pick_word_linear_scan(FILE *f, off_t offset, char **ret) {
+        int r;
+
+        assert(f);
+        assert(ret);
+
+        if (fseeko(f, offset, SEEK_SET) < 0)
+                return -errno;
+
+        bool wrapped = false;
+        r = read_line(f, LONG_LINE_MAX, NULL); /* discard the partial line we landed in */
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                wrapped = true;
+                rewind(f);
+        }
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+
+                r = read_stripped_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return r;
+                if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */
+                        if (wrapped) /* already wrapped once, the file contains no usable word at all */
+                                return -ENOENT;
+                        wrapped = true;
+                        rewind(f);
+                        continue;
+                }
+                if (normalize_and_validate_word(line)) {
+                        *ret = TAKE_PTR(line);
+                        return 0;
+                }
+        }
+}
+
 static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) {
         static const sd_id128_t word_key = SD_ID128_MAKE(2d,9f,1c,7a,4b,8e,43,11,9a,6d,5f,02,c8,77,e3,14);
         _cleanup_fclose_ FILE *f = NULL;
         struct stat st;
-        bool wrapped = false;
-        uint64_t h;
         int r;
 
         assert(pos >= 1);
@@ -348,55 +394,52 @@ static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) {
         if (st.st_size == 0)
                 return -ENOENT;
 
-        /* Pick a word without reading the whole list into memory: hash the machine ID and word position to a
-         * byte offset. This stream is independent of the '?' nibble stream, so pure-'?' templates keep
-         * producing byte-identical output. Stable as long as the wordlist is stable. */
-        struct siphash state;
-        siphash24_init(&state, word_key.bytes);
-        siphash24_compress_typesafe(mid, &state);
-        siphash24_compress_typesafe(pos, &state);
-        h = siphash24_finalize(&state);
-
-        if (fseeko(f, (off_t) (h % (uint64_t) st.st_size), SEEK_SET) < 0)
-                return -errno;
-
-        /* We mostly landed mid-line, so read/discard the current line here. If the file was shrunk by a
-         * concurrent modification we might have seeked at/past EOF, so wrap around to the beginning. */
-        r = read_line(f, LONG_LINE_MAX, NULL);
-        if (r < 0)
-                return r;
-        if (r == 0) {
-                wrapped = true;
-                rewind(f);
-        }
-
-        for (;;) {
+        /* Pick a word without reading the whole list into memory:
+         * 1. pick a random offset in the file [0 … st.st_size-1]
+         * 2. if offset is zero, read a full line from the beginning of the file, use that.
+         * 3. otherwise, seek to offset minus 1 and read one character.
+         * 4. if that character is newline, then read a full line after it, and use that as result
+         * 5. otherwise, goto 1
+         *
+         * As a safety net, terminate after a fixed number of iterations (for pathological wordlists).
+         * This stream is independent of the '?' nibble stream, so pure-'?' templates keep producing
+         * byte-identical output. Stable as long as the wordlist is stable. */
+        off_t offset = 0;
+        const unsigned int MAX_ITERATIONS = 64;
+        for (unsigned i = 0; i < MAX_ITERATIONS; i++) {
                 _cleanup_free_ char *line = NULL;
 
+                struct siphash state;
+                siphash24_init(&state, word_key.bytes);
+                siphash24_compress_typesafe(mid, &state);
+                siphash24_compress_typesafe(pos, &state);
+                siphash24_compress_typesafe(i, &state); /* counter mode */
+                offset = (off_t) (siphash24_finalize(&state) % (uint64_t) st.st_size);
+
+                if (offset > 0) {
+                        if (fseeko(f, offset - 1, SEEK_SET) < 0)
+                                return -errno;
+                        if (fgetc(f) != '\n')
+                                continue; /* not a line start */
+                } else if (fseeko(f, 0, SEEK_SET) < 0) /* offset 0 always begins the first line */
+                        return -errno;
+
                 r = read_stripped_line(f, LONG_LINE_MAX, &line);
                 if (r < 0)
                         return r;
-                if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */
-                        if (wrapped) /* already wrapped once, the file contains no usable word at all */
-                                return -ENOENT;
-                        wrapped = true;
-                        rewind(f);
+                if (r == 0) /* raced with truncation */
                         continue;
+                if (normalize_and_validate_word(line)) {
+                        *ret = TAKE_PTR(line);
+                        return 0;
                 }
-
-                /* Skip empty lines and comments */
-                if (IN_SET(line[0], '\0', '#'))
-                        continue;
-
-                /* Each word must be a valid single hostname label on its own; lowercase it and silently skip
-                 * bogus entries. */
-                ascii_strlower(line);
-                if (!hostname_is_valid(line, /* flags= */ 0))
-                        continue;
-
-                *ret = TAKE_PTR(line);
-                return 0;
+                /* Comment/empty/invalid line: resample rather than advancing, to keep the pick uniform. */
         }
+
+        /* We exhausted the uniform attempts. This should never happen, but if it does, fall back to picking
+         * the next word after our last attempt. */
+        log_warning("hostname_pick_word did not find a usable word after %u in wordlist %zu", MAX_ITERATIONS, pos);
+        return pick_word_linear_scan(f, offset, ret);
 }
 
 int hostname_substitute_wildcards(const char *name, char **ret) {