hostname: improve the algorithm in hostname_pick_word()

Lennart suggested using a more uniform algorithm for picking the hostname words, one that is not biased by word length: just (predictably) randomly go over offsets until we land on a word boundary. This is a very nice suggestion, so this commit implements it, with a fallback to the "old" behavior if we do not find a word boundary within a reasonable number of attempts.

A small Python script shows that 64 iterations plus fallback is a good number:

```
$ python3 simulate-hostname-pick.py 64
hostname-wordlist/adverbs
words=261 p_accept=0.1119 avg_bytes/word=1/p=8.94
max_iterations=64, n_trials=1000000
fallback rate : 0.051000% (510/1_000_000)
mean seeks per word : 8.93
hostname-wordlist/adjectives
words=449 p_accept=0.1380 avg_bytes/word=1/p=7.24
max_iterations=64, n_trials=1000000
fallback rate : 0.007500% (75/1_000_000)
mean seeks per word : 7.25
hostname-wordlist/nouns
words=449 p_accept=0.1472 avg_bytes/word=1/p=6.79
max_iterations=64, n_trials=1000000
fallback rate : 0.002700% (27/1_000_000)
mean seeks per word : 6.79
```

Combined with the fallback to the previous method if we can't find anything within the 64 attempts, this seems to be the best tradeoff and gives us very good uniformity.
2026-06-24 08:47:49 +00:00 · 2026-06-17 09:51:53 +02:00
parent c3c735bd09
commit 1ada9064e6
2 changed files with 95 additions and 59 deletions
--- a/hostname-wordlist/README
+++ b/hostname-wordlist/README
@@ -24,13 +24,11 @@ the symlinks) while the actual word lists keep meaningful names.
 Files
 -----

-Each file is a plain list of words, one per line, with no comment or blank
-lines: a word is picked by hashing the machine ID to a byte offset into the
-file, so comment/blank lines (although skipped) would bias the selection and
-should be avoided. Each word must be a valid single hostname label (lowercase
-letters, digits, hyphens); invalid entries are skipped. The file is used as-is
-from the highest-priority directory that provides it (/etc wins over /run wins
-over /usr/lib); files are not merged across directories.
+Each file is a plain list of words, one per line. Blank lines and lines starting
+with "#" are treated as comments and skipped. Each word must be a valid single
+hostname label (lowercase letters, digits, hyphens); invalid entries are skipped.
+The file is used as-is from the highest-priority directory that provides it (/etc
+-> /run -> /usr/local/lib -> /usr/lib); files are not merged across directories.

 Search path (highest priority first):

@@ -49,15 +47,10 @@ list. Changing a word list may change the name a machine gets. If a referenced
 list is missing the name is treated as invalid and the built-in fallback
 hostname is used.

-Because a word is chosen by byte offset into the file (rather than loading and
-indexing the whole list), the words are not all equally likely: a word's chance
-tracks the length of the word that precedes it in the list (not its own length),
-so a word listed right after a long word is slightly more likely to be picked.
-The effect is small: about a 12% non-uniformity, i.e. the effective name space
-is ~88% of the nominal product for $-$-$. This is an accepted trade for not
-reading the whole list into memory. If exact uniformity is ever needed, pad
-every word to a fixed width (e.g. with trailing '#') and have the loader strip
-the padding.
+Words are picked uniformly without reading the whole list into memory: an offset
+is chosen by hashing and accepted only when it lands on the start of a valid
+line (otherwise another offset is tried, up to a fixed number of attempts), so a
+word's chance does not depend on its own length or that of its neighbours.

 Origin
 ------
--- a/src/shared/hostname-setup.c
+++ b/src/shared/hostname-setup.c
@@ -321,12 +321,58 @@ static int hostname_open_wordlist(const char *file, FILE **ret) {
        return 0;
 }

+static bool normalize_and_validate_word(char *word) {
+        assert(word);
+
+        if (IN_SET(word[0], '\0', '#')) /* empty line or comment */
+                return false;
+
+        ascii_strlower(word);
+        return hostname_is_valid(word, /* flags= */ 0);
+}
+
+static int pick_word_linear_scan(FILE *f, off_t offset, char **ret) {
+        int r;
+
+        assert(f);
+        assert(ret);
+
+        if (fseeko(f, offset, SEEK_SET) < 0)
+                return -errno;
+
+        bool wrapped = false;
+        r = read_line(f, LONG_LINE_MAX, NULL); /* discard the partial line we landed in */
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                wrapped = true;
+                rewind(f);
+        }
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+
+                r = read_stripped_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return r;
+                if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */
+                        if (wrapped) /* already wrapped once, the file contains no usable word at all */
+                                return -ENOENT;
+                        wrapped = true;
+                        rewind(f);
+                        continue;
+                }
+                if (normalize_and_validate_word(line)) {
+                        *ret = TAKE_PTR(line);
+                        return 0;
+                }
+        }
+}
+
 static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) {
        static const sd_id128_t word_key = SD_ID128_MAKE(2d,9f,1c,7a,4b,8e,43,11,9a,6d,5f,02,c8,77,e3,14);
        _cleanup_fclose_ FILE *f = NULL;
        struct stat st;
-        bool wrapped = false;
-        uint64_t h;
        int r;

        assert(pos >= 1);
@@ -348,55 +394,52 @@ static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) {
        if (st.st_size == 0)
                return -ENOENT;

-        /* Pick a word without reading the whole list into memory: hash the machine ID and word position to a
-         * byte offset. This stream is independent of the '?' nibble stream, so pure-'?' templates keep
-         * producing byte-identical output. Stable as long as the wordlist is stable. */
-        struct siphash state;
-        siphash24_init(&state, word_key.bytes);
-        siphash24_compress_typesafe(mid, &state);
-        siphash24_compress_typesafe(pos, &state);
-        h = siphash24_finalize(&state);
-
-        if (fseeko(f, (off_t) (h % (uint64_t) st.st_size), SEEK_SET) < 0)
-                return -errno;
-
-        /* We mostly landed mid-line, so read/discard the current line here. If the file was shrunk by a
-         * concurrent modification we might have seeked at/past EOF, so wrap around to the beginning. */
-        r = read_line(f, LONG_LINE_MAX, NULL);
-        if (r < 0)
-                return r;
-        if (r == 0) {
-                wrapped = true;
-                rewind(f);
-        }
-
-        for (;;) {
+        /* Pick a word without reading the whole list into memory:
+         * 1. pick a random offset in the file [0 … st.st_size-1]
+         * 2. if offset is zero, read a full line from the beginning of the file, use that.
+         * 3. otherwise, seek to offset minus 1 and read one character.
+         * 4. if that character is newline, then read a full line after it, and use that as result
+         * 5. otherwise, goto 1
+         *
+         * As a safety net, terminate after a fixed number of iterations (for pathological wordlists).
+         * This stream is independent of the '?' nibble stream, so pure-'?' templates keep producing
+         * byte-identical output. Stable as long as the wordlist is stable. */
+        off_t offset = 0;
+        const unsigned int MAX_ITERATIONS = 64;
+        for (unsigned i = 0; i < MAX_ITERATIONS; i++) {
                _cleanup_free_ char *line = NULL;

+                struct siphash state;
+                siphash24_init(&state, word_key.bytes);
+                siphash24_compress_typesafe(mid, &state);
+                siphash24_compress_typesafe(pos, &state);
+                siphash24_compress_typesafe(i, &state); /* counter mode */
+                offset = (off_t) (siphash24_finalize(&state) % (uint64_t) st.st_size);
+
+                if (offset > 0) {
+                        if (fseeko(f, offset - 1, SEEK_SET) < 0)
+                                return -errno;
+                        if (fgetc(f) != '\n')
+                                continue; /* not a line start */
+                } else if (fseeko(f, 0, SEEK_SET) < 0) /* offset 0 always begins the first line */
+                        return -errno;
+
                r = read_stripped_line(f, LONG_LINE_MAX, &line);
                if (r < 0)
                        return r;
-                if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */
-                        if (wrapped) /* already wrapped once, the file contains no usable word at all */
-                                return -ENOENT;
-                        wrapped = true;
-                        rewind(f);
+                if (r == 0) /* raced with truncation */
                        continue;
+                if (normalize_and_validate_word(line)) {
+                        *ret = TAKE_PTR(line);
+                        return 0;
                }
-
-                /* Skip empty lines and comments */
-                if (IN_SET(line[0], '\0', '#'))
-                        continue;
-
-                /* Each word must be a valid single hostname label on its own; lowercase it and silently skip
-                 * bogus entries. */
-                ascii_strlower(line);
-                if (!hostname_is_valid(line, /* flags= */ 0))
-                        continue;
-
-                *ret = TAKE_PTR(line);
-                return 0;
+                /* Comment/empty/invalid line: resample rather than advancing, to keep the pick uniform. */
        }
+
+        /* We exhausted the uniform attempts. This should be very rare, but if it happens, fall back to
+         * picking the next word after our last attempt. */
+        log_warning("hostname_pick_word did not find a usable word after %u in wordlist %zu", MAX_ITERATIONS, pos);
+        return pick_word_linear_scan(f, offset, ret);
 }

 int hostname_substitute_wildcards(const char *name, char **ret) {