From 1ada9064e65410fabadd57b281dbc8a42e0e0a6f Mon Sep 17 00:00:00 2001 From: Michael Vogt Date: Wed, 17 Jun 2026 09:51:53 +0200 Subject: [PATCH] hostname: improve the algorithm in hostname_pick_word() Lennart suggested to use a more uniform algorithm for the picking of the hostname words that is not biased toward long words by just (predictably) randomly going over the offsets until we land on a word boundary. This is a very nice suggestion so this commit implements it with a fallback to the "old" behavior if we do not find a word boundary within a reasonable number of attempts. A small python script shows that 64 iterations plus fallback is a good number: ``` $ python3 simulate-hostname-pick.py 64 hostname-wordlist/adverbs words=261 p_accept=0.1119 avg_bytes/word=1/p=8.94 max_iterations=64, n_trials=1000000 fallback rate : 0.051000% (510/1_000_000) mean seeks per word : 8.93 hostname-wordlist/adjectives words=449 p_accept=0.1380 avg_bytes/word=1/p=7.24 max_iterations=64, n_trials=1000000 fallback rate : 0.007500% (75/1_000_000) mean seeks per word : 7.25 hostname-wordlist/nouns words=449 p_accept=0.1472 avg_bytes/word=1/p=6.79 max_iterations=64, n_trials=1000000 fallback rate : 0.002700% (27/1_000_000) mean seeks per word : 6.79 ``` Combined with the fallback to the previous method if we can't find anything within the 64 attempts, this seems to be the best tradeoff and gives us very good uniformity. --- hostname-wordlist/README | 25 +++---- src/shared/hostname-setup.c | 129 ++++++++++++++++++++++++------------ 2 files changed, 95 insertions(+), 59 deletions(-) diff --git a/hostname-wordlist/README b/hostname-wordlist/README index 1cda8ab7cc2..0ef3a3fc5fd 100644 --- a/hostname-wordlist/README +++ b/hostname-wordlist/README @@ -24,13 +24,11 @@ the symlinks) while the actual word lists keep meaningful names. 
Files ----- -Each file is a plain list of words, one per line, with no comment or blank -lines: a word is picked by hashing the machine ID to a byte offset into the -file, so comment/blank lines (although skipped) would bias the selection and -should be avoided. Each word must be a valid single hostname label (lowercase -letters, digits, hyphens); invalid entries are skipped. The file is used as-is -from the highest-priority directory that provides it (/etc wins over /run wins -over /usr/lib); files are not merged across directories. +Each file is a plain list of words, one per line. Blank lines and lines starting +with "#" are treated as comments and skipped. Each word must be a valid single +hostname label (lowercase letters, digits, hyphens); invalid entries are skipped. +The file is used as-is from the highest-priority directory that provides it (/etc +-> /run -> /usr/local/lib -> /usr/lib); files are not merged across directories. Search path (highest priority first): @@ -49,15 +47,10 @@ list. Changing a word list may change the name a machine gets. If a referenced list is missing the name is treated as invalid and the built-in fallback hostname is used. -Because a word is chosen by byte offset into the file (rather than loading and -indexing the whole list), the words are not all equally likely: a word's chance -tracks the length of the word that precedes it in the list (not its own length), -so a word listed right after a long word is slightly more likely to be picked. -The effect is small: about a 12% non-uniformity, i.e. the effective name space -is ~88% of the nominal product for $-$-$. This is an accepted trade for not -reading the whole list into memory. If exact uniformity is ever needed, pad -every word to a fixed width (e.g. with trailing '#') and have the loader strip -the padding. 
+Words are picked uniformly without reading the whole list into memory: an offset +is chosen by hashing and accepted only when it lands on the start of a line +(otherwise another offset is tried), so a word's chance does not depend on its +own length or that of its neighbours. Origin ------ diff --git a/src/shared/hostname-setup.c b/src/shared/hostname-setup.c index aa04a1ceab3..22225e20309 100644 --- a/src/shared/hostname-setup.c +++ b/src/shared/hostname-setup.c @@ -321,12 +321,58 @@ static int hostname_open_wordlist(const char *file, FILE **ret) { return 0; } +static bool normalize_and_validate_word(char *word) { + assert(word); + + if (IN_SET(word[0], '\0', '#')) /* empty line or comment */ + return false; + + ascii_strlower(word); + return hostname_is_valid(word, /* flags= */ 0); +} + +static int pick_word_linear_scan(FILE *f, off_t offset, char **ret) { + int r; + + assert(f); + assert(ret); + + if (fseeko(f, offset, SEEK_SET) < 0) + return -errno; + + bool wrapped = false; + r = read_line(f, LONG_LINE_MAX, NULL); /* discard the partial line we landed in */ + if (r < 0) + return r; + if (r == 0) { + wrapped = true; + rewind(f); + } + + for (;;) { + _cleanup_free_ char *line = NULL; + + r = read_stripped_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */ + if (wrapped) /* already wrapped once, the file contains no usable word at all */ + return -ENOENT; + wrapped = true; + rewind(f); + continue; + } + if (normalize_and_validate_word(line)) { + *ret = TAKE_PTR(line); + return 0; + } + } +} + static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) { static const sd_id128_t word_key = SD_ID128_MAKE(2d,9f,1c,7a,4b,8e,43,11,9a,6d,5f,02,c8,77,e3,14); _cleanup_fclose_ FILE *f = NULL; struct stat st; - bool wrapped = false; - uint64_t h; int r; assert(pos >= 1); @@ -348,55 +394,52 @@ static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) { if 
(st.st_size == 0) return -ENOENT; - /* Pick a word without reading the whole list into memory: hash the machine ID and word position to a - * byte offset. This stream is independent of the '?' nibble stream, so pure-'?' templates keep - * producing byte-identical output. Stable as long as the wordlist is stable. */ - struct siphash state; - siphash24_init(&state, word_key.bytes); - siphash24_compress_typesafe(mid, &state); - siphash24_compress_typesafe(pos, &state); - h = siphash24_finalize(&state); - - if (fseeko(f, (off_t) (h % (uint64_t) st.st_size), SEEK_SET) < 0) - return -errno; - - /* We mostly landed mid-line, so read/discard the current line here. If the file was shrunk by a - * concurrent modification we might have seeked at/past EOF, so wrap around to the beginning. */ - r = read_line(f, LONG_LINE_MAX, NULL); - if (r < 0) - return r; - if (r == 0) { - wrapped = true; - rewind(f); - } - - for (;;) { + /* Pick a word without reading the whole list into memory: + * 1. pick a random offset in the file [0 … st.st_size-1] + * 2. if offset is zero, read a full line from the beginning of the file, use that. + * 3. otherwise, seek to offset minus 1 and read one character. + * 4. if that character is newline, then read a full line after it, and use that as result + * 5. otherwise, goto 1 + * + * As a safety net terminate after a fixed number of iterations (for pathological wordlists). + * This stream is independent of the '?' nibble stream so pure-'?' templates keep producing + * byte-identical output. Stable as long as the wordlist is stable. 
*/ + off_t offset = 0; + const unsigned int MAX_ITERATIONS = 64; + for (unsigned i = 0; i < MAX_ITERATIONS; i++) { _cleanup_free_ char *line = NULL; + struct siphash state; + siphash24_init(&state, word_key.bytes); + siphash24_compress_typesafe(mid, &state); + siphash24_compress_typesafe(pos, &state); + siphash24_compress_typesafe(i, &state); /* counter mode */ + offset = (off_t) (siphash24_finalize(&state) % (uint64_t) st.st_size); + + if (offset > 0) { + if (fseeko(f, offset - 1, SEEK_SET) < 0) + return -errno; + if (fgetc(f) != '\n') + continue; /* not a line start */ + } else if (fseeko(f, 0, SEEK_SET) < 0) /* offset 0 always begins the first line */ + return -errno; + r = read_stripped_line(f, LONG_LINE_MAX, &line); if (r < 0) return r; - if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */ - if (wrapped) /* already wrapped once, the file contains no usable word at all */ - return -ENOENT; - wrapped = true; - rewind(f); + if (r == 0) /* raced with truncation */ continue; + if (normalize_and_validate_word(line)) { + *ret = TAKE_PTR(line); + return 0; } - - /* Skip empty lines and comments */ - if (IN_SET(line[0], '\0', '#')) - continue; - - /* Each word must be a valid single hostname label on its own; lowercase it and silently skip - * bogus entries. */ - ascii_strlower(line); - if (!hostname_is_valid(line, /* flags= */ 0)) - continue; - - *ret = TAKE_PTR(line); - return 0; + /* Comment/empty/invalid line: resample rather than advancing, to keep the pick uniform. */ } + + /* We exhausted the uniform attempts, this should never happen but if it does fallback to picking the + * next word after our last attempt. */ + log_warning("hostname_pick_word did not find a usable word after %u in wordlist %zu", MAX_ITERATIONS, pos); + return pick_word_linear_scan(f, offset, ret); } int hostname_substitute_wildcards(const char *name, char **ret) {