hostname: improve the algorithm in hostname_pick_word()

Lennart suggested using a more uniform algorithm for
picking the hostname words, one that is not biased by
word length: (predictably) randomly probe offsets until
one lands on a word boundary. This is a very nice
suggestion, so this commit implements it, with a
fallback to the "old" behavior if we do not find a word
boundary within a reasonable number of attempts.

A small python script shows that 64 iterations plus
fallback is a good number:
```
$ python3 simulate-hostname-pick.py 64
hostname-wordlist/adverbs
  words=261  p_accept=0.1119  avg_bytes/word=1/p=8.94
  max_iterations=64, n_trials=1000000
    fallback rate       :   0.051000%  (510/1_000_000)
    mean seeks per word :        8.93

hostname-wordlist/adjectives
  words=449  p_accept=0.1380  avg_bytes/word=1/p=7.24
  max_iterations=64, n_trials=1000000
    fallback rate       :   0.007500%  (75/1_000_000)
    mean seeks per word :        7.25

hostname-wordlist/nouns
  words=449  p_accept=0.1472  avg_bytes/word=1/p=6.79
  max_iterations=64, n_trials=1000000
    fallback rate       :   0.002700%  (27/1_000_000)
    mean seeks per word :        6.79
```
Combined with the fallback to the previous method if
we can't find anything within the 64 attempts, this seems
to be the best tradeoff and gives us very good uniformity.
This commit is contained in:
Michael Vogt
2026-06-17 09:51:53 +02:00
parent c3c735bd09
commit 1ada9064e6
2 changed files with 95 additions and 59 deletions

View File

@@ -24,13 +24,11 @@ the symlinks) while the actual word lists keep meaningful names.
Files
-----
Each file is a plain list of words, one per line, with no comment or blank
lines: a word is picked by hashing the machine ID to a byte offset into the
file, so comment/blank lines (although skipped) would bias the selection and
should be avoided. Each word must be a valid single hostname label (lowercase
letters, digits, hyphens); invalid entries are skipped. The file is used as-is
from the highest-priority directory that provides it (/etc wins over /run wins
over /usr/lib); files are not merged across directories.
Each file is a plain list of words, one per line. Blank lines and lines starting
with "#" are treated as comments and skipped. Each word must be a valid single
hostname label (lowercase letters, digits, hyphens); invalid entries are skipped.
The file is used as-is from the highest-priority directory that provides it (/etc
-> /run -> /usr/local/lib -> /usr/lib); files are not merged across directories.
Search path (highest priority first):
@@ -49,15 +47,10 @@ list. Changing a word list may change the name a machine gets. If a referenced
list is missing the name is treated as invalid and the built-in fallback
hostname is used.
Because a word is chosen by byte offset into the file (rather than loading and
indexing the whole list), the words are not all equally likely: a word's chance
tracks the length of the word that precedes it in the list (not its own length),
so a word listed right after a long word is slightly more likely to be picked.
The effect is small: about a 12% non-uniformity, i.e. the effective name space
is ~88% of the nominal product for $-$-$. This is an accepted trade for not
reading the whole list into memory. If exact uniformity is ever needed, pad
every word to a fixed width (e.g. with trailing '#') and have the loader strip
the padding.
Words are picked uniformly without reading the whole list into memory: an offset
is chosen by hashing and accepted only when it lands on the start of a line
(otherwise another offset is tried), so a word's chance does not depend on its
own length or that of its neighbours.
Origin
------

View File

@@ -321,12 +321,58 @@ static int hostname_open_wordlist(const char *file, FILE **ret) {
return 0;
}
/* Decides whether one line of a word list is usable, lowercasing it in place on the way.
 *
 * Blank lines and lines whose first character is '#' are rejected right away and left untouched.
 * Anything else is passed through ascii_strlower() and then judged by hostname_is_valid() with no
 * flags set; that verdict is the return value. */
static bool normalize_and_validate_word(char *word) {
        assert(word);

        switch (word[0]) {
        case '\0': /* blank line */
        case '#':  /* comment line */
                return false;
        default:
                break;
        }

        ascii_strlower(word);

        return hostname_is_valid(word, /* flags= */ 0);
}
static int pick_word_linear_scan(FILE *f, off_t offset, char **ret) {
int r;
assert(f);
assert(ret);
if (fseeko(f, offset, SEEK_SET) < 0)
return -errno;
bool wrapped = false;
r = read_line(f, LONG_LINE_MAX, NULL); /* discard the partial line we landed in */
if (r < 0)
return r;
if (r == 0) {
wrapped = true;
rewind(f);
}
for (;;) {
_cleanup_free_ char *line = NULL;
r = read_stripped_line(f, LONG_LINE_MAX, &line);
if (r < 0)
return r;
if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */
if (wrapped) /* already wrapped once, the file contains no usable word at all */
return -ENOENT;
wrapped = true;
rewind(f);
continue;
}
if (normalize_and_validate_word(line)) {
*ret = TAKE_PTR(line);
return 0;
}
}
}
static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) {
static const sd_id128_t word_key = SD_ID128_MAKE(2d,9f,1c,7a,4b,8e,43,11,9a,6d,5f,02,c8,77,e3,14);
_cleanup_fclose_ FILE *f = NULL;
struct stat st;
bool wrapped = false;
uint64_t h;
int r;
assert(pos >= 1);
@@ -348,55 +394,52 @@ static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) {
if (st.st_size == 0)
return -ENOENT;
/* Pick a word without reading the whole list into memory: hash the machine ID and word position to a
* byte offset. This stream is independent of the '?' nibble stream, so pure-'?' templates keep
* producing byte-identical output. Stable as long as the wordlist is stable. */
struct siphash state;
siphash24_init(&state, word_key.bytes);
siphash24_compress_typesafe(mid, &state);
siphash24_compress_typesafe(pos, &state);
h = siphash24_finalize(&state);
if (fseeko(f, (off_t) (h % (uint64_t) st.st_size), SEEK_SET) < 0)
return -errno;
/* We mostly landed mid-line, so read/discard the current line here. If the file was shrunk by a
* concurrent modification we might have seeked at/past EOF, so wrap around to the beginning. */
r = read_line(f, LONG_LINE_MAX, NULL);
if (r < 0)
return r;
if (r == 0) {
wrapped = true;
rewind(f);
}
for (;;) {
/* Pick a word without reading the whole list into memory:
* 1. pick a random offset in the file [0 … st.st_size-1]
* 2. if offset is zero, read a full line from the beginning of the file, use that.
* 3. otherwise, seek to offset minus 1 and read one character.
* 4. if that character is newline, then read a full line after it, and use that as result
* 5. otherwise, goto 1
*
* As a safety net, terminate after a fixed number of iterations (for pathological wordlists).
* This stream is independent of the '?' nibble stream, so pure-'?' templates keep producing
* byte-identical output. Stable as long as the wordlist is stable. */
off_t offset = 0;
const unsigned int MAX_ITERATIONS = 64;
for (unsigned i = 0; i < MAX_ITERATIONS; i++) {
_cleanup_free_ char *line = NULL;
struct siphash state;
siphash24_init(&state, word_key.bytes);
siphash24_compress_typesafe(mid, &state);
siphash24_compress_typesafe(pos, &state);
siphash24_compress_typesafe(i, &state); /* counter mode */
offset = (off_t) (siphash24_finalize(&state) % (uint64_t) st.st_size);
if (offset > 0) {
if (fseeko(f, offset - 1, SEEK_SET) < 0)
return -errno;
if (fgetc(f) != '\n')
continue; /* not a line start */
} else if (fseeko(f, 0, SEEK_SET) < 0) /* offset 0 always begins the first line */
return -errno;
r = read_stripped_line(f, LONG_LINE_MAX, &line);
if (r < 0)
return r;
if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */
if (wrapped) /* already wrapped once, the file contains no usable word at all */
return -ENOENT;
wrapped = true;
rewind(f);
if (r == 0) /* raced with truncation */
continue;
if (normalize_and_validate_word(line)) {
*ret = TAKE_PTR(line);
return 0;
}
/* Skip empty lines and comments */
if (IN_SET(line[0], '\0', '#'))
continue;
/* Each word must be a valid single hostname label on its own; lowercase it and silently skip
* bogus entries. */
ascii_strlower(line);
if (!hostname_is_valid(line, /* flags= */ 0))
continue;
*ret = TAKE_PTR(line);
return 0;
/* Comment/empty/invalid line: resample rather than advancing, to keep the pick uniform. */
}
/* We exhausted the uniform attempts, this should never happen but if it does fallback to picking the
* next word after our last attempt. */
log_warning("hostname_pick_word did not find a usable word after %u in wordlist %zu", MAX_ITERATIONS, pos);
return pick_word_linear_scan(f, offset, ret);
}
int hostname_substitute_wildcards(const char *name, char **ret) {