From be3fcf33e81e5991ed239f9116f6482d571e50fd Mon Sep 17 00:00:00 2001 From: Esteban Ginez Date: Wed, 10 Jun 2026 14:48:48 -0700 Subject: [PATCH] fix(shim/windows): retry on winio.ErrTimeout in awaitPipeReady awaitPipeReady retried only when DialPipe returned os.IsNotExist or context.DeadlineExceeded, but winio.DialPipe converts the per-attempt deadline into winio.ErrTimeout before returning. A pipe in state 1 (ListenPipe called, Accept not yet called) causes DialPipe to block for the full per-attempt timeout and return winio.ErrTimeout, which the old check treated as a fatal error instead of retrying. Also guard windows.ERROR_PIPE_BUSY explicitly to match the error checks in containerd/nerdbox#218. Adds a regression test that forces the state-1 to state-2 transition race by delaying Accept past the 1-second per-attempt timeout. Signed-off-by: Esteban Ginez --- pkg/shim/shim_windows.go | 14 +++++-- pkg/shim/shim_windows_test.go | 71 +++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 pkg/shim/shim_windows_test.go diff --git a/pkg/shim/shim_windows.go b/pkg/shim/shim_windows.go index b0b03c5f07..c319666c1c 100644 --- a/pkg/shim/shim_windows.go +++ b/pkg/shim/shim_windows.go @@ -18,6 +18,7 @@ package shim import ( "context" + "errors" "fmt" "io" "net" @@ -28,6 +29,7 @@ import ( "github.com/containerd/errdefs" "github.com/containerd/log" "github.com/containerd/ttrpc" + "golang.org/x/sys/windows" ) func setupSignals(config Config) (chan os.Signal, error) { @@ -85,9 +87,15 @@ func awaitPipeReady(address string) error { return nil } lastErr = err - // Retry on both "pipe not found" and "pipe busy / deadline exceeded" - // — the pipe may still be starting up or temporarily at capacity. - if !os.IsNotExist(err) && err != context.DeadlineExceeded { + // Retry on pipe-not-found, i/o timeout, and pipe-busy. + // winio.DialPipe returns winio.ErrTimeout when the per-attempt timeout fires. + // ERROR_PIPE_BUSY is normally absorbed by go-winio's tryDialPipe loop + // and surfaces as winio.ErrTimeout once the deadline fires; guard it + // explicitly in case a future go-winio version surfaces it unwrapped. + retryable := os.IsNotExist(err) || + errors.Is(err, winio.ErrTimeout) || + errors.Is(err, windows.ERROR_PIPE_BUSY) + if !retryable { return err } select { diff --git a/pkg/shim/shim_windows_test.go b/pkg/shim/shim_windows_test.go new file mode 100644 index 0000000000..588dcc8ba0 --- /dev/null +++ b/pkg/shim/shim_windows_test.go @@ -0,0 +1,71 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package shim + +import ( + "strings" + "testing" + "time" + + winio "github.com/Microsoft/go-winio" +) + +// TestAwaitPipeReady_State1_RetriesOnErrTimeout proves that awaitPipeReady +// retries when winio.DialPipe returns winio.ErrTimeout (pipe in state 1: +// ListenPipe called, Accept not yet called). +// +// With the pre-fix code this test fails: awaitPipeReady returns winio.ErrTimeout +// immediately after the first dial attempt instead of retrying. +// +// After the fix (errors.Is(err, winio.ErrTimeout) treated as retryable) the +// test passes: the function retries past the timeout, Accept fires at 1.5 s, +// and the next dial attempt connects. +func TestAwaitPipeReady_State1_RetriesOnErrTimeout(t *testing.T) { + pipePath := `\\.\pipe\` + strings.ReplaceAll(t.Name(), "/", "_") + + // Create the pipe listener (state 1): ListenPipe called, Accept not yet + // called. winio.DialPipe against a state-1 pipe blocks for the per-attempt + // timeout then returns winio.ErrTimeout — not context.DeadlineExceeded. + listener, err := winio.ListenPipe(pipePath, nil) + if err != nil { + t.Fatalf("ListenPipe: %v", err) + } + defer listener.Close() + + // Transition to state 2 after 1.5 s — intentionally longer than the 1-second + // per-attempt DialPipe timeout so the first attempt always returns ErrTimeout. + // awaitPipeReady's 5-second overall timer still leaves enough room for a + // successful retry once Accept fires. + go func() { + time.Sleep(1500 * time.Millisecond) + conn, err := listener.Accept() + if err == nil { + conn.Close() + } + }() + + start := time.Now() + if err := awaitPipeReady(pipePath); err != nil { + t.Fatalf("awaitPipeReady(%q) = %v; want nil", pipePath, err) + } + // Assert at least one per-attempt timeout (1s) elapsed, proving the + // retry path was exercised and the test is not a false positive from + // Accept firing before the first dial attempt completed. + if elapsed := time.Since(start); elapsed < time.Second { + t.Errorf("awaitPipeReady returned in %v; want ≥1s to confirm retry path was taken", elapsed) + } +}