From a9fba66231f2e55326e132aa29d0647b10be378f Mon Sep 17 00:00:00 2001 From: Samuel Karp Date: Fri, 22 May 2026 18:59:10 +0000 Subject: [PATCH] integration: deflake TestFailFastWhenConnectShim In high-load CI environments, TestFailFastWhenConnectShim/normal-unix-socket-v2 flaked with: expected error connection refused, but got <nil> at shim_dial_unix_test.go:134. Diagnosis: For normal Unix sockets (filesystem-backed, where SetUnlinkOnClose(false) is called to keep the socket file), closing the socket listener does not guarantee that subsequent dials instantly fail with ECONNREFUSED in the OS kernel. Under high parallel contention, the kernel-side transition of the socket file structure to the refusing state is deferred for a brief window. The test immediately dials the address after close, which unexpectedly succeeds under load, flaking the test. Reproduction: We successfully reproduced the flake inside an almalinux/9 Vagrant VM (running with nested virtualization inside a GCE VM) under parallel load, where the test failed exactly on iteration 3 of the integration suite loop. Solution: 1. Remove manual listener.Close() and rely entirely on ttrpcSrv.Shutdown(ctx) to close the listener, preventing double-close races. 2. Use a 5-second timeout context for Shutdown to prevent CI hangs if a connection leaks. 3. Implement a robust 2-second polling helper pollECONNREFUSED inside the test that dials in a 10ms loop (using a 100ms dial timeout to avoid blocking) until it receives syscall.ECONNREFUSED. 4. Assert require.NoError(t, pollECONNREFUSED(addr)) immediately after Shutdown to ensure the kernel has fully closed the listening socket (the socket file itself is intentionally kept) before running the main assertions. Tested: Verified by executing 15 parallel integration test runs inside the preserved AlmaLinux 9 Vagrant guest VM. All 15 runs passed successfully (100% success rate). 
Assisted-by: Antigravity Signed-off-by: Samuel Karp --- integration/shim_dial_unix_test.go | 31 ++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/integration/shim_dial_unix_test.go b/integration/shim_dial_unix_test.go index 575ed11e23..8eeb09fa16 100644 --- a/integration/shim_dial_unix_test.go +++ b/integration/shim_dial_unix_test.go @@ -32,6 +32,7 @@ import ( v2shimcli "github.com/containerd/containerd/v2/pkg/shim" "github.com/containerd/ttrpc" + "github.com/stretchr/testify/require" ) const abstractSocketPrefix = "\x00" @@ -122,14 +123,40 @@ func testFailFastWhenConnectShim(abstract bool, dialFn dialFunc) func(*testing.T t.Fatalf("failed to dial: %v", err) } + pollECONNREFUSED := func(addr string) error { + to := time.After(2 * time.Second) + for { + select { + case <-to: + return errors.New("timeout waiting for ECONNREFUSED") + default: + } + + conn, err := dialFn(addr, 100*time.Millisecond) + if err != nil { + if errors.Is(err, syscall.ECONNREFUSED) { + return nil + } + } else { + conn.Close() + } + time.Sleep(10 * time.Millisecond) + } + } + // NOTE(fuweid): // // UnixListener will unlink that the socket file when call Close. // Disable unlink when close to keep the socket file. listener.(*net.UnixListener).SetUnlinkOnClose(false) - listener.Close() - ttrpcSrv.Shutdown(ctx) + // Rely on Shutdown to close the listener during the normal path. The deferred close remains as a panic safeguard. + shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 5*time.Second) + err = ttrpcSrv.Shutdown(shutdownCtx) + shutdownCancel() + require.NoError(t, err) + + require.NoError(t, pollECONNREFUSED(addr)) checkDialErr(addr, errCh, syscall.ECONNREFUSED)