Files
containerd/integration/issue10467_linux_test.go
Wei Fu 1ac97c2c13 *: properly shutdown non-groupable shims to prevent resource leaks
Previously, to address issue #11708, PR #11793 changed containerd to always
invoke the shim binary to establish shim connections, rather than reusing the
sandbox shim. However, this change did not ensure that the Shutdown API was
called to stop the shim process.

Starting with containerd v2.0.0, the Shutdown API is only invoked for sandbox
containers (when container.SandboxID is empty). This approach works for
groupable shims, where multiple containers share a single socket address and
only require a single Shutdown call. However, for non-groupable shims, each
container requires its own Shutdown call during cleanup to avoid leaking shim
processes.

Additionally, PR #11793 introduced a corner case during upgrades:
- T1: An old containerd-shim-runc-v2 (<=v1.7.X) is running for pod A.
- T2: containerd is upgraded to v2.X.Y.
- T3: A new container A-C1 is created in pod A using the new shim-runc-v2 binary.
- T4: bootstrap.json indicates version:3 protocol, but it is downgraded to version:2 in memory.
- T5: containerd is restarted.
- T6: containerd fails to connect to A-C1.
- T7: The A-C1 container is left in EXITED status in the CRI plugin.

To address this, ensure that loadShimTask downgrades to version:2 if necessary,
and always invoke the Shutdown API for each non-groupable shim during cleanup to
prevent resource leaks and handle upgrade scenarios correctly.

(Introduced by #11793)

Signed-off-by: Wei Fu <fuweid89@gmail.com>
2025-06-02 00:01:08 -04:00

122 lines
3.9 KiB
Go

/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package integration
import (
"fmt"
"path/filepath"
"syscall"
"testing"
"time"
"github.com/containerd/continuity/fs"
"github.com/stretchr/testify/require"
"go.etcd.io/bbolt"
)
// TestIssue10467 tests the migration of sandboxes into the proper bucket. Prior to v1.7.21, the
// sandboxes were stored incorrectly in the root bucket. To verify the migration, a v1.7.20
// containerd must run and create a sandbox; the test then checks the bolt database again after
// the current release's containerd has started on the same work directory.
func TestIssue10467(t *testing.T) {
	// The release whose on-disk layout is being migrated from.
	previousVersion := "v1.7.20"

	releaseBinDir := t.TempDir()
	downloadReleaseBinary(t, releaseBinDir, previousVersion)

	t.Logf("Install config for release %s", previousVersion)
	workDir := t.TempDir()
	oneSevenCtrdConfig(t, releaseBinDir, workDir)

	t.Log("Starting the previous release's containerd")
	previousCtrdBinPath := filepath.Join(releaseBinDir, "bin", "containerd")
	previousProc := newCtrdProc(t, previousCtrdBinPath, workDir, []string{"ENABLE_CRI_SANDBOXES=yes"})

	boltdbPath := filepath.Join(workDir, "root", "io.containerd.metadata.v1.bolt", "meta.db")

	// Dump the previous containerd's log only when the test fails.
	ctrdLogPath := previousProc.logPath()
	t.Cleanup(func() {
		if t.Failed() {
			dumpFileContent(t, ctrdLogPath)
		}
	})

	require.NoError(t, previousProc.isReady())

	// Pods are cleaned up through the previous containerd only if the test
	// fails before needToCleanup is cleared below.
	needToCleanup := true
	t.Cleanup(func() {
		if t.Failed() && needToCleanup {
			t.Logf("Try to cleanup leaky pods")
			cleanupPods(t, previousProc.criRuntimeService(t))
		}
	})

	t.Log("Prepare pods for current release")
	upgradeCaseFuncs, hookFunc := shouldManipulateContainersInPodAfterUpgrade("")(t, 2, previousProc.criRuntimeService(t), previousProc.criImageService(t))
	// Fail with a clear assertion instead of an index-out-of-range panic.
	require.NotEmpty(t, upgradeCaseFuncs)
	upgradeCaseFunc := upgradeCaseFuncs[0]
	needToCleanup = false
	require.Nil(t, hookFunc)

	t.Log("Gracefully stop previous release's containerd process")
	require.NoError(t, previousProc.kill(syscall.SIGTERM))
	require.NoError(t, previousProc.wait(5*time.Minute))

	// viewBoltDB opens the bolt database at dbPath read-only, runs check in a
	// read transaction and closes the database. The handle is closed before
	// any assertion is evaluated, so a failing check cannot leave it open.
	viewBoltDB := func(dbPath string, check func(tx *bbolt.Tx) error) {
		t.Helper()

		db, err := bbolt.Open(dbPath, 0600, &bbolt.Options{ReadOnly: true})
		require.NoError(t, err)

		viewErr := db.View(check)
		closeErr := db.Close()
		require.NoError(t, viewErr)
		require.NoError(t, closeErr)
	}

	t.Logf("%s should have bucket k8s.io in root", boltdbPath)
	viewBoltDB(boltdbPath, func(tx *bbolt.Tx) error {
		if tx.Bucket([]byte("k8s.io")) == nil {
			return fmt.Errorf("expected k8s.io bucket")
		}
		return nil
	})

	t.Log("Install default config for current release")
	currentReleaseCtrdDefaultConfig(t, workDir)

	t.Log("Starting the current release's containerd")
	currentProc := newCtrdProc(t, "containerd", workDir, nil)
	require.NoError(t, currentProc.isReady())
	t.Cleanup(func() {
		t.Log("Cleanup all the pods")
		cleanupPods(t, currentProc.criRuntimeService(t))

		t.Log("Stopping current release's containerd process")
		require.NoError(t, currentProc.kill(syscall.SIGTERM))
		require.NoError(t, currentProc.wait(5*time.Minute))
	})

	t.Logf("%s should not have bucket k8s.io in root after restart", boltdbPath)
	// Inspect a copy of the database rather than the file under workDir,
	// which the current containerd was started on.
	copiedBoltdbPath := filepath.Join(t.TempDir(), "meta.db.new")
	require.NoError(t, fs.CopyFile(copiedBoltdbPath, boltdbPath))
	viewBoltDB(copiedBoltdbPath, func(tx *bbolt.Tx) error {
		if tx.Bucket([]byte("k8s.io")) != nil {
			return fmt.Errorf("unexpected k8s.io bucket")
		}
		return nil
	})

	t.Log("Verifying")
	upgradeCaseFunc(t, currentProc.criRuntimeService(t), currentProc.criImageService(t))
}