mirror of
https://github.com/moby/buildkit.git
synced 2026-06-30 19:57:39 +00:00
This patch makes the process handling consistent between runc.Run and runc.Exec usage. Previously runc.Run would use context.Background for the runc.Run process and would monitor the request context for shutdown requests, sending a SIGKILL to the container pid1 process. This allowed runc.Run to gracefully shutdown and reap child processes. This logic was not used for runc.Exec where instead we were passing in the request context to runc.Exec, and if that request context was cancelled the runc process would immediately terminate preventing runc from reaping the child process. In this scenario the extra pid will remain forever and then when the pid1 process will get wedged in zap_pid_ns_processes syscall upon shutdown waiting fo the zombie pid to exit. With this fix both runc.Run and runc.Exec will use context.Background for runc processes and monitor the request context for shutdown request triggering a SIGKILL to the pid being monitored by runc. Signed-off-by: coryb <cbennett@netflix.com>
570 lines
14 KiB
Go
570 lines
14 KiB
Go
package runcexecutor
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"io"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"sync"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/moby/buildkit/util/bklog"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/trace"
|
|
|
|
"github.com/containerd/containerd/mount"
|
|
containerdoci "github.com/containerd/containerd/oci"
|
|
"github.com/containerd/continuity/fs"
|
|
runc "github.com/containerd/go-runc"
|
|
"github.com/docker/docker/pkg/idtools"
|
|
"github.com/moby/buildkit/executor"
|
|
"github.com/moby/buildkit/executor/oci"
|
|
gatewayapi "github.com/moby/buildkit/frontend/gateway/pb"
|
|
"github.com/moby/buildkit/identity"
|
|
"github.com/moby/buildkit/solver/pb"
|
|
"github.com/moby/buildkit/util/network"
|
|
rootlessspecconv "github.com/moby/buildkit/util/rootless/specconv"
|
|
"github.com/moby/buildkit/util/stack"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/pkg/errors"
|
|
)
|
|
|
|
type Opt struct {
|
|
// root directory
|
|
Root string
|
|
CommandCandidates []string
|
|
// without root privileges (has nothing to do with Opt.Root directory)
|
|
Rootless bool
|
|
// DefaultCgroupParent is the cgroup-parent name for executor
|
|
DefaultCgroupParent string
|
|
// ProcessMode
|
|
ProcessMode oci.ProcessMode
|
|
IdentityMapping *idtools.IdentityMapping
|
|
// runc run --no-pivot (unrecommended)
|
|
NoPivot bool
|
|
DNS *oci.DNSConfig
|
|
OOMScoreAdj *int
|
|
ApparmorProfile string
|
|
SELinux bool
|
|
TracingSocket string
|
|
}
|
|
|
|
var defaultCommandCandidates = []string{"buildkit-runc", "runc"}
|
|
|
|
type runcExecutor struct {
|
|
runc *runc.Runc
|
|
root string
|
|
cgroupParent string
|
|
rootless bool
|
|
networkProviders map[pb.NetMode]network.Provider
|
|
processMode oci.ProcessMode
|
|
idmap *idtools.IdentityMapping
|
|
noPivot bool
|
|
dns *oci.DNSConfig
|
|
oomScoreAdj *int
|
|
running map[string]chan error
|
|
mu sync.Mutex
|
|
apparmorProfile string
|
|
selinux bool
|
|
tracingSocket string
|
|
}
|
|
|
|
func New(opt Opt, networkProviders map[pb.NetMode]network.Provider) (executor.Executor, error) {
|
|
cmds := opt.CommandCandidates
|
|
if cmds == nil {
|
|
cmds = defaultCommandCandidates
|
|
}
|
|
|
|
var cmd string
|
|
var found bool
|
|
for _, cmd = range cmds {
|
|
if _, err := exec.LookPath(cmd); err == nil {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
return nil, errors.Errorf("failed to find %s binary", cmd)
|
|
}
|
|
|
|
root := opt.Root
|
|
|
|
if err := os.MkdirAll(root, 0o711); err != nil {
|
|
return nil, errors.Wrapf(err, "failed to create %s", root)
|
|
}
|
|
|
|
root, err := filepath.Abs(root)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
root, err = filepath.EvalSymlinks(root)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// clean up old hosts/resolv.conf file. ignore errors
|
|
os.RemoveAll(filepath.Join(root, "hosts"))
|
|
os.RemoveAll(filepath.Join(root, "resolv.conf"))
|
|
|
|
runtime := &runc.Runc{
|
|
Command: cmd,
|
|
Log: filepath.Join(root, "runc-log.json"),
|
|
LogFormat: runc.JSON,
|
|
Setpgid: true,
|
|
// we don't execute runc with --rootless=(true|false) explicitly,
|
|
// so as to support non-runc runtimes
|
|
}
|
|
|
|
updateRuncFieldsForHostOS(runtime)
|
|
|
|
w := &runcExecutor{
|
|
runc: runtime,
|
|
root: root,
|
|
cgroupParent: opt.DefaultCgroupParent,
|
|
rootless: opt.Rootless,
|
|
networkProviders: networkProviders,
|
|
processMode: opt.ProcessMode,
|
|
idmap: opt.IdentityMapping,
|
|
noPivot: opt.NoPivot,
|
|
dns: opt.DNS,
|
|
oomScoreAdj: opt.OOMScoreAdj,
|
|
running: make(map[string]chan error),
|
|
apparmorProfile: opt.ApparmorProfile,
|
|
selinux: opt.SELinux,
|
|
tracingSocket: opt.TracingSocket,
|
|
}
|
|
return w, nil
|
|
}
|
|
|
|
func (w *runcExecutor) Run(ctx context.Context, id string, root executor.Mount, mounts []executor.Mount, process executor.ProcessInfo, started chan<- struct{}) (err error) {
|
|
meta := process.Meta
|
|
|
|
startedOnce := sync.Once{}
|
|
done := make(chan error, 1)
|
|
w.mu.Lock()
|
|
w.running[id] = done
|
|
w.mu.Unlock()
|
|
defer func() {
|
|
w.mu.Lock()
|
|
delete(w.running, id)
|
|
w.mu.Unlock()
|
|
done <- err
|
|
close(done)
|
|
if started != nil {
|
|
startedOnce.Do(func() {
|
|
close(started)
|
|
})
|
|
}
|
|
}()
|
|
|
|
provider, ok := w.networkProviders[meta.NetMode]
|
|
if !ok {
|
|
return errors.Errorf("unknown network mode %s", meta.NetMode)
|
|
}
|
|
namespace, err := provider.New(ctx, meta.Hostname)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer namespace.Close()
|
|
|
|
if meta.NetMode == pb.NetMode_HOST {
|
|
bklog.G(ctx).Info("enabling HostNetworking")
|
|
}
|
|
|
|
resolvConf, err := oci.GetResolvConf(ctx, w.root, w.idmap, w.dns)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
hostsFile, clean, err := oci.GetHostsFile(ctx, w.root, meta.ExtraHosts, w.idmap, meta.Hostname)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if clean != nil {
|
|
defer clean()
|
|
}
|
|
|
|
mountable, err := root.Src.Mount(ctx, false)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
rootMount, release, err := mountable.Mount()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if release != nil {
|
|
defer release()
|
|
}
|
|
|
|
if id == "" {
|
|
id = identity.NewID()
|
|
}
|
|
bundle := filepath.Join(w.root, id)
|
|
|
|
if err := os.Mkdir(bundle, 0o711); err != nil {
|
|
return err
|
|
}
|
|
defer os.RemoveAll(bundle)
|
|
|
|
identity := idtools.Identity{}
|
|
if w.idmap != nil {
|
|
identity = w.idmap.RootPair()
|
|
}
|
|
|
|
rootFSPath := filepath.Join(bundle, "rootfs")
|
|
if err := idtools.MkdirAllAndChown(rootFSPath, 0o700, identity); err != nil {
|
|
return err
|
|
}
|
|
if err := mount.All(rootMount, rootFSPath); err != nil {
|
|
return err
|
|
}
|
|
defer mount.Unmount(rootFSPath, 0)
|
|
|
|
defer executor.MountStubsCleaner(ctx, rootFSPath, mounts, meta.RemoveMountStubsRecursive)()
|
|
|
|
uid, gid, sgids, err := oci.GetUser(rootFSPath, meta.User)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
f, err := os.Create(filepath.Join(bundle, "config.json"))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer f.Close()
|
|
|
|
opts := []containerdoci.SpecOpts{oci.WithUIDGID(uid, gid, sgids)}
|
|
|
|
if meta.ReadonlyRootFS {
|
|
opts = append(opts, containerdoci.WithRootFSReadonly())
|
|
}
|
|
|
|
identity = idtools.Identity{
|
|
UID: int(uid),
|
|
GID: int(gid),
|
|
}
|
|
if w.idmap != nil {
|
|
identity, err = w.idmap.ToHost(identity)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
spec, cleanup, err := oci.GenerateSpec(ctx, meta, mounts, id, resolvConf, hostsFile, namespace, w.cgroupParent, w.processMode, w.idmap, w.apparmorProfile, w.selinux, w.tracingSocket, opts...)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer cleanup()
|
|
|
|
spec.Root.Path = rootFSPath
|
|
if root.Readonly {
|
|
spec.Root.Readonly = true
|
|
}
|
|
|
|
newp, err := fs.RootPath(rootFSPath, meta.Cwd)
|
|
if err != nil {
|
|
return errors.Wrapf(err, "working dir %s points to invalid target", newp)
|
|
}
|
|
if _, err := os.Stat(newp); err != nil {
|
|
if err := idtools.MkdirAllAndChown(newp, 0o755, identity); err != nil {
|
|
return errors.Wrapf(err, "failed to create working directory %s", newp)
|
|
}
|
|
}
|
|
|
|
spec.Process.Terminal = meta.Tty
|
|
spec.Process.OOMScoreAdj = w.oomScoreAdj
|
|
if w.rootless {
|
|
if err := rootlessspecconv.ToRootless(spec); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if err := json.NewEncoder(f).Encode(spec); err != nil {
|
|
return err
|
|
}
|
|
|
|
bklog.G(ctx).Debugf("> creating %s %v", id, meta.Args)
|
|
|
|
trace.SpanFromContext(ctx).AddEvent("Container created")
|
|
err = w.run(ctx, id, bundle, process, func() {
|
|
startedOnce.Do(func() {
|
|
trace.SpanFromContext(ctx).AddEvent("Container started")
|
|
if started != nil {
|
|
close(started)
|
|
}
|
|
})
|
|
})
|
|
return exitError(ctx, err)
|
|
}
|
|
|
|
func exitError(ctx context.Context, err error) error {
|
|
if err != nil {
|
|
exitErr := &gatewayapi.ExitError{
|
|
ExitCode: gatewayapi.UnknownExitStatus,
|
|
Err: err,
|
|
}
|
|
var runcExitError *runc.ExitError
|
|
if errors.As(err, &runcExitError) {
|
|
exitErr = &gatewayapi.ExitError{
|
|
ExitCode: uint32(runcExitError.Status),
|
|
}
|
|
}
|
|
trace.SpanFromContext(ctx).AddEvent(
|
|
"Container exited",
|
|
trace.WithAttributes(
|
|
attribute.Int("exit.code", int(exitErr.ExitCode)),
|
|
),
|
|
)
|
|
select {
|
|
case <-ctx.Done():
|
|
exitErr.Err = errors.Wrapf(ctx.Err(), exitErr.Error())
|
|
return exitErr
|
|
default:
|
|
return stack.Enable(exitErr)
|
|
}
|
|
}
|
|
|
|
trace.SpanFromContext(ctx).AddEvent(
|
|
"Container exited",
|
|
trace.WithAttributes(attribute.Int("exit.code", 0)),
|
|
)
|
|
return nil
|
|
}
|
|
|
|
func (w *runcExecutor) Exec(ctx context.Context, id string, process executor.ProcessInfo) (err error) {
|
|
// first verify the container is running, if we get an error assume the container
|
|
// is in the process of being created and check again every 100ms or until
|
|
// context is canceled.
|
|
var state *runc.Container
|
|
for {
|
|
w.mu.Lock()
|
|
done, ok := w.running[id]
|
|
w.mu.Unlock()
|
|
if !ok {
|
|
return errors.Errorf("container %s not found", id)
|
|
}
|
|
|
|
state, _ = w.runc.State(ctx, id)
|
|
if state != nil && state.Status == "running" {
|
|
break
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case err, ok := <-done:
|
|
if !ok || err == nil {
|
|
return errors.Errorf("container %s has stopped", id)
|
|
}
|
|
return errors.Wrapf(err, "container %s has exited with error", id)
|
|
case <-time.After(100 * time.Millisecond):
|
|
}
|
|
}
|
|
|
|
// load default process spec (for Env, Cwd etc) from bundle
|
|
f, err := os.Open(filepath.Join(state.Bundle, "config.json"))
|
|
if err != nil {
|
|
return errors.WithStack(err)
|
|
}
|
|
defer f.Close()
|
|
|
|
spec := &specs.Spec{}
|
|
if err := json.NewDecoder(f).Decode(spec); err != nil {
|
|
return err
|
|
}
|
|
|
|
if process.Meta.User != "" {
|
|
uid, gid, sgids, err := oci.GetUser(state.Rootfs, process.Meta.User)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
spec.Process.User = specs.User{
|
|
UID: uid,
|
|
GID: gid,
|
|
AdditionalGids: sgids,
|
|
}
|
|
}
|
|
|
|
spec.Process.Terminal = process.Meta.Tty
|
|
spec.Process.Args = process.Meta.Args
|
|
if process.Meta.Cwd != "" {
|
|
spec.Process.Cwd = process.Meta.Cwd
|
|
}
|
|
|
|
if len(process.Meta.Env) > 0 {
|
|
spec.Process.Env = process.Meta.Env
|
|
}
|
|
|
|
err = w.exec(ctx, id, state.Bundle, spec.Process, process, nil)
|
|
return exitError(ctx, err)
|
|
}
|
|
|
|
type forwardIO struct {
|
|
stdin io.ReadCloser
|
|
stdout, stderr io.WriteCloser
|
|
}
|
|
|
|
func (s *forwardIO) Close() error {
|
|
return nil
|
|
}
|
|
|
|
func (s *forwardIO) Set(cmd *exec.Cmd) {
|
|
cmd.Stdin = s.stdin
|
|
cmd.Stdout = s.stdout
|
|
cmd.Stderr = s.stderr
|
|
}
|
|
|
|
func (s *forwardIO) Stdin() io.WriteCloser {
|
|
return nil
|
|
}
|
|
|
|
func (s *forwardIO) Stdout() io.ReadCloser {
|
|
return nil
|
|
}
|
|
|
|
func (s *forwardIO) Stderr() io.ReadCloser {
|
|
return nil
|
|
}
|
|
|
|
// procHandle is to track the os process so we can send signals to it.
|
|
type procHandle struct {
|
|
Process *os.Process
|
|
ready chan struct{}
|
|
ended chan struct{}
|
|
shutdown func()
|
|
}
|
|
|
|
// runcProcessHandle will create a procHandle that will be monitored, where
|
|
// on ctx.Done the process will be killed. If the kill fails, then the cancel
|
|
// will be called. This is to allow for runc to go through its normal shutdown
|
|
// procedure if the ctx is canceled and to ensure there are no zombie processes
|
|
// left by runc.
|
|
func runcProcessHandle(ctx context.Context, id string) (*procHandle, context.Context) {
|
|
runcCtx, cancel := context.WithCancel(context.Background())
|
|
p := &procHandle{
|
|
ready: make(chan struct{}),
|
|
ended: make(chan struct{}),
|
|
shutdown: cancel,
|
|
}
|
|
// preserve the logger on the context used for the runc process handling
|
|
runcCtx = bklog.WithLogger(runcCtx, bklog.G(ctx))
|
|
|
|
go func() {
|
|
// Wait for pid
|
|
select {
|
|
case <-ctx.Done():
|
|
return // nothing to kill
|
|
case <-p.ready:
|
|
}
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
killCtx, timeout := context.WithTimeout(context.Background(), 7*time.Second)
|
|
if err := p.Process.Kill(); err != nil {
|
|
bklog.G(ctx).Errorf("failed to kill runc %s: %+v", id, err)
|
|
select {
|
|
case <-killCtx.Done():
|
|
timeout()
|
|
cancel()
|
|
return
|
|
default:
|
|
}
|
|
}
|
|
timeout()
|
|
select {
|
|
case <-time.After(50 * time.Millisecond):
|
|
case <-p.ended:
|
|
return
|
|
}
|
|
case <-p.ended:
|
|
return
|
|
}
|
|
}
|
|
}()
|
|
|
|
return p, runcCtx
|
|
}
|
|
|
|
// Release will free resources with a procHandle.
|
|
func (p *procHandle) Release() {
|
|
close(p.ended)
|
|
if p.Process != nil {
|
|
p.Process.Release()
|
|
}
|
|
}
|
|
|
|
// Shutdown should be called after the runc process has exited. This will allow
|
|
// the signal handling and tty resize loops to exit, terminating the
|
|
// goroutines.
|
|
func (p *procHandle) Shutdown() {
|
|
if p.shutdown != nil {
|
|
p.shutdown()
|
|
}
|
|
}
|
|
|
|
// WaitForReady will wait until the Process has been populated or the
|
|
// provided context was cancelled. This should be called before using
|
|
// the Process field.
|
|
func (p *procHandle) WaitForReady(ctx context.Context) error {
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case <-p.ready:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WaitForStart will record the pid reported by Runc via the channel.
|
|
// We wait for up to 10s for the runc process to start. If the started
|
|
// callback is non-nil it will be called after receiving the pid.
|
|
func (p *procHandle) WaitForStart(ctx context.Context, startedCh <-chan int, started func()) error {
|
|
startedCtx, timeout := context.WithTimeout(ctx, 10*time.Second)
|
|
defer timeout()
|
|
var err error
|
|
select {
|
|
case <-startedCtx.Done():
|
|
return errors.New("runc started message never received")
|
|
case pid, ok := <-startedCh:
|
|
if !ok {
|
|
return errors.New("runc process failed to send pid")
|
|
}
|
|
if started != nil {
|
|
started()
|
|
}
|
|
p.Process, err = os.FindProcess(pid)
|
|
if err != nil {
|
|
return errors.Wrapf(err, "unable to find runc process for pid %d", pid)
|
|
}
|
|
close(p.ready)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// handleSignals will wait until the runcProcess is ready then will
|
|
// send each signal received on the channel to the process.
|
|
func handleSignals(ctx context.Context, runcProcess *procHandle, signals <-chan syscall.Signal) error {
|
|
if signals == nil {
|
|
return nil
|
|
}
|
|
err := runcProcess.WaitForReady(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil
|
|
case sig := <-signals:
|
|
err := runcProcess.Process.Signal(sig)
|
|
if err != nil {
|
|
bklog.G(ctx).Errorf("failed to signal %s to process: %s", sig, err)
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
}
|