diff --git a/components/supervisor/go.mod b/components/supervisor/go.mod index 0e131c6d5b3d69..f77074cc80de10 100644 --- a/components/supervisor/go.mod +++ b/components/supervisor/go.mod @@ -24,7 +24,6 @@ require ( github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3 github.com/improbable-eng/grpc-web v0.14.0 - github.com/mailru/easygo v0.0.0-20190618140210-3c14a0dc985f github.com/prometheus/client_golang v1.14.0 github.com/prometheus/client_model v0.3.0 github.com/prometheus/common v0.42.0 @@ -116,6 +115,7 @@ require ( github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/moredure/easygo v0.0.0-20220122214504-21cd2ebdd15b github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.0.2 // indirect github.com/opencontainers/runtime-spec v1.0.2 // indirect diff --git a/components/supervisor/go.sum b/components/supervisor/go.sum index 4760166cdd1cde..9c686da66ca21d 100644 --- a/components/supervisor/go.sum +++ b/components/supervisor/go.sum @@ -231,8 +231,6 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= -github.com/mailru/easygo v0.0.0-20190618140210-3c14a0dc985f h1:4+gHs0jJFJ06bfN8PshnM6cHcxGjRUVRLo5jndDiKRQ= -github.com/mailru/easygo v0.0.0-20190618140210-3c14a0dc985f/go.mod h1:tHCZHV8b2A90ObojrEAzY0Lb03gxUxjDHr5IJyAh4ew= github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= @@ -253,6 +251,8 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJ github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/moredure/easygo v0.0.0-20220122214504-21cd2ebdd15b h1:7zlLj8X9PO8TOvWcjKZe2SwviEZE2vsnFYk85Zhi5kE= +github.com/moredure/easygo v0.0.0-20220122214504-21cd2ebdd15b/go.mod h1:RhGu0fPcznUgpHzlXjr0zuSSzRJhwe/wbFQLKHAeDSY= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= diff --git a/components/supervisor/pkg/activation/activation.go b/components/supervisor/pkg/activation/activation.go index cf21968ac0ec04..59e3ec96c54b35 100644 --- a/components/supervisor/pkg/activation/activation.go +++ b/components/supervisor/pkg/activation/activation.go @@ -10,7 +10,8 @@ import ( "os" "sync" - "github.com/mailru/easygo/netpoll" + "github.com/gitpod-io/gitpod/common-go/log" + "github.com/moredure/easygo/netpoll" "golang.org/x/xerrors" ) @@ -19,16 +20,22 @@ type Callback func(socketFD *os.File) error // Listen polls on the listener and calls callback when someone writes to it. func Listen(ctx context.Context, l net.Listener, activate Callback) error { - poller, err := netpoll.New(nil) + poller, err := netpoll.New(&netpoll.Config{ + OnWaitError: func(err error) { + log.WithError(err).Error("netpoll: wait loop error") + }, + }) if err != nil { return err } + defer poller.Close() // Get netpoll descriptor with EventRead|EventEdgeTriggered. desc, err := netpoll.HandleListener(l, netpoll.EventRead|netpoll.EventEdgeTriggered) if err != nil { return err } + defer desc.Close() var ( runc = make(chan bool, 1) diff --git a/components/supervisor/pkg/supervisor/docker.go b/components/supervisor/pkg/supervisor/docker.go new file mode 100644 index 00000000000000..4b25873ca66a8f --- /dev/null +++ b/components/supervisor/pkg/supervisor/docker.go @@ -0,0 +1,287 @@ +// Copyright (c) 2023 Gitpod GmbH. All rights reserved. +// Licensed under the GNU Affero General Public License (AGPL). +// See License.AGPL.txt in the project root for license information. + +package supervisor + +import ( + "context" + "errors" + "fmt" + "io" + "net" + "os" + "os/exec" + "strings" + "sync" + "syscall" + "time" + + "github.com/cenkalti/backoff/v4" + "golang.org/x/xerrors" + + "github.com/gitpod-io/gitpod/common-go/analytics" + "github.com/gitpod-io/gitpod/common-go/log" + "github.com/gitpod-io/gitpod/common-go/process" + "github.com/gitpod-io/gitpod/supervisor/api" + "github.com/gitpod-io/gitpod/supervisor/pkg/activation" + "github.com/gitpod-io/gitpod/supervisor/pkg/dropwriter" + "github.com/gitpod-io/gitpod/supervisor/pkg/terminal" +) + +// if exit error happens one after another within dockerStartErrorBurstDuration then we're in the burst +// and should stop trying after maxBurstDockerStartAttempts +const ( + maxBurstDockerStartAttempts = 8 + dockerStartErrorBurstDuration = time.Minute * 1 + maxIntervalBetweenDockerStart = 15 * time.Second + + logsDir = "/workspace/.gitpod/logs" + dockerUpLogFilePath = logsDir + "/docker-up.log" +) + +var ( + failedForRetryError = errors.New("failed for retry start docker-up") +) + +func socketActivationForDocker(parentCtx context.Context, wg *sync.WaitGroup, term *terminal.Mux, cfg *Config, telemetry analytics.Writer, notifications *NotificationService, cstate ContentState) { + defer wg.Done() + + ctx, cancel := context.WithCancel(parentCtx) + defer cancel() + + // only start activation if content is ready + <-cstate.ContentReady() + + if ctx.Err() != nil { + return + } + + logFile, err := openDockerUpLogFile() + if err != nil { + log.WithError(err).Error("docker-up: cannot open log file") + } else { + defer logFile.Close() + } + + var notificationSent bool + var notificationDisabled = cfg.isHeadless() || logFile == nil + notifyDockerUpFail := func() error { + if notificationDisabled || notificationSent { + return nil + } + notificationSent = true + defer func() { + notificationSent = false + }() + openLogs := "Open Logs" + dontShowAgain := "Don't Show Again" + resp, err := notifications.Notify(ctx, &api.NotifyRequest{ + Level: api.NotifyRequest_ERROR, + Message: "Docker daemon is failing to start.", + Actions: []string{openLogs, dontShowAgain}, + }) + if err != nil { + return err + } + telemetry.Track(analytics.TrackMessage{ + Identity: analytics.Identity{UserID: cfg.OwnerId}, + Event: "gitpod_activate_docker_fail_notification", + Properties: map[string]interface{}{ + "instanceId": cfg.WorkspaceInstanceID, + "workspaceId": cfg.WorkspaceID, + "debugWorkspace": cfg.isDebugWorkspace(), + "action": resp.Action, + }, + }) + if resp.Action == dontShowAgain { + notificationDisabled = true + return nil + } + if resp.Action != openLogs { + return nil + } + gpPath, err := exec.LookPath("gp") + if err != nil { + return err + } + gpCmd := exec.CommandContext(ctx, gpPath, "open", dockerUpLogFilePath) + gpCmd.Env = childProcEnvvars + gpCmd.Stdout = os.Stdout + gpCmd.Stderr = os.Stderr + return gpCmd.Run() + } + + var stdout, stderr io.Writer + + stdout = os.Stdout + stderr = os.Stderr + if cfg.WorkspaceLogRateLimit > 0 { + limit := int64(cfg.WorkspaceLogRateLimit) + stdout = dropwriter.Writer(stdout, dropwriter.NewBucket(limit*1024*3, limit*1024)) + stderr = dropwriter.Writer(stderr, dropwriter.NewBucket(limit*1024*3, limit*1024)) + log.WithField("limit_kb_per_sec", limit).Info("docker-up: rate limiting log output") + } + + if logFile != nil { + stdout = io.MultiWriter(stdout, logFile) + stderr = io.MultiWriter(stderr, logFile) + } + + for ctx.Err() == nil { + err := listenToDockerSocket(ctx, term, cfg, telemetry, logFile, stdout, stderr) + if ctx.Err() != nil { + return + } + + if errors.Is(err, failedForRetryError) { + go func() { + notifyErr := notifyDockerUpFail() + if notifyErr != nil && !errors.Is(notifyErr, context.Canceled) { + log.WithError(notifyErr).Error("cannot notify about docker-up failure") + } + }() + } + + time.Sleep(1 * time.Second) + } +} + +// listenToDockerSocket listens to the docker socket and starts the docker-up process +// if it fails to start with sometimes retry then listener is closed to requests all incoming requests and prevent infinite try +func listenToDockerSocket(parentCtx context.Context, term *terminal.Mux, cfg *Config, telemetry analytics.Writer, logFile *os.File, stdout, stderr io.Writer) error { + ctx, cancel := context.WithCancel(parentCtx) + defer cancel() + + fn := "/var/run/docker.sock" + l, err := net.Listen("unix", fn) + if err != nil { + return err + } + + go func() { + <-ctx.Done() + l.Close() + }() + + _ = os.Chown(fn, gitpodUID, gitpodGID) + + var lastExitErrorTime time.Time + burstAttempts := 0 + backoffStrategy := backoff.NewExponentialBackOff() + backoffStrategy.MaxInterval = maxIntervalBetweenDockerStart + + for ctx.Err() == nil { + err = activation.Listen(ctx, l, func(socketFD *os.File) error { + defer socketFD.Close() + startTime := time.Now() + telemetry.Track(analytics.TrackMessage{ + Identity: analytics.Identity{UserID: cfg.OwnerId}, + Event: "gitpod_activate_docker", + Properties: map[string]interface{}{ + "instanceId": cfg.WorkspaceInstanceID, + "workspaceId": cfg.WorkspaceID, + "debugWorkspace": cfg.isDebugWorkspace(), + }, + }) + + if logFile != nil { + defer fmt.Fprintf(logFile, "\n======= Stop docker-up at %s =======\n", time.Now().Format(time.RFC3339)) + fmt.Fprintf(logFile, "======= Start docker-up at %s =======\n", startTime.Format(time.RFC3339)) + } + + cmd := exec.CommandContext(ctx, "/usr/bin/docker-up") + cmd.Env = append(os.Environ(), "LISTEN_FDS=1") + cmd.ExtraFiles = []*os.File{socketFD} + cmd.Stdout = stdout + cmd.Stderr = stderr + + err = cmd.Start() + if err != nil { + return err + } + ptyCtx, cancel := context.WithCancel(ctx) + go func(ptyCtx context.Context) { + select { + case <-ctx.Done(): + _ = cmd.Process.Signal(syscall.SIGTERM) + case <-ptyCtx.Done(): + } + }(ptyCtx) + err = cmd.Wait() + cancel() + return err + }) + + if ctx.Err() != nil { + return ctx.Err() + } + + var exitError *exec.ExitError + if err != nil { + exitError, _ = err.(*exec.ExitError) + } + if exitError != nil && exitError.ExitCode() > 0 { + // docker-up or daemon exited with an error - we'll try again + // it can be a transient condition like apt is locked by another process to install prerequisites + // or permament like misconfigured env var, image which does not allow to install prerequisites at all + // or any general issue with docker daemon startup + if time.Since(lastExitErrorTime) <= dockerStartErrorBurstDuration { + // we're in the exit error burst + burstAttempts++ + } else { + // no burst, reset the counter + burstAttempts = 0 + backoffStrategy.Reset() + } + lastExitErrorTime = time.Now() + + } else { + // transient condition like a docker daemon get killed because of OOM + burstAttempts = 0 + backoffStrategy.Reset() + } + + nextBackOff := backoffStrategy.NextBackOff() + + if nextBackOff == backoff.Stop || burstAttempts >= maxBurstDockerStartAttempts { + cancel() + log.WithError(err).WithField("attempts", burstAttempts).Error("cannot activate docker after maximum attempts, stopping trying permanently") + if logFile != nil { + fmt.Fprintln(logFile, "Cannot activate docker after maximum attempts, stopping trying permanently.") + fmt.Fprintln(logFile, "Please check logs above, fix the configuration, use `gp validate` to verify change, and commit to apply.") + fmt.Fprintf(logFile, "If it does not help, please reach out to the support. Don't forget to share your workspace ID: %s.\n", cfg.WorkspaceID) + } + return failedForRetryError + } + + if err != nil && + !errors.Is(err, context.Canceled) && + !process.IsNotChildProcess(err) && + !strings.Contains(err.Error(), "signal: ") { + // don't log typical transient errors + log.WithError(err).WithField("attempts", burstAttempts).WithField("backoff", nextBackOff.String()).Error("cannot activate docker, retrying...") + } + time.Sleep(nextBackOff) + } + return ctx.Err() +} + +func openDockerUpLogFile() (*os.File, error) { + if err := os.MkdirAll(logsDir, 0755); err != nil { + return nil, xerrors.Errorf("cannot create logs dir: %w", err) + } + if err := os.Chown(logsDir, gitpodUID, gitpodGID); err != nil { + return nil, xerrors.Errorf("cannot chown logs dir: %w", err) + } + logFile, err := os.OpenFile(dockerUpLogFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return nil, xerrors.Errorf("cannot open docker-up log file: %w", err) + } + + if err := os.Chown(dockerUpLogFilePath, gitpodUID, gitpodGID); err != nil { + _ = logFile.Close() + return nil, xerrors.Errorf("cannot chown docker-up log file: %w", err) + } + return logFile, nil +} diff --git a/components/supervisor/pkg/supervisor/supervisor.go b/components/supervisor/pkg/supervisor/supervisor.go index 39b942bc0a3257..7c3764affaf83a 100644 --- a/components/supervisor/pkg/supervisor/supervisor.go +++ b/components/supervisor/pkg/supervisor/supervisor.go @@ -55,7 +55,6 @@ import ( "github.com/gitpod-io/gitpod/content-service/pkg/git" gitpod "github.com/gitpod-io/gitpod/gitpod-protocol" "github.com/gitpod-io/gitpod/supervisor/api" - "github.com/gitpod-io/gitpod/supervisor/pkg/activation" "github.com/gitpod-io/gitpod/supervisor/pkg/config" "github.com/gitpod-io/gitpod/supervisor/pkg/dropwriter" "github.com/gitpod-io/gitpod/supervisor/pkg/metrics" @@ -408,7 +407,7 @@ func Run(options ...RunOption) { if !opts.RunGP { wg.Add(1) - go socketActivationForDocker(ctx, &wg, termMux, cfg, telemetry) + go socketActivationForDocker(ctx, &wg, termMux, cfg, telemetry, notificationService, cstate) } if cfg.isHeadless() { @@ -1547,73 +1546,6 @@ func recordInitializerMetrics(path string, metrics *metrics.SupervisorMetrics) { } } -func socketActivationForDocker(ctx context.Context, wg *sync.WaitGroup, term *terminal.Mux, cfg *Config, w analytics.Writer) { - defer wg.Done() - - fn := "/var/run/docker.sock" - l, err := net.Listen("unix", fn) - if err != nil { - log.WithError(err).Error("cannot provide Docker activation socket") - return - } - - go func() { - <-ctx.Done() - l.Close() - }() - - _ = os.Chown(fn, gitpodUID, gitpodGID) - for ctx.Err() == nil { - err = activation.Listen(ctx, l, func(socketFD *os.File) error { - defer socketFD.Close() - cmd := exec.Command("/usr/bin/docker-up") - cmd.Env = append(os.Environ(), "LISTEN_FDS=1") - cmd.ExtraFiles = []*os.File{socketFD} - alias, err := term.Start(cmd, terminal.TermOptions{ - Annotations: map[string]string{ - "gitpod.supervisor": "true", - }, - LogToStdout: true, - }) - outcome := "success" - if err != nil { - outcome = "failure" - } - w.Track(analytics.TrackMessage{ - Identity: analytics.Identity{UserID: cfg.OwnerId}, - Event: "gitpod_activate_docker", - Properties: map[string]interface{}{ - "instanceId": cfg.WorkspaceInstanceID, - "workspaceId": cfg.WorkspaceID, - "outcome": outcome, - "debugWorkspace": cfg.isDebugWorkspace(), - }, - }) - if err != nil { - return err - } - pty, ok := term.Get(alias) - if !ok { - return errors.New("cannot find pty") - } - ptyCtx, cancel := context.WithCancel(context.Background()) - go func(ptyCtx context.Context) { - select { - case <-ctx.Done(): - _ = pty.Command.Process.Signal(syscall.SIGTERM) - case <-ptyCtx.Done(): - } - }(ptyCtx) - _, err = pty.Wait() - cancel() - return err - }) - if err != nil && !errors.Is(err, context.Canceled) && err.Error() != "signal: killed" { - log.WithError(err).Error("cannot provide Docker activation socket") - } - } -} - type PerfAnalyzer struct { label string defs []int