From cd67e5fdb91e645e90640fd7815c104c9e68c825 Mon Sep 17 00:00:00 2001 From: Dane Everitt Date: Mon, 31 Jan 2022 19:09:08 -0500 Subject: [PATCH] Fix logic for context based environment stopping Uses dual contexts to handle stopping using a timed context, and also terminating the entire process loop if the parent context gets canceled. --- environment/docker/power.go | 66 +++++++++++++++++++++++------------ environment/environment.go | 15 ++++---- router/router_transfer.go | 2 +- router/websocket/websocket.go | 5 +-- server/backup.go | 2 +- server/install.go | 3 +- server/listeners.go | 3 +- server/power.go | 10 +----- server/update.go | 4 ++- 9 files changed, 62 insertions(+), 48 deletions(-) diff --git a/environment/docker/power.go b/environment/docker/power.go index bf83497..3370075 100644 --- a/environment/docker/power.go +++ b/environment/docker/power.go @@ -177,7 +177,9 @@ func (e *Environment) Stop(ctx context.Context) error { return e.SendCommand(s.Value) } - t := time.Second * 30 + // Allow the stop action to run for however long it takes, similar to executing a command + // and using a different logic pathway to wait for the container to stop successfully. + t := time.Duration(-1) if err := e.client.ContainerStop(ctx, e.Id, &t); err != nil { // If the container does not exist just mark the process as stopped and return without // an error. @@ -196,48 +198,66 @@ func (e *Environment) Stop(ctx context.Context) error { // command. If the server does not stop after seconds have passed, an error will // be returned, or the instance will be terminated forcefully depending on the // value of the second argument. -func (e *Environment) WaitForStop(seconds uint, terminate bool) error { - ctx, cancel := context.WithTimeout(context.Background(), time.Second * time.Duration(seconds)) +// +// Calls to Environment.Terminate() in this function use the context passed +// through since we don't want to prevent termination of the server instance +// just because the context.WithTimeout() has expired. +func (e *Environment) WaitForStop(ctx context.Context, duration time.Duration, terminate bool) error { + tctx, cancel := context.WithTimeout(context.Background(), duration) defer cancel() - return e.WaitForStopWithContext(ctx, terminate) -} + // If the parent context is canceled, abort the timed context for termination. + go func() { + select { + case <-ctx.Done(): + cancel() + case <-tctx.Done(): + // When the timed context is canceled, terminate this routine since we no longer + // need to worry about the parent routine being canceled. + break + } + }() -func (e *Environment) WaitForStopWithContext(ctx context.Context, terminate bool) error { - if err := e.Stop(ctx); err != nil { + doTermination := func (s string) error { + e.log().WithField("step", s).WithField("duration", duration).Warn("container stop did not complete in time, terminating process...") + return e.Terminate(ctx, os.Kill) + } + + // We pass through the timed context for this stop action so that if one of the + // internal docker calls fails to ever finish before we've exhausted the time limit + // the resources get cleaned up, and the exection is stopped. + if err := e.Stop(tctx); err != nil { + if terminate && errors.Is(err, context.DeadlineExceeded) { + return doTermination("stop") + } return err } // Block the return of this function until the container as been marked as no // longer running. If this wait does not end by the time seconds have passed, // attempt to terminate the container, or return an error. - ok, errChan := e.client.ContainerWait(ctx, e.Id, container.WaitConditionNotRunning) + ok, errChan := e.client.ContainerWait(tctx, e.Id, container.WaitConditionNotRunning) select { case <-ctx.Done(): if err := ctx.Err(); err != nil { if terminate { - log.WithField("container_id", e.Id).Info("server did not stop in time, executing process termination") - - return e.Terminate(ctx, os.Kill) + return doTermination("parent-context") } return err } case err := <-errChan: // If the error stems from the container not existing there is no point in wasting // CPU time to then try and terminate it. - if err != nil && !client.IsErrNotFound(err) { - if terminate { - l := log.WithField("container_id", e.Id) - if errors.Is(err, context.DeadlineExceeded) { - l.Warn("deadline exceeded for container stop; terminating process") - } else { - l.WithField("error", err).Warn("error while waiting for container stop; terminating process") - } - - return e.Terminate(ctx, os.Kill) - } - return errors.WrapIf(err, "environment/docker: error waiting on container to enter \"not-running\" state") + if err == nil || client.IsErrNotFound(err) { + return nil } + if terminate { + if !errors.Is(err, context.DeadlineExceeded) { + e.log().WithField("error", err).Warn("error while waiting for container stop; terminating process") + } + return doTermination("wait") + } + return errors.WrapIf(err, "environment/docker: error waiting on container to enter \"not-running\" state") case <-ok: } diff --git a/environment/environment.go b/environment/environment.go index 0fa226a..2d03837 100644 --- a/environment/environment.go +++ b/environment/environment.go @@ -3,6 +3,7 @@ package environment import ( "context" "os" + "time" "github.com/pterodactyl/wings/events" ) @@ -63,15 +64,11 @@ type ProcessEnvironment interface { Stop(ctx context.Context) error // WaitForStop waits for a server instance to stop gracefully. If the server is - // still detected as running after seconds, an error will be returned, or the server - // will be terminated depending on the value of the second argument. - WaitForStop(seconds uint, terminate bool) error - - // WaitForStopWithContext works in the same fashion as WaitForStop, but accepts a - // context value and will stop waiting when the context is canceled. If the terminate - // option is true, the server will be terminate when the context is canceled if the - // server has not stopped at that point. - WaitForStopWithContext(ctx context.Context, terminate bool) error + // still detected as running after "duration", an error will be returned, or the server + // will be terminated depending on the value of the second argument. If the context + // provided is canceled the underlying wait conditions will be stopped and the + // entire loop will be ended (potentially without stopping or terminating). + WaitForStop(ctx context.Context, duration time.Duration, terminate bool) error // Terminate stops a running server instance using the provided signal. This function // is a no-op if the server is already stopped. diff --git a/router/router_transfer.go b/router/router_transfer.go index 794981e..b3153e0 100644 --- a/router/router_transfer.go +++ b/router/router_transfer.go @@ -178,7 +178,7 @@ func postServerArchive(c *gin.Context) { // Ensure the server is offline. Sometimes a "No such container" error gets through // which means the server is already stopped. We can ignore that. - if err := s.Environment.WaitForStop(60, false); err != nil && !strings.Contains(strings.ToLower(err.Error()), "no such container") { + if err := s.Environment.WaitForStop(s.Context(), time.Minute, false); err != nil && !strings.Contains(strings.ToLower(err.Error()), "no such container") { sendTransferLog("Failed to stop server, aborting transfer..") l.WithField("error", err).Error("failed to stop server") return diff --git a/router/websocket/websocket.go b/router/websocket/websocket.go index 0d88627..064160d 100644 --- a/router/websocket/websocket.go +++ b/router/websocket/websocket.go @@ -11,9 +11,10 @@ import ( "emperror.dev/errors" "github.com/apex/log" "github.com/gbrlsnchs/jwt/v3" + "github.com/goccy/go-json" "github.com/google/uuid" "github.com/gorilla/websocket" - "github.com/goccy/go-json" + "github.com/pterodactyl/wings/system" "github.com/pterodactyl/wings/config" "github.com/pterodactyl/wings/environment" @@ -353,7 +354,7 @@ func (h *Handler) HandleInbound(ctx context.Context, m Message) error { } err := h.server.HandlePowerAction(action) - if errors.Is(err, context.DeadlineExceeded) { + if errors.Is(err, system.ErrLockerLocked) { m, _ := h.GetErrorMessage("another power action is currently being processed for this server, please try again later") _ = h.SendJson(Message{ diff --git a/server/backup.go b/server/backup.go index b7d5cf6..2040f6c 100644 --- a/server/backup.go +++ b/server/backup.go @@ -142,7 +142,7 @@ func (s *Server) RestoreBackup(b backup.BackupInterface, reader io.ReadCloser) ( // instance, otherwise you'll likely hit all types of write errors due to the // server being suspended. if s.Environment.State() != environment.ProcessOfflineState { - if err = s.Environment.WaitForStop(120, false); err != nil { + if err = s.Environment.WaitForStop(s.Context(), time.Minute*2, false); err != nil { if !client.IsErrNotFound(err) { return errors.WrapIf(err, "server/backup: restore: failed to wait for container stop") } diff --git a/server/install.go b/server/install.go index d4c6dd6..bb9a878 100644 --- a/server/install.go +++ b/server/install.go @@ -10,6 +10,7 @@ import ( "path/filepath" "strconv" "strings" + "time" "emperror.dev/errors" "github.com/apex/log" @@ -79,7 +80,7 @@ func (s *Server) Install(sync bool) error { func (s *Server) Reinstall() error { if s.Environment.State() != environment.ProcessOfflineState { s.Log().Debug("waiting for server instance to enter a stopped state") - if err := s.Environment.WaitForStop(10, true); err != nil { + if err := s.Environment.WaitForStop(s.Context(), time.Second*10, true); err != nil { return err } } diff --git a/server/listeners.go b/server/listeners.go index 5d0bfb7..54e8801 100644 --- a/server/listeners.go +++ b/server/listeners.go @@ -5,6 +5,7 @@ import ( "regexp" "strconv" "sync" + "time" "github.com/apex/log" @@ -44,7 +45,7 @@ func (dsl *diskSpaceLimiter) Reset() { func (dsl *diskSpaceLimiter) Trigger() { dsl.o.Do(func() { dsl.server.PublishConsoleOutputFromDaemon("Server is exceeding the assigned disk space limit, stopping process now.") - if err := dsl.server.Environment.WaitForStop(60, true); err != nil { + if err := dsl.server.Environment.WaitForStop(dsl.server.Context(), time.Minute, true); err != nil { dsl.server.Log().WithField("error", err).Error("failed to stop server after exceeding space limit!") } }) diff --git a/server/power.go b/server/power.go index f556705..eb83935 100644 --- a/server/power.go +++ b/server/power.go @@ -135,12 +135,9 @@ func (s *Server) HandlePowerAction(action PowerAction, waitSeconds ...int) error case PowerActionStop: fallthrough case PowerActionRestart: - ctx, cancel := context.WithTimeout(s.Context(), time.Second) - defer cancel() - // We're specifically waiting for the process to be stopped here, otherwise the lock is // released too soon, and you can rack up all sorts of issues. - if err := s.Environment.WaitForStopWithContext(ctx, true); err != nil { + if err := s.Environment.WaitForStop(s.Context(), time.Minute*10, true); err != nil { // Even timeout errors should be bubbled back up the stack. If the process didn't stop // nicely, but the terminate argument was passed then the server is stopped without an // error being returned. @@ -156,11 +153,6 @@ func (s *Server) HandlePowerAction(action PowerAction, waitSeconds ...int) error return nil } - // Release the resources we acquired for the initial timer context since we don't - // need them anymore at this point, and the start process can take quite awhile to - // complete. - cancel() - // Now actually try to start the process by executing the normal pre-boot logic. if err := s.onBeforeStart(); err != nil { return err diff --git a/server/update.go b/server/update.go index 3334b59..c8a57bb 100644 --- a/server/update.go +++ b/server/update.go @@ -1,6 +1,8 @@ package server import ( + "time" + "github.com/pterodactyl/wings/environment/docker" "github.com/pterodactyl/wings/environment" @@ -58,7 +60,7 @@ func (s *Server) SyncWithEnvironment() { s.Log().Info("server suspended with running process state, terminating now") go func(s *Server) { - if err := s.Environment.WaitForStop(60, true); err != nil { + if err := s.Environment.WaitForStop(s.Context(), time.Minute, true); err != nil { s.Log().WithField("error", err).Warn("failed to terminate server environment after suspension") } }(s)