[#389686] Add crash detection logic to daemon

Dane Everitt 2019-11-30 16:43:18 -08:00
parent be14811eb4
commit 8c57583ce9
6 changed files with 125 additions and 7 deletions

@@ -82,6 +82,11 @@ type SystemConfiguration struct {
// daemon boot. This can take a long time on systems with many servers, or on
// systems with servers containing thousands of files.
SetPermissionsOnBoot bool `yaml:"set_permissions_on_boot"`
// Determines if Wings should treat a server that stops with a normal exit code of
// "0" as having crashed if the process stopped without any Wings interaction, e.g.
// the user did not press the stop button but the process still exited cleanly.
DetectCleanExitAsCrash bool `default:"true" yaml:"detect_clean_exit_as_crash"`
}
// Defines the docker configuration used by the daemon when interacting with

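For illustration, the decision this option controls can be read as the following standalone Go sketch. The helper name is hypothetical; the actual check lives inline in server/crash.go later in this commit.

package main

import "fmt"

// shouldHandleAsCrash is a hypothetical helper mirroring the check added in
// server/crash.go: a clean exit (code 0) that was not an OOM kill is only
// treated as a crash when detect_clean_exit_as_crash is enabled.
func shouldHandleAsCrash(exitCode uint32, oomKilled, detectCleanExitAsCrash bool) bool {
	if exitCode == 0 && !oomKilled && !detectCleanExitAsCrash {
		return false
	}
	return true
}

func main() {
	fmt.Println(shouldHandleAsCrash(0, false, false)) // false: clean exit, clean-exit detection disabled
	fmt.Println(shouldHandleAsCrash(0, false, true))  // true: clean exit, but configured to treat it as a crash
	fmt.Println(shouldHandleAsCrash(1, false, false)) // true: non-zero exit code is always a crash candidate
}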
server/crash.go Normal file

@@ -0,0 +1,66 @@
package server
import (
"github.com/pkg/errors"
"github.com/pterodactyl/wings/config"
"go.uber.org/zap"
"time"
)
type CrashDetection struct {
// If set to false, the system will not listen for crash detection events that
// can indicate that the server stopped unexpectedly.
Enabled bool `default:"true" json:"enabled" yaml:"enabled"`
// Tracks the time of the last server crash event.
lastCrash time.Time
}
// Looks at the environment exit state to determine if the process exited cleanly or
// if it was the result of an event that we should try to recover from.
//
// This function assumes it is called under circumstances where a crash is suspected
// of occurring. It will not do anything to determine if it was actually a crash, just
// look at the exit state and check if it meets the criteria of being called a crash
// by Wings.
//
// If the server is determined to have crashed, the process will be restarted and the
// counter for the server will be incremented.
//
// @todo output event to server console
func (s *Server) handleServerCrash() error {
// No point in doing anything here if the server isn't currently offline; there
// is no reason to run a crash detection event. If crash detection is disabled
// for this server we want to skip everything after this as well.
if s.State != ProcessOfflineState || !s.CrashDetection.Enabled {
if !s.CrashDetection.Enabled {
zap.S().Debugw("server triggered crash detection but handler is disabled for server process", zap.String("server", s.Uuid))
}
return nil
}
exitCode, oomKilled, err := s.Environment.ExitState()
if err != nil {
return errors.WithStack(err)
}
// If the system is not configured to detect a clean exit code as a crash, and the
// crash is not the result of the program running out of memory, do nothing.
if exitCode == 0 && !oomKilled && !config.Get().System.DetectCleanExitAsCrash {
zap.S().Debugw("server exited with successful code; system configured to not detect as crash", zap.String("server", s.Uuid))
return nil
}
c := s.CrashDetection.lastCrash
// If the last crash time was within the last 60 seconds we do not want to perform
// an automatic reboot of the process. Return an error that can be handled.
if !c.IsZero() && c.Add(time.Second * 60).After(time.Now()) {
return &crashTooFrequent{}
}
s.CrashDetection.lastCrash = time.Now()
return s.Environment.Start()
}

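The 60-second restart throttle above can also be read in isolation. This is a hypothetical restatement rather than code from the commit; a zero lastCrash value means no crash has been recorded yet, so the restart is allowed.

package main

import (
	"fmt"
	"time"
)

// crashedTooRecently restates the throttle in handleServerCrash: skip the
// automatic restart if a crash was already recorded within the last 60 seconds.
func crashedTooRecently(lastCrash, now time.Time) bool {
	return !lastCrash.IsZero() && lastCrash.Add(time.Second*60).After(now)
}

func main() {
	now := time.Now()
	fmt.Println(crashedTooRecently(time.Time{}, now))              // false: no previous crash recorded
	fmt.Println(crashedTooRecently(now.Add(-30*time.Second), now)) // true: crashed 30s ago, too soon to restart
	fmt.Println(crashedTooRecently(now.Add(-5*time.Minute), now))  // false: last crash was long enough ago
}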
@@ -40,6 +40,10 @@ type Environment interface {
// is not running no error should be returned.
Terminate(signal os.Signal) error
// Returns the exit state of the process. The first result is the exit code, the second
// determines if the process was killed by the system OOM killer.
ExitState() (uint32, bool, error)
// Creates the necessary environment for running the server process. For example,
// in the Docker environment create will create a new container instance for the
// server.

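Only the new ExitState method is visible in this hunk, so the sketch below narrows the interface to that single method and uses a hypothetical test double to show how a caller consumes the three return values. The real Environment interface has more methods than shown here.

package main

import "fmt"

// exitStater is a hypothetical, narrowed view of the Environment interface
// covering only the method added in this commit.
type exitStater interface {
	ExitState() (uint32, bool, error)
}

// stubEnvironment is a test double returning a fixed exit state.
type stubEnvironment struct {
	exitCode  uint32
	oomKilled bool
}

func (s stubEnvironment) ExitState() (uint32, bool, error) {
	return s.exitCode, s.oomKilled, nil
}

func main() {
	var env exitStater = stubEnvironment{exitCode: 137, oomKilled: true}

	code, oom, err := env.ExitState()
	if err != nil {
		panic(err)
	}
	fmt.Printf("exit code %d, oom killed: %t\n", code, oom)
}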
@@ -316,6 +316,17 @@ func (d *DockerEnvironment) Terminate(signal os.Signal) error {
)
}
// Determine the container exit state and return the exit code and whether or not
// the container was killed by the OOM killer.
func (d *DockerEnvironment) ExitState() (uint32, bool, error) {
c, err := d.Client.ContainerInspect(context.Background(), d.Server.Uuid)
if err != nil {
return 0, false, errors.WithStack(err)
}
return uint32(c.State.ExitCode), c.State.OOMKilled, nil
}
// Attaches to the docker container itself and ensures that we can pipe data in and out
// of the process stream. This should not be used for reading console data as you *will*
// miss important output at the beginning because of the time delay with attaching to the

@@ -11,4 +11,17 @@ func IsSuspendedError(err error) bool {
_, ok := err.(*suspendedError)
return ok
}
type crashTooFrequent struct {
}
func (e *crashTooFrequent) Error() string {
return "server has crashed too soon after the last detected crash"
}
func IsTooFrequentCrashError(err error) bool {
_, ok := err.(*crashTooFrequent)
return ok
}

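crashTooFrequent follows the same marker-error pattern as suspendedError: callers check the concrete type rather than matching on the message. The sketch below copies the two new declarations so it compiles on its own, and notes that the type assertion only matches a bare, unwrapped error.

package main

import (
	"errors"
	"fmt"
)

// crashTooFrequent and IsTooFrequentCrashError are copied from the hunk above
// so this sketch compiles on its own.
type crashTooFrequent struct{}

func (e *crashTooFrequent) Error() string {
	return "server has crashed too soon after the last detected crash"
}

func IsTooFrequentCrashError(err error) bool {
	_, ok := err.(*crashTooFrequent)
	return ok
}

func main() {
	var err error = &crashTooFrequent{}
	fmt.Println(IsTooFrequentCrashError(err)) // true: bare marker error matches the type assertion

	// The type assertion only matches the bare error. If the error were ever
	// wrapped (for example with fmt.Errorf and %w), errors.As would be needed instead.
	wrapped := fmt.Errorf("restart skipped: %w", err)
	fmt.Println(IsTooFrequentCrashError(wrapped)) // false

	var target *crashTooFrequent
	fmt.Println(errors.As(wrapped, &target)) // true
}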
@@ -39,6 +39,7 @@ type Server struct {
// server process.
EnvVars map[string]string `json:"environment" yaml:"environment"`
CrashDetection CrashDetection `json:"crash_detection" yaml:"crash_detection"`
Build BuildSettings `json:"build"`
Allocations Allocations `json:"allocations"`
Environment Environment `json:"-" yaml:"-"`
@@ -287,9 +288,9 @@ func (s *Server) SetState(state string) error {
//
// We also get the benefit of server status changes always propagating corrected configurations
// to the disk should we forget to do it elsewhere.
go func (serv *Server) {
if _, err := serv.WriteConfigurationToDisk(); err != nil {
zap.S().Warnw("failed to write server state change to disk", zap.String("server", serv.Uuid), zap.Error(err))
go func(server *Server) {
if _, err := server.WriteConfigurationToDisk(); err != nil {
zap.S().Warnw("failed to write server state change to disk", zap.String("server", server.Uuid), zap.Error(err))
}
}(s)
@@ -298,9 +299,27 @@ func (s *Server) SetState(state string) error {
// Emit the event to any listeners that are currently registered.
s.Emit(StatusEvent, s.State)
// @todo handle a crash event here. Need to port the logic from the Nodejs daemon
// into this daemon. I believe its basically just if state != stopping && newState = stopped
// then crashed.
// If the server was in an online state and is now in an offline state, we should handle
// that as a crash event. In that scenario, check the last crash time and the crash
// counter.
//
// In the event that we have passed the thresholds, don't do anything; otherwise,
// automatically attempt to start the process back up for the user. This is done in a
// separate goroutine so as not to block any actions currently taking place in the flow
// that called this function.
if (prevState == ProcessStartingState || prevState == ProcessRunningState) && s.State == ProcessOfflineState {
zap.S().Infow("detected server as entering a potentially crashed state; running handler", zap.String("server", s.Uuid))
go func(server *Server) {
if err := server.handleServerCrash(); err != nil {
if IsTooFrequentCrashError(err) {
zap.S().Infow("did not restart server after crash; occurred too soon after last", zap.String("server", server.Uuid))
} else {
zap.S().Errorw("failed to handle server crash state", zap.String("server", server.Uuid), zap.Error(err))
}
}
}(s)
}
return nil
}
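The transition check added in SetState can be summarized as a small predicate. This is a hypothetical restatement; the state constant values below are placeholders and are not taken from the diff.

package main

import "fmt"

// These mirror the process state constants referenced in server.go; the
// string values here are assumed for the sketch.
const (
	ProcessOfflineState  = "offline"
	ProcessStartingState = "starting"
	ProcessRunningState  = "running"
)

// isCrashTransition restates the condition in SetState: a server that was
// starting or running and is now offline is a candidate for crash handling.
func isCrashTransition(prevState, newState string) bool {
	wasOnline := prevState == ProcessStartingState || prevState == ProcessRunningState
	return wasOnline && newState == ProcessOfflineState
}

func main() {
	fmt.Println(isCrashTransition(ProcessRunningState, ProcessOfflineState)) // true: potential crash
	fmt.Println(isCrashTransition(ProcessOfflineState, ProcessOfflineState)) // false: was never online
}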