[#389686] Add crash detection logic to daemon

2019-11-30 16:43:18 -08:00 · 2019-11-30 16:43:18 -08:00 · 8c57583ce9
commit 8c57583ce9
parent be14811eb4
6 changed files with 125 additions and 7 deletions
--- a/config/config.go
+++ b/config/config.go
@ -82,6 +82,11 @@ type SystemConfiguration struct {
 	// daemon boot. This can take a long time on systems with many servers, or on
 	// systems with servers containing thousands of files.
 	SetPermissionsOnBoot bool `yaml:"set_permissions_on_boot"`
 	// Determines if Wings should detect a server that stops with a normal exit code of
 	// "0" as being crashed if the process stopped without any Wings interaction. E.g.
 	// the user did not press the stop button, but the process stopped cleanly.
 	DetectCleanExitAsCrash bool `default:"true" yaml:"detect_clean_exit_as_crash"`
 }
 // Defines the docker configuration used by the daemon when interacting with
--- a/server/crash.go
+++ b/server/crash.go
@ -0,0 +1,66 @@
 package server
 import (
 	"github.com/pkg/errors"
 	"github.com/pterodactyl/wings/config"
 	"go.uber.org/zap"
 	"time"
 )
 // CrashDetection holds the crash detection settings for a server together with
 // the time of the most recent crash that was acted upon.
 type CrashDetection struct {
 	// If set to false, the system will not listen for crash detection events that
 	// can indicate that the server stopped unexpectedly.
 	Enabled bool `default:"true" json:"enabled" yaml:"enabled"`
 	// Tracks the time of the last server crash event. The zero value means no crash
 	// has been recorded yet; handleServerCrash sets it just before it attempts to
 	// start the process again.
 	lastCrash time.Time
 }
 // handleServerCrash looks at the environment exit state to determine if the process
 // exited cleanly or if it was the result of an event that we should try to recover from.
 //
 // This function assumes it is called under circumstances where a crash is suspected
 // of occurring. It will not do anything to determine if it was actually a crash, just
 // look at the exit state and check if it meets the criteria of being called a crash
 // by Wings.
 //
 // If the server is determined to have crashed, the time of the crash is recorded and
 // the process is started again. The returned error is:
 //   - nil when nothing needed to be done (server not offline, detection disabled, or a
 //     clean exit that is configured to not count as a crash) or the restart succeeded;
 //   - a *crashTooFrequent when the previous recorded crash was less than 60 seconds ago;
 //   - any error produced while reading the exit state or starting the environment.
 //
 // @todo output event to server console
 func (s *Server) handleServerCrash() error {
 	// No point in doing anything here if the server isn't currently offline, there
 	// is no reason to do a crash detection event. If the server crash detection is
 	// disabled we want to skip anything after this as well.
 	if s.State != ProcessOfflineState || !s.CrashDetection.Enabled {
 		// Only report the handler as disabled when it actually is disabled; a server
 		// that is simply not offline should not produce this message.
 		if !s.CrashDetection.Enabled {
 			zap.S().Debugw("server triggered crash detection but handler is disabled for server process", zap.String("server", s.Uuid))
 		}
 		return nil
 	}
 	exitCode, oomKilled, err := s.Environment.ExitState()
 	if err != nil {
 		return errors.WithStack(err)
 	}
 	// If the system is not configured to detect a clean exit code as a crash, and the
 	// crash is not the result of the program running out of memory, do nothing.
 	if exitCode == 0 && !oomKilled && !config.Get().System.DetectCleanExitAsCrash {
 		zap.S().Debugw("server exited with successful code; system configured to not detect as crash", zap.String("server", s.Uuid))
 		return nil
 	}
 	c := s.CrashDetection.lastCrash
 	// If the last crash time was within the last 60 seconds we do not want to perform
 	// an automatic reboot of the process. Return an error that can be handled.
 	if !c.IsZero() && c.Add(time.Second * 60).After(time.Now()) {
 		return &crashTooFrequent{}
 	}
 	// Record this crash before restarting so the next invocation can apply the
 	// 60 second threshold against it.
 	s.CrashDetection.lastCrash = time.Now()
 	return s.Environment.Start()
 }
--- a/server/environment.go
+++ b/server/environment.go
@ -40,6 +40,10 @@ type Environment interface {
 	// is not running no error should be returned.
 	Terminate(signal os.Signal) error
 	// Returns the exit state of the process. The first result is the exit code, the second
 	// determines if the process was killed by the system OOM killer.
 	ExitState() (uint32, bool, error)
 	// Creates the necessary environment for running the server process. For example,
 	// in the Docker environment create will create a new container instance for the
 	// server.
--- a/server/environment_docker.go
+++ b/server/environment_docker.go
@ -316,6 +316,17 @@ func (d *DockerEnvironment) Terminate(signal os.Signal) error {
 	)
 }
 // ExitState inspects the container belonging to this environment's server and
 // reports its exit code along with whether the container was killed by the OOM
 // killer. When the inspect call fails the error is returned with a stack trace
 // attached and the other results are zero values.
 func (d *DockerEnvironment) ExitState() (uint32, bool, error) {
 	inspected, inspectErr := d.Client.ContainerInspect(context.Background(), d.Server.Uuid)
 	if inspectErr != nil {
 		return 0, false, errors.WithStack(inspectErr)
 	}
 	state := inspected.State
 	return uint32(state.ExitCode), state.OOMKilled, nil
 }
 // Attaches to the docker container itself and ensures that we can pipe data in and out
 // of the process stream. This should not be used for reading console data as you *will*
 // miss important output at the beginning because of the time delay with attaching to the
--- a/server/errors.go
+++ b/server/errors.go
@ -11,4 +11,17 @@ func IsSuspendedError(err error) bool {
 	_, ok := err.(*suspendedError)
 	return ok
-}
+}
 // crashTooFrequent is the error value used to signal that a server crashed again
 // too soon after its previously detected crash.
 type crashTooFrequent struct{}
 // Error implements the error interface.
 func (e *crashTooFrequent) Error() string {
 	return "server has crashed too soon after the last detected crash"
 }
 // IsTooFrequentCrashError reports whether err has the dynamic type *crashTooFrequent.
 // Wrapped errors are not unwrapped; only the value itself is examined.
 func IsTooFrequentCrashError(err error) bool {
 	switch err.(type) {
 	case *crashTooFrequent:
 		return true
 	default:
 		return false
 	}
 }
--- a/server/server.go
+++ b/server/server.go
@ -39,6 +39,7 @@ type Server struct {
 	// server process.
 	EnvVars map[string]string `json:"environment" yaml:"environment"`
 	CrashDetection CrashDetection `json:"crash_detection" yaml:"crash_detection"`
 	Build          BuildSettings  `json:"build"`
 	Allocations    Allocations    `json:"allocations"`
 	Environment    Environment    `json:"-" yaml:"-"`
@ -287,9 +288,9 @@ func (s *Server) SetState(state string) error {
 	//
 	// We also get the benefit of server status changes always propagating corrected configurations
 	// to the disk should we forget to do it elsewhere.
-	go func (serv *Server) {
+	go func(server *Server) {
-		if _, err := serv.WriteConfigurationToDisk(); err != nil {
+		if _, err := server.WriteConfigurationToDisk(); err != nil {
-			zap.S().Warnw("failed to write server state change to disk", zap.String("server", serv.Uuid), zap.Error(err))
+			zap.S().Warnw("failed to write server state change to disk", zap.String("server", server.Uuid), zap.Error(err))
 		}
 	}(s)
@ -298,9 +299,27 @@ func (s *Server) SetState(state string) error {
 	// Emit the event to any listeners that are currently registered.
 	s.Emit(StatusEvent, s.State)
-	// @todo handle a crash event here. Need to port the logic from the Nodejs daemon
+	// If server was in an online state, and is now in an offline state we should handle
-	// into this daemon. I believe its basically just if state != stopping && newState = stopped
+	// that as a crash event. In that scenario, check the last crash time, and the crash
-	// then crashed.
+	// counter.
 	//
 	// In the event that we have passed the thresholds, don't do anything, otherwise
 	// automatically attempt to start the process back up for the user. This is done in a
 	// separate thread as to not block any actions currently taking place in the flow
 	// that called this function.
 	if (prevState == ProcessStartingState || prevState == ProcessRunningState) && s.State == ProcessOfflineState {
 		zap.S().Infow("detected server as entering a potentially crashed state; running handler", zap.String("server", s.Uuid))
 		go func(server *Server) {
 			if err := server.handleServerCrash(); err != nil {
 				if IsTooFrequentCrashError(err) {
 					zap.S().Infow("did not restart server after crash; occurred too soon after last", zap.String("server", server.Uuid))
 				} else {
 					zap.S().Errorw("failed to handle server crash state", zap.String("server", server.Uuid), zap.Error(err))
 				}
 			}
 		}(s)
 	}
 	return nil
 }