[#389686] Add crash detection logic to daemon

Dane Everitt 2019-11-30 16:43:18 -08:00
parent be14811eb4
commit 8c57583ce9
6 changed files with 125 additions and 7 deletions


@@ -82,6 +82,11 @@ type SystemConfiguration struct {
	// daemon boot. This can take a long time on systems with many servers, or on
	// systems with servers containing thousands of files.
	SetPermissionsOnBoot bool `yaml:"set_permissions_on_boot"`

	// Determines if Wings should detect a server that stops with a normal exit code of
	// "0" as being crashed if the process stopped without any Wings interaction. E.g.
	// the user did not press the stop button, but the process stopped cleanly.
	DetectCleanExitAsCrash bool `default:"true" yaml:"detect_clean_exit_as_crash"`
}

// Defines the docker configuration used by the daemon when interacting with
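For reference, the new field is exposed through its YAML tag in the daemon configuration. A minimal sketch of how the option might be set, assuming SystemConfiguration is rendered under a top-level system block (that nesting is an assumption; only the two keys shown come from the struct tags above):

    system:
      set_permissions_on_boot: true
      # New in this commit: when false, a clean exit code ("0") that happened without
      # any Wings interaction is no longer treated as a crash. Defaults to true.
      detect_clean_exit_as_crash: false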

server/crash.go Normal file

@@ -0,0 +1,66 @@
package server

import (
	"github.com/pkg/errors"
	"github.com/pterodactyl/wings/config"
	"go.uber.org/zap"
	"time"
)

type CrashDetection struct {
	// If set to false, the system will not listen for crash detection events that
	// can indicate that the server stopped unexpectedly.
	Enabled bool `default:"true" json:"enabled" yaml:"enabled"`

	// Tracks the time of the last server crash event.
	lastCrash time.Time
}
// Looks at the environment exit state to determine if the process exited cleanly or
// if it was the result of an event that we should try to recover from.
//
// This function assumes it is called under circumstances where a crash is suspected
// of occurring. It will not do anything to determine if it was actually a crash, just
// look at the exit state and check if it meets the criteria of being called a crash
// by Wings.
//
// If the server is determined to have crashed, the process will be restarted and the
// counter for the server will be incremented.
//
// @todo output event to server console
func (s *Server) handleServerCrash() error {
	// No point in doing anything here if the server isn't currently offline; there
	// is no reason to do a crash detection event. If the server crash detection is
	// disabled we want to skip anything after this as well.
	if s.State != ProcessOfflineState || !s.CrashDetection.Enabled {
		if !s.CrashDetection.Enabled {
			zap.S().Debugw("server triggered crash detection but handler is disabled for server process", zap.String("server", s.Uuid))
		}

		return nil
	}
	exitCode, oomKilled, err := s.Environment.ExitState()
	if err != nil {
		return errors.WithStack(err)
	}

	// If the system is not configured to detect a clean exit code as a crash, and the
	// crash is not the result of the program running out of memory, do nothing.
	if exitCode == 0 && !oomKilled && !config.Get().System.DetectCleanExitAsCrash {
		zap.S().Debugw("server exited with successful code; system configured to not detect as crash", zap.String("server", s.Uuid))

		return nil
	}

	c := s.CrashDetection.lastCrash
	// If the last crash time was within the last 60 seconds we do not want to perform
	// an automatic reboot of the process. Return an error that can be handled.
	if !c.IsZero() && c.Add(time.Second * 60).After(time.Now()) {
		return &crashTooFrequent{}
	}

	s.CrashDetection.lastCrash = time.Now()

	return s.Environment.Start()
}
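Since CrashDetection is embedded in the per-server configuration (see the Server struct change further down, which maps it to the crash_detection key), crash handling can be toggled per server. A rough YAML sketch, with the surrounding server configuration layout assumed; only the crash_detection block and its enabled key are taken from the struct tags:

    # Per-server configuration (surrounding keys are illustrative).
    crash_detection:
      # Set to false to stop Wings from automatically restarting this server after a
      # detected crash. Defaults to true.
      enabled: false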


@@ -40,6 +40,10 @@ type Environment interface {
	// is not running no error should be returned.
	Terminate(signal os.Signal) error

	// Returns the exit state of the process. The first result is the exit code, the second
	// determines if the process was killed by the system OOM killer.
	ExitState() (uint32, bool, error)

	// Creates the necessary environment for running the server process. For example,
	// in the Docker environment create will create a new container instance for the
	// server.


@@ -316,6 +316,17 @@ func (d *DockerEnvironment) Terminate(signal os.Signal) error {
	)
}

// Determines the container exit state and returns the exit code and whether or not
// the container was killed by the OOM killer.
func (d *DockerEnvironment) ExitState() (uint32, bool, error) {
	c, err := d.Client.ContainerInspect(context.Background(), d.Server.Uuid)
	if err != nil {
		return 0, false, errors.WithStack(err)
	}

	return uint32(c.State.ExitCode), c.State.OOMKilled, nil
}

// Attaches to the docker container itself and ensures that we can pipe data in and out
// of the process stream. This should not be used for reading console data as you *will*
// miss important output at the beginning because of the time delay with attaching to the


@@ -12,3 +12,16 @@ func IsSuspendedError(err error) bool {
	return ok
}

type crashTooFrequent struct {
}

func (e *crashTooFrequent) Error() string {
	return "server has crashed too soon after the last detected crash"
}

func IsTooFrequentCrashError(err error) bool {
	_, ok := err.(*crashTooFrequent)
	return ok
}


@@ -39,6 +39,7 @@ type Server struct {
	// server process.
	EnvVars map[string]string `json:"environment" yaml:"environment"`
	CrashDetection CrashDetection `json:"crash_detection" yaml:"crash_detection"`
	Build BuildSettings `json:"build"`
	Allocations Allocations `json:"allocations"`
	Environment Environment `json:"-" yaml:"-"`
@@ -287,9 +288,9 @@ func (s *Server) SetState(state string) error {
	//
	// We also get the benefit of server status changes always propagating corrected configurations
	// to the disk should we forget to do it elsewhere.
-	go func (serv *Server) {
-		if _, err := serv.WriteConfigurationToDisk(); err != nil {
-			zap.S().Warnw("failed to write server state change to disk", zap.String("server", serv.Uuid), zap.Error(err))
+	go func(server *Server) {
+		if _, err := server.WriteConfigurationToDisk(); err != nil {
+			zap.S().Warnw("failed to write server state change to disk", zap.String("server", server.Uuid), zap.Error(err))
		}
	}(s)
@@ -298,9 +299,27 @@ func (s *Server) SetState(state string) error {
	// Emit the event to any listeners that are currently registered.
	s.Emit(StatusEvent, s.State)

-	// @todo handle a crash event here. Need to port the logic from the Nodejs daemon
-	// into this daemon. I believe its basically just if state != stopping && newState = stopped
-	// then crashed.
+	// If the server was in an online state and is now in an offline state, we should
+	// handle that as a crash event. In that scenario, check the last crash time and
+	// the crash counter.
+	//
+	// In the event that we have passed the thresholds, don't do anything; otherwise
+	// automatically attempt to start the process back up for the user. This is done in
+	// a separate thread so as not to block any actions currently taking place in the
+	// flow that called this function.
+	if (prevState == ProcessStartingState || prevState == ProcessRunningState) && s.State == ProcessOfflineState {
+		zap.S().Infow("detected server as entering a potentially crashed state; running handler", zap.String("server", s.Uuid))
+
+		go func(server *Server) {
+			if err := server.handleServerCrash(); err != nil {
+				if IsTooFrequentCrashError(err) {
+					zap.S().Infow("did not restart server after crash; occurred too soon after last", zap.String("server", server.Uuid))
+				} else {
+					zap.S().Errorw("failed to handle server crash state", zap.String("server", server.Uuid), zap.Error(err))
+				}
+			}
+		}(s)
+	}

	return nil
}