[#389686] Add crash detection logic to daemon
This commit is contained in:
parent
be14811eb4
commit
8c57583ce9
|
@ -82,6 +82,11 @@ type SystemConfiguration struct {
|
|||
// daemon boot. This can take a long time on systems with many servers, or on
|
||||
// systems with servers containing thousands of files.
|
||||
SetPermissionsOnBoot bool `yaml:"set_permissions_on_boot"`
|
||||
|
||||
// Determines if Wings should detect a server that stops with a normal exit code of
|
||||
// "0" as being crashed if the process stopped without any Wings interaction. E.g.
|
||||
// the user did not press the stop button, but the process stopped cleanly.
|
||||
DetectCleanExitAsCrash bool `default:"true" yaml:"detect_clean_exit_as_crash"`
|
||||
}
|
||||
|
||||
// Defines the docker configuration used by the daemon when interacting with
|
||||
|
|
66
server/crash.go
Normal file
66
server/crash.go
Normal file
|
@ -0,0 +1,66 @@
|
|||
package server
|
||||
|
||||
import (
|
||||
"github.com/pkg/errors"
|
||||
"github.com/pterodactyl/wings/config"
|
||||
"go.uber.org/zap"
|
||||
"time"
|
||||
)
|
||||
|
||||
// CrashDetection holds the crash detection setting for a server together with the
// timestamp of the most recent crash that was handled for it.
type CrashDetection struct {
	// Enabled toggles crash handling for the server. When it is false the crash
	// handler returns without inspecting the process exit state or restarting it.
	Enabled bool `default:"true" json:"enabled" yaml:"enabled"`

	// lastCrash is the moment the previous crash was handled. The zero value
	// means no crash has been recorded yet.
	lastCrash time.Time
}
|
||||
|
||||
// Looks at the environment exit state to determine if the process exited cleanly or
|
||||
// if it was the result of an event that we should try to recover from.
|
||||
//
|
||||
// This function assumes it is called under circumstances where a crash is suspected
|
||||
// of occuring. It will not do anything to determine if it was actually a crash, just
|
||||
// look at the exit state and check if it meets the criteria of being called a crash
|
||||
// by Wings.
|
||||
//
|
||||
// If the server is determined to have crashed, the process will be restarted and the
|
||||
// counter for the server will be incremented.
|
||||
//
|
||||
// @todo output event to server console
|
||||
func (s *Server) handleServerCrash() error {
|
||||
// No point in doing anything here if the server isn't currently offline, there
|
||||
// is no reason to do a crash detection event. If the server crash detection is
|
||||
// disabled we want to skip anything after this as well.
|
||||
if s.State != ProcessOfflineState || !s.CrashDetection.Enabled {
|
||||
if s.CrashDetection.Enabled {
|
||||
zap.S().Debugw("server triggered crash detection but handler is disabled for server process", zap.String("server", s.Uuid))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
exitCode, oomKilled, err := s.Environment.ExitState()
|
||||
if err != nil {
|
||||
return errors.WithStack(err)
|
||||
}
|
||||
|
||||
// If the system is not configured to detect a clean exit code as a crash, and the
|
||||
// crash is not the result of the program running out of memory, do nothing.
|
||||
if exitCode == 0 && !oomKilled && !config.Get().System.DetectCleanExitAsCrash {
|
||||
zap.S().Debugw("server exited with successful code; system configured to not detect as crash", zap.String("server", s.Uuid))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
c := s.CrashDetection.lastCrash
|
||||
// If the last crash time was within the last 60 seconds we do not want to perform
|
||||
// an automatic reboot of the process. Return an error that can be handled.
|
||||
if !c.IsZero() && c.Add(time.Second * 60).After(time.Now()) {
|
||||
return &crashTooFrequent{}
|
||||
}
|
||||
|
||||
s.CrashDetection.lastCrash = time.Now()
|
||||
|
||||
return s.Environment.Start()
|
||||
}
|
|
@ -40,6 +40,10 @@ type Environment interface {
|
|||
// is not running no error should be returned.
|
||||
Terminate(signal os.Signal) error
|
||||
|
||||
// Returns the exit state of the process. The first result is the exit code, the second
|
||||
// determines if the process was killed by the system OOM killer.
|
||||
ExitState() (uint32, bool, error)
|
||||
|
||||
// Creates the necessary environment for running the server process. For example,
|
||||
// in the Docker environment create will create a new container instance for the
|
||||
// server.
|
||||
|
|
|
@ -316,6 +316,17 @@ func (d *DockerEnvironment) Terminate(signal os.Signal) error {
|
|||
)
|
||||
}
|
||||
|
||||
// Determine the container exit state and return the exit code and wether or not
|
||||
// the container was killed by the OOM killer.
|
||||
func (d *DockerEnvironment) ExitState() (uint32, bool, error) {
|
||||
c, err := d.Client.ContainerInspect(context.Background(), d.Server.Uuid)
|
||||
if err != nil {
|
||||
return 0, false, errors.WithStack(err)
|
||||
}
|
||||
|
||||
return uint32(c.State.ExitCode), c.State.OOMKilled, nil
|
||||
}
|
||||
|
||||
// Attaches to the docker container itself and ensures that we can pipe data in and out
|
||||
// of the process stream. This should not be used for reading console data as you *will*
|
||||
// miss important output at the beginning because of the time delay with attaching to the
|
||||
|
|
|
@ -12,3 +12,16 @@ func IsSuspendedError(err error) bool {
|
|||
|
||||
return ok
|
||||
}
|
||||
|
||||
// crashTooFrequent is the error reported when a server crashes again too soon after
// its previously detected crash. It carries no data; its type alone identifies it.
type crashTooFrequent struct{}

// Error implements the error interface with a fixed description of the condition.
func (*crashTooFrequent) Error() string {
	return "server has crashed too soon after the last detected crash"
}

// IsTooFrequentCrashError reports whether err is a *crashTooFrequent value. Only the
// dynamic type of err itself is examined; a nil error yields false.
func IsTooFrequentCrashError(err error) bool {
	switch err.(type) {
	case *crashTooFrequent:
		return true
	default:
		return false
	}
}
|
||||
|
|
|
@ -39,6 +39,7 @@ type Server struct {
|
|||
// server process.
|
||||
EnvVars map[string]string `json:"environment" yaml:"environment"`
|
||||
|
||||
CrashDetection CrashDetection `json:"crash_detection" yaml:"crash_detection"`
|
||||
Build BuildSettings `json:"build"`
|
||||
Allocations Allocations `json:"allocations"`
|
||||
Environment Environment `json:"-" yaml:"-"`
|
||||
|
@ -287,9 +288,9 @@ func (s *Server) SetState(state string) error {
|
|||
//
|
||||
// We also get the benefit of server status changes always propagating corrected configurations
|
||||
// to the disk should we forget to do it elsewhere.
|
||||
go func (serv *Server) {
|
||||
if _, err := serv.WriteConfigurationToDisk(); err != nil {
|
||||
zap.S().Warnw("failed to write server state change to disk", zap.String("server", serv.Uuid), zap.Error(err))
|
||||
go func(server *Server) {
|
||||
if _, err := server.WriteConfigurationToDisk(); err != nil {
|
||||
zap.S().Warnw("failed to write server state change to disk", zap.String("server", server.Uuid), zap.Error(err))
|
||||
}
|
||||
}(s)
|
||||
|
||||
|
@ -298,9 +299,27 @@ func (s *Server) SetState(state string) error {
|
|||
// Emit the event to any listeners that are currently registered.
|
||||
s.Emit(StatusEvent, s.State)
|
||||
|
||||
// @todo handle a crash event here. Need to port the logic from the Nodejs daemon
|
||||
// into this daemon. I believe its basically just if state != stopping && newState = stopped
|
||||
// then crashed.
|
||||
// If server was in an online state, and is now in an offline state we should handle
|
||||
// that as a crash event. In that scenario, check the last crash time, and the crash
|
||||
// counter.
|
||||
//
|
||||
// In the event that we have passed the thresholds, don't do anything, otherwise
|
||||
// automatically attempt to start the process back up for the user. This is done in a
|
||||
// separate thread so as not to block any actions currently taking place in the flow
|
||||
// that called this function.
|
||||
if (prevState == ProcessStartingState || prevState == ProcessRunningState) && s.State == ProcessOfflineState {
|
||||
zap.S().Infow("detected server as entering a potentially crashed state; running handler", zap.String("server", s.Uuid))
|
||||
|
||||
go func(server *Server) {
|
||||
if err := server.handleServerCrash(); err != nil {
|
||||
if IsTooFrequentCrashError(err) {
|
||||
zap.S().Infow("did not restart server after crash; occurred too soon after last", zap.String("server", server.Uuid))
|
||||
} else {
|
||||
zap.S().Errorw("failed to handle server crash state", zap.String("server", server.Uuid), zap.Error(err))
|
||||
}
|
||||
}
|
||||
}(s)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user