[#389686] Add crash detection logic to daemon
This commit is contained in:
parent
be14811eb4
commit
8c57583ce9
|
@ -82,6 +82,11 @@ type SystemConfiguration struct {
|
||||||
// daemon boot. This can take a long time on systems with many servers, or on
|
// daemon boot. This can take a long time on systems with many servers, or on
|
||||||
// systems with servers containing thousands of files.
|
// systems with servers containing thousands of files.
|
||||||
SetPermissionsOnBoot bool `yaml:"set_permissions_on_boot"`
|
SetPermissionsOnBoot bool `yaml:"set_permissions_on_boot"`
|
||||||
|
|
||||||
|
// Determines if Wings should detect a server that stops with a normal exit code of
|
||||||
|
// "0" as being crashed if the process stopped without any Wings interaction. E.g.
|
||||||
|
// the user did not press the stop button, but the process stopped cleanly.
|
||||||
|
DetectCleanExitAsCrash bool `default:"true" yaml:"detect_clean_exit_as_crash"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Defines the docker configuration used by the daemon when interacting with
|
// Defines the docker configuration used by the daemon when interacting with
|
||||||
|
|
66
server/crash.go
Normal file
66
server/crash.go
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
package server
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/pkg/errors"
|
||||||
|
"github.com/pterodactyl/wings/config"
|
||||||
|
"go.uber.org/zap"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CrashDetection holds the per-server crash detection setting together with the
// time of the most recent crash event recorded for that server.
type CrashDetection struct {
|
||||||
|
// If set to false, the system will not listen for crash detection events that
|
||||||
|
// can indicate that the server stopped unexpectedly.
|
||||||
|
Enabled bool `default:"true" json:"enabled" yaml:"enabled"`
|
||||||
|
|
||||||
|
// Tracks the time of the last server crash event. The zero value means that no
// crash has been recorded yet.
|
||||||
|
lastCrash time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
// Looks at the environment exit state to determine if the process exited cleanly or
|
||||||
|
// if it was the result of an event that we should try to recover from.
|
||||||
|
//
|
||||||
|
// This function assumes it is called under circumstances where a crash is suspected
|
||||||
|
// of occuring. It will not do anything to determine if it was actually a crash, just
|
||||||
|
// look at the exit state and check if it meets the criteria of being called a crash
|
||||||
|
// by Wings.
|
||||||
|
//
|
||||||
|
// If the server is determined to have crashed, the process will be restarted and the
|
||||||
|
// counter for the server will be incremented.
|
||||||
|
//
|
||||||
|
// @todo output event to server console
|
||||||
|
func (s *Server) handleServerCrash() error {
|
||||||
|
// No point in doing anything here if the server isn't currently offline, there
|
||||||
|
// is no reason to do a crash detection event. If the server crash detection is
|
||||||
|
// disabled we want to skip anything after this as well.
|
||||||
|
if s.State != ProcessOfflineState || !s.CrashDetection.Enabled {
|
||||||
|
if s.CrashDetection.Enabled {
|
||||||
|
zap.S().Debugw("server triggered crash detection but handler is disabled for server process", zap.String("server", s.Uuid))
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
exitCode, oomKilled, err := s.Environment.ExitState()
|
||||||
|
if err != nil {
|
||||||
|
return errors.WithStack(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the system is not configured to detect a clean exit code as a crash, and the
|
||||||
|
// crash is not the result of the program running out of memory, do nothing.
|
||||||
|
if exitCode == 0 && !oomKilled && !config.Get().System.DetectCleanExitAsCrash {
|
||||||
|
zap.S().Debugw("server exited with successful code; system configured to not detect as crash", zap.String("server", s.Uuid))
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
c := s.CrashDetection.lastCrash
|
||||||
|
// If the last crash time was within the last 60 seconds we do not want to perform
|
||||||
|
// an automatic reboot of the process. Return an error that can be handled.
|
||||||
|
if !c.IsZero() && c.Add(time.Second * 60).After(time.Now()) {
|
||||||
|
return &crashTooFrequent{}
|
||||||
|
}
|
||||||
|
|
||||||
|
s.CrashDetection.lastCrash = time.Now()
|
||||||
|
|
||||||
|
return s.Environment.Start()
|
||||||
|
}
|
|
@ -40,6 +40,10 @@ type Environment interface {
|
||||||
// is not running no error should be returned.
|
// is not running no error should be returned.
|
||||||
Terminate(signal os.Signal) error
|
Terminate(signal os.Signal) error
|
||||||
|
|
||||||
|
// Returns the exit state of the process. The first result is the exit code, the second
|
||||||
|
// determines if the process was killed by the system OOM killer.
|
||||||
|
ExitState() (uint32, bool, error)
|
||||||
|
|
||||||
// Creates the necessary environment for running the server process. For example,
|
// Creates the necessary environment for running the server process. For example,
|
||||||
// in the Docker environment create will create a new container instance for the
|
// in the Docker environment create will create a new container instance for the
|
||||||
// server.
|
// server.
|
||||||
|
|
|
@ -316,6 +316,17 @@ func (d *DockerEnvironment) Terminate(signal os.Signal) error {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Determine the container exit state and return the exit code and wether or not
|
||||||
|
// the container was killed by the OOM killer.
|
||||||
|
func (d *DockerEnvironment) ExitState() (uint32, bool, error) {
|
||||||
|
c, err := d.Client.ContainerInspect(context.Background(), d.Server.Uuid)
|
||||||
|
if err != nil {
|
||||||
|
return 0, false, errors.WithStack(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return uint32(c.State.ExitCode), c.State.OOMKilled, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Attaches to the docker container itself and ensures that we can pipe data in and out
|
// Attaches to the docker container itself and ensures that we can pipe data in and out
|
||||||
// of the process stream. This should not be used for reading console data as you *will*
|
// of the process stream. This should not be used for reading console data as you *will*
|
||||||
// miss important output at the beginning because of the time delay with attaching to the
|
// miss important output at the beginning because of the time delay with attaching to the
|
||||||
|
|
|
@ -12,3 +12,16 @@ func IsSuspendedError(err error) bool {
|
||||||
|
|
||||||
return ok
|
return ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// crashTooFrequent is the error returned when a server crashes again too soon after
// the previously recorded crash, in which case no automatic restart is attempted.
type crashTooFrequent struct {
}

// Error implements the error interface.
func (e *crashTooFrequent) Error() string {
	return "server has crashed too soon after the last detected crash"
}

// IsTooFrequentCrashError reports whether err is, or wraps, a *crashTooFrequent
// error. Wrapped errors are followed through either an Unwrap() or a Cause()
// method so that the check keeps working when a caller has attached context or a
// stack trace to the error. A nil error yields false.
func IsTooFrequentCrashError(err error) bool {
	for err != nil {
		if _, ok := err.(*crashTooFrequent); ok {
			return true
		}

		// Step one level down the chain, or stop if there is nothing left to unwrap.
		switch wrapped := err.(type) {
		case interface{ Unwrap() error }:
			err = wrapped.Unwrap()
		case interface{ Cause() error }:
			err = wrapped.Cause()
		default:
			return false
		}
	}

	return false
}
|
||||||
|
|
|
@ -39,6 +39,7 @@ type Server struct {
|
||||||
// server process.
|
// server process.
|
||||||
EnvVars map[string]string `json:"environment" yaml:"environment"`
|
EnvVars map[string]string `json:"environment" yaml:"environment"`
|
||||||
|
|
||||||
|
CrashDetection CrashDetection `json:"crash_detection" yaml:"crash_detection"`
|
||||||
Build BuildSettings `json:"build"`
|
Build BuildSettings `json:"build"`
|
||||||
Allocations Allocations `json:"allocations"`
|
Allocations Allocations `json:"allocations"`
|
||||||
Environment Environment `json:"-" yaml:"-"`
|
Environment Environment `json:"-" yaml:"-"`
|
||||||
|
@ -287,9 +288,9 @@ func (s *Server) SetState(state string) error {
|
||||||
//
|
//
|
||||||
// We also get the benefit of server status changes always propagating corrected configurations
|
// We also get the benefit of server status changes always propagating corrected configurations
|
||||||
// to the disk should we forget to do it elsewhere.
|
// to the disk should we forget to do it elsewhere.
|
||||||
go func (serv *Server) {
|
go func(server *Server) {
|
||||||
if _, err := serv.WriteConfigurationToDisk(); err != nil {
|
if _, err := server.WriteConfigurationToDisk(); err != nil {
|
||||||
zap.S().Warnw("failed to write server state change to disk", zap.String("server", serv.Uuid), zap.Error(err))
|
zap.S().Warnw("failed to write server state change to disk", zap.String("server", server.Uuid), zap.Error(err))
|
||||||
}
|
}
|
||||||
}(s)
|
}(s)
|
||||||
|
|
||||||
|
@ -298,9 +299,27 @@ func (s *Server) SetState(state string) error {
|
||||||
// Emit the event to any listeners that are currently registered.
|
// Emit the event to any listeners that are currently registered.
|
||||||
s.Emit(StatusEvent, s.State)
|
s.Emit(StatusEvent, s.State)
|
||||||
|
|
||||||
// @todo handle a crash event here. Need to port the logic from the Nodejs daemon
|
// If server was in an online state, and is now in an offline state we should handle
|
||||||
// into this daemon. I believe its basically just if state != stopping && newState = stopped
|
// that as a crash event. In that scenario, check the last crash time, and the crash
|
||||||
// then crashed.
|
// counter.
|
||||||
|
//
|
||||||
|
// In the event that we have passed the thresholds, don't do anything, otherwise
|
||||||
|
// automatically attempt to start the process back up for the user. This is done in a
|
||||||
|
// separate thread as to not block any actions currently taking place in the flow
|
||||||
|
// that called this function.
|
||||||
|
if (prevState == ProcessStartingState || prevState == ProcessRunningState) && s.State == ProcessOfflineState {
|
||||||
|
zap.S().Infow("detected server as entering a potentially crashed state; running handler", zap.String("server", s.Uuid))
|
||||||
|
|
||||||
|
go func(server *Server) {
|
||||||
|
if err := server.handleServerCrash(); err != nil {
|
||||||
|
if IsTooFrequentCrashError(err) {
|
||||||
|
zap.S().Infow("did not restart server after crash; occurred too soon after last", zap.String("server", server.Uuid))
|
||||||
|
} else {
|
||||||
|
zap.S().Errorw("failed to handle server crash state", zap.String("server", server.Uuid), zap.Error(err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}(s)
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user