worker: Introduce heartbeats
An occupied worker checks about every 15 seconds if its current job was cancelled. Use this to introduce a heartbeat mechanism, where if composer hasn't heard from the worker in 2 minutes, the job times out and is set to fail.
This commit is contained in:
parent
0fcb44e617
commit
4385c39d66
6 changed files with 166 additions and 46 deletions
|
|
@ -40,13 +40,14 @@ var ErrInvalidToken = errors.New("token does not exist")
|
|||
var ErrJobNotRunning = errors.New("job isn't running")
|
||||
|
||||
// NewServer builds a Server from the given logger, job queue, artifacts
// directory and identity filter, and returns a pointer to it.
//
// Before returning, it starts s.WatchHeartbeats in its own goroutine.
// WatchHeartbeats is an endless loop, so that goroutine is never stopped
// by anything visible here.
func NewServer(logger *log.Logger, jobs jobqueue.JobQueue, artifactsDir string, identityFilter []string) *Server {
|
||||
|
||||
return &Server{
|
||||
s := &Server{
|
||||
jobs: jobs,
|
||||
logger: logger,
|
||||
artifactsDir: artifactsDir,
|
||||
identityFilter: identityFilter,
|
||||
}
|
||||
// Start the background watcher that finishes jobs whose worker went silent.
go s.WatchHeartbeats()
|
||||
return s
|
||||
}
|
||||
|
||||
func (s *Server) Handler() http.Handler {
|
||||
|
|
@ -111,6 +112,23 @@ func (s *Server) VerifyIdentityHeader(nextHandler echo.HandlerFunc) echo.Handler
|
|||
}
|
||||
}
|
||||
|
||||
// WatchHeartbeats is meant to be started as a goroutine; it loops forever and never returns.
|
||||
// Every 30 seconds it asks the job queue for the tokens of unresponsive jobs, passing a 2-minute threshold.
|
||||
// Each returned job is finished with a nil result, so a job whose worker has not checked whether it was cancelled for more than 2 minutes is failed.
|
||||
func (s *Server) WatchHeartbeats() {
|
||||
//nolint:staticcheck // avoid SA1015, this is an endless function
|
||||
for range time.Tick(time.Second * 30) {
|
||||
for _, token := range s.jobs.Heartbeats(time.Second * 120) {
|
||||
// The ID is used only for the log line below. The lookup error is
// discarded, so whatever ID came back is logged even on failure.
id, _ := s.jobs.IdFromToken(token)
|
||||
// NOTE(review): this uses the package-level log functions, not s.logger — confirm that is intended.
log.Printf("Removing unresponsive job: %s\n", id)
|
||||
// Finish the job on the worker's behalf, passing nil as the result.
err := s.FinishJob(token, nil)
|
||||
if err != nil {
|
||||
// A failure is only logged; the loop carries on with the next token.
log.Printf("Error finishing unresponsive job: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// EnqueueOSBuild enqueues job on the server's job queue under the job type
// "osbuild:" followed by arch, and returns the queue's (uuid.UUID, error)
// result unchanged.
//
// The third Enqueue argument is always nil (presumably the job's
// dependencies — confirm against jobqueue.JobQueue).
func (s *Server) EnqueueOSBuild(arch string, job *OSBuildJob) (uuid.UUID, error) {
|
||||
return s.jobs.Enqueue("osbuild:"+arch, job, nil)
|
||||
}
|
||||
|
|
@ -355,6 +373,8 @@ func (h *apiHandlers) GetJob(ctx echo.Context, tokenstr string) error {
|
|||
return ctx.JSON(http.StatusOK, getJobResponse{})
|
||||
}
|
||||
|
||||
h.server.jobs.RefreshHeartbeat(token)
|
||||
|
||||
status, _, err := h.server.JobStatus(jobId, &json.RawMessage{})
|
||||
if err != nil {
|
||||
return err
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue