worker: Introduce heartbeats

An occupied worker checks about every 15 seconds whether its current job was
cancelled. Use this to introduce a heartbeat mechanism: if
composer hasn't heard from the worker in 2 minutes, the job times out
and is marked as failed.
This commit is contained in:
sanne 2021-07-05 11:45:16 +02:00 committed by Tom Gundersen
parent 0fcb44e617
commit 4385c39d66
6 changed files with 166 additions and 46 deletions

View file

@ -40,13 +40,14 @@ var ErrInvalidToken = errors.New("token does not exist")
var ErrJobNotRunning = errors.New("job isn't running")
// NewServer builds a Server from the given logger, job queue, artifacts
// directory and identity filter, starts its heartbeat watcher in a background
// goroutine, and returns the server.
func NewServer(logger *log.Logger, jobs jobqueue.JobQueue, artifactsDir string, identityFilter []string) *Server {
// NOTE(review): this text looks like a diff rendered without +/- markers; the
// next line appears to be the removed (pre-change) line, superseded by the
// `s := &Server{` line that follows it — confirm against the real file.
return &Server{
s := &Server{
jobs: jobs,
logger: logger,
artifactsDir: artifactsDir,
identityFilter: identityFilter,
}
// WatchHeartbeats is documented as an endless function, so it is run in its
// own goroutine rather than called inline.
go s.WatchHeartbeats()
return s
}
func (s *Server) Handler() http.Handler {
@ -111,6 +112,23 @@ func (s *Server) VerifyIdentityHeader(nextHandler echo.HandlerFunc) echo.Handler
}
}
// WatchHeartbeats periodically fails jobs whose workers have stopped responding.
//
// It never returns and should be started as a goroutine. Every 30 seconds it
// asks the job queue for the tokens of jobs that have not checked whether
// they were cancelled for more than 2 minutes, and finishes each of them with
// a nil result.
func (s *Server) WatchHeartbeats() {
	//nolint:staticcheck // avoid SA1015, this is an endless function
	for range time.Tick(time.Second * 30) {
		for _, token := range s.jobs.Heartbeats(time.Second * 120) {
			// The id is only needed for the log line. A failed lookup is
			// reported instead of logging a zero-value id, and does not stop
			// the attempt to finish the job below.
			if id, err := s.jobs.IdFromToken(token); err != nil {
				log.Printf("Error looking up id of unresponsive job: %v", err)
			} else {
				log.Printf("Removing unresponsive job: %s\n", id)
			}

			if err := s.FinishJob(token, nil); err != nil {
				log.Printf("Error finishing unresponsive job: %v", err)
			}
		}
	}
}
// EnqueueOSBuild submits job to the job queue under the architecture-specific
// job type "osbuild:<arch>", passing nil as the final Enqueue argument, and
// returns Enqueue's results unchanged.
func (s *Server) EnqueueOSBuild(arch string, job *OSBuildJob) (uuid.UUID, error) {
	jobType := "osbuild:" + arch
	return s.jobs.Enqueue(jobType, job, nil)
}
@ -355,6 +373,8 @@ func (h *apiHandlers) GetJob(ctx echo.Context, tokenstr string) error {
return ctx.JSON(http.StatusOK, getJobResponse{})
}
h.server.jobs.RefreshHeartbeat(token)
status, _, err := h.server.JobStatus(jobId, &json.RawMessage{})
if err != nil {
return err