worker: Introduce heartbeats

An occupied worker checks about every 15 seconds if it's current job was
cancelled. Use this to introduce a heartbeat mechanism, where if
composer hasn't heard from the worker in 2 minutes, the job times out
and is set to fail.
This commit is contained in:
sanne 2021-07-05 11:45:16 +02:00 committed by Tom Gundersen
parent 0fcb44e617
commit 4385c39d66
6 changed files with 166 additions and 46 deletions

View file

@ -51,6 +51,7 @@ type fsJobQueue struct {
// and renamed to `$STATE_DIRECTORY/artifacts/$JOB_ID` once the job is
// reported as done.
jobIdByToken map[uuid.UUID]uuid.UUID
heartbeats map[uuid.UUID]time.Time // token -> heartbeat
}
// On-disk job struct. Contains all necessary (but non-redundant) information
@ -84,6 +85,7 @@ func New(dir string) (*fsJobQueue, error) {
pending: make(map[string]chan uuid.UUID),
dependants: make(map[uuid.UUID][]uuid.UUID),
jobIdByToken: make(map[uuid.UUID]uuid.UUID),
heartbeats: make(map[uuid.UUID]time.Time),
}
// Look for jobs that are still pending and build the dependant map.
@ -112,6 +114,7 @@ func New(dir string) (*fsJobQueue, error) {
}
} else {
q.jobIdByToken[j.Token] = j.Id
q.heartbeats[j.Token] = time.Now()
}
}
@ -226,6 +229,7 @@ func (q *fsJobQueue) Dequeue(ctx context.Context, jobTypes []string) (uuid.UUID,
j.Token = uuid.New()
q.jobIdByToken[j.Token] = j.Id
q.heartbeats[j.Token] = time.Now()
err := q.db.Write(j.Id.String(), j)
if err != nil {
@ -259,6 +263,7 @@ func (q *fsJobQueue) FinishJob(id uuid.UUID, result interface{}) error {
return fmt.Errorf("error marshaling result: %v", err)
}
delete(q.heartbeats, j.Token)
delete(q.jobIdByToken, j.Token)
j.Token = uuid.Nil
@ -298,6 +303,8 @@ func (q *fsJobQueue) CancelJob(id uuid.UUID) error {
j.Canceled = true
delete(q.heartbeats, j.Token)
err = q.db.Write(id.String(), j)
if err != nil {
return fmt.Errorf("error writing job %s: %v", id, err)
@ -345,6 +352,28 @@ func (q *fsJobQueue) IdFromToken(token uuid.UUID) (id uuid.UUID, err error) {
return id, nil
}
// Retrieve a list of tokens tied to jobs, which most recent action has been
// olderThan time ago
func (q *fsJobQueue) Heartbeats(olderThan time.Duration) (tokens []uuid.UUID) {
q.mu.Lock()
defer q.mu.Unlock()
now := time.Now()
for token, hb := range q.heartbeats {
if now.Sub(hb) > olderThan {
tokens = append(tokens, token)
}
}
return
}
func (q *fsJobQueue) RefreshHeartbeat(token uuid.UUID) {
q.mu.Lock()
defer q.mu.Unlock()
if token != uuid.Nil {
q.heartbeats[token] = time.Now()
}
}
// Reads job with `id`. This is a thin wrapper around `q.db.Read`, which
// returns the job directly, or and error if a job with `id` does not exist.
func (q *fsJobQueue) readJob(id uuid.UUID) (*job, error) {

View file

@ -338,3 +338,33 @@ func TestCancel(t *testing.T) {
err = json.Unmarshal(result, &testResult{})
require.NoError(t, err)
}
func TestHeartbeats(t *testing.T) {
q, dir := newTemporaryQueue(t)
defer cleanupTempDir(t, dir)
id := pushTestJob(t, q, "octopus", nil, nil)
// No heartbeats for queued job
require.Empty(t, q.Heartbeats(time.Second*0))
r, tok, _, _, _, err := q.Dequeue(context.Background(), []string{"octopus"})
require.NoError(t, err)
require.Equal(t, id, r)
require.NotEmpty(t, tok)
tokens := q.Heartbeats(time.Second * 0)
require.Contains(t, tokens, tok)
require.Empty(t, q.Heartbeats(time.Hour*24))
id2, err := q.IdFromToken(tok)
require.NoError(t, err)
require.Equal(t, id, id2)
err = q.FinishJob(id, &testResult{})
require.NoError(t, err)
// No heartbeats for finished job
require.Empty(t, q.Heartbeats(time.Second*0))
_, err = q.IdFromToken(tok)
require.Equal(t, jobqueue.ErrNotExist, err)
}