worker: Introduce heartbeats
An occupied worker checks about every 15 seconds if it's current job was cancelled. Use this to introduce a heartbeat mechanism, where if composer hasn't heard from the worker in 2 minutes, the job times out and is set to fail.
This commit is contained in:
parent
0fcb44e617
commit
4385c39d66
6 changed files with 166 additions and 46 deletions
|
|
@ -51,6 +51,7 @@ type fsJobQueue struct {
|
|||
// and renamed to `$STATE_DIRECTORY/artifacts/$JOB_ID` once the job is
|
||||
// reported as done.
|
||||
jobIdByToken map[uuid.UUID]uuid.UUID
|
||||
heartbeats map[uuid.UUID]time.Time // token -> heartbeat
|
||||
}
|
||||
|
||||
// On-disk job struct. Contains all necessary (but non-redundant) information
|
||||
|
|
@ -84,6 +85,7 @@ func New(dir string) (*fsJobQueue, error) {
|
|||
pending: make(map[string]chan uuid.UUID),
|
||||
dependants: make(map[uuid.UUID][]uuid.UUID),
|
||||
jobIdByToken: make(map[uuid.UUID]uuid.UUID),
|
||||
heartbeats: make(map[uuid.UUID]time.Time),
|
||||
}
|
||||
|
||||
// Look for jobs that are still pending and build the dependant map.
|
||||
|
|
@ -112,6 +114,7 @@ func New(dir string) (*fsJobQueue, error) {
|
|||
}
|
||||
} else {
|
||||
q.jobIdByToken[j.Token] = j.Id
|
||||
q.heartbeats[j.Token] = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -226,6 +229,7 @@ func (q *fsJobQueue) Dequeue(ctx context.Context, jobTypes []string) (uuid.UUID,
|
|||
|
||||
j.Token = uuid.New()
|
||||
q.jobIdByToken[j.Token] = j.Id
|
||||
q.heartbeats[j.Token] = time.Now()
|
||||
|
||||
err := q.db.Write(j.Id.String(), j)
|
||||
if err != nil {
|
||||
|
|
@ -259,6 +263,7 @@ func (q *fsJobQueue) FinishJob(id uuid.UUID, result interface{}) error {
|
|||
return fmt.Errorf("error marshaling result: %v", err)
|
||||
}
|
||||
|
||||
delete(q.heartbeats, j.Token)
|
||||
delete(q.jobIdByToken, j.Token)
|
||||
j.Token = uuid.Nil
|
||||
|
||||
|
|
@ -298,6 +303,8 @@ func (q *fsJobQueue) CancelJob(id uuid.UUID) error {
|
|||
|
||||
j.Canceled = true
|
||||
|
||||
delete(q.heartbeats, j.Token)
|
||||
|
||||
err = q.db.Write(id.String(), j)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error writing job %s: %v", id, err)
|
||||
|
|
@ -345,6 +352,28 @@ func (q *fsJobQueue) IdFromToken(token uuid.UUID) (id uuid.UUID, err error) {
|
|||
return id, nil
|
||||
}
|
||||
|
||||
// Retrieve a list of tokens tied to jobs, which most recent action has been
|
||||
// olderThan time ago
|
||||
func (q *fsJobQueue) Heartbeats(olderThan time.Duration) (tokens []uuid.UUID) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
now := time.Now()
|
||||
for token, hb := range q.heartbeats {
|
||||
if now.Sub(hb) > olderThan {
|
||||
tokens = append(tokens, token)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (q *fsJobQueue) RefreshHeartbeat(token uuid.UUID) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
if token != uuid.Nil {
|
||||
q.heartbeats[token] = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
// Reads job with `id`. This is a thin wrapper around `q.db.Read`, which
|
||||
// returns the job directly, or and error if a job with `id` does not exist.
|
||||
func (q *fsJobQueue) readJob(id uuid.UUID) (*job, error) {
|
||||
|
|
|
|||
|
|
@ -338,3 +338,33 @@ func TestCancel(t *testing.T) {
|
|||
err = json.Unmarshal(result, &testResult{})
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
func TestHeartbeats(t *testing.T) {
|
||||
q, dir := newTemporaryQueue(t)
|
||||
defer cleanupTempDir(t, dir)
|
||||
|
||||
id := pushTestJob(t, q, "octopus", nil, nil)
|
||||
// No heartbeats for queued job
|
||||
require.Empty(t, q.Heartbeats(time.Second*0))
|
||||
|
||||
r, tok, _, _, _, err := q.Dequeue(context.Background(), []string{"octopus"})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, id, r)
|
||||
require.NotEmpty(t, tok)
|
||||
|
||||
tokens := q.Heartbeats(time.Second * 0)
|
||||
require.Contains(t, tokens, tok)
|
||||
require.Empty(t, q.Heartbeats(time.Hour*24))
|
||||
|
||||
id2, err := q.IdFromToken(tok)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, id, id2)
|
||||
|
||||
err = q.FinishJob(id, &testResult{})
|
||||
require.NoError(t, err)
|
||||
|
||||
// No heartbeats for finished job
|
||||
require.Empty(t, q.Heartbeats(time.Second*0))
|
||||
_, err = q.IdFromToken(tok)
|
||||
require.Equal(t, jobqueue.ErrNotExist, err)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue