worker/server: requeue unresponsive jobs
If a job is unresponsive, the worker has most likely crashed or been shut down, and the in-progress job has been lost. Instead of failing these jobs, requeue them up to two times. Once a job is lost a third time, it fails. This avoids infinite loops. This is implemented by extending FinishJob to RequeueOrFinishJob. It takes a maximum number of requeues as an argument, and if that is 0, it has the same behavior as FinishJob used to have. If the maximum number of requeues has not yet been reached, then the running job is returned to the pending state to be picked up again.
This commit is contained in:
parent
d02f666a4b
commit
626530818d
8 changed files with 216 additions and 61 deletions
|
|
@ -75,7 +75,8 @@ type job struct {
|
|||
FinishedAt time.Time `json:"finished_at,omitempty"`
|
||||
ExpiresAt time.Time `json:"expires_at,omitempty"`
|
||||
|
||||
Canceled bool `json:"canceled,omitempty"`
|
||||
Retries uint64 `json:"retries"`
|
||||
Canceled bool `json:"canceled,omitempty"`
|
||||
}
|
||||
|
||||
// Create a new fsJobQueue object for `dir`. This object must have exclusive
|
||||
|
|
@ -111,7 +112,7 @@ func New(dir string) (*fsJobQueue, error) {
|
|||
if !j.StartedAt.IsZero() && j.FinishedAt.IsZero() && !j.Canceled {
|
||||
// Fail older running jobs which don't have a token stored
|
||||
if j.Token == uuid.Nil {
|
||||
err = q.FinishJob(j.Id, nil)
|
||||
err = q.RequeueOrFinishJob(j.Id, 0, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error finishing job '%s' without a token: %v", j.Id, err)
|
||||
}
|
||||
|
|
@ -274,7 +275,7 @@ func (q *fsJobQueue) DequeueByID(ctx context.Context, id uuid.UUID) (uuid.UUID,
|
|||
return j.Token, j.Dependencies, j.Type, j.Args, nil
|
||||
}
|
||||
|
||||
func (q *fsJobQueue) FinishJob(id uuid.UUID, result interface{}) error {
|
||||
func (q *fsJobQueue) RequeueOrFinishJob(id uuid.UUID, maxRetries uint64, result interface{}) error {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
|
||||
|
|
@ -291,33 +292,57 @@ func (q *fsJobQueue) FinishJob(id uuid.UUID, result interface{}) error {
|
|||
return jobqueue.ErrNotRunning
|
||||
}
|
||||
|
||||
j.FinishedAt = time.Now()
|
||||
|
||||
j.Result, err = json.Marshal(result)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error marshaling result: %v", err)
|
||||
}
|
||||
|
||||
delete(q.heartbeats, j.Token)
|
||||
delete(q.jobIdByToken, j.Token)
|
||||
delete(q.heartbeats, j.Token)
|
||||
|
||||
// Write before notifying dependants, because it will be read again.
|
||||
err = q.db.Write(id.String(), j)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error writing job %s: %v", id, err)
|
||||
}
|
||||
if j.Retries >= maxRetries {
|
||||
j.FinishedAt = time.Now()
|
||||
|
||||
for _, depid := range q.dependants[id] {
|
||||
dep, err := q.readJob(depid)
|
||||
j.Result, err = json.Marshal(result)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("error marshaling result: %v", err)
|
||||
}
|
||||
err = q.maybeEnqueue(dep, false)
|
||||
|
||||
// Write before notifying dependants, because it will be read again.
|
||||
err = q.db.Write(id.String(), j)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("error writing job %s: %v", id, err)
|
||||
}
|
||||
|
||||
for _, depid := range q.dependants[id] {
|
||||
dep, err := q.readJob(depid)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = q.maybeEnqueue(dep, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
delete(q.dependants, id)
|
||||
} else {
|
||||
j.Token = uuid.Nil
|
||||
j.StartedAt = time.Time{}
|
||||
j.Retries += 1
|
||||
|
||||
// Write the job before updating in-memory state, so that the latter
|
||||
// doesn't become corrupt when writing fails.
|
||||
err = q.db.Write(j.Id.String(), j)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot write job: %v", err)
|
||||
}
|
||||
|
||||
// add the job to the list of pending ones
|
||||
q.pending.PushBack(j.Id)
|
||||
|
||||
// notify all listeners in a non-blocking way
|
||||
for c := range q.listeners {
|
||||
select {
|
||||
case c <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
delete(q.dependants, id)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,6 +33,8 @@ func TestJobQueue(t *testing.T, makeJobQueue MakeJobQueue) {
|
|||
t.Run("errors", wrap(testErrors))
|
||||
t.Run("args", wrap(testArgs))
|
||||
t.Run("cancel", wrap(testCancel))
|
||||
t.Run("requeue", wrap(testRequeue))
|
||||
t.Run("requeue-limit", wrap(testRequeueLimit))
|
||||
t.Run("job-types", wrap(testJobTypes))
|
||||
t.Run("dependencies", wrap(testDependencies))
|
||||
t.Run("multiple-workers", wrap(testMultipleWorkers))
|
||||
|
|
@ -61,7 +63,7 @@ func finishNextTestJob(t *testing.T, q jobqueue.JobQueue, jobType string, result
|
|||
require.Equal(t, jobType, typ)
|
||||
require.NotNil(t, args)
|
||||
|
||||
err = q.FinishJob(id, result)
|
||||
err = q.RequeueOrFinishJob(id, 0, result)
|
||||
require.NoError(t, err)
|
||||
|
||||
return id
|
||||
|
|
@ -88,7 +90,7 @@ func testErrors(t *testing.T, q jobqueue.JobQueue) {
|
|||
require.NoError(t, err)
|
||||
require.Equal(t, id, idFromT)
|
||||
|
||||
err = q.FinishJob(id, nil)
|
||||
err = q.RequeueOrFinishJob(id, 0, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Make sure the token gets removed
|
||||
|
|
@ -363,7 +365,7 @@ func testCancel(t *testing.T, q jobqueue.JobQueue) {
|
|||
require.Equal(t, jobType, "clownfish")
|
||||
require.True(t, canceled)
|
||||
require.Nil(t, result)
|
||||
err = q.FinishJob(id, &testResult{})
|
||||
err = q.RequeueOrFinishJob(id, 0, &testResult{})
|
||||
require.Error(t, err)
|
||||
|
||||
// Cancel a running job, which should not dequeue the canceled job from above
|
||||
|
|
@ -383,7 +385,7 @@ func testCancel(t *testing.T, q jobqueue.JobQueue) {
|
|||
require.Equal(t, jobType, "clownfish")
|
||||
require.True(t, canceled)
|
||||
require.Nil(t, result)
|
||||
err = q.FinishJob(id, &testResult{})
|
||||
err = q.RequeueOrFinishJob(id, 0, &testResult{})
|
||||
require.Error(t, err)
|
||||
|
||||
// Cancel a finished job, which is a no-op
|
||||
|
|
@ -396,7 +398,7 @@ func testCancel(t *testing.T, q jobqueue.JobQueue) {
|
|||
require.Empty(t, deps)
|
||||
require.Equal(t, "clownfish", typ)
|
||||
require.Equal(t, json.RawMessage("null"), args)
|
||||
err = q.FinishJob(id, &testResult{})
|
||||
err = q.RequeueOrFinishJob(id, 0, &testResult{})
|
||||
require.NoError(t, err)
|
||||
err = q.CancelJob(id)
|
||||
require.Error(t, err)
|
||||
|
|
@ -409,6 +411,73 @@ func testCancel(t *testing.T, q jobqueue.JobQueue) {
|
|||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
func testRequeue(t *testing.T, q jobqueue.JobQueue) {
|
||||
// Requeue a non-existing job
|
||||
err := q.RequeueOrFinishJob(uuid.New(), 1, nil)
|
||||
require.Error(t, err)
|
||||
|
||||
// Requeue a pending job
|
||||
id := pushTestJob(t, q, "clownfish", nil, nil, "")
|
||||
require.NotEmpty(t, id)
|
||||
err = q.RequeueOrFinishJob(id, 1, nil)
|
||||
require.Error(t, err)
|
||||
|
||||
// Requeue a running job
|
||||
r, tok1, deps, typ, args, err := q.Dequeue(context.Background(), []string{"clownfish"}, []string{""})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, id, r)
|
||||
require.NotEmpty(t, tok1)
|
||||
require.Empty(t, deps)
|
||||
require.Equal(t, "clownfish", typ)
|
||||
require.Equal(t, json.RawMessage("null"), args)
|
||||
err = q.RequeueOrFinishJob(id, 1, nil)
|
||||
require.NoError(t, err)
|
||||
r, tok2, deps, typ, args, err := q.Dequeue(context.Background(), []string{"clownfish"}, []string{""})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, id, r)
|
||||
require.NotEmpty(t, tok2)
|
||||
require.NotEqual(t, tok1, tok2)
|
||||
require.Empty(t, deps)
|
||||
require.Equal(t, "clownfish", typ)
|
||||
require.Equal(t, json.RawMessage("null"), args)
|
||||
jobType, _, result, _, _, _, canceled, _, _, err := q.JobStatus(id)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, jobType, "clownfish")
|
||||
require.False(t, canceled)
|
||||
require.Nil(t, result)
|
||||
err = q.RequeueOrFinishJob(id, 0, &testResult{})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Requeue a finished job
|
||||
err = q.RequeueOrFinishJob(id, 1, nil)
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func testRequeueLimit(t *testing.T, q jobqueue.JobQueue) {
|
||||
// Start a job
|
||||
id := pushTestJob(t, q, "clownfish", nil, nil, "")
|
||||
require.NotEmpty(t, id)
|
||||
_, _, _, _, _, err := q.Dequeue(context.Background(), []string{"clownfish"}, []string{""})
|
||||
require.NoError(t, err)
|
||||
// Requeue once
|
||||
err = q.RequeueOrFinishJob(id, 1, nil)
|
||||
require.NoError(t, err)
|
||||
// Start again
|
||||
_, _, _, _, _, err = q.Dequeue(context.Background(), []string{"clownfish"}, []string{""})
|
||||
require.NoError(t, err)
|
||||
_, _, result, _, _, finished, _, _, _, err := q.JobStatus(id)
|
||||
require.NoError(t, err)
|
||||
require.True(t, finished.IsZero())
|
||||
require.Nil(t, result)
|
||||
// Requeue a second time, this time finishing it
|
||||
err = q.RequeueOrFinishJob(id, 1, &testResult{})
|
||||
require.NoError(t, err)
|
||||
_, _, result, _, _, finished, _, _, _, err = q.JobStatus(id)
|
||||
require.NoError(t, err)
|
||||
require.False(t, finished.IsZero())
|
||||
require.NotNil(t, result)
|
||||
}
|
||||
|
||||
func testHeartbeats(t *testing.T, q jobqueue.JobQueue) {
|
||||
id := pushTestJob(t, q, "octopus", nil, nil, "")
|
||||
// No heartbeats for queued job
|
||||
|
|
@ -434,7 +503,7 @@ func testHeartbeats(t *testing.T, q jobqueue.JobQueue) {
|
|||
require.NoError(t, err)
|
||||
require.Equal(t, id2, id)
|
||||
|
||||
err = q.FinishJob(id, &testResult{})
|
||||
err = q.RequeueOrFinishJob(id, 0, &testResult{})
|
||||
require.NoError(t, err)
|
||||
|
||||
// No heartbeats for finished job
|
||||
|
|
@ -456,7 +525,7 @@ func testDequeueByID(t *testing.T, q jobqueue.JobQueue) {
|
|||
require.Equal(t, "octopus", typ)
|
||||
require.NotNil(t, args)
|
||||
|
||||
err = q.FinishJob(one, nil)
|
||||
err = q.RequeueOrFinishJob(one, 0, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Equal(t, two, finishNextTestJob(t, q, "octopus", testResult{}, nil))
|
||||
|
|
@ -482,7 +551,7 @@ func testDequeueByID(t *testing.T, q jobqueue.JobQueue) {
|
|||
_, _, _, _, err = q.DequeueByID(context.Background(), one)
|
||||
require.Equal(t, jobqueue.ErrNotPending, err)
|
||||
|
||||
err = q.FinishJob(one, nil)
|
||||
err = q.RequeueOrFinishJob(one, 0, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, _, _, _, err = q.DequeueByID(context.Background(), one)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue