worker/server: requeue unresponsive jobs

If a job is unresponsive, the worker has most likely crashed or been shut down, and the in-progress job has been lost. Instead of failing these jobs, requeue them up to two times. Once a job is lost a third time, it fails. This avoids infinite loops. This is implemented by extending FinishJob to RequeueOrFinishJob. It takes a maximum number of requeues as an argument, and if that is 0, it has the same behavior as FinishJob used to have. If the maximum number of requeues has not yet been reached, the running job is returned to the pending state to be picked up again.
2022-03-18 21:39:32 +00:00 · 2022-03-18 21:39:32 +00:00 · 626530818d
commit 626530818d
parent d02f666a4b
8 changed files with 216 additions and 61 deletions
--- a/pkg/jobqueue/jobqueue.go
+++ b/pkg/jobqueue/jobqueue.go
@@ -49,9 +49,12 @@ type JobQueue interface {
 	// can be unmarshaled to the type given in Enqueue().
 	DequeueByID(ctx context.Context, id uuid.UUID) (uuid.UUID, []uuid.UUID, string, json.RawMessage, error)

-	// Mark the job with `id` as finished. `result` must fit the associated
-	// job type and must be serializable to JSON.
-	FinishJob(id uuid.UUID, result interface{}) error
+	// Tries to requeue a running job by its ID
+	//
+	// Returns the given job to the pending state. If the job has reached
+	// the maxRetries number of retries already, finish the job instead.
+	// `result` must fit the associated job type and must be serializable to JSON.
+	RequeueOrFinishJob(id uuid.UUID, maxRetries uint64, result interface{}) error

 	// Cancel a job. Does nothing if the job has already finished.
 	CancelJob(id uuid.UUID) error
@@ -95,6 +98,6 @@ var (
 	ErrNotExist       = errors.New("job does not exist")
 	ErrNotPending     = errors.New("job is not pending")
 	ErrNotRunning     = errors.New("job is not running")
-	ErrCanceled       = errors.New("job ws canceled")
+	ErrCanceled       = errors.New("job was canceled")
 	ErrDequeueTimeout = errors.New("dequeue context timed out or was canceled")
 )