internal/worker/server: return an error on depsolve timeout HMS-2989
Fixes a special case: when no worker is available, we hit an internal timeout and cancel the depsolve job together with all of its follow-up jobs, but no error was propagated to the caller.
This commit is contained in:
parent
03e74e77b2
commit
d3e3474fb7
7 changed files with 208 additions and 31 deletions
|
|
@ -94,6 +94,12 @@ const (
|
|||
WHERE id = $1 AND finished_at IS NULL
|
||||
RETURNING type, started_at`
|
||||
|
||||
sqlFailJob = `
|
||||
UPDATE jobs
|
||||
SET token = $2, started_at = now(), finished_at = now(), result = $3
|
||||
WHERE id = $1 AND finished_at IS NULL AND started_at IS NULL AND token IS NULL
|
||||
RETURNING id, type`
|
||||
|
||||
sqlInsertHeartbeat = `
|
||||
INSERT INTO heartbeats(token, id, heartbeat)
|
||||
VALUES ($1, $2, now())`
|
||||
|
|
@ -592,6 +598,32 @@ func (q *DBJobQueue) CancelJob(id uuid.UUID) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (q *DBJobQueue) FailJob(id uuid.UUID, result interface{}) error {
|
||||
conn, err := q.pool.Acquire(context.Background())
|
||||
if err != nil {
|
||||
return fmt.Errorf("error connecting to database: %w", err)
|
||||
}
|
||||
defer conn.Release()
|
||||
|
||||
var jobType string
|
||||
var resultId uuid.UUID
|
||||
dummyToken := uuid.New()
|
||||
err = conn.QueryRow(context.Background(), sqlFailJob, id, dummyToken, result).Scan(&resultId, &jobType)
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return jobqueue.ErrNotRunning
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("error failing job %s: %w", id, err)
|
||||
}
|
||||
if id != resultId {
|
||||
return fmt.Errorf("that should never happen, I wanted to set %s to failed but got %s back from DB", id, resultId)
|
||||
}
|
||||
|
||||
q.logger.Info("Job set to failed", "job_type", jobType, "job_id", id.String())
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (q *DBJobQueue) JobStatus(id uuid.UUID) (jobType string, channel string, result json.RawMessage, queued, started, finished time.Time, canceled bool, deps []uuid.UUID, dependents []uuid.UUID, err error) {
|
||||
conn, err := q.pool.Acquire(context.Background())
|
||||
if err != nil {
|
||||
|
|
|
|||
|
|
@ -59,6 +59,9 @@ type JobQueue interface {
|
|||
// Cancel a job. Does nothing if the job has already finished.
|
||||
CancelJob(id uuid.UUID) error
|
||||
|
||||
// Fail a job that didn't even start (e.g. no worker available)
|
||||
FailJob(id uuid.UUID, result interface{}) error
|
||||
|
||||
// If the job has finished, returns the result as raw JSON.
|
||||
//
|
||||
// Returns the current status of the job, in the form of three times:
|
||||
|
|
@ -114,6 +117,8 @@ var (
|
|||
ErrDequeueTimeout = errors.New("dequeue context timed out or was canceled")
|
||||
ErrActiveJobs = errors.New("worker has active jobs associated with it")
|
||||
ErrWorkerNotExist = errors.New("worker does not exist")
|
||||
ErrRunning = errors.New("job is running, but wasn't expected to be")
|
||||
ErrFinished = errors.New("job is finished, but wasn't expected to be")
|
||||
)
|
||||
|
||||
type Worker struct {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue