internal/worker/server: return an error on depsolve timeout HMS-2989
Fixes the special case where, if no worker is available, an internal timeout fires and cancels the depsolve job together with all follow-up jobs, but no error was propagated to the caller.
parent 03e74e77b2
commit d3e3474fb7

7 changed files with 208 additions and 31 deletions
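The shape of the fix, in brief: serializeManifest waits for its depsolve dependency under a context.WithTimeout; previously, when that context expired the goroutine simply returned, leaving the depsolve and all follow-up jobs cancelled with no recorded reason. The hunks below attach a clienterrors payload to the timeout and push it into every dependent job. A minimal, runnable sketch of that pattern — failDependents is an illustrative stand-in for the real workers.SetFailed, and the timeout is shortened so the example terminates quickly:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// failDependents stands in for worker.Server.SetFailed from this commit:
// it records a failure reason for every job that can no longer run.
func failDependents(jobs []string, reason error) {
	for _, name := range jobs {
		fmt.Printf("job %q set to failed: %v\n", name, reason)
	}
}

func main() {
	// The real code uses a 5-minute timeout; shortened here for the example.
	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer cancel()

	jobs := []string{"depsolve", "containerResolve", "ostreeResolve", "manifest", "osbuild"}

	for {
		select {
		case <-ctx.Done():
			// Old behaviour: return silently, leaving the jobs dangling.
			// New behaviour: propagate an explicit error to all of them.
			failDependents(jobs, errors.New("timeout while waiting for package dependency resolution"))
			return
		case <-time.After(50 * time.Millisecond):
			// Poll again; the real code calls workers.RequestJobById here.
		}
	}
}

The real code additionally distinguishes whether a worker ever dequeued the manifest job; see the note on the token sentinel after the serializeManifest hunks.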
@@ -3,6 +3,7 @@ package v2
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"net/http"
 	"strings"
@@ -263,7 +264,7 @@ func (s *Server) enqueueCompose(irs []imageRequest, channel string) (uuid.UUID,
 
 	s.goroutinesGroup.Add(1)
 	go func() {
-		serializeManifest(s.goroutinesCtx, manifestSource, s.workers, depsolveJobID, containerResolveJobID, ostreeResolveJobID, manifestJobID, ir.manifestSeed)
+		serializeManifest(s.goroutinesCtx, manifestSource, s.workers, depsolveJobID, containerResolveJobID, ostreeResolveJobID, manifestJobID, id, ir.manifestSeed)
 		defer s.goroutinesGroup.Done()
 	}()
@@ -423,7 +424,7 @@ func (s *Server) enqueueKojiCompose(taskID uint64, server, name, version, releas
 		// copy the image request while passing it into the goroutine to prevent data races
 		s.goroutinesGroup.Add(1)
 		go func(ir imageRequest) {
-			serializeManifest(s.goroutinesCtx, manifestSource, s.workers, depsolveJobID, containerResolveJobID, ostreeResolveJobID, manifestJobID, ir.manifestSeed)
+			serializeManifest(s.goroutinesCtx, manifestSource, s.workers, depsolveJobID, containerResolveJobID, ostreeResolveJobID, manifestJobID, buildID, ir.manifestSeed)
 			defer s.goroutinesGroup.Done()
 		}(ir)
 	}
@@ -444,23 +445,80 @@ func (s *Server) enqueueKojiCompose(taskID uint64, server, name, version, releas
 	return id, nil
 }
 
-func serializeManifest(ctx context.Context, manifestSource *manifest.Manifest, workers *worker.Server, depsolveJobID, containerResolveJobID, ostreeResolveJobID, manifestJobID uuid.UUID, seed int64) {
-	ctx, cancel := context.WithTimeout(ctx, time.Minute*5)
+func serializeManifest(ctx context.Context, manifestSource *manifest.Manifest, workers *worker.Server, depsolveJobID, containerResolveJobID, ostreeResolveJobID, manifestJobID, osbuildJobID uuid.UUID, seed int64) {
+	// prepared to become a config variable
+	const depsolveTimeout = 5
+	ctx, cancel := context.WithTimeout(ctx, time.Minute*depsolveTimeout)
 	defer cancel()
 
-	// wait until job is in a pending state
-	var token uuid.UUID
+	jobResult := &worker.ManifestJobByIDResult{
+		Manifest: nil,
+		ManifestInfo: worker.ManifestInfo{
+			OSBuildComposerVersion: common.BuildVersion(),
+		},
+	}
+
 	var dynArgs []json.RawMessage
 	var err error
+	token := uuid.Nil
 	logWithId := logrus.WithField("jobId", manifestJobID)
+
+	defer func() {
+		// token == uuid.Nil indicates that no worker even started processing
+		if token == uuid.Nil {
+			if jobResult.JobError != nil {
+				// set all jobs to "failed"
+				jobs := map[string]uuid.UUID{
+					"depsolve":         depsolveJobID,
+					"containerResolve": containerResolveJobID,
+					"ostreeResolve":    ostreeResolveJobID,
+					"manifest":         manifestJobID,
+					"osbuild":          osbuildJobID,
+				}
+
+				for jobName, jobID := range jobs {
+					if jobID != uuid.Nil {
+						err := workers.SetFailed(jobID, jobResult.JobError)
+						if err != nil {
+							logWithId.Errorf("Error failing %s job: %v", jobName, err)
+						}
+					}
+				}
+
+			} else {
+				logWithId.Errorf("Internal error, no worker started depsolve but we didn't get a reason.")
+			}
+		} else {
+			result, err := json.Marshal(jobResult)
+			if err != nil {
+				logWithId.Errorf("Error marshalling manifest job results: %v", err)
+			}
+			err = workers.FinishJob(token, result)
+			if err != nil {
+				logWithId.Errorf("Error finishing manifest job: %v", err)
+			}
+			if jobResult.JobError != nil {
+				logWithId.Errorf("Error in manifest job %v: %v", jobResult.JobError.Reason, err)
+			}
+		}
+	}()
+
+	// wait until job is in a pending state
 	for {
 		_, token, _, _, dynArgs, err = workers.RequestJobById(ctx, "", manifestJobID)
-		if err == jobqueue.ErrNotPending {
+		if errors.Is(err, jobqueue.ErrNotPending) {
 			logWithId.Debug("Manifest job not pending, waiting for depsolve job to finish")
 			time.Sleep(time.Millisecond * 50)
 			select {
 			case <-ctx.Done():
-				logWithId.Warning("Manifest job dependencies took longer than 5 minutes to finish, or the server is shutting down, returning to avoid dangling routines")
+				logWithId.Warning(fmt.Sprintf("Manifest job dependencies took longer than %d minutes to finish,"+
+					" or the server is shutting down, returning to avoid dangling routines", depsolveTimeout))
+
+				jobResult.JobError = clienterrors.New(clienterrors.ErrorDepsolveTimeout,
+					"Timeout while waiting for package dependency resolution",
+					"There may be a temporary issue with compute resources. "+
+						"We’re looking into it, please try again later.",
+				)
 				break
 			default:
 				continue
@@ -473,13 +531,6 @@ func serializeManifest(ctx context.Context, manifestSource *manifest.Manifest, w
 		break
 	}
 
-	jobResult := &worker.ManifestJobByIDResult{
-		Manifest: nil,
-		ManifestInfo: worker.ManifestInfo{
-			OSBuildComposerVersion: common.BuildVersion(),
-		},
-	}
-
 	// add osbuild/images dependency info to job result
 	osbuildImagesDep, err := common.GetDepModuleInfoByPath(common.OSBuildImagesModulePath)
 	if err != nil {
@@ -491,22 +542,6 @@ func serializeManifest(ctx context.Context, manifestSource *manifest.Manifest, w
 		jobResult.ManifestInfo.OSBuildComposerDeps = append(jobResult.ManifestInfo.OSBuildComposerDeps, osbuildImagesDepModule)
 	}
 
-	defer func() {
-		if jobResult.JobError != nil {
-			logWithId.Errorf("Error in manifest job %v: %v", jobResult.JobError.Reason, err)
-		}
-
-		result, err := json.Marshal(jobResult)
-		if err != nil {
-			logWithId.Errorf("Error marshalling manifest job results: %v", err)
-		}
-
-		err = workers.FinishJob(token, result)
-		if err != nil {
-			logWithId.Errorf("Error finishing manifest job: %v", err)
-		}
-	}()
-
 	if len(dynArgs) == 0 {
 		reason := "No dynamic arguments"
 		jobResult.JobError = clienterrors.New(clienterrors.ErrorNoDynamicArgs, reason, nil)
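One detail worth calling out in the hunks above: token starts as uuid.Nil and is only overwritten once workers.RequestJobById hands the manifest job to this goroutine. The deferred handler uses that to distinguish "no worker ever started" (fail the whole chain via SetFailed) from "a worker ran it" (report the result via FinishJob). A runnable toy illustration of the sentinel, assuming the github.com/google/uuid module the repository already uses:

package main

import (
	"fmt"

	"github.com/google/uuid"
)

// run mimics serializeManifest's cleanup decision: uuid.Nil in the
// deferred handler means no worker ever dequeued the job.
func run(pickedUp bool) {
	token := uuid.Nil
	defer func() {
		if token == uuid.Nil {
			fmt.Println("no worker started the job: fail the whole job chain")
		} else {
			fmt.Println("a worker ran the job: report the result via FinishJob")
		}
	}()

	if pickedUp {
		token = uuid.New() // the real code receives this from RequestJobById
	}
}

func main() {
	run(false) // timeout/shutdown path
	run(true)  // normal path
}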
@@ -403,6 +403,40 @@ func (q *fsJobQueue) CancelJob(id uuid.UUID) error {
 	return nil
 }
 
+func (q *fsJobQueue) FailJob(id uuid.UUID, result interface{}) error {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	j, err := q.readJob(id)
+	if err != nil {
+		return err
+	}
+
+	if !j.FinishedAt.IsZero() {
+		return jobqueue.ErrFinished
+	}
+
+	if !j.StartedAt.IsZero() {
+		return jobqueue.ErrRunning
+	}
+
+	j.Result, err = json.Marshal(result)
+	if err != nil {
+		return err
+	}
+
+	j.StartedAt = time.Now()
+	j.FinishedAt = time.Now()
+	j.Token = uuid.New()
+
+	err = q.db.Write(id.String(), j)
+	if err != nil {
+		return fmt.Errorf("error writing job %s: %v", id, err)
+	}
+
+	return nil
+}
+
 func (q *fsJobQueue) JobStatus(id uuid.UUID) (jobType string, channel string, result json.RawMessage, queued, started, finished time.Time, canceled bool, deps []uuid.UUID, dependents []uuid.UUID, err error) {
 	j, err := q.readJob(id)
 	if err != nil {
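FailJob only transitions jobs that are still pending: a job that already finished returns jobqueue.ErrFinished, a running one jobqueue.ErrRunning, and a failed job gets synthetic start/finish timestamps plus a fresh token so it reads like a normally completed (failed) job in JobStatus. A hedged usage sketch against the repository's packages; the failPending helper and its error handling are illustrative, not part of the commit, and the internal/jobqueue import path is assumed from the file layout:

package example

import (
	"encoding/json"
	"errors"
	"log"

	"github.com/google/uuid"

	"github.com/osbuild/osbuild-composer/internal/jobqueue"
	"github.com/osbuild/osbuild-composer/internal/worker"
	"github.com/osbuild/osbuild-composer/internal/worker/clienterrors"
)

// failPending marks a still-pending job as failed with a structured reason.
func failPending(q jobqueue.JobQueue, id uuid.UUID) error {
	reason, err := json.Marshal(worker.JobResult{
		JobError: clienterrors.New(clienterrors.ErrorDepsolveTimeout,
			"Timeout while waiting for package dependency resolution",
			"There may be a temporary issue with compute resources.",
		),
	})
	if err != nil {
		return err
	}

	err = q.FailJob(id, reason)
	if errors.Is(err, jobqueue.ErrRunning) || errors.Is(err, jobqueue.ErrFinished) {
		// The job progressed past "pending" before we could fail it;
		// a worker will report its result through the normal path.
		log.Printf("job %s already picked up: %v", id, err)
		return nil
	}
	return err
}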
@@ -7,6 +7,8 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"github.com/osbuild/osbuild-composer/internal/worker"
+	"github.com/osbuild/osbuild-composer/internal/worker/clienterrors"
 	"os"
 	"sync"
 	"testing"

@@ -55,6 +57,7 @@ func TestJobQueue(t *testing.T, makeJobQueue MakeJobQueue) {
 	t.Run("multiple-channels", wrap(testMultipleChannels))
 	t.Run("100-dequeuers", wrap(test100dequeuers))
 	t.Run("workers", wrap(testWorkers))
+	t.Run("fail", wrap(testFail))
 }
 
 func pushTestJob(t *testing.T, q jobqueue.JobQueue, jobType string, args interface{}, dependencies []uuid.UUID, channel string) uuid.UUID {
@@ -762,3 +765,56 @@ func testWorkers(t *testing.T, q jobqueue.JobQueue) {
 	err = q.DeleteWorker(w2)
 	require.NoError(t, err)
 }
+
+func testFail(t *testing.T, q jobqueue.JobQueue) {
+	startTime := time.Now()
+
+	FailedJobErrorResult := worker.JobResult{
+		JobError: clienterrors.New(clienterrors.ErrorDepsolveTimeout,
+			"Test timeout reason",
+			"Test timeout details",
+		),
+	}
+
+	testReason, err := json.Marshal(FailedJobErrorResult)
+	require.NoError(t, err)
+
+	// set a non-existing job to failed
+	err = q.FailJob(uuid.New(), testReason)
+	require.Error(t, err)
+
+	// Cancel a pending job
+	id := pushTestJob(t, q, "coralreef", nil, nil, "testchannel")
+	require.NotEmpty(t, id)
+
+	err = q.FailJob(id, testReason)
+	require.NoError(t, err)
+
+	//nolint:golint,ineffassign
+	jobType, channel, result, queued, started, finished, canceled, _, _, err := q.JobStatus(id)
+	require.NoError(t, err)
+
+	endTime := time.Now()
+
+	type JobResult struct {
+		JobError *clienterrors.Error `json:"job_error"`
+	}
+	var r1 JobResult
+	err = json.Unmarshal(result, &r1)
+	require.NoError(t, err)
+
+	require.NotNil(t, r1)
+	require.Equal(t, "Test timeout reason", r1.JobError.Reason)
+	require.Equal(t, "Test timeout details", r1.JobError.Details)
+	require.Equal(t, clienterrors.ErrorDepsolveTimeout, r1.JobError.ID)
+	require.Equal(t, "testchannel", channel)
+	require.Equal(t, "coralreef", jobType)
+	require.Equal(t, false, canceled)
+
+	allTimings := []time.Time{queued, started, finished}
+
+	for _, tmr := range allTimings {
+		require.Less(t, startTime, tmr)
+		require.Greater(t, endTime, tmr)
+	}
+
+}
@@ -44,6 +44,7 @@ const (
 	ErrorJobPanicked          ClientErrorCode = 37
 	ErrorGeneratingSignedURL  ClientErrorCode = 38
 	ErrorInvalidRepositoryURL ClientErrorCode = 39
+	ErrorDepsolveTimeout      ClientErrorCode = 40
 )
 
 type ClientErrorCode int
@@ -566,6 +566,20 @@ func (s *Server) Cancel(id uuid.UUID) error {
 	return s.jobs.CancelJob(id)
 }
 
+// SetFailed sets the given job id to "failed" with the given error
+func (s *Server) SetFailed(id uuid.UUID, error *clienterrors.Error) error {
+	FailedJobErrorResult := JobResult{
+		JobError: error,
+	}
+
+	res, err := json.Marshal(FailedJobErrorResult)
+	if err != nil {
+		logrus.Errorf("error marshalling the error: %v", err)
+		return nil
+	}
+	return s.jobs.FailJob(id, res)
+}
+
 // Provides access to artifacts of a job. Returns an io.Reader for the artifact
 // and the artifact's size.
 func (s *Server) JobArtifact(id uuid.UUID, name string) (io.Reader, int64, error) {
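Note the design choice above: SetFailed wraps the client error in a JobResult before delegating to the queue's FailJob, so a failure injected by the server is stored in the same JSON shape a worker would report. A self-contained sketch of decoding that shape from a JobStatus result, assuming the JSON tags ("job_error" on JobResult, as in the test above; "id"/"reason" on clienterrors.Error) hold:

package main

import (
	"encoding/json"
	"fmt"
)

// clientError mirrors (as an assumption) the wire shape of
// clienterrors.Error; only the fields needed here are listed.
type clientError struct {
	ID     int    `json:"id"`
	Reason string `json:"reason"`
}

// jobResult mirrors worker.JobResult's "job_error" field used in the test.
type jobResult struct {
	JobError *clientError `json:"job_error"`
}

func main() {
	// What SetFailed would store for a depsolve timeout (ErrorDepsolveTimeout = 40).
	raw := []byte(`{"job_error":{"id":40,"reason":"Timeout while waiting for package dependency resolution"}}`)

	var r jobResult
	if err := json.Unmarshal(raw, &r); err != nil {
		panic(err)
	}
	fmt.Println(r.JobError.ID, r.JobError.Reason)
}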