metrics: add worker error metrics

This commit introduces the collection of error
metrics since it is now possible to differentiate
between internal errors and user input errors.
Additionally, the error status is reported for
job duration metrics.
This commit is contained in:
Gianluca Zuccarelli 2022-01-31 11:21:19 +00:00 committed by Tom Gundersen
parent 6c4caec022
commit 290472dfdf
3 changed files with 50 additions and 4 deletions

View file

@ -3,12 +3,22 @@ package prometheus
import (
"time"
"github.com/osbuild/osbuild-composer/internal/worker/clienterrors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
const workerSubsystem = "composer_worker"
var (
// TotalJobs is a counter vector labelled by job "type" and "status".
// FinishJobMetrics increments it once for every job that has a finish time
// and was not canceled, using the job's status code as the "status" label.
TotalJobs = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "total_jobs",
Namespace: namespace,
Subsystem: workerSubsystem,
Help: "Total jobs",
}, []string{"type", "status"})
)
var (
PendingJobs = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "pending_jobs",
@ -34,7 +44,7 @@ var (
Subsystem: workerSubsystem,
Help: "Duration spent by workers on a job.",
Buckets: []float64{.1, .2, .5, 1, 2, 4, 8, 16, 32, 40, 48, 64, 96, 128, 160, 192, 224, 256, 320, 382, 448, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2049},
}, []string{"type"})
}, []string{"type", "status"})
)
var (
@ -68,10 +78,11 @@ func CancelJobMetrics(started time.Time, jobType string) {
}
}
func FinishJobMetrics(started time.Time, finished time.Time, canceled bool, jobType string) {
func FinishJobMetrics(started time.Time, finished time.Time, canceled bool, jobType string, status clienterrors.StatusCode) {
if !finished.IsZero() && !canceled {
diff := finished.Sub(started).Seconds()
JobDuration.WithLabelValues(jobType).Observe(diff)
JobDuration.WithLabelValues(jobType, status.ToString()).Observe(diff)
TotalJobs.WithLabelValues(jobType, status.ToString()).Inc()
RunningJobs.WithLabelValues(jobType).Dec()
}
}

View file

@ -34,6 +34,40 @@ type Error struct {
Details interface{} `json:"details"`
}
// Job status classes returned by GetStatusCode. They are untyped string
// constants, so they can be used both as StatusCode values and as plain
// strings.
// NOTE(review): the values look like HTTP status-class notation — confirm
// that this is the intended meaning.
const (
// JobStatusSuccess is returned when the job reported no error.
JobStatusSuccess = "2xx"
// JobStatusUserInputError is returned for the argument and target error IDs
// listed in GetStatusCode.
JobStatusUserInputError = "4xx"
// JobStatusInternalError is returned for the DNF error IDs and for any
// error ID that GetStatusCode does not list explicitly.
JobStatusInternalError = "5xx"
)
// StatusCode is the coarse outcome class of a job, as produced by
// GetStatusCode (one of the JobStatus* values).
type StatusCode string

// ToString returns the status code as a plain string, e.g. for use as a
// metric label value.
//
// The receiver is a value, not a pointer: StatusCode is a small string type,
// and a value receiver lets the method be called on non-addressable values
// such as a function result or a conversion. Calls through a *StatusCode keep
// working, because value methods are part of the pointer's method set.
func (s StatusCode) ToString() string {
	return string(s)
}
// GetStatusCode maps a job error onto one of the JobStatus* classes.
//
// A nil error yields JobStatusSuccess. The argument and target error IDs
// yield JobStatusUserInputError. Every other ID, including the DNF ones and
// any ID not listed here, yields JobStatusInternalError.
func GetStatusCode(err *Error) StatusCode {
	if err == nil {
		return JobStatusSuccess
	}

	switch err.ID {
	case ErrorDNFDepsolveError, ErrorDNFMarkingError:
		// Spelled out even though the fallback below gives the same class.
		return JobStatusInternalError
	case ErrorNoDynamicArgs,
		ErrorInvalidTargetConfig,
		ErrorSharingTarget,
		ErrorInvalidTarget:
		return JobStatusUserInputError
	}

	// Anything not classified above is treated as an internal error.
	return JobStatusInternalError
}
func WorkerClientError(code ClientErrorCode, reason string, details ...interface{}) *Error {
return &Error{
ID: code,

View file

@ -486,7 +486,8 @@ func (s *Server) FinishJob(token uuid.UUID, result json.RawMessage) error {
if err != nil {
logrus.Errorf("error finding job status: %v", err)
} else {
prometheus.FinishJobMetrics(status.Started, status.Finished, status.Canceled, jobType)
statusCode := clienterrors.GetStatusCode(jobResult.JobError)
prometheus.FinishJobMetrics(status.Started, status.Finished, status.Canceled, jobType, statusCode)
}
// Move artifacts from the temporary location to the final job