worker: protect the instance from upgrading
Previously, the instance was vulnerable to an OTA update while processing a request. Because there is no way of retriggering a job in Composer, it is better to avoid this situation. We do this by setting the `protected` flag on the instance while a job is being processed. This way, the AWS scheduler will hopefully not shut down the machine at the wrong time. Main caveats of this solution: * Starvation: If a worker keeps accepting new jobs, then it might not be updated. * Inconsistency: There exists a window between the job acceptance and the protection being set during which the worker can be shut down without having had the time to protect itself.
This commit is contained in:
parent
b6f61d26ea
commit
ab7fe6558a
7 changed files with 20826 additions and 0 deletions
|
|
@ -14,6 +14,11 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/BurntSushi/toml"
|
||||
"github.com/aws/aws-sdk-go/aws"
|
||||
"github.com/aws/aws-sdk-go/aws/awserr"
|
||||
"github.com/aws/aws-sdk-go/aws/ec2metadata"
|
||||
"github.com/aws/aws-sdk-go/aws/session"
|
||||
"github.com/aws/aws-sdk-go/service/autoscaling"
|
||||
"github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/osbuild/osbuild-composer/internal/common"
|
||||
|
|
@ -84,6 +89,64 @@ func WatchJob(ctx context.Context, job worker.Job) {
|
|||
}
|
||||
}
|
||||
|
||||
// protect an AWS instance from scaling and/or terminating.
|
||||
func setProtection(protected bool) {
|
||||
// create a new session
|
||||
awsSession, err := session.NewSession()
|
||||
if err != nil {
|
||||
logrus.Debugf("Error getting an AWS session, %s", err)
|
||||
return
|
||||
}
|
||||
|
||||
// get the identity for the instanceID
|
||||
identity, err := ec2metadata.New(awsSession).GetInstanceIdentityDocument()
|
||||
if err != nil {
|
||||
logrus.Debugf("Error getting the identity document, %s", err)
|
||||
return
|
||||
}
|
||||
|
||||
svc := autoscaling.New(awsSession)
|
||||
|
||||
// get the autoscaling group info for the auto scaling group name
|
||||
asInstanceInput := &autoscaling.DescribeAutoScalingInstancesInput{
|
||||
InstanceIds: []*string{
|
||||
aws.String(identity.InstanceID),
|
||||
},
|
||||
}
|
||||
asInstanceOutput, err := svc.DescribeAutoScalingInstances(asInstanceInput)
|
||||
if err != nil {
|
||||
if aerr, ok := err.(awserr.Error); ok {
|
||||
logrus.Warningf("Error getting the Autoscaling instances: %s %s", aerr.Code(), aerr.Error())
|
||||
} else {
|
||||
logrus.Errorf("Error getting the Autoscaling instances: unknown, %s", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// make the request to protect (or unprotect) the instance
|
||||
input := &autoscaling.SetInstanceProtectionInput{
|
||||
AutoScalingGroupName: asInstanceOutput.AutoScalingInstances[0].AutoScalingGroupName,
|
||||
InstanceIds: []*string{
|
||||
aws.String(identity.InstanceID),
|
||||
},
|
||||
ProtectedFromScaleIn: aws.Bool(protected),
|
||||
}
|
||||
_, err = svc.SetInstanceProtection(input)
|
||||
if err != nil {
|
||||
if aerr, ok := err.(awserr.Error); ok {
|
||||
logrus.Warningf("Error protecting instance: %s %s", aerr.Code(), aerr.Error())
|
||||
} else {
|
||||
logrus.Errorf("Error protecting instance: unknown, %s", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
if protected {
|
||||
logrus.Info("Instance protected")
|
||||
} else {
|
||||
logrus.Info("Instance protection removed")
|
||||
}
|
||||
}
|
||||
|
||||
// Requests and runs 1 job of specified type(s)
|
||||
// Returning an error here will result in the worker backing off for a while and retrying
|
||||
func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls map[string]JobImplementation) error {
|
||||
|
|
@ -104,6 +167,13 @@ func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls
|
|||
return err
|
||||
}
|
||||
|
||||
// Depsolve requests need reactivity, since setting the protection can take up to 6s to time out if the worker isn't
|
||||
// in an AWS env, disable this setting for them.
|
||||
if job.Type() != "depsolve" {
|
||||
setProtection(true)
|
||||
defer setProtection(false)
|
||||
}
|
||||
|
||||
logrus.Infof("Running job '%s' (%s)\n", job.Id(), job.Type())
|
||||
|
||||
ctx, cancelWatcher := context.WithCancel(context.Background())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue