worker: protect the instance from upgrading

Before the instance was vulnerable to an OTA update while processing a
request. Because there is no way of retriggering a job in Composer, it
is better to avoid this situation.
The way we are doing it is by setting the `protected` flag onto the
instance when a job is being processed. This way the AWS scheduler
does hopefully not shutdown the machine at the wrong time.

Main caveats of this solution:
* Starvation: If a worker keeps accepting new jobs, then it might not be
  updated.
* Inconsistency: There exist a window between the job acceptation and the
  protection where the worker can be shutdown without having the time to
  protect itself.
This commit is contained in:
Thomas Lavocat 2022-04-20 10:55:14 +02:00 committed by Sanne Raymaekers
parent b6f61d26ea
commit ab7fe6558a
7 changed files with 20826 additions and 0 deletions

View file

@ -14,6 +14,11 @@ import (
"time"
"github.com/BurntSushi/toml"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/ec2metadata"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/sirupsen/logrus"
"github.com/osbuild/osbuild-composer/internal/common"
@ -84,6 +89,64 @@ func WatchJob(ctx context.Context, job worker.Job) {
}
}
// protect an AWS instance from scaling and/or terminating.
func setProtection(protected bool) {
// create a new session
awsSession, err := session.NewSession()
if err != nil {
logrus.Debugf("Error getting an AWS session, %s", err)
return
}
// get the identity for the instanceID
identity, err := ec2metadata.New(awsSession).GetInstanceIdentityDocument()
if err != nil {
logrus.Debugf("Error getting the identity document, %s", err)
return
}
svc := autoscaling.New(awsSession)
// get the autoscaling group info for the auto scaling group name
asInstanceInput := &autoscaling.DescribeAutoScalingInstancesInput{
InstanceIds: []*string{
aws.String(identity.InstanceID),
},
}
asInstanceOutput, err := svc.DescribeAutoScalingInstances(asInstanceInput)
if err != nil {
if aerr, ok := err.(awserr.Error); ok {
logrus.Warningf("Error getting the Autoscaling instances: %s %s", aerr.Code(), aerr.Error())
} else {
logrus.Errorf("Error getting the Autoscaling instances: unknown, %s", err)
}
return
}
// make the request to protect (or unprotect) the instance
input := &autoscaling.SetInstanceProtectionInput{
AutoScalingGroupName: asInstanceOutput.AutoScalingInstances[0].AutoScalingGroupName,
InstanceIds: []*string{
aws.String(identity.InstanceID),
},
ProtectedFromScaleIn: aws.Bool(protected),
}
_, err = svc.SetInstanceProtection(input)
if err != nil {
if aerr, ok := err.(awserr.Error); ok {
logrus.Warningf("Error protecting instance: %s %s", aerr.Code(), aerr.Error())
} else {
logrus.Errorf("Error protecting instance: unknown, %s", err)
}
return
}
if protected {
logrus.Info("Instance protected")
} else {
logrus.Info("Instance protection removed")
}
}
// Requests and runs 1 job of specified type(s)
// Returning an error here will result in the worker backing off for a while and retrying
func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls map[string]JobImplementation) error {
@ -104,6 +167,13 @@ func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls
return err
}
// Depsolve requests needs reactivity, since setting the protection can take up to 6s to timeout if the worker isn't
// in an AWS env, disable this setting for them.
if job.Type() != "depsolve" {
setProtection(true)
defer setProtection(false)
}
logrus.Infof("Running job '%s' (%s)\n", job.Id(), job.Type())
ctx, cancelWatcher := context.WithCancel(context.Background())