worker: protect the instance from upgrading

Previously, the instance was vulnerable to an OTA update while processing a request. Because there is no way to retrigger a job in Composer, it is better to avoid this situation. We do so by setting the `protected` flag on the instance while a job is being processed, so that the AWS scheduler hopefully does not shut down the machine at the wrong time. Main caveats of this solution: * Starvation: If a worker keeps accepting new jobs, it might never be updated. * Inconsistency: There is a window between accepting the job and setting the protection during which the worker can be shut down before it has had time to protect itself.
2022-04-20 10:55:14 +02:00 · 2022-04-20 10:55:14 +02:00 · ab7fe6558a
commit ab7fe6558a
parent b6f61d26ea
7 changed files with 20826 additions and 0 deletions
--- a/cmd/osbuild-worker/main.go
+++ b/cmd/osbuild-worker/main.go
@ -14,6 +14,11 @@ import (
 	"time"

 	"github.com/BurntSushi/toml"
+	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/aws/awserr"
+	"github.com/aws/aws-sdk-go/aws/ec2metadata"
+	"github.com/aws/aws-sdk-go/aws/session"
+	"github.com/aws/aws-sdk-go/service/autoscaling"
 	"github.com/sirupsen/logrus"

 	"github.com/osbuild/osbuild-composer/internal/common"
@ -84,6 +89,64 @@ func WatchJob(ctx context.Context, job worker.Job) {
 	}
 }

+// setProtection enables (protected == true) or removes (protected == false) the AWS
+// auto scaling scale-in protection of the instance this worker runs on. It is best
+// effort: every failure is only logged, nothing is returned to the caller.
+func setProtection(protected bool) {
+	awsSession, err := session.NewSession()
+	if err != nil {
+		logrus.Debugf("Error getting an AWS session, %s", err)
+		return
+	}
+
+	// get the identity for the instanceID
+	identity, err := ec2metadata.New(awsSession).GetInstanceIdentityDocument()
+	if err != nil {
+		logrus.Debugf("Error getting the identity document, %s", err)
+		return
+	}
+
+	// look up the auto scaling group this instance belongs to
+	svc := autoscaling.New(awsSession)
+	asInstanceInput := &autoscaling.DescribeAutoScalingInstancesInput{
+		InstanceIds: []*string{aws.String(identity.InstanceID)},
+	}
+	asInstanceOutput, err := svc.DescribeAutoScalingInstances(asInstanceInput)
+	if err != nil {
+		if aerr, ok := err.(awserr.Error); ok {
+			logrus.Warningf("Error getting the Autoscaling instances: %s %s", aerr.Code(), aerr.Error())
+		} else {
+			logrus.Errorf("Error getting the Autoscaling instances: unknown, %s", err)
+		}
+		return
+	}
+	// an instance outside of any auto scaling group yields an empty list: indexing it would panic
+	if len(asInstanceOutput.AutoScalingInstances) == 0 {
+		logrus.Info("The instance is not part of an autoscaling group")
+		return
+	}
+
+	// make the request to protect (or unprotect) the instance
+	input := &autoscaling.SetInstanceProtectionInput{
+		AutoScalingGroupName: asInstanceOutput.AutoScalingInstances[0].AutoScalingGroupName,
+		InstanceIds:          []*string{aws.String(identity.InstanceID)},
+		ProtectedFromScaleIn: aws.Bool(protected),
+	}
+	if _, err = svc.SetInstanceProtection(input); err != nil {
+		if aerr, ok := err.(awserr.Error); ok {
+			logrus.Warningf("Error protecting instance: %s %s", aerr.Code(), aerr.Error())
+		} else {
+			logrus.Errorf("Error protecting instance: unknown, %s", err)
+		}
+		return
+	}
+	if protected {
+		logrus.Info("Instance protected")
+	} else {
+		logrus.Info("Instance protection removed")
+	}
+}
+
 // Requests and runs 1 job of specified type(s)
 // Returning an error here will result in the worker backing off for a while and retrying
 func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls map[string]JobImplementation) error {
@ -104,6 +167,13 @@ func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls
 		return err
 	}

+	// Depsolve requests need to stay reactive. Setting the protection can take up to 6s to time out
+	// when the worker isn't in an AWS env, so skip the protection for them.
+	if job.Type() != "depsolve" {
+		setProtection(true)
+		defer setProtection(false)
+	}
+
 	logrus.Infof("Running job '%s' (%s)\n", job.Id(), job.Type())

 	ctx, cancelWatcher := context.WithCancel(context.Background())