worker: protect the instance from upgrading

Before the instance was vulnerable to an OTA update while processing a request. Because there is no way of retriggering a job in Composer, it is better to avoid this situation. The way we are doing it is by setting the `protected` flag onto the instance when a job is being processed. This way the AWS scheduler does hopefully not shutdown the machine at the wrong time. Main caveats of this solution: * Starvation: If a worker keeps accepting new jobs, then it might not be updated. * Inconsistency: There exist a window between the job acceptation and the protection where the worker can be shutdown without having the time to protect itself.
2022-04-20 10:55:14 +02:00 · 2022-04-20 10:55:14 +02:00 · ab7fe6558a
commit ab7fe6558a
parent b6f61d26ea
7 changed files with 20826 additions and 0 deletions
--- a/cmd/osbuild-worker/main.go
+++ b/cmd/osbuild-worker/main.go
@ -14,6 +14,11 @@ import (
 	"time"

 	"github.com/BurntSushi/toml"
+	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/aws/awserr"
+	"github.com/aws/aws-sdk-go/aws/ec2metadata"
+	"github.com/aws/aws-sdk-go/aws/session"
+	"github.com/aws/aws-sdk-go/service/autoscaling"
 	"github.com/sirupsen/logrus"

 	"github.com/osbuild/osbuild-composer/internal/common"
@ -84,6 +89,64 @@ func WatchJob(ctx context.Context, job worker.Job) {
 	}
 }

+// protect an AWS instance from scaling and/or terminating.
+func setProtection(protected bool) {
+	// create a new session
+	awsSession, err := session.NewSession()
+	if err != nil {
+		logrus.Debugf("Error getting an AWS session, %s", err)
+		return
+	}
+
+	// get the identity for the instanceID
+	identity, err := ec2metadata.New(awsSession).GetInstanceIdentityDocument()
+	if err != nil {
+		logrus.Debugf("Error getting the identity document, %s", err)
+		return
+	}
+
+	svc := autoscaling.New(awsSession)
+
+	// get the autoscaling group info for the auto scaling group name
+	asInstanceInput := &autoscaling.DescribeAutoScalingInstancesInput{
+		InstanceIds: []*string{
+			aws.String(identity.InstanceID),
+		},
+	}
+	asInstanceOutput, err := svc.DescribeAutoScalingInstances(asInstanceInput)
+	if err != nil {
+		if aerr, ok := err.(awserr.Error); ok {
+			logrus.Warningf("Error getting the Autoscaling instances: %s %s", aerr.Code(), aerr.Error())
+		} else {
+			logrus.Errorf("Error getting the Autoscaling instances: unknown, %s", err)
+		}
+		return
+	}
+
+	// make the request to protect (or unprotect) the instance
+	input := &autoscaling.SetInstanceProtectionInput{
+		AutoScalingGroupName: asInstanceOutput.AutoScalingInstances[0].AutoScalingGroupName,
+		InstanceIds: []*string{
+			aws.String(identity.InstanceID),
+		},
+		ProtectedFromScaleIn: aws.Bool(protected),
+	}
+	_, err = svc.SetInstanceProtection(input)
+	if err != nil {
+		if aerr, ok := err.(awserr.Error); ok {
+			logrus.Warningf("Error protecting instance: %s %s", aerr.Code(), aerr.Error())
+		} else {
+			logrus.Errorf("Error protecting instance: unknown, %s", err)
+		}
+		return
+	}
+	if protected {
+		logrus.Info("Instance protected")
+	} else {
+		logrus.Info("Instance protection removed")
+	}
+}
+
 // Requests and runs 1 job of specified type(s)
 // Returning an error here will result in the worker backing off for a while and retrying
 func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls map[string]JobImplementation) error {
@ -104,6 +167,13 @@ func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls
 		return err
 	}

+	// Depsolve requests needs reactivity, since setting the protection can take up to 6s to timeout if the worker isn't
+	// in an AWS env, disable this setting for them.
+	if job.Type() != "depsolve" {
+		setProtection(true)
+		defer setProtection(false)
+	}
+
 	logrus.Infof("Running job '%s' (%s)\n", job.Id(), job.Type())

 	ctx, cancelWatcher := context.WithCancel(context.Background())
--- a/vendor/github.com/aws/aws-sdk-go/service/autoscaling/api.go
+++ b/vendor/github.com/aws/aws-sdk-go/service/autoscaling/api.go
--- a/vendor/github.com/aws/aws-sdk-go/service/autoscaling/doc.go
+++ b/vendor/github.com/aws/aws-sdk-go/service/autoscaling/doc.go
@ -0,0 +1,37 @@
+// Code generated by private/model/cli/gen-api/main.go. DO NOT EDIT.
+
+// Package autoscaling provides the client and types for making API
+// requests to Auto Scaling.
+//
+// Amazon EC2 Auto Scaling is designed to automatically launch or terminate
+// EC2 instances based on user-defined scaling policies, scheduled actions,
+// and health checks.
+//
+// For more information about Amazon EC2 Auto Scaling, see the Amazon EC2 Auto
+// Scaling User Guide (https://docs.aws.amazon.com/autoscaling/ec2/userguide/what-is-amazon-ec2-auto-scaling.html).
+// For information about granting IAM users required permissions for calls to
+// Amazon EC2 Auto Scaling, see Granting IAM users required permissions for
+// Amazon EC2 Auto Scaling resources (https://docs.aws.amazon.com/autoscaling/ec2/APIReference/ec2-auto-scaling-api-permissions.html)
+// in the Amazon EC2 Auto Scaling API Reference.
+//
+// See https://docs.aws.amazon.com/goto/WebAPI/autoscaling-2011-01-01 for more information on this service.
+//
+// See autoscaling package documentation for more information.
+// https://docs.aws.amazon.com/sdk-for-go/api/service/autoscaling/
+//
+// Using the Client
+//
+// To contact Auto Scaling with the SDK use the New function to create
+// a new service client. With that client you can make API requests to the service.
+// These clients are safe to use concurrently.
+//
+// See the SDK's documentation for more information on how to use the SDK.
+// https://docs.aws.amazon.com/sdk-for-go/api/
+//
+// See aws.Config documentation for more information on configuring SDK clients.
+// https://docs.aws.amazon.com/sdk-for-go/api/aws/#Config
+//
+// See the Auto Scaling client AutoScaling for more
+// information on creating client for this service.
+// https://docs.aws.amazon.com/sdk-for-go/api/service/autoscaling/#New
+package autoscaling
--- a/vendor/github.com/aws/aws-sdk-go/service/autoscaling/errors.go
+++ b/vendor/github.com/aws/aws-sdk-go/service/autoscaling/errors.go
@ -0,0 +1,68 @@
+// Code generated by private/model/cli/gen-api/main.go. DO NOT EDIT.
+
+package autoscaling
+
+const (
+
+	// ErrCodeActiveInstanceRefreshNotFoundFault for service response error code
+	// "ActiveInstanceRefreshNotFound".
+	//
+	// The request failed because an active instance refresh for the specified Auto
+	// Scaling group was not found.
+	ErrCodeActiveInstanceRefreshNotFoundFault = "ActiveInstanceRefreshNotFound"
+
+	// ErrCodeAlreadyExistsFault for service response error code
+	// "AlreadyExists".
+	//
+	// You already have an Auto Scaling group or launch configuration with this
+	// name.
+	ErrCodeAlreadyExistsFault = "AlreadyExists"
+
+	// ErrCodeInstanceRefreshInProgressFault for service response error code
+	// "InstanceRefreshInProgress".
+	//
+	// The request failed because an active instance refresh operation already exists
+	// for the specified Auto Scaling group.
+	ErrCodeInstanceRefreshInProgressFault = "InstanceRefreshInProgress"
+
+	// ErrCodeInvalidNextToken for service response error code
+	// "InvalidNextToken".
+	//
+	// The NextToken value is not valid.
+	ErrCodeInvalidNextToken = "InvalidNextToken"
+
+	// ErrCodeLimitExceededFault for service response error code
+	// "LimitExceeded".
+	//
+	// You have already reached a limit for your Amazon EC2 Auto Scaling resources
+	// (for example, Auto Scaling groups, launch configurations, or lifecycle hooks).
+	// For more information, see DescribeAccountLimits (https://docs.aws.amazon.com/autoscaling/ec2/APIReference/API_DescribeAccountLimits.html)
+	// in the Amazon EC2 Auto Scaling API Reference.
+	ErrCodeLimitExceededFault = "LimitExceeded"
+
+	// ErrCodeResourceContentionFault for service response error code
+	// "ResourceContention".
+	//
+	// You already have a pending update to an Amazon EC2 Auto Scaling resource
+	// (for example, an Auto Scaling group, instance, or load balancer).
+	ErrCodeResourceContentionFault = "ResourceContention"
+
+	// ErrCodeResourceInUseFault for service response error code
+	// "ResourceInUse".
+	//
+	// The operation can't be performed because the resource is in use.
+	ErrCodeResourceInUseFault = "ResourceInUse"
+
+	// ErrCodeScalingActivityInProgressFault for service response error code
+	// "ScalingActivityInProgress".
+	//
+	// The operation can't be performed because there are scaling activities in
+	// progress.
+	ErrCodeScalingActivityInProgressFault = "ScalingActivityInProgress"
+
+	// ErrCodeServiceLinkedRoleFailure for service response error code
+	// "ServiceLinkedRoleFailure".
+	//
+	// The service-linked role is not yet ready for use.
+	ErrCodeServiceLinkedRoleFailure = "ServiceLinkedRoleFailure"
+)
--- a/vendor/github.com/aws/aws-sdk-go/service/autoscaling/service.go
+++ b/vendor/github.com/aws/aws-sdk-go/service/autoscaling/service.go
@ -0,0 +1,103 @@
+// Code generated by private/model/cli/gen-api/main.go. DO NOT EDIT.
+
+package autoscaling
+
+import (
+	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/aws/client"
+	"github.com/aws/aws-sdk-go/aws/client/metadata"
+	"github.com/aws/aws-sdk-go/aws/request"
+	"github.com/aws/aws-sdk-go/aws/signer/v4"
+	"github.com/aws/aws-sdk-go/private/protocol/query"
+)
+
+// AutoScaling provides the API operation methods for making requests to
+// Auto Scaling. See this package's package overview docs
+// for details on the service.
+//
+// AutoScaling methods are safe to use concurrently. It is not safe to
+// modify mutate any of the struct's properties though.
+type AutoScaling struct {
+	*client.Client
+}
+
+// Used for custom client initialization logic
+var initClient func(*client.Client)
+
+// Used for custom request initialization logic
+var initRequest func(*request.Request)
+
+// Service information constants
+const (
+	ServiceName = "autoscaling"  // Name of service.
+	EndpointsID = ServiceName    // ID to lookup a service endpoint with.
+	ServiceID   = "Auto Scaling" // ServiceID is a unique identifier of a specific service.
+)
+
+// New creates a new instance of the AutoScaling client with a session.
+// If additional configuration is needed for the client instance use the optional
+// aws.Config parameter to add your extra config.
+//
+// Example:
+//     mySession := session.Must(session.NewSession())
+//
+//     // Create a AutoScaling client from just a session.
+//     svc := autoscaling.New(mySession)
+//
+//     // Create a AutoScaling client with additional configuration
+//     svc := autoscaling.New(mySession, aws.NewConfig().WithRegion("us-west-2"))
+func New(p client.ConfigProvider, cfgs ...*aws.Config) *AutoScaling {
+	c := p.ClientConfig(EndpointsID, cfgs...)
+	if c.SigningNameDerived || len(c.SigningName) == 0 {
+		c.SigningName = EndpointsID
+		// No Fallback
+	}
+	return newClient(*c.Config, c.Handlers, c.PartitionID, c.Endpoint, c.SigningRegion, c.SigningName, c.ResolvedRegion)
+}
+
+// newClient creates, initializes and returns a new service client instance.
+func newClient(cfg aws.Config, handlers request.Handlers, partitionID, endpoint, signingRegion, signingName, resolvedRegion string) *AutoScaling {
+	svc := &AutoScaling{
+		Client: client.New(
+			cfg,
+			metadata.ClientInfo{
+				ServiceName:    ServiceName,
+				ServiceID:      ServiceID,
+				SigningName:    signingName,
+				SigningRegion:  signingRegion,
+				PartitionID:    partitionID,
+				Endpoint:       endpoint,
+				APIVersion:     "2011-01-01",
+				ResolvedRegion: resolvedRegion,
+			},
+			handlers,
+		),
+	}
+
+	// Handlers
+	svc.Handlers.Sign.PushBackNamed(v4.SignRequestHandler)
+	svc.Handlers.Build.PushBackNamed(query.BuildHandler)
+	svc.Handlers.Unmarshal.PushBackNamed(query.UnmarshalHandler)
+	svc.Handlers.UnmarshalMeta.PushBackNamed(query.UnmarshalMetaHandler)
+	svc.Handlers.UnmarshalError.PushBackNamed(query.UnmarshalErrorHandler)
+
+	// Run custom client initialization if present
+	if initClient != nil {
+		initClient(svc.Client)
+	}
+
+	return svc
+}
+
+// newRequest creates a new request for a AutoScaling operation and runs any
+// custom request initialization.
+func (c *AutoScaling) newRequest(op *request.Operation, params, data interface{}) *request.Request {
+	req := c.NewRequest(op, params, data)
+
+	// Run custom request initialization if present
+	if initRequest != nil {
+		initRequest(req)
+	}
+
+	return req
+}
--- a/vendor/github.com/aws/aws-sdk-go/service/autoscaling/waiters.go
+++ b/vendor/github.com/aws/aws-sdk-go/service/autoscaling/waiters.go
@ -0,0 +1,163 @@
+// Code generated by private/model/cli/gen-api/main.go. DO NOT EDIT.
+
+package autoscaling
+
+import (
+	"time"
+
+	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/aws/request"
+)
+
+// WaitUntilGroupExists uses the Auto Scaling API operation
+// DescribeAutoScalingGroups to wait for a condition to be met before returning.
+// If the condition is not met within the max attempt window, an error will
+// be returned.
+func (c *AutoScaling) WaitUntilGroupExists(input *DescribeAutoScalingGroupsInput) error {
+	return c.WaitUntilGroupExistsWithContext(aws.BackgroundContext(), input)
+}
+
+// WaitUntilGroupExistsWithContext is an extended version of WaitUntilGroupExists.
+// With the support for passing in a context and options to configure the
+// Waiter and the underlying request options.
+//
+// The context must be non-nil and will be used for request cancellation. If
+// the context is nil a panic will occur. In the future the SDK may create
+// sub-contexts for http.Requests. See https://golang.org/pkg/context/
+// for more information on using Contexts.
+func (c *AutoScaling) WaitUntilGroupExistsWithContext(ctx aws.Context, input *DescribeAutoScalingGroupsInput, opts ...request.WaiterOption) error {
+	w := request.Waiter{
+		Name:        "WaitUntilGroupExists",
+		MaxAttempts: 10,
+		Delay:       request.ConstantWaiterDelay(5 * time.Second),
+		Acceptors: []request.WaiterAcceptor{
+			{
+				State:   request.SuccessWaiterState,
+				Matcher: request.PathWaiterMatch, Argument: "length(AutoScalingGroups) > `0`",
+				Expected: true,
+			},
+			{
+				State:   request.RetryWaiterState,
+				Matcher: request.PathWaiterMatch, Argument: "length(AutoScalingGroups) > `0`",
+				Expected: false,
+			},
+		},
+		Logger: c.Config.Logger,
+		NewRequest: func(opts []request.Option) (*request.Request, error) {
+			var inCpy *DescribeAutoScalingGroupsInput
+			if input != nil {
+				tmp := *input
+				inCpy = &tmp
+			}
+			req, _ := c.DescribeAutoScalingGroupsRequest(inCpy)
+			req.SetContext(ctx)
+			req.ApplyOptions(opts...)
+			return req, nil
+		},
+	}
+	w.ApplyOptions(opts...)
+
+	return w.WaitWithContext(ctx)
+}
+
+// WaitUntilGroupInService uses the Auto Scaling API operation
+// DescribeAutoScalingGroups to wait for a condition to be met before returning.
+// If the condition is not met within the max attempt window, an error will
+// be returned.
+func (c *AutoScaling) WaitUntilGroupInService(input *DescribeAutoScalingGroupsInput) error {
+	return c.WaitUntilGroupInServiceWithContext(aws.BackgroundContext(), input)
+}
+
+// WaitUntilGroupInServiceWithContext is an extended version of WaitUntilGroupInService.
+// With the support for passing in a context and options to configure the
+// Waiter and the underlying request options.
+//
+// The context must be non-nil and will be used for request cancellation. If
+// the context is nil a panic will occur. In the future the SDK may create
+// sub-contexts for http.Requests. See https://golang.org/pkg/context/
+// for more information on using Contexts.
+func (c *AutoScaling) WaitUntilGroupInServiceWithContext(ctx aws.Context, input *DescribeAutoScalingGroupsInput, opts ...request.WaiterOption) error {
+	w := request.Waiter{
+		Name:        "WaitUntilGroupInService",
+		MaxAttempts: 40,
+		Delay:       request.ConstantWaiterDelay(15 * time.Second),
+		Acceptors: []request.WaiterAcceptor{
+			{
+				State:   request.SuccessWaiterState,
+				Matcher: request.PathWaiterMatch, Argument: "contains(AutoScalingGroups[].[length(Instances[?LifecycleState=='InService']) >= MinSize][], `false`)",
+				Expected: false,
+			},
+			{
+				State:   request.RetryWaiterState,
+				Matcher: request.PathWaiterMatch, Argument: "contains(AutoScalingGroups[].[length(Instances[?LifecycleState=='InService']) >= MinSize][], `false`)",
+				Expected: true,
+			},
+		},
+		Logger: c.Config.Logger,
+		NewRequest: func(opts []request.Option) (*request.Request, error) {
+			var inCpy *DescribeAutoScalingGroupsInput
+			if input != nil {
+				tmp := *input
+				inCpy = &tmp
+			}
+			req, _ := c.DescribeAutoScalingGroupsRequest(inCpy)
+			req.SetContext(ctx)
+			req.ApplyOptions(opts...)
+			return req, nil
+		},
+	}
+	w.ApplyOptions(opts...)
+
+	return w.WaitWithContext(ctx)
+}
+
+// WaitUntilGroupNotExists uses the Auto Scaling API operation
+// DescribeAutoScalingGroups to wait for a condition to be met before returning.
+// If the condition is not met within the max attempt window, an error will
+// be returned.
+func (c *AutoScaling) WaitUntilGroupNotExists(input *DescribeAutoScalingGroupsInput) error {
+	return c.WaitUntilGroupNotExistsWithContext(aws.BackgroundContext(), input)
+}
+
+// WaitUntilGroupNotExistsWithContext is an extended version of WaitUntilGroupNotExists.
+// With the support for passing in a context and options to configure the
+// Waiter and the underlying request options.
+//
+// The context must be non-nil and will be used for request cancellation. If
+// the context is nil a panic will occur. In the future the SDK may create
+// sub-contexts for http.Requests. See https://golang.org/pkg/context/
+// for more information on using Contexts.
+func (c *AutoScaling) WaitUntilGroupNotExistsWithContext(ctx aws.Context, input *DescribeAutoScalingGroupsInput, opts ...request.WaiterOption) error {
+	w := request.Waiter{
+		Name:        "WaitUntilGroupNotExists",
+		MaxAttempts: 40,
+		Delay:       request.ConstantWaiterDelay(15 * time.Second),
+		Acceptors: []request.WaiterAcceptor{
+			{
+				State:   request.SuccessWaiterState,
+				Matcher: request.PathWaiterMatch, Argument: "length(AutoScalingGroups) > `0`",
+				Expected: false,
+			},
+			{
+				State:   request.RetryWaiterState,
+				Matcher: request.PathWaiterMatch, Argument: "length(AutoScalingGroups) > `0`",
+				Expected: true,
+			},
+		},
+		Logger: c.Config.Logger,
+		NewRequest: func(opts []request.Option) (*request.Request, error) {
+			var inCpy *DescribeAutoScalingGroupsInput
+			if input != nil {
+				tmp := *input
+				inCpy = &tmp
+			}
+			req, _ := c.DescribeAutoScalingGroupsRequest(inCpy)
+			req.SetContext(ctx)
+			req.ApplyOptions(opts...)
+			return req, nil
+		},
+	}
+	w.ApplyOptions(opts...)
+
+	return w.WaitWithContext(ctx)
+}
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -114,6 +114,7 @@ github.com/aws/aws-sdk-go/private/protocol/rest
 github.com/aws/aws-sdk-go/private/protocol/restjson
 github.com/aws/aws-sdk-go/private/protocol/restxml
 github.com/aws/aws-sdk-go/private/protocol/xml/xmlutil
+github.com/aws/aws-sdk-go/service/autoscaling
 github.com/aws/aws-sdk-go/service/ec2
 github.com/aws/aws-sdk-go/service/s3
 github.com/aws/aws-sdk-go/service/s3/s3iface