worker: protect the instance from upgrading

Before the instance was vulnerable to an OTA update while processing a
request. Because there is no way of retriggering a job in Composer, it
is better to avoid this situation.
The way we are doing it is by setting the `protected` flag onto the
instance when a job is being processed. This way the AWS scheduler
does hopefully not shutdown the machine at the wrong time.

Main caveats of this solution:
* Starvation: If a worker keeps accepting new jobs, then it might not be
  updated.
* Inconsistency: There exist a window between the job acceptation and the
  protection where the worker can be shutdown without having the time to
  protect itself.
This commit is contained in:
Thomas Lavocat 2022-04-20 10:55:14 +02:00 committed by Sanne Raymaekers
parent b6f61d26ea
commit ab7fe6558a
7 changed files with 20826 additions and 0 deletions

View file

@ -14,6 +14,11 @@ import (
"time"
"github.com/BurntSushi/toml"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/ec2metadata"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/sirupsen/logrus"
"github.com/osbuild/osbuild-composer/internal/common"
@ -84,6 +89,64 @@ func WatchJob(ctx context.Context, job worker.Job) {
}
}
// protect an AWS instance from scaling and/or terminating.
func setProtection(protected bool) {
// create a new session
awsSession, err := session.NewSession()
if err != nil {
logrus.Debugf("Error getting an AWS session, %s", err)
return
}
// get the identity for the instanceID
identity, err := ec2metadata.New(awsSession).GetInstanceIdentityDocument()
if err != nil {
logrus.Debugf("Error getting the identity document, %s", err)
return
}
svc := autoscaling.New(awsSession)
// get the autoscaling group info for the auto scaling group name
asInstanceInput := &autoscaling.DescribeAutoScalingInstancesInput{
InstanceIds: []*string{
aws.String(identity.InstanceID),
},
}
asInstanceOutput, err := svc.DescribeAutoScalingInstances(asInstanceInput)
if err != nil {
if aerr, ok := err.(awserr.Error); ok {
logrus.Warningf("Error getting the Autoscaling instances: %s %s", aerr.Code(), aerr.Error())
} else {
logrus.Errorf("Error getting the Autoscaling instances: unknown, %s", err)
}
return
}
// make the request to protect (or unprotect) the instance
input := &autoscaling.SetInstanceProtectionInput{
AutoScalingGroupName: asInstanceOutput.AutoScalingInstances[0].AutoScalingGroupName,
InstanceIds: []*string{
aws.String(identity.InstanceID),
},
ProtectedFromScaleIn: aws.Bool(protected),
}
_, err = svc.SetInstanceProtection(input)
if err != nil {
if aerr, ok := err.(awserr.Error); ok {
logrus.Warningf("Error protecting instance: %s %s", aerr.Code(), aerr.Error())
} else {
logrus.Errorf("Error protecting instance: unknown, %s", err)
}
return
}
if protected {
logrus.Info("Instance protected")
} else {
logrus.Info("Instance protection removed")
}
}
// Requests and runs 1 job of specified type(s)
// Returning an error here will result in the worker backing off for a while and retrying
func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls map[string]JobImplementation) error {
@ -104,6 +167,13 @@ func RequestAndRunJob(client *worker.Client, acceptedJobTypes []string, jobImpls
return err
}
// Depsolve requests needs reactivity, since setting the protection can take up to 6s to timeout if the worker isn't
// in an AWS env, disable this setting for them.
if job.Type() != "depsolve" {
setProtection(true)
defer setProtection(false)
}
logrus.Infof("Running job '%s' (%s)\n", job.Id(), job.Type())
ctx, cancelWatcher := context.WithCancel(context.Background())

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,37 @@
// Code generated by private/model/cli/gen-api/main.go. DO NOT EDIT.
// Package autoscaling provides the client and types for making API
// requests to Auto Scaling.
//
// Amazon EC2 Auto Scaling is designed to automatically launch or terminate
// EC2 instances based on user-defined scaling policies, scheduled actions,
// and health checks.
//
// For more information about Amazon EC2 Auto Scaling, see the Amazon EC2 Auto
// Scaling User Guide (https://docs.aws.amazon.com/autoscaling/ec2/userguide/what-is-amazon-ec2-auto-scaling.html).
// For information about granting IAM users required permissions for calls to
// Amazon EC2 Auto Scaling, see Granting IAM users required permissions for
// Amazon EC2 Auto Scaling resources (https://docs.aws.amazon.com/autoscaling/ec2/APIReference/ec2-auto-scaling-api-permissions.html)
// in the Amazon EC2 Auto Scaling API Reference.
//
// See https://docs.aws.amazon.com/goto/WebAPI/autoscaling-2011-01-01 for more information on this service.
//
// See autoscaling package documentation for more information.
// https://docs.aws.amazon.com/sdk-for-go/api/service/autoscaling/
//
// Using the Client
//
// To contact Auto Scaling with the SDK use the New function to create
// a new service client. With that client you can make API requests to the service.
// These clients are safe to use concurrently.
//
// See the SDK's documentation for more information on how to use the SDK.
// https://docs.aws.amazon.com/sdk-for-go/api/
//
// See aws.Config documentation for more information on configuring SDK clients.
// https://docs.aws.amazon.com/sdk-for-go/api/aws/#Config
//
// See the Auto Scaling client AutoScaling for more
// information on creating client for this service.
// https://docs.aws.amazon.com/sdk-for-go/api/service/autoscaling/#New
package autoscaling

View file

@ -0,0 +1,68 @@
// Code generated by private/model/cli/gen-api/main.go. DO NOT EDIT.
package autoscaling
const (
// ErrCodeActiveInstanceRefreshNotFoundFault for service response error code
// "ActiveInstanceRefreshNotFound".
//
// The request failed because an active instance refresh for the specified Auto
// Scaling group was not found.
ErrCodeActiveInstanceRefreshNotFoundFault = "ActiveInstanceRefreshNotFound"
// ErrCodeAlreadyExistsFault for service response error code
// "AlreadyExists".
//
// You already have an Auto Scaling group or launch configuration with this
// name.
ErrCodeAlreadyExistsFault = "AlreadyExists"
// ErrCodeInstanceRefreshInProgressFault for service response error code
// "InstanceRefreshInProgress".
//
// The request failed because an active instance refresh operation already exists
// for the specified Auto Scaling group.
ErrCodeInstanceRefreshInProgressFault = "InstanceRefreshInProgress"
// ErrCodeInvalidNextToken for service response error code
// "InvalidNextToken".
//
// The NextToken value is not valid.
ErrCodeInvalidNextToken = "InvalidNextToken"
// ErrCodeLimitExceededFault for service response error code
// "LimitExceeded".
//
// You have already reached a limit for your Amazon EC2 Auto Scaling resources
// (for example, Auto Scaling groups, launch configurations, or lifecycle hooks).
// For more information, see DescribeAccountLimits (https://docs.aws.amazon.com/autoscaling/ec2/APIReference/API_DescribeAccountLimits.html)
// in the Amazon EC2 Auto Scaling API Reference.
ErrCodeLimitExceededFault = "LimitExceeded"
// ErrCodeResourceContentionFault for service response error code
// "ResourceContention".
//
// You already have a pending update to an Amazon EC2 Auto Scaling resource
// (for example, an Auto Scaling group, instance, or load balancer).
ErrCodeResourceContentionFault = "ResourceContention"
// ErrCodeResourceInUseFault for service response error code
// "ResourceInUse".
//
// The operation can't be performed because the resource is in use.
ErrCodeResourceInUseFault = "ResourceInUse"
// ErrCodeScalingActivityInProgressFault for service response error code
// "ScalingActivityInProgress".
//
// The operation can't be performed because there are scaling activities in
// progress.
ErrCodeScalingActivityInProgressFault = "ScalingActivityInProgress"
// ErrCodeServiceLinkedRoleFailure for service response error code
// "ServiceLinkedRoleFailure".
//
// The service-linked role is not yet ready for use.
ErrCodeServiceLinkedRoleFailure = "ServiceLinkedRoleFailure"
)

View file

@ -0,0 +1,103 @@
// Code generated by private/model/cli/gen-api/main.go. DO NOT EDIT.
package autoscaling
import (
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/client"
"github.com/aws/aws-sdk-go/aws/client/metadata"
"github.com/aws/aws-sdk-go/aws/request"
"github.com/aws/aws-sdk-go/aws/signer/v4"
"github.com/aws/aws-sdk-go/private/protocol/query"
)
// AutoScaling provides the API operation methods for making requests to
// Auto Scaling. See this package's package overview docs
// for details on the service.
//
// AutoScaling methods are safe to use concurrently. It is not safe to
// modify mutate any of the struct's properties though.
type AutoScaling struct {
*client.Client
}
// Used for custom client initialization logic
var initClient func(*client.Client)
// Used for custom request initialization logic
var initRequest func(*request.Request)
// Service information constants
const (
ServiceName = "autoscaling" // Name of service.
EndpointsID = ServiceName // ID to lookup a service endpoint with.
ServiceID = "Auto Scaling" // ServiceID is a unique identifier of a specific service.
)
// New creates a new instance of the AutoScaling client with a session.
// If additional configuration is needed for the client instance use the optional
// aws.Config parameter to add your extra config.
//
// Example:
// mySession := session.Must(session.NewSession())
//
// // Create a AutoScaling client from just a session.
// svc := autoscaling.New(mySession)
//
// // Create a AutoScaling client with additional configuration
// svc := autoscaling.New(mySession, aws.NewConfig().WithRegion("us-west-2"))
func New(p client.ConfigProvider, cfgs ...*aws.Config) *AutoScaling {
c := p.ClientConfig(EndpointsID, cfgs...)
if c.SigningNameDerived || len(c.SigningName) == 0 {
c.SigningName = EndpointsID
// No Fallback
}
return newClient(*c.Config, c.Handlers, c.PartitionID, c.Endpoint, c.SigningRegion, c.SigningName, c.ResolvedRegion)
}
// newClient creates, initializes and returns a new service client instance.
func newClient(cfg aws.Config, handlers request.Handlers, partitionID, endpoint, signingRegion, signingName, resolvedRegion string) *AutoScaling {
svc := &AutoScaling{
Client: client.New(
cfg,
metadata.ClientInfo{
ServiceName: ServiceName,
ServiceID: ServiceID,
SigningName: signingName,
SigningRegion: signingRegion,
PartitionID: partitionID,
Endpoint: endpoint,
APIVersion: "2011-01-01",
ResolvedRegion: resolvedRegion,
},
handlers,
),
}
// Handlers
svc.Handlers.Sign.PushBackNamed(v4.SignRequestHandler)
svc.Handlers.Build.PushBackNamed(query.BuildHandler)
svc.Handlers.Unmarshal.PushBackNamed(query.UnmarshalHandler)
svc.Handlers.UnmarshalMeta.PushBackNamed(query.UnmarshalMetaHandler)
svc.Handlers.UnmarshalError.PushBackNamed(query.UnmarshalErrorHandler)
// Run custom client initialization if present
if initClient != nil {
initClient(svc.Client)
}
return svc
}
// newRequest creates a new request for a AutoScaling operation and runs any
// custom request initialization.
func (c *AutoScaling) newRequest(op *request.Operation, params, data interface{}) *request.Request {
req := c.NewRequest(op, params, data)
// Run custom request initialization if present
if initRequest != nil {
initRequest(req)
}
return req
}

View file

@ -0,0 +1,163 @@
// Code generated by private/model/cli/gen-api/main.go. DO NOT EDIT.
package autoscaling
import (
"time"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/request"
)
// WaitUntilGroupExists uses the Auto Scaling API operation
// DescribeAutoScalingGroups to wait for a condition to be met before returning.
// If the condition is not met within the max attempt window, an error will
// be returned.
func (c *AutoScaling) WaitUntilGroupExists(input *DescribeAutoScalingGroupsInput) error {
return c.WaitUntilGroupExistsWithContext(aws.BackgroundContext(), input)
}
// WaitUntilGroupExistsWithContext is an extended version of WaitUntilGroupExists.
// With the support for passing in a context and options to configure the
// Waiter and the underlying request options.
//
// The context must be non-nil and will be used for request cancellation. If
// the context is nil a panic will occur. In the future the SDK may create
// sub-contexts for http.Requests. See https://golang.org/pkg/context/
// for more information on using Contexts.
func (c *AutoScaling) WaitUntilGroupExistsWithContext(ctx aws.Context, input *DescribeAutoScalingGroupsInput, opts ...request.WaiterOption) error {
w := request.Waiter{
Name: "WaitUntilGroupExists",
MaxAttempts: 10,
Delay: request.ConstantWaiterDelay(5 * time.Second),
Acceptors: []request.WaiterAcceptor{
{
State: request.SuccessWaiterState,
Matcher: request.PathWaiterMatch, Argument: "length(AutoScalingGroups) > `0`",
Expected: true,
},
{
State: request.RetryWaiterState,
Matcher: request.PathWaiterMatch, Argument: "length(AutoScalingGroups) > `0`",
Expected: false,
},
},
Logger: c.Config.Logger,
NewRequest: func(opts []request.Option) (*request.Request, error) {
var inCpy *DescribeAutoScalingGroupsInput
if input != nil {
tmp := *input
inCpy = &tmp
}
req, _ := c.DescribeAutoScalingGroupsRequest(inCpy)
req.SetContext(ctx)
req.ApplyOptions(opts...)
return req, nil
},
}
w.ApplyOptions(opts...)
return w.WaitWithContext(ctx)
}
// WaitUntilGroupInService uses the Auto Scaling API operation
// DescribeAutoScalingGroups to wait for a condition to be met before returning.
// If the condition is not met within the max attempt window, an error will
// be returned.
func (c *AutoScaling) WaitUntilGroupInService(input *DescribeAutoScalingGroupsInput) error {
return c.WaitUntilGroupInServiceWithContext(aws.BackgroundContext(), input)
}
// WaitUntilGroupInServiceWithContext is an extended version of WaitUntilGroupInService.
// With the support for passing in a context and options to configure the
// Waiter and the underlying request options.
//
// The context must be non-nil and will be used for request cancellation. If
// the context is nil a panic will occur. In the future the SDK may create
// sub-contexts for http.Requests. See https://golang.org/pkg/context/
// for more information on using Contexts.
func (c *AutoScaling) WaitUntilGroupInServiceWithContext(ctx aws.Context, input *DescribeAutoScalingGroupsInput, opts ...request.WaiterOption) error {
w := request.Waiter{
Name: "WaitUntilGroupInService",
MaxAttempts: 40,
Delay: request.ConstantWaiterDelay(15 * time.Second),
Acceptors: []request.WaiterAcceptor{
{
State: request.SuccessWaiterState,
Matcher: request.PathWaiterMatch, Argument: "contains(AutoScalingGroups[].[length(Instances[?LifecycleState=='InService']) >= MinSize][], `false`)",
Expected: false,
},
{
State: request.RetryWaiterState,
Matcher: request.PathWaiterMatch, Argument: "contains(AutoScalingGroups[].[length(Instances[?LifecycleState=='InService']) >= MinSize][], `false`)",
Expected: true,
},
},
Logger: c.Config.Logger,
NewRequest: func(opts []request.Option) (*request.Request, error) {
var inCpy *DescribeAutoScalingGroupsInput
if input != nil {
tmp := *input
inCpy = &tmp
}
req, _ := c.DescribeAutoScalingGroupsRequest(inCpy)
req.SetContext(ctx)
req.ApplyOptions(opts...)
return req, nil
},
}
w.ApplyOptions(opts...)
return w.WaitWithContext(ctx)
}
// WaitUntilGroupNotExists uses the Auto Scaling API operation
// DescribeAutoScalingGroups to wait for a condition to be met before returning.
// If the condition is not met within the max attempt window, an error will
// be returned.
func (c *AutoScaling) WaitUntilGroupNotExists(input *DescribeAutoScalingGroupsInput) error {
return c.WaitUntilGroupNotExistsWithContext(aws.BackgroundContext(), input)
}
// WaitUntilGroupNotExistsWithContext is an extended version of WaitUntilGroupNotExists.
// With the support for passing in a context and options to configure the
// Waiter and the underlying request options.
//
// The context must be non-nil and will be used for request cancellation. If
// the context is nil a panic will occur. In the future the SDK may create
// sub-contexts for http.Requests. See https://golang.org/pkg/context/
// for more information on using Contexts.
func (c *AutoScaling) WaitUntilGroupNotExistsWithContext(ctx aws.Context, input *DescribeAutoScalingGroupsInput, opts ...request.WaiterOption) error {
w := request.Waiter{
Name: "WaitUntilGroupNotExists",
MaxAttempts: 40,
Delay: request.ConstantWaiterDelay(15 * time.Second),
Acceptors: []request.WaiterAcceptor{
{
State: request.SuccessWaiterState,
Matcher: request.PathWaiterMatch, Argument: "length(AutoScalingGroups) > `0`",
Expected: false,
},
{
State: request.RetryWaiterState,
Matcher: request.PathWaiterMatch, Argument: "length(AutoScalingGroups) > `0`",
Expected: true,
},
},
Logger: c.Config.Logger,
NewRequest: func(opts []request.Option) (*request.Request, error) {
var inCpy *DescribeAutoScalingGroupsInput
if input != nil {
tmp := *input
inCpy = &tmp
}
req, _ := c.DescribeAutoScalingGroupsRequest(inCpy)
req.SetContext(ctx)
req.ApplyOptions(opts...)
return req, nil
},
}
w.ApplyOptions(opts...)
return w.WaitWithContext(ctx)
}

1
vendor/modules.txt vendored
View file

@ -114,6 +114,7 @@ github.com/aws/aws-sdk-go/private/protocol/rest
github.com/aws/aws-sdk-go/private/protocol/restjson
github.com/aws/aws-sdk-go/private/protocol/restxml
github.com/aws/aws-sdk-go/private/protocol/xml/xmlutil
github.com/aws/aws-sdk-go/service/autoscaling
github.com/aws/aws-sdk-go/service/ec2
github.com/aws/aws-sdk-go/service/s3
github.com/aws/aws-sdk-go/service/s3/s3iface