diff --git a/internal/cloud/awscloud/export_test.go b/internal/cloud/awscloud/export_test.go index 35f4283f5..d0589f980 100644 --- a/internal/cloud/awscloud/export_test.go +++ b/internal/cloud/awscloud/export_test.go @@ -1,3 +1,4 @@ package awscloud var NewForTest = newForTest +var DoCreateFleetRetry = doCreateFleetRetry diff --git a/internal/cloud/awscloud/secure-instance.go b/internal/cloud/awscloud/secure-instance.go index 435fe26f1..3c8805098 100644 --- a/internal/cloud/awscloud/secure-instance.go +++ b/internal/cloud/awscloud/secure-instance.go @@ -5,6 +5,7 @@ import ( "encoding/base64" "errors" "fmt" + "slices" "strings" "time" @@ -547,20 +548,24 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput, return nil, fmt.Errorf("Unable to create spot fleet: %w", err) } - if len(createFleetOutput.Errors) > 0 { - logrus.Warnf("Received error %s from CreateFleet, retrying CreateFleet with OnDemand instance", *createFleetOutput.Errors[0].ErrorCode) + retry, fleetErrs := doCreateFleetRetry(createFleetOutput) + if len(fleetErrs) > 0 && retry { + logrus.Warnf("Received errors (%s) from CreateFleet, retrying CreateFleet with OnDemand instance", strings.Join(fleetErrs, "; ")) input.SpotOptions = nil createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input) + if err != nil { + return nil, fmt.Errorf("Unable to create on demand fleet: %w", err) + } } - if err == nil && len(createFleetOutput.Errors) > 0 { - logrus.Warnf("Received error %s from CreateFleet with OnDemand instance option, retrying across availability zones", *createFleetOutput.Errors[0].ErrorCode) + retry, fleetErrs = doCreateFleetRetry(createFleetOutput) + if len(fleetErrs) > 0 && retry { + logrus.Warnf("Received errors (%s) from CreateFleet with OnDemand instance option, retrying across availability zones", strings.Join(fleetErrs, "; ")) input.LaunchTemplateConfigs[0].Overrides = nil createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input) - } - - if err != nil { - return nil, fmt.Errorf("Unable to create fleet, tried on-demand and across AZs: %w", err) + if err != nil { + return nil, fmt.Errorf("Unable to create on demand fleet across AZs: %w", err) + } } if len(createFleetOutput.Errors) > 0 { @@ -579,3 +584,30 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput, } return createFleetOutput, nil } + +func doCreateFleetRetry(cfOutput *ec2.CreateFleetOutput) (bool, []string) { + if cfOutput == nil { + return false, nil + } + + retryCodes := []string{ + "UnfulfillableCapacity", + "InsufficientInstanceCapacity", + } + msg := []string{} + retry := false + for _, err := range cfOutput.Errors { + if slices.Contains(retryCodes, *err.ErrorCode) { + retry = true + } + msg = append(msg, fmt.Sprintf("%s: %s", *err.ErrorCode, *err.ErrorMessage)) + } + + // Do not retry in case an instance already exists, in that case just fail and let the worker terminate the SI + if len(cfOutput.Instances) > 0 && len(cfOutput.Instances[0].InstanceIds) > 0 { + retry = false + msg = append(msg, fmt.Sprintf("Already launched instance (%s), aborting create fleet", cfOutput.Instances[0].InstanceIds)) + } + + return retry, msg +} diff --git a/internal/cloud/awscloud/secure-instance_test.go b/internal/cloud/awscloud/secure-instance_test.go index 32613adfd..2ebc48bcc 100644 --- a/internal/cloud/awscloud/secure-instance_test.go +++ b/internal/cloud/awscloud/secure-instance_test.go @@ -4,6 +4,9 @@ import ( "fmt" "testing" + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" "github.com/stretchr/testify/require" "github.com/osbuild/osbuild-composer/internal/cloud/awscloud" @@ -164,3 +167,64 @@ func TestSICreateFleetFailures(t *testing.T) { require.Equal(t, 4, m.calledFn["DeleteSecurityGroup"]) require.Equal(t, 4, m.calledFn["DeleteLaunchTemplate"]) } + +func TestDoCreateFleetRetry(t *testing.T) { + cfOutput := &ec2.CreateFleetOutput{ + Errors: []ec2types.CreateFleetError{ + { + ErrorCode: aws.String("UnfulfillableCapacity"), + ErrorMessage: aws.String("Msg"), + }, + }, + } + retry, fmtErrs := awscloud.DoCreateFleetRetry(cfOutput) + require.True(t, retry) + require.Equal(t, []string{"UnfulfillableCapacity: Msg"}, fmtErrs) + + cfOutput = &ec2.CreateFleetOutput{ + Errors: []ec2types.CreateFleetError{ + { + ErrorCode: aws.String("Bogus"), + ErrorMessage: aws.String("Msg"), + }, + { + ErrorCode: aws.String("InsufficientInstanceCapacity"), + ErrorMessage: aws.String("Msg"), + }, + }, + } + retry, fmtErrs = awscloud.DoCreateFleetRetry(cfOutput) + require.True(t, retry) + require.Equal(t, []string{"Bogus: Msg", "InsufficientInstanceCapacity: Msg"}, fmtErrs) + + cfOutput = &ec2.CreateFleetOutput{ + Errors: []ec2types.CreateFleetError{ + { + ErrorCode: aws.String("Bogus"), + ErrorMessage: aws.String("Msg"), + }, + }, + } + retry, fmtErrs = awscloud.DoCreateFleetRetry(cfOutput) + require.False(t, retry) + require.Equal(t, []string{"Bogus: Msg"}, fmtErrs) + + cfOutput = &ec2.CreateFleetOutput{ + Errors: []ec2types.CreateFleetError{ + { + ErrorCode: aws.String("InsufficientInstanceCapacity"), + ErrorMessage: aws.String("Msg"), + }, + }, + Instances: []ec2types.CreateFleetInstance{ + { + InstanceIds: []string{ + "instance-id", + }, + }, + }, + } + retry, fmtErrs = awscloud.DoCreateFleetRetry(cfOutput) + require.False(t, retry) + require.Equal(t, []string{"InsufficientInstanceCapacity: Msg", "Already launched instance ([instance-id]), aborting create fleet"}, fmtErrs) +}