cloud/awscloud: rework create fleet retry logic

The current path sometimes launches two instances, which is problematic because the rest of the secure instance code expects exactly one instance. A security group could be attached to both instances, and would block the worker from launching any more SIs, as it tries to delete the old security group first, which is still held by one of the surplus SIs which didn't get terminated. Only retry if: - on "UnfulfillableCapacity" or "InsufficientInstanceCapacity" error codes; - there wasn't an instance launched anyway. If either of these checks fail, do not try to launch another one, and just fail the job.
2024-10-23 11:46:25 +02:00 · 2024-10-23 11:46:25 +02:00 · d5912259a0
commit d5912259a0
parent 661f39cbb9
3 changed files with 105 additions and 8 deletions
--- a/internal/cloud/awscloud/export_test.go
+++ b/internal/cloud/awscloud/export_test.go
@ -1,3 +1,4 @@
 package awscloud

 var NewForTest = newForTest
+var DoCreateFleetRetry = doCreateFleetRetry
--- a/internal/cloud/awscloud/secure-instance.go
+++ b/internal/cloud/awscloud/secure-instance.go
@ -5,6 +5,7 @@ import (
 	"encoding/base64"
 	"errors"
 	"fmt"
+	"slices"
 	"strings"
 	"time"

@ -547,20 +548,24 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput,
 		return nil, fmt.Errorf("Unable to create spot fleet: %w", err)
 	}

-	if len(createFleetOutput.Errors) > 0 {
-		logrus.Warnf("Received error %s from CreateFleet, retrying CreateFleet with OnDemand instance", *createFleetOutput.Errors[0].ErrorCode)
+	retry, fleetErrs := doCreateFleetRetry(createFleetOutput)
+	if len(fleetErrs) > 0 && retry {
+		logrus.Warnf("Received errors (%s) from CreateFleet, retrying CreateFleet with OnDemand instance", strings.Join(fleetErrs, "; "))
 		input.SpotOptions = nil
 		createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input)
+		if err != nil {
+			return nil, fmt.Errorf("Unable to create on demand fleet: %w", err)
+		}
 	}

-	if err == nil && len(createFleetOutput.Errors) > 0 {
-		logrus.Warnf("Received error %s from CreateFleet with OnDemand instance option, retrying across availability zones", *createFleetOutput.Errors[0].ErrorCode)
+	retry, fleetErrs = doCreateFleetRetry(createFleetOutput)
+	if len(fleetErrs) > 0 && retry {
+		logrus.Warnf("Received errors (%s) from CreateFleet with OnDemand instance option, retrying across availability zones", strings.Join(fleetErrs, "; "))
 		input.LaunchTemplateConfigs[0].Overrides = nil
 		createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input)
-	}
-
-	if err != nil {
-		return nil, fmt.Errorf("Unable to create fleet, tried on-demand and across AZs: %w", err)
+		if err != nil {
+			return nil, fmt.Errorf("Unable to create on demand fleet across AZs: %w", err)
+		}
 	}

 	if len(createFleetOutput.Errors) > 0 {
@ -579,3 +584,30 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput,
 	}
 	return createFleetOutput, nil
 }
+
+func doCreateFleetRetry(cfOutput *ec2.CreateFleetOutput) (bool, []string) {
+	if cfOutput == nil {
+		return false, nil
+	}
+
+	retryCodes := []string{
+		"UnfulfillableCapacity",
+		"InsufficientInstanceCapacity",
+	}
+	msg := []string{}
+	retry := false
+	for _, err := range cfOutput.Errors {
+		if slices.Contains(retryCodes, *err.ErrorCode) {
+			retry = true
+		}
+		msg = append(msg, fmt.Sprintf("%s: %s", *err.ErrorCode, *err.ErrorMessage))
+	}
+
+	// Do not retry in case an instance already exists, in that case just fail and let the worker terminate the SI
+	if len(cfOutput.Instances) > 0 && len(cfOutput.Instances[0].InstanceIds) > 0 {
+		retry = false
+		msg = append(msg, fmt.Sprintf("Already launched instance (%s), aborting create fleet", cfOutput.Instances[0].InstanceIds))
+	}
+
+	return retry, msg
+}
--- a/internal/cloud/awscloud/secure-instance_test.go
+++ b/internal/cloud/awscloud/secure-instance_test.go
@ -4,6 +4,9 @@ import (
 	"fmt"
 	"testing"

+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/ec2"
+	ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
 	"github.com/stretchr/testify/require"

 	"github.com/osbuild/osbuild-composer/internal/cloud/awscloud"
@ -164,3 +167,64 @@ func TestSICreateFleetFailures(t *testing.T) {
 	require.Equal(t, 4, m.calledFn["DeleteSecurityGroup"])
 	require.Equal(t, 4, m.calledFn["DeleteLaunchTemplate"])
 }
+
+func TestDoCreateFleetRetry(t *testing.T) {
+	cfOutput := &ec2.CreateFleetOutput{
+		Errors: []ec2types.CreateFleetError{
+			{
+				ErrorCode:    aws.String("UnfulfillableCapacity"),
+				ErrorMessage: aws.String("Msg"),
+			},
+		},
+	}
+	retry, fmtErrs := awscloud.DoCreateFleetRetry(cfOutput)
+	require.True(t, retry)
+	require.Equal(t, []string{"UnfulfillableCapacity: Msg"}, fmtErrs)
+
+	cfOutput = &ec2.CreateFleetOutput{
+		Errors: []ec2types.CreateFleetError{
+			{
+				ErrorCode:    aws.String("Bogus"),
+				ErrorMessage: aws.String("Msg"),
+			},
+			{
+				ErrorCode:    aws.String("InsufficientInstanceCapacity"),
+				ErrorMessage: aws.String("Msg"),
+			},
+		},
+	}
+	retry, fmtErrs = awscloud.DoCreateFleetRetry(cfOutput)
+	require.True(t, retry)
+	require.Equal(t, []string{"Bogus: Msg", "InsufficientInstanceCapacity: Msg"}, fmtErrs)
+
+	cfOutput = &ec2.CreateFleetOutput{
+		Errors: []ec2types.CreateFleetError{
+			{
+				ErrorCode:    aws.String("Bogus"),
+				ErrorMessage: aws.String("Msg"),
+			},
+		},
+	}
+	retry, fmtErrs = awscloud.DoCreateFleetRetry(cfOutput)
+	require.False(t, retry)
+	require.Equal(t, []string{"Bogus: Msg"}, fmtErrs)
+
+	cfOutput = &ec2.CreateFleetOutput{
+		Errors: []ec2types.CreateFleetError{
+			{
+				ErrorCode:    aws.String("InsufficientInstanceCapacity"),
+				ErrorMessage: aws.String("Msg"),
+			},
+		},
+		Instances: []ec2types.CreateFleetInstance{
+			{
+				InstanceIds: []string{
+					"instance-id",
+				},
+			},
+		},
+	}
+	retry, fmtErrs = awscloud.DoCreateFleetRetry(cfOutput)
+	require.False(t, retry)
+	require.Equal(t, []string{"InsufficientInstanceCapacity: Msg", "Already launched instance ([instance-id]), aborting create fleet"}, fmtErrs)
+}