From 5eb8227bf311ff4e02ddbdb78486b521f1dbb320 Mon Sep 17 00:00:00 2001 From: Sanne Raymaekers Date: Tue, 15 Oct 2024 13:18:56 +0200 Subject: [PATCH] cloud/awscloud: retry CreateFleet regardless of the error code The errors returned by create fleet are not entirely clear. It seems it also returns `InsufficientInstanceCapacity` in addition to `UnfulfillableCapacity`. Let's just attempt CreateFleet up to three times (the initial call plus two retries) regardless of the create fleet error; that way there's no need to chase error codes which aren't clearly defined. --- internal/cloud/awscloud/secure-instance.go | 10 +++++----- internal/cloud/awscloud/secure-instance_test.go | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/cloud/awscloud/secure-instance.go b/internal/cloud/awscloud/secure-instance.go index 98749cd02..435fe26f1 100644 --- a/internal/cloud/awscloud/secure-instance.go +++ b/internal/cloud/awscloud/secure-instance.go @@ -547,14 +547,14 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput, return nil, fmt.Errorf("Unable to create spot fleet: %w", err) } - if len(createFleetOutput.Errors) > 0 && *createFleetOutput.Errors[0].ErrorCode == "UnfulfillableCapacity" { - logrus.Warn("Received UnfulfillableCapacity from CreateFleet, retrying CreateFleet with OnDemand instance") + if len(createFleetOutput.Errors) > 0 { + logrus.Warnf("Received error %s from CreateFleet, retrying CreateFleet with OnDemand instance", *createFleetOutput.Errors[0].ErrorCode) input.SpotOptions = nil createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input) } - if len(createFleetOutput.Errors) > 0 && *createFleetOutput.Errors[0].ErrorCode == "UnfulfillableCapacity" { - logrus.Warn("Received UnfulfillableCapacity from CreateFleet with OnDemand instance option, retrying across availability zones") + if err == nil && len(createFleetOutput.Errors) > 0 { + logrus.Warnf("Received error %s from CreateFleet with OnDemand instance option, retrying across availability zones", 
*createFleetOutput.Errors[0].ErrorCode) input.LaunchTemplateConfigs[0].Overrides = nil createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input) } @@ -566,7 +566,7 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput, if len(createFleetOutput.Errors) > 0 { fleetErrs := []string{} for _, fleetErr := range createFleetOutput.Errors { - fleetErrs = append(fleetErrs, *fleetErr.ErrorMessage) + fleetErrs = append(fleetErrs, fmt.Sprintf("%s: %s", *fleetErr.ErrorCode, *fleetErr.ErrorMessage)) } return nil, fmt.Errorf("Unable to create fleet: %v", strings.Join(fleetErrs, "; ")) } diff --git a/internal/cloud/awscloud/secure-instance_test.go b/internal/cloud/awscloud/secure-instance_test.go index ad27d342b..32613adfd 100644 --- a/internal/cloud/awscloud/secure-instance_test.go +++ b/internal/cloud/awscloud/secure-instance_test.go @@ -142,7 +142,7 @@ func TestSICreateFleetFailures(t *testing.T) { aws := awscloud.NewForTest(m, &ec2imdsmock{t, "instance-id", "region1"}, nil, nil, nil) require.NotNil(t, aws) - // unfillable capacity should call create fleet thrice + // create fleet error should call create fleet thrice m.failFn["CreateFleet"] = nil si, err := aws.RunSecureInstance("iam-profile", "key-name", "cw-group", "hostname") require.Error(t, err)