From 905df418aac38e6aecfa7e715ce221f6fb68f001 Mon Sep 17 00:00:00 2001 From: Sanne Raymaekers Date: Mon, 7 Oct 2024 12:49:25 +0200 Subject: [PATCH] cloud/aws: add a third secure instance fallback across AZs In case the on demand option failed as well, retry one more time across availability zones. This significantly increases the pool of available instances, but increases network related costs, as transferring data between AZs is not free. --- internal/cloud/awscloud/secure-instance.go | 9 ++++++++- internal/cloud/awscloud/secure-instance_test.go | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/internal/cloud/awscloud/secure-instance.go b/internal/cloud/awscloud/secure-instance.go index e4c9ce9f1..98749cd02 100644 --- a/internal/cloud/awscloud/secure-instance.go +++ b/internal/cloud/awscloud/secure-instance.go @@ -552,8 +552,15 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput, input.SpotOptions = nil createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input) } + + if len(createFleetOutput.Errors) > 0 && *createFleetOutput.Errors[0].ErrorCode == "UnfulfillableCapacity" { + logrus.Warn("Received UnfulfillableCapacity from CreateFleet with OnDemand instance option, retrying across availability zones") + input.LaunchTemplateConfigs[0].Overrides = nil + createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input) + } + if err != nil { - return nil, fmt.Errorf("Unable to create on-demand fleet: %w", err) + return nil, fmt.Errorf("Unable to create fleet, tried on-demand and across AZs: %w", err) } if len(createFleetOutput.Errors) > 0 { diff --git a/internal/cloud/awscloud/secure-instance_test.go b/internal/cloud/awscloud/secure-instance_test.go index 6d8fc6338..ad27d342b 100644 --- a/internal/cloud/awscloud/secure-instance_test.go +++ b/internal/cloud/awscloud/secure-instance_test.go @@ -142,12 +142,12 @@ func TestSICreateFleetFailures(t *testing.T) { aws := awscloud.NewForTest(m, &ec2imdsmock{t, "instance-id", "region1"}, nil, nil, nil) require.NotNil(t, aws) - // unfillable capacity should call create fleet twice + // unfillable capacity should call create fleet thrice m.failFn["CreateFleet"] = nil si, err := aws.RunSecureInstance("iam-profile", "key-name", "cw-group", "hostname") require.Error(t, err) require.Nil(t, si) - require.Equal(t, 2, m.calledFn["CreateFleet"]) + require.Equal(t, 3, m.calledFn["CreateFleet"]) require.Equal(t, 1, m.calledFn["CreateSecurityGroup"]) require.Equal(t, 1, m.calledFn["CreateLaunchTemplate"]) require.Equal(t, 2, m.calledFn["DeleteSecurityGroup"]) @@ -158,7 +158,7 @@ func TestSICreateFleetFailures(t *testing.T) { si, err = aws.RunSecureInstance("iam-profile", "key-name", "cw-group", "hostname") require.Error(t, err) require.Nil(t, si) - require.Equal(t, 3, m.calledFn["CreateFleet"]) + require.Equal(t, 4, m.calledFn["CreateFleet"]) require.Equal(t, 2, m.calledFn["CreateSecurityGroup"]) require.Equal(t, 2, m.calledFn["CreateLaunchTemplate"]) require.Equal(t, 4, m.calledFn["DeleteSecurityGroup"])