cloud/awscloud: rework create fleet retry logic

The current path sometimes launches two instances, which is problematic
because the rest of the secure instance code expects exactly one
instance. A security group could end up attached to both instances. That
would block the worker from launching any more SIs: it first tries to
delete the old security group, which is still held by one of the surplus
SIs that didn't get terminated.

Only retry if:
- one of the returned error codes is "UnfulfillableCapacity" or
  "InsufficientInstanceCapacity";
- no instance was launched anyway.

If either of these checks fails, do not try to launch another instance;
just fail the job.
This commit is contained in:
Sanne Raymaekers 2024-10-23 11:46:25 +02:00
parent 661f39cbb9
commit d5912259a0
3 changed files with 105 additions and 8 deletions

View file

@ -4,6 +4,9 @@ import (
"fmt"
"testing"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/ec2"
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/stretchr/testify/require"
"github.com/osbuild/osbuild-composer/internal/cloud/awscloud"
@ -164,3 +167,64 @@ func TestSICreateFleetFailures(t *testing.T) {
require.Equal(t, 4, m.calledFn["DeleteSecurityGroup"])
require.Equal(t, 4, m.calledFn["DeleteLaunchTemplate"])
}
func TestDoCreateFleetRetry(t *testing.T) {
cfOutput := &ec2.CreateFleetOutput{
Errors: []ec2types.CreateFleetError{
{
ErrorCode: aws.String("UnfulfillableCapacity"),
ErrorMessage: aws.String("Msg"),
},
},
}
retry, fmtErrs := awscloud.DoCreateFleetRetry(cfOutput)
require.True(t, retry)
require.Equal(t, []string{"UnfulfillableCapacity: Msg"}, fmtErrs)
cfOutput = &ec2.CreateFleetOutput{
Errors: []ec2types.CreateFleetError{
{
ErrorCode: aws.String("Bogus"),
ErrorMessage: aws.String("Msg"),
},
{
ErrorCode: aws.String("InsufficientInstanceCapacity"),
ErrorMessage: aws.String("Msg"),
},
},
}
retry, fmtErrs = awscloud.DoCreateFleetRetry(cfOutput)
require.True(t, retry)
require.Equal(t, []string{"Bogus: Msg", "InsufficientInstanceCapacity: Msg"}, fmtErrs)
cfOutput = &ec2.CreateFleetOutput{
Errors: []ec2types.CreateFleetError{
{
ErrorCode: aws.String("Bogus"),
ErrorMessage: aws.String("Msg"),
},
},
}
retry, fmtErrs = awscloud.DoCreateFleetRetry(cfOutput)
require.False(t, retry)
require.Equal(t, []string{"Bogus: Msg"}, fmtErrs)
cfOutput = &ec2.CreateFleetOutput{
Errors: []ec2types.CreateFleetError{
{
ErrorCode: aws.String("InsufficientInstanceCapacity"),
ErrorMessage: aws.String("Msg"),
},
},
Instances: []ec2types.CreateFleetInstance{
{
InstanceIds: []string{
"instance-id",
},
},
},
}
retry, fmtErrs = awscloud.DoCreateFleetRetry(cfOutput)
require.False(t, retry)
require.Equal(t, []string{"InsufficientInstanceCapacity: Msg", "Already launched instance ([instance-id]), aborting create fleet"}, fmtErrs)
}