From f6feb7675b5108de55789aedef32cb29f3ee8d72 Mon Sep 17 00:00:00 2001 From: Sanne Raymaekers Date: Mon, 2 Dec 2024 15:25:32 +0100 Subject: [PATCH] cloud/awscloud: use any instance create fleet returns Even in case of errors, as long as create fleet returns an instance, attempt to use it. In some cases AWS returns `InsufficientInstanceCapacity` but still creates an instance: ``` msg="Won't retry CreateFleet with OnDemand instance, retry: false, errors: InsufficientInstanceCapacity: There is no Spot capacity available that matches your request.; Already launched instance ([i-...]), aborting create fleet" msg="doCreateFleetRetry: returning retry: false, msg: [InsufficientInstanceCapacity: There is no Spot capacity available that matches your request. Already launched instance ([i-...]), aborting create fleet]" msg="doCreateFleetRetry: cancelling retry, instance already exists: [i-...]" msg="doCreateFleetRetry: setting retry to true" msg="Checking to retry fleet create on error InsufficientInstanceCapacity (msg: There is no Spot capacity available that matches your request.)" ``` --- internal/cloud/awscloud/secure-instance.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/internal/cloud/awscloud/secure-instance.go b/internal/cloud/awscloud/secure-instance.go index aed2a72e0..d24d4aeff 100644 --- a/internal/cloud/awscloud/secure-instance.go +++ b/internal/cloud/awscloud/secure-instance.go @@ -170,6 +170,7 @@ func (a *AWS) RunSecureInstance(iamProfile, keyName, cloudWatchGroup, hostname s }, Type: ec2types.FleetTypeInstant, }) + // retrieve any instance information even if there's an error, that way the instance // will be terminated before other resources are removed. if createFleetOutput != nil { @@ -181,7 +182,11 @@ func (a *AWS) RunSecureInstance(iamProfile, keyName, cloudWatchGroup, hostname s } } if err != nil { - return nil, err + if secureInstance.FleetID == "" || secureInstance.InstanceID == "" { + logrus.Infof("CreateFleet returned an error (%v), without either an instance (%s) or a fleet (%s)", err, secureInstance.InstanceID, secureInstance.FleetID) + return nil, err + } + logrus.Warnf("CreateFleet returned an error (%v) but also created an instance (%s) in fleet (%s), continuing as normal.", err, secureInstance.InstanceID, secureInstance.FleetID) } instWaiter := ec2.NewInstanceStatusOkWaiter(a.ec2)