awscloud: add very verbose logging to createFleet creation
We still see this error sometimes: "Unable to start secure instance: Unable to create fleet: InsufficientInstanceCapacity: There is no Spot capacity available that matches your request". This is awkward because the message mentions that there is no Spot capacity, even though the current code should retry on InsufficientInstanceCapacity. I also confirmed this by searching for the retry log messages: there are none in the logs. We need a bigger hammer. Let's log everything that happens in the createFleet method in order to get a better understanding of why the retry logic isn't triggered. We should probably move most of the newly added logs to the debug level, but let's delay that until we have more insight into what's happening.
This commit is contained in:
parent
54ffc08814
commit
64ff0e3dad
1 changed files with 19 additions and 0 deletions
|
|
@ -3,6 +3,7 @@ package awscloud
|
|||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
|
|
@ -592,6 +593,7 @@ func (a *AWS) deleteSGIfExists(si *SecureInstance) error {
|
|||
}
|
||||
|
||||
func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput, error) {
|
||||
logCreateFleetInput(input)
|
||||
createFleetOutput, err := a.ec2.CreateFleet(context.Background(), input)
|
||||
if err != nil {
|
||||
return createFleetOutput, fmt.Errorf("Unable to create spot fleet: %w", err)
|
||||
|
|
@ -602,20 +604,26 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput,
|
|||
logrus.Warnf("Received errors (%s) from CreateFleet, retrying CreateFleet with OnDemand instance", strings.Join(fleetErrs, "; "))
|
||||
input.SpotOptions = nil
|
||||
input.TargetCapacitySpecification.DefaultTargetCapacityType = ec2types.DefaultTargetCapacityTypeOnDemand
|
||||
logCreateFleetInput(input)
|
||||
createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input)
|
||||
if err != nil {
|
||||
return createFleetOutput, fmt.Errorf("Unable to create on demand fleet: %w", err)
|
||||
}
|
||||
} else {
|
||||
logrus.Infof("Won't retry CreateFleet with OnDemand instance, retry: %v, errors: %s", retry, strings.Join(fleetErrs, "; "))
|
||||
}
|
||||
|
||||
retry, fleetErrs = doCreateFleetRetry(createFleetOutput)
|
||||
if len(fleetErrs) > 0 && retry {
|
||||
logrus.Warnf("Received errors (%s) from CreateFleet with OnDemand instance option, retrying across availability zones", strings.Join(fleetErrs, "; "))
|
||||
input.LaunchTemplateConfigs[0].Overrides = nil
|
||||
logCreateFleetInput(input)
|
||||
createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input)
|
||||
if err != nil {
|
||||
return createFleetOutput, fmt.Errorf("Unable to create on demand fleet across AZs: %w", err)
|
||||
}
|
||||
} else {
|
||||
logrus.Infof("Won't retry CreateFleet across AZs, retry: %v, errors: %s", retry, strings.Join(fleetErrs, "; "))
|
||||
}
|
||||
|
||||
if len(createFleetOutput.Errors) > 0 {
|
||||
|
|
@ -650,15 +658,26 @@ func doCreateFleetRetry(cfOutput *ec2.CreateFleetOutput) (bool, []string) {
|
|||
logrus.Infof("Checking to retry fleet create on error %s (msg: %s)", *err.ErrorCode, *err.ErrorMessage)
|
||||
if slices.Contains(retryCodes, *err.ErrorCode) {
|
||||
retry = true
|
||||
logrus.Infof("doCreateFleetRetry: setting retry to true")
|
||||
}
|
||||
msg = append(msg, fmt.Sprintf("%s: %s", *err.ErrorCode, *err.ErrorMessage))
|
||||
}
|
||||
|
||||
// Do not retry in case an instance already exists, in that case just fail and let the worker terminate the SI
|
||||
if len(cfOutput.Instances) > 0 && len(cfOutput.Instances[0].InstanceIds) > 0 {
|
||||
logrus.Infof("doCreateFleetRetry: cancelling retry, instance already exists: %s", cfOutput.Instances[0].InstanceIds)
|
||||
retry = false
|
||||
msg = append(msg, fmt.Sprintf("Already launched instance (%s), aborting create fleet", cfOutput.Instances[0].InstanceIds))
|
||||
}
|
||||
|
||||
logrus.Infof("doCreateFleetRetry: returning retry: %v, msg: %v", retry, msg)
|
||||
return retry, msg
|
||||
}
|
||||
|
||||
func logCreateFleetInput(input *ec2.CreateFleetInput) {
|
||||
if inputJSON, err := json.Marshal(input); err != nil {
|
||||
logrus.Warnf("Unable to marshal input for logging: %v", input)
|
||||
} else {
|
||||
logrus.Infof("Creating fleet with input: %s", inputJSON)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue