test/api.sh: randomize used GCP zone from the region

The `api.sh` test currently always defaults to "<REGION>-a" zone when
creating instance using the built image. The resources in a zone may get
exhausted and the solution is to use a different zone. Currently even a
CI job retry won't help with mitigation of such error during a CI run.

Modify `api.sh` to pick random GCP zone for a given region when creating
a compute instance. Use only GCP zones which are "UP".

The `cloud-cleaner` relied on the behavior of `api.sh` to always choose
the "<REGION>-a" zone. Guessing the chosen zone in `cloud-cleaner` is
not viable, but thankfully the instance name is by default unique for
the whole GCP project. Modify `cloud-cleaner` to iterate over all
available zones in the used region and try to delete the specific
instance in each of them.

Make `ComputeZonesInRegion` method from the `internal/cloud/gcp` package
exported and use it in `cloud-cleaner` for getting the list of available
zones in a region.

Signed-off-by: Tomas Hozza <thozza@redhat.com>
This commit is contained in:
Tomas Hozza 2021-06-22 13:27:45 +02:00 committed by Ondřej Budai
parent 1fd151ce28
commit 3a0540dff0
3 changed files with 37 additions and 16 deletions

View file

@ -27,8 +27,6 @@ func cleanupGCP(testID string, wg *sync.WaitGroup) {
log.Println("[GCP] Error: 'GCP_REGION' is not set in the environment.")
return
}
// api.sh test uses '--zone="$GCP_REGION-a"'
GCPZone := fmt.Sprintf("%s-a", GCPRegion)
GCPBucket, ok := os.LookupEnv("GCP_BUCKET")
if !ok {
log.Println("[GCP] Error: 'GCP_BUCKET' is not set in the environment.")
@ -63,16 +61,31 @@ func cleanupGCP(testID string, wg *sync.WaitGroup) {
ctx := context.Background()
// Try to delete potentially running instance
log.Printf("[GCP] 🧹 Deleting VM instance %s in %s. "+
"This should fail if the test succedded.", GCPInstance, GCPZone)
err = g.ComputeInstanceDelete(ctx, GCPZone, GCPInstance)
// api.sh chooses a random GCP Zone from the set Region. Since we
// don't know which one it is, iterate over all Zones in the Region
// and try to delete the instance. Unless the instance has set
// "VmDnsSetting:ZonalOnly", which we don't do, this is safe and the
// instance name must be unique for the whole GCP project.
GCPZones, err := g.ComputeZonesInRegion(ctx, GCPRegion)
if err != nil {
log.Printf("[GCP] Error: Failed to get available Zones for the '%s' Region: %v", GCPRegion, err)
return
}
for _, GCPZone := range GCPZones {
log.Printf("[GCP] 🧹 Deleting VM instance %s in %s. "+
"This should fail if the test succeeded.", GCPInstance, GCPZone)
err = g.ComputeInstanceDelete(ctx, GCPZone, GCPInstance)
if err == nil {
// If an instance with the given name was successfully deleted in one of the Zones, we are done.
break
} else {
log.Printf("[GCP] Error: %v", err)
}
}
// Try to clean up storage of cache objects after image import job
log.Println("[GCP] 🧹 Cleaning up cache objects from storage after image " +
"import. This should fail if the test succedded.")
"import. This should fail if the test succeeded.")
cacheObjects, errs := g.StorageImageImportCleanup(ctx, GCPImage)
for _, err = range errs {
log.Printf("[GCP] Error: %v", err)
@ -94,7 +107,7 @@ func cleanupGCP(testID string, wg *sync.WaitGroup) {
}
// Try to delete the imported image
log.Printf("[GCP] 🧹 Deleting image %s. This should fail if the test succedded.", GCPImage)
log.Printf("[GCP] 🧹 Deleting image %s. This should fail if the test succeeded.", GCPImage)
err = g.ComputeImageDelete(ctx, GCPImage)
if err != nil {
log.Printf("[GCP] Error: %v", err)
@ -119,14 +132,14 @@ func cleanupAzure(testID string, wg *sync.WaitGroup) {
// Delete the vhd image
imageName := "image-" + testID + ".vhd"
log.Println("[Azure] Deleting image. This should fail if the test succedded.")
log.Println("[Azure] Deleting image. This should fail if the test succeeded.")
err = azuretest.DeleteImageFromAzure(creds, imageName)
if err != nil {
log.Printf("[Azure] Error: %v", err)
}
// Delete all remaining resources (see the full list in the CleanUpBootedVM function)
log.Println("[Azure] Cleaning up booted VM. This should fail if the test succedded.")
log.Println("[Azure] Cleaning up booted VM. This should fail if the test succeeded.")
parameters := azuretest.NewDeploymentParameters(creds, imageName, testID, "")
clientCredentialsConfig := auth.NewClientCredentialsConfig(creds.ClientID, creds.ClientSecret, creds.TenantID)
authorizer, err := clientCredentialsConfig.Authorizer()
@ -149,6 +162,7 @@ func main() {
if err != nil {
log.Fatalf("Failed to get testID: %v", err)
}
log.Printf("TEST_ID=%s", testID)
var wg sync.WaitGroup
wg.Add(2)

View file

@ -87,7 +87,7 @@ func (g *GCP) ComputeImageImport(ctx context.Context, bucket, object, imageName,
// regions than what can potentially fit into int32.
gceRegion := gceRegions[int(gceRegionIndex.Int64())]
availableZones, err := g.computeZonesInRegion(ctx, gceRegion)
availableZones, err := g.ComputeZonesInRegion(ctx, gceRegion)
if err != nil {
return nil, fmt.Errorf("failed to get available GCE Zones within Region '%s': %v", region, err)
}
@ -396,11 +396,11 @@ func (g *GCP) storageRegionToComputeRegions(ctx context.Context, region string)
}
}
// computeZonesInRegion returns list of zones within the given GCE Region, which are "UP".
// ComputeZonesInRegion returns list of zones within the given GCE Region, which are "UP".
//
// Uses:
// - Compute Engine API
func (g *GCP) computeZonesInRegion(ctx context.Context, region string) ([]string, error) {
func (g *GCP) ComputeZonesInRegion(ctx context.Context, region string) ([]string, error) {
var zones []string
computeService, err := compute.NewService(ctx, option.WithCredentials(g.creds))

View file

@ -153,10 +153,11 @@ function cleanupGCP() {
GCP_CMD="${GCP_CMD:-}"
GCP_IMAGE_NAME="${GCP_IMAGE_NAME:-}"
GCP_INSTANCE_NAME="${GCP_INSTANCE_NAME:-}"
GCP_ZONE="${GCP_ZONE:-}"
if [ -n "$GCP_CMD" ]; then
set +e
$GCP_CMD compute instances delete --zone="$GCP_REGION-a" "$GCP_INSTANCE_NAME"
$GCP_CMD compute instances delete --zone="$GCP_ZONE" "$GCP_INSTANCE_NAME"
$GCP_CMD compute images delete "$GCP_IMAGE_NAME"
set -e
fi
@ -832,12 +833,18 @@ function verifyInGCP() {
# resource ID can have max 62 characters, the $GCP_TEST_ID_HASH contains 56 characters
GCP_INSTANCE_NAME="vm-$GCP_TEST_ID_HASH"
# Randomize the used GCP zone to prevent hitting "exhausted resources" error on each test re-run
# disable Shellcheck error as the suggested alternatives are less readable for this use case
# shellcheck disable=SC2207
local GCP_ZONES=($($GCP_CMD compute zones list --filter="region=$GCP_REGION" | jq '.[] | select(.status == "UP") | .name' | tr -d '"' | tr '\n' ' '))
GCP_ZONE=${GCP_ZONES[$((RANDOM % ${#GCP_ZONES[@]}))]}
$GCP_CMD compute instances create "$GCP_INSTANCE_NAME" \
--zone="$GCP_REGION-a" \
--zone="$GCP_ZONE" \
--image-project="$GCP_PROJECT" \
--image="$GCP_IMAGE_NAME" \
--metadata-from-file=ssh-keys="$GCP_SSH_METADATA_FILE"
HOST=$($GCP_CMD compute instances describe "$GCP_INSTANCE_NAME" --zone="$GCP_REGION-a" --format='get(networkInterfaces[0].accessConfigs[0].natIP)')
HOST=$($GCP_CMD compute instances describe "$GCP_INSTANCE_NAME" --zone="$GCP_ZONE" --format='get(networkInterfaces[0].accessConfigs[0].natIP)')
echo "⏱ Waiting for GCP instance to respond to ssh"
_instanceWaitSSH "$HOST"