From 3a0540dff0f225d8626221f6a143d87afc1e6493 Mon Sep 17 00:00:00 2001 From: Tomas Hozza Date: Tue, 22 Jun 2021 13:27:45 +0200 Subject: [PATCH] test/api.sh: randomize used GCP zone from the region The `api.sh` test currently always defaults to "-a" zone when creating instance using the built image. The resources in a zone may get exhausted and the solution is to use a different zone. Currently even a CI job retry won't help with mitigation of such error during a CI run. Modify `api.sh` to pick random GCP zone for a given region when creating a compute instance. Use only GCP zones which are "UP". The `cloud-cleaner` relied on the behavior of `api.sh` to always choose the "-a" zone. Guessing the chosen zone in `cloud-cleaner` is not viable, but thankfully the instance name is by default unique for the whole GCP project. Modify `cloud-cleaner` to iterate over all available zones in the used region and try to delete the specific instance in each of them. Make `ComputeZonesInRegion` method from the `internal/cloud/gcp` package exported and use it in `cloud-cleaner` for getting the list of available zones in a region. Signed-off-by: Tomas Hozza --- cmd/cloud-cleaner/main.go | 34 ++++++++++++++++++++++++---------- internal/cloud/gcp/compute.go | 6 +++--- test/cases/api.sh | 13 ++++++++++--- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/cmd/cloud-cleaner/main.go b/cmd/cloud-cleaner/main.go index 411171bd0..a53c7dafe 100644 --- a/cmd/cloud-cleaner/main.go +++ b/cmd/cloud-cleaner/main.go @@ -27,8 +27,6 @@ func cleanupGCP(testID string, wg *sync.WaitGroup) { log.Println("[GCP] Error: 'GCP_REGION' is not set in the environment.") return } - // api.sh test uses '--zone="$GCP_REGION-a"' - GCPZone := fmt.Sprintf("%s-a", GCPRegion) GCPBucket, ok := os.LookupEnv("GCP_BUCKET") if !ok { log.Println("[GCP] Error: 'GCP_BUCKET' is not set in the environment.") @@ -63,16 +61,31 @@ func cleanupGCP(testID string, wg *sync.WaitGroup) { ctx := context.Background() // Try to delete potentially running instance - log.Printf("[GCP] 🧹 Deleting VM instance %s in %s. "+ - "This should fail if the test succedded.", GCPInstance, GCPZone) - err = g.ComputeInstanceDelete(ctx, GCPZone, GCPInstance) + // api.sh chooses a random GCP Zone from the set Region. Since we + // don't know which one it is, iterate over all Zones in the Region + // and try to delete the instance. Unless the instance has set + // "VmDnsSetting:ZonalOnly", which we don't do, this is safe and the + // instance name must be unique for the whole GCP project. + GCPZones, err := g.ComputeZonesInRegion(ctx, GCPRegion) if err != nil { - log.Printf("[GCP] Error: %v", err) + log.Printf("[GCP] Error: Failed to get available Zones for the '%s' Region: %v", GCPRegion, err) + return + } + for _, GCPZone := range GCPZones { + log.Printf("[GCP] 🧹 Deleting VM instance %s in %s. "+ + "This should fail if the test succeeded.", GCPInstance, GCPZone) + err = g.ComputeInstanceDelete(ctx, GCPZone, GCPInstance) + if err == nil { + // If an instance with the given name was successfully deleted in one of the Zones, we are done. + break + } else { + log.Printf("[GCP] Error: %v", err) + } } // Try to clean up storage of cache objects after image import job log.Println("[GCP] 🧹 Cleaning up cache objects from storage after image " + - "import. This should fail if the test succedded.") + "import. This should fail if the test succeeded.") cacheObjects, errs := g.StorageImageImportCleanup(ctx, GCPImage) for _, err = range errs { log.Printf("[GCP] Error: %v", err) @@ -94,7 +107,7 @@ func cleanupGCP(testID string, wg *sync.WaitGroup) { } // Try to delete the imported image - log.Printf("[GCP] 🧹 Deleting image %s. This should fail if the test succedded.", GCPImage) + log.Printf("[GCP] 🧹 Deleting image %s. This should fail if the test succeeded.", GCPImage) err = g.ComputeImageDelete(ctx, GCPImage) if err != nil { log.Printf("[GCP] Error: %v", err) @@ -119,14 +132,14 @@ func cleanupAzure(testID string, wg *sync.WaitGroup) { // Delete the vhd image imageName := "image-" + testID + ".vhd" - log.Println("[Azure] Deleting image. This should fail if the test succedded.") + log.Println("[Azure] Deleting image. This should fail if the test succeeded.") err = azuretest.DeleteImageFromAzure(creds, imageName) if err != nil { log.Printf("[Azure] Error: %v", err) } // Delete all remaining resources (see the full list in the CleanUpBootedVM function) - log.Println("[Azure] Cleaning up booted VM. This should fail if the test succedded.") + log.Println("[Azure] Cleaning up booted VM. This should fail if the test succeeded.") parameters := azuretest.NewDeploymentParameters(creds, imageName, testID, "") clientCredentialsConfig := auth.NewClientCredentialsConfig(creds.ClientID, creds.ClientSecret, creds.TenantID) authorizer, err := clientCredentialsConfig.Authorizer() @@ -149,6 +162,7 @@ func main() { if err != nil { log.Fatalf("Failed to get testID: %v", err) } + log.Printf("TEST_ID=%s", testID) var wg sync.WaitGroup wg.Add(2) diff --git a/internal/cloud/gcp/compute.go b/internal/cloud/gcp/compute.go index e631ba646..5f1f72a59 100644 --- a/internal/cloud/gcp/compute.go +++ b/internal/cloud/gcp/compute.go @@ -87,7 +87,7 @@ func (g *GCP) ComputeImageImport(ctx context.Context, bucket, object, imageName, // regions than what can potentially fit into int32. gceRegion := gceRegions[int(gceRegionIndex.Int64())] - availableZones, err := g.computeZonesInRegion(ctx, gceRegion) + availableZones, err := g.ComputeZonesInRegion(ctx, gceRegion) if err != nil { return nil, fmt.Errorf("failed to get available GCE Zones within Region '%s': %v", region, err) } @@ -396,11 +396,11 @@ func (g *GCP) storageRegionToComputeRegions(ctx context.Context, region string) } } -// computeZonesInRegion returns list of zones within the given GCE Region, which are "UP". +// ComputeZonesInRegion returns list of zones within the given GCE Region, which are "UP". // // Uses: // - Compute Engine API -func (g *GCP) computeZonesInRegion(ctx context.Context, region string) ([]string, error) { +func (g *GCP) ComputeZonesInRegion(ctx context.Context, region string) ([]string, error) { var zones []string computeService, err := compute.NewService(ctx, option.WithCredentials(g.creds)) diff --git a/test/cases/api.sh b/test/cases/api.sh index ad78bcc72..93fce1032 100755 --- a/test/cases/api.sh +++ b/test/cases/api.sh @@ -153,10 +153,11 @@ function cleanupGCP() { GCP_CMD="${GCP_CMD:-}" GCP_IMAGE_NAME="${GCP_IMAGE_NAME:-}" GCP_INSTANCE_NAME="${GCP_INSTANCE_NAME:-}" + GCP_ZONE="${GCP_ZONE:-}" if [ -n "$GCP_CMD" ]; then set +e - $GCP_CMD compute instances delete --zone="$GCP_REGION-a" "$GCP_INSTANCE_NAME" + $GCP_CMD compute instances delete --zone="$GCP_ZONE" "$GCP_INSTANCE_NAME" $GCP_CMD compute images delete "$GCP_IMAGE_NAME" set -e fi @@ -832,12 +833,18 @@ function verifyInGCP() { # resource ID can have max 62 characters, the $GCP_TEST_ID_HASH contains 56 characters GCP_INSTANCE_NAME="vm-$GCP_TEST_ID_HASH" + # Randomize the used GCP zone to prevent hitting "exhausted resources" error on each test re-run + # disable Shellcheck error as the suggested alternatives are less readable for this use case + # shellcheck disable=SC2207 + local GCP_ZONES=($($GCP_CMD compute zones list --filter="region=$GCP_REGION" | jq '.[] | select(.status == "UP") | .name' | tr -d '"' | tr '\n' ' ')) + GCP_ZONE=${GCP_ZONES[$((RANDOM % ${#GCP_ZONES[@]}))]} + $GCP_CMD compute instances create "$GCP_INSTANCE_NAME" \ - --zone="$GCP_REGION-a" \ + --zone="$GCP_ZONE" \ --image-project="$GCP_PROJECT" \ --image="$GCP_IMAGE_NAME" \ --metadata-from-file=ssh-keys="$GCP_SSH_METADATA_FILE" - HOST=$($GCP_CMD compute instances describe "$GCP_INSTANCE_NAME" --zone="$GCP_REGION-a" --format='get(networkInterfaces[0].accessConfigs[0].natIP)') + HOST=$($GCP_CMD compute instances describe "$GCP_INSTANCE_NAME" --zone="$GCP_ZONE" --format='get(networkInterfaces[0].accessConfigs[0].natIP)') echo "⏱ Waiting for GCP instance to respond to ssh" _instanceWaitSSH "$HOST"