debian-forge-composer/internal/cloud/gcp/compute.go
Tomas Hozza 3a0540dff0 test/api.sh: randomize used GCP zone from the region
The `api.sh` test currently always defaults to "<REGION>-a" zone when
creating instance using the built image. The resources in a zone may get
exhausted and the solution is to use a different zone. Currently even a
CI job retry won't help with mitigation of such error during a CI run.

Modify `api.sh` to pick random GCP zone for a given region when creating
a compute instance. Use only GCP zones which are "UP".

The `cloud-cleaner` relied on the behavior of `api.sh` to always choose
the "<REGION>-a" zone. Guessing the chosen zone in `cloud-cleaner` is
not viable, but thankfully the instance name is by default unique for
the whole GCP project. Modify `cloud-cleaner` to iterate over all
available zones in the used region and try to delete the specific
instance in each of them.

Make `ComputeZonesInRegion` method from the `internal/cloud/gcp` package
exported and use it in `cloud-cleaner` for getting the list of available
zones in a region.

Signed-off-by: Tomas Hozza <thozza@redhat.com>
2021-07-16 10:14:30 +02:00

434 lines
17 KiB
Go

package gcp
import (
"context"
"crypto/rand"
"fmt"
"math/big"
"strings"
"time"
cloudbuild "cloud.google.com/go/cloudbuild/apiv1"
"github.com/golang/protobuf/ptypes"
"google.golang.org/api/compute/v1"
"google.golang.org/api/option"
cloudbuildpb "google.golang.org/genproto/googleapis/devtools/cloudbuild/v1"
"google.golang.org/protobuf/types/known/durationpb"
)
// ComputeImageImport imports a previously uploaded image by submitting a Cloud Build API
// job. The job builds an image into Compute Engine from an image uploaded to the
// storage.
//
// The Build job usually creates a number of cache files in the Storage.
// This method does not do any cleanup, regardless if the image import succeeds or fails.
//
// To delete the Storage object (image) used for the image import, use StorageObjectDelete().
//
// To delete all potentially left over resources after the Build job, use CloudbuildBuildCleanup().
// This is especially important in case the image import is cancelled via the passed Context.
// Cancelling the build leaves behind all resources that it created - instances and disks.
// Therefore if you don't clean up the resources, they'll continue running and costing you money.
//
// bucket - Google storage bucket name with the uploaded image
// object - Google storage object name of the uploaded image
// imageName - Desired image name after the import. This must be unique within the whole project.
// os - Specifies the OS type used when installing GCP guest tools.
// If empty (""), then the image is imported without the installation of GCP guest tools.
// Valid values are: centos-7, centos-8, debian-8, debian-9, opensuse-15, rhel-6,
// rhel-6-byol, rhel-7, rhel-7-byol, rhel-8, rhel-8-byol, sles-12,
// sles-12-byol, sles-15, sles-15-byol, sles-sap-12, sles-sap-12-byol,
// sles-sap-15, sles-sap-15-byol, ubuntu-1404, ubuntu-1604, ubuntu-1804,
// ubuntu-2004, windows-10-x64-byol, windows-10-x86-byol,
// windows-2008r2, windows-2008r2-byol, windows-2012, windows-2012-byol,
// windows-2012r2, windows-2012r2-byol, windows-2016, windows-2016-byol,
// windows-2019, windows-2019-byol, windows-7-x64-byol,
// windows-7-x86-byol, windows-8-x64-byol, windows-8-x86-byol
// region - A valid region where the resulting image should be located. If empty,
// the multi-region location closest to the source is chosen automatically.
// See: https://cloud.google.com/storage/docs/locations
//
// Uses:
// - Cloud Build API
func (g *GCP) ComputeImageImport(ctx context.Context, bucket, object, imageName, os, region string) (*cloudbuildpb.Build, error) {
cloudbuildClient, err := cloudbuild.NewClient(ctx, option.WithCredentials(g.creds))
if err != nil {
return nil, fmt.Errorf("failed to get Cloud Build client: %v", err)
}
defer cloudbuildClient.Close()
buildStepArgs := []string{
fmt.Sprintf("-source_file=gs://%s/%s", bucket, object),
fmt.Sprintf("-image_name=%s", imageName),
"-timeout=7000s",
"-client_id=api",
}
if region != "" {
buildStepArgs = append(buildStepArgs, fmt.Sprintf("-storage_location=%s", region))
// Set the region to be used by the daisy workflow when creating resources
// If not specified, the workflow seems to always default to us-central1.
// The Region passed as the argument is a Google Storage Region, which can be a multi or dual region.
// Multi and Dual regions don't work with GCE API, therefore we need to get the list of GCE regions
// that they map to. If the passed Region is not a multi or dual Region, then the returned slice contains
// only the Region passed as an argument.
gceRegions, err := g.storageRegionToComputeRegions(ctx, region)
if err != nil {
return nil, fmt.Errorf("failed to translate Google Storage Region to GCE Region: %v", err)
}
// Pick a random GCE Region to be used by the image import workflow
gceRegionIndex, err := rand.Int(rand.Reader, big.NewInt(int64(len(gceRegions))))
if err != nil {
return nil, fmt.Errorf("failed to pick random GCE Region: %v", err)
}
// The expecation is that Google won't have more regions listed for multi/dual
// regions than what can potentially fit into int32.
gceRegion := gceRegions[int(gceRegionIndex.Int64())]
availableZones, err := g.ComputeZonesInRegion(ctx, gceRegion)
if err != nil {
return nil, fmt.Errorf("failed to get available GCE Zones within Region '%s': %v", region, err)
}
// Pick random zone from the list
gceZoneIndex, err := rand.Int(rand.Reader, big.NewInt(int64(len(availableZones))))
if err != nil {
return nil, fmt.Errorf("failed to pick random GCE Zone: %v", err)
}
// The expecation is that Google won't have more zones in a region than what can potentially fit into int32
zone := availableZones[int(gceZoneIndex.Int64())]
buildStepArgs = append(buildStepArgs, fmt.Sprintf("-zone=%s", zone))
}
if os != "" {
buildStepArgs = append(buildStepArgs, fmt.Sprintf("-os=%s", os))
} else {
// This effectively marks the image as non-bootable for the import process,
// but it has no effect on the later use or booting in Compute Engine other
// than the GCP guest tools not being installed.
buildStepArgs = append(buildStepArgs, "-data_disk")
}
imageBuild := &cloudbuildpb.Build{
Steps: []*cloudbuildpb.BuildStep{{
Name: "gcr.io/compute-image-tools/gce_vm_image_import:release",
Args: buildStepArgs,
}},
Tags: []string{
"gce-daisy",
"gce-daisy-image-import",
},
Timeout: durationpb.New(time.Second * 7200),
}
createBuildReq := &cloudbuildpb.CreateBuildRequest{
ProjectId: g.creds.ProjectID,
Build: imageBuild,
}
resp, err := cloudbuildClient.CreateBuild(ctx, createBuildReq)
if err != nil {
return nil, fmt.Errorf("failed to create image import build job: %v", err)
}
// Get the returned Build struct
buildOpMetadata := &cloudbuildpb.BuildOperationMetadata{}
if err := ptypes.UnmarshalAny(resp.Metadata, buildOpMetadata); err != nil {
return nil, err
}
imageBuild = buildOpMetadata.Build
getBuldReq := &cloudbuildpb.GetBuildRequest{
ProjectId: imageBuild.ProjectId,
Id: imageBuild.Id,
}
// Wait for the build to finish
for {
select {
case <-time.After(30 * time.Second):
// Just check the build status below
case <-ctx.Done():
// cancel the build
cancelBuildReq := &cloudbuildpb.CancelBuildRequest{
ProjectId: imageBuild.ProjectId,
Id: imageBuild.Id,
}
// since the provided ctx has been canceled, create a new one to cancel the build
ctx = context.Background()
// Cancelling the build leaves behind all resources that it created (instances and disks)
imageBuild, err = cloudbuildClient.CancelBuild(ctx, cancelBuildReq)
if err != nil {
return imageBuild, fmt.Errorf("failed to cancel the image import build job: %v", err)
}
}
imageBuild, err = cloudbuildClient.GetBuild(ctx, getBuldReq)
if err != nil {
return imageBuild, fmt.Errorf("failed to get the build info: %v", err)
}
// The build finished
if imageBuild.Status != cloudbuildpb.Build_WORKING && imageBuild.Status != cloudbuildpb.Build_QUEUED {
break
}
}
if imageBuild.Status != cloudbuildpb.Build_SUCCESS {
return imageBuild, fmt.Errorf("image import didn't finish successfully: %s", imageBuild.Status)
}
return imageBuild, nil
}
// ComputeImageURL returns an image's URL to Google Cloud Console. The method does
// not check at all, if the image actually exists or not.
func (g *GCP) ComputeImageURL(imageName string) string {
return fmt.Sprintf("https://console.cloud.google.com/compute/imagesDetail/projects/%s/global/images/%s", g.creds.ProjectID, imageName)
}
// ComputeImageShare shares the specified Compute Engine image with list of accounts.
//
// "shareWith" is a list of accounts to share the image with. Items can be one
// of the following options:
//
// - `user:{emailid}`: An email address that represents a specific
// Google account. For example, `alice@example.com`.
//
// - `serviceAccount:{emailid}`: An email address that represents a
// service account. For example, `my-other-app@appspot.gserviceaccount.com`.
//
// - `group:{emailid}`: An email address that represents a Google group.
// For example, `admins@example.com`.
//
// - `domain:{domain}`: The G Suite domain (primary) that represents all
// the users of that domain. For example, `google.com` or `example.com`.
//
// Uses:
// - Compute Engine API
func (g *GCP) ComputeImageShare(ctx context.Context, imageName string, shareWith []string) error {
computeService, err := compute.NewService(ctx, option.WithCredentials(g.creds))
if err != nil {
return fmt.Errorf("failed to get Compute Engine client: %v", err)
}
// Standard role to enable account to view and use a specific Image
imageDesiredRole := "roles/compute.imageUser"
// Get the current Policy set on the Image
policy, err := computeService.Images.GetIamPolicy(g.creds.ProjectID, imageName).Context(ctx).Do()
if err != nil {
return fmt.Errorf("failed to get image's policy: %v", err)
}
// Add new members, who can use the image
// Completely override the old policy
userBinding := &compute.Binding{
Members: shareWith,
Role: imageDesiredRole,
}
newPolicy := &compute.Policy{
Bindings: []*compute.Binding{userBinding},
Etag: policy.Etag,
}
req := &compute.GlobalSetPolicyRequest{
Policy: newPolicy,
}
_, err = computeService.Images.SetIamPolicy(g.creds.ProjectID, imageName, req).Context(ctx).Do()
if err != nil {
return fmt.Errorf("failed to set new image policy: %v", err)
}
// Users won't see the shared image in their images.list requests, unless
// they are also granted a specific "imagesList" role on the project. If you
// don't need users to be able to view the list of shared images, this
// step can be skipped.
//
// Downside of granting the "imagesList" role to a project is that the user
// will be able to list all available images in the project, even those that
// they can't use because of insufficient permissions.
//
// Even without the ability to view / list shared images, the user can still
// create a Compute Engine instance using the image via API or 'gcloud' tool.
//
// Custom role to enable account to only list images in the project.
// Without this role, the account won't be able to list and see the image
// in the GCP Web UI.
// For now, the decision is that the account should not get any role to the
// project, where the image has been imported.
return nil
}
// ComputeImageDelete deletes a Compute Engine image with the given name. If the
// image existed and was successfully deleted, no error is returned.
//
// Uses:
// - Compute Engine API
func (g *GCP) ComputeImageDelete(ctx context.Context, image string) error {
computeService, err := compute.NewService(ctx, option.WithCredentials(g.creds))
if err != nil {
return fmt.Errorf("failed to get Compute Engine client: %v", err)
}
_, err = computeService.Images.Delete(g.creds.ProjectID, image).Context(ctx).Do()
return err
}
// ComputeInstanceDelete deletes a Compute Engine instance with the given name and
// running in the given zone. If the instance existed and was successfully deleted,
// no error is returned.
//
// Uses:
// - Compute Engine API
func (g *GCP) ComputeInstanceDelete(ctx context.Context, zone, instance string) error {
computeService, err := compute.NewService(ctx, option.WithCredentials(g.creds))
if err != nil {
return fmt.Errorf("failed to get Compute Engine client: %v", err)
}
_, err = computeService.Instances.Delete(g.creds.ProjectID, zone, instance).Context(ctx).Do()
return err
}
// ComputeInstanceGet fetches a Compute Engine instance information. If fetching the information
// was successful, it is returned to the caller, otherwise <nil> is returned with a proper error.
//
// Uses:
// - Compute Engine API
func (g *GCP) ComputeInstanceGet(ctx context.Context, zone, instance string) (*compute.Instance, error) {
computeService, err := compute.NewService(ctx, option.WithCredentials(g.creds))
if err != nil {
return nil, fmt.Errorf("failed to get Compute Engine client: %v", err)
}
resp, err := computeService.Instances.Get(g.creds.ProjectID, zone, instance).Context(ctx).Do()
return resp, err
}
// ComputeDiskDelete deletes a Compute Engine disk with the given name and
// running in the given zone. If the disk existed and was successfully deleted,
// no error is returned.
//
// Uses:
// - Compute Engine API
func (g *GCP) ComputeDiskDelete(ctx context.Context, zone, disk string) error {
computeService, err := compute.NewService(ctx, option.WithCredentials(g.creds))
if err != nil {
return fmt.Errorf("failed to get Compute Engine client: %v", err)
}
_, err = computeService.Disks.Delete(g.creds.ProjectID, zone, disk).Context(ctx).Do()
return err
}
// storageRegionToComputeRegion translates a Google Storage Region to GCE Region.
// This is useful mainly for multi and dual Storage Regions. For each valid multi
// or dual Region name, a slice with relevant GCE Regions is returned. If the
// Region provided as an argument is not multi or dual Region, a slice with the
// provided argument as the only item is returned.
//
// In general, Storage Regions correspond to the Compute Engine Regions. However,
// Storage allows also Multi and Dual regions, which must be mapped to GCE Regions,
// since these can not be used with GCE API calls.
//
// Uses:
// - Compute Engine API
func (g *GCP) storageRegionToComputeRegions(ctx context.Context, region string) ([]string, error) {
regionLower := strings.ToLower(region)
// Handle Dual-Regions
// https://cloud.google.com/storage/docs/locations#location-dr
if regionLower == "asia1" {
return []string{"asia-northeast1", "asia-northeast2"}, nil
} else if regionLower == "eur4" {
return []string{"europe-north1", "europe-west4"}, nil
} else if regionLower == "nam4" {
return []string{"us-central1", "us-east1"}, nil
}
// Handle Regular Region
if regionLower != "asia" && regionLower != "eu" && regionLower != "us" {
// Just return a slice with the region, which we got as
return []string{regionLower}, nil
}
// Handle Multi-Regions
// https://cloud.google.com/storage/docs/locations#location-mr
computeService, err := compute.NewService(ctx, option.WithCredentials(g.creds))
if err != nil {
return nil, fmt.Errorf("failed to get Compute Engine client: %v", err)
}
regionObjList, err := computeService.Regions.List(g.creds.ProjectID).Context(ctx).Do()
if err != nil {
return nil, fmt.Errorf("failed to get available Compute Engine Regions: %v", err)
}
regionsMap := make(map[string][]string)
for _, regionObj := range regionObjList.Items {
regionPrefix := strings.Split(regionObj.Name, "-")[0]
regionsMap[regionPrefix] = append(regionsMap[regionPrefix], regionObj.Name)
}
switch regionLower {
case "asia", "us":
return regionsMap[regionLower], nil
case "eu":
var euRegions []string
for _, euRegion := range regionsMap["europe"] {
// "europe-west2" (London) and "europe-west6" (Zurich) are excluded
// see https://cloud.google.com/storage/docs/locations#location-mr
if euRegion != "europe-west2" && euRegion != "europe-west6" {
euRegions = append(euRegions, euRegion)
}
}
return euRegions, nil
default:
// This case should never happen, since the "default" case is handled above by
// if regionLower != "asia" && regionLower != "eu" && regionLower != "us"
return nil, fmt.Errorf("failed to translate Google Storage Region '%s' to Compute Engine Region", regionLower)
}
}
// ComputeZonesInRegion returns list of zones within the given GCE Region, which are "UP".
//
// Uses:
// - Compute Engine API
func (g *GCP) ComputeZonesInRegion(ctx context.Context, region string) ([]string, error) {
var zones []string
computeService, err := compute.NewService(ctx, option.WithCredentials(g.creds))
if err != nil {
return nil, fmt.Errorf("failed to get Compute Engine client: %v", err)
}
// Get available zones in the given region
regionObj, err := computeService.Regions.Get(g.creds.ProjectID, region).Context(ctx).Do()
if err != nil {
return nil, fmt.Errorf("failed to get information about Compute Engine region '%s': %v", region, err)
}
for _, zoneURL := range regionObj.Zones {
// zone URL example - "https://www.googleapis.com/compute/v1/projects/<PROJECT_ID>/zones/us-central1-a"
zoneNameSs := strings.Split(zoneURL, "/")
zoneName := zoneNameSs[len(zoneNameSs)-1]
zoneObj, err := computeService.Zones.Get(g.creds.ProjectID, zoneName).Context(ctx).Do()
if err != nil {
return nil, fmt.Errorf("failed to get information about Compute Engine zone '%s': %v", zoneName, err)
}
// Make sure to return only Zones, which can be used
if zoneObj.Status == "UP" {
zones = append(zones, zoneName)
}
}
return zones, nil
}