debian-forge-composer/cmd/osbuild-service-maintenance/aws.go
Florian Schüller 7ebe266d3c osbuild-service-maintenance: implement removal on invalid parent
Add a safeguard to ensure secure instances without valid
parent instances are terminated, as they are unnecessary to retain.
Typically, the parent does not exist if the secure instance is
older than 2 hours, but this check provides additional validation.
HMS-3632
2024-12-10 11:43:51 +01:00

210 lines
5.7 KiB
Go

package main
import (
"context"
"fmt"
"slices"
"sync"
"time"
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/sirupsen/logrus"
"golang.org/x/sync/semaphore"
"github.com/osbuild/osbuild-composer/internal/cloud/awscloud"
)
// ChildToParentAssociation pairs an instance with the instance named in its
// "parent" tag.
type ChildToParentAssociation struct {
	// Child is the instance ID of the tagged instance; Parent is the value
	// of that instance's "parent" tag.
	Child, Parent string
}
func AWSCleanup(maxConcurrentRequests int, dryRun bool, accessKeyID, accessKey string, cutoff time.Time) error {
const region = "us-east-1"
var a *awscloud.AWS
var err error
if accessKeyID != "" && accessKey != "" {
a, err = awscloud.New(region, accessKeyID, accessKey, "")
if err != nil {
return err
}
} else {
logrus.Infof("One of AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY is missing, trying default credentials…")
a, err = awscloud.NewDefault(region)
if err != nil {
return err
}
}
regions, err := a.Regions()
if err != nil {
return err
}
for _, region := range regions {
a, err := awscloud.New(region, accessKeyID, accessKey, "")
if err != nil {
logrus.Errorf("Unable to create new aws session for region %s: %v", region, err)
continue
}
var wg sync.WaitGroup
sem := semaphore.NewWeighted(int64(maxConcurrentRequests))
images, err := a.DescribeImagesByTag("Name", "composer-api-*")
if err != nil {
logrus.Errorf("Unable to describe images for region %s: %v", region, err)
continue
}
for index, image := range images {
// TODO are these actual concerns?
if image.ImageId == nil {
logrus.Infof("ImageId is nil %v", image)
continue
}
if image.CreationDate == nil {
logrus.Infof("Image %v has nil creationdate", *image.ImageId)
continue
}
created, err := time.Parse(time.RFC3339, *image.CreationDate)
if err != nil {
logrus.Infof("Unable to parse date %s for image %s", *image.CreationDate, *image.ImageId)
continue
}
if !created.Before(cutoff) {
continue
}
if dryRun {
logrus.Infof("Dry run, aws image %s in region %s, with creation date %s would be removed", *image.ImageId, region, *image.CreationDate)
continue
}
if err = sem.Acquire(context.Background(), 1); err != nil {
logrus.Errorf("Error acquiring semaphore: %v", err)
continue
}
wg.Add(1)
go func(i int) {
defer sem.Release(1)
defer wg.Done()
err := a.RemoveSnapshotAndDeregisterImage(&images[i])
if err != nil {
logrus.Errorf("Cleanup for image %s in region %s failed: %v", *images[i].ImageId, region, err)
}
}(index)
}
wg.Wait()
}
// Terminate leftover secure instances
reservations, err := a.DescribeInstancesByTag("parent", "i-*")
if err != nil {
return fmt.Errorf("Unable to describe instances by tag %w", err)
}
instanceData := getChildParentAssociations(reservations)
var instanceIDs []string
for _, data := range instanceData {
parent, err := a.DescribeInstancesByInstanceID(data.Parent)
if err != nil {
logrus.Errorf("Error getting info of %s (parent of %s): %v", data.Parent, data.Child, err)
continue
}
if !checkValidParent(data.Child, parent) {
instanceIDs = append(instanceIDs, data.Child)
}
}
instanceIDs = filterReservations(instanceIDs, reservations)
logrus.Infof("Cleaning up executor instances: %v", instanceIDs)
if !dryRun {
err = a.TerminateInstances(instanceIDs)
if err != nil {
return fmt.Errorf("Unable to terminate secure instances: %w", err)
}
} else {
logrus.Info("Dry run, didn't actually terminate any instances")
}
return nil
}
// filterReservations appends to instanceIDs the ID of every instance in
// reservations that was launched more than two hours ago, and returns the
// resulting slice. IDs that are already in the slice are not added a second
// time. Instances without an ID or a launch time cannot be judged and are
// skipped.
func filterReservations(instanceIDs []string, reservations []ec2types.Reservation) []string {
	// Compute the age limit once so every instance is compared against the
	// same instant.
	ageLimit := time.Now().Add(-2 * time.Hour)

	for _, res := range reservations {
		for _, i := range res.Instances {
			if i.InstanceId == nil {
				logrus.Infof("InstanceId is nil %v", i)
				continue
			}
			if i.LaunchTime == nil {
				logrus.Infof("Instance %s has nil launch time", *i.InstanceId)
				continue
			}
			if !i.LaunchTime.Before(ageLimit) {
				continue
			}

			logrus.Infof("Instance %s is too old", *i.InstanceId)
			if !slices.Contains(instanceIDs, *i.InstanceId) {
				instanceIDs = append(instanceIDs, *i.InstanceId)
			}
		}
	}
	return instanceIDs
}
// getChildParentAssociations collects one ChildToParentAssociation for every
// "parent" tag found on the instances in reservations: Child is the ID of
// the tagged instance, Parent is the tag's value. Instances without an ID
// and "parent" tags without a value are skipped, since no association can
// be built from them.
func getChildParentAssociations(reservations []ec2types.Reservation) []ChildToParentAssociation {
	var associations []ChildToParentAssociation
	for _, res := range reservations {
		for _, i := range res.Instances {
			if i.InstanceId == nil {
				continue
			}
			for _, t := range i.Tags {
				if t.Key == nil || *t.Key != "parent" {
					continue
				}
				if t.Value == nil {
					continue
				}
				associations = append(associations, ChildToParentAssociation{
					Child:  *i.InstanceId,
					Parent: *t.Value,
				})
			}
		}
	}
	return associations
}
// checkValidParent reports whether the instance childId should be kept, based
// on the reservations returned when looking up its parent.
//
// It returns false (the caller terminates the child) when no reservation or
// no instance was found for the parent, or when the single parent instance is
// neither running nor pending. It returns true when the parent is running or
// pending, and also in every unexpected situation (several reservations,
// several instances, or a parent without state information), so that nothing
// is terminated on inconsistent data.
func checkValidParent(childId string, parent []ec2types.Reservation) bool {
	if len(parent) == 0 {
		logrus.Infof("Instance %s has no parent, removing it", childId)
		return false
	}
	if len(parent) != 1 {
		logrus.Errorf("Instance %s has %d parents. That should never happen, not changing anything here.", childId, len(parent))
		return true
	}

	instances := parent[0].Instances
	if len(instances) == 0 {
		logrus.Infof("Instance %s has no parent instance, removing it", childId)
		return false
	}
	if len(instances) != 1 {
		logrus.Errorf("Instance %s has %d parent instances. That should never happen, not changing anything here.", childId, len(instances))
		return true
	}

	parentInstance := instances[0]
	// Without state information the parent cannot be judged; stay on the
	// safe side and keep the child.
	if parentInstance.State == nil {
		logrus.Errorf("Instance %s has a parent without state information. That should never happen, not changing anything here.", childId)
		return true
	}

	parentState := parentInstance.State.Name
	if parentState == ec2types.InstanceStateNameRunning || parentState == ec2types.InstanceStateNamePending {
		return true
	}

	// The ID is only needed for the log line; do not fail on a missing one.
	parentID := "<unknown>"
	if parentInstance.InstanceId != nil {
		parentID = *parentInstance.InstanceId
	}
	logrus.Infof("Instance %s has a parent (%s) in state %s, so we'll terminate %s.", childId, parentID, parentState, childId)
	return false
}
// allTerminated reports whether no instance in reservations is still in use.
// It returns false as soon as it finds an instance whose state is known and
// is not "terminated"; instances without state information are ignored.
// An empty list of reservations yields true.
func allTerminated(reservations []ec2types.Reservation) bool {
	for r := range reservations {
		instances := reservations[r].Instances
		for k := range instances {
			state := instances[k].State
			if state == nil || state.Name == ec2types.InstanceStateNameTerminated {
				continue
			}
			return false
		}
	}
	return true
}