cmd: use standard library log instead of logrus in service-maintenance

This commit is contained in:
Lukas Zapletal 2025-05-06 14:53:33 +02:00 committed by Achilleas Koutsou
parent 46f0a71053
commit 0e5d8a94f2
4 changed files with 83 additions and 89 deletions

View file

@ -4,12 +4,12 @@ import (
"context"
"errors"
"fmt"
"log"
"slices"
"sync"
"time"
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/sirupsen/logrus"
"golang.org/x/sync/semaphore"
"github.com/osbuild/osbuild-composer/internal/cloud/awscloud"
@ -33,7 +33,7 @@ func AWSCleanup(maxConcurrentRequests int, dryRun bool, accessKeyID, accessKey s
return err
}
} else {
logrus.Infof("One of AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY is missing, trying default credentials…")
log.Printf("One of AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY is missing, trying default credentials…")
a, err = awscloud.NewDefault(region)
if err != nil {
return err
@ -48,7 +48,7 @@ func AWSCleanup(maxConcurrentRequests int, dryRun bool, accessKeyID, accessKey s
for _, region := range regions {
a, err := awscloud.New(region, accessKeyID, accessKey, "")
if err != nil {
logrus.Errorf("Unable to create new aws session for region %s: %v", region, err)
log.Printf("Unable to create new aws session for region %s: %v", region, err)
continue
}
@ -56,24 +56,24 @@ func AWSCleanup(maxConcurrentRequests int, dryRun bool, accessKeyID, accessKey s
sem := semaphore.NewWeighted(int64(maxConcurrentRequests))
images, err := a.DescribeImagesByTag("Name", "composer-api-*")
if err != nil {
logrus.Errorf("Unable to describe images for region %s: %v", region, err)
log.Printf("Unable to describe images for region %s: %v", region, err)
continue
}
for index, image := range images {
// TODO: confirm whether ImageId or CreationDate can actually be nil here; if not, the guards below are unnecessary.
if image.ImageId == nil {
logrus.Infof("ImageId is nil %v", image)
log.Printf("ImageId is nil %v", image)
continue
}
if image.CreationDate == nil {
logrus.Infof("Image %v has nil creationdate", *image.ImageId)
log.Printf("Image %v has nil creationdate", *image.ImageId)
continue
}
created, err := time.Parse(time.RFC3339, *image.CreationDate)
if err != nil {
logrus.Infof("Unable to parse date %s for image %s", *image.CreationDate, *image.ImageId)
log.Printf("Unable to parse date %s for image %s", *image.CreationDate, *image.ImageId)
continue
}
@ -82,12 +82,12 @@ func AWSCleanup(maxConcurrentRequests int, dryRun bool, accessKeyID, accessKey s
}
if dryRun {
logrus.Infof("Dry run, aws image %s in region %s, with creation date %s would be removed", *image.ImageId, region, *image.CreationDate)
log.Printf("Dry run, aws image %s in region %s, with creation date %s would be removed", *image.ImageId, region, *image.CreationDate)
continue
}
if err = sem.Acquire(ctx, 1); err != nil {
logrus.Errorf("Error acquiring semaphore: %v", err)
log.Printf("Error acquiring semaphore: %v", err)
continue
}
wg.Add(1)
@ -98,32 +98,30 @@ func AWSCleanup(maxConcurrentRequests int, dryRun bool, accessKeyID, accessKey s
err := a.RemoveSnapshotAndDeregisterImage(&images[i])
if err != nil {
logrus.Errorf("Cleanup for image %s in region %s failed: %v", *images[i].ImageId, region, err)
log.Printf("Cleanup for image %s in region %s failed: %v", *images[i].ImageId, region, err)
}
}(index)
}
wg.Wait()
}
// using `errs` to collect all errors as we want to
// continue execution if only one cleanup fails
var errs []error
err = terminateOrphanedSecureInstances(a, dryRun)
if err != nil {
logrus.Errorf("Error in terminating secure instances: %v, continuing other cleanup.", err)
log.Printf("Error in terminating secure instances: %v, continuing other cleanup.", err)
errs = append(errs, err)
}
err = searchSGAndCleanup(ctx, a, dryRun)
if err != nil {
logrus.Errorf("Error in cleaning up security groups: %v", err)
log.Printf("Error in cleaning up security groups: %v", err)
errs = append(errs, err)
}
err = searchLTAndCleanup(ctx, a, dryRun)
if err != nil {
logrus.Errorf("Error in cleaning up launch templates: %v", err)
log.Printf("Error in cleaning up launch templates: %v", err)
errs = append(errs, err)
}
@ -143,7 +141,7 @@ func terminateOrphanedSecureInstances(a *awscloud.AWS, dryRun bool) error {
for _, data := range instanceData {
parent, err := a.DescribeInstancesByInstanceID(data.Parent)
if err != nil {
logrus.Errorf("Error getting info of %s (parent of %s): %v", data.Parent, data.Child, err)
log.Printf("Error getting info of %s (parent of %s): %v", data.Parent, data.Child, err)
continue
}
@ -153,7 +151,7 @@ func terminateOrphanedSecureInstances(a *awscloud.AWS, dryRun bool) error {
}
instanceIDs = filterOnTooOld(instanceIDs, reservations)
logrus.Infof("Cleaning up executor instances: %v", instanceIDs)
log.Printf("Cleaning up executor instances: %v", instanceIDs)
if !dryRun {
if len(instanceIDs) > 0 {
err = a.TerminateInstances(instanceIDs)
@ -162,7 +160,7 @@ func terminateOrphanedSecureInstances(a *awscloud.AWS, dryRun bool) error {
}
}
} else {
logrus.Info("Dry run, didn't actually terminate any instances")
log.Print("Dry run, didn't actually terminate any instances")
}
return nil
}
@ -171,7 +169,7 @@ func filterOnTooOld(instanceIDs []string, reservations []ec2types.Reservation) [
for _, res := range reservations {
for _, i := range res.Instances {
if i.LaunchTime.Before(time.Now().Add(-time.Hour * 2)) {
logrus.Infof("Instance %s is too old", *i.InstanceId)
log.Printf("Instance %s is too old", *i.InstanceId)
if !slices.Contains(instanceIDs, *i.InstanceId) {
instanceIDs = append(instanceIDs, *i.InstanceId)
}
@ -201,19 +199,19 @@ func getChildParentAssociations(reservations []ec2types.Reservation) []ChildToPa
func checkValidParent(childId string, parent []ec2types.Reservation) bool {
if len(parent) == 0 {
logrus.Infof("Instance %s has no parent, removing it", childId)
log.Printf("Instance %s has no parent, removing it", childId)
return false
}
if len(parent) != 1 {
logrus.Errorf("Instance %s has %d parents. That should never happen, not changing anything here.", childId, len(parent))
log.Printf("Instance %s has %d parents. That should never happen, not changing anything here.", childId, len(parent))
return true
}
if len(parent[0].Instances) == 0 {
logrus.Infof("Instance %s has no parent instance, removing it", childId)
log.Printf("Instance %s has no parent instance, removing it", childId)
return false
}
if len(parent[0].Instances) != 1 {
logrus.Errorf("Instance %s has %d parent instances. That should never happen, not changing anything here.", childId, len(parent[0].Instances))
log.Printf("Instance %s has %d parent instances. That should never happen, not changing anything here.", childId, len(parent[0].Instances))
return true
}
@ -221,7 +219,7 @@ func checkValidParent(childId string, parent []ec2types.Reservation) bool {
if parentState != ec2types.InstanceStateNameTerminated {
return true
}
logrus.Infof("Instance %s has a parent (%s) in state %s, so we'll terminate %s.", childId, *parent[0].Instances[0].InstanceId, parentState, childId)
log.Printf("Instance %s has a parent (%s) in state %s, so we'll terminate %s.", childId, *parent[0].Instances[0].InstanceId, parentState, childId)
return false
}
@ -233,7 +231,7 @@ func searchSGAndCleanup(ctx context.Context, a *awscloud.AWS, dryRun bool) error
for _, sg := range securityGroups {
if sg.GroupId == nil || sg.GroupName == nil {
logrus.Errorf(
log.Printf(
"Security Group needs to have a GroupId (%v) and a GroupName (%v).",
sg.GroupId,
sg.GroupName)
@ -241,22 +239,22 @@ func searchSGAndCleanup(ctx context.Context, a *awscloud.AWS, dryRun bool) error
}
reservations, err := a.DescribeInstancesBySecurityGroupID(*sg.GroupId)
if err != nil {
logrus.Errorf("Failed to describe security group %s: %v", *sg.GroupId, err)
log.Printf("Failed to describe security group %s: %v", *sg.GroupId, err)
continue
}
// If no instance is running/pending, delete the SG
if allTerminated(reservations) {
logrus.Infof("Deleting security group: %s (%s)", *sg.GroupName, *sg.GroupId)
log.Printf("Deleting security group: %s (%s)", *sg.GroupName, *sg.GroupId)
if !dryRun {
err := a.DeleteSecurityGroupById(ctx, sg.GroupId)
if err != nil {
logrus.Errorf("Failed to delete security group %s: %v", *sg.GroupId, err)
log.Printf("Failed to delete security group %s: %v", *sg.GroupId, err)
}
}
} else {
logrus.Debugf("Security group %s has non terminated instances associated with it.", *sg.GroupId)
log.Printf("Security group %s has non terminated instances associated with it.", *sg.GroupId)
}
}
return nil
@ -283,7 +281,7 @@ func searchLTAndCleanup(ctx context.Context, a *awscloud.AWS, dryRun bool) error
for _, lt := range launchTemplates {
if lt.LaunchTemplateName == nil || lt.LaunchTemplateId == nil {
logrus.Errorf(
log.Printf(
"Launch template needs to have a LaunchTemplateName (%v) and a LaunchTemplateId (%v).",
lt.LaunchTemplateName,
lt.LaunchTemplateId)
@ -292,21 +290,21 @@ func searchLTAndCleanup(ctx context.Context, a *awscloud.AWS, dryRun bool) error
reservations, err := a.DescribeInstancesByLaunchTemplateID(*lt.LaunchTemplateId)
if err != nil {
logrus.Errorf("Failed to describe launch template %s: %v", *lt.LaunchTemplateId, err)
log.Printf("Failed to describe launch template %s: %v", *lt.LaunchTemplateId, err)
continue
}
if allTerminated(reservations) {
logrus.Infof("Deleting launch template: %s (%s)\n", *lt.LaunchTemplateName, *lt.LaunchTemplateId)
log.Printf("Deleting launch template: %s (%s)\n", *lt.LaunchTemplateName, *lt.LaunchTemplateId)
if !dryRun {
err := a.DeleteLaunchTemplateById(ctx, lt.LaunchTemplateId)
if err != nil {
logrus.Errorf("Failed to delete launch template %s: %v", *lt.LaunchTemplateId, err)
log.Printf("Failed to delete launch template %s: %v", *lt.LaunchTemplateId, err)
}
}
} else {
fmt.Printf("Launch template %s has non terminated instances associated with it.\n", *lt.LaunchTemplateId)
log.Printf("Launch template %s has non terminated instances associated with it.\n", *lt.LaunchTemplateId)
}
}
return nil

View file

@ -3,10 +3,10 @@ package main
import (
"context"
"fmt"
"log"
"time"
"github.com/jackc/pgx/v4"
"github.com/sirupsen/logrus"
)
const (
@ -94,31 +94,30 @@ func (d *db) LogVacuumStats() error {
return err
}
logrus.Infof("Stats for table %s", relName)
logrus.Infof(" Total table size: %s", relSize)
logrus.Info(" Tuples:")
logrus.Infof(" Inserted: %d", ins)
logrus.Infof(" Updated: %d", upd)
logrus.Infof(" Deleted: %d", del)
logrus.Infof(" Live: %d", live)
logrus.Infof(" Dead: %d", dead)
logrus.Info(" Vacuum stats:")
logrus.Infof(" Vacuum count: %d", vc)
logrus.Infof(" AutoVacuum count: %d", avc)
logrus.Infof(" Last vacuum: %v", lvc)
logrus.Infof(" Last autovacuum: %v", lavc)
logrus.Info(" Analyze stats:")
logrus.Infof(" Analyze count: %d", ac)
logrus.Infof(" AutoAnalyze count: %d", aac)
logrus.Infof(" Last analyze: %v", lan)
logrus.Infof(" Last autoanalyze: %v", laan)
logrus.Info("---")
log.Printf("Stats for table %s", relName)
log.Printf(" Total table size: %s", relSize)
log.Println(" Tuples:")
log.Printf(" Inserted: %d", ins)
log.Printf(" Updated: %d", upd)
log.Printf(" Deleted: %d", del)
log.Printf(" Live: %d", live)
log.Printf(" Dead: %d", dead)
log.Println(" Vacuum stats:")
log.Printf(" Vacuum count: %d", vc)
log.Printf(" AutoVacuum count: %d", avc)
log.Printf(" Last vacuum: %v", lvc)
log.Printf(" Last autovacuum: %v", lavc)
log.Println(" Analyze stats:")
log.Printf(" Analyze count: %d", ac)
log.Printf(" AutoAnalyze count: %d", aac)
log.Printf(" Last analyze: %v", lan)
log.Printf(" Last autoanalyze: %v", laan)
log.Println("---")
}
if rows.Err() != nil {
return rows.Err()
}
return nil
}
func DBCleanup(dbURL string, dryRun bool, cutoff time.Time) error {
@ -129,7 +128,7 @@ func DBCleanup(dbURL string, dryRun bool, cutoff time.Time) error {
err = db.LogVacuumStats()
if err != nil {
logrus.Errorf("Error running vacuum stats: %v", err)
log.Printf("Error running vacuum stats: %v", err)
}
var rows int64
@ -138,21 +137,21 @@ func DBCleanup(dbURL string, dryRun bool, cutoff time.Time) error {
if dryRun {
rows, err = db.ExpiredJobCount()
if err != nil {
logrus.Warningf("Error querying expired jobs: %v", err)
log.Printf("Error querying expired jobs: %v", err)
}
logrus.Infof("Dryrun, expired job count: %d", rows)
log.Printf("Dryrun, expired job count: %d", rows)
break
}
rows, err = db.DeleteJobs()
if err != nil {
logrus.Errorf("Error deleting jobs: %v, %d rows affected", rows, err)
log.Printf("Error deleting jobs: %v, %d rows affected", err, rows)
return err
}
err = db.VacuumAnalyze()
if err != nil {
logrus.Errorf("Error running vacuum analyze: %v", err)
log.Printf("Error running vacuum analyze: %v", err)
return err
}
@ -160,12 +159,12 @@ func DBCleanup(dbURL string, dryRun bool, cutoff time.Time) error {
break
}
logrus.Infof("Deleted results for %d", rows)
log.Printf("Deleted results for %d", rows)
}
err = db.LogVacuumStats()
if err != nil {
logrus.Errorf("Error running vacuum stats: %v", err)
log.Printf("Error running vacuum stats: %v", err)
}
return nil

View file

@ -3,11 +3,11 @@ package main
import (
"context"
"fmt"
"log"
"sync"
"time"
compute "cloud.google.com/go/compute/apiv1"
"github.com/sirupsen/logrus"
"golang.org/x/sync/semaphore"
"google.golang.org/api/iterator"
@ -29,12 +29,12 @@ func GCPCleanup(creds []byte, maxConcurrentRequests int, dryRun bool, cutoff tim
break
}
if err != nil {
logrus.Fatalf("Error iterating over list of images: %v", err)
log.Fatalf("Error iterating over list of images: %v", err)
}
created, err := time.Parse(time.RFC3339, image.GetCreationTimestamp())
if err != nil {
logrus.Errorf("Unable to parse image %s(%d)'s creation timestamp: %v", image.GetName(), image.Id, err)
log.Printf("Unable to parse image %s(%d)'s creation timestamp: %v", image.GetName(), image.Id, err)
continue
}
@ -43,12 +43,12 @@ func GCPCleanup(creds []byte, maxConcurrentRequests int, dryRun bool, cutoff tim
}
if dryRun {
logrus.Infof("Dry run, gcp image %s(%d), with creation date %v would be removed", image.GetName(), image.Id, created)
log.Printf("Dry run, gcp image %s(%d), with creation date %v would be removed", image.GetName(), image.Id, created)
continue
}
if err = sem.Acquire(context.Background(), 1); err != nil {
logrus.Errorf("Error acquiring semaphore: %v", err)
log.Printf("Error acquiring semaphore: %v", err)
continue
}
wg.Add(1)
@ -59,7 +59,7 @@ func GCPCleanup(creds []byte, maxConcurrentRequests int, dryRun bool, cutoff tim
err = g.ComputeImageDelete(context.Background(), image.GetName())
if err != nil {
logrus.Errorf("Error deleting image %s created at %v: %v", image.GetName(), created, err)
log.Printf("Error deleting image %s created at %v: %v", image.GetName(), created, err)
}
}(fmt.Sprintf("%d", image.Id))
}

View file

@ -3,18 +3,15 @@ package main
import (
"encoding/json"
"fmt"
"log"
"sync"
"time"
"github.com/sirupsen/logrus"
)
func main() {
logrus.SetReportCaller(true)
// 14 days
cutoff := time.Now().Add(-(time.Hour * 24 * 14))
logrus.Infof("Cutoff date: %v", cutoff)
log.Printf("Cutoff date: %v", cutoff)
conf := Config{
MaxConcurrentRequests: 20,
@ -24,15 +21,15 @@ func main() {
}
err := LoadConfigFromEnv(&conf)
if err != nil {
logrus.Fatal(err)
log.Fatal(err)
}
if conf.DryRun {
logrus.Info("Dry run, no state will be changed")
log.Println("Dry run, no state will be changed")
}
if conf.MaxConcurrentRequests == 0 {
logrus.Fatal("Max concurrent requests is 0")
log.Fatal("Max concurrent requests is 0")
}
var wg sync.WaitGroup
@ -40,14 +37,14 @@ func main() {
go func() {
defer wg.Done()
if !conf.EnableAWSMaintenance {
logrus.Info("AWS maintenance not enabled, skipping")
log.Println("AWS maintenance not enabled, skipping")
return
}
logrus.Info("Cleaning up AWS")
log.Println("Cleaning up AWS")
err := AWSCleanup(conf.MaxConcurrentRequests, conf.DryRun, conf.AWSAccessKeyID, conf.AWSSecretAccessKey, cutoff)
if err != nil {
logrus.Errorf("AWS cleanup failed: %v", err)
log.Printf("AWS cleanup failed: %v", err)
}
}()
@ -55,40 +52,40 @@ func main() {
go func() {
defer wg.Done()
if !conf.EnableGCPMaintenance {
logrus.Info("GCP maintenance not enabled, skipping")
log.Println("GCP maintenance not enabled, skipping")
return
}
logrus.Info("Cleaning up GCP")
log.Println("Cleaning up GCP")
var gcpConf GCPCredentialsConfig
err := LoadConfigFromEnv(&gcpConf)
if err != nil {
logrus.Error("Unable to load GCP config from environment")
log.Println("Unable to load GCP config from environment")
return
}
if !gcpConf.valid() {
logrus.Error("GCP credentials invalid, fields missing")
log.Println("GCP credentials invalid, fields missing")
return
}
creds, err := json.Marshal(&gcpConf)
if err != nil {
logrus.Errorf("Unable to marshal gcp conf: %v", err)
log.Printf("Unable to marshal gcp conf: %v", err)
return
}
err = GCPCleanup(creds, conf.MaxConcurrentRequests, conf.DryRun, cutoff)
if err != nil {
logrus.Errorf("GCP Cleanup failed: %v", err)
log.Printf("GCP Cleanup failed: %v", err)
}
}()
wg.Wait()
logrus.Info("🦀🦀🦀 cloud cleanup done 🦀🦀🦀")
log.Println("🦀🦀🦀 cloud cleanup done 🦀🦀🦀")
if !conf.EnableDBMaintenance {
logrus.Info("🦀🦀🦀 DB maintenance not enabled, skipping 🦀🦀🦀")
log.Println("🦀🦀🦀 DB maintenance not enabled, skipping 🦀🦀🦀")
return
}
dbURL := fmt.Sprintf("postgres://%s:%s@%s:%s/%s?sslmode=%s",
@ -101,7 +98,7 @@ func main() {
)
err = DBCleanup(dbURL, conf.DryRun, cutoff)
if err != nil {
logrus.Fatalf("Error during DBCleanup: %v", err)
log.Fatalf("Error during DBCleanup: %v", err)
}
logrus.Info("🦀🦀🦀 dbqueue cleanup done 🦀🦀🦀")
log.Println("🦀🦀🦀 dbqueue cleanup done 🦀🦀🦀")
}