// deb-bootc-compose/internal/monitoring/checks.go
// 2025-08-18 23:32:51 -07:00
//
// 274 lines
// 6.3 KiB
// Go

package monitoring
import (
"fmt"
"os"
"path/filepath"
"runtime"
"github.com/sirupsen/logrus"
)
// SystemHealthCheck is the health check registered under the name "system".
// Its Check method derives a status from Go runtime memory statistics.
type SystemHealthCheck struct {
// logger is set to a fresh logrus logger by NewSystemHealthCheck.
logger *logrus.Logger
}
// NewSystemHealthCheck returns a SystemHealthCheck equipped with its own
// newly created logrus logger.
func NewSystemHealthCheck() *SystemHealthCheck {
	check := new(SystemHealthCheck)
	check.logger = logrus.New()
	return check
}
// Name returns the identifier of this check, "system".
func (s *SystemHealthCheck) Name() string {
	const checkName = "system"
	return checkName
}
// IsCritical reports that the system check is a critical one; it always
// returns true.
func (s *SystemHealthCheck) IsCritical() bool {
	const critical = true
	return critical
}
// Check samples the Go runtime memory statistics and grades the process.
//
// The usage figure is MemStats.Alloc expressed as a percentage of
// MemStats.Sys. Above 90% the status is unhealthy, above 75% it is degraded,
// otherwise healthy. The raw counters, the percentage and the current
// goroutine count are attached as details. The returned error is always nil.
func (s *SystemHealthCheck) Check() (*HealthCheck, error) {
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)

	// Share of the memory obtained from the OS that is currently allocated.
	usagePct := float64(stats.Alloc) / float64(stats.Sys) * 100

	status := HealthStatusHealthy
	switch {
	case usagePct > 90:
		status = HealthStatusUnhealthy
	case usagePct > 75:
		status = HealthStatusDegraded
	}

	info := map[string]interface{}{
		"memory_alloc":     stats.Alloc,
		"memory_sys":       stats.Sys,
		"memory_usage_pct": usagePct,
		"goroutines":       runtime.NumGoroutine(),
	}

	// Memory is the only signal considered, so it is the overall status too.
	result := &HealthCheck{
		Name:     s.Name(),
		Status:   status,
		Message:  fmt.Sprintf("System health: memory %.1f%%", usagePct),
		Details:  info,
		Critical: s.IsCritical(),
	}
	return result, nil
}
// PackageManagerHealthCheck is the health check registered under the name
// "package_manager". Its Check method inspects the package cache directory.
type PackageManagerHealthCheck struct {
// cacheDir is the path of the cache directory to inspect; an empty value
// means no directory has been configured.
cacheDir string
// logger is set to a fresh logrus logger by NewPackageManagerHealthCheck.
logger *logrus.Logger
}
// NewPackageManagerHealthCheck returns a PackageManagerHealthCheck that
// inspects cacheDir and owns a newly created logrus logger.
func NewPackageManagerHealthCheck(cacheDir string) *PackageManagerHealthCheck {
	check := new(PackageManagerHealthCheck)
	check.cacheDir = cacheDir
	check.logger = logrus.New()
	return check
}
// Name returns the identifier of this check, "package_manager".
func (p *PackageManagerHealthCheck) Name() string {
	const checkName = "package_manager"
	return checkName
}
// IsCritical reports that the package manager check is not a critical one;
// it always returns false.
func (p *PackageManagerHealthCheck) IsCritical() bool {
	const critical = false
	return critical
}
// Check verifies that the package manager cache directory is usable.
//
// The returned error is always nil; problems are expressed through the
// status of the returned HealthCheck:
//   - unhealthy: no cache directory is configured, the path cannot be
//     inspected, the path is not a directory, or a probe file cannot be
//     written inside it;
//   - degraded: the directory does not exist, or its size cannot be computed;
//   - healthy: the directory exists, is writable and its size was measured.
//
// Failure results that stem from an underlying error carry that error's text
// in Details under the "error" key.
func (p *PackageManagerHealthCheck) Check() (*HealthCheck, error) {
	if p.cacheDir == "" {
		return &HealthCheck{
			Name:     p.Name(),
			Status:   HealthStatusUnhealthy,
			Message:  "Cache directory not configured",
			Critical: p.IsCritical(),
		}, nil
	}

	// Distinguish "missing" from "cannot be inspected" and from "exists but
	// is not a directory" instead of letting the write probe fail later with
	// a misleading message.
	info, err := os.Stat(p.cacheDir)
	switch {
	case os.IsNotExist(err):
		return &HealthCheck{
			Name:     p.Name(),
			Status:   HealthStatusDegraded,
			Message:  "Cache directory does not exist",
			Critical: p.IsCritical(),
		}, nil
	case err != nil:
		return &HealthCheck{
			Name:    p.Name(),
			Status:  HealthStatusUnhealthy,
			Message: "Cache directory is not accessible",
			Details: map[string]interface{}{
				"cache_dir": p.cacheDir,
				"error":     err.Error(),
			},
			Critical: p.IsCritical(),
		}, nil
	case !info.IsDir():
		return &HealthCheck{
			Name:    p.Name(),
			Status:  HealthStatusUnhealthy,
			Message: "Cache path is not a directory",
			Details: map[string]interface{}{
				"cache_dir": p.cacheDir,
			},
			Critical: p.IsCritical(),
		}, nil
	}

	// notWritable builds the result shared by every write-probe failure.
	notWritable := func(cause error) (*HealthCheck, error) {
		return &HealthCheck{
			Name:    p.Name(),
			Status:  HealthStatusUnhealthy,
			Message: "Cache directory is not writable",
			Details: map[string]interface{}{
				"cache_dir": p.cacheDir,
				"writable":  false,
				"error":     cause.Error(),
			},
			Critical: p.IsCritical(),
		}, nil
	}

	// Probe writability with a uniquely named file so that an existing file
	// is never overwritten and concurrent checks cannot step on each other.
	probe, err := os.CreateTemp(p.cacheDir, ".health_check_test_*")
	if err != nil {
		return notWritable(err)
	}
	_, writeErr := probe.Write([]byte("test"))
	closeErr := probe.Close()

	// Always attempt cleanup, even if the write failed. A leftover probe file
	// does not affect the verdict, so the failure is only logged. The nil
	// guard keeps a zero-value check (built without the constructor) safe.
	if rmErr := os.Remove(probe.Name()); rmErr != nil && p.logger != nil {
		p.logger.WithError(rmErr).
			WithField("path", probe.Name()).
			Warn("failed to remove cache health check probe file")
	}
	if writeErr != nil {
		return notWritable(writeErr)
	}
	if closeErr != nil {
		return notWritable(closeErr)
	}

	cacheSize, err := p.getCacheSize()
	if err != nil {
		return &HealthCheck{
			Name:    p.Name(),
			Status:  HealthStatusDegraded,
			Message: "Unable to determine cache size",
			Details: map[string]interface{}{
				"cache_dir": p.cacheDir,
				"writable":  true,
				"error":     err.Error(),
			},
			Critical: p.IsCritical(),
		}, nil
	}

	return &HealthCheck{
		Name:    p.Name(),
		Status:  HealthStatusHealthy,
		Message: "Package manager cache is healthy",
		Details: map[string]interface{}{
			"cache_dir":  p.cacheDir,
			"cache_size": cacheSize,
			"writable":   true,
		},
		Critical: p.IsCritical(),
	}, nil
}
// getCacheSize walks the cache directory tree and sums the sizes, in bytes,
// of every entry that is not a directory. The first error reported by the
// walk stops it and is returned together with the total accumulated so far.
func (p *PackageManagerHealthCheck) getCacheSize() (int64, error) {
	var total int64

	// accumulate adds the size of each non-directory entry to total.
	accumulate := func(_ string, entry os.FileInfo, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if entry.IsDir() {
			return nil
		}
		total += entry.Size()
		return nil
	}

	walkErr := filepath.Walk(p.cacheDir, accumulate)
	return total, walkErr
}
// OSTreeHealthCheck is the health check registered under the name "ostree".
// Its Check method looks for the ostree and apt-ostree binaries on disk.
type OSTreeHealthCheck struct {
// logger is set to a fresh logrus logger by NewOSTreeHealthCheck.
logger *logrus.Logger
}
// NewOSTreeHealthCheck returns an OSTreeHealthCheck equipped with its own
// newly created logrus logger.
func NewOSTreeHealthCheck() *OSTreeHealthCheck {
	check := new(OSTreeHealthCheck)
	check.logger = logrus.New()
	return check
}
// Name returns the identifier of this check, "ostree".
func (o *OSTreeHealthCheck) Name() string {
	const checkName = "ostree"
	return checkName
}
// IsCritical reports that the OSTree check is a critical one; it always
// returns true.
func (o *OSTreeHealthCheck) IsCritical() bool {
	const critical = true
	return critical
}
// Check reports whether the OSTree command-line tools are installed.
//
// Both tools are looked up at fixed paths and are judged by the same rule:
// the path must resolve to a regular file with at least one execute bit set.
//   - /usr/bin/ostree is required; without it the status is unhealthy.
//   - /usr/bin/apt-ostree is optional; without it the status is degraded.
//
// The returned error is always nil.
func (o *OSTreeHealthCheck) Check() (*HealthCheck, error) {
	// isExecutable treats every Stat failure (missing file, permission
	// denied, ...) as "not available": a binary that cannot be inspected
	// cannot be relied upon to run either.
	isExecutable := func(path string) bool {
		info, err := os.Stat(path)
		if err != nil {
			return false
		}
		mode := info.Mode()
		return mode.IsRegular() && mode.Perm()&0111 != 0
	}

	if !isExecutable("/usr/bin/ostree") {
		return &HealthCheck{
			Name:     o.Name(),
			Status:   HealthStatusUnhealthy,
			Message:  "OSTree command not found",
			Critical: o.IsCritical(),
		}, nil
	}

	aptOstreeAvailable := isExecutable("/usr/bin/apt-ostree")

	status := HealthStatusHealthy
	message := "OSTree tools are available (including apt-ostree)"
	if !aptOstreeAvailable {
		status = HealthStatusDegraded
		message = "OSTree tools are available (apt-ostree not available)"
	}

	return &HealthCheck{
		Name:    o.Name(),
		Status:  status,
		Message: message,
		Details: map[string]interface{}{
			"ostree_available":     true,
			"apt_ostree_available": aptOstreeAvailable,
		},
		Critical: o.IsCritical(),
	}, nil
}
// BuildSystemHealthCheck is the health check registered under the name
// "build_system". Its Check method reports on the configured orchestrator
// URL.
type BuildSystemHealthCheck struct {
// orchestratorURL is the orchestrator address; an empty value means it has
// not been configured.
orchestratorURL string
// logger is set to a fresh logrus logger by NewBuildSystemHealthCheck.
logger *logrus.Logger
}
// NewBuildSystemHealthCheck returns a BuildSystemHealthCheck for the given
// orchestrator URL, owning a newly created logrus logger.
func NewBuildSystemHealthCheck(orchestratorURL string) *BuildSystemHealthCheck {
	check := new(BuildSystemHealthCheck)
	check.orchestratorURL = orchestratorURL
	check.logger = logrus.New()
	return check
}
// Name returns the identifier of this check, "build_system".
func (b *BuildSystemHealthCheck) Name() string {
	const checkName = "build_system"
	return checkName
}
// IsCritical reports that the build system check is not a critical one; it
// always returns false.
func (b *BuildSystemHealthCheck) IsCritical() bool {
	const critical = false
	return critical
}
// Check reports on the build system orchestrator configuration.
//
// With no orchestrator URL the status is unknown. With a URL the status is
// healthy and the URL is echoed in the details. The returned error is always
// nil.
//
// NOTE: this is a placeholder. No request is sent to the orchestrator, so a
// healthy result only means that a URL has been configured; a real
// implementation would query the orchestrator's /health endpoint.
func (b *BuildSystemHealthCheck) Check() (*HealthCheck, error) {
	result := &HealthCheck{
		Name:     b.Name(),
		Critical: b.IsCritical(),
	}

	if endpoint := b.orchestratorURL; endpoint != "" {
		// Configured: assumed reachable, see the placeholder note above.
		result.Status = HealthStatusHealthy
		result.Message = "Build system is configured and accessible"
		result.Details = map[string]interface{}{
			"orchestrator_url": endpoint,
			"configured":       true,
		}
		return result, nil
	}

	result.Status = HealthStatusUnknown
	result.Message = "Orchestrator URL not configured"
	return result, nil
}