debian-forge-composer/internal/monitoring/system_monitor.go
package monitoring

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/sirupsen/logrus"
)
type SystemMonitor struct {
logger *logrus.Logger
config *MonitoringConfig
metrics *MetricsCollector
healthChecks map[string]HealthCheck
alerts *AlertManager
storage *MetricsStorage
running bool
mu sync.RWMutex
}
type MonitoringConfig struct {
Enabled bool `json:"enabled"`
Interval time.Duration `json:"interval"`
MetricsPath string `json:"metrics_path"`
AlertPath string `json:"alert_path"`
RetentionDays int `json:"retention_days"`
Thresholds map[string]float64 `json:"thresholds"`
Metadata map[string]string `json:"metadata"`
}
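// For illustration, a MonitoringConfig of this shape would be produced by JSON such
// as the following (the paths and values here are assumptions, not shipped defaults):
//
//	{
//	  "enabled": true,
//	  "interval": 30000000000,
//	  "metrics_path": "/var/lib/debian-forge/metrics",
//	  "alert_path": "/var/lib/debian-forge/alerts",
//	  "retention_days": 7,
//	  "thresholds": {"cpu_usage": 80, "memory_usage": 85, "disk_usage": 90},
//	  "metadata": {"environment": "staging"}
//	}
//
// Note that with encoding/json a time.Duration encodes as integer nanoseconds,
// so 30000000000 above means 30 seconds.
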
type MetricsCollector struct {
metrics map[string]Metric
mu sync.RWMutex
}
type Metric struct {
Name string `json:"name"`
Value float64 `json:"value"`
Unit string `json:"unit"`
Type string `json:"type"`
Timestamp time.Time `json:"timestamp"`
Labels map[string]string `json:"labels"`
Metadata map[string]interface{} `json:"metadata"`
}
type HealthCheck struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Type string `json:"type"`
Command string `json:"command"`
Args []string `json:"args"`
Interval time.Duration `json:"interval"`
Timeout time.Duration `json:"timeout"`
Threshold float64 `json:"threshold"`
Enabled bool `json:"enabled"`
Metadata map[string]interface{} `json:"metadata"`
}
type HealthCheckResult struct {
ID string `json:"id"`
Name string `json:"name"`
Status string `json:"status"`
Message string `json:"message"`
Value float64 `json:"value"`
Threshold float64 `json:"threshold"`
Timestamp time.Time `json:"timestamp"`
Duration time.Duration `json:"duration"`
Error string `json:"error,omitempty"`
Metadata map[string]interface{} `json:"metadata"`
}
type AlertManager struct {
alerts map[string]Alert
channels map[string]AlertChannel
mu sync.RWMutex
}
type Alert struct {
ID string `json:"id"`
Level string `json:"level"`
Message string `json:"message"`
Source string `json:"source"`
Timestamp time.Time `json:"timestamp"`
Acknowledged bool `json:"acknowledged"`
Metadata map[string]interface{} `json:"metadata"`
}
type AlertChannel struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Config map[string]interface{} `json:"config"`
Enabled bool `json:"enabled"`
Metadata map[string]interface{} `json:"metadata"`
}
type MetricsStorage struct {
path string
retention time.Duration
mu sync.RWMutex
}
type SystemStatus struct {
Timestamp time.Time `json:"timestamp"`
Uptime time.Duration `json:"uptime"`
LoadAverage []float64 `json:"load_average"`
CPUUsage float64 `json:"cpu_usage"`
MemoryUsage float64 `json:"memory_usage"`
DiskUsage float64 `json:"disk_usage"`
NetworkIO NetworkStats `json:"network_io"`
ProcessCount int `json:"process_count"`
HealthStatus map[string]string `json:"health_status"`
Alerts []Alert `json:"alerts"`
Metadata map[string]interface{} `json:"metadata"`
}
type NetworkStats struct {
BytesReceived uint64 `json:"bytes_received"`
BytesSent uint64 `json:"bytes_sent"`
PacketsReceived uint64 `json:"packets_received"`
PacketsSent uint64 `json:"packets_sent"`
ErrorsReceived uint64 `json:"errors_received"`
ErrorsSent uint64 `json:"errors_sent"`
}
func NewSystemMonitor(config *MonitoringConfig, logger *logrus.Logger) *SystemMonitor {
monitor := &SystemMonitor{
logger: logger,
config: config,
metrics: NewMetricsCollector(),
healthChecks: make(map[string]HealthCheck),
alerts: NewAlertManager(),
storage: NewMetricsStorage(config.MetricsPath, time.Duration(config.RetentionDays)*24*time.Hour),
running: false,
}
// Initialize health checks
monitor.initializeHealthChecks()
// Initialize alert channels
monitor.initializeAlertChannels()
return monitor
}
func NewMetricsCollector() *MetricsCollector {
return &MetricsCollector{
metrics: make(map[string]Metric),
}
}
func NewAlertManager() *AlertManager {
return &AlertManager{
alerts: make(map[string]Alert),
channels: make(map[string]AlertChannel),
}
}
func NewMetricsStorage(path string, retention time.Duration) *MetricsStorage {
return &MetricsStorage{
path: path,
retention: retention,
}
}
func (sm *SystemMonitor) initializeHealthChecks() {
// System resource health checks
sm.healthChecks["cpu_usage"] = HealthCheck{
ID: "cpu_usage",
Name: "CPU Usage",
Description: "Monitor CPU usage percentage",
Type: "resource",
Command: "top",
Args: []string{"-bn1", "-p", "1"},
Interval: 30 * time.Second,
Timeout: 10 * time.Second,
Threshold: 80.0,
Enabled: true,
}
sm.healthChecks["memory_usage"] = HealthCheck{
ID: "memory_usage",
Name: "Memory Usage",
Description: "Monitor memory usage percentage",
Type: "resource",
Command: "free",
Args: []string{"-m"},
Interval: 30 * time.Second,
Timeout: 10 * time.Second,
Threshold: 85.0,
Enabled: true,
}
sm.healthChecks["disk_usage"] = HealthCheck{
ID: "disk_usage",
Name: "Disk Usage",
Description: "Monitor disk usage percentage",
Type: "resource",
Command: "df",
Args: []string{"-h", "/"},
Interval: 60 * time.Second,
Timeout: 10 * time.Second,
Threshold: 90.0,
Enabled: true,
}
sm.healthChecks["load_average"] = HealthCheck{
ID: "load_average",
Name: "Load Average",
Description: "Monitor system load average",
Type: "resource",
Command: "uptime",
Args: []string{},
Interval: 30 * time.Second,
Timeout: 10 * time.Second,
Threshold: 5.0,
Enabled: true,
}
// Service health checks
sm.healthChecks["debian_forge_service"] = HealthCheck{
ID: "debian_forge_service",
Name: "Debian Forge Service",
Description: "Check if Debian Forge service is running",
Type: "service",
Command: "systemctl",
Args: []string{"is-active", "debian-forge"},
Interval: 60 * time.Second,
Timeout: 10 * time.Second,
Threshold: 0.0,
Enabled: true,
}
sm.healthChecks["database_connection"] = HealthCheck{
ID: "database_connection",
Name: "Database Connection",
Description: "Check database connectivity",
Type: "service",
Command: "pg_isready",
Args: []string{"-h", "localhost"},
Interval: 60 * time.Second,
Timeout: 10 * time.Second,
Threshold: 0.0,
Enabled: true,
}
// Network health checks
sm.healthChecks["network_connectivity"] = HealthCheck{
ID: "network_connectivity",
Name: "Network Connectivity",
Description: "Check basic network connectivity",
Type: "network",
Command: "ping",
Args: []string{"-c", "1", "8.8.8.8"},
Interval: 60 * time.Second,
Timeout: 10 * time.Second,
Threshold: 0.0,
Enabled: true,
}
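// Further checks could be registered here before Start is called. For example, a
// hypothetical service check for a build worker (the unit name is illustrative,
// not something this package defines):
//
//	sm.healthChecks["build_worker"] = HealthCheck{
//		ID:          "build_worker",
//		Name:        "Build Worker",
//		Description: "Check if the build worker service is running",
//		Type:        "service",
//		Command:     "systemctl",
//		Args:        []string{"is-active", "debian-forge-worker"},
//		Interval:    60 * time.Second,
//		Timeout:     10 * time.Second,
//		Enabled:     false,
//	}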
}
func (sm *SystemMonitor) initializeAlertChannels() {
// Email alert channel
sm.alerts.channels["email"] = AlertChannel{
ID: "email",
Name: "Email Alerts",
Type: "email",
Enabled: true,
Config: map[string]interface{}{
"smtp_server": "localhost",
"smtp_port": 25,
"from": "alerts@debian-forge.local",
"to": []string{"admin@debian-forge.local"},
},
}
// Slack alert channel
sm.alerts.channels["slack"] = AlertChannel{
ID: "slack",
Name: "Slack Alerts",
Type: "slack",
Enabled: false,
Config: map[string]interface{}{
"webhook_url": "",
"channel": "#alerts",
},
}
// Webhook alert channel
sm.alerts.channels["webhook"] = AlertChannel{
ID: "webhook",
Name: "Webhook Alerts",
Type: "webhook",
Enabled: false,
Config: map[string]interface{}{
"url": "",
"method": "POST",
"headers": map[string]string{},
},
}
}
func (sm *SystemMonitor) Start() error {
sm.mu.Lock()
defer sm.mu.Unlock()
if sm.running {
return fmt.Errorf("monitor is already running")
}
sm.running = true
sm.logger.Info("Starting system monitor")
// Start monitoring goroutine
go sm.monitoringLoop()
// Start health check goroutines
for _, check := range sm.healthChecks {
if check.Enabled {
go sm.healthCheckLoop(check)
}
}
return nil
}
func (sm *SystemMonitor) Stop() error {
sm.mu.Lock()
defer sm.mu.Unlock()
if !sm.running {
return fmt.Errorf("monitor is not running")
}
sm.running = false
sm.logger.Info("Stopping system monitor")
return nil
}
func (sm *SystemMonitor) monitoringLoop() {
ticker := time.NewTicker(sm.config.Interval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if !sm.isRunning() {
return
}
// Collect system metrics
if err := sm.collectSystemMetrics(); err != nil {
sm.logger.Errorf("Failed to collect system metrics: %v", err)
}
// Check thresholds and generate alerts
sm.checkThresholds()
// Store metrics
if err := sm.storeMetrics(); err != nil {
sm.logger.Errorf("Failed to store metrics: %v", err)
}
}
}
}
func (sm *SystemMonitor) healthCheckLoop(check HealthCheck) {
ticker := time.NewTicker(check.Interval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if !sm.isRunning() {
return
}
// Run health check
result := sm.runHealthCheck(check)
// Process result
sm.processHealthCheckResult(result)
}
}
}
func (sm *SystemMonitor) isRunning() bool {
sm.mu.RLock()
defer sm.mu.RUnlock()
return sm.running
}
func (sm *SystemMonitor) collectSystemMetrics() error {
// Collect CPU metrics
if err := sm.collectCPUMetrics(); err != nil {
sm.logger.Warnf("Failed to collect CPU metrics: %v", err)
}
// Collect memory metrics
if err := sm.collectMemoryMetrics(); err != nil {
sm.logger.Warnf("Failed to collect memory metrics: %v", err)
}
// Collect disk metrics
if err := sm.collectDiskMetrics(); err != nil {
sm.logger.Warnf("Failed to collect disk metrics: %v", err)
}
// Collect network metrics
if err := sm.collectNetworkMetrics(); err != nil {
sm.logger.Warnf("Failed to collect network metrics: %v", err)
}
// Collect process metrics
if err := sm.collectProcessMetrics(); err != nil {
sm.logger.Warnf("Failed to collect process metrics: %v", err)
}
return nil
}
func (sm *SystemMonitor) collectCPUMetrics() error {
// Read /proc/loadavg for load average
data, err := os.ReadFile("/proc/loadavg")
if err != nil {
return fmt.Errorf("failed to read loadavg: %w", err)
}
fields := strings.Fields(string(data))
if len(fields) >= 3 {
if load1, err := strconv.ParseFloat(fields[0], 64); err == nil {
sm.metrics.setMetric("load_1min", load1, "", "gauge", map[string]string{"type": "load_average"})
}
if load5, err := strconv.ParseFloat(fields[1], 64); err == nil {
sm.metrics.setMetric("load_5min", load5, "", "gauge", map[string]string{"type": "load_average"})
}
if load15, err := strconv.ParseFloat(fields[2], 64); err == nil {
sm.metrics.setMetric("load_15min", load15, "", "gauge", map[string]string{"type": "load_average"})
}
}
// Read /proc/stat for CPU usage
data, err = os.ReadFile("/proc/stat")
if err != nil {
return fmt.Errorf("failed to read stat: %w", err)
}
lines := strings.Split(string(data), "\n")
for _, line := range lines {
if strings.HasPrefix(line, "cpu ") {
fields := strings.Fields(line)
if len(fields) >= 5 {
// Calculate CPU usage percentage from the cumulative /proc/stat counters.
// Because these values are totals since boot, this yields average usage since
// boot; the delta between two samples would give instantaneous usage.
total := 0.0
idle := 0.0
for i := 1; i < len(fields); i++ {
if val, err := strconv.ParseFloat(fields[i], 64); err == nil {
total += val
if i == 4 { // idle time
idle = val
}
}
}
if total > 0 {
usage := ((total - idle) / total) * 100.0
sm.metrics.setMetric("cpu_usage", usage, "%", "gauge", map[string]string{"type": "cpu"})
}
}
break
}
}
return nil
}
func (sm *SystemMonitor) collectMemoryMetrics() error {
// Read /proc/meminfo for memory usage
data, err := os.ReadFile("/proc/meminfo")
if err != nil {
return fmt.Errorf("failed to read meminfo: %w", err)
}
var total, available uint64
lines := strings.Split(string(data), "\n")
for _, line := range lines {
fields := strings.Fields(line)
if len(fields) >= 2 {
switch fields[0] {
case "MemTotal:":
if val, err := strconv.ParseUint(fields[1], 10, 64); err == nil {
total = val
}
case "MemAvailable:":
if val, err := strconv.ParseUint(fields[1], 10, 64); err == nil {
available = val
}
}
}
}
if total > 0 {
usage := float64(total-available) / float64(total) * 100.0
sm.metrics.setMetric("memory_usage", usage, "%", "gauge", map[string]string{"type": "memory"})
sm.metrics.setMetric("memory_total", float64(total), "KB", "gauge", map[string]string{"type": "memory"})
sm.metrics.setMetric("memory_available", float64(available), "KB", "gauge", map[string]string{"type": "memory"})
}
return nil
}
func (sm *SystemMonitor) collectDiskMetrics() error {
// Use df command to get disk usage
cmd := exec.Command("df", "-h", "/")
output, err := cmd.Output()
if err != nil {
return fmt.Errorf("df command failed: %w", err)
}
lines := strings.Split(string(output), "\n")
if len(lines) >= 2 {
fields := strings.Fields(lines[1])
if len(fields) >= 5 {
usageStr := strings.TrimSuffix(fields[4], "%")
if usage, err := strconv.ParseFloat(usageStr, 64); err == nil {
sm.metrics.setMetric("disk_usage", usage, "%", "gauge", map[string]string{"type": "disk", "mount": "/"})
}
}
}
return nil
}
func (sm *SystemMonitor) collectNetworkMetrics() error {
// Read /proc/net/dev for network statistics
data, err := os.ReadFile("/proc/net/dev")
if err != nil {
return fmt.Errorf("failed to read net/dev: %w", err)
}
lines := strings.Split(string(data), "\n")
for _, line := range lines {
// Each data line is "<iface>: <16 counters>". Split on the colon explicitly,
// because with large counters the first value can abut the colon with no space.
idx := strings.Index(line, ":")
if idx < 0 {
continue
}
interfaceName := strings.TrimSpace(line[:idx])
if interfaceName == "lo" {
continue
}
fields := strings.Fields(line[idx+1:])
if len(fields) >= 16 {
if bytesReceived, err := strconv.ParseUint(fields[0], 10, 64); err == nil {
sm.metrics.setMetric("network_bytes_received", float64(bytesReceived), "bytes", "counter",
map[string]string{"interface": interfaceName})
}
if bytesSent, err := strconv.ParseUint(fields[8], 10, 64); err == nil {
sm.metrics.setMetric("network_bytes_sent", float64(bytesSent), "bytes", "counter",
map[string]string{"interface": interfaceName})
}
}
}
return nil
}
func (sm *SystemMonitor) collectProcessMetrics() error {
// Count running processes
cmd := exec.Command("ps", "-e", "--no-headers")
output, err := cmd.Output()
if err != nil {
return fmt.Errorf("ps command failed: %w", err)
}
lines := strings.Split(string(output), "\n")
processCount := len(lines)
if len(lines) > 0 && lines[len(lines)-1] == "" {
processCount--
}
sm.metrics.setMetric("process_count", float64(processCount), "count", "gauge", map[string]string{"type": "process"})
return nil
}
func (sm *SystemMonitor) runHealthCheck(check HealthCheck) HealthCheckResult {
result := HealthCheckResult{
ID: check.ID,
Name: check.Name,
Status: "unknown",
Message: "Health check completed",
Timestamp: time.Now(),
Threshold: check.Threshold,
Metadata: make(map[string]interface{}),
}
startTime := time.Now()
// Run the health check command with a timeout. exec.Cmd has no Timeout field,
// so bound the run with a context that is cancelled after check.Timeout.
ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
defer cancel()
cmd := exec.CommandContext(ctx, check.Command, check.Args...)
output, err := cmd.Output()
result.Duration = time.Since(startTime)
if err != nil {
result.Status = "failed"
result.Message = "Health check command failed"
result.Error = err.Error()
return result
}
// Parse output based on check type. The parse helpers build a fresh result,
// so carry over the measured duration afterwards.
duration := result.Duration
switch check.Type {
case "resource":
result = sm.parseResourceCheck(check, string(output))
case "service":
result = sm.parseServiceCheck(check, string(output))
case "network":
result = sm.parseNetworkCheck(check, string(output))
default:
result.Status = "unknown"
result.Message = "Unknown check type"
}
result.Duration = duration
return result
}
func (sm *SystemMonitor) parseResourceCheck(check HealthCheck, output string) HealthCheckResult {
result := HealthCheckResult{
ID: check.ID,
Name: check.Name,
Status: "unknown",
Message: "Resource check completed",
Timestamp: time.Now(),
Threshold: check.Threshold,
Metadata: make(map[string]interface{}),
}
// Parse output based on command
switch check.Command {
case "top":
// Parse top output for CPU usage
lines := strings.Split(output, "\n")
for _, line := range lines {
if strings.Contains(line, "Cpu(s):") {
fields := strings.Fields(line)
for i, field := range fields {
if strings.Contains(field, "%us") {
if i > 0 {
if usage, err := strconv.ParseFloat(fields[i-1], 64); err == nil {
result.Value = usage
if usage > check.Threshold {
result.Status = "critical"
result.Message = fmt.Sprintf("CPU usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold)
} else {
result.Status = "healthy"
result.Message = fmt.Sprintf("CPU usage %.1f%% is within normal range", usage)
}
}
}
break
}
}
}
}
case "free":
// Parse free output for memory usage
lines := strings.Split(output, "\n")
if len(lines) >= 2 {
fields := strings.Fields(lines[1])
if len(fields) >= 3 {
if total, err := strconv.ParseFloat(fields[1], 64); err == nil {
if used, err := strconv.ParseFloat(fields[2], 64); err == nil {
usage := (used / total) * 100.0
result.Value = usage
if usage > check.Threshold {
result.Status = "critical"
result.Message = fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold)
} else {
result.Status = "healthy"
result.Message = fmt.Sprintf("Memory usage %.1f%% is within normal range", usage)
}
}
}
}
}
case "df":
// Parse df output for disk usage
lines := strings.Split(output, "\n")
if len(lines) >= 2 {
fields := strings.Fields(lines[1])
if len(fields) >= 5 {
usageStr := strings.TrimSuffix(fields[4], "%")
if usage, err := strconv.ParseFloat(usageStr, 64); err == nil {
result.Value = usage
if usage > check.Threshold {
result.Status = "critical"
result.Message = fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold)
} else {
result.Status = "healthy"
result.Message = fmt.Sprintf("Disk usage %.1f%% is within normal range", usage)
}
}
}
}
case "uptime":
// Parse uptime output for load average
fields := strings.Fields(output)
if len(fields) >= 10 {
loadStr := strings.TrimSuffix(fields[9], ",")
if load, err := strconv.ParseFloat(loadStr, 64); err == nil {
result.Value = load
if load > check.Threshold {
result.Status = "critical"
result.Message = fmt.Sprintf("Load average %.2f exceeds threshold %.2f", load, check.Threshold)
} else {
result.Status = "healthy"
result.Message = fmt.Sprintf("Load average %.2f is within normal range", load)
}
}
}
}
return result
}
func (sm *SystemMonitor) parseServiceCheck(check HealthCheck, output string) HealthCheckResult {
result := HealthCheckResult{
ID: check.ID,
Name: check.Name,
Status: "unknown",
Message: "Service check completed",
Timestamp: time.Now(),
Threshold: check.Threshold,
Metadata: make(map[string]interface{}),
}
output = strings.TrimSpace(output)
switch output {
case "active":
result.Status = "healthy"
result.Message = "Service is running"
result.Value = 1.0
case "inactive":
result.Status = "critical"
result.Message = "Service is not running"
result.Value = 0.0
default:
result.Status = "unknown"
result.Message = fmt.Sprintf("Service status: %s", output)
}
return result
}
func (sm *SystemMonitor) parseNetworkCheck(check HealthCheck, output string) HealthCheckResult {
result := HealthCheckResult{
ID: check.ID,
Name: check.Name,
Status: "unknown",
Message: "Network check completed",
Timestamp: time.Now(),
Threshold: check.Threshold,
Metadata: make(map[string]interface{}),
}
// Check if ping was successful
if strings.Contains(output, "1 received") {
result.Status = "healthy"
result.Message = "Network connectivity is working"
result.Value = 1.0
} else {
result.Status = "critical"
result.Message = "Network connectivity failed"
result.Value = 0.0
}
return result
}
func (sm *SystemMonitor) processHealthCheckResult(result HealthCheckResult) {
// Store health check result
sm.metrics.setMetric(fmt.Sprintf("health_check_%s", result.ID),
float64(sm.healthStatusToValue(result.Status)), "status", "gauge",
map[string]string{"check": result.ID, "status": result.Status})
// Generate alert if status is critical
if result.Status == "critical" {
alert := Alert{
ID: generateAlertID(),
Level: "critical",
Message: result.Message,
Source: fmt.Sprintf("health_check:%s", result.ID),
Timestamp: time.Now(),
Acknowledged: false,
Metadata: map[string]interface{}{
"check_id": result.ID,
"value": result.Value,
"threshold": result.Threshold,
},
}
sm.alerts.addAlert(alert)
}
}
func (sm *SystemMonitor) healthStatusToValue(status string) int {
switch status {
case "healthy":
return 1
case "warning":
return 2
case "critical":
return 3
default:
return 0
}
}
func (sm *SystemMonitor) checkThresholds() {
sm.metrics.mu.RLock()
defer sm.metrics.mu.RUnlock()
for name, metric := range sm.metrics.metrics {
if threshold, exists := sm.config.Thresholds[name]; exists {
if metric.Value > threshold {
alert := Alert{
ID: generateAlertID(),
Level: "warning",
Message: fmt.Sprintf("Metric %s (%.2f) exceeds threshold %.2f", name, metric.Value, threshold),
Source: fmt.Sprintf("metric:%s", name),
Timestamp: time.Now(),
Acknowledged: false,
Metadata: map[string]interface{}{
"metric_name": name,
"value": metric.Value,
"threshold": threshold,
},
}
sm.alerts.addAlert(alert)
}
}
}
}
func (sm *SystemMonitor) storeMetrics() error {
sm.metrics.mu.RLock()
metrics := make(map[string]Metric)
for k, v := range sm.metrics.metrics {
metrics[k] = v
}
sm.metrics.mu.RUnlock()
// Store metrics to file
data, err := json.MarshalIndent(metrics, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal metrics: %w", err)
}
// Create metrics directory if it doesn't exist
if err := os.MkdirAll(sm.config.MetricsPath, 0755); err != nil {
return fmt.Errorf("failed to create metrics directory: %w", err)
}
// Write metrics to file with timestamp
timestamp := time.Now().Format("2006-01-02_15-04-05")
filename := filepath.Join(sm.config.MetricsPath, fmt.Sprintf("metrics_%s.json", timestamp))
if err := os.WriteFile(filename, data, 0644); err != nil {
return fmt.Errorf("failed to write metrics file: %w", err)
}
// Clean up old metrics files
return sm.cleanupOldMetrics()
}
func (sm *SystemMonitor) cleanupOldMetrics() error {
files, err := os.ReadDir(sm.config.MetricsPath)
if err != nil {
return fmt.Errorf("failed to read metrics directory: %w", err)
}
cutoff := time.Now().Add(-sm.storage.retention)
for _, file := range files {
if file.IsDir() || !strings.HasPrefix(file.Name(), "metrics_") {
continue
}
// Extract timestamp from filename
parts := strings.Split(file.Name(), "_")
if len(parts) >= 3 {
timestampStr := strings.Join(parts[1:3], "_")
timestampStr = strings.TrimSuffix(timestampStr, ".json")
if fileTime, err := time.Parse("2006-01-02_15-04-05", timestampStr); err == nil {
if fileTime.Before(cutoff) {
filePath := filepath.Join(sm.config.MetricsPath, file.Name())
if err := os.Remove(filePath); err != nil {
sm.logger.Warnf("Failed to remove old metrics file %s: %v", filePath, err)
}
}
}
}
}
return nil
}
func (sm *SystemMonitor) GetSystemStatus() *SystemStatus {
status := &SystemStatus{
Timestamp: time.Now(),
HealthStatus: make(map[string]string),
Alerts: sm.alerts.getActiveAlerts(),
Metadata: make(map[string]interface{}),
}
// Get uptime
if uptime, err := sm.getUptime(); err == nil {
status.Uptime = uptime
}
// Get load average
if loadAvg, err := sm.getLoadAverage(); err == nil {
status.LoadAverage = loadAvg
}
// Get current metrics
sm.metrics.mu.RLock()
if cpuMetric, exists := sm.metrics.metrics["cpu_usage"]; exists {
status.CPUUsage = cpuMetric.Value
}
if memMetric, exists := sm.metrics.metrics["memory_usage"]; exists {
status.MemoryUsage = memMetric.Value
}
if diskMetric, exists := sm.metrics.metrics["disk_usage"]; exists {
status.DiskUsage = diskMetric.Value
}
if procMetric, exists := sm.metrics.metrics["process_count"]; exists {
status.ProcessCount = int(procMetric.Value)
}
// Get health status for each check while still holding the metrics read lock,
// since this loop also reads the metrics map directly.
for id, check := range sm.healthChecks {
if check.Enabled {
// Get latest health check result
if metric, exists := sm.metrics.metrics[fmt.Sprintf("health_check_%s", id)]; exists {
status.HealthStatus[id] = sm.valueToHealthStatus(int(metric.Value))
} else {
status.HealthStatus[id] = "unknown"
}
}
}
sm.metrics.mu.RUnlock()
return status
}
func (sm *SystemMonitor) getUptime() (time.Duration, error) {
data, err := os.ReadFile("/proc/uptime")
if err != nil {
return 0, fmt.Errorf("failed to read uptime: %w", err)
}
fields := strings.Fields(string(data))
if len(fields) >= 1 {
if seconds, err := strconv.ParseFloat(fields[0], 64); err == nil {
return time.Duration(seconds) * time.Second, nil
}
}
return 0, fmt.Errorf("failed to parse uptime")
}
func (sm *SystemMonitor) getLoadAverage() ([]float64, error) {
data, err := os.ReadFile("/proc/loadavg")
if err != nil {
return nil, fmt.Errorf("failed to read loadavg: %w", err)
}
fields := strings.Fields(string(data))
if len(fields) >= 3 {
loads := make([]float64, 3)
for i := 0; i < 3; i++ {
if load, err := strconv.ParseFloat(fields[i], 64); err == nil {
loads[i] = load
}
}
return loads, nil
}
return nil, fmt.Errorf("failed to parse load average")
}
func (sm *SystemMonitor) valueToHealthStatus(value int) string {
switch value {
case 1:
return "healthy"
case 2:
return "warning"
case 3:
return "critical"
default:
return "unknown"
}
}
// MetricsCollector methods
func (mc *MetricsCollector) setMetric(name string, value float64, unit string, metricType string, labels map[string]string) {
mc.mu.Lock()
defer mc.mu.Unlock()
mc.metrics[name] = Metric{
Name: name,
Value: value,
Unit: unit,
Type: metricType,
Timestamp: time.Now(),
Labels: labels,
Metadata: make(map[string]interface{}),
}
}
// AlertManager methods
func (am *AlertManager) addAlert(alert Alert) {
am.mu.Lock()
defer am.mu.Unlock()
am.alerts[alert.ID] = alert
// Send alert through channels
for _, channel := range am.channels {
if channel.Enabled {
go am.sendAlert(alert, channel)
}
}
}
func (am *AlertManager) sendAlert(alert Alert, channel AlertChannel) {
// Placeholder: actual delivery is not implemented yet. In production this would
// dispatch on channel.Type ("email", "slack", "webhook"); an illustrative webhook
// sketch follows below.
}
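// sendWebhookAlert is an illustrative sketch of delivery for the "webhook" channel
// type defined in initializeAlertChannels: it POSTs the alert as JSON to the
// channel's configured "url". For brevity it ignores the "method" and "headers"
// settings, and it is not yet wired into sendAlert above.
func (am *AlertManager) sendWebhookAlert(alert Alert, channel AlertChannel) error {
	url, ok := channel.Config["url"].(string)
	if !ok || url == "" {
		return fmt.Errorf("webhook channel %s has no url configured", channel.ID)
	}
	payload, err := json.Marshal(alert)
	if err != nil {
		return fmt.Errorf("failed to marshal alert: %w", err)
	}
	// Bound the request so a slow endpoint cannot block the alerting goroutine.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Post(url, "application/json", bytes.NewReader(payload))
	if err != nil {
		return fmt.Errorf("failed to deliver alert to %s: %w", url, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return fmt.Errorf("webhook %s returned status %d", url, resp.StatusCode)
	}
	return nil
}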
func (am *AlertManager) getActiveAlerts() []Alert {
am.mu.RLock()
defer am.mu.RUnlock()
var activeAlerts []Alert
for _, alert := range am.alerts {
if !alert.Acknowledged {
activeAlerts = append(activeAlerts, alert)
}
}
return activeAlerts
}
// Helper functions
func generateAlertID() string {
return fmt.Sprintf("alert-%d", time.Now().UnixNano())
}
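// Example wiring (an illustrative sketch, not the composer's actual start-up code;
// the metrics path and threshold values below are assumptions):
//
//	cfg := &MonitoringConfig{
//		Enabled:       true,
//		Interval:      30 * time.Second,
//		MetricsPath:   "/var/lib/debian-forge/metrics",
//		RetentionDays: 7,
//		Thresholds:    map[string]float64{"cpu_usage": 80, "memory_usage": 85},
//	}
//	monitor := NewSystemMonitor(cfg, logrus.New())
//	if err := monitor.Start(); err != nil {
//		logrus.Fatalf("failed to start system monitor: %v", err)
//	}
//	defer monitor.Stop()
//	status := monitor.GetSystemStatus()
//	logrus.Infof("CPU %.1f%%, memory %.1f%%, %d active alerts",
//		status.CPUUsage, status.MemoryUsage, len(status.Alerts))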