package monitoring import ( "encoding/json" "fmt" "os" "os/exec" "path/filepath" "runtime" "strconv" "strings" "sync" "time" "github.com/sirupsen/logrus" ) type SystemMonitor struct { logger *logrus.Logger config *MonitoringConfig metrics *MetricsCollector healthChecks map[string]HealthCheck alerts *AlertManager storage *MetricsStorage running bool mu sync.RWMutex } type MonitoringConfig struct { Enabled bool `json:"enabled"` Interval time.Duration `json:"interval"` MetricsPath string `json:"metrics_path"` AlertPath string `json:"alert_path"` RetentionDays int `json:"retention_days"` Thresholds map[string]float64 `json:"thresholds"` Metadata map[string]string `json:"metadata"` } type MetricsCollector struct { metrics map[string]Metric mu sync.RWMutex } type Metric struct { Name string `json:"name"` Value float64 `json:"value"` Unit string `json:"unit"` Type string `json:"type"` Timestamp time.Time `json:"timestamp"` Labels map[string]string `json:"labels"` Metadata map[string]interface{} `json:"metadata"` } type HealthCheck struct { ID string `json:"id"` Name string `json:"name"` Description string `json:"description"` Type string `json:"type"` Command string `json:"command"` Args []string `json:"args"` Interval time.Duration `json:"interval"` Timeout time.Duration `json:"timeout"` Threshold float64 `json:"threshold"` Enabled bool `json:"enabled"` Metadata map[string]interface{} `json:"metadata"` } type HealthCheckResult struct { ID string `json:"id"` Name string `json:"name"` Status string `json:"status"` Message string `json:"message"` Value float64 `json:"value"` Threshold float64 `json:"threshold"` Timestamp time.Time `json:"timestamp"` Duration time.Duration `json:"duration"` Error string `json:"error,omitempty"` Metadata map[string]interface{} `json:"metadata"` } type AlertManager struct { alerts map[string]Alert channels map[string]AlertChannel mu sync.RWMutex } type Alert struct { ID string `json:"id"` Level string `json:"level"` Message string `json:"message"` Source string `json:"source"` Timestamp time.Time `json:"timestamp"` Acknowledged bool `json:"acknowledged"` Metadata map[string]interface{} `json:"metadata"` } type AlertChannel struct { ID string `json:"id"` Name string `json:"name"` Type string `json:"type"` Config map[string]interface{} `json:"config"` Enabled bool `json:"enabled"` Metadata map[string]interface{} `json:"metadata"` } type MetricsStorage struct { path string retention time.Duration mu sync.RWMutex } type SystemStatus struct { Timestamp time.Time `json:"timestamp"` Uptime time.Duration `json:"uptime"` LoadAverage []float64 `json:"load_average"` CPUUsage float64 `json:"cpu_usage"` MemoryUsage float64 `json:"memory_usage"` DiskUsage float64 `json:"disk_usage"` NetworkIO NetworkStats `json:"network_io"` ProcessCount int `json:"process_count"` HealthStatus map[string]string `json:"health_status"` Alerts []Alert `json:"alerts"` Metadata map[string]interface{} `json:"metadata"` } type NetworkStats struct { BytesReceived uint64 `json:"bytes_received"` BytesSent uint64 `json:"bytes_sent"` PacketsReceived uint64 `json:"packets_received"` PacketsSent uint64 `json:"packets_sent"` ErrorsReceived uint64 `json:"errors_received"` ErrorsSent uint64 `json:"errors_sent"` } func NewSystemMonitor(config *MonitoringConfig, logger *logrus.Logger) *SystemMonitor { monitor := &SystemMonitor{ logger: logger, config: config, metrics: NewMetricsCollector(), healthChecks: make(map[string]HealthCheck), alerts: NewAlertManager(), storage: NewMetricsStorage(config.MetricsPath, time.Duration(config.RetentionDays)*24*time.Hour), running: false, } // Initialize health checks monitor.initializeHealthChecks() // Initialize alert channels monitor.initializeAlertChannels() return monitor } func NewMetricsCollector() *MetricsCollector { return &MetricsCollector{ metrics: make(map[string]Metric), } } func NewAlertManager() *AlertManager { return &AlertManager{ alerts: make(map[string]Alert), channels: make(map[string]AlertChannel), } } func NewMetricsStorage(path string, retention time.Duration) *MetricsStorage { return &MetricsStorage{ path: path, retention: retention, } } func (sm *SystemMonitor) initializeHealthChecks() { // System resource health checks sm.healthChecks["cpu_usage"] = HealthCheck{ ID: "cpu_usage", Name: "CPU Usage", Description: "Monitor CPU usage percentage", Type: "resource", Command: "top", Args: []string{"-bn1", "-p", "1"}, Interval: 30 * time.Second, Timeout: 10 * time.Second, Threshold: 80.0, Enabled: true, } sm.healthChecks["memory_usage"] = HealthCheck{ ID: "memory_usage", Name: "Memory Usage", Description: "Monitor memory usage percentage", Type: "resource", Command: "free", Args: []string{"-m"}, Interval: 30 * time.Second, Timeout: 10 * time.Second, Threshold: 85.0, Enabled: true, } sm.healthChecks["disk_usage"] = HealthCheck{ ID: "disk_usage", Name: "Disk Usage", Description: "Monitor disk usage percentage", Type: "resource", Command: "df", Args: []string{"-h", "/"}, Interval: 60 * time.Second, Timeout: 10 * time.Second, Threshold: 90.0, Enabled: true, } sm.healthChecks["load_average"] = HealthCheck{ ID: "load_average", Name: "Load Average", Description: "Monitor system load average", Type: "resource", Command: "uptime", Args: []string{}, Interval: 30 * time.Second, Timeout: 10 * time.Second, Threshold: 5.0, Enabled: true, } // Service health checks sm.healthChecks["debian_forge_service"] = HealthCheck{ ID: "debian_forge_service", Name: "Debian Forge Service", Description: "Check if Debian Forge service is running", Type: "service", Command: "systemctl", Args: []string{"is-active", "debian-forge"}, Interval: 60 * time.Second, Timeout: 10 * time.Second, Threshold: 0.0, Enabled: true, } sm.healthChecks["database_connection"] = HealthCheck{ ID: "database_connection", Name: "Database Connection", Description: "Check database connectivity", Type: "service", Command: "pg_isready", Args: []string{"-h", "localhost"}, Interval: 60 * time.Second, Timeout: 10 * time.Second, Threshold: 0.0, Enabled: true, } // Network health checks sm.healthChecks["network_connectivity"] = HealthCheck{ ID: "network_connectivity", Name: "Network Connectivity", Description: "Check basic network connectivity", Type: "network", Command: "ping", Args: []string{"-c", "1", "8.8.8.8"}, Interval: 60 * time.Second, Timeout: 10 * time.Second, Threshold: 0.0, Enabled: true, } } func (sm *SystemMonitor) initializeAlertChannels() { // Email alert channel sm.alerts.channels["email"] = AlertChannel{ ID: "email", Name: "Email Alerts", Type: "email", Enabled: true, Config: map[string]interface{}{ "smtp_server": "localhost", "smtp_port": 25, "from": "alerts@debian-forge.local", "to": []string{"admin@debian-forge.local"}, }, } // Slack alert channel sm.alerts.channels["slack"] = AlertChannel{ ID: "slack", Name: "Slack Alerts", Type: "slack", Enabled: false, Config: map[string]interface{}{ "webhook_url": "", "channel": "#alerts", }, } // Webhook alert channel sm.alerts.channels["webhook"] = AlertChannel{ ID: "webhook", Name: "Webhook Alerts", Type: "webhook", Enabled: false, Config: map[string]interface{}{ "url": "", "method": "POST", "headers": map[string]string{}, }, } } func (sm *SystemMonitor) Start() error { sm.mu.Lock() defer sm.mu.Unlock() if sm.running { return fmt.Errorf("monitor is already running") } sm.running = true sm.logger.Info("Starting system monitor") // Start monitoring goroutine go sm.monitoringLoop() // Start health check goroutines for _, check := range sm.healthChecks { if check.Enabled { go sm.healthCheckLoop(check) } } return nil } func (sm *SystemMonitor) Stop() error { sm.mu.Lock() defer sm.mu.Unlock() if !sm.running { return fmt.Errorf("monitor is not running") } sm.running = false sm.logger.Info("Stopping system monitor") return nil } func (sm *SystemMonitor) monitoringLoop() { ticker := time.NewTicker(sm.config.Interval) defer ticker.Stop() for { select { case <-ticker.C: if !sm.isRunning() { return } // Collect system metrics if err := sm.collectSystemMetrics(); err != nil { sm.logger.Errorf("Failed to collect system metrics: %v", err) } // Check thresholds and generate alerts sm.checkThresholds() // Store metrics if err := sm.storeMetrics(); err != nil { sm.logger.Errorf("Failed to store metrics: %v", err) } } } } func (sm *SystemMonitor) healthCheckLoop(check HealthCheck) { ticker := time.NewTicker(check.Interval) defer ticker.Stop() for { select { case <-ticker.C: if !sm.isRunning() { return } // Run health check result := sm.runHealthCheck(check) // Process result sm.processHealthCheckResult(result) } } } func (sm *SystemMonitor) isRunning() bool { sm.mu.RLock() defer sm.mu.RUnlock() return sm.running } func (sm *SystemMonitor) collectSystemMetrics() error { // Collect CPU metrics if err := sm.collectCPUMetrics(); err != nil { sm.logger.Warnf("Failed to collect CPU metrics: %v", err) } // Collect memory metrics if err := sm.collectMemoryMetrics(); err != nil { sm.logger.Warnf("Failed to collect memory metrics: %v", err) } // Collect disk metrics if err := sm.collectDiskMetrics(); err != nil { sm.logger.Warnf("Failed to collect disk metrics: %v", err) } // Collect network metrics if err := sm.collectNetworkMetrics(); err != nil { sm.logger.Warnf("Failed to collect network metrics: %v", err) } // Collect process metrics if err := sm.collectProcessMetrics(); err != nil { sm.logger.Warnf("Failed to collect process metrics: %v", err) } return nil } func (sm *SystemMonitor) collectCPUMetrics() error { // Read /proc/loadavg for load average data, err := os.ReadFile("/proc/loadavg") if err != nil { return fmt.Errorf("failed to read loadavg: %w", err) } fields := strings.Fields(string(data)) if len(fields) >= 3 { if load1, err := strconv.ParseFloat(fields[0], 64); err == nil { sm.metrics.setMetric("load_1min", load1, "", "gauge", map[string]string{"type": "load_average"}) } if load5, err := strconv.ParseFloat(fields[1], 64); err == nil { sm.metrics.setMetric("load_5min", load5, "", "gauge", map[string]string{"type": "load_average"}) } if load15, err := strconv.ParseFloat(fields[2], 64); err == nil { sm.metrics.setMetric("load_15min", load15, "", "gauge", map[string]string{"type": "load_average"}) } } // Read /proc/stat for CPU usage data, err = os.ReadFile("/proc/stat") if err != nil { return fmt.Errorf("failed to read stat: %w", err) } lines := strings.Split(string(data), "\n") for _, line := range lines { if strings.HasPrefix(line, "cpu ") { fields := strings.Fields(line) if len(fields) >= 5 { // Calculate CPU usage percentage total := 0.0 idle := 0.0 for i := 1; i < len(fields); i++ { if val, err := strconv.ParseFloat(fields[i], 64); err == nil { total += val if i == 4 { // idle time idle = val } } } if total > 0 { usage := ((total - idle) / total) * 100.0 sm.metrics.setMetric("cpu_usage", usage, "%", "gauge", map[string]string{"type": "cpu"}) } } break } } return nil } func (sm *SystemMonitor) collectMemoryMetrics() error { // Read /proc/meminfo for memory usage data, err := os.ReadFile("/proc/meminfo") if err != nil { return fmt.Errorf("failed to read meminfo: %w", err) } var total, available uint64 lines := strings.Split(string(data), "\n") for _, line := range lines { fields := strings.Fields(line) if len(fields) >= 2 { switch fields[0] { case "MemTotal:": if val, err := strconv.ParseUint(fields[1], 10, 64); err == nil { total = val } case "MemAvailable:": if val, err := strconv.ParseUint(fields[1], 10, 64); err == nil { available = val } } } } if total > 0 { usage := float64(total-available) / float64(total) * 100.0 sm.metrics.setMetric("memory_usage", usage, "%", "gauge", map[string]string{"type": "memory"}) sm.metrics.setMetric("memory_total", float64(total), "KB", "gauge", map[string]string{"type": "memory"}) sm.metrics.setMetric("memory_available", float64(available), "KB", "gauge", map[string]string{"type": "memory"}) } return nil } func (sm *SystemMonitor) collectDiskMetrics() error { // Use df command to get disk usage cmd := exec.Command("df", "-h", "/") output, err := cmd.Output() if err != nil { return fmt.Errorf("df command failed: %w", err) } lines := strings.Split(string(output), "\n") if len(lines) >= 2 { fields := strings.Fields(lines[1]) if len(fields) >= 5 { usageStr := strings.TrimSuffix(fields[4], "%") if usage, err := strconv.ParseFloat(usageStr, 64); err == nil { sm.metrics.setMetric("disk_usage", usage, "%", "gauge", map[string]string{"type": "disk", "mount": "/"}) } } } return nil } func (sm *SystemMonitor) collectNetworkMetrics() error { // Read /proc/net/dev for network statistics data, err := os.ReadFile("/proc/net/dev") if err != nil { return fmt.Errorf("failed to read net/dev: %w", err) } lines := strings.Split(string(data), "\n") for _, line := range lines { if strings.Contains(line, ":") && !strings.Contains(line, "lo:") { fields := strings.Fields(line) if len(fields) >= 17 { interfaceName := strings.TrimSuffix(fields[0], ":") if bytesReceived, err := strconv.ParseUint(fields[1], 10, 64); err == nil { sm.metrics.setMetric("network_bytes_received", float64(bytesReceived), "bytes", "counter", map[string]string{"interface": interfaceName}) } if bytesSent, err := strconv.ParseUint(fields[9], 10, 64); err == nil { sm.metrics.setMetric("network_bytes_sent", float64(bytesSent), "bytes", "counter", map[string]string{"interface": interfaceName}) } } } } return nil } func (sm *SystemMonitor) collectProcessMetrics() error { // Count running processes cmd := exec.Command("ps", "-e", "--no-headers") output, err := cmd.Output() if err != nil { return fmt.Errorf("ps command failed: %w", err) } lines := strings.Split(string(output), "\n") processCount := len(lines) if len(lines) > 0 && lines[len(lines)-1] == "" { processCount-- } sm.metrics.setMetric("process_count", float64(processCount), "count", "gauge", map[string]string{"type": "process"}) return nil } func (sm *SystemMonitor) runHealthCheck(check HealthCheck) HealthCheckResult { result := HealthCheckResult{ ID: check.ID, Name: check.Name, Status: "unknown", Message: "Health check completed", Timestamp: time.Now(), Threshold: check.Threshold, Metadata: make(map[string]interface{}), } startTime := time.Now() // Run the health check command cmd := exec.Command(check.Command, check.Args...) cmd.Timeout = check.Timeout output, err := cmd.Output() result.Duration = time.Since(startTime) if err != nil { result.Status = "failed" result.Message = "Health check command failed" result.Error = err.Error() return result } // Parse output based on check type switch check.Type { case "resource": result = sm.parseResourceCheck(check, string(output)) case "service": result = sm.parseServiceCheck(check, string(output)) case "network": result = sm.parseNetworkCheck(check, string(output)) default: result.Status = "unknown" result.Message = "Unknown check type" } return result } func (sm *SystemMonitor) parseResourceCheck(check HealthCheck, output string) HealthCheckResult { result := HealthCheckResult{ ID: check.ID, Name: check.Name, Status: "unknown", Message: "Resource check completed", Timestamp: time.Now(), Threshold: check.Threshold, Metadata: make(map[string]interface{}), } // Parse output based on command switch check.Command { case "top": // Parse top output for CPU usage lines := strings.Split(output, "\n") for _, line := range lines { if strings.Contains(line, "Cpu(s):") { fields := strings.Fields(line) for i, field := range fields { if strings.Contains(field, "%us") { if i > 0 { if usage, err := strconv.ParseFloat(fields[i-1], 64); err == nil { result.Value = usage if usage > check.Threshold { result.Status = "critical" result.Message = fmt.Sprintf("CPU usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold) } else { result.Status = "healthy" result.Message = fmt.Sprintf("CPU usage %.1f%% is within normal range", usage) } } } break } } } } case "free": // Parse free output for memory usage lines := strings.Split(output, "\n") if len(lines) >= 2 { fields := strings.Fields(lines[1]) if len(fields) >= 3 { if total, err := strconv.ParseFloat(fields[1], 64); err == nil { if used, err := strconv.ParseFloat(fields[2], 64); err == nil { usage := (used / total) * 100.0 result.Value = usage if usage > check.Threshold { result.Status = "critical" result.Message = fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold) } else { result.Status = "healthy" result.Message = fmt.Sprintf("Memory usage %.1f%% is within normal range", usage) } } } } } case "df": // Parse df output for disk usage lines := strings.Split(output, "\n") if len(lines) >= 2 { fields := strings.Fields(lines[1]) if len(fields) >= 5 { usageStr := strings.TrimSuffix(fields[4], "%") if usage, err := strconv.ParseFloat(usageStr, 64); err == nil { result.Value = usage if usage > check.Threshold { result.Status = "critical" result.Message = fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold) } else { result.Status = "healthy" result.Message = fmt.Sprintf("Disk usage %.1f%% is within normal range", usage) } } } } case "uptime": // Parse uptime output for load average fields := strings.Fields(output) if len(fields) >= 10 { loadStr := strings.TrimSuffix(fields[9], ",") if load, err := strconv.ParseFloat(loadStr, 64); err == nil { result.Value = load if load > check.Threshold { result.Status = "critical" result.Message = fmt.Sprintf("Load average %.2f exceeds threshold %.2f", load, check.Threshold) } else { result.Status = "healthy" result.Message = fmt.Sprintf("Load average %.2f is within normal range", load) } } } } return result } func (sm *SystemMonitor) parseServiceCheck(check HealthCheck, output string) HealthCheckResult { result := HealthCheckResult{ ID: check.ID, Name: check.Name, Status: "unknown", Message: "Service check completed", Timestamp: time.Now(), Threshold: check.Threshold, Metadata: make(map[string]interface{}), } output = strings.TrimSpace(output) switch output { case "active": result.Status = "healthy" result.Message = "Service is running" result.Value = 1.0 case "inactive": result.Status = "critical" result.Message = "Service is not running" result.Value = 0.0 default: result.Status = "unknown" result.Message = fmt.Sprintf("Service status: %s", output) } return result } func (sm *SystemMonitor) parseNetworkCheck(check HealthCheck, output string) HealthCheckResult { result := HealthCheckResult{ ID: check.ID, Name: check.Name, Status: "unknown", Message: "Network check completed", Timestamp: time.Now(), Threshold: check.Threshold, Metadata: make(map[string]interface{}), } // Check if ping was successful if strings.Contains(output, "1 received") { result.Status = "healthy" result.Message = "Network connectivity is working" result.Value = 1.0 } else { result.Status = "critical" result.Message = "Network connectivity failed" result.Value = 0.0 } return result } func (sm *SystemMonitor) processHealthCheckResult(result HealthCheckResult) { // Store health check result sm.metrics.setMetric(fmt.Sprintf("health_check_%s", result.ID), float64(sm.healthStatusToValue(result.Status)), "status", "gauge", map[string]string{"check": result.ID, "status": result.Status}) // Generate alert if status is critical if result.Status == "critical" { alert := Alert{ ID: generateAlertID(), Level: "critical", Message: result.Message, Source: fmt.Sprintf("health_check:%s", result.ID), Timestamp: time.Now(), Acknowledged: false, Metadata: map[string]interface{}{ "check_id": result.ID, "value": result.Value, "threshold": result.Threshold, }, } sm.alerts.addAlert(alert) } } func (sm *SystemMonitor) healthStatusToValue(status string) int { switch status { case "healthy": return 1 case "warning": return 2 case "critical": return 3 default: return 0 } } func (sm *SystemMonitor) checkThresholds() { sm.metrics.mu.RLock() defer sm.metrics.mu.RUnlock() for name, metric := range sm.metrics.metrics { if threshold, exists := sm.config.Thresholds[name]; exists { if metric.Value > threshold { alert := Alert{ ID: generateAlertID(), Level: "warning", Message: fmt.Sprintf("Metric %s (%.2f) exceeds threshold %.2f", name, metric.Value, threshold), Source: fmt.Sprintf("metric:%s", name), Timestamp: time.Now(), Acknowledged: false, Metadata: map[string]interface{}{ "metric_name": name, "value": metric.Value, "threshold": threshold, }, } sm.alerts.addAlert(alert) } } } } func (sm *SystemMonitor) storeMetrics() error { sm.metrics.mu.RLock() metrics := make(map[string]Metric) for k, v := range sm.metrics.metrics { metrics[k] = v } sm.metrics.mu.RUnlock() // Store metrics to file data, err := json.MarshalIndent(metrics, "", " ") if err != nil { return fmt.Errorf("failed to marshal metrics: %w", err) } // Create metrics directory if it doesn't exist if err := os.MkdirAll(sm.config.MetricsPath, 0755); err != nil { return fmt.Errorf("failed to create metrics directory: %w", err) } // Write metrics to file with timestamp timestamp := time.Now().Format("2006-01-02_15-04-05") filename := filepath.Join(sm.config.MetricsPath, fmt.Sprintf("metrics_%s.json", timestamp)) if err := os.WriteFile(filename, data, 0644); err != nil { return fmt.Errorf("failed to write metrics file: %w", err) } // Clean up old metrics files return sm.cleanupOldMetrics() } func (sm *SystemMonitor) cleanupOldMetrics() error { files, err := os.ReadDir(sm.config.MetricsPath) if err != nil { return fmt.Errorf("failed to read metrics directory: %w", err) } cutoff := time.Now().Add(-sm.storage.retention) for _, file := range files { if file.IsDir() || !strings.HasPrefix(file.Name(), "metrics_") { continue } // Extract timestamp from filename parts := strings.Split(file.Name(), "_") if len(parts) >= 3 { timestampStr := strings.Join(parts[1:3], "_") timestampStr = strings.TrimSuffix(timestampStr, ".json") if fileTime, err := time.Parse("2006-01-02_15-04-05", timestampStr); err == nil { if fileTime.Before(cutoff) { filePath := filepath.Join(sm.config.MetricsPath, file.Name()) if err := os.Remove(filePath); err != nil { sm.logger.Warnf("Failed to remove old metrics file %s: %v", filePath, err) } } } } } return nil } func (sm *SystemMonitor) GetSystemStatus() *SystemStatus { status := &SystemStatus{ Timestamp: time.Now(), HealthStatus: make(map[string]string), Alerts: sm.alerts.getActiveAlerts(), Metadata: make(map[string]interface{}), } // Get uptime if uptime, err := sm.getUptime(); err == nil { status.Uptime = uptime } // Get load average if loadAvg, err := sm.getLoadAverage(); err == nil { status.LoadAverage = loadAvg } // Get current metrics sm.metrics.mu.RLock() if cpuMetric, exists := sm.metrics.metrics["cpu_usage"]; exists { status.CPUUsage = cpuMetric.Value } if memMetric, exists := sm.metrics.metrics["memory_usage"]; exists { status.MemoryUsage = memMetric.Value } if diskMetric, exists := sm.metrics.metrics["disk_usage"]; exists { status.DiskUsage = diskMetric.Value } if procMetric, exists := sm.metrics.metrics["process_count"]; exists { status.ProcessCount = int(procMetric.Value) } sm.metrics.mu.RUnlock() // Get health status for each check for id, check := range sm.healthChecks { if check.Enabled { // Get latest health check result if metric, exists := sm.metrics.metrics[fmt.Sprintf("health_check_%s", id)]; exists { status.HealthStatus[id] = sm.valueToHealthStatus(int(metric.Value)) } else { status.HealthStatus[id] = "unknown" } } } return status } func (sm *SystemMonitor) getUptime() (time.Duration, error) { data, err := os.ReadFile("/proc/uptime") if err != nil { return 0, fmt.Errorf("failed to read uptime: %w", err) } fields := strings.Fields(string(data)) if len(fields) >= 1 { if seconds, err := strconv.ParseFloat(fields[0], 64); err == nil { return time.Duration(seconds) * time.Second, nil } } return 0, fmt.Errorf("failed to parse uptime") } func (sm *SystemMonitor) getLoadAverage() ([]float64, error) { data, err := os.ReadFile("/proc/loadavg") if err != nil { return nil, fmt.Errorf("failed to read loadavg: %w", err) } fields := strings.Fields(string(data)) if len(fields) >= 3 { loads := make([]float64, 3) for i := 0; i < 3; i++ { if load, err := strconv.ParseFloat(fields[i], 64); err == nil { loads[i] = load } } return loads, nil } return nil, fmt.Errorf("failed to parse load average") } func (sm *SystemMonitor) valueToHealthStatus(value int) string { switch value { case 1: return "healthy" case 2: return "warning" case 3: return "critical" default: return "unknown" } } // MetricsCollector methods func (mc *MetricsCollector) setMetric(name string, value float64, unit string, metricType string, labels map[string]string) { mc.mu.Lock() defer mc.mu.Unlock() mc.metrics[name] = Metric{ Name: name, Value: value, Unit: unit, Type: metricType, Timestamp: time.Now(), Labels: labels, Metadata: make(map[string]interface{}), } } // AlertManager methods func (am *AlertManager) addAlert(alert Alert) { am.mu.Lock() defer am.mu.Unlock() am.alerts[alert.ID] = alert // Send alert through channels for _, channel := range am.channels { if channel.Enabled { go am.sendAlert(alert, channel) } } } func (am *AlertManager) sendAlert(alert Alert, channel AlertChannel) { // This is a placeholder for alert sending // In production, implement actual alert delivery logic } func (am *AlertManager) getActiveAlerts() []Alert { am.mu.RLock() defer am.mu.RUnlock() var activeAlerts []Alert for _, alert := range am.alerts { if !alert.Acknowledged { activeAlerts = append(activeAlerts, alert) } } return activeAlerts } // Helper functions func generateAlertID() string { return fmt.Sprintf("alert-%d", time.Now().UnixNano()) }