debian-forge-composer/internal/monitoring/system_monitor.go
package monitoring

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/sirupsen/logrus"
)
type SystemMonitor struct {
logger *logrus.Logger
config *MonitoringConfig
metrics *MetricsCollector
healthChecks map[string]HealthCheck
alerts *AlertManager
storage *MetricsStorage
running bool
mu sync.RWMutex
}
type MonitoringConfig struct {
Enabled bool `json:"enabled"`
Interval time.Duration `json:"interval"`
MetricsPath string `json:"metrics_path"`
AlertPath string `json:"alert_path"`
RetentionDays int `json:"retention_days"`
Thresholds map[string]float64 `json:"thresholds"`
Metadata map[string]string `json:"metadata"`
}
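// For illustration, a MonitoringConfig of this shape would be produced by JSON such
// as the following (the paths and values here are assumptions, not shipped defaults):
//
//	{
//	  "enabled": true,
//	  "interval": 30000000000,
//	  "metrics_path": "/var/lib/debian-forge/metrics",
//	  "alert_path": "/var/lib/debian-forge/alerts",
//	  "retention_days": 7,
//	  "thresholds": {"cpu_usage": 80, "memory_usage": 85, "disk_usage": 90},
//	  "metadata": {"environment": "staging"}
//	}
//
// Note that with encoding/json a time.Duration encodes as integer nanoseconds,
// so 30000000000 above means 30 seconds.
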
type MetricsCollector struct {
metrics map[string]Metric
mu sync.RWMutex
}
type Metric struct {
Name string `json:"name"`
Value float64 `json:"value"`
Unit string `json:"unit"`
Type string `json:"type"`
Timestamp time.Time `json:"timestamp"`
Labels map[string]string `json:"labels"`
Metadata map[string]interface{} `json:"metadata"`
}
type HealthCheck struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Type string `json:"type"`
Command string `json:"command"`
Args []string `json:"args"`
Interval time.Duration `json:"interval"`
Timeout time.Duration `json:"timeout"`
Threshold float64 `json:"threshold"`
Enabled bool `json:"enabled"`
Metadata map[string]interface{} `json:"metadata"`
}
type HealthCheckResult struct {
ID string `json:"id"`
Name string `json:"name"`
Status string `json:"status"`
Message string `json:"message"`
Value float64 `json:"value"`
Threshold float64 `json:"threshold"`
Timestamp time.Time `json:"timestamp"`
Duration time.Duration `json:"duration"`
Error string `json:"error,omitempty"`
Metadata map[string]interface{} `json:"metadata"`
}
type AlertManager struct {
alerts map[string]Alert
channels map[string]AlertChannel
mu sync.RWMutex
}
type Alert struct {
ID string `json:"id"`
Level string `json:"level"`
Message string `json:"message"`
Source string `json:"source"`
Timestamp time.Time `json:"timestamp"`
Acknowledged bool `json:"acknowledged"`
Metadata map[string]interface{} `json:"metadata"`
}
type AlertChannel struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Config map[string]interface{} `json:"config"`
Enabled bool `json:"enabled"`
Metadata map[string]interface{} `json:"metadata"`
}
type MetricsStorage struct {
path string
retention time.Duration
mu sync.RWMutex
}
type SystemStatus struct {
Timestamp time.Time `json:"timestamp"`
Uptime time.Duration `json:"uptime"`
LoadAverage []float64 `json:"load_average"`
CPUUsage float64 `json:"cpu_usage"`
MemoryUsage float64 `json:"memory_usage"`
DiskUsage float64 `json:"disk_usage"`
NetworkIO NetworkStats `json:"network_io"`
ProcessCount int `json:"process_count"`
HealthStatus map[string]string `json:"health_status"`
Alerts []Alert `json:"alerts"`
Metadata map[string]interface{} `json:"metadata"`
}
type NetworkStats struct {
BytesReceived uint64 `json:"bytes_received"`
BytesSent uint64 `json:"bytes_sent"`
PacketsReceived uint64 `json:"packets_received"`
PacketsSent uint64 `json:"packets_sent"`
ErrorsReceived uint64 `json:"errors_received"`
ErrorsSent uint64 `json:"errors_sent"`
}
func NewSystemMonitor(config *MonitoringConfig, logger *logrus.Logger) *SystemMonitor {
monitor := &SystemMonitor{
logger: logger,
config: config,
metrics: NewMetricsCollector(),
healthChecks: make(map[string]HealthCheck),
alerts: NewAlertManager(),
storage: NewMetricsStorage(config.MetricsPath, time.Duration(config.RetentionDays)*24*time.Hour),
running: false,
}
// Initialize health checks
monitor.initializeHealthChecks()
// Initialize alert channels
monitor.initializeAlertChannels()
return monitor
}
func NewMetricsCollector() *MetricsCollector {
return &MetricsCollector{
metrics: make(map[string]Metric),
}
}
func NewAlertManager() *AlertManager {
return &AlertManager{
alerts: make(map[string]Alert),
channels: make(map[string]AlertChannel),
}
}
func NewMetricsStorage(path string, retention time.Duration) *MetricsStorage {
return &MetricsStorage{
path: path,
retention: retention,
}
}
func (sm *SystemMonitor) initializeHealthChecks() {
// System resource health checks
sm.healthChecks["cpu_usage"] = HealthCheck{
ID: "cpu_usage",
Name: "CPU Usage",
Description: "Monitor CPU usage percentage",
Type: "resource",
Command: "top",
Args: []string{"-bn1", "-p", "1"},
Interval: 30 * time.Second,
Timeout: 10 * time.Second,
Threshold: 80.0,
Enabled: true,
}
sm.healthChecks["memory_usage"] = HealthCheck{
ID: "memory_usage",
Name: "Memory Usage",
Description: "Monitor memory usage percentage",
Type: "resource",
Command: "free",
Args: []string{"-m"},
Interval: 30 * time.Second,
Timeout: 10 * time.Second,
Threshold: 85.0,
Enabled: true,
}
sm.healthChecks["disk_usage"] = HealthCheck{
ID: "disk_usage",
Name: "Disk Usage",
Description: "Monitor disk usage percentage",
Type: "resource",
Command: "df",
Args: []string{"-h", "/"},
Interval: 60 * time.Second,
Timeout: 10 * time.Second,
Threshold: 90.0,
Enabled: true,
}
sm.healthChecks["load_average"] = HealthCheck{
ID: "load_average",
Name: "Load Average",
Description: "Monitor system load average",
Type: "resource",
Command: "uptime",
Args: []string{},
Interval: 30 * time.Second,
Timeout: 10 * time.Second,
Threshold: 5.0,
Enabled: true,
}
// Service health checks
sm.healthChecks["debian_forge_service"] = HealthCheck{
ID: "debian_forge_service",
Name: "Debian Forge Service",
Description: "Check if Debian Forge service is running",
Type: "service",
Command: "systemctl",
Args: []string{"is-active", "debian-forge"},
Interval: 60 * time.Second,
Timeout: 10 * time.Second,
Threshold: 0.0,
Enabled: true,
}
sm.healthChecks["database_connection"] = HealthCheck{
ID: "database_connection",
Name: "Database Connection",
Description: "Check database connectivity",
Type: "service",
Command: "pg_isready",
Args: []string{"-h", "localhost"},
Interval: 60 * time.Second,
Timeout: 10 * time.Second,
Threshold: 0.0,
Enabled: true,
}
// Network health checks
sm.healthChecks["network_connectivity"] = HealthCheck{
ID: "network_connectivity",
Name: "Network Connectivity",
Description: "Check basic network connectivity",
Type: "network",
Command: "ping",
Args: []string{"-c", "1", "8.8.8.8"},
Interval: 60 * time.Second,
Timeout: 10 * time.Second,
Threshold: 0.0,
Enabled: true,
}
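// Further checks could be registered here before Start is called. For example, a
// hypothetical service check for a build worker (the unit name is illustrative,
// not something this package defines):
//
//	sm.healthChecks["build_worker"] = HealthCheck{
//		ID:          "build_worker",
//		Name:        "Build Worker",
//		Description: "Check if the build worker service is running",
//		Type:        "service",
//		Command:     "systemctl",
//		Args:        []string{"is-active", "debian-forge-worker"},
//		Interval:    60 * time.Second,
//		Timeout:     10 * time.Second,
//		Enabled:     false,
//	}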
}
func (sm *SystemMonitor) initializeAlertChannels() {
// Email alert channel
sm.alerts.channels["email"] = AlertChannel{
ID: "email",
Name: "Email Alerts",
Type: "email",
Enabled: true,
Config: map[string]interface{}{
"smtp_server": "localhost",
"smtp_port": 25,
"from": "alerts@debian-forge.local",
"to": []string{"admin@debian-forge.local"},
},
}
// Slack alert channel
sm.alerts.channels["slack"] = AlertChannel{
ID: "slack",
Name: "Slack Alerts",
Type: "slack",
Enabled: false,
Config: map[string]interface{}{
"webhook_url": "",
"channel": "#alerts",
},
}
// Webhook alert channel
sm.alerts.channels["webhook"] = AlertChannel{
ID: "webhook",
Name: "Webhook Alerts",
Type: "webhook",
Enabled: false,
Config: map[string]interface{}{
"url": "",
"method": "POST",
"headers": map[string]string{},
},
}
}
func (sm *SystemMonitor) Start() error {
sm.mu.Lock()
defer sm.mu.Unlock()
if sm.running {
return fmt.Errorf("monitor is already running")
}
sm.running = true
sm.logger.Info("Starting system monitor")
// Start monitoring goroutine
go sm.monitoringLoop()
// Start health check goroutines
for _, check := range sm.healthChecks {
if check.Enabled {
go sm.healthCheckLoop(check)
}
}
return nil
}
func (sm *SystemMonitor) Stop() error {
sm.mu.Lock()
defer sm.mu.Unlock()
if !sm.running {
return fmt.Errorf("monitor is not running")
}
sm.running = false
sm.logger.Info("Stopping system monitor")
return nil
}
func (sm *SystemMonitor) monitoringLoop() {
ticker := time.NewTicker(sm.config.Interval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if !sm.isRunning() {
return
}
// Collect system metrics
if err := sm.collectSystemMetrics(); err != nil {
sm.logger.Errorf("Failed to collect system metrics: %v", err)
}
// Check thresholds and generate alerts
sm.checkThresholds()
// Store metrics
if err := sm.storeMetrics(); err != nil {
sm.logger.Errorf("Failed to store metrics: %v", err)
}
}
}
}
func (sm *SystemMonitor) healthCheckLoop(check HealthCheck) {
ticker := time.NewTicker(check.Interval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if !sm.isRunning() {
return
}
// Run health check
result := sm.runHealthCheck(check)
// Process result
sm.processHealthCheckResult(result)
}
}
}
func (sm *SystemMonitor) isRunning() bool {
sm.mu.RLock()
defer sm.mu.RUnlock()
return sm.running
}
func (sm *SystemMonitor) collectSystemMetrics() error {
// Collect CPU metrics
if err := sm.collectCPUMetrics(); err != nil {
sm.logger.Warnf("Failed to collect CPU metrics: %v", err)
}
// Collect memory metrics
if err := sm.collectMemoryMetrics(); err != nil {
sm.logger.Warnf("Failed to collect memory metrics: %v", err)
}
// Collect disk metrics
if err := sm.collectDiskMetrics(); err != nil {
sm.logger.Warnf("Failed to collect disk metrics: %v", err)
}
// Collect network metrics
if err := sm.collectNetworkMetrics(); err != nil {
sm.logger.Warnf("Failed to collect network metrics: %v", err)
}
// Collect process metrics
if err := sm.collectProcessMetrics(); err != nil {
sm.logger.Warnf("Failed to collect process metrics: %v", err)
}
return nil
}
func (sm *SystemMonitor) collectCPUMetrics() error {
// Read /proc/loadavg for load average
data, err := os.ReadFile("/proc/loadavg")
if err != nil {
return fmt.Errorf("failed to read loadavg: %w", err)
}
fields := strings.Fields(string(data))
if len(fields) >= 3 {
if load1, err := strconv.ParseFloat(fields[0], 64); err == nil {
sm.metrics.setMetric("load_1min", load1, "", "gauge", map[string]string{"type": "load_average"})
}
if load5, err := strconv.ParseFloat(fields[1], 64); err == nil {
sm.metrics.setMetric("load_5min", load5, "", "gauge", map[string]string{"type": "load_average"})
}
if load15, err := strconv.ParseFloat(fields[2], 64); err == nil {
sm.metrics.setMetric("load_15min", load15, "", "gauge", map[string]string{"type": "load_average"})
}
}
// Read /proc/stat for CPU usage
data, err = os.ReadFile("/proc/stat")
if err != nil {
return fmt.Errorf("failed to read stat: %w", err)
}
lines := strings.Split(string(data), "\n")
for _, line := range lines {
if strings.HasPrefix(line, "cpu ") {
fields := strings.Fields(line)
if len(fields) >= 5 {
// Calculate CPU usage percentage from the cumulative /proc/stat counters.
// Because these values are totals since boot, this yields average usage since
// boot; the delta between two samples would give instantaneous usage.
total := 0.0
idle := 0.0
for i := 1; i < len(fields); i++ {
if val, err := strconv.ParseFloat(fields[i], 64); err == nil {
total += val
if i == 4 { // idle time
idle = val
}
}
}
if total > 0 {
usage := ((total - idle) / total) * 100.0
sm.metrics.setMetric("cpu_usage", usage, "%", "gauge", map[string]string{"type": "cpu"})
}
}
break
}
}
return nil
}
func (sm *SystemMonitor) collectMemoryMetrics() error {
// Read /proc/meminfo for memory usage
data, err := os.ReadFile("/proc/meminfo")
if err != nil {
return fmt.Errorf("failed to read meminfo: %w", err)
}
var total, available uint64
lines := strings.Split(string(data), "\n")
for _, line := range lines {
fields := strings.Fields(line)
if len(fields) >= 2 {
switch fields[0] {
case "MemTotal:":
if val, err := strconv.ParseUint(fields[1], 10, 64); err == nil {
total = val
}
case "MemAvailable:":
if val, err := strconv.ParseUint(fields[1], 10, 64); err == nil {
available = val
}
}
}
}
if total > 0 {
usage := float64(total-available) / float64(total) * 100.0
sm.metrics.setMetric("memory_usage", usage, "%", "gauge", map[string]string{"type": "memory"})
sm.metrics.setMetric("memory_total", float64(total), "KB", "gauge", map[string]string{"type": "memory"})
sm.metrics.setMetric("memory_available", float64(available), "KB", "gauge", map[string]string{"type": "memory"})
}
return nil
}
func (sm *SystemMonitor) collectDiskMetrics() error {
// Use df command to get disk usage
cmd := exec.Command("df", "-h", "/")
output, err := cmd.Output()
if err != nil {
return fmt.Errorf("df command failed: %w", err)
}
lines := strings.Split(string(output), "\n")
if len(lines) >= 2 {
fields := strings.Fields(lines[1])
if len(fields) >= 5 {
usageStr := strings.TrimSuffix(fields[4], "%")
if usage, err := strconv.ParseFloat(usageStr, 64); err == nil {
sm.metrics.setMetric("disk_usage", usage, "%", "gauge", map[string]string{"type": "disk", "mount": "/"})
}
}
}
return nil
}
func (sm *SystemMonitor) collectNetworkMetrics() error {
// Read /proc/net/dev for network statistics
data, err := os.ReadFile("/proc/net/dev")
if err != nil {
return fmt.Errorf("failed to read net/dev: %w", err)
}
lines := strings.Split(string(data), "\n")
for _, line := range lines {
// Each data line is "<iface>: <16 counters>". Split on the colon explicitly,
// because with large counters the first value can abut the colon with no space.
idx := strings.Index(line, ":")
if idx < 0 {
continue
}
interfaceName := strings.TrimSpace(line[:idx])
if interfaceName == "lo" {
continue
}
fields := strings.Fields(line[idx+1:])
if len(fields) >= 16 {
if bytesReceived, err := strconv.ParseUint(fields[0], 10, 64); err == nil {
sm.metrics.setMetric("network_bytes_received", float64(bytesReceived), "bytes", "counter",
map[string]string{"interface": interfaceName})
}
if bytesSent, err := strconv.ParseUint(fields[8], 10, 64); err == nil {
sm.metrics.setMetric("network_bytes_sent", float64(bytesSent), "bytes", "counter",
map[string]string{"interface": interfaceName})
}
}
}
return nil
}
func (sm *SystemMonitor) collectProcessMetrics() error {
// Count running processes
cmd := exec.Command("ps", "-e", "--no-headers")
output, err := cmd.Output()
if err != nil {
return fmt.Errorf("ps command failed: %w", err)
}
lines := strings.Split(string(output), "\n")
processCount := len(lines)
if len(lines) > 0 && lines[len(lines)-1] == "" {
processCount--
}
sm.metrics.setMetric("process_count", float64(processCount), "count", "gauge", map[string]string{"type": "process"})
return nil
}
func (sm *SystemMonitor) runHealthCheck(check HealthCheck) HealthCheckResult {
result := HealthCheckResult{
ID: check.ID,
Name: check.Name,
Status: "unknown",
Message: "Health check completed",
Timestamp: time.Now(),
Threshold: check.Threshold,
Metadata: make(map[string]interface{}),
}
startTime := time.Now()
// Run the health check command with a timeout. exec.Cmd has no Timeout field,
// so bound the run with a context that is cancelled after check.Timeout.
ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
defer cancel()
cmd := exec.CommandContext(ctx, check.Command, check.Args...)
output, err := cmd.Output()
result.Duration = time.Since(startTime)
if err != nil {
result.Status = "failed"
result.Message = "Health check command failed"
result.Error = err.Error()
return result
}
// Parse output based on check type. The parse helpers build a fresh result,
// so carry over the measured duration afterwards.
duration := result.Duration
switch check.Type {
case "resource":
result = sm.parseResourceCheck(check, string(output))
case "service":
result = sm.parseServiceCheck(check, string(output))
case "network":
result = sm.parseNetworkCheck(check, string(output))
default:
result.Status = "unknown"
result.Message = "Unknown check type"
}
result.Duration = duration
return result
}
func (sm *SystemMonitor) parseResourceCheck(check HealthCheck, output string) HealthCheckResult {
result := HealthCheckResult{
ID: check.ID,
Name: check.Name,
Status: "unknown",
Message: "Resource check completed",
Timestamp: time.Now(),
Threshold: check.Threshold,
Metadata: make(map[string]interface{}),
}
// Parse output based on command
switch check.Command {
case "top":
// Parse top output for CPU usage
lines := strings.Split(output, "\n")
for _, line := range lines {
if strings.Contains(line, "Cpu(s):") {
fields := strings.Fields(line)
for i, field := range fields {
if strings.Contains(field, "%us") {
if i > 0 {
if usage, err := strconv.ParseFloat(fields[i-1], 64); err == nil {
result.Value = usage
if usage > check.Threshold {
result.Status = "critical"
result.Message = fmt.Sprintf("CPU usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold)
} else {
result.Status = "healthy"
result.Message = fmt.Sprintf("CPU usage %.1f%% is within normal range", usage)
}
}
}
break
}
}
}
}
case "free":
// Parse free output for memory usage
lines := strings.Split(output, "\n")
if len(lines) >= 2 {
fields := strings.Fields(lines[1])
if len(fields) >= 3 {
if total, err := strconv.ParseFloat(fields[1], 64); err == nil {
if used, err := strconv.ParseFloat(fields[2], 64); err == nil {
usage := (used / total) * 100.0
result.Value = usage
if usage > check.Threshold {
result.Status = "critical"
result.Message = fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold)
} else {
result.Status = "healthy"
result.Message = fmt.Sprintf("Memory usage %.1f%% is within normal range", usage)
}
}
}
}
}
case "df":
// Parse df output for disk usage
lines := strings.Split(output, "\n")
if len(lines) >= 2 {
fields := strings.Fields(lines[1])
if len(fields) >= 5 {
usageStr := strings.TrimSuffix(fields[4], "%")
if usage, err := strconv.ParseFloat(usageStr, 64); err == nil {
result.Value = usage
if usage > check.Threshold {
result.Status = "critical"
result.Message = fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold)
} else {
result.Status = "healthy"
result.Message = fmt.Sprintf("Disk usage %.1f%% is within normal range", usage)
}
}
}
}
case "uptime":
// Parse uptime output for load average
fields := strings.Fields(output)
if len(fields) >= 10 {
loadStr := strings.TrimSuffix(fields[9], ",")
if load, err := strconv.ParseFloat(loadStr, 64); err == nil {
result.Value = load
if load > check.Threshold {
result.Status = "critical"
result.Message = fmt.Sprintf("Load average %.2f exceeds threshold %.2f", load, check.Threshold)
} else {
result.Status = "healthy"
result.Message = fmt.Sprintf("Load average %.2f is within normal range", load)
}
}
}
}
return result
}
func (sm *SystemMonitor) parseServiceCheck(check HealthCheck, output string) HealthCheckResult {
result := HealthCheckResult{
ID: check.ID,
Name: check.Name,
Status: "unknown",
Message: "Service check completed",
Timestamp: time.Now(),
Threshold: check.Threshold,
Metadata: make(map[string]interface{}),
}
output = strings.TrimSpace(output)
switch output {
case "active":
result.Status = "healthy"
result.Message = "Service is running"
result.Value = 1.0
case "inactive":
result.Status = "critical"
result.Message = "Service is not running"
result.Value = 0.0
default:
result.Status = "unknown"
result.Message = fmt.Sprintf("Service status: %s", output)
}
return result
}
func (sm *SystemMonitor) parseNetworkCheck(check HealthCheck, output string) HealthCheckResult {
result := HealthCheckResult{
ID: check.ID,
Name: check.Name,
Status: "unknown",
Message: "Network check completed",
Timestamp: time.Now(),
Threshold: check.Threshold,
Metadata: make(map[string]interface{}),
}
// Check if ping was successful
if strings.Contains(output, "1 received") {
result.Status = "healthy"
result.Message = "Network connectivity is working"
result.Value = 1.0
} else {
result.Status = "critical"
result.Message = "Network connectivity failed"
result.Value = 0.0
}
return result
}
func (sm *SystemMonitor) processHealthCheckResult(result HealthCheckResult) {
// Store health check result
sm.metrics.setMetric(fmt.Sprintf("health_check_%s", result.ID),
float64(sm.healthStatusToValue(result.Status)), "status", "gauge",
map[string]string{"check": result.ID, "status": result.Status})
// Generate alert if status is critical
if result.Status == "critical" {
alert := Alert{
ID: generateAlertID(),
Level: "critical",
Message: result.Message,
Source: fmt.Sprintf("health_check:%s", result.ID),
Timestamp: time.Now(),
Acknowledged: false,
Metadata: map[string]interface{}{
"check_id": result.ID,
"value": result.Value,
"threshold": result.Threshold,
},
}
sm.alerts.addAlert(alert)
}
}
func (sm *SystemMonitor) healthStatusToValue(status string) int {
switch status {
case "healthy":
return 1
case "warning":
return 2
case "critical":
return 3
default:
return 0
}
}
func (sm *SystemMonitor) checkThresholds() {
sm.metrics.mu.RLock()
defer sm.metrics.mu.RUnlock()
for name, metric := range sm.metrics.metrics {
if threshold, exists := sm.config.Thresholds[name]; exists {
if metric.Value > threshold {
alert := Alert{
ID: generateAlertID(),
Level: "warning",
Message: fmt.Sprintf("Metric %s (%.2f) exceeds threshold %.2f", name, metric.Value, threshold),
Source: fmt.Sprintf("metric:%s", name),
Timestamp: time.Now(),
Acknowledged: false,
Metadata: map[string]interface{}{
"metric_name": name,
"value": metric.Value,
"threshold": threshold,
},
}
sm.alerts.addAlert(alert)
}
}
}
}
func (sm *SystemMonitor) storeMetrics() error {
sm.metrics.mu.RLock()
metrics := make(map[string]Metric)
for k, v := range sm.metrics.metrics {
metrics[k] = v
}
sm.metrics.mu.RUnlock()
// Store metrics to file
data, err := json.MarshalIndent(metrics, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal metrics: %w", err)
}
// Create metrics directory if it doesn't exist
if err := os.MkdirAll(sm.config.MetricsPath, 0755); err != nil {
return fmt.Errorf("failed to create metrics directory: %w", err)
}
// Write metrics to file with timestamp
timestamp := time.Now().Format("2006-01-02_15-04-05")
filename := filepath.Join(sm.config.MetricsPath, fmt.Sprintf("metrics_%s.json", timestamp))
if err := os.WriteFile(filename, data, 0644); err != nil {
return fmt.Errorf("failed to write metrics file: %w", err)
}
// Clean up old metrics files
return sm.cleanupOldMetrics()
}
func (sm *SystemMonitor) cleanupOldMetrics() error {
files, err := os.ReadDir(sm.config.MetricsPath)
if err != nil {
return fmt.Errorf("failed to read metrics directory: %w", err)
}
cutoff := time.Now().Add(-sm.storage.retention)
for _, file := range files {
if file.IsDir() || !strings.HasPrefix(file.Name(), "metrics_") {
continue
}
// Extract timestamp from filename
parts := strings.Split(file.Name(), "_")
if len(parts) >= 3 {
timestampStr := strings.Join(parts[1:3], "_")
timestampStr = strings.TrimSuffix(timestampStr, ".json")
if fileTime, err := time.Parse("2006-01-02_15-04-05", timestampStr); err == nil {
if fileTime.Before(cutoff) {
filePath := filepath.Join(sm.config.MetricsPath, file.Name())
if err := os.Remove(filePath); err != nil {
sm.logger.Warnf("Failed to remove old metrics file %s: %v", filePath, err)
}
}
}
}
}
return nil
}
func (sm *SystemMonitor) GetSystemStatus() *SystemStatus {
status := &SystemStatus{
Timestamp: time.Now(),
HealthStatus: make(map[string]string),
Alerts: sm.alerts.getActiveAlerts(),
Metadata: make(map[string]interface{}),
}
// Get uptime
if uptime, err := sm.getUptime(); err == nil {
status.Uptime = uptime
}
// Get load average
if loadAvg, err := sm.getLoadAverage(); err == nil {
status.LoadAverage = loadAvg
}
// Get current metrics
sm.metrics.mu.RLock()
if cpuMetric, exists := sm.metrics.metrics["cpu_usage"]; exists {
status.CPUUsage = cpuMetric.Value
}
if memMetric, exists := sm.metrics.metrics["memory_usage"]; exists {
status.MemoryUsage = memMetric.Value
}
if diskMetric, exists := sm.metrics.metrics["disk_usage"]; exists {
status.DiskUsage = diskMetric.Value
}
if procMetric, exists := sm.metrics.metrics["process_count"]; exists {
status.ProcessCount = int(procMetric.Value)
}
// Get health status for each check while still holding the metrics read lock,
// since this loop also reads the metrics map directly.
for id, check := range sm.healthChecks {
if check.Enabled {
// Get latest health check result
if metric, exists := sm.metrics.metrics[fmt.Sprintf("health_check_%s", id)]; exists {
status.HealthStatus[id] = sm.valueToHealthStatus(int(metric.Value))
} else {
status.HealthStatus[id] = "unknown"
}
}
}
sm.metrics.mu.RUnlock()
return status
}
func (sm *SystemMonitor) getUptime() (time.Duration, error) {
data, err := os.ReadFile("/proc/uptime")
if err != nil {
return 0, fmt.Errorf("failed to read uptime: %w", err)
}
fields := strings.Fields(string(data))
if len(fields) >= 1 {
if seconds, err := strconv.ParseFloat(fields[0], 64); err == nil {
return time.Duration(seconds) * time.Second, nil
}
}
return 0, fmt.Errorf("failed to parse uptime")
}
func (sm *SystemMonitor) getLoadAverage() ([]float64, error) {
data, err := os.ReadFile("/proc/loadavg")
if err != nil {
return nil, fmt.Errorf("failed to read loadavg: %w", err)
}
fields := strings.Fields(string(data))
if len(fields) >= 3 {
loads := make([]float64, 3)
for i := 0; i < 3; i++ {
if load, err := strconv.ParseFloat(fields[i], 64); err == nil {
loads[i] = load
}
}
return loads, nil
}
return nil, fmt.Errorf("failed to parse load average")
}
func (sm *SystemMonitor) valueToHealthStatus(value int) string {
switch value {
case 1:
return "healthy"
case 2:
return "warning"
case 3:
return "critical"
default:
return "unknown"
}
}
// MetricsCollector methods
func (mc *MetricsCollector) setMetric(name string, value float64, unit string, metricType string, labels map[string]string) {
mc.mu.Lock()
defer mc.mu.Unlock()
mc.metrics[name] = Metric{
Name: name,
Value: value,
Unit: unit,
Type: metricType,
Timestamp: time.Now(),
Labels: labels,
Metadata: make(map[string]interface{}),
}
}
// AlertManager methods
func (am *AlertManager) addAlert(alert Alert) {
am.mu.Lock()
defer am.mu.Unlock()
am.alerts[alert.ID] = alert
// Send alert through channels
for _, channel := range am.channels {
if channel.Enabled {
go am.sendAlert(alert, channel)
}
}
}
func (am *AlertManager) sendAlert(alert Alert, channel AlertChannel) {
// Placeholder: actual delivery is not implemented yet. In production this would
// dispatch on channel.Type ("email", "slack", "webhook"); an illustrative webhook
// sketch follows below.
}
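// sendWebhookAlert is an illustrative sketch of delivery for the "webhook" channel
// type defined in initializeAlertChannels: it POSTs the alert as JSON to the
// channel's configured "url". For brevity it ignores the "method" and "headers"
// settings, and it is not yet wired into sendAlert above.
func (am *AlertManager) sendWebhookAlert(alert Alert, channel AlertChannel) error {
	url, ok := channel.Config["url"].(string)
	if !ok || url == "" {
		return fmt.Errorf("webhook channel %s has no url configured", channel.ID)
	}
	payload, err := json.Marshal(alert)
	if err != nil {
		return fmt.Errorf("failed to marshal alert: %w", err)
	}
	// Bound the request so a slow endpoint cannot block the alerting goroutine.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Post(url, "application/json", bytes.NewReader(payload))
	if err != nil {
		return fmt.Errorf("failed to deliver alert to %s: %w", url, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return fmt.Errorf("webhook %s returned status %d", url, resp.StatusCode)
	}
	return nil
}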
func (am *AlertManager) getActiveAlerts() []Alert {
am.mu.RLock()
defer am.mu.RUnlock()
var activeAlerts []Alert
for _, alert := range am.alerts {
if !alert.Acknowledged {
activeAlerts = append(activeAlerts, alert)
}
}
return activeAlerts
}
// Helper functions
func generateAlertID() string {
return fmt.Sprintf("alert-%d", time.Now().UnixNano())
}
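// Example wiring (an illustrative sketch, not the composer's actual start-up code;
// the metrics path and threshold values below are assumptions):
//
//	cfg := &MonitoringConfig{
//		Enabled:       true,
//		Interval:      30 * time.Second,
//		MetricsPath:   "/var/lib/debian-forge/metrics",
//		RetentionDays: 7,
//		Thresholds:    map[string]float64{"cpu_usage": 80, "memory_usage": 85},
//	}
//	monitor := NewSystemMonitor(cfg, logrus.New())
//	if err := monitor.Start(); err != nil {
//		logrus.Fatalf("failed to start system monitor: %v", err)
//	}
//	defer monitor.Stop()
//	status := monitor.GetSystemStatus()
//	logrus.Infof("CPU %.1f%%, memory %.1f%%, %d active alerts",
//		status.CPUUsage, status.MemoryUsage, len(status.Alerts))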