package monitoring

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/sirupsen/logrus"
)

// SystemMonitor periodically collects system metrics, runs command-based
// health checks, and raises alerts when configured thresholds are exceeded.
type SystemMonitor struct {
	logger       *logrus.Logger
	config       *MonitoringConfig
	metrics      *MetricsCollector
	healthChecks map[string]HealthCheck
	alerts       *AlertManager
	storage      *MetricsStorage
	running      bool
	mu           sync.RWMutex
}

// MonitoringConfig controls collection frequency, metric storage and
// retention, and the per-metric alert thresholds.
type MonitoringConfig struct {
	Enabled       bool               `json:"enabled"`
	Interval      time.Duration      `json:"interval"`
	MetricsPath   string             `json:"metrics_path"`
	AlertPath     string             `json:"alert_path"`
	RetentionDays int                `json:"retention_days"`
	Thresholds    map[string]float64 `json:"thresholds"`
	Metadata      map[string]string  `json:"metadata"`
}
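
// A MonitoringConfig is typically loaded from JSON. A minimal sketch of
// such a file (all values illustrative, not defaults shipped anywhere):
//
//	{
//	  "enabled": true,
//	  "interval": 30000000000,
//	  "metrics_path": "/var/lib/debian-forge/metrics",
//	  "alert_path": "/var/lib/debian-forge/alerts",
//	  "retention_days": 7,
//	  "thresholds": {"cpu_usage": 80, "memory_usage": 85}
//	}
//
// Note that time.Duration marshals to and from JSON as integer
// nanoseconds, so "interval" above is 30s expressed in nanoseconds.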

// MetricsCollector holds the most recent value for each named metric.
type MetricsCollector struct {
	metrics map[string]Metric
	mu      sync.RWMutex
}

// Metric is a single named measurement with a value, unit, and labels.
type Metric struct {
	Name      string                 `json:"name"`
	Value     float64                `json:"value"`
	Unit      string                 `json:"unit"`
	Type      string                 `json:"type"`
	Timestamp time.Time              `json:"timestamp"`
	Labels    map[string]string      `json:"labels"`
	Metadata  map[string]interface{} `json:"metadata"`
}

// HealthCheck describes a periodic command-based check and the threshold
// that decides whether its result is healthy.
type HealthCheck struct {
	ID          string                 `json:"id"`
	Name        string                 `json:"name"`
	Description string                 `json:"description"`
	Type        string                 `json:"type"`
	Command     string                 `json:"command"`
	Args        []string               `json:"args"`
	Interval    time.Duration          `json:"interval"`
	Timeout     time.Duration          `json:"timeout"`
	Threshold   float64                `json:"threshold"`
	Enabled     bool                   `json:"enabled"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// HealthCheckResult is the outcome of a single health check run.
type HealthCheckResult struct {
	ID        string                 `json:"id"`
	Name      string                 `json:"name"`
	Status    string                 `json:"status"`
	Message   string                 `json:"message"`
	Value     float64                `json:"value"`
	Threshold float64                `json:"threshold"`
	Timestamp time.Time              `json:"timestamp"`
	Duration  time.Duration          `json:"duration"`
	Error     string                 `json:"error,omitempty"`
	Metadata  map[string]interface{} `json:"metadata"`
}

// AlertManager stores raised alerts and fans them out to the configured
// delivery channels.
type AlertManager struct {
	alerts   map[string]Alert
	channels map[string]AlertChannel
	mu       sync.RWMutex
}

// Alert is a single raised alert with its severity and origin.
type Alert struct {
	ID           string                 `json:"id"`
	Level        string                 `json:"level"`
	Message      string                 `json:"message"`
	Source       string                 `json:"source"`
	Timestamp    time.Time              `json:"timestamp"`
	Acknowledged bool                   `json:"acknowledged"`
	Metadata     map[string]interface{} `json:"metadata"`
}

// AlertChannel describes one delivery mechanism (email, Slack, webhook).
type AlertChannel struct {
	ID       string                 `json:"id"`
	Name     string                 `json:"name"`
	Type     string                 `json:"type"`
	Config   map[string]interface{} `json:"config"`
	Enabled  bool                   `json:"enabled"`
	Metadata map[string]interface{} `json:"metadata"`
}

// MetricsStorage describes where metric snapshots are written and how
// long they are kept.
type MetricsStorage struct {
	path      string
	retention time.Duration
	mu        sync.RWMutex
}

// SystemStatus is a point-in-time snapshot of the monitored host.
type SystemStatus struct {
	Timestamp    time.Time              `json:"timestamp"`
	Uptime       time.Duration          `json:"uptime"`
	LoadAverage  []float64              `json:"load_average"`
	CPUUsage     float64                `json:"cpu_usage"`
	MemoryUsage  float64                `json:"memory_usage"`
	DiskUsage    float64                `json:"disk_usage"`
	NetworkIO    NetworkStats           `json:"network_io"`
	ProcessCount int                    `json:"process_count"`
	HealthStatus map[string]string      `json:"health_status"`
	Alerts       []Alert                `json:"alerts"`
	Metadata     map[string]interface{} `json:"metadata"`
}

// NetworkStats aggregates interface-level traffic counters.
type NetworkStats struct {
	BytesReceived   uint64 `json:"bytes_received"`
	BytesSent       uint64 `json:"bytes_sent"`
	PacketsReceived uint64 `json:"packets_received"`
	PacketsSent     uint64 `json:"packets_sent"`
	ErrorsReceived  uint64 `json:"errors_received"`
	ErrorsSent      uint64 `json:"errors_sent"`
}

// NewSystemMonitor wires up a monitor from the given configuration and
// registers the built-in health checks and alert channels.
func NewSystemMonitor(config *MonitoringConfig, logger *logrus.Logger) *SystemMonitor {
	monitor := &SystemMonitor{
		logger:       logger,
		config:       config,
		metrics:      NewMetricsCollector(),
		healthChecks: make(map[string]HealthCheck),
		alerts:       NewAlertManager(),
		storage:      NewMetricsStorage(config.MetricsPath, time.Duration(config.RetentionDays)*24*time.Hour),
		running:      false,
	}

	// Register the built-in health checks and alert channels.
	monitor.initializeHealthChecks()
	monitor.initializeAlertChannels()

	return monitor
}
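
// Typical usage, as an illustrative sketch (paths and thresholds are
// placeholders, not values this package mandates):
//
//	cfg := &MonitoringConfig{
//		Enabled:       true,
//		Interval:      30 * time.Second,
//		MetricsPath:   "/var/lib/debian-forge/metrics",
//		RetentionDays: 7,
//		Thresholds:    map[string]float64{"cpu_usage": 80},
//	}
//	monitor := NewSystemMonitor(cfg, logrus.New())
//	if err := monitor.Start(); err != nil {
//		// handle startup failure
//	}
//	defer monitor.Stop()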

func NewMetricsCollector() *MetricsCollector {
	return &MetricsCollector{
		metrics: make(map[string]Metric),
	}
}

func NewAlertManager() *AlertManager {
	return &AlertManager{
		alerts:   make(map[string]Alert),
		channels: make(map[string]AlertChannel),
	}
}

func NewMetricsStorage(path string, retention time.Duration) *MetricsStorage {
	return &MetricsStorage{
		path:      path,
		retention: retention,
	}
}

// initializeHealthChecks registers the built-in resource, service, and
// network checks.
func (sm *SystemMonitor) initializeHealthChecks() {
	// System resource health checks
	sm.healthChecks["cpu_usage"] = HealthCheck{
		ID:          "cpu_usage",
		Name:        "CPU Usage",
		Description: "Monitor CPU usage percentage",
		Type:        "resource",
		Command:     "top",
		Args:        []string{"-bn1", "-p", "1"},
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Threshold:   80.0,
		Enabled:     true,
	}

	sm.healthChecks["memory_usage"] = HealthCheck{
		ID:          "memory_usage",
		Name:        "Memory Usage",
		Description: "Monitor memory usage percentage",
		Type:        "resource",
		Command:     "free",
		Args:        []string{"-m"},
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Threshold:   85.0,
		Enabled:     true,
	}

	sm.healthChecks["disk_usage"] = HealthCheck{
		ID:          "disk_usage",
		Name:        "Disk Usage",
		Description: "Monitor disk usage percentage",
		Type:        "resource",
		Command:     "df",
		Args:        []string{"-h", "/"},
		Interval:    60 * time.Second,
		Timeout:     10 * time.Second,
		Threshold:   90.0,
		Enabled:     true,
	}

	sm.healthChecks["load_average"] = HealthCheck{
		ID:          "load_average",
		Name:        "Load Average",
		Description: "Monitor system load average",
		Type:        "resource",
		Command:     "uptime",
		Args:        []string{},
		Interval:    30 * time.Second,
		Timeout:     10 * time.Second,
		Threshold:   5.0,
		Enabled:     true,
	}

	// Service health checks
	sm.healthChecks["debian_forge_service"] = HealthCheck{
		ID:          "debian_forge_service",
		Name:        "Debian Forge Service",
		Description: "Check if Debian Forge service is running",
		Type:        "service",
		Command:     "systemctl",
		Args:        []string{"is-active", "debian-forge"},
		Interval:    60 * time.Second,
		Timeout:     10 * time.Second,
		Threshold:   0.0,
		Enabled:     true,
	}

	// Note: parseServiceCheck expects systemctl-style "active"/"inactive"
	// output; pg_isready prints its own summary line, so this check will
	// surface as status "unknown" with the raw output in the message.
	sm.healthChecks["database_connection"] = HealthCheck{
		ID:          "database_connection",
		Name:        "Database Connection",
		Description: "Check database connectivity",
		Type:        "service",
		Command:     "pg_isready",
		Args:        []string{"-h", "localhost"},
		Interval:    60 * time.Second,
		Timeout:     10 * time.Second,
		Threshold:   0.0,
		Enabled:     true,
	}

	// Network health checks
	sm.healthChecks["network_connectivity"] = HealthCheck{
		ID:          "network_connectivity",
		Name:        "Network Connectivity",
		Description: "Check basic network connectivity",
		Type:        "network",
		Command:     "ping",
		Args:        []string{"-c", "1", "8.8.8.8"},
		Interval:    60 * time.Second,
		Timeout:     10 * time.Second,
		Threshold:   0.0,
		Enabled:     true,
	}
}
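
// Further checks can be registered before Start is called; the
// healthChecks map is not synchronized, so it must not be mutated while
// the monitor runs. An illustrative entry reusing the systemctl
// convention that parseServiceCheck understands:
//
//	monitor.healthChecks["postgresql_service"] = HealthCheck{
//		ID:       "postgresql_service",
//		Name:     "PostgreSQL Service",
//		Type:     "service",
//		Command:  "systemctl",
//		Args:     []string{"is-active", "postgresql"},
//		Interval: 60 * time.Second,
//		Timeout:  10 * time.Second,
//		Enabled:  true,
//	}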

func (sm *SystemMonitor) initializeAlertChannels() {
	// Email alert channel
	sm.alerts.channels["email"] = AlertChannel{
		ID:      "email",
		Name:    "Email Alerts",
		Type:    "email",
		Enabled: true,
		Config: map[string]interface{}{
			"smtp_server": "localhost",
			"smtp_port":   25,
			"from":        "alerts@debian-forge.local",
			"to":          []string{"admin@debian-forge.local"},
		},
	}

	// Slack alert channel
	sm.alerts.channels["slack"] = AlertChannel{
		ID:      "slack",
		Name:    "Slack Alerts",
		Type:    "slack",
		Enabled: false,
		Config: map[string]interface{}{
			"webhook_url": "",
			"channel":     "#alerts",
		},
	}

	// Webhook alert channel
	sm.alerts.channels["webhook"] = AlertChannel{
		ID:      "webhook",
		Name:    "Webhook Alerts",
		Type:    "webhook",
		Enabled: false,
		Config: map[string]interface{}{
			"url":     "",
			"method":  "POST",
			"headers": map[string]string{},
		},
	}
}

// Start marks the monitor as running and launches the metric collection
// loop plus one goroutine per enabled health check.
func (sm *SystemMonitor) Start() error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	if sm.running {
		return fmt.Errorf("monitor is already running")
	}

	sm.running = true
	sm.logger.Info("Starting system monitor")

	// Start monitoring goroutine
	go sm.monitoringLoop()

	// Start health check goroutines
	for _, check := range sm.healthChecks {
		if check.Enabled {
			go sm.healthCheckLoop(check)
		}
	}

	return nil
}

// Stop clears the running flag. The collection and health check loops
// observe the flag on their next tick, so shutdown is not immediate.
func (sm *SystemMonitor) Stop() error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	if !sm.running {
		return fmt.Errorf("monitor is not running")
	}

	sm.running = false
	sm.logger.Info("Stopping system monitor")

	return nil
}

func (sm *SystemMonitor) monitoringLoop() {
	ticker := time.NewTicker(sm.config.Interval)
	defer ticker.Stop()

	for range ticker.C {
		if !sm.isRunning() {
			return
		}

		// Collect system metrics
		if err := sm.collectSystemMetrics(); err != nil {
			sm.logger.Errorf("Failed to collect system metrics: %v", err)
		}

		// Check thresholds and generate alerts
		sm.checkThresholds()

		// Store metrics
		if err := sm.storeMetrics(); err != nil {
			sm.logger.Errorf("Failed to store metrics: %v", err)
		}
	}
}

func (sm *SystemMonitor) healthCheckLoop(check HealthCheck) {
	ticker := time.NewTicker(check.Interval)
	defer ticker.Stop()

	for range ticker.C {
		if !sm.isRunning() {
			return
		}

		// Run the check and record/alert on the result.
		result := sm.runHealthCheck(check)
		sm.processHealthCheckResult(result)
	}
}

func (sm *SystemMonitor) isRunning() bool {
	sm.mu.RLock()
	defer sm.mu.RUnlock()
	return sm.running
}

func (sm *SystemMonitor) collectSystemMetrics() error {
	// Collection failures are logged as warnings rather than returned,
	// so one broken source does not block the others.
	if err := sm.collectCPUMetrics(); err != nil {
		sm.logger.Warnf("Failed to collect CPU metrics: %v", err)
	}

	if err := sm.collectMemoryMetrics(); err != nil {
		sm.logger.Warnf("Failed to collect memory metrics: %v", err)
	}

	if err := sm.collectDiskMetrics(); err != nil {
		sm.logger.Warnf("Failed to collect disk metrics: %v", err)
	}

	if err := sm.collectNetworkMetrics(); err != nil {
		sm.logger.Warnf("Failed to collect network metrics: %v", err)
	}

	if err := sm.collectProcessMetrics(); err != nil {
		sm.logger.Warnf("Failed to collect process metrics: %v", err)
	}

	return nil
}

func (sm *SystemMonitor) collectCPUMetrics() error {
	// Read /proc/loadavg for load average
	data, err := os.ReadFile("/proc/loadavg")
	if err != nil {
		return fmt.Errorf("failed to read loadavg: %w", err)
	}

	fields := strings.Fields(string(data))
	if len(fields) >= 3 {
		if load1, err := strconv.ParseFloat(fields[0], 64); err == nil {
			sm.metrics.setMetric("load_1min", load1, "", "gauge", map[string]string{"type": "load_average"})
		}
		if load5, err := strconv.ParseFloat(fields[1], 64); err == nil {
			sm.metrics.setMetric("load_5min", load5, "", "gauge", map[string]string{"type": "load_average"})
		}
		if load15, err := strconv.ParseFloat(fields[2], 64); err == nil {
			sm.metrics.setMetric("load_15min", load15, "", "gauge", map[string]string{"type": "load_average"})
		}
	}

	// Read /proc/stat for CPU usage. Note that these counters are
	// cumulative since boot, so a single sample yields a boot-time
	// average rather than current utilization (see the sketch below).
	data, err = os.ReadFile("/proc/stat")
	if err != nil {
		return fmt.Errorf("failed to read stat: %w", err)
	}

	lines := strings.Split(string(data), "\n")
	for _, line := range lines {
		if strings.HasPrefix(line, "cpu ") {
			fields := strings.Fields(line)
			if len(fields) >= 5 {
				// Sum all time columns; column 4 is idle time.
				total := 0.0
				idle := 0.0

				for i := 1; i < len(fields); i++ {
					if val, err := strconv.ParseFloat(fields[i], 64); err == nil {
						total += val
						if i == 4 { // idle time
							idle = val
						}
					}
				}

				if total > 0 {
					usage := ((total - idle) / total) * 100.0
					sm.metrics.setMetric("cpu_usage", usage, "%", "gauge", map[string]string{"type": "cpu"})
				}
			}
			break
		}
	}

	return nil
}
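
// A truly current CPU figure needs two /proc/stat samples and a delta.
// A minimal sketch of that approach (the helper and one-second gap are
// illustrative, not part of this package):
//
//	read := func() (total, idle float64) {
//		data, _ := os.ReadFile("/proc/stat")
//		f := strings.Fields(strings.SplitN(string(data), "\n", 2)[0])
//		for i := 1; i < len(f); i++ {
//			v, _ := strconv.ParseFloat(f[i], 64)
//			total += v
//			if i == 4 {
//				idle = v
//			}
//		}
//		return
//	}
//	t1, i1 := read()
//	time.Sleep(time.Second)
//	t2, i2 := read()
//	if d := t2 - t1; d > 0 {
//		usage := (d - (i2 - i1)) / d * 100.0
//		_ = usage // report as "cpu_usage" instead of the since-boot figure
//	}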

func (sm *SystemMonitor) collectMemoryMetrics() error {
	// Read /proc/meminfo for memory usage
	data, err := os.ReadFile("/proc/meminfo")
	if err != nil {
		return fmt.Errorf("failed to read meminfo: %w", err)
	}

	var total, available uint64

	lines := strings.Split(string(data), "\n")
	for _, line := range lines {
		fields := strings.Fields(line)
		if len(fields) >= 2 {
			switch fields[0] {
			case "MemTotal:":
				if val, err := strconv.ParseUint(fields[1], 10, 64); err == nil {
					total = val
				}
			case "MemAvailable:":
				if val, err := strconv.ParseUint(fields[1], 10, 64); err == nil {
					available = val
				}
			}
		}
	}

	if total > 0 {
		usage := float64(total-available) / float64(total) * 100.0
		sm.metrics.setMetric("memory_usage", usage, "%", "gauge", map[string]string{"type": "memory"})
		sm.metrics.setMetric("memory_total", float64(total), "KB", "gauge", map[string]string{"type": "memory"})
		sm.metrics.setMetric("memory_available", float64(available), "KB", "gauge", map[string]string{"type": "memory"})
	}

	return nil
}

func (sm *SystemMonitor) collectDiskMetrics() error {
	// Use df to get disk usage. The parser assumes the default layout:
	// a header line, then one line for "/" with "Use%" in the fifth
	// column; df -P would pin that layout down if a long device name
	// ever wraps it.
	cmd := exec.Command("df", "-h", "/")
	output, err := cmd.Output()
	if err != nil {
		return fmt.Errorf("df command failed: %w", err)
	}

	lines := strings.Split(string(output), "\n")
	if len(lines) >= 2 {
		fields := strings.Fields(lines[1])
		if len(fields) >= 5 {
			usageStr := strings.TrimSuffix(fields[4], "%")
			if usage, err := strconv.ParseFloat(usageStr, 64); err == nil {
				sm.metrics.setMetric("disk_usage", usage, "%", "gauge", map[string]string{"type": "disk", "mount": "/"})
			}
		}
	}

	return nil
}

func (sm *SystemMonitor) collectNetworkMetrics() error {
	// Read /proc/net/dev for network statistics
	data, err := os.ReadFile("/proc/net/dev")
	if err != nil {
		return fmt.Errorf("failed to read net/dev: %w", err)
	}

	lines := strings.Split(string(data), "\n")
	for _, line := range lines {
		// Each interface line is "eth0: <16 counters>"; skip loopback.
		// The kernel does not guarantee a space after the colon, so a
		// fused "eth0:12345" token would shift the field indices.
		if strings.Contains(line, ":") && !strings.Contains(line, "lo:") {
			fields := strings.Fields(line)
			if len(fields) >= 17 {
				interfaceName := strings.TrimSuffix(fields[0], ":")

				// Field 1 is received bytes, field 9 is transmitted bytes.
				if bytesReceived, err := strconv.ParseUint(fields[1], 10, 64); err == nil {
					sm.metrics.setMetric("network_bytes_received", float64(bytesReceived), "bytes", "counter",
						map[string]string{"interface": interfaceName})
				}

				if bytesSent, err := strconv.ParseUint(fields[9], 10, 64); err == nil {
					sm.metrics.setMetric("network_bytes_sent", float64(bytesSent), "bytes", "counter",
						map[string]string{"interface": interfaceName})
				}
			}
		}
	}

	return nil
}

func (sm *SystemMonitor) collectProcessMetrics() error {
	// Count running processes
	cmd := exec.Command("ps", "-e", "--no-headers")
	output, err := cmd.Output()
	if err != nil {
		return fmt.Errorf("ps command failed: %w", err)
	}

	// Drop the trailing empty line produced by the final newline.
	lines := strings.Split(string(output), "\n")
	processCount := len(lines)
	if len(lines) > 0 && lines[len(lines)-1] == "" {
		processCount--
	}

	sm.metrics.setMetric("process_count", float64(processCount), "count", "gauge", map[string]string{"type": "process"})

	return nil
}

// runHealthCheck executes a single health check command under a timeout
// and parses its output according to the check type.
func (sm *SystemMonitor) runHealthCheck(check HealthCheck) HealthCheckResult {
	result := HealthCheckResult{
		ID:        check.ID,
		Name:      check.Name,
		Status:    "unknown",
		Message:   "Health check completed",
		Timestamp: time.Now(),
		Threshold: check.Threshold,
		Metadata:  make(map[string]interface{}),
	}

	startTime := time.Now()

	// exec.Cmd has no Timeout field; enforce the timeout via a context.
	ctx, cancel := context.WithTimeout(context.Background(), check.Timeout)
	defer cancel()
	cmd := exec.CommandContext(ctx, check.Command, check.Args...)

	output, err := cmd.Output()
	duration := time.Since(startTime)
	result.Duration = duration

	if err != nil {
		result.Status = "failed"
		result.Message = "Health check command failed"
		result.Error = err.Error()
		return result
	}

	// Parse output based on check type
	switch check.Type {
	case "resource":
		result = sm.parseResourceCheck(check, string(output))
	case "service":
		result = sm.parseServiceCheck(check, string(output))
	case "network":
		result = sm.parseNetworkCheck(check, string(output))
	default:
		result.Status = "unknown"
		result.Message = "Unknown check type"
	}

	// The parse helpers build fresh results, so restore the measured duration.
	result.Duration = duration

	return result
}

func (sm *SystemMonitor) parseResourceCheck(check HealthCheck, output string) HealthCheckResult {
	result := HealthCheckResult{
		ID:        check.ID,
		Name:      check.Name,
		Status:    "unknown",
		Message:   "Resource check completed",
		Timestamp: time.Now(),
		Threshold: check.Threshold,
		Metadata:  make(map[string]interface{}),
	}

	// Parse output based on command
	switch check.Command {
	case "top":
		// Parse top output for CPU usage. This matches the older
		// "0.0%us" summary format; newer procps top prints "0.0 us,"
		// instead, in which case no value is extracted and the status
		// stays "unknown".
		lines := strings.Split(output, "\n")
		for _, line := range lines {
			if strings.Contains(line, "Cpu(s):") {
				fields := strings.Fields(line)
				for i, field := range fields {
					if strings.Contains(field, "%us") {
						if i > 0 {
							if usage, err := strconv.ParseFloat(fields[i-1], 64); err == nil {
								result.Value = usage
								if usage > check.Threshold {
									result.Status = "critical"
									result.Message = fmt.Sprintf("CPU usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold)
								} else {
									result.Status = "healthy"
									result.Message = fmt.Sprintf("CPU usage %.1f%% is within normal range", usage)
								}
							}
						}
						break
					}
				}
			}
		}

	case "free":
		// Parse free output for memory usage: on the "Mem:" line,
		// fields 2 and 3 are total and used.
		lines := strings.Split(output, "\n")
		if len(lines) >= 2 {
			fields := strings.Fields(lines[1])
			if len(fields) >= 3 {
				if total, err := strconv.ParseFloat(fields[1], 64); err == nil {
					if used, err := strconv.ParseFloat(fields[2], 64); err == nil {
						usage := (used / total) * 100.0
						result.Value = usage
						if usage > check.Threshold {
							result.Status = "critical"
							result.Message = fmt.Sprintf("Memory usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold)
						} else {
							result.Status = "healthy"
							result.Message = fmt.Sprintf("Memory usage %.1f%% is within normal range", usage)
						}
					}
				}
			}
		}

	case "df":
		// Parse df output for disk usage (fifth column, "Use%").
		lines := strings.Split(output, "\n")
		if len(lines) >= 2 {
			fields := strings.Fields(lines[1])
			if len(fields) >= 5 {
				usageStr := strings.TrimSuffix(fields[4], "%")
				if usage, err := strconv.ParseFloat(usageStr, 64); err == nil {
					result.Value = usage
					if usage > check.Threshold {
						result.Status = "critical"
						result.Message = fmt.Sprintf("Disk usage %.1f%% exceeds threshold %.1f%%", usage, check.Threshold)
					} else {
						result.Status = "healthy"
						result.Message = fmt.Sprintf("Disk usage %.1f%% is within normal range", usage)
					}
				}
			}
		}

	case "uptime":
		// Parse uptime output for the 1-minute load average. The field
		// index only lines up for the "up N days, H:MM" layout; other
		// uptime formats shift the load-average position.
		fields := strings.Fields(output)
		if len(fields) >= 10 {
			loadStr := strings.TrimSuffix(fields[9], ",")
			if load, err := strconv.ParseFloat(loadStr, 64); err == nil {
				result.Value = load
				if load > check.Threshold {
					result.Status = "critical"
					result.Message = fmt.Sprintf("Load average %.2f exceeds threshold %.2f", load, check.Threshold)
				} else {
					result.Status = "healthy"
					result.Message = fmt.Sprintf("Load average %.2f is within normal range", load)
				}
			}
		}
	}

	return result
}

func (sm *SystemMonitor) parseServiceCheck(check HealthCheck, output string) HealthCheckResult {
	result := HealthCheckResult{
		ID:        check.ID,
		Name:      check.Name,
		Status:    "unknown",
		Message:   "Service check completed",
		Timestamp: time.Now(),
		Threshold: check.Threshold,
		Metadata:  make(map[string]interface{}),
	}

	output = strings.TrimSpace(output)

	switch output {
	case "active":
		result.Status = "healthy"
		result.Message = "Service is running"
		result.Value = 1.0
	case "inactive":
		result.Status = "critical"
		result.Message = "Service is not running"
		result.Value = 0.0
	default:
		result.Status = "unknown"
		result.Message = fmt.Sprintf("Service status: %s", output)
	}

	return result
}

func (sm *SystemMonitor) parseNetworkCheck(check HealthCheck, output string) HealthCheckResult {
	result := HealthCheckResult{
		ID:        check.ID,
		Name:      check.Name,
		Status:    "unknown",
		Message:   "Network check completed",
		Timestamp: time.Now(),
		Threshold: check.Threshold,
		Metadata:  make(map[string]interface{}),
	}

	// Check whether ping succeeded by looking for iputils' "1 received"
	// summary line; other ping implementations may phrase the summary
	// differently and would be reported as failed here.
	if strings.Contains(output, "1 received") {
		result.Status = "healthy"
		result.Message = "Network connectivity is working"
		result.Value = 1.0
	} else {
		result.Status = "critical"
		result.Message = "Network connectivity failed"
		result.Value = 0.0
	}

	return result
}

func (sm *SystemMonitor) processHealthCheckResult(result HealthCheckResult) {
	// Store health check result
	sm.metrics.setMetric(fmt.Sprintf("health_check_%s", result.ID),
		float64(sm.healthStatusToValue(result.Status)), "status", "gauge",
		map[string]string{"check": result.ID, "status": result.Status})

	// Generate alert if status is critical
	if result.Status == "critical" {
		alert := Alert{
			ID:           generateAlertID(),
			Level:        "critical",
			Message:      result.Message,
			Source:       fmt.Sprintf("health_check:%s", result.ID),
			Timestamp:    time.Now(),
			Acknowledged: false,
			Metadata: map[string]interface{}{
				"check_id":  result.ID,
				"value":     result.Value,
				"threshold": result.Threshold,
			},
		}

		sm.alerts.addAlert(alert)
	}
}

func (sm *SystemMonitor) healthStatusToValue(status string) int {
	switch status {
	case "healthy":
		return 1
	case "warning":
		return 2
	case "critical":
		return 3
	default:
		return 0
	}
}

func (sm *SystemMonitor) checkThresholds() {
	sm.metrics.mu.RLock()
	defer sm.metrics.mu.RUnlock()

	// Note: this raises a fresh alert on every collection interval for
	// as long as a metric stays above its threshold; deduplication, if
	// wanted, belongs in the alert channels.
	for name, metric := range sm.metrics.metrics {
		if threshold, exists := sm.config.Thresholds[name]; exists {
			if metric.Value > threshold {
				alert := Alert{
					ID:           generateAlertID(),
					Level:        "warning",
					Message:      fmt.Sprintf("Metric %s (%.2f) exceeds threshold %.2f", name, metric.Value, threshold),
					Source:       fmt.Sprintf("metric:%s", name),
					Timestamp:    time.Now(),
					Acknowledged: false,
					Metadata: map[string]interface{}{
						"metric_name": name,
						"value":       metric.Value,
						"threshold":   threshold,
					},
				}

				sm.alerts.addAlert(alert)
			}
		}
	}
}
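
// Threshold keys match the metric names produced by the collectors.
// An illustrative configuration (values are placeholders):
//
//	cfg.Thresholds = map[string]float64{
//		"cpu_usage":     80,
//		"memory_usage":  85,
//		"disk_usage":    90,
//		"load_1min":     5,
//		"process_count": 1000,
//	}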

func (sm *SystemMonitor) storeMetrics() error {
	// Snapshot the metrics map under the read lock before marshaling.
	sm.metrics.mu.RLock()
	metrics := make(map[string]Metric)
	for k, v := range sm.metrics.metrics {
		metrics[k] = v
	}
	sm.metrics.mu.RUnlock()

	data, err := json.MarshalIndent(metrics, "", "  ")
	if err != nil {
		return fmt.Errorf("failed to marshal metrics: %w", err)
	}

	// Create metrics directory if it doesn't exist
	if err := os.MkdirAll(sm.config.MetricsPath, 0755); err != nil {
		return fmt.Errorf("failed to create metrics directory: %w", err)
	}

	// Write metrics to a timestamped file
	timestamp := time.Now().Format("2006-01-02_15-04-05")
	filename := filepath.Join(sm.config.MetricsPath, fmt.Sprintf("metrics_%s.json", timestamp))

	if err := os.WriteFile(filename, data, 0644); err != nil {
		return fmt.Errorf("failed to write metrics file: %w", err)
	}

	// Clean up old metrics files
	return sm.cleanupOldMetrics()
}
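
// Each snapshot lands in its own timestamped file, e.g.
// metrics_2024-01-02_15-04-05.json, containing the metric map keyed by
// name. A sketch of the shape (values illustrative):
//
//	{
//	  "cpu_usage": {
//	    "name": "cpu_usage",
//	    "value": 12.5,
//	    "unit": "%",
//	    "type": "gauge",
//	    "timestamp": "2024-01-02T15:04:05Z",
//	    "labels": {"type": "cpu"},
//	    "metadata": {}
//	  }
//	}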

func (sm *SystemMonitor) cleanupOldMetrics() error {
	files, err := os.ReadDir(sm.config.MetricsPath)
	if err != nil {
		return fmt.Errorf("failed to read metrics directory: %w", err)
	}

	cutoff := time.Now().Add(-sm.storage.retention)

	for _, file := range files {
		if file.IsDir() || !strings.HasPrefix(file.Name(), "metrics_") {
			continue
		}

		// Extract the timestamp from "metrics_2006-01-02_15-04-05.json".
		parts := strings.Split(file.Name(), "_")
		if len(parts) >= 3 {
			timestampStr := strings.Join(parts[1:3], "_")
			timestampStr = strings.TrimSuffix(timestampStr, ".json")

			if fileTime, err := time.Parse("2006-01-02_15-04-05", timestampStr); err == nil {
				if fileTime.Before(cutoff) {
					filePath := filepath.Join(sm.config.MetricsPath, file.Name())
					if err := os.Remove(filePath); err != nil {
						sm.logger.Warnf("Failed to remove old metrics file %s: %v", filePath, err)
					}
				}
			}
		}
	}

	return nil
}

// GetSystemStatus assembles a point-in-time snapshot of the most recent
// metrics, health check results, and unacknowledged alerts.
func (sm *SystemMonitor) GetSystemStatus() *SystemStatus {
	status := &SystemStatus{
		Timestamp:    time.Now(),
		HealthStatus: make(map[string]string),
		Alerts:       sm.alerts.getActiveAlerts(),
		Metadata:     make(map[string]interface{}),
	}

	// Get uptime
	if uptime, err := sm.getUptime(); err == nil {
		status.Uptime = uptime
	}

	// Get load average
	if loadAvg, err := sm.getLoadAverage(); err == nil {
		status.LoadAverage = loadAvg
	}

	// Read current metrics and health statuses under a single read lock;
	// touching the metrics map after unlocking would race with the collectors.
	sm.metrics.mu.RLock()
	if cpuMetric, exists := sm.metrics.metrics["cpu_usage"]; exists {
		status.CPUUsage = cpuMetric.Value
	}
	if memMetric, exists := sm.metrics.metrics["memory_usage"]; exists {
		status.MemoryUsage = memMetric.Value
	}
	if diskMetric, exists := sm.metrics.metrics["disk_usage"]; exists {
		status.DiskUsage = diskMetric.Value
	}
	if procMetric, exists := sm.metrics.metrics["process_count"]; exists {
		status.ProcessCount = int(procMetric.Value)
	}
	for id, check := range sm.healthChecks {
		if !check.Enabled {
			continue
		}
		if metric, exists := sm.metrics.metrics[fmt.Sprintf("health_check_%s", id)]; exists {
			status.HealthStatus[id] = sm.valueToHealthStatus(int(metric.Value))
		} else {
			status.HealthStatus[id] = "unknown"
		}
	}
	sm.metrics.mu.RUnlock()

	return status
}
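
// The snapshot serializes cleanly, so exposing it over HTTP is a small
// wrapper — an illustrative sketch (the route is arbitrary, and net/http
// is assumed to be imported at the call site):
//
//	http.HandleFunc("/status", func(w http.ResponseWriter, r *http.Request) {
//		w.Header().Set("Content-Type", "application/json")
//		json.NewEncoder(w).Encode(monitor.GetSystemStatus())
//	})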

func (sm *SystemMonitor) getUptime() (time.Duration, error) {
	data, err := os.ReadFile("/proc/uptime")
	if err != nil {
		return 0, fmt.Errorf("failed to read uptime: %w", err)
	}

	fields := strings.Fields(string(data))
	if len(fields) >= 1 {
		if seconds, err := strconv.ParseFloat(fields[0], 64); err == nil {
			return time.Duration(seconds) * time.Second, nil
		}
	}

	return 0, fmt.Errorf("failed to parse uptime")
}

func (sm *SystemMonitor) getLoadAverage() ([]float64, error) {
	data, err := os.ReadFile("/proc/loadavg")
	if err != nil {
		return nil, fmt.Errorf("failed to read loadavg: %w", err)
	}

	fields := strings.Fields(string(data))
	if len(fields) >= 3 {
		loads := make([]float64, 3)
		for i := 0; i < 3; i++ {
			if load, err := strconv.ParseFloat(fields[i], 64); err == nil {
				loads[i] = load
			}
		}
		return loads, nil
	}

	return nil, fmt.Errorf("failed to parse load average")
}

func (sm *SystemMonitor) valueToHealthStatus(value int) string {
	switch value {
	case 1:
		return "healthy"
	case 2:
		return "warning"
	case 3:
		return "critical"
	default:
		return "unknown"
	}
}

// MetricsCollector methods
func (mc *MetricsCollector) setMetric(name string, value float64, unit string, metricType string, labels map[string]string) {
	mc.mu.Lock()
	defer mc.mu.Unlock()

	mc.metrics[name] = Metric{
		Name:      name,
		Value:     value,
		Unit:      unit,
		Type:      metricType,
		Timestamp: time.Now(),
		Labels:    labels,
		Metadata:  make(map[string]interface{}),
	}
}

// AlertManager methods
func (am *AlertManager) addAlert(alert Alert) {
	am.mu.Lock()
	defer am.mu.Unlock()

	am.alerts[alert.ID] = alert

	// Fan the alert out to every enabled channel.
	for _, channel := range am.channels {
		if channel.Enabled {
			go am.sendAlert(alert, channel)
		}
	}
}

func (am *AlertManager) sendAlert(alert Alert, channel AlertChannel) {
	// This is a placeholder for alert delivery. In production, dispatch
	// on channel.Type (email, slack, webhook); see the sketch below.
}
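
// One plausible shape for a concrete delivery helper — an illustrative
// sketch only (sendWebhook is hypothetical, and it would need net/http
// and bytes, which this file does not currently import):
//
//	func (am *AlertManager) sendWebhook(alert Alert, channel AlertChannel) error {
//		url, _ := channel.Config["url"].(string)
//		body, err := json.Marshal(alert)
//		if err != nil {
//			return err
//		}
//		resp, err := http.Post(url, "application/json", bytes.NewReader(body))
//		if err != nil {
//			return err
//		}
//		defer resp.Body.Close()
//		return nil
//	}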

func (am *AlertManager) getActiveAlerts() []Alert {
	am.mu.RLock()
	defer am.mu.RUnlock()

	var activeAlerts []Alert
	for _, alert := range am.alerts {
		if !alert.Acknowledged {
			activeAlerts = append(activeAlerts, alert)
		}
	}

	return activeAlerts
}

// Helper functions
func generateAlertID() string {
	return fmt.Sprintf("alert-%d", time.Now().UnixNano())
}