// Package monitoring runs registered health checks on a fixed interval and
// exposes their most recent results and an aggregated status.
package monitoring

import (
	"fmt"
	"sync"
	"time"

	"github.com/sirupsen/logrus"
)

// defaultHealthCheckInterval replaces a non-positive interval passed to
// NewHealthManager, because time.NewTicker panics on such values.
const defaultHealthCheckInterval = 30 * time.Second

// HealthStatus represents the overall health status.
type HealthStatus string

// Possible values of HealthStatus.
const (
	HealthStatusHealthy   HealthStatus = "healthy"
	HealthStatusDegraded  HealthStatus = "degraded"
	HealthStatusUnhealthy HealthStatus = "unhealthy"
	HealthStatusUnknown   HealthStatus = "unknown"
)

// HealthCheck represents the outcome of a single health check run.
type HealthCheck struct {
	Name      string                 `json:"name"`
	Status    HealthStatus           `json:"status"`
	Message   string                 `json:"message"`
	LastCheck time.Time              `json:"last_check"`
	Duration  time.Duration          `json:"duration"`
	Details   map[string]interface{} `json:"details,omitempty"`
	Critical  bool                   `json:"critical"`
	// NOTE(review): errors built with errors.New / fmt.Errorf have no exported
	// fields, so this field likely serializes as {}; Message carries the
	// readable text. Confirm whether a string field is wanted instead.
	LastError error `json:"last_error,omitempty"`
	// CheckCount is the number of times the manager has run this check.
	CheckCount int64 `json:"check_count"`
	// ErrorCount is the number of runs in which Check returned an error.
	ErrorCount int64 `json:"error_count"`
}

// HealthChecker defines the interface for health checks.
type HealthChecker interface {
	Name() string
	Check() (*HealthCheck, error)
	IsCritical() bool
}

// HealthManager manages all health checks. All fields other than logger and
// interval are guarded by mutex.
type HealthManager struct {
	checks   map[string]HealthChecker
	results  map[string]*HealthCheck
	mutex    sync.RWMutex
	logger   *logrus.Logger
	interval time.Duration
	stopChan chan struct{}
	running  bool
}

// NewHealthManager creates a new health manager that runs its checks every
// interval once started. A non-positive interval is replaced with
// defaultHealthCheckInterval.
func NewHealthManager(interval time.Duration) *HealthManager {
	if interval <= 0 {
		interval = defaultHealthCheckInterval
	}
	return &HealthManager{
		checks:   make(map[string]HealthChecker),
		results:  make(map[string]*HealthCheck),
		logger:   logrus.New(),
		interval: interval,
		stopChan: make(chan struct{}),
	}
}

// AddCheck registers a health check under check.Name(), replacing any check
// previously registered under the same name.
func (hm *HealthManager) AddCheck(check HealthChecker) {
	hm.mutex.Lock()
	defer hm.mutex.Unlock()
	hm.checks[check.Name()] = check
}

// RemoveCheck unregisters the named health check and discards its stored
// result so it no longer influences the overall status. A run of the check
// that is already in flight may still record one final result.
func (hm *HealthManager) RemoveCheck(name string) {
	hm.mutex.Lock()
	defer hm.mutex.Unlock()
	delete(hm.checks, name)
	delete(hm.results, name)
}

// Start starts the background health check loop. It is a no-op if the manager
// is already running, and it may be called again after Stop.
func (hm *HealthManager) Start() {
	hm.mutex.Lock()
	if hm.running {
		hm.mutex.Unlock()
		return
	}
	hm.running = true
	// A fresh channel per run: the previous one was closed by Stop and would
	// make the new loop exit immediately (and panic on the next close).
	stop := make(chan struct{})
	hm.stopChan = stop
	hm.mutex.Unlock()

	hm.logger.Info("Starting health check manager")
	go hm.runHealthChecks(stop)
}

// Stop signals the background loop to exit. It is a no-op if the manager is
// not running. It does not wait for an in-progress round of checks to finish.
func (hm *HealthManager) Stop() {
	hm.mutex.Lock()
	if !hm.running {
		hm.mutex.Unlock()
		return
	}
	hm.running = false
	close(hm.stopChan)
	hm.mutex.Unlock()

	hm.logger.Info("Stopped health check manager")
}

// runHealthChecks runs all checks once immediately and then on every tick
// until stop is closed. The channel is passed in rather than read from the
// struct so that each loop is bound to the Start call that created it.
func (hm *HealthManager) runHealthChecks(stop <-chan struct{}) {
	ticker := time.NewTicker(hm.interval)
	defer ticker.Stop()

	// Run initial health checks without waiting for the first tick.
	hm.runAllChecks()

	for {
		select {
		case <-ticker.C:
			hm.runAllChecks()
		case <-stop:
			return
		}
	}
}

// runAllChecks runs every registered health check concurrently and waits for
// all of them to finish.
func (hm *HealthManager) runAllChecks() {
	// Snapshot the checks so the lock is not held while they execute.
	hm.mutex.RLock()
	checks := make([]HealthChecker, 0, len(hm.checks))
	for _, check := range hm.checks {
		checks = append(checks, check)
	}
	hm.mutex.RUnlock()

	var wg sync.WaitGroup
	for _, check := range checks {
		wg.Add(1)
		go func(c HealthChecker) {
			defer wg.Done()
			hm.runCheck(c)
		}(check)
	}
	wg.Wait()
}

// runCheck runs a single health check, stores its result under check.Name(),
// and logs unhealthy or degraded outcomes.
func (hm *HealthManager) runCheck(check HealthChecker) {
	name := check.Name()
	start := time.Now()

	result, err := check.Check()
	switch {
	case err != nil:
		result = &HealthCheck{
			Name:      name,
			Status:    HealthStatusUnhealthy,
			Message:   fmt.Sprintf("Health check failed: %v", err),
			Critical:  check.IsCritical(),
			LastError: err,
		}
	case result == nil:
		// A checker returning (nil, nil) gives no information; record that
		// explicitly instead of dereferencing a nil pointer below.
		result = &HealthCheck{
			Name:     name,
			Status:   HealthStatusUnknown,
			Message:  "Health check returned no result",
			Critical: check.IsCritical(),
		}
	}

	// Work on a copy so the checker-owned struct is never mutated and stored
	// entries are never modified after readers may have seen them.
	entry := *result
	entry.LastCheck = time.Now()
	entry.Duration = time.Since(start)

	hm.mutex.Lock()
	// Carry the counters over from the previous run so they accumulate.
	if prev, ok := hm.results[name]; ok && prev != nil {
		entry.CheckCount = prev.CheckCount
		entry.ErrorCount = prev.ErrorCount
	}
	entry.CheckCount++
	if err != nil {
		entry.ErrorCount++
	}
	hm.results[name] = &entry
	hm.mutex.Unlock()

	switch entry.Status {
	case HealthStatusUnhealthy:
		hm.logger.Errorf("Health check %s failed: %s", name, entry.Message)
	case HealthStatusDegraded:
		hm.logger.Warnf("Health check %s degraded: %s", name, entry.Message)
	}
}

// GetOverallStatus returns the overall health status: unknown when no results
// exist, unhealthy if any check is unhealthy, degraded if any check is
// degraded, and healthy otherwise.
func (hm *HealthManager) GetOverallStatus() HealthStatus {
	hm.mutex.RLock()
	defer hm.mutex.RUnlock()
	return hm.overallStatusLocked()
}

// overallStatusLocked computes the overall status. The caller must hold
// hm.mutex (read or write).
//
// NOTE(review): Critical does not currently influence the outcome; any
// unhealthy check makes the overall status unhealthy. Confirm whether
// non-critical failures were meant to map to degraded instead.
func (hm *HealthManager) overallStatusLocked() HealthStatus {
	if len(hm.results) == 0 {
		return HealthStatusUnknown
	}

	anyDegraded := false
	for _, result := range hm.results {
		switch result.Status {
		case HealthStatusUnhealthy:
			return HealthStatusUnhealthy
		case HealthStatusDegraded:
			anyDegraded = true
		}
	}
	if anyDegraded {
		return HealthStatusDegraded
	}
	return HealthStatusHealthy
}

// GetHealthReport returns a comprehensive health report with the keys
// "status", "timestamp", "total_checks", "checks" and "summary". The "checks"
// value is a snapshot (map[string]*HealthCheck) that is safe to read or
// serialize while checks continue to run.
func (hm *HealthManager) GetHealthReport() map[string]interface{} {
	hm.mutex.RLock()
	defer hm.mutex.RUnlock()

	// Copy the results: handing out the live map would let callers iterate it
	// while runCheck writes to it.
	checks := make(map[string]*HealthCheck, len(hm.results))
	for name, result := range hm.results {
		snapshot := *result
		checks[name] = &snapshot
	}

	// Use the lock-free helper; re-acquiring the read lock here could
	// deadlock if a writer is waiting.
	return map[string]interface{}{
		"status":       hm.overallStatusLocked(),
		"timestamp":    time.Now(),
		"total_checks": len(hm.results),
		"checks":       checks,
		"summary":      hm.getSummary(),
	}
}

// getSummary returns the number of stored results per status under the keys
// "healthy", "degraded", "unhealthy" and "unknown" (int values). Any status
// other than the first three is counted as unknown. The caller must hold
// hm.mutex.
func (hm *HealthManager) getSummary() map[string]interface{} {
	var healthy, degraded, unhealthy, unknown int
	for _, result := range hm.results {
		switch result.Status {
		case HealthStatusHealthy:
			healthy++
		case HealthStatusDegraded:
			degraded++
		case HealthStatusUnhealthy:
			unhealthy++
		default:
			unknown++
		}
	}
	return map[string]interface{}{
		"healthy":   healthy,
		"degraded":  degraded,
		"unhealthy": unhealthy,
		"unknown":   unknown,
	}
}

// GetCheckResult returns a copy of the most recent result of the named health
// check and whether such a result exists.
func (hm *HealthManager) GetCheckResult(name string) (*HealthCheck, bool) {
	hm.mutex.RLock()
	defer hm.mutex.RUnlock()

	result, exists := hm.results[name]
	if !exists || result == nil {
		return nil, exists
	}
	snapshot := *result
	return &snapshot, true
}