deb-bootc-compose/internal/performance/scaling.go

package performance

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/sirupsen/logrus"
)

// ScalingManager manages horizontal scaling and load balancing
type ScalingManager struct {
	nodes        map[string]*Node
	loadBalancer *LoadBalancer
	autoscaler   *AutoScaler
	mu           sync.RWMutex
	logger       *logrus.Logger
	enabled      bool
}

// Node represents a compute node in the cluster
type Node struct {
	ID           string                 `json:"id"`
	Hostname     string                 `json:"hostname"`
	Address      string                 `json:"address"`
	Port         int                    `json:"port"`
	Status       NodeStatus             `json:"status"`
	Capabilities map[string]interface{} `json:"capabilities"`
	Metrics      *NodeMetrics           `json:"metrics"`
	LastSeen     time.Time              `json:"last_seen"`
	Tags         map[string]string      `json:"tags"`
}

// NodeStatus represents the status of a node
type NodeStatus string

const (
	NodeStatusOnline  NodeStatus = "online"
	NodeStatusOffline NodeStatus = "offline"
	NodeStatusBusy    NodeStatus = "busy"
	NodeStatusError   NodeStatus = "error"
)

// NodeMetrics represents performance metrics for a node
type NodeMetrics struct {
	CPUUsage    float64   `json:"cpu_usage"`
	MemoryUsage float64   `json:"memory_usage"`
	DiskUsage   float64   `json:"disk_usage"`
	LoadAverage float64   `json:"load_average"`
	ActiveJobs  int       `json:"active_jobs"`
	MaxJobs     int       `json:"max_jobs"`
	LastUpdate  time.Time `json:"last_update"`
}

// LoadBalancer manages load distribution across nodes
type LoadBalancer struct {
	strategy LoadBalancingStrategy
	nodes    map[string]*Node
	mu       sync.RWMutex
	logger   *logrus.Logger
}

// LoadBalancingStrategy defines the interface for load balancing strategies
type LoadBalancingStrategy interface {
	SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error)
	GetName() string
}
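
// A strategy only has to satisfy the two-method interface above. As an
// illustrative sketch (not part of the original code), a hypothetical
// least-loaded strategy could look like this; the nodes passed in are already
// filtered to online nodes with metrics by LoadBalancer.SelectNode:
//
//	type leastLoadedStrategy struct{}
//
//	func (s *leastLoadedStrategy) GetName() string { return "least-loaded" }
//
//	func (s *leastLoadedStrategy) SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error) {
//		var best *Node
//		for _, n := range nodes {
//			if best == nil || n.Metrics.ActiveJobs < best.Metrics.ActiveJobs {
//				best = n
//			}
//		}
//		if best == nil {
//			return nil, fmt.Errorf("no candidate nodes")
//		}
//		return best, nil
//	}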

// LoadRequest represents a load balancing request
type LoadRequest struct {
	Type         string                 `json:"type"`
	Priority     int                    `json:"priority"`
	Requirements map[string]interface{} `json:"requirements"`
	Metadata     map[string]interface{} `json:"metadata"`
}

// AutoScaler manages automatic scaling of the cluster
type AutoScaler struct {
	config  *AutoScalerConfig
	nodes   map[string]*Node
	mu      sync.RWMutex
	logger  *logrus.Logger
	enabled bool
	ctx     context.Context
	cancel  context.CancelFunc
}

// AutoScalerConfig represents auto-scaling configuration
type AutoScalerConfig struct {
	Enabled            bool          `yaml:"enabled"`
	MinNodes           int           `yaml:"min_nodes"`
	MaxNodes           int           `yaml:"max_nodes"`
	ScaleUpThreshold   float64       `yaml:"scale_up_threshold"`
	ScaleDownThreshold float64       `yaml:"scale_down_threshold"`
	ScaleUpCooldown    time.Duration `yaml:"scale_up_cooldown"`
	ScaleDownCooldown  time.Duration `yaml:"scale_down_cooldown"`
	CheckInterval      time.Duration `yaml:"check_interval"`
}
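
// As an illustrative sketch (not taken from the original repository), a YAML
// document matching these tags might look like the values below, which mirror
// the defaults used in NewScalingManager. How duration strings such as "5m"
// decode into time.Duration depends on the YAML library and any custom
// unmarshalling in this project:
//
//	enabled: true
//	min_nodes: 2
//	max_nodes: 10
//	scale_up_threshold: 80.0
//	scale_down_threshold: 20.0
//	scale_up_cooldown: 5m
//	scale_down_cooldown: 10m
//	check_interval: 30s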

// NewScalingManager creates a new scaling manager
func NewScalingManager(enabled bool) *ScalingManager {
	sm := &ScalingManager{
		nodes:   make(map[string]*Node),
		logger:  logrus.New(),
		enabled: enabled,
	}

	// Initialize load balancer
	sm.loadBalancer = NewLoadBalancer(sm.logger)

	// Initialize auto-scaler
	sm.autoscaler = NewAutoScaler(&AutoScalerConfig{
		Enabled:            true,
		MinNodes:           2,
		MaxNodes:           10,
		ScaleUpThreshold:   80.0,
		ScaleDownThreshold: 20.0,
		ScaleUpCooldown:    5 * time.Minute,
		ScaleDownCooldown:  10 * time.Minute,
		CheckInterval:      30 * time.Second,
	}, sm.logger)

	return sm
}
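
// Illustrative usage (not part of the original file; the node values are
// assumptions for demonstration only): a caller constructs the manager,
// registers nodes, starts the background monitoring, and then asks for a node
// per request:
//
//	sm := NewScalingManager(true)
//	_ = sm.RegisterNode(&Node{
//		ID:      "builder-01",
//		Address: "10.0.0.11",
//		Port:    8080,
//		Metrics: &NodeMetrics{MaxJobs: 4},
//	})
//	_ = sm.Start()
//	node, err := sm.SelectNode(&LoadRequest{Type: "compose", Priority: 1})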

// RegisterNode registers a new node in the cluster
func (sm *ScalingManager) RegisterNode(node *Node) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	// Validate node
	if node.ID == "" {
		return fmt.Errorf("node ID is required")
	}
	if node.Address == "" {
		return fmt.Errorf("node address is required")
	}

	// Check for duplicate
	if _, exists := sm.nodes[node.ID]; exists {
		return fmt.Errorf("node %s already exists", node.ID)
	}

	// Set default values
	if node.Status == "" {
		node.Status = NodeStatusOnline
	}
	if node.Capabilities == nil {
		node.Capabilities = make(map[string]interface{})
	}
	if node.Tags == nil {
		node.Tags = make(map[string]string)
	}
	if node.Metrics == nil {
		node.Metrics = &NodeMetrics{
			LastUpdate: time.Now(),
		}
	}
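	// Note: the default metrics above leave MaxJobs at zero, so a node
	// registered without metrics is not considered available by
	// GetAvailableNodes or SelectNode until UpdateNodeMetrics reports a
	// positive MaxJobs.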
	node.LastSeen = time.Now()
	sm.nodes[node.ID] = node

	// Update load balancer
	sm.loadBalancer.AddNode(node)

	sm.logger.Infof("Registered node: %s (%s)", node.ID, node.Hostname)
	return nil
}

// UnregisterNode removes a node from the cluster
func (sm *ScalingManager) UnregisterNode(nodeID string) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	node, exists := sm.nodes[nodeID]
	if !exists {
		return fmt.Errorf("node %s not found", nodeID)
	}

	delete(sm.nodes, nodeID)
	sm.loadBalancer.RemoveNode(nodeID)

	sm.logger.Infof("Unregistered node: %s (%s)", node.ID, node.Hostname)
	return nil
}

// UpdateNodeMetrics updates metrics for a specific node
func (sm *ScalingManager) UpdateNodeMetrics(nodeID string, metrics *NodeMetrics) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	node, exists := sm.nodes[nodeID]
	if !exists {
		return fmt.Errorf("node %s not found", nodeID)
	}

	metrics.LastUpdate = time.Now()
	node.Metrics = metrics
	node.LastSeen = time.Now()

	// Update load balancer
	sm.loadBalancer.UpdateNode(node)
	return nil
}
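
// Illustrative call (the values are assumptions for demonstration): a node
// agent reporting its current load might invoke
//
//	_ = sm.UpdateNodeMetrics("builder-01", &NodeMetrics{
//		CPUUsage:   42.5,
//		ActiveJobs: 1,
//		MaxJobs:    4,
//	})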

// GetNode returns a node by ID
func (sm *ScalingManager) GetNode(nodeID string) (*Node, bool) {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	node, exists := sm.nodes[nodeID]
	return node, exists
}

// GetAllNodes returns all registered nodes
func (sm *ScalingManager) GetAllNodes() map[string]*Node {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	// Return a copy of the map so callers can iterate without holding the lock.
	// The *Node values themselves are still shared, not deep-copied.
	nodes := make(map[string]*Node)
	for k, v := range sm.nodes {
		nodes[k] = v
	}
	return nodes
}

// GetAvailableNodes returns all available nodes
func (sm *ScalingManager) GetAvailableNodes() []*Node {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	var available []*Node
	for _, node := range sm.nodes {
		if node.Status == NodeStatusOnline && node.Metrics.ActiveJobs < node.Metrics.MaxJobs {
			available = append(available, node)
		}
	}
	return available
}

// SelectNode selects a node for a specific request
func (sm *ScalingManager) SelectNode(request *LoadRequest) (*Node, error) {
	return sm.loadBalancer.SelectNode(request)
}

// Start starts the scaling manager
func (sm *ScalingManager) Start() error {
	if !sm.enabled {
		sm.logger.Info("Scaling manager is disabled")
		return nil
	}

	sm.logger.Info("Starting scaling manager")

	// Start auto-scaler
	if err := sm.autoscaler.Start(); err != nil {
		return fmt.Errorf("failed to start auto-scaler: %w", err)
	}

	// Start node health monitoring
	go sm.monitorNodeHealth()

	return nil
}

// Stop stops the scaling manager
func (sm *ScalingManager) Stop() error {
	sm.logger.Info("Stopping scaling manager")

	// Stop auto-scaler
	if err := sm.autoscaler.Stop(); err != nil {
		return fmt.Errorf("failed to stop auto-scaler: %w", err)
	}
	return nil
}

// monitorNodeHealth monitors the health of all nodes. Note that ScalingManager
// exposes no cancellation hook for this goroutine, so it keeps running until
// the process exits; Stop only shuts down the auto-scaler.
func (sm *ScalingManager) monitorNodeHealth() {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for range ticker.C {
		sm.checkNodeHealth()
	}
}

// checkNodeHealth checks the health of all nodes
func (sm *ScalingManager) checkNodeHealth() {
	nodes := sm.GetAllNodes()

	for _, node := range nodes {
		// Check if node is responsive
		if time.Since(node.LastSeen) > 2*time.Minute {
			sm.logger.Warnf("Node %s appears to be unresponsive", node.ID)
			sm.markNodeOffline(node.ID)
		}

		// Check metrics freshness
		if node.Metrics != nil && time.Since(node.Metrics.LastUpdate) > 5*time.Minute {
			sm.logger.Warnf("Node %s metrics are stale", node.ID)
		}
	}
}

// markNodeOffline marks a node as offline
func (sm *ScalingManager) markNodeOffline(nodeID string) {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	if node, exists := sm.nodes[nodeID]; exists {
		node.Status = NodeStatusOffline
		sm.logger.Infof("Marked node %s as offline", nodeID)
	}
}

// GetClusterStatus returns the current status of the cluster
func (sm *ScalingManager) GetClusterStatus() map[string]interface{} {
	nodes := sm.GetAllNodes()

	status := map[string]interface{}{
		"total_nodes":    len(nodes),
		"online_nodes":   0,
		"offline_nodes":  0,
		"busy_nodes":     0,
		"error_nodes":    0,
		"total_capacity": 0,
		"used_capacity":  0,
		"timestamp":      time.Now(),
	}

	for _, node := range nodes {
		switch node.Status {
		case NodeStatusOnline:
			status["online_nodes"] = status["online_nodes"].(int) + 1
		case NodeStatusOffline:
			status["offline_nodes"] = status["offline_nodes"].(int) + 1
		case NodeStatusBusy:
			status["busy_nodes"] = status["busy_nodes"].(int) + 1
		case NodeStatusError:
			status["error_nodes"] = status["error_nodes"].(int) + 1
		}

		if node.Metrics != nil {
			status["total_capacity"] = status["total_capacity"].(int) + node.Metrics.MaxJobs
			status["used_capacity"] = status["used_capacity"].(int) + node.Metrics.ActiveJobs
		}
	}

	// Calculate utilization percentage
	if status["total_capacity"].(int) > 0 {
		utilization := float64(status["used_capacity"].(int)) / float64(status["total_capacity"].(int)) * 100
		status["utilization_percentage"] = utilization
	}

	return status
}

// NewLoadBalancer creates a new load balancer
func NewLoadBalancer(logger *logrus.Logger) *LoadBalancer {
	lb := &LoadBalancer{
		strategy: NewRoundRobinStrategy(),
		nodes:    make(map[string]*Node),
		logger:   logger,
	}
	return lb
}

// AddNode adds a node to the load balancer
func (lb *LoadBalancer) AddNode(node *Node) {
	lb.mu.Lock()
	defer lb.mu.Unlock()

	lb.nodes[node.ID] = node
	lb.logger.Debugf("Added node %s to load balancer", node.ID)
}

// RemoveNode removes a node from the load balancer
func (lb *LoadBalancer) RemoveNode(nodeID string) {
	lb.mu.Lock()
	defer lb.mu.Unlock()

	delete(lb.nodes, nodeID)
	lb.logger.Debugf("Removed node %s from load balancer", nodeID)
}

// UpdateNode updates a node in the load balancer
func (lb *LoadBalancer) UpdateNode(node *Node) {
	lb.mu.Lock()
	defer lb.mu.Unlock()

	lb.nodes[node.ID] = node
}

// SelectNode selects a node using the configured strategy
func (lb *LoadBalancer) SelectNode(request *LoadRequest) (*Node, error) {
	lb.mu.RLock()
	defer lb.mu.RUnlock()

	// Filter available nodes
	availableNodes := make(map[string]*Node)
	for id, node := range lb.nodes {
		if node.Status == NodeStatusOnline && node.Metrics.ActiveJobs < node.Metrics.MaxJobs {
			availableNodes[id] = node
		}
	}

	if len(availableNodes) == 0 {
		return nil, fmt.Errorf("no available nodes")
	}

	return lb.strategy.SelectNode(availableNodes, request)
}

// SetStrategy sets the load balancing strategy
func (lb *LoadBalancer) SetStrategy(strategy LoadBalancingStrategy) {
	lb.mu.Lock()
	defer lb.mu.Unlock()

	lb.strategy = strategy
	lb.logger.Infof("Load balancing strategy changed to: %s", strategy.GetName())
}

// NewAutoScaler creates a new auto-scaler
func NewAutoScaler(config *AutoScalerConfig, logger *logrus.Logger) *AutoScaler {
	ctx, cancel := context.WithCancel(context.Background())

	as := &AutoScaler{
		config:  config,
		nodes:   make(map[string]*Node),
		logger:  logger,
		enabled: config.Enabled,
		ctx:     ctx,
		cancel:  cancel,
	}
	return as
}

// Start starts the auto-scaler
func (as *AutoScaler) Start() error {
	if !as.enabled {
		as.logger.Info("Auto-scaler is disabled")
		return nil
	}

	as.logger.Info("Starting auto-scaler")

	// Start scaling checks
	go as.runScalingChecks()
	return nil
}

// Stop stops the auto-scaler
func (as *AutoScaler) Stop() error {
	as.logger.Info("Stopping auto-scaler")
	as.cancel()
	return nil
}

// runScalingChecks runs periodic scaling checks
func (as *AutoScaler) runScalingChecks() {
	ticker := time.NewTicker(as.config.CheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-as.ctx.Done():
			return
		case <-ticker.C:
			as.checkScaling()
		}
	}
}

// checkScaling checks if scaling is needed
func (as *AutoScaler) checkScaling() {
	// Get current cluster status
	// This would typically come from the scaling manager
	// For now, we'll use placeholder logic
	as.logger.Debug("Running scaling check")

	// Check if we need to scale up
	// Check if we need to scale down
	// Implement scaling logic based on metrics