package performance

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/sirupsen/logrus"
)

// ScalingManager manages horizontal scaling and load balancing
type ScalingManager struct {
	nodes        map[string]*Node
	loadBalancer *LoadBalancer
	autoscaler   *AutoScaler
	mu           sync.RWMutex
	logger       *logrus.Logger
	enabled      bool

	// ctx and cancel control the background health-monitoring goroutine
	// started in Start and stopped in Stop.
	ctx    context.Context
	cancel context.CancelFunc
}
// Node represents a compute node in the cluster
type Node struct {
	ID           string                 `json:"id"`
	Hostname     string                 `json:"hostname"`
	Address      string                 `json:"address"`
	Port         int                    `json:"port"`
	Status       NodeStatus             `json:"status"`
	Capabilities map[string]interface{} `json:"capabilities"`
	Metrics      *NodeMetrics           `json:"metrics"`
	LastSeen     time.Time              `json:"last_seen"`
	Tags         map[string]string      `json:"tags"`
}

// NodeStatus represents the status of a node
type NodeStatus string

const (
	NodeStatusOnline  NodeStatus = "online"
	NodeStatusOffline NodeStatus = "offline"
	NodeStatusBusy    NodeStatus = "busy"
	NodeStatusError   NodeStatus = "error"
)

// NodeMetrics represents performance metrics for a node
type NodeMetrics struct {
	CPUUsage    float64   `json:"cpu_usage"`
	MemoryUsage float64   `json:"memory_usage"`
	DiskUsage   float64   `json:"disk_usage"`
	LoadAverage float64   `json:"load_average"`
	ActiveJobs  int       `json:"active_jobs"`
	MaxJobs     int       `json:"max_jobs"`
	LastUpdate  time.Time `json:"last_update"`
}
// LoadBalancer manages load distribution across nodes
type LoadBalancer struct {
	strategy LoadBalancingStrategy
	nodes    map[string]*Node
	mu       sync.RWMutex
	logger   *logrus.Logger
}

// LoadBalancingStrategy defines the interface for load balancing strategies
type LoadBalancingStrategy interface {
	SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error)
	GetName() string
}

// LoadRequest represents a load balancing request
type LoadRequest struct {
	Type         string                 `json:"type"`
	Priority     int                    `json:"priority"`
	Requirements map[string]interface{} `json:"requirements"`
	Metadata     map[string]interface{} `json:"metadata"`
}
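
// leastLoadedStrategy is an illustrative implementation of the
// LoadBalancingStrategy interface above. It is a sketch only: nothing in
// this package references it (the default strategy is built by
// NewRoundRobinStrategy, defined elsewhere in the package), and the
// selection rule is an assumption, not the project's chosen policy. It
// picks the node with the lowest ActiveJobs/MaxJobs ratio; a caller could
// enable it with lb.SetStrategy(&leastLoadedStrategy{}).
type leastLoadedStrategy struct{}

// GetName returns the strategy name.
func (s *leastLoadedStrategy) GetName() string { return "least-loaded" }

// SelectNode returns the node with the lowest reported job utilization.
func (s *leastLoadedStrategy) SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error) {
	var best *Node
	var bestRatio float64

	for _, node := range nodes {
		if node.Metrics == nil || node.Metrics.MaxJobs == 0 {
			continue // skip nodes with no reported capacity
		}
		ratio := float64(node.Metrics.ActiveJobs) / float64(node.Metrics.MaxJobs)
		if best == nil || ratio < bestRatio {
			best = node
			bestRatio = ratio
		}
	}

	if best == nil {
		return nil, fmt.Errorf("no node with reported capacity")
	}
	return best, nil
}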
// AutoScaler manages automatic scaling of the cluster
type AutoScaler struct {
	config  *AutoScalerConfig
	nodes   map[string]*Node
	mu      sync.RWMutex
	logger  *logrus.Logger
	enabled bool
	ctx     context.Context
	cancel  context.CancelFunc
}

// AutoScalerConfig represents auto-scaling configuration
type AutoScalerConfig struct {
	Enabled            bool          `yaml:"enabled"`
	MinNodes           int           `yaml:"min_nodes"`
	MaxNodes           int           `yaml:"max_nodes"`
	ScaleUpThreshold   float64       `yaml:"scale_up_threshold"`
	ScaleDownThreshold float64       `yaml:"scale_down_threshold"`
	ScaleUpCooldown    time.Duration `yaml:"scale_up_cooldown"`
	ScaleDownCooldown  time.Duration `yaml:"scale_down_cooldown"`
	CheckInterval      time.Duration `yaml:"check_interval"`
}
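
// For reference, the yaml tags above imply a config file stanza roughly
// like the following. The values are illustrative only and mirror the
// defaults used by NewScalingManager below; whether duration strings such
// as "5m" decode into time.Duration depends on the YAML library and any
// custom unmarshalling the caller provides.
//
//	enabled: true
//	min_nodes: 2
//	max_nodes: 10
//	scale_up_threshold: 80.0
//	scale_down_threshold: 20.0
//	scale_up_cooldown: 5m
//	scale_down_cooldown: 10m
//	check_interval: 30s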
// NewScalingManager creates a new scaling manager
func NewScalingManager(enabled bool) *ScalingManager {
	ctx, cancel := context.WithCancel(context.Background())

	sm := &ScalingManager{
		nodes:   make(map[string]*Node),
		logger:  logrus.New(),
		enabled: enabled,
		ctx:     ctx,
		cancel:  cancel,
	}

	// Initialize load balancer
	sm.loadBalancer = NewLoadBalancer(sm.logger)

	// Initialize auto-scaler with default configuration
	sm.autoscaler = NewAutoScaler(&AutoScalerConfig{
		Enabled:            true,
		MinNodes:           2,
		MaxNodes:           10,
		ScaleUpThreshold:   80.0,
		ScaleDownThreshold: 20.0,
		ScaleUpCooldown:    5 * time.Minute,
		ScaleDownCooldown:  10 * time.Minute,
		CheckInterval:      30 * time.Second,
	}, sm.logger)

	return sm
}
// RegisterNode registers a new node in the cluster
func (sm *ScalingManager) RegisterNode(node *Node) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	// Validate node
	if node.ID == "" {
		return fmt.Errorf("node ID is required")
	}
	if node.Address == "" {
		return fmt.Errorf("node address is required")
	}

	// Check for duplicate
	if _, exists := sm.nodes[node.ID]; exists {
		return fmt.Errorf("node %s already exists", node.ID)
	}

	// Set default values
	if node.Status == "" {
		node.Status = NodeStatusOnline
	}
	if node.Capabilities == nil {
		node.Capabilities = make(map[string]interface{})
	}
	if node.Tags == nil {
		node.Tags = make(map[string]string)
	}
	if node.Metrics == nil {
		node.Metrics = &NodeMetrics{
			LastUpdate: time.Now(),
		}
	}

	node.LastSeen = time.Now()
	sm.nodes[node.ID] = node

	// Update load balancer
	sm.loadBalancer.AddNode(node)

	sm.logger.Infof("Registered node: %s (%s)", node.ID, node.Hostname)
	return nil
}
// UnregisterNode removes a node from the cluster
func (sm *ScalingManager) UnregisterNode(nodeID string) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	node, exists := sm.nodes[nodeID]
	if !exists {
		return fmt.Errorf("node %s not found", nodeID)
	}

	delete(sm.nodes, nodeID)
	sm.loadBalancer.RemoveNode(nodeID)

	sm.logger.Infof("Unregistered node: %s (%s)", node.ID, node.Hostname)
	return nil
}

// UpdateNodeMetrics updates metrics for a specific node
func (sm *ScalingManager) UpdateNodeMetrics(nodeID string, metrics *NodeMetrics) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	node, exists := sm.nodes[nodeID]
	if !exists {
		return fmt.Errorf("node %s not found", nodeID)
	}

	metrics.LastUpdate = time.Now()
	node.Metrics = metrics
	node.LastSeen = time.Now()

	// Update load balancer
	sm.loadBalancer.UpdateNode(node)

	return nil
}
// GetNode returns a node by ID
func (sm *ScalingManager) GetNode(nodeID string) (*Node, bool) {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	node, exists := sm.nodes[nodeID]
	return node, exists
}

// GetAllNodes returns all registered nodes
func (sm *ScalingManager) GetAllNodes() map[string]*Node {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	// Create a copy to avoid race conditions
	nodes := make(map[string]*Node)
	for k, v := range sm.nodes {
		nodes[k] = v
	}

	return nodes
}
// GetAvailableNodes returns all available nodes
func (sm *ScalingManager) GetAvailableNodes() []*Node {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	var available []*Node
	for _, node := range sm.nodes {
		if node.Status == NodeStatusOnline && node.Metrics != nil && node.Metrics.ActiveJobs < node.Metrics.MaxJobs {
			available = append(available, node)
		}
	}

	return available
}

// SelectNode selects a node for a specific request
func (sm *ScalingManager) SelectNode(request *LoadRequest) (*Node, error) {
	return sm.loadBalancer.SelectNode(request)
}
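
// exampleUsage is an illustrative sketch that is not called anywhere in
// this package. It shows how a caller might wire the scaling manager
// together: start it, register a node, and ask for a placement decision.
// The node ID, hostname, address, and capacity below are assumptions made
// for the example, not values used by the real system.
func exampleUsage() error {
	sm := NewScalingManager(true)
	if err := sm.Start(); err != nil {
		return err
	}
	defer sm.Stop()

	// Register a worker with room for four concurrent jobs.
	node := &Node{
		ID:       "node-1",
		Hostname: "worker-1.internal",
		Address:  "10.0.0.10",
		Port:     8080,
		Metrics:  &NodeMetrics{MaxJobs: 4},
	}
	if err := sm.RegisterNode(node); err != nil {
		return err
	}

	// Ask the load balancer to place a request on an available node.
	selected, err := sm.SelectNode(&LoadRequest{Type: "scan", Priority: 1})
	if err != nil {
		return err
	}
	sm.logger.Infof("selected node %s", selected.ID)
	return nil
}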
// Start starts the scaling manager
func (sm *ScalingManager) Start() error {
	if !sm.enabled {
		sm.logger.Info("Scaling manager is disabled")
		return nil
	}

	sm.logger.Info("Starting scaling manager")

	// Start auto-scaler
	if err := sm.autoscaler.Start(); err != nil {
		return fmt.Errorf("failed to start auto-scaler: %w", err)
	}

	// Start node health monitoring
	go sm.monitorNodeHealth()

	return nil
}
// Stop stops the scaling manager
func (sm *ScalingManager) Stop() error {
	sm.logger.Info("Stopping scaling manager")

	// Stop the node health-monitoring goroutine
	sm.cancel()

	// Stop auto-scaler
	if err := sm.autoscaler.Stop(); err != nil {
		return fmt.Errorf("failed to stop auto-scaler: %w", err)
	}

	return nil
}
// monitorNodeHealth monitors the health of all nodes until Stop is called
func (sm *ScalingManager) monitorNodeHealth() {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-sm.ctx.Done():
			return
		case <-ticker.C:
			sm.checkNodeHealth()
		}
	}
}
// checkNodeHealth checks the health of all nodes
func (sm *ScalingManager) checkNodeHealth() {
	nodes := sm.GetAllNodes()

	for _, node := range nodes {
		// Check if node is responsive
		if time.Since(node.LastSeen) > 2*time.Minute {
			sm.logger.Warnf("Node %s appears to be unresponsive", node.ID)
			sm.markNodeOffline(node.ID)
		}

		// Check metrics freshness
		if node.Metrics != nil && time.Since(node.Metrics.LastUpdate) > 5*time.Minute {
			sm.logger.Warnf("Node %s metrics are stale", node.ID)
		}
	}
}

// markNodeOffline marks a node as offline
func (sm *ScalingManager) markNodeOffline(nodeID string) {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	if node, exists := sm.nodes[nodeID]; exists {
		node.Status = NodeStatusOffline
		sm.logger.Infof("Marked node %s as offline", nodeID)
	}
}
// GetClusterStatus returns the current status of the cluster
func (sm *ScalingManager) GetClusterStatus() map[string]interface{} {
	nodes := sm.GetAllNodes()

	var onlineNodes, offlineNodes, busyNodes, errorNodes int
	var totalCapacity, usedCapacity int

	for _, node := range nodes {
		switch node.Status {
		case NodeStatusOnline:
			onlineNodes++
		case NodeStatusOffline:
			offlineNodes++
		case NodeStatusBusy:
			busyNodes++
		case NodeStatusError:
			errorNodes++
		}

		if node.Metrics != nil {
			totalCapacity += node.Metrics.MaxJobs
			usedCapacity += node.Metrics.ActiveJobs
		}
	}

	status := map[string]interface{}{
		"total_nodes":    len(nodes),
		"online_nodes":   onlineNodes,
		"offline_nodes":  offlineNodes,
		"busy_nodes":     busyNodes,
		"error_nodes":    errorNodes,
		"total_capacity": totalCapacity,
		"used_capacity":  usedCapacity,
		"timestamp":      time.Now(),
	}

	// Calculate utilization percentage
	if totalCapacity > 0 {
		status["utilization_percentage"] = float64(usedCapacity) / float64(totalCapacity) * 100
	}

	return status
}
// NewLoadBalancer creates a new load balancer
func NewLoadBalancer(logger *logrus.Logger) *LoadBalancer {
	lb := &LoadBalancer{
		strategy: NewRoundRobinStrategy(),
		nodes:    make(map[string]*Node),
		logger:   logger,
	}

	return lb
}

// AddNode adds a node to the load balancer
func (lb *LoadBalancer) AddNode(node *Node) {
	lb.mu.Lock()
	defer lb.mu.Unlock()

	lb.nodes[node.ID] = node
	lb.logger.Debugf("Added node %s to load balancer", node.ID)
}

// RemoveNode removes a node from the load balancer
func (lb *LoadBalancer) RemoveNode(nodeID string) {
	lb.mu.Lock()
	defer lb.mu.Unlock()

	delete(lb.nodes, nodeID)
	lb.logger.Debugf("Removed node %s from load balancer", nodeID)
}

// UpdateNode updates a node in the load balancer
func (lb *LoadBalancer) UpdateNode(node *Node) {
	lb.mu.Lock()
	defer lb.mu.Unlock()

	lb.nodes[node.ID] = node
}
// SelectNode selects a node using the configured strategy
func (lb *LoadBalancer) SelectNode(request *LoadRequest) (*Node, error) {
	lb.mu.RLock()
	defer lb.mu.RUnlock()

	// Filter available nodes
	availableNodes := make(map[string]*Node)
	for id, node := range lb.nodes {
		if node.Status == NodeStatusOnline && node.Metrics != nil && node.Metrics.ActiveJobs < node.Metrics.MaxJobs {
			availableNodes[id] = node
		}
	}

	if len(availableNodes) == 0 {
		return nil, fmt.Errorf("no available nodes")
	}

	return lb.strategy.SelectNode(availableNodes, request)
}

// SetStrategy sets the load balancing strategy
func (lb *LoadBalancer) SetStrategy(strategy LoadBalancingStrategy) {
	lb.mu.Lock()
	defer lb.mu.Unlock()

	lb.strategy = strategy
	lb.logger.Infof("Load balancing strategy changed to: %s", strategy.GetName())
}
// NewAutoScaler creates a new auto-scaler
func NewAutoScaler(config *AutoScalerConfig, logger *logrus.Logger) *AutoScaler {
	ctx, cancel := context.WithCancel(context.Background())

	as := &AutoScaler{
		config:  config,
		nodes:   make(map[string]*Node),
		logger:  logger,
		enabled: config.Enabled,
		ctx:     ctx,
		cancel:  cancel,
	}

	return as
}

// Start starts the auto-scaler
func (as *AutoScaler) Start() error {
	if !as.enabled {
		as.logger.Info("Auto-scaler is disabled")
		return nil
	}

	as.logger.Info("Starting auto-scaler")

	// Start scaling checks
	go as.runScalingChecks()

	return nil
}

// Stop stops the auto-scaler
func (as *AutoScaler) Stop() error {
	as.logger.Info("Stopping auto-scaler")
	as.cancel()
	return nil
}
// runScalingChecks runs periodic scaling checks
func (as *AutoScaler) runScalingChecks() {
	ticker := time.NewTicker(as.config.CheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-as.ctx.Done():
			return
		case <-ticker.C:
			as.checkScaling()
		}
	}
}

// checkScaling checks if scaling is needed
func (as *AutoScaler) checkScaling() {
	// The cluster status would typically come from the scaling manager;
	// for now this is placeholder logic.
	as.logger.Debug("Running scaling check")

	// TODO: decide whether to scale up or down and implement the scaling
	// logic based on cluster metrics.
}
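
// desiredNodeCount is an illustrative helper that checkScaling does not
// call; it sketches one way the thresholds in AutoScalerConfig could be
// applied once real cluster metrics are wired in. Given the cluster's
// utilization (in percent) and its current node count, it returns the node
// count to scale toward, clamped to [MinNodes, MaxNodes]. Cooldown handling
// and the actual provisioning or draining of nodes are out of scope here.
func (as *AutoScaler) desiredNodeCount(utilizationPct float64, currentNodes int) int {
	target := currentNodes

	switch {
	case utilizationPct > as.config.ScaleUpThreshold:
		target = currentNodes + 1
	case utilizationPct < as.config.ScaleDownThreshold:
		target = currentNodes - 1
	}

	if target < as.config.MinNodes {
		target = as.config.MinNodes
	}
	if target > as.config.MaxNodes {
		target = as.config.MaxNodes
	}
	return target
}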