first commit

This commit is contained in:
robojerk 2025-08-18 23:32:51 -07:00
commit 57bb8aafbe
27 changed files with 8538 additions and 0 deletions

View file

@ -0,0 +1,446 @@
package performance
import (
	"context"
	"fmt"
	"runtime"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/shirou/gopsutil/v3/cpu"
	"github.com/shirou/gopsutil/v3/disk"
	"github.com/shirou/gopsutil/v3/mem"
	"github.com/sirupsen/logrus"
)
// Profiler manages performance profiling and metrics collection.
// Registered MetricCollectors are sampled on a fixed interval by the
// goroutines started in Start; samples are stored per metric name with a
// bounded history.
type Profiler struct {
	metrics    map[string]*Metric         // latest metric per name, including history
	collectors map[string]MetricCollector // registered collectors, keyed by GetName()
	mu         sync.RWMutex               // guards metrics and collectors
	logger     *logrus.Logger
	enabled    bool          // when false, Start is a no-op
	interval   time.Duration // collection cadence
	ctx        context.Context
	cancel     context.CancelFunc // Stop cancels this to end the background goroutines
}

// Metric represents a performance metric sample plus its retained history.
type Metric struct {
	Name       string                 `json:"name"`
	Value      float64                `json:"value"`
	Unit       string                 `json:"unit"`
	Timestamp  time.Time              `json:"timestamp"`
	Tags       map[string]string      `json:"tags"`
	Metadata   map[string]interface{} `json:"metadata"`
	History    []MetricPoint          `json:"history,omitempty"` // oldest-first samples
	MaxHistory int                    `json:"max_history"`       // history cap enforced by storeMetric
}

// MetricPoint represents a single metric measurement.
type MetricPoint struct {
	Value     float64   `json:"value"`
	Timestamp time.Time `json:"timestamp"`
}

// MetricCollector defines the interface for collecting metrics.
type MetricCollector interface {
	// Collect produces one sample; on error the sample is skipped and logged.
	Collect() (*Metric, error)
	// GetName identifies the collector; also used as its registry key.
	GetName() string
	// GetInterval is the collector's preferred sampling interval.
	GetInterval() time.Duration
}

// ProfilerConfig represents profiler configuration; see NewProfiler for the
// defaults applied to zero-valued fields.
type ProfilerConfig struct {
	Enabled    bool                   `yaml:"enabled"`
	Interval   time.Duration          `yaml:"interval"`    // default: 30s
	MaxHistory int                    `yaml:"max_history"` // default: 1000
	Metrics    []string               `yaml:"metrics"`
	Exporters  []string               `yaml:"exporters"`
	Custom     map[string]interface{} `yaml:"custom"`
}
// NewProfiler creates a new performance profiler.
//
// A nil config no longer panics: it is treated as an all-default, disabled
// profiler. Zero-valued Interval and MaxHistory fields are populated with
// defaults (30s / 1000); note the caller's config is updated in place, as
// in the original implementation.
func NewProfiler(config *ProfilerConfig) *Profiler {
	if config == nil {
		config = &ProfilerConfig{}
	}
	if config.Interval == 0 {
		config.Interval = 30 * time.Second
	}
	if config.MaxHistory == 0 {
		config.MaxHistory = 1000
	}
	ctx, cancel := context.WithCancel(context.Background())
	profiler := &Profiler{
		metrics:    make(map[string]*Metric),
		collectors: make(map[string]MetricCollector),
		logger:     logrus.New(),
		enabled:    config.Enabled,
		interval:   config.Interval,
		ctx:        ctx,
		cancel:     cancel,
	}
	// Initialize default collectors
	profiler.initializeDefaultCollectors()
	return profiler
}
// initializeDefaultCollectors registers the built-in collectors (system,
// Go runtime, compose, and phase metrics), each constructed with the
// profiler's configured interval and logger. The collector constructors
// are defined elsewhere in this package.
func (p *Profiler) initializeDefaultCollectors() {
	// System metrics collector
	p.RegisterCollector(NewSystemMetricsCollector(p.interval, p.logger))
	// Runtime metrics collector
	p.RegisterCollector(NewRuntimeMetricsCollector(p.interval, p.logger))
	// Compose metrics collector
	p.RegisterCollector(NewComposeMetricsCollector(p.interval, p.logger))
	// Phase metrics collector
	p.RegisterCollector(NewPhaseMetricsCollector(p.interval, p.logger))
}
// RegisterCollector adds a collector under its reported name, replacing
// any previously registered collector with the same name.
func (p *Profiler) RegisterCollector(collector MetricCollector) {
	p.mu.Lock()
	defer p.mu.Unlock()
	name := collector.GetName()
	p.collectors[name] = collector
	p.logger.Infof("Registered metric collector: %s", name)
}
// Start launches the background collection and aggregation goroutines.
// When the profiler is disabled this is a no-op and still returns nil.
func (p *Profiler) Start() error {
	if !p.enabled {
		p.logger.Info("Profiler is disabled")
		return nil
	}
	p.logger.Info("Starting performance profiler")
	go p.collectMetrics()   // periodic collection loop
	go p.aggregateMetrics() // periodic aggregation loop
	return nil
}
// Stop cancels the profiler's context, which terminates the collection
// and aggregation goroutines started by Start.
func (p *Profiler) Stop() error {
	p.logger.Info("Stopping performance profiler")
	p.cancel()
	return nil
}
// collectMetrics drives periodic metric collection on the configured
// interval until the profiler's context is cancelled.
func (p *Profiler) collectMetrics() {
	ticker := time.NewTicker(p.interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			p.collectAllMetrics()
		case <-p.ctx.Done():
			return
		}
	}
}
// collectAllMetrics snapshots the registered collectors under the read
// lock, then runs each Collect concurrently; successful samples are
// stored and failures are logged.
func (p *Profiler) collectAllMetrics() {
	p.mu.RLock()
	snapshot := make([]MetricCollector, 0, len(p.collectors))
	for _, c := range p.collectors {
		snapshot = append(snapshot, c)
	}
	p.mu.RUnlock()

	var wg sync.WaitGroup
	for _, c := range snapshot {
		wg.Add(1)
		go func(col MetricCollector) {
			defer wg.Done()
			metric, err := col.Collect()
			if err != nil {
				p.logger.Errorf("Failed to collect metric from %s: %v", col.GetName(), err)
				return
			}
			p.storeMetric(metric)
		}(c)
	}
	wg.Wait()
}
// storeMetric appends the metric's current value to its history (trimmed
// to the newest MaxHistory points) and records it under its name.
//
// Fix: collectors frequently leave MaxHistory at its zero value; the old
// trim (`len > MaxHistory` with MaxHistory == 0) then cut the history to
// zero entries on every sample, silently discarding all history. A
// non-positive MaxHistory now falls back to the profiler-wide default of
// 1000 (matching NewProfiler's default).
func (p *Profiler) storeMetric(metric *Metric) {
	p.mu.Lock()
	defer p.mu.Unlock()
	maxHistory := metric.MaxHistory
	if maxHistory <= 0 {
		maxHistory = 1000
	}
	// append handles a nil History slice, so no explicit initialization needed.
	metric.History = append(metric.History, MetricPoint{
		Value:     metric.Value,
		Timestamp: metric.Timestamp,
	})
	if len(metric.History) > maxHistory {
		metric.History = metric.History[len(metric.History)-maxHistory:]
	}
	p.metrics[metric.Name] = metric
}
// GetMetric looks up a metric by name; the second result reports whether
// the metric exists.
func (p *Profiler) GetMetric(name string) (*Metric, bool) {
	p.mu.RLock()
	defer p.mu.RUnlock()
	m, ok := p.metrics[name]
	return m, ok
}
// GetAllMetrics returns a shallow copy of the metrics map so callers can
// iterate without holding the profiler's lock. Note the *Metric values
// themselves are shared, not cloned.
func (p *Profiler) GetAllMetrics() map[string]*Metric {
	p.mu.RLock()
	defer p.mu.RUnlock()
	out := make(map[string]*Metric, len(p.metrics))
	for name, m := range p.metrics {
		out[name] = m
	}
	return out
}
// GetMetricHistory returns the points recorded for the named metric within
// the trailing duration window. Unknown metric names yield an error.
func (p *Profiler) GetMetricHistory(name string, duration time.Duration) ([]MetricPoint, error) {
	m, ok := p.GetMetric(name)
	if !ok {
		return nil, fmt.Errorf("metric %s not found", name)
	}
	cutoff := time.Now().Add(-duration)
	var recent []MetricPoint
	for _, pt := range m.History {
		if pt.Timestamp.After(cutoff) {
			recent = append(recent, pt)
		}
	}
	return recent, nil
}
// aggregateMetrics periodically recomputes metric aggregations (at twice
// the collection interval) until the profiler's context is cancelled.
func (p *Profiler) aggregateMetrics() {
	ticker := time.NewTicker(p.interval * 2)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			p.aggregateAllMetrics()
		case <-p.ctx.Done():
			return
		}
	}
}
// aggregateAllMetrics computes min/max/avg/median/p95/p99 over each
// metric's recorded history and logs the result at debug level.
func (p *Profiler) aggregateAllMetrics() {
	aggregations := make(map[string]map[string]float64)
	for name, metric := range p.GetAllMetrics() {
		if len(metric.History) == 0 {
			continue
		}
		values := make([]float64, 0, len(metric.History))
		for _, pt := range metric.History {
			values = append(values, pt.Value)
		}
		aggregations[name] = map[string]float64{
			"min":    p.min(values),
			"max":    p.max(values),
			"avg":    p.average(values),
			"median": p.median(values),
			"p95":    p.percentile(values, 95),
			"p99":    p.percentile(values, 99),
		}
	}
	p.logger.WithField("aggregations", aggregations).Debug("Metric aggregations calculated")
}
// Utility functions for metric calculations

// min returns the smallest value in the slice, or 0 for an empty slice.
func (p *Profiler) min(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	lowest := values[0]
	for _, v := range values[1:] {
		if v < lowest {
			lowest = v
		}
	}
	return lowest
}
// max returns the largest value in the slice, or 0 for an empty slice.
func (p *Profiler) max(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	highest := values[0]
	for _, v := range values[1:] {
		if v > highest {
			highest = v
		}
	}
	return highest
}
// average returns the arithmetic mean of values, or 0 for an empty slice.
func (p *Profiler) average(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	var total float64
	for _, v := range values {
		total += v
	}
	return total / float64(len(values))
}
// median returns the 50th percentile of values (0 for an empty slice).
func (p *Profiler) median(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	return p.percentile(values, 50)
}
// percentile returns the pct-th percentile of values using nearest-rank
// interpolation on the index, or 0 for an empty slice.
//
// Fix: the previous implementation indexed into the slice WITHOUT sorting
// it — the values arrive in time order (metric history), so the result was
// an arbitrary sample rather than a percentile. We now sort a copy
// (leaving the caller's slice untouched, since min/max/average are
// computed from the same slice).
func (p *Profiler) percentile(values []float64, pct int) float64 {
	if len(values) == 0 {
		return 0
	}
	sorted := make([]float64, len(values))
	copy(sorted, values)
	sort.Float64s(sorted)
	index := int(float64(pct) / 100.0 * float64(len(sorted)-1))
	if index < 0 {
		index = 0
	}
	if index >= len(sorted) {
		index = len(sorted) - 1
	}
	return sorted[index]
}
// GetPerformanceReport assembles a point-in-time report containing every
// collected metric, a categorized summary, and basic host information.
func (p *Profiler) GetPerformanceReport() map[string]interface{} {
	metrics := p.GetAllMetrics()
	return map[string]interface{}{
		"timestamp": time.Now(),
		"metrics":   metrics,
		"summary":   p.generateSummary(metrics),
		"system":    p.getSystemInfo(),
	}
}
// generateSummary builds a summary of the supplied metrics: the total
// count, a per-category tally, and any threshold alerts.
//
// Fix: the original wrote `summary["alerts"].([]string) = append(...)`,
// which assigns to a type-assertion expression — that is not assignable
// in Go and does not compile. Accumulating into plain local variables and
// building the map at the end fixes this and avoids repeated assertions.
func (p *Profiler) generateSummary(metrics map[string]*Metric) map[string]interface{} {
	categories := make(map[string]int)
	alerts := make([]string, 0)
	for name, metric := range metrics {
		categories[p.getMetricCategory(name)]++
		if alert := p.checkMetricAlert(metric); alert != "" {
			alerts = append(alerts, alert)
		}
	}
	return map[string]interface{}{
		"total_metrics": len(metrics),
		"categories":    categories,
		"alerts":        alerts,
	}
}
// getMetricCategory buckets a metric name into system / compose / runtime
// / other based on well-known substrings in the name.
func (p *Profiler) getMetricCategory(name string) string {
	if contains(name, "cpu") || contains(name, "memory") || contains(name, "disk") {
		return "system"
	}
	if contains(name, "compose") || contains(name, "phase") {
		return "compose"
	}
	if contains(name, "runtime") || contains(name, "goroutine") {
		return "runtime"
	}
	return "other"
}
// checkMetricAlert returns a human-readable alert string when a metric
// crosses its hard-coded threshold (cpu_usage > 90, memory_usage > 85),
// or the empty string when no alert applies.
func (p *Profiler) checkMetricAlert(metric *Metric) string {
	switch {
	case metric.Name == "cpu_usage" && metric.Value > 90:
		return fmt.Sprintf("High CPU usage: %.2f%%", metric.Value)
	case metric.Name == "memory_usage" && metric.Value > 85:
		return fmt.Sprintf("High memory usage: %.2f%%", metric.Value)
	default:
		return ""
	}
}
// getSystemInfo gathers Go runtime details plus best-effort CPU and memory
// information from gopsutil; on collection errors those fields are simply
// omitted from the result.
func (p *Profiler) getSystemInfo() map[string]interface{} {
	info := map[string]interface{}{
		"go_version": runtime.Version(),
		"go_os":      runtime.GOOS,
		"go_arch":    runtime.GOARCH,
		"num_cpu":    runtime.NumCPU(),
		"timestamp":  time.Now(),
	}
	if cpus, err := cpu.Info(); err == nil && len(cpus) > 0 {
		info["cpu_model"] = cpus[0].ModelName
		info["cpu_cores"] = cpus[0].Cores
	}
	if vm, err := mem.VirtualMemory(); err == nil {
		info["memory_total"] = vm.Total
		info["memory_available"] = vm.Available
	}
	return info
}
// contains reports whether substr occurs anywhere within s.
//
// Fix: the previous hand-rolled implementation recursed once per starting
// offset (deep recursion on long inputs, quadratic comparisons). The
// standard library's strings.Contains has identical semantics — including
// the empty-substring case — and is both clearer and faster.
func contains(s, substr string) bool {
	return strings.Contains(s, substr)
}

View file

@ -0,0 +1,497 @@
package performance
import (
"context"
"fmt"
"sync"
"time"
"github.com/sirupsen/logrus"
)
// ScalingManager manages horizontal scaling and load balancing across a
// registry of compute nodes.
type ScalingManager struct {
	nodes        map[string]*Node // registered nodes, keyed by node ID
	loadBalancer *LoadBalancer
	autoscaler   *AutoScaler
	mu           sync.RWMutex // guards nodes
	logger       *logrus.Logger
	enabled      bool // when false, Start is a no-op
}

// Node represents a compute node in the cluster.
type Node struct {
	ID           string                 `json:"id"`
	Hostname     string                 `json:"hostname"`
	Address      string                 `json:"address"`
	Port         int                    `json:"port"`
	Status       NodeStatus             `json:"status"`
	Capabilities map[string]interface{} `json:"capabilities"` // e.g. "architecture", consulted by AdaptiveStrategy
	Metrics      *NodeMetrics           `json:"metrics"`
	LastSeen     time.Time              `json:"last_seen"` // heartbeat timestamp used by health checks
	Tags         map[string]string      `json:"tags"`      // e.g. "priority": "high"/"low" for weighting
}

// NodeStatus represents the status of a node.
type NodeStatus string

const (
	NodeStatusOnline  NodeStatus = "online"
	NodeStatusOffline NodeStatus = "offline"
	NodeStatusBusy    NodeStatus = "busy"
	NodeStatusError   NodeStatus = "error"
)

// NodeMetrics represents performance metrics for a node.
type NodeMetrics struct {
	CPUUsage    float64   `json:"cpu_usage"`
	MemoryUsage float64   `json:"memory_usage"`
	DiskUsage   float64   `json:"disk_usage"`
	LoadAverage float64   `json:"load_average"`
	ActiveJobs  int       `json:"active_jobs"`
	MaxJobs     int       `json:"max_jobs"` // capacity ceiling used for availability checks
	LastUpdate  time.Time `json:"last_update"`
}

// LoadBalancer manages load distribution across nodes via a pluggable
// LoadBalancingStrategy.
type LoadBalancer struct {
	strategy LoadBalancingStrategy
	nodes    map[string]*Node
	mu       sync.RWMutex // guards strategy and nodes
	logger   *logrus.Logger
}

// LoadBalancingStrategy defines the interface for load balancing strategies.
type LoadBalancingStrategy interface {
	// SelectNode picks one node from the (pre-filtered) candidate set.
	SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error)
	// GetName returns the strategy's registry name.
	GetName() string
}

// LoadRequest represents a load balancing request.
type LoadRequest struct {
	Type         string                 `json:"type"`
	Priority     int                    `json:"priority"`
	Requirements map[string]interface{} `json:"requirements"` // e.g. "architecture" for node matching
	Metadata     map[string]interface{} `json:"metadata"`     // e.g. "client_ip" used by IPHashStrategy
}

// AutoScaler manages automatic scaling of the cluster.
type AutoScaler struct {
	config  *AutoScalerConfig
	nodes   map[string]*Node
	mu      sync.RWMutex
	logger  *logrus.Logger
	enabled bool // mirrors config.Enabled at construction time
	ctx     context.Context
	cancel  context.CancelFunc // Stop cancels this to end runScalingChecks
}

// AutoScalerConfig represents auto-scaling configuration.
type AutoScalerConfig struct {
	Enabled            bool          `yaml:"enabled"`
	MinNodes           int           `yaml:"min_nodes"`
	MaxNodes           int           `yaml:"max_nodes"`
	ScaleUpThreshold   float64       `yaml:"scale_up_threshold"`
	ScaleDownThreshold float64       `yaml:"scale_down_threshold"`
	ScaleUpCooldown    time.Duration `yaml:"scale_up_cooldown"`
	ScaleDownCooldown  time.Duration `yaml:"scale_down_cooldown"`
	CheckInterval      time.Duration `yaml:"check_interval"`
}
// NewScalingManager builds a scaling manager with an empty node registry,
// a round-robin load balancer, and an auto-scaler configured with
// hard-coded default thresholds (2–10 nodes, 80%/20% scale thresholds).
func NewScalingManager(enabled bool) *ScalingManager {
	logger := logrus.New()
	sm := &ScalingManager{
		nodes:   make(map[string]*Node),
		logger:  logger,
		enabled: enabled,
	}
	sm.loadBalancer = NewLoadBalancer(logger)
	defaults := &AutoScalerConfig{
		Enabled:            true,
		MinNodes:           2,
		MaxNodes:           10,
		ScaleUpThreshold:   80.0,
		ScaleDownThreshold: 20.0,
		ScaleUpCooldown:    5 * time.Minute,
		ScaleDownCooldown:  10 * time.Minute,
		CheckInterval:      30 * time.Second,
	}
	sm.autoscaler = NewAutoScaler(defaults, logger)
	return sm
}
// RegisterNode validates a node, fills zero-valued fields with defaults
// (online status, empty capability/tag maps, fresh metrics), records it in
// the registry, and wires it into the load balancer. It fails on a missing
// ID or address, or a duplicate ID.
func (sm *ScalingManager) RegisterNode(node *Node) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	if node.ID == "" {
		return fmt.Errorf("node ID is required")
	}
	if node.Address == "" {
		return fmt.Errorf("node address is required")
	}
	if _, dup := sm.nodes[node.ID]; dup {
		return fmt.Errorf("node %s already exists", node.ID)
	}
	if node.Status == "" {
		node.Status = NodeStatusOnline
	}
	if node.Capabilities == nil {
		node.Capabilities = make(map[string]interface{})
	}
	if node.Tags == nil {
		node.Tags = make(map[string]string)
	}
	if node.Metrics == nil {
		node.Metrics = &NodeMetrics{LastUpdate: time.Now()}
	}
	node.LastSeen = time.Now()
	sm.nodes[node.ID] = node
	sm.loadBalancer.AddNode(node)
	sm.logger.Infof("Registered node: %s (%s)", node.ID, node.Hostname)
	return nil
}
// UnregisterNode removes a node from both the registry and the load
// balancer, erroring if the node is unknown.
func (sm *ScalingManager) UnregisterNode(nodeID string) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	node, ok := sm.nodes[nodeID]
	if !ok {
		return fmt.Errorf("node %s not found", nodeID)
	}
	delete(sm.nodes, nodeID)
	sm.loadBalancer.RemoveNode(nodeID)
	sm.logger.Infof("Unregistered node: %s (%s)", node.ID, node.Hostname)
	return nil
}
// UpdateNodeMetrics replaces a node's metrics, stamping both the metrics
// and the node with the current time, and pushes the updated node to the
// load balancer. Returns an error for an unknown node or nil metrics.
//
// Fix: a nil metrics argument previously caused a nil-pointer panic on
// `metrics.LastUpdate = ...`; it now fails with a clear error instead
// (which also keeps node.Metrics from being set to nil and tripping up
// the availability checks elsewhere).
func (sm *ScalingManager) UpdateNodeMetrics(nodeID string, metrics *NodeMetrics) error {
	if metrics == nil {
		return fmt.Errorf("metrics must not be nil")
	}
	sm.mu.Lock()
	defer sm.mu.Unlock()
	node, ok := sm.nodes[nodeID]
	if !ok {
		return fmt.Errorf("node %s not found", nodeID)
	}
	now := time.Now()
	metrics.LastUpdate = now
	node.Metrics = metrics
	node.LastSeen = now
	// Propagate to the load balancer's view of the node.
	sm.loadBalancer.UpdateNode(node)
	return nil
}
// GetNode looks up a node by ID; the second result reports whether it is
// registered.
func (sm *ScalingManager) GetNode(nodeID string) (*Node, bool) {
	sm.mu.RLock()
	defer sm.mu.RUnlock()
	n, ok := sm.nodes[nodeID]
	return n, ok
}
// GetAllNodes returns a shallow copy of the node registry so callers can
// iterate without holding the manager's lock; the *Node values themselves
// are shared.
func (sm *ScalingManager) GetAllNodes() map[string]*Node {
	sm.mu.RLock()
	defer sm.mu.RUnlock()
	out := make(map[string]*Node, len(sm.nodes))
	for id, n := range sm.nodes {
		out[id] = n
	}
	return out
}
// GetAvailableNodes returns every node that is online and has spare job
// capacity (ActiveJobs < MaxJobs).
//
// Fix: nodes whose Metrics pointer is nil previously caused a nil-pointer
// panic; they are now skipped (treated as unavailable). RegisterNode
// normally guarantees Metrics is set, but nodes are shared pointers and
// can be mutated by other callers.
func (sm *ScalingManager) GetAvailableNodes() []*Node {
	sm.mu.RLock()
	defer sm.mu.RUnlock()
	var available []*Node
	for _, node := range sm.nodes {
		if node.Status != NodeStatusOnline || node.Metrics == nil {
			continue
		}
		if node.Metrics.ActiveJobs < node.Metrics.MaxJobs {
			available = append(available, node)
		}
	}
	return available
}
// SelectNode delegates node selection for the given request to the load
// balancer, which filters for availability and applies its configured
// strategy.
func (sm *ScalingManager) SelectNode(request *LoadRequest) (*Node, error) {
return sm.loadBalancer.SelectNode(request)
}
// Start brings up the auto-scaler and launches the node-health monitoring
// goroutine. A disabled manager logs and returns nil without starting
// anything.
func (sm *ScalingManager) Start() error {
	if !sm.enabled {
		sm.logger.Info("Scaling manager is disabled")
		return nil
	}
	sm.logger.Info("Starting scaling manager")
	if err := sm.autoscaler.Start(); err != nil {
		return fmt.Errorf("failed to start auto-scaler: %w", err)
	}
	go sm.monitorNodeHealth() // background health sweeps
	return nil
}
// Stop shuts down the auto-scaler. Note: the node-health monitor goroutine
// started by Start has no stop signal in the current design and keeps
// running.
func (sm *ScalingManager) Stop() error {
	sm.logger.Info("Stopping scaling manager")
	if err := sm.autoscaler.Stop(); err != nil {
		return fmt.Errorf("failed to stop auto-scaler: %w", err)
	}
	return nil
}
// monitorNodeHealth runs a health sweep over all nodes every 30 seconds.
//
// NOTE(review): this goroutine has no stop signal — ScalingManager carries
// no context/cancel, so the loop runs for the life of the process even
// after Stop() is called. Threading a context through ScalingManager (as
// AutoScaler already does for runScalingChecks) would let it shut down
// cleanly; confirm before wiring callers that expect Stop to be total.
func (sm *ScalingManager) monitorNodeHealth() {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	// A select with a single ticker case is equivalent to ranging over the
	// channel; `for range` is the idiomatic form.
	for range ticker.C {
		sm.checkNodeHealth()
	}
}
// checkNodeHealth sweeps all nodes: nodes with no heartbeat for over two
// minutes are warned about and marked offline, and nodes whose metrics
// have not been updated for over five minutes get a staleness warning.
func (sm *ScalingManager) checkNodeHealth() {
	for _, node := range sm.GetAllNodes() {
		if time.Since(node.LastSeen) > 2*time.Minute {
			sm.logger.Warnf("Node %s appears to be unresponsive", node.ID)
			sm.markNodeOffline(node.ID)
		}
		if node.Metrics != nil && time.Since(node.Metrics.LastUpdate) > 5*time.Minute {
			sm.logger.Warnf("Node %s metrics are stale", node.ID)
		}
	}
}
// markNodeOffline sets a node's status to offline; unknown IDs are ignored.
func (sm *ScalingManager) markNodeOffline(nodeID string) {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	node, ok := sm.nodes[nodeID]
	if !ok {
		return
	}
	node.Status = NodeStatusOffline
	sm.logger.Infof("Marked node %s as offline", nodeID)
}
// GetClusterStatus summarizes node counts by status plus total/used job
// capacity; "utilization_percentage" is included only when the cluster
// reports non-zero total capacity.
func (sm *ScalingManager) GetClusterStatus() map[string]interface{} {
	nodes := sm.GetAllNodes()
	var online, offline, busy, errored int
	var totalCap, usedCap int
	for _, node := range nodes {
		switch node.Status {
		case NodeStatusOnline:
			online++
		case NodeStatusOffline:
			offline++
		case NodeStatusBusy:
			busy++
		case NodeStatusError:
			errored++
		}
		if node.Metrics != nil {
			totalCap += node.Metrics.MaxJobs
			usedCap += node.Metrics.ActiveJobs
		}
	}
	status := map[string]interface{}{
		"total_nodes":    len(nodes),
		"online_nodes":   online,
		"offline_nodes":  offline,
		"busy_nodes":     busy,
		"error_nodes":    errored,
		"total_capacity": totalCap,
		"used_capacity":  usedCap,
		"timestamp":      time.Now(),
	}
	if totalCap > 0 {
		status["utilization_percentage"] = float64(usedCap) / float64(totalCap) * 100
	}
	return status
}
// NewLoadBalancer creates a load balancer that starts out with the
// round-robin strategy and an empty node pool.
func NewLoadBalancer(logger *logrus.Logger) *LoadBalancer {
	return &LoadBalancer{
		strategy: NewRoundRobinStrategy(),
		nodes:    make(map[string]*Node),
		logger:   logger,
	}
}
// AddNode makes a node eligible for selection by this load balancer.
func (lb *LoadBalancer) AddNode(node *Node) {
	lb.mu.Lock()
	defer lb.mu.Unlock()
	id := node.ID
	lb.nodes[id] = node
	lb.logger.Debugf("Added node %s to load balancer", id)
}
// RemoveNode takes a node out of the selection pool; unknown IDs are a
// no-op (map delete).
func (lb *LoadBalancer) RemoveNode(nodeID string) {
	lb.mu.Lock()
	defer lb.mu.Unlock()
	delete(lb.nodes, nodeID)
	lb.logger.Debugf("Removed node %s from load balancer", nodeID)
}
// UpdateNode stores the latest view of a node (same write path as AddNode,
// without the debug log).
func (lb *LoadBalancer) UpdateNode(node *Node) {
	lb.mu.Lock()
	lb.nodes[node.ID] = node
	lb.mu.Unlock()
}
// SelectNode filters the pool down to online nodes with spare job capacity
// and delegates the final choice to the configured strategy. Returns an
// error when no node qualifies.
//
// Fix: nodes with a nil Metrics pointer previously caused a nil-pointer
// panic in the availability check; they are now treated as unavailable.
func (lb *LoadBalancer) SelectNode(request *LoadRequest) (*Node, error) {
	lb.mu.RLock()
	defer lb.mu.RUnlock()
	availableNodes := make(map[string]*Node)
	for id, node := range lb.nodes {
		if node.Status != NodeStatusOnline || node.Metrics == nil {
			continue
		}
		if node.Metrics.ActiveJobs < node.Metrics.MaxJobs {
			availableNodes[id] = node
		}
	}
	if len(availableNodes) == 0 {
		return nil, fmt.Errorf("no available nodes")
	}
	return lb.strategy.SelectNode(availableNodes, request)
}
// SetStrategy swaps the active load-balancing strategy at runtime.
func (lb *LoadBalancer) SetStrategy(strategy LoadBalancingStrategy) {
	lb.mu.Lock()
	defer lb.mu.Unlock()
	lb.strategy = strategy
	lb.logger.Infof("Load balancing strategy changed to: %s", strategy.GetName())
}
// NewAutoScaler creates an auto-scaler with its own cancellable context;
// the config is stored as-is (no defaulting is applied here).
func NewAutoScaler(config *AutoScalerConfig, logger *logrus.Logger) *AutoScaler {
	ctx, cancel := context.WithCancel(context.Background())
	return &AutoScaler{
		config:  config,
		nodes:   make(map[string]*Node),
		logger:  logger,
		enabled: config.Enabled,
		ctx:     ctx,
		cancel:  cancel,
	}
}
// Start launches the periodic scaling-check goroutine; a disabled
// auto-scaler logs and returns nil without starting anything.
func (as *AutoScaler) Start() error {
	if !as.enabled {
		as.logger.Info("Auto-scaler is disabled")
		return nil
	}
	as.logger.Info("Starting auto-scaler")
	go as.runScalingChecks() // ticks at config.CheckInterval
	return nil
}
// Stop cancels the auto-scaler's context, which ends runScalingChecks.
func (as *AutoScaler) Stop() error {
	as.logger.Info("Stopping auto-scaler")
	as.cancel()
	return nil
}
// runScalingChecks evaluates scaling on every tick of CheckInterval until
// the auto-scaler's context is cancelled.
func (as *AutoScaler) runScalingChecks() {
	ticker := time.NewTicker(as.config.CheckInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			as.checkScaling()
		case <-as.ctx.Done():
			return
		}
	}
}
// checkScaling is the per-tick scaling decision hook. It is currently a
// stub: it only logs at debug level and performs no scale-up or
// scale-down.
//
// NOTE(review): the comments suggest cluster status should come from the
// scaling manager, but AutoScaler holds no reference to it — confirm the
// intended wiring before implementing the real logic.
func (as *AutoScaler) checkScaling() {
	// Get current cluster status
	// This would typically come from the scaling manager
	// For now, we'll use placeholder logic
	as.logger.Debug("Running scaling check")
	// Check if we need to scale up
	// Check if we need to scale down
	// Implement scaling logic based on metrics
}

View file

@ -0,0 +1,458 @@
package performance
import (
"fmt"
"math/rand"
"sort"
"sync"
"time"
)
// RoundRobinStrategy implements round-robin load balancing over the
// ID-sorted node list.
type RoundRobinStrategy struct {
	currentIndex int // next position in the ID-sorted node list
	mu           sync.Mutex
}

// NewRoundRobinStrategy creates a new round-robin strategy.
func NewRoundRobinStrategy() *RoundRobinStrategy {
	return &RoundRobinStrategy{}
}

// SelectNode rotates through the nodes in ID order, advancing one position
// per call. The node set may change between calls, so the running index is
// taken modulo the current pool size.
func (rr *RoundRobinStrategy) SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error) {
	if len(nodes) == 0 {
		return nil, fmt.Errorf("no nodes available")
	}
	rr.mu.Lock()
	defer rr.mu.Unlock()
	ordered := make([]*Node, 0, len(nodes))
	for _, n := range nodes {
		ordered = append(ordered, n)
	}
	sort.Slice(ordered, func(a, b int) bool { return ordered[a].ID < ordered[b].ID })
	chosen := ordered[rr.currentIndex%len(ordered)]
	rr.currentIndex++
	return chosen, nil
}

// GetName returns the strategy name.
func (rr *RoundRobinStrategy) GetName() string {
	return "round_robin"
}
// LeastConnectionsStrategy implements least-connections load balancing.
type LeastConnectionsStrategy struct{}

// NewLeastConnectionsStrategy creates a new least connections strategy.
func NewLeastConnectionsStrategy() *LeastConnectionsStrategy {
	return &LeastConnectionsStrategy{}
}

// SelectNode picks the node whose metrics report the fewest active jobs;
// nodes without metrics are ignored. An error is returned when no node
// has metrics at all.
func (lc *LeastConnectionsStrategy) SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error) {
	if len(nodes) == 0 {
		return nil, fmt.Errorf("no nodes available")
	}
	var best *Node
	for _, node := range nodes {
		if node.Metrics == nil {
			continue
		}
		if best == nil || node.Metrics.ActiveJobs < best.Metrics.ActiveJobs {
			best = node
		}
	}
	if best == nil {
		return nil, fmt.Errorf("no suitable node found")
	}
	return best, nil
}

// GetName returns the strategy name.
func (lc *LeastConnectionsStrategy) GetName() string {
	return "least_connections"
}
// WeightedRoundRobinStrategy implements weighted round-robin load
// balancing: nodes are ordered by a computed weight (spare capacity,
// adjusted by the "priority" tag) and a rotating index walks that order.
type WeightedRoundRobinStrategy struct {
	currentIndex int
	mu           sync.Mutex
}

// NewWeightedRoundRobinStrategy creates a new weighted round-robin strategy.
func NewWeightedRoundRobinStrategy() *WeightedRoundRobinStrategy {
	return &WeightedRoundRobinStrategy{}
}

// SelectNode weights each node (default 1; spare job capacity when
// positive; doubled for priority=high, halved for priority=low), sorts by
// weight descending, and picks the next entry in rotation.
func (wr *WeightedRoundRobinStrategy) SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error) {
	if len(nodes) == 0 {
		return nil, fmt.Errorf("no nodes available")
	}
	wr.mu.Lock()
	defer wr.mu.Unlock()

	type entry struct {
		node   *Node
		weight int
	}
	entries := make([]entry, 0, len(nodes))
	for _, node := range nodes {
		w := 1 // default weight
		if m := node.Metrics; m != nil {
			// Favor nodes with more spare job capacity.
			if spare := m.MaxJobs - m.ActiveJobs; spare > 0 {
				w = spare
			}
		}
		// Indexing a nil Tags map safely yields "", matching neither case.
		switch node.Tags["priority"] {
		case "high":
			w *= 2
		case "low":
			w /= 2
		}
		entries = append(entries, entry{node: node, weight: w})
	}
	sort.Slice(entries, func(a, b int) bool { return entries[a].weight > entries[b].weight })
	chosen := entries[wr.currentIndex%len(entries)].node
	wr.currentIndex++
	return chosen, nil
}

// GetName returns the strategy name.
func (wr *WeightedRoundRobinStrategy) GetName() string {
	return "weighted_round_robin"
}
// RandomStrategy implements random load balancing with its own seeded
// generator, guarded by a mutex because rand.Rand is not safe for
// concurrent use.
type RandomStrategy struct {
	rand *rand.Rand
	mu   sync.Mutex
}

// NewRandomStrategy creates a new random strategy seeded from the clock.
func NewRandomStrategy() *RandomStrategy {
	return &RandomStrategy{
		rand: rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}

// SelectNode returns a uniformly random node from the candidate set.
func (r *RandomStrategy) SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error) {
	if len(nodes) == 0 {
		return nil, fmt.Errorf("no nodes available")
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	pool := make([]*Node, 0, len(nodes))
	for _, n := range nodes {
		pool = append(pool, n)
	}
	return pool[r.rand.Intn(len(pool))], nil
}

// GetName returns the strategy name.
func (r *RandomStrategy) GetName() string {
	return "random"
}
// LeastResponseTimeStrategy implements least-response-time load balancing.
type LeastResponseTimeStrategy struct{}

// NewLeastResponseTimeStrategy creates a new least response time strategy.
func NewLeastResponseTimeStrategy() *LeastResponseTimeStrategy {
	return &LeastResponseTimeStrategy{}
}

// SelectNode selects the node with the lowest load average, which this
// implementation uses as a proxy for response time (real response-time
// metrics are not collected yet). Nodes without metrics are skipped.
//
// Fix: the original seeded the running minimum with
// float64(^uint(0) >> 1) — that is the max int value mislabelled
// "Max float64" — so any node whose load average exceeded ~9.2e18 could
// never be selected even when it was the only candidate. Tracking the best
// node directly removes the sentinel and the edge case.
func (lrt *LeastResponseTimeStrategy) SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error) {
	if len(nodes) == 0 {
		return nil, fmt.Errorf("no nodes available")
	}
	var best *Node
	for _, node := range nodes {
		if node.Metrics == nil {
			continue
		}
		if best == nil || node.Metrics.LoadAverage < best.Metrics.LoadAverage {
			best = node
		}
	}
	if best == nil {
		return nil, fmt.Errorf("no suitable node found")
	}
	return best, nil
}

// GetName returns the strategy name.
func (lrt *LeastResponseTimeStrategy) GetName() string {
	return "least_response_time"
}
// IPHashStrategy implements IP-hash load balancing: the same client IP
// consistently maps to the same node while the pool is stable.
type IPHashStrategy struct{}

// NewIPHashStrategy creates a new IP hash strategy.
func NewIPHashStrategy() *IPHashStrategy {
	return &IPHashStrategy{}
}

// SelectNode hashes the client IP (request.Metadata["client_ip"],
// defaulting to 127.0.0.1) onto the ID-sorted node list.
func (ih *IPHashStrategy) SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error) {
	if len(nodes) == 0 {
		return nil, fmt.Errorf("no nodes available")
	}
	clientIP := "127.0.0.1" // fallback when the request carries no client IP
	if request.Metadata != nil {
		if ip, ok := request.Metadata["client_ip"].(string); ok {
			clientIP = ip
		}
	}
	ordered := make([]*Node, 0, len(nodes))
	for _, n := range nodes {
		ordered = append(ordered, n)
	}
	// Sort by ID so the hash maps deterministically for a fixed pool.
	sort.Slice(ordered, func(a, b int) bool { return ordered[a].ID < ordered[b].ID })
	idx := hashString(clientIP) % uint32(len(ordered))
	return ordered[idx], nil
}

// GetName returns the strategy name.
func (ih *IPHashStrategy) GetName() string {
	return "ip_hash"
}
// hashString computes a djb2-style hash (hash*33 + rune) over the string
// and returns the 32-bit result; the empty string hashes to 0.
func hashString(s string) uint32 {
	var h uint32
	for _, r := range s {
		h = h*33 + uint32(r) // (h<<5)+h == h*33
	}
	return h
}
// AdaptiveStrategy implements adaptive load balancing based on multiple
// factors (capacity, load, resource usage, tags, and request requirements
// — see calculateNodeScore).
type AdaptiveStrategy struct {
	mu sync.Mutex
}

// NewAdaptiveStrategy creates a new adaptive strategy.
func NewAdaptiveStrategy() *AdaptiveStrategy {
	return &AdaptiveStrategy{}
}

// SelectNode scores every node that has metrics via calculateNodeScore and
// returns the highest-scoring one; nodes without metrics are skipped.
func (a *AdaptiveStrategy) SelectNode(nodes map[string]*Node, request *LoadRequest) (*Node, error) {
	if len(nodes) == 0 {
		return nil, fmt.Errorf("no nodes available")
	}
	a.mu.Lock()
	defer a.mu.Unlock()
	var (
		best      *Node
		bestScore float64
	)
	for _, node := range nodes {
		if node.Metrics == nil {
			continue
		}
		if s := a.calculateNodeScore(node, request); best == nil || s > bestScore {
			best, bestScore = node, s
		}
	}
	if best == nil {
		return nil, fmt.Errorf("no suitable node found")
	}
	return best, nil
}
// calculateNodeScore rates a node for the given request. Scoring starts at
// a base of 100 and adds: up to 50 for spare job capacity, up to 30 for a
// low load average, up to 10 each for low CPU and memory usage, +/-20 for
// a "priority" tag of high/low, and +25 when the node's "architecture"
// capability matches the request's requirement.
func (a *AdaptiveStrategy) calculateNodeScore(node *Node, request *LoadRequest) float64 {
	score := 100.0
	if node.Metrics == nil {
		return score
	}
	m := node.Metrics
	// Factor 1: available capacity (higher is better).
	if m.MaxJobs > 0 {
		spare := float64(m.MaxJobs - m.ActiveJobs)
		score += spare / float64(m.MaxJobs) * 50 // up to 50 points
	}
	// Factor 2: system load (lower is better).
	if m.LoadAverage > 0 {
		loadScore := 100.0 - m.LoadAverage*10
		if loadScore < 0 {
			loadScore = 0
		}
		score += loadScore * 0.3 // up to 30 points
	}
	// Factor 3: resource usage (lower is better); up to 10 points each.
	score += (100.0 - m.CPUUsage) * 0.1
	score += (100.0 - m.MemoryUsage) * 0.1
	// Factor 4: priority tag adjustment (indexing a nil map safely yields "").
	switch node.Tags["priority"] {
	case "high":
		score += 20
	case "low":
		score -= 20
	}
	// Factor 5: request-specific architecture match.
	if request.Requirements != nil {
		want, wantOK := request.Requirements["architecture"].(string)
		have, haveOK := node.Capabilities["architecture"].(string)
		if wantOK && haveOK && want == have {
			score += 25 // bonus for architecture match
		}
	}
	return score
}

// GetName returns the strategy name.
func (a *AdaptiveStrategy) GetName() string {
	return "adaptive"
}
// StrategyFactory creates load balancing strategies by name.
type StrategyFactory struct{}

// NewStrategyFactory creates a new strategy factory.
func NewStrategyFactory() *StrategyFactory {
	return &StrategyFactory{}
}

// CreateStrategy returns a fresh strategy instance for the given name, or
// an error when the name is not one of GetAvailableStrategies.
func (sf *StrategyFactory) CreateStrategy(name string) (LoadBalancingStrategy, error) {
	switch name {
	case "round_robin":
		return NewRoundRobinStrategy(), nil
	case "least_connections":
		return NewLeastConnectionsStrategy(), nil
	case "weighted_round_robin":
		return NewWeightedRoundRobinStrategy(), nil
	case "random":
		return NewRandomStrategy(), nil
	case "least_response_time":
		return NewLeastResponseTimeStrategy(), nil
	case "ip_hash":
		return NewIPHashStrategy(), nil
	case "adaptive":
		return NewAdaptiveStrategy(), nil
	}
	return nil, fmt.Errorf("unknown strategy: %s", name)
}
// GetAvailableStrategies lists every name accepted by CreateStrategy;
// keep the two in sync when adding strategies.
func (sf *StrategyFactory) GetAvailableStrategies() []string {
	names := []string{
		"round_robin",
		"least_connections",
		"weighted_round_robin",
		"random",
		"least_response_time",
		"ip_hash",
		"adaptive",
	}
	return names
}