Add build analytics and operations CLI modules to the monitoring package
Some checks failed
Tests / 🛃 Unit tests (push) Failing after 13s
Tests / 🗄 DB tests (push) Failing after 19s
Tests / 🐍 Lint python scripts (push) Failing after 1s
Tests / ⌨ Golang Lint (push) Failing after 1s
Tests / 📦 Packit config lint (push) Failing after 1s
Tests / 🔍 Check source preparation (push) Failing after 1s
Tests / 🔍 Check for valid snapshot urls (push) Failing after 1s
Tests / 🔍 Check for missing or unused runner repos (push) Failing after 1s
Tests / 🐚 Shellcheck (push) Failing after 1s
Tests / 📦 RPMlint (push) Failing after 1s
Tests / Gitlab CI trigger helper (push) Failing after 1s
Tests / 🎀 kube-linter (push) Failing after 1s
Tests / 🧹 cloud-cleaner-is-enabled (push) Successful in 3s
Tests / 🔍 Check spec file osbuild/images dependencies (push) Failing after 1s

This commit is contained in:
robojerk 2025-08-26 10:34:42 -07:00
parent d228f6d30f
commit 4eeaa43c39
47 changed files with 21390 additions and 31 deletions

View file

@@ -0,0 +1,703 @@
package monitoring
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
"github.com/sirupsen/logrus"
)
// BuildAnalytics is the top-level facade that wires together build
// tracking, performance analysis, capacity planning, dashboard
// generation, and persistent storage.
type BuildAnalytics struct {
logger *logrus.Logger
config *AnalyticsConfig
buildTracker *BuildTracker
performance *PerformanceAnalyzer
capacity *CapacityPlanner
dashboard *AnalyticsDashboard
storage *AnalyticsStorage
// NOTE(review): mu is never referenced by any method visible in this
// file (sub-components carry their own locks) — confirm it is needed.
mu sync.RWMutex
}
// AnalyticsConfig configures the analytics subsystem, including where
// data, metrics, and dashboard artifacts are written and how long
// records are retained.
type AnalyticsConfig struct {
Enabled bool `json:"enabled"`
DataPath string `json:"data_path"`
RetentionDays int `json:"retention_days"`
MetricsPath string `json:"metrics_path"`
DashboardPath string `json:"dashboard_path"`
Metadata map[string]string `json:"metadata"`
}
// BuildTracker holds in-memory state for builds, workers, and queues,
// guarded by mu. Maps are keyed by the respective entity IDs/names.
type BuildTracker struct {
builds map[string]BuildRecord
workers map[string]WorkerStats
queues map[string]QueueStats
mu sync.RWMutex
}
// BuildRecord captures a single build's lifecycle: identity, timing,
// the worker that ran it, and resource usage.
type BuildRecord struct {
ID string `json:"id"`
Blueprint string `json:"blueprint"`
Variant string `json:"variant"`
// Status values observed in this file: "success", "failed",
// "running", "queued".
Status string `json:"status"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
Duration time.Duration `json:"duration"`
WorkerID string `json:"worker_id"`
Priority int `json:"priority"`
QueueTime time.Duration `json:"queue_time"`
ResourceUsage ResourceUsage `json:"resource_usage"`
Error string `json:"error,omitempty"`
Metadata map[string]interface{} `json:"metadata"`
}
// WorkerStats aggregates per-worker build counters and liveness info.
type WorkerStats struct {
ID string `json:"id"`
Status string `json:"status"`
CurrentBuild string `json:"current_build"`
TotalBuilds int `json:"total_builds"`
SuccessfulBuilds int `json:"successful_builds"`
FailedBuilds int `json:"failed_builds"`
Uptime time.Duration `json:"uptime"`
LastSeen time.Time `json:"last_seen"`
ResourceUsage ResourceUsage `json:"resource_usage"`
Metadata map[string]interface{} `json:"metadata"`
}
// QueueStats describes a single build queue's depth and throughput.
type QueueStats struct {
Name string `json:"name"`
Length int `json:"length"`
Priority int `json:"priority"`
AverageWaitTime time.Duration `json:"average_wait_time"`
TotalProcessed int `json:"total_processed"`
Metadata map[string]interface{} `json:"metadata"`
}
// ResourceUsage is a snapshot of resource consumption.
// NOTE(review): units are not specified here; the 60/80 thresholds in
// CapacityPlanner suggest CPUUsage is a percentage — confirm.
type ResourceUsage struct {
CPUUsage float64 `json:"cpu_usage"`
MemoryUsage float64 `json:"memory_usage"`
DiskUsage float64 `json:"disk_usage"`
NetworkIO float64 `json:"network_io"`
}
// PerformanceAnalyzer maintains performance trends and benchmarks,
// guarded by mu. Trend map keys follow the "<metric>_<timeRange>"
// convention (see GetPerformanceTrends and updateTrends).
type PerformanceAnalyzer struct {
trends map[string]PerformanceTrend
benchmarks map[string]Benchmark
mu sync.RWMutex
}
// PerformanceTrend is a time series for one metric over one time range,
// with a fitted direction ("increasing"/"decreasing"/"stable"/
// "insufficient_data"), slope, and confidence (see calculateTrend).
type PerformanceTrend struct {
Metric string `json:"metric"`
TimeRange string `json:"time_range"`
DataPoints []DataPoint `json:"data_points"`
Trend string `json:"trend"`
Slope float64 `json:"slope"`
Confidence float64 `json:"confidence"`
Metadata map[string]interface{} `json:"metadata"`
}
// DataPoint is a single timestamped sample in a trend series.
type DataPoint struct {
Timestamp time.Time `json:"timestamp"`
Value float64 `json:"value"`
}
// Benchmark compares a current measurement against a baseline.
// NOTE(review): nothing in this file populates benchmarks — confirm it
// is fed elsewhere.
type Benchmark struct {
Name string `json:"name"`
Description string `json:"description"`
Category string `json:"category"`
Baseline float64 `json:"baseline"`
Current float64 `json:"current"`
Improvement float64 `json:"improvement"`
Unit string `json:"unit"`
Metadata map[string]interface{} `json:"metadata"`
}
// CapacityPlanner keeps capacity recommendations and per-resource
// forecasts, guarded by mu. Forecast map keys follow the
// "<resource>_<timeRange>" convention (see updateForecasts).
type CapacityPlanner struct {
recommendations []CapacityRecommendation
forecasts map[string]CapacityForecast
mu sync.RWMutex
}
// CapacityRecommendation is an actionable suggestion (e.g. "scale_up")
// with a priority used for sorting: critical < high < medium < low.
type CapacityRecommendation struct {
ID string `json:"id"`
Type string `json:"type"`
Priority string `json:"priority"`
Description string `json:"description"`
Impact string `json:"impact"`
Effort string `json:"effort"`
Timeline string `json:"timeline"`
Metadata map[string]interface{} `json:"metadata"`
}
// CapacityForecast projects usage for one resource over a time range.
// RiskLevel is derived from ProjectedUsage: >80 high, >60 medium,
// otherwise low (see updateForecasts).
type CapacityForecast struct {
Resource string `json:"resource"`
TimeRange string `json:"time_range"`
CurrentUsage float64 `json:"current_usage"`
ProjectedUsage float64 `json:"projected_usage"`
PeakUsage float64 `json:"peak_usage"`
RiskLevel string `json:"risk_level"`
Metadata map[string]interface{} `json:"metadata"`
}
// AnalyticsDashboard holds dashboard configuration and templates,
// guarded by mu.
type AnalyticsDashboard struct {
config *DashboardConfig
templates map[string]DashboardTemplate
mu sync.RWMutex
}
// DashboardConfig describes dashboard appearance and widget layout.
type DashboardConfig struct {
RefreshInterval time.Duration `json:"refresh_interval"`
Theme string `json:"theme"`
Layout string `json:"layout"`
Widgets []DashboardWidget `json:"widgets"`
Metadata map[string]string `json:"metadata"`
}
// DashboardWidget is a single configurable widget on the dashboard.
type DashboardWidget struct {
ID string `json:"id"`
Type string `json:"type"`
Title string `json:"title"`
Position WidgetPosition `json:"position"`
Size WidgetSize `json:"size"`
Config map[string]interface{} `json:"config"`
Enabled bool `json:"enabled"`
Metadata map[string]interface{} `json:"metadata"`
}
// WidgetPosition is a widget's grid coordinate.
type WidgetPosition struct {
X int `json:"x"`
Y int `json:"y"`
}
// WidgetSize is a widget's extent in grid units.
type WidgetSize struct {
Width int `json:"width"`
Height int `json:"height"`
}
// AnalyticsStorage persists analytics records as JSON files under path.
// mu serializes filesystem access; retention is the record lifetime
// (NOTE(review): no pruning logic is visible in this file — confirm
// retention is enforced elsewhere).
type AnalyticsStorage struct {
path string
retention time.Duration
mu sync.RWMutex
}
// NewBuildAnalytics wires together the tracker, analyzer, planner,
// dashboard, and storage sub-components from the supplied configuration
// and logger. Retention is derived from config.RetentionDays.
func NewBuildAnalytics(config *AnalyticsConfig, logger *logrus.Logger) *BuildAnalytics {
	retention := time.Duration(config.RetentionDays) * 24 * time.Hour
	return &BuildAnalytics{
		logger:       logger,
		config:       config,
		buildTracker: NewBuildTracker(),
		performance:  NewPerformanceAnalyzer(),
		capacity:     NewCapacityPlanner(),
		dashboard:    NewAnalyticsDashboard(),
		storage:      NewAnalyticsStorage(config.DataPath, retention),
	}
}
// NewBuildTracker returns a tracker with empty build, worker, and
// queue maps ready for use.
func NewBuildTracker() *BuildTracker {
	tracker := new(BuildTracker)
	tracker.builds = make(map[string]BuildRecord)
	tracker.workers = make(map[string]WorkerStats)
	tracker.queues = make(map[string]QueueStats)
	return tracker
}
// NewPerformanceAnalyzer returns an analyzer with empty trend and
// benchmark maps.
func NewPerformanceAnalyzer() *PerformanceAnalyzer {
	analyzer := new(PerformanceAnalyzer)
	analyzer.trends = make(map[string]PerformanceTrend)
	analyzer.benchmarks = make(map[string]Benchmark)
	return analyzer
}
// NewCapacityPlanner returns a planner with no recommendations and an
// empty forecast map.
func NewCapacityPlanner() *CapacityPlanner {
	planner := new(CapacityPlanner)
	planner.recommendations = []CapacityRecommendation{}
	planner.forecasts = make(map[string]CapacityForecast)
	return planner
}
// NewAnalyticsDashboard returns a dashboard with a zero-value config
// and an empty template map.
func NewAnalyticsDashboard() *AnalyticsDashboard {
	dash := new(AnalyticsDashboard)
	dash.config = &DashboardConfig{}
	dash.templates = make(map[string]DashboardTemplate)
	return dash
}
// NewAnalyticsStorage returns storage rooted at path whose records are
// kept for the given retention duration.
func NewAnalyticsStorage(path string, retention time.Duration) *AnalyticsStorage {
	storage := new(AnalyticsStorage)
	storage.path = path
	storage.retention = retention
	return storage
}
// TrackBuild records a new build in the in-memory tracker, updates the
// owning worker's counters when that worker is already known, and
// persists the record to disk.
// NOTE(review): builds accumulate in memory without pruning — confirm
// this is bounded elsewhere.
func (ba *BuildAnalytics) TrackBuild(build BuildRecord) error {
	ba.logger.Infof("Tracking build: %s (blueprint: %s, variant: %s)", build.ID, build.Blueprint, build.Variant)

	ba.buildTracker.mu.Lock()
	defer ba.buildTracker.mu.Unlock()

	ba.buildTracker.builds[build.ID] = build

	// Workers are only updated if previously registered; unknown
	// worker IDs are ignored here.
	if worker, known := ba.buildTracker.workers[build.WorkerID]; known {
		worker.TotalBuilds++
		switch build.Status {
		case "success":
			worker.SuccessfulBuilds++
		case "failed":
			worker.FailedBuilds++
		}
		worker.LastSeen = time.Now()
		ba.buildTracker.workers[build.WorkerID] = worker
	}

	return ba.storage.storeBuildRecord(build)
}
// UpdateBuildStatus records the new status of a previously tracked
// build, computes its duration from StartTime to endTime, stores the
// optional error message, and persists the updated record.
// Trend and forecast recalculation run asynchronously.
//
// Returns an error if buildID is unknown or persistence fails.
//
// FIX: the last parameter was named `error`, shadowing the predeclared
// `error` type inside the function body; renamed to errMsg. Also
// restructured to early-return on the unknown-build case.
func (ba *BuildAnalytics) UpdateBuildStatus(buildID string, status string, endTime time.Time, errMsg string) error {
	ba.buildTracker.mu.Lock()
	defer ba.buildTracker.mu.Unlock()

	build, exists := ba.buildTracker.builds[buildID]
	if !exists {
		return fmt.Errorf("build not found: %s", buildID)
	}

	build.Status = status
	build.EndTime = endTime
	build.Duration = endTime.Sub(build.StartTime)
	if errMsg != "" {
		build.Error = errMsg
	}
	ba.buildTracker.builds[buildID] = build

	// Fire-and-forget refreshes; each takes its own lock.
	// NOTE(review): these goroutines are neither waited on nor
	// cancellable — confirm that is acceptable for shutdown.
	go ba.performance.updateTrends(build)
	go ba.capacity.updateForecasts(build)

	return ba.storage.updateBuildRecord(build)
}
// GetBuildStats aggregates build counts, total/average durations, and
// the success rate for builds whose StartTime falls inside the given
// time range ("1h", "24h", "7d", "30d"; anything else defaults to 24h).
// AverageBuildTime covers only completed (success/failed) builds;
// SuccessRate is successful/total over all matched builds, in percent.
func (ba *BuildAnalytics) GetBuildStats(timeRange string) *BuildStats {
ba.buildTracker.mu.RLock()
defer ba.buildTracker.mu.RUnlock()
stats := &BuildStats{
TimeRange: timeRange,
Timestamp: time.Now(),
Metadata: make(map[string]interface{}),
}
// Calculate time range
var startTime time.Time
switch timeRange {
case "1h":
startTime = time.Now().Add(-1 * time.Hour)
case "24h":
startTime = time.Now().Add(-24 * time.Hour)
case "7d":
startTime = time.Now().AddDate(0, 0, -7)
case "30d":
startTime = time.Now().AddDate(0, 0, -30)
default:
// Unrecognized ranges fall back to the last 24 hours.
startTime = time.Now().Add(-24 * time.Hour)
}
// Count builds by status
for _, build := range ba.buildTracker.builds {
if build.StartTime.After(startTime) {
switch build.Status {
case "success":
stats.SuccessfulBuilds++
case "failed":
stats.FailedBuilds++
case "running":
stats.RunningBuilds++
case "queued":
stats.QueuedBuilds++
}
stats.TotalBuilds++
// Running/queued builds contribute a zero Duration here.
stats.TotalDuration += build.Duration
// Track average build time
// AverageBuildTime is first used as a running sum, then divided
// below once CompletedBuilds is known.
if build.Status == "success" || build.Status == "failed" {
stats.AverageBuildTime += build.Duration
stats.CompletedBuilds++
}
}
}
// Calculate averages
if stats.CompletedBuilds > 0 {
stats.AverageBuildTime = stats.AverageBuildTime / time.Duration(stats.CompletedBuilds)
}
// Calculate success rate
if stats.TotalBuilds > 0 {
stats.SuccessRate = float64(stats.SuccessfulBuilds) / float64(stats.TotalBuilds) * 100.0
}
return stats
}
// GetPerformanceTrends returns a copy of the cached trend for the given
// metric and time range (key "<metric>_<timeRange>"), or a freshly
// generated placeholder trend when none is cached. The generated trend
// is not stored back into the cache.
func (ba *BuildAnalytics) GetPerformanceTrends(metric string, timeRange string) *PerformanceTrend {
	ba.performance.mu.RLock()
	defer ba.performance.mu.RUnlock()

	key := fmt.Sprintf("%s_%s", metric, timeRange)
	if cached, ok := ba.performance.trends[key]; ok {
		return &cached
	}
	return ba.performance.generateTrend(metric, timeRange)
}
// priorityRank orders recommendation priorities for sorting; lower is
// more urgent. Hoisted to package scope so the sort comparator does not
// allocate a fresh map on every single comparison (the original built
// the map inside the less-func).
var priorityRank = map[string]int{"critical": 0, "high": 1, "medium": 2, "low": 3}

// GetCapacityRecommendations returns a copy of the current capacity
// recommendations sorted by priority, most urgent first.
// NOTE(review): an unknown Priority string ranks as 0 (the map zero
// value), i.e. alongside "critical" — confirm that is intended.
func (ba *BuildAnalytics) GetCapacityRecommendations() []CapacityRecommendation {
	ba.capacity.mu.RLock()
	defer ba.capacity.mu.RUnlock()

	recommendations := make([]CapacityRecommendation, len(ba.capacity.recommendations))
	copy(recommendations, ba.capacity.recommendations)
	sort.Slice(recommendations, func(i, j int) bool {
		return priorityRank[recommendations[i].Priority] < priorityRank[recommendations[j].Priority]
	})
	return recommendations
}
// GetCapacityForecasts returns a snapshot copy of the current forecasts
// so callers can read it without holding the planner's lock.
func (ba *BuildAnalytics) GetCapacityForecasts() map[string]CapacityForecast {
	ba.capacity.mu.RLock()
	defer ba.capacity.mu.RUnlock()

	snapshot := make(map[string]CapacityForecast)
	for name, forecast := range ba.capacity.forecasts {
		snapshot[name] = forecast
	}
	return snapshot
}
// GenerateDashboard assembles a DashboardData snapshot from the current
// build stats (24h), performance trends (build_duration/7d), capacity
// recommendations, and worker stats, then best-effort persists it.
// A storage failure is logged but does not fail dashboard generation;
// the error return is currently always nil.
func (ba *BuildAnalytics) GenerateDashboard() (*DashboardData, error) {
ba.logger.Info("Generating analytics dashboard")
dashboard := &DashboardData{
Timestamp: time.Now(),
Widgets: make(map[string]WidgetData),
Metadata: make(map[string]interface{}),
}
// Generate build statistics widget
if buildStats := ba.GetBuildStats("24h"); buildStats != nil {
dashboard.Widgets["build_stats"] = WidgetData{
Type: "build_statistics",
Data: buildStats,
}
}
// Generate performance trends widget
if trends := ba.GetPerformanceTrends("build_duration", "7d"); trends != nil {
dashboard.Widgets["performance_trends"] = WidgetData{
Type: "performance_trends",
Data: trends,
}
}
// Generate capacity recommendations widget (omitted when empty)
if recommendations := ba.GetCapacityRecommendations(); len(recommendations) > 0 {
dashboard.Widgets["capacity_recommendations"] = WidgetData{
Type: "capacity_recommendations",
Data: recommendations,
}
}
// Generate worker status widget (omitted when no workers known)
if workerStats := ba.GetWorkerStats(); len(workerStats) > 0 {
dashboard.Widgets["worker_status"] = WidgetData{
Type: "worker_status",
Data: workerStats,
}
}
// Store dashboard data; failure here is non-fatal by design.
if err := ba.storage.storeDashboardData(dashboard); err != nil {
ba.logger.Warnf("Failed to store dashboard data: %v", err)
}
return dashboard, nil
}
// GetWorkerStats returns a snapshot copy of the per-worker statistics
// so callers can read it without holding the tracker's lock.
func (ba *BuildAnalytics) GetWorkerStats() map[string]WorkerStats {
	ba.buildTracker.mu.RLock()
	defer ba.buildTracker.mu.RUnlock()

	snapshot := make(map[string]WorkerStats)
	for id, stats := range ba.buildTracker.workers {
		snapshot[id] = stats
	}
	return snapshot
}
// PerformanceAnalyzer methods
func (pa *PerformanceAnalyzer) updateTrends(build BuildRecord) {
pa.mu.Lock()
defer pa.mu.Unlock()
// Update build duration trend
trendKey := "build_duration_7d"
if trend, exists := pa.trends[trendKey]; exists {
dataPoint := DataPoint{
Timestamp: build.EndTime,
Value: float64(build.Duration.Milliseconds()),
}
trend.DataPoints = append(trend.DataPoints, dataPoint)
// Keep only last 7 days of data
cutoff := time.Now().AddDate(0, 0, -7)
var filteredPoints []DataPoint
for _, point := range trend.DataPoints {
if point.Timestamp.After(cutoff) {
filteredPoints = append(filteredPoints, point)
}
}
trend.DataPoints = filteredPoints
// Calculate trend
trend = pa.calculateTrend(trend)
pa.trends[trendKey] = trend
}
}
// generateTrend builds an empty placeholder trend for the given metric
// and time range ("stable", zero slope/confidence, no data points).
// This is a placeholder for trend generation; in production, implement
// actual trend calculation logic.
func (pa *PerformanceAnalyzer) generateTrend(metric string, timeRange string) *PerformanceTrend {
	trend := PerformanceTrend{
		Metric:     metric,
		TimeRange:  timeRange,
		DataPoints: []DataPoint{},
		Trend:      "stable",
		Slope:      0.0,
		Confidence: 0.0,
		Metadata:   make(map[string]interface{}),
	}
	return &trend
}
// calculateTrend fits a simple least-squares line through the trend's
// data points (x = sample index, y = sample value) and classifies the
// direction by slope: > 0.1 increasing, < -0.1 decreasing, otherwise
// stable. With fewer than two points the trend is "insufficient_data".
func (pa *PerformanceAnalyzer) calculateTrend(trend PerformanceTrend) PerformanceTrend {
	if len(trend.DataPoints) < 2 {
		trend.Trend = "insufficient_data"
		return trend
	}

	// Simple linear regression over (index, value) pairs.
	n := float64(len(trend.DataPoints))
	var sx, sy, sxy, sxx float64
	for idx, point := range trend.DataPoints {
		x := float64(idx)
		sx += x
		sy += point.Value
		sxy += x * point.Value
		sxx += x * x
	}
	trend.Slope = (n*sxy - sx*sy) / (n*sxx - sx*sx)

	switch {
	case trend.Slope > 0.1:
		trend.Trend = "increasing"
	case trend.Slope < -0.1:
		trend.Trend = "decreasing"
	default:
		trend.Trend = "stable"
	}

	// Calculate confidence (simplified)
	trend.Confidence = 0.8 // Placeholder
	return trend
}
// CapacityPlanner methods
func (cp *CapacityPlanner) updateForecasts(build BuildRecord) {
cp.mu.Lock()
defer cp.mu.Unlock()
// Update resource usage forecasts
forecastKey := "cpu_usage_7d"
if forecast, exists := cp.forecasts[forecastKey]; exists {
// Update current usage based on build
forecast.CurrentUsage = build.ResourceUsage.CPUUsage
// Simple projection (in production, use more sophisticated forecasting)
forecast.ProjectedUsage = forecast.CurrentUsage * 1.1
// Determine risk level
if forecast.ProjectedUsage > 80.0 {
forecast.RiskLevel = "high"
} else if forecast.ProjectedUsage > 60.0 {
forecast.RiskLevel = "medium"
} else {
forecast.RiskLevel = "low"
}
cp.forecasts[forecastKey] = forecast
}
// Generate recommendations if needed
cp.generateRecommendations()
}
// generateRecommendations adds a scale-up recommendation when the CPU
// forecast's risk level is high. Caller must hold cp.mu.
//
// FIX: the original appended a brand-new recommendation on every call
// while risk stayed high, growing cp.recommendations without bound.
// We now skip the append if an equivalent open recommendation exists.
func (cp *CapacityPlanner) generateRecommendations() {
	// Check CPU usage
	forecast, exists := cp.forecasts["cpu_usage_7d"]
	if !exists || forecast.RiskLevel != "high" {
		return
	}

	// De-duplicate: one outstanding high-priority scale_up is enough.
	for _, existing := range cp.recommendations {
		if existing.Type == "scale_up" && existing.Priority == "high" {
			return
		}
	}

	cp.recommendations = append(cp.recommendations, CapacityRecommendation{
		ID:          generateRecommendationID(),
		Type:        "scale_up",
		Priority:    "high",
		Description: "CPU usage is projected to exceed 80% within 7 days",
		Impact:      "high",
		Effort:      "medium",
		Timeline:    "1-2 weeks",
		Metadata:    make(map[string]interface{}),
	})
}
// AnalyticsStorage methods
func (as *AnalyticsStorage) storeBuildRecord(build BuildRecord) error {
as.mu.Lock()
defer as.mu.Unlock()
// Create data directory if it doesn't exist
if err := os.MkdirAll(as.path, 0755); err != nil {
return fmt.Errorf("failed to create data directory: %w", err)
}
// Store build record with timestamp
timestamp := build.StartTime.Format("2006-01-02_15-04-05")
filename := filepath.Join(as.path, fmt.Sprintf("build_%s_%s.json", build.ID, timestamp))
data, err := json.MarshalIndent(build, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal build record: %w", err)
}
if err := os.WriteFile(filename, data, 0644); err != nil {
return fmt.Errorf("failed to write build record: %w", err)
}
return nil
}
// updateBuildRecord finds the on-disk JSON file for the given build
// (matched by the "build_<id>_" filename prefix) and rewrites it with
// the updated record. Returns an error if no matching file exists.
//
// FIX: every other AnalyticsStorage method holds as.mu while touching
// the data directory; this one did not, racing with storeBuildRecord
// and storeDashboardData. It now locks for consistency.
func (as *AnalyticsStorage) updateBuildRecord(build BuildRecord) error {
	as.mu.Lock()
	defer as.mu.Unlock()

	// Find and update existing build record file
	files, err := os.ReadDir(as.path)
	if err != nil {
		return fmt.Errorf("failed to read data directory: %w", err)
	}
	for _, file := range files {
		if strings.Contains(file.Name(), fmt.Sprintf("build_%s_", build.ID)) {
			filePath := filepath.Join(as.path, file.Name())
			data, err := json.MarshalIndent(build, "", " ")
			if err != nil {
				return fmt.Errorf("failed to marshal updated build record: %w", err)
			}
			if err := os.WriteFile(filePath, data, 0644); err != nil {
				return fmt.Errorf("failed to update build record: %w", err)
			}
			return nil
		}
	}
	return fmt.Errorf("build record file not found for ID: %s", build.ID)
}
// storeDashboardData writes the dashboard snapshot as indented JSON to
// dashboard/dashboard_<timestamp>.json under the storage path,
// creating the subdirectory if needed.
func (as *AnalyticsStorage) storeDashboardData(dashboard *DashboardData) error {
	as.mu.Lock()
	defer as.mu.Unlock()

	dir := filepath.Join(as.path, "dashboard")
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("failed to create dashboard directory: %w", err)
	}

	stamp := dashboard.Timestamp.Format("2006-01-02_15-04-05")
	target := filepath.Join(dir, fmt.Sprintf("dashboard_%s.json", stamp))

	payload, err := json.MarshalIndent(dashboard, "", " ")
	if err != nil {
		return fmt.Errorf("failed to marshal dashboard data: %w", err)
	}
	if err := os.WriteFile(target, payload, 0644); err != nil {
		return fmt.Errorf("failed to write dashboard data: %w", err)
	}
	return nil
}
// Dashboard types
type DashboardData struct {
Timestamp time.Time `json:"timestamp"`
Widgets map[string]WidgetData `json:"widgets"`
Metadata map[string]interface{} `json:"metadata"`
}
type WidgetData struct {
Type string `json:"type"`
Data interface{} `json:"data"`
}
type DashboardTemplate struct {
ID string `json:"id"`
Name string `json:"name"`
Template string `json:"template"`
Metadata map[string]interface{} `json:"metadata"`
}
type BuildStats struct {
TimeRange string `json:"time_range"`
Timestamp time.Time `json:"timestamp"`
TotalBuilds int `json:"total_builds"`
SuccessfulBuilds int `json:"successful_builds"`
FailedBuilds int `json:"failed_builds"`
RunningBuilds int `json:"running_builds"`
QueuedBuilds int `json:"queued_builds"`
CompletedBuilds int `json:"completed_builds"`
TotalDuration time.Duration `json:"total_duration"`
AverageBuildTime time.Duration `json:"average_build_time"`
SuccessRate float64 `json:"success_rate"`
Metadata map[string]interface{} `json:"metadata"`
}
// Helper functions
func generateRecommendationID() string {
return fmt.Sprintf("rec-%d", time.Now().UnixNano())
}

View file

@@ -0,0 +1,559 @@
package monitoring
import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
)
// OperationsCLI provides command-line interface for operations management.
// The manager field is populated lazily by initializeManager (invoked
// from the root command's PersistentPreRunE), so it is nil until a
// command actually runs.
type OperationsCLI struct {
manager *OperationsManager
configPath string
logger *logrus.Logger
}
// NewOperationsCLI creates a new operations CLI bound to the given
// configuration path and logger. The operations manager itself is not
// created here; it is initialized before any subcommand runs.
func NewOperationsCLI(configPath string, logger *logrus.Logger) *OperationsCLI {
	cli := new(OperationsCLI)
	cli.configPath = configPath
	cli.logger = logger
	return cli
}
// CreateRootCommand creates the root "operations" command, wires the
// manager initialization into PersistentPreRunE, and attaches the
// backup, recovery, testing, config, and status command groups.
func (cli *OperationsCLI) CreateRootCommand() *cobra.Command {
	root := &cobra.Command{
		Use:   "operations",
		Short: "Debian Forge Operations Management",
		Long:  "Manage backup, recovery, and testing operations for Debian Forge",
		PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
			return cli.initializeManager()
		},
	}
	root.AddCommand(
		cli.createBackupCommand(),
		cli.createRecoveryCommand(),
		cli.createTestingCommand(),
		cli.createConfigCommand(),
		cli.createStatusCommand(),
	)
	return root
}
// initializeManager loads and validates the operations configuration
// from cli.configPath, then constructs the operations manager.
// Called before every subcommand via the root PersistentPreRunE, so a
// configuration error surfaces before any operation runs.
func (cli *OperationsCLI) initializeManager() error {
// Load configuration
config, err := LoadOperationsConfig(cli.configPath)
if err != nil {
return fmt.Errorf("failed to load configuration: %w", err)
}
// Validate configuration before handing it to the manager.
configManager := &OperationsConfigManager{configPath: cli.configPath, config: config}
if err := configManager.ValidateConfig(); err != nil {
return fmt.Errorf("configuration validation failed: %w", err)
}
// Create operations manager
cli.manager = NewOperationsManager(config, cli.logger)
return nil
}
// createBackupCommand creates the "backup" command group with its
// create, list, and schedule subcommands.
func (cli *OperationsCLI) createBackupCommand() *cobra.Command {
	group := &cobra.Command{
		Use:   "backup",
		Short: "Manage backup operations",
		Long:  "Create, list, and manage backup operations",
	}
	group.AddCommand(
		&cobra.Command{
			Use:   "create [strategy]",
			Short: "Create a new backup",
			Long:  "Create a new backup using the specified strategy",
			Args:  cobra.ExactArgs(1),
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.createBackup(args[0])
			},
		},
		&cobra.Command{
			Use:   "list",
			Short: "List available backups",
			Long:  "List all available backup strategies and recent backups",
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.listBackups()
			},
		},
		&cobra.Command{
			Use:   "schedule [schedule]",
			Short: "Schedule a backup",
			Long:  "Schedule a backup using the specified schedule",
			Args:  cobra.ExactArgs(1),
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.scheduleBackup(args[0])
			},
		},
	)
	return group
}
// createRecoveryCommand creates the "recovery" command group with its
// execute, list, and show subcommands.
func (cli *OperationsCLI) createRecoveryCommand() *cobra.Command {
	group := &cobra.Command{
		Use:   "recovery",
		Short: "Manage recovery operations",
		Long:  "Execute recovery plans and manage recovery procedures",
	}
	group.AddCommand(
		&cobra.Command{
			Use:   "execute [plan] [backup]",
			Short: "Execute a recovery plan",
			Long:  "Execute a recovery plan using the specified backup",
			Args:  cobra.ExactArgs(2),
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.executeRecovery(args[0], args[1])
			},
		},
		&cobra.Command{
			Use:   "list",
			Short: "List recovery plans",
			Long:  "List all available recovery plans",
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.listRecoveryPlans()
			},
		},
		&cobra.Command{
			Use:   "show [procedure]",
			Short: "Show recovery procedure details",
			Long:  "Show detailed information about a recovery procedure",
			Args:  cobra.ExactArgs(1),
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.showRecoveryProcedure(args[0])
			},
		},
	)
	return group
}
// createTestingCommand creates the "testing" command group with its
// run, list, and results subcommands.
func (cli *OperationsCLI) createTestingCommand() *cobra.Command {
	group := &cobra.Command{
		Use:   "testing",
		Short: "Manage recovery testing",
		Long:  "Run and manage recovery testing scenarios",
	}
	group.AddCommand(
		&cobra.Command{
			Use:   "run [scenario]",
			Short: "Run a test scenario",
			Long:  "Run a recovery test scenario",
			Args:  cobra.ExactArgs(1),
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.runTest(args[0])
			},
		},
		&cobra.Command{
			Use:   "list",
			Short: "List test scenarios",
			Long:  "List all available test scenarios",
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.listTestScenarios()
			},
		},
		&cobra.Command{
			Use:   "results [test-id]",
			Short: "Show test results",
			Long:  "Show results for a specific test",
			Args:  cobra.ExactArgs(1),
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.showTestResults(args[0])
			},
		},
	)
	return group
}
// createConfigCommand creates the "config" command group with its
// show, update, and validate subcommands.
func (cli *OperationsCLI) createConfigCommand() *cobra.Command {
	group := &cobra.Command{
		Use:   "config",
		Short: "Manage operations configuration",
		Long:  "View and modify operations configuration",
	}
	group.AddCommand(
		&cobra.Command{
			Use:   "show",
			Short: "Show current configuration",
			Long:  "Show current operations configuration",
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.showConfig()
			},
		},
		&cobra.Command{
			Use:   "update [key] [value]",
			Short: "Update configuration",
			Long:  "Update a configuration value",
			Args:  cobra.ExactArgs(2),
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.updateConfig(args[0], args[1])
			},
		},
		&cobra.Command{
			Use:   "validate",
			Short: "Validate configuration",
			Long:  "Validate current configuration",
			RunE: func(cmd *cobra.Command, args []string) error {
				return cli.validateConfig()
			},
		},
	)
	return group
}
// createStatusCommand creates the "status" command, which reports the
// current state of the operations systems.
func (cli *OperationsCLI) createStatusCommand() *cobra.Command {
	return &cobra.Command{
		Use:   "status",
		Short: "Show operations status",
		Long:  "Show current status of operations systems",
		RunE: func(cmd *cobra.Command, args []string) error {
			return cli.showStatus()
		},
	}
}
// Backup operations

// createBackup runs a backup through the manager using the given
// strategy ID and prints the resulting job's details to stdout.
// The checksum line is only printed when the job produced one.
func (cli *OperationsCLI) createBackup(strategyID string) error {
cli.logger.Infof("Creating backup using strategy: %s", strategyID)
job, err := cli.manager.backup.CreateBackup(strategyID)
if err != nil {
return fmt.Errorf("backup creation failed: %w", err)
}
fmt.Printf("Backup created successfully:\n")
fmt.Printf(" ID: %s\n", job.ID)
fmt.Printf(" Strategy: %s\n", job.StrategyID)
fmt.Printf(" Status: %s\n", job.Status)
fmt.Printf(" Size: %d bytes\n", job.Size)
fmt.Printf(" Duration: %v\n", job.Duration)
fmt.Printf(" Path: %s\n", job.Path)
if job.Checksum != "" {
fmt.Printf(" Checksum: %s\n", job.Checksum)
}
return nil
}
// listBackups prints every configured backup strategy followed by every
// backup schedule to stdout. Iteration order over the maps is random.
// NOTE(review): reads cli.manager.backup maps without visible locking —
// confirm the backup manager is safe for concurrent access.
func (cli *OperationsCLI) listBackups() error {
fmt.Printf("Available Backup Strategies:\n")
fmt.Printf("============================\n")
for id, strategy := range cli.manager.backup.strategies {
fmt.Printf(" %s:\n", id)
fmt.Printf(" Name: %s\n", strategy.Name)
fmt.Printf(" Description: %s\n", strategy.Description)
fmt.Printf(" Type: %s\n", strategy.Type)
fmt.Printf(" Enabled: %t\n", strategy.Enabled)
fmt.Printf(" Compression: %t\n", strategy.Compression)
fmt.Printf(" Encryption: %t\n", strategy.Encryption)
fmt.Printf(" Paths: %v\n", strategy.Paths)
fmt.Printf(" Exclude: %v\n", strategy.Exclude)
fmt.Printf("\n")
}
fmt.Printf("Backup Schedules:\n")
fmt.Printf("=================\n")
for id, schedule := range cli.manager.backup.schedules {
fmt.Printf(" %s:\n", id)
fmt.Printf(" Name: %s\n", schedule.Name)
fmt.Printf(" Description: %s\n", schedule.Description)
fmt.Printf(" Type: %s\n", schedule.Type)
fmt.Printf(" Interval: %v\n", schedule.Interval)
fmt.Printf(" Enabled: %t\n", schedule.Enabled)
fmt.Printf(" Next Run: %v\n", schedule.NextRun)
fmt.Printf("\n")
}
return nil
}
// scheduleBackup validates that the named schedule exists and is
// enabled, then prints its details. This is currently a dry run: no
// timer or job is actually registered (see the comment below).
func (cli *OperationsCLI) scheduleBackup(scheduleID string) error {
schedule, exists := cli.manager.backup.schedules[scheduleID]
if !exists {
return fmt.Errorf("backup schedule not found: %s", scheduleID)
}
if !schedule.Enabled {
return fmt.Errorf("backup schedule is disabled: %s", scheduleID)
}
fmt.Printf("Scheduling backup for: %s\n", schedule.Name)
fmt.Printf(" Type: %s\n", schedule.Type)
fmt.Printf(" Interval: %v\n", schedule.Interval)
fmt.Printf(" Next Run: %v\n", schedule.NextRun)
// In production, this would actually schedule the backup
cli.logger.Infof("Backup scheduled for: %s", scheduleID)
return nil
}
// Recovery operations

// executeRecovery runs the named recovery plan against the given
// backup via the recovery manager and reports success on stdout.
func (cli *OperationsCLI) executeRecovery(planID string, backupID string) error {
	cli.logger.Infof("Executing recovery plan: %s with backup: %s", planID, backupID)

	err := cli.manager.recovery.ExecuteRecovery(planID, backupID)
	if err != nil {
		return fmt.Errorf("recovery execution failed: %w", err)
	}

	fmt.Printf("Recovery plan executed successfully: %s\n", planID)
	return nil
}
// listRecoveryPlans prints every configured recovery plan (including
// its RTO/RPO targets and procedure list) to stdout. Map iteration
// order is random.
func (cli *OperationsCLI) listRecoveryPlans() error {
fmt.Printf("Available Recovery Plans:\n")
fmt.Printf("=========================\n")
for id, plan := range cli.manager.recovery.plans {
fmt.Printf(" %s:\n", id)
fmt.Printf(" Name: %s\n", plan.Name)
fmt.Printf(" Description: %s\n", plan.Description)
fmt.Printf(" Priority: %s\n", plan.Priority)
fmt.Printf(" RTO: %v\n", plan.RTO)
fmt.Printf(" RPO: %v\n", plan.RPO)
fmt.Printf(" Enabled: %t\n", plan.Enabled)
fmt.Printf(" Procedures: %v\n", plan.Procedures)
fmt.Printf("\n")
}
return nil
}
// showRecoveryProcedure prints the full details of one recovery
// procedure, including its numbered steps and each step's optional
// rollback command. Returns an error for an unknown procedure ID.
func (cli *OperationsCLI) showRecoveryProcedure(procedureID string) error {
procedure, exists := cli.manager.recovery.procedures[procedureID]
if !exists {
return fmt.Errorf("recovery procedure not found: %s", procedureID)
}
fmt.Printf("Recovery Procedure: %s\n", procedure.Name)
fmt.Printf("=====================\n")
fmt.Printf(" ID: %s\n", procedure.ID)
fmt.Printf(" Description: %s\n", procedure.Description)
fmt.Printf(" Type: %s\n", procedure.Type)
fmt.Printf(" Risk Level: %s\n", procedure.RiskLevel)
fmt.Printf(" Estimated Time: %v\n", procedure.EstimatedTime)
fmt.Printf(" Enabled: %t\n", procedure.Enabled)
fmt.Printf(" Prerequisites: %v\n", procedure.Prerequisites)
fmt.Printf("\n Steps:\n")
for i, step := range procedure.Steps {
fmt.Printf(" %d. %s\n", i+1, step.Name)
fmt.Printf(" Description: %s\n", step.Description)
fmt.Printf(" Command: %s %v\n", step.Command, step.Args)
fmt.Printf(" Timeout: %v\n", step.Timeout)
// Rollback is optional; only shown when defined.
if step.Rollback != "" {
fmt.Printf(" Rollback: %s\n", step.Rollback)
}
fmt.Printf("\n")
}
return nil
}
// Testing operations

// runTest executes the named recovery test scenario via the testing
// manager and prints the result summary to stdout.
func (cli *OperationsCLI) runTest(scenarioID string) error {
cli.logger.Infof("Running test scenario: %s", scenarioID)
result, err := cli.manager.testing.RunTest(scenarioID)
if err != nil {
return fmt.Errorf("test execution failed: %w", err)
}
fmt.Printf("Test scenario completed successfully:\n")
fmt.Printf(" ID: %s\n", result.ID)
fmt.Printf(" Scenario: %s\n", result.ScenarioID)
fmt.Printf(" Status: %s\n", result.Status)
fmt.Printf(" Duration: %v\n", result.Duration)
fmt.Printf(" Results: %v\n", result.Results)
return nil
}
// listTestScenarios prints every configured recovery test scenario to
// stdout. Map iteration order is random.
func (cli *OperationsCLI) listTestScenarios() error {
fmt.Printf("Available Test Scenarios:\n")
fmt.Printf("=========================\n")
for id, scenario := range cli.manager.testing.scenarios {
fmt.Printf(" %s:\n", id)
fmt.Printf(" Name: %s\n", scenario.Name)
fmt.Printf(" Description: %s\n", scenario.Description)
fmt.Printf(" Type: %s\n", scenario.Type)
fmt.Printf(" Enabled: %t\n", scenario.Enabled)
fmt.Printf(" Steps: %d\n", len(scenario.Steps))
fmt.Printf(" Expected: %v\n", scenario.Expected)
fmt.Printf("\n")
}
return nil
}
// showTestResults prints the stored result for one test run, including
// timing, outcome, and any recorded error. Returns an error for an
// unknown test ID.
func (cli *OperationsCLI) showTestResults(testID string) error {
result, exists := cli.manager.testing.results[testID]
if !exists {
return fmt.Errorf("test result not found: %s", testID)
}
fmt.Printf("Test Result: %s\n", testID)
fmt.Printf("============\n")
fmt.Printf(" Scenario: %s\n", result.ScenarioID)
fmt.Printf(" Status: %s\n", result.Status)
fmt.Printf(" Start Time: %v\n", result.StartTime)
fmt.Printf(" End Time: %v\n", result.EndTime)
fmt.Printf(" Duration: %v\n", result.Duration)
// Error details are optional; only shown when present.
if result.Error != "" {
fmt.Printf(" Error: %s\n", result.Error)
}
fmt.Printf(" Results: %v\n", result.Results)
fmt.Printf(" Metadata: %v\n", result.Metadata)
return nil
}
// Configuration operations
// showConfig prints the currently loaded operations configuration.
// Returns an error when the manager has no configuration loaded.
func (cli *OperationsCLI) showConfig() error {
	if cli.manager.config == nil {
		return fmt.Errorf("no configuration loaded")
	}
	fmt.Printf("Operations Configuration:\n")
	fmt.Printf("========================\n")
	fmt.Printf(" Enabled: %t\n", cli.manager.config.Enabled)
	fmt.Printf(" Backup Path: %s\n", cli.manager.config.BackupPath)
	fmt.Printf(" Recovery Path: %s\n", cli.manager.config.RecoveryPath)
	fmt.Printf(" Retention Days: %d\n", cli.manager.config.RetentionDays)
	fmt.Printf(" Compression: %t\n", cli.manager.config.Compression)
	fmt.Printf(" Encryption: %t\n", cli.manager.config.Encryption)
	// Metadata is optional; only print the section when present.
	if len(cli.manager.config.Metadata) > 0 {
		fmt.Printf(" Metadata:\n")
		for key, value := range cli.manager.config.Metadata {
			fmt.Printf(" %s: %s\n", key, value)
		}
	}
	return nil
}
// updateConfig parses value according to the expected type of key,
// applies the single-key update through the config manager and persists
// it. Supported keys: enabled, compression, encryption (bool),
// retention_days (int), backup_path, recovery_path (string).
func (cli *OperationsCLI) updateConfig(key string, value string) error {
	cm := &OperationsConfigManager{configPath: cli.configPath, config: cli.manager.config}
	updates := map[string]interface{}{}
	switch key {
	case "enabled", "compression", "encryption":
		parsed, err := strconv.ParseBool(value)
		if err != nil {
			return fmt.Errorf("invalid boolean value for %s: %s", key, value)
		}
		updates[key] = parsed
	case "retention_days":
		parsed, err := strconv.Atoi(value)
		if err != nil {
			return fmt.Errorf("invalid integer value for %s: %s", key, value)
		}
		updates[key] = parsed
	case "backup_path", "recovery_path":
		updates[key] = value
	default:
		return fmt.Errorf("unknown configuration key: %s", key)
	}
	if err := cm.UpdateConfig(updates); err != nil {
		return fmt.Errorf("failed to update configuration: %w", err)
	}
	fmt.Printf("Configuration updated: %s = %s\n", key, value)
	return nil
}
// validateConfig runs the configuration manager's validation over the
// currently loaded configuration and reports the outcome.
func (cli *OperationsCLI) validateConfig() error {
	cm := &OperationsConfigManager{configPath: cli.configPath, config: cli.manager.config}
	err := cm.ValidateConfig()
	if err != nil {
		return fmt.Errorf("configuration validation failed: %w", err)
	}
	fmt.Printf("Configuration validation passed\n")
	return nil
}
// Status operations
// showStatus prints a summary of each operations subsystem (backup,
// recovery, testing, persistence): registered item counts and key
// settings. The "Active" status strings are hard-coded; no health
// probing is performed here.
func (cli *OperationsCLI) showStatus() error {
	fmt.Printf("Operations System Status:\n")
	fmt.Printf("=========================\n")
	// Backup system status
	fmt.Printf("Backup System:\n")
	fmt.Printf(" Status: Active\n")
	fmt.Printf(" Strategies: %d\n", len(cli.manager.backup.strategies))
	fmt.Printf(" Schedules: %d\n", len(cli.manager.backup.schedules))
	fmt.Printf(" Storage Path: %s\n", cli.manager.backup.storage.path)
	// Recovery system status
	fmt.Printf("\nRecovery System:\n")
	fmt.Printf(" Status: Active\n")
	fmt.Printf(" Procedures: %d\n", len(cli.manager.recovery.procedures))
	fmt.Printf(" Plans: %d\n", len(cli.manager.recovery.plans))
	// Testing system status
	fmt.Printf("\nTesting System:\n")
	fmt.Printf(" Status: Active\n")
	fmt.Printf(" Scenarios: %d\n", len(cli.manager.testing.scenarios))
	fmt.Printf(" Results: %d\n", len(cli.manager.testing.results))
	// Data persistence status
	fmt.Printf("\nData Persistence:\n")
	fmt.Printf(" Status: Active\n")
	fmt.Printf(" Replication: %t\n", cli.manager.persistence.config.Replication)
	fmt.Printf(" Replica Count: %d\n", cli.manager.persistence.config.ReplicaCount)
	return nil
}

View file

@ -0,0 +1,235 @@
package monitoring
import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"time"
)
// OperationsConfigManager handles loading and saving operations configuration
// to and from a single JSON file on disk.
type OperationsConfigManager struct {
	configPath string            // absolute or relative path of the JSON config file
	config     *OperationsConfig // in-memory configuration; nil until Load succeeds
}
// LoadOperationsConfig loads the operations configuration stored at
// configPath, creating and persisting a default configuration when the
// file does not exist yet.
func LoadOperationsConfig(configPath string) (*OperationsConfig, error) {
	return (&OperationsConfigManager{configPath: configPath}).Load()
}
// Load reads and parses the configuration file at ocm.configPath. When
// the file does not exist, a default configuration is created, written
// to disk and returned instead.
func (ocm *OperationsConfigManager) Load() (*OperationsConfig, error) {
	// Missing file: fall back to defaults and persist them immediately.
	if _, statErr := os.Stat(ocm.configPath); os.IsNotExist(statErr) {
		ocm.config = ocm.createDefaultConfig()
		return ocm.config, ocm.Save()
	}
	raw, readErr := os.ReadFile(ocm.configPath)
	if readErr != nil {
		return nil, fmt.Errorf("failed to read config file: %w", readErr)
	}
	parsed := &OperationsConfig{}
	if err := json.Unmarshal(raw, parsed); err != nil {
		return nil, fmt.Errorf("failed to parse config file: %w", err)
	}
	ocm.config = parsed
	return ocm.config, nil
}
// Save writes the current in-memory configuration to ocm.configPath as
// indented JSON, creating the parent directory first when necessary.
// Returns an error when no configuration is loaded.
func (ocm *OperationsConfigManager) Save() error {
	if ocm.config == nil {
		return fmt.Errorf("no configuration to save")
	}
	// BUG FIX: the previous code converted the config path to os.DirEntry
	// (invalid — it does not compile) and called MkdirAll on the *file*
	// path, which would have created a directory where the file must be
	// written. Create the parent directory instead.
	if err := os.MkdirAll(filepath.Dir(ocm.configPath), 0755); err != nil {
		return fmt.Errorf("failed to create config directory: %w", err)
	}
	// Marshal configuration with indentation for human readability.
	data, err := json.MarshalIndent(ocm.config, "", " ")
	if err != nil {
		return fmt.Errorf("failed to marshal config: %w", err)
	}
	// Write to file (world-readable; config holds no secrets by default).
	if err := os.WriteFile(ocm.configPath, data, 0644); err != nil {
		return fmt.Errorf("failed to write config file: %w", err)
	}
	return nil
}
// UpdateConfig applies the given key/value updates to the loaded
// configuration and persists the result via Save. Unknown keys and
// values of an unexpected dynamic type are silently ignored, matching
// the original lenient behavior.
func (ocm *OperationsConfigManager) UpdateConfig(updates map[string]interface{}) error {
	if ocm.config == nil {
		return fmt.Errorf("no configuration loaded")
	}
	for name, raw := range updates {
		switch name {
		case "enabled":
			if v, ok := raw.(bool); ok {
				ocm.config.Enabled = v
			}
		case "backup_path":
			if v, ok := raw.(string); ok {
				ocm.config.BackupPath = v
			}
		case "recovery_path":
			if v, ok := raw.(string); ok {
				ocm.config.RecoveryPath = v
			}
		case "retention_days":
			if v, ok := raw.(int); ok {
				ocm.config.RetentionDays = v
			}
		case "compression":
			if v, ok := raw.(bool); ok {
				ocm.config.Compression = v
			}
		case "encryption":
			if v, ok := raw.(bool); ok {
				ocm.config.Encryption = v
			}
		case "metadata":
			if v, ok := raw.(map[string]string); ok {
				ocm.config.Metadata = v
			}
		}
	}
	// Persist the updated configuration.
	return ocm.Save()
}
// createDefaultConfig builds the default operations configuration used
// when no config file exists yet: enabled, 30-day retention, compression
// on, encryption off, with paths under /var/lib/debian-forge.
func (ocm *OperationsConfigManager) createDefaultConfig() *OperationsConfig {
	return &OperationsConfig{
		Enabled:       true,
		BackupPath:    "/var/lib/debian-forge/backups",
		RecoveryPath:  "/var/lib/debian-forge/recovery",
		RetentionDays: 30,
		Compression:   true,
		Encryption:    false,
		Metadata: map[string]string{
			"version":     "1.0.0",
			"created":     time.Now().Format(time.RFC3339), // creation timestamp, RFC 3339
			"description": "Default operations configuration for Debian Forge",
		},
	}
}
// ValidateConfig checks that the loaded configuration is complete:
// both paths must be set and absolute, and retention must be positive.
// The first failing rule is reported.
func (ocm *OperationsConfigManager) ValidateConfig() error {
	if ocm.config == nil {
		return fmt.Errorf("no configuration loaded")
	}
	switch {
	case ocm.config.BackupPath == "":
		return fmt.Errorf("backup path is required")
	case ocm.config.RecoveryPath == "":
		return fmt.Errorf("recovery path is required")
	case ocm.config.RetentionDays <= 0:
		return fmt.Errorf("retention days must be positive")
	case !isAbsolutePath(ocm.config.BackupPath):
		return fmt.Errorf("backup path must be absolute")
	case !isAbsolutePath(ocm.config.RecoveryPath):
		return fmt.Errorf("recovery path must be absolute")
	}
	return nil
}
// isAbsolutePath reports whether path is absolute, i.e. non-empty and
// starting with a forward slash (Unix-style paths only).
func isAbsolutePath(path string) bool {
	if path == "" {
		return false
	}
	return path[0] == '/'
}
// GetBackupConfig derives the backup subsystem configuration from the
// loaded operations configuration; returns nil when nothing is loaded.
// AutoBackup is fixed to true here.
func (ocm *OperationsConfigManager) GetBackupConfig() *BackupConfig {
	if ocm.config == nil {
		return nil
	}
	cfg := &BackupConfig{
		Enabled:       ocm.config.Enabled,
		AutoBackup:    true,
		BackupPath:    ocm.config.BackupPath,
		RetentionDays: ocm.config.RetentionDays,
		Compression:   ocm.config.Compression,
		Encryption:    ocm.config.Encryption,
		Metadata:      ocm.config.Metadata,
	}
	return cfg
}

// GetRecoveryConfig derives the recovery subsystem configuration;
// returns nil when nothing is loaded. AutoRecovery is fixed to false
// and recovery testing to true.
func (ocm *OperationsConfigManager) GetRecoveryConfig() *RecoveryConfig {
	if ocm.config == nil {
		return nil
	}
	cfg := &RecoveryConfig{
		Enabled:      ocm.config.Enabled,
		AutoRecovery: false,
		RecoveryPath: ocm.config.RecoveryPath,
		Testing:      true,
		Metadata:     ocm.config.Metadata,
	}
	return cfg
}

// GetPersistenceConfig derives the data-persistence configuration;
// returns nil when nothing is loaded. Replication defaults to 3 async
// replicas.
func (ocm *OperationsConfigManager) GetPersistenceConfig() *PersistenceConfig {
	if ocm.config == nil {
		return nil
	}
	cfg := &PersistenceConfig{
		Enabled:      ocm.config.Enabled,
		Replication:  true,
		ReplicaCount: 3,
		SyncMode:     "async",
		Metadata:     ocm.config.Metadata,
	}
	return cfg
}

// GetTestingConfig derives the recovery-testing configuration; returns
// nil when nothing is loaded. Automatic testing is off, with a weekly
// interval when enabled.
func (ocm *OperationsConfigManager) GetTestingConfig() *TestingConfig {
	if ocm.config == nil {
		return nil
	}
	cfg := &TestingConfig{
		Enabled:      ocm.config.Enabled,
		AutoTesting:  false,
		TestInterval: 7 * 24 * time.Hour, // Weekly
		Metadata:     ocm.config.Metadata,
	}
	return cfg
}

View file

@ -0,0 +1,890 @@
package monitoring
import (
"archive/tar"
"compress/gzip"
"crypto/sha256"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"sync"
"time"
"github.com/sirupsen/logrus"
)
// OperationsManager coordinates the backup, recovery, data-persistence
// and recovery-testing subsystems.
type OperationsManager struct {
	logger      *logrus.Logger
	config      *OperationsConfig
	backup      *BackupManager
	recovery    *RecoveryManager
	persistence *DataPersistence
	testing     *RecoveryTesting
	mu          sync.RWMutex
}

// OperationsConfig is the top-level operations configuration, persisted
// as JSON by OperationsConfigManager.
type OperationsConfig struct {
	Enabled       bool              `json:"enabled"`
	BackupPath    string            `json:"backup_path"`
	RecoveryPath  string            `json:"recovery_path"`
	RetentionDays int               `json:"retention_days"`
	Compression   bool              `json:"compression"`
	Encryption    bool              `json:"encryption"`
	Metadata      map[string]string `json:"metadata"`
}

// BackupManager owns backup schedules and strategies and executes
// backup jobs into its storage.
type BackupManager struct {
	config     *BackupConfig
	schedules  map[string]BackupSchedule
	strategies map[string]BackupStrategy
	storage    *BackupStorage
	logger     *logrus.Logger
}

// BackupConfig configures the backup subsystem.
type BackupConfig struct {
	Enabled       bool              `json:"enabled"`
	AutoBackup    bool              `json:"auto_backup"`
	BackupPath    string            `json:"backup_path"`
	RetentionDays int               `json:"retention_days"`
	Compression   bool              `json:"compression"`
	Encryption    bool              `json:"encryption"`
	Metadata      map[string]string `json:"metadata"`
}

// BackupSchedule describes a recurring backup (e.g. daily/weekly) and
// when it last ran / will next run.
type BackupSchedule struct {
	ID          string                 `json:"id"`
	Name        string                 `json:"name"`
	Description string                 `json:"description"`
	Type        string                 `json:"type"`
	Interval    time.Duration          `json:"interval"`
	LastRun     time.Time              `json:"last_run"`
	NextRun     time.Time              `json:"next_run"`
	Enabled     bool                   `json:"enabled"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// BackupStrategy describes what to back up: the included paths, the
// exclude patterns, and archive options.
type BackupStrategy struct {
	ID          string                 `json:"id"`
	Name        string                 `json:"name"`
	Description string                 `json:"description"`
	Type        string                 `json:"type"`
	Paths       []string               `json:"paths"`
	Exclude     []string               `json:"exclude"`
	Compression bool                   `json:"compression"`
	Encryption  bool                   `json:"encryption"`
	Enabled     bool                   `json:"enabled"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// BackupJob is the record of one backup execution, including the
// archive location, size and SHA-256 checksum.
type BackupJob struct {
	ID         string                 `json:"id"`
	ScheduleID string                 `json:"schedule_id"`
	StrategyID string                 `json:"strategy_id"`
	Status     string                 `json:"status"`
	StartTime  time.Time              `json:"start_time"`
	EndTime    time.Time              `json:"end_time"`
	Duration   time.Duration          `json:"duration"`
	Size       int64                  `json:"size"`
	Checksum   string                 `json:"checksum"`
	Path       string                 `json:"path"`
	Error      string                 `json:"error,omitempty"`
	Metadata   map[string]interface{} `json:"metadata"`
}

// BackupStorage persists backup job records under a filesystem path.
type BackupStorage struct {
	path      string        // root directory for archives and job records
	retention time.Duration // how long records are kept
	mu        sync.RWMutex
}

// RecoveryManager owns recovery procedures and the plans that sequence
// them.
type RecoveryManager struct {
	config     *RecoveryConfig
	procedures map[string]RecoveryProcedure
	plans      map[string]RecoveryPlan
	logger     *logrus.Logger
}

// RecoveryConfig configures the recovery subsystem.
type RecoveryConfig struct {
	Enabled      bool              `json:"enabled"`
	AutoRecovery bool              `json:"auto_recovery"`
	RecoveryPath string            `json:"recovery_path"`
	Testing      bool              `json:"testing"`
	Metadata     map[string]string `json:"metadata"`
}

// RecoveryProcedure is an ordered sequence of recovery steps with
// prerequisites, a time estimate and a risk classification.
type RecoveryProcedure struct {
	ID            string                 `json:"id"`
	Name          string                 `json:"name"`
	Description   string                 `json:"description"`
	Type          string                 `json:"type"`
	Steps         []RecoveryStep         `json:"steps"`
	Prerequisites []string               `json:"prerequisites"`
	EstimatedTime time.Duration          `json:"estimated_time"`
	RiskLevel     string                 `json:"risk_level"`
	Enabled       bool                   `json:"enabled"`
	Metadata      map[string]interface{} `json:"metadata"`
}

// RecoveryStep is one command in a recovery procedure, with a timeout
// and a rollback description.
type RecoveryStep struct {
	ID          string                 `json:"id"`
	Name        string                 `json:"name"`
	Description string                 `json:"description"`
	Command     string                 `json:"command"`
	Args        []string               `json:"args"`
	Timeout     time.Duration          `json:"timeout"`
	Rollback    string                 `json:"rollback"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// RecoveryPlan groups procedures by priority with recovery objectives.
type RecoveryPlan struct {
	ID          string                 `json:"id"`
	Name        string                 `json:"name"`
	Description string                 `json:"description"`
	Procedures  []string               `json:"procedures"`
	Priority    string                 `json:"priority"`
	RTO         time.Duration          `json:"rto"` // recovery time objective
	RPO         time.Duration          `json:"rpo"` // recovery point objective
	Enabled     bool                   `json:"enabled"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// DataPersistence owns the replication layer for durable data.
type DataPersistence struct {
	config      *PersistenceConfig
	replication *ReplicationManager
	mu          sync.RWMutex
}

// PersistenceConfig configures replication for persisted data.
type PersistenceConfig struct {
	Enabled      bool              `json:"enabled"`
	Replication  bool              `json:"replication"`
	ReplicaCount int               `json:"replica_count"`
	SyncMode     string            `json:"sync_mode"`
	Metadata     map[string]string `json:"metadata"`
}

// ReplicationManager tracks replicas and replication strategies.
type ReplicationManager struct {
	replicas   map[string]Replica
	strategies map[string]ReplicationStrategy
	mu         sync.RWMutex
}

// Replica describes one replica target and its last known sync state.
type Replica struct {
	ID         string                 `json:"id"`
	Name       string                 `json:"name"`
	Location   string                 `json:"location"`
	Status     string                 `json:"status"`
	LastSync   time.Time              `json:"last_sync"`
	SyncStatus string                 `json:"sync_status"`
	Metadata   map[string]interface{} `json:"metadata"`
}

// ReplicationStrategy describes how and how often replicas are synced.
type ReplicationStrategy struct {
	ID          string                 `json:"id"`
	Name        string                 `json:"name"`
	Description string                 `json:"description"`
	Type        string                 `json:"type"`
	Interval    time.Duration          `json:"interval"`
	Enabled     bool                   `json:"enabled"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// RecoveryTesting owns test scenarios for recovery procedures and the
// results of their runs.
type RecoveryTesting struct {
	config    *TestingConfig
	scenarios map[string]TestScenario
	results   map[string]TestResult
	logger    *logrus.Logger
}

// TestingConfig configures automated recovery testing.
type TestingConfig struct {
	Enabled      bool              `json:"enabled"`
	AutoTesting  bool              `json:"auto_testing"`
	TestInterval time.Duration     `json:"test_interval"`
	Metadata     map[string]string `json:"metadata"`
}

// TestScenario is an ordered sequence of test steps with expected
// outcomes to validate against.
type TestScenario struct {
	ID          string                 `json:"id"`
	Name        string                 `json:"name"`
	Description string                 `json:"description"`
	Type        string                 `json:"type"`
	Steps       []TestStep             `json:"steps"`
	Expected    map[string]interface{} `json:"expected"`
	Enabled     bool                   `json:"enabled"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// TestStep is one action in a test scenario plus its validation hook.
type TestStep struct {
	ID          string                 `json:"id"`
	Name        string                 `json:"name"`
	Description string                 `json:"description"`
	Action      string                 `json:"action"`
	Parameters  map[string]interface{} `json:"parameters"`
	Validation  string                 `json:"validation"`
	Metadata    map[string]interface{} `json:"metadata"`
}

// TestResult is the record of one test scenario run.
type TestResult struct {
	ID         string                 `json:"id"`
	ScenarioID string                 `json:"scenario_id"`
	Status     string                 `json:"status"`
	StartTime  time.Time              `json:"start_time"`
	EndTime    time.Time              `json:"end_time"`
	Duration   time.Duration          `json:"duration"`
	Results    map[string]interface{} `json:"results"`
	Error      string                 `json:"error,omitempty"`
	Metadata   map[string]interface{} `json:"metadata"`
}
// NewOperationsManager wires the backup, recovery, persistence and
// testing subsystems together under a single operations manager.
func NewOperationsManager(config *OperationsConfig, logger *logrus.Logger) *OperationsManager {
	return &OperationsManager{
		logger:      logger,
		config:      config,
		backup:      NewBackupManager(config.BackupPath, logger),
		recovery:    NewRecoveryManager(config.RecoveryPath, logger),
		persistence: NewDataPersistence(),
		testing:     NewRecoveryTesting(logger),
	}
}

// NewBackupManager creates a backup manager whose storage is rooted at
// backupPath (30-day retention) with the built-in schedules and
// strategies registered.
func NewBackupManager(backupPath string, logger *logrus.Logger) *BackupManager {
	bm := &BackupManager{
		config:     &BackupConfig{},
		schedules:  map[string]BackupSchedule{},
		strategies: map[string]BackupStrategy{},
		storage:    NewBackupStorage(backupPath, 30*24*time.Hour),
		logger:     logger,
	}
	bm.initializeSchedules()
	bm.initializeStrategies()
	return bm
}

// NewRecoveryManager creates a recovery manager with the built-in
// procedures and plans registered.
// NOTE(review): recoveryPath is currently unused — confirm intended.
func NewRecoveryManager(recoveryPath string, logger *logrus.Logger) *RecoveryManager {
	rm := &RecoveryManager{
		config:     &RecoveryConfig{},
		procedures: map[string]RecoveryProcedure{},
		plans:      map[string]RecoveryPlan{},
		logger:     logger,
	}
	rm.initializeProcedures()
	rm.initializePlans()
	return rm
}
// NewDataPersistence creates a data-persistence layer with an empty
// configuration and a fresh replication manager.
func NewDataPersistence() *DataPersistence {
	dp := &DataPersistence{
		config:      &PersistenceConfig{},
		replication: NewReplicationManager(),
	}
	return dp
}

// NewRecoveryTesting creates the recovery-testing subsystem with the
// built-in scenarios registered.
func NewRecoveryTesting(logger *logrus.Logger) *RecoveryTesting {
	rt := &RecoveryTesting{
		config:    &TestingConfig{},
		scenarios: map[string]TestScenario{},
		results:   map[string]TestResult{},
		logger:    logger,
	}
	rt.initializeScenarios()
	return rt
}

// NewBackupStorage creates a backup store rooted at path whose records
// are kept for the given retention period.
func NewBackupStorage(path string, retention time.Duration) *BackupStorage {
	return &BackupStorage{path: path, retention: retention}
}

// NewReplicationManager creates an empty replication manager.
func NewReplicationManager() *ReplicationManager {
	return &ReplicationManager{
		replicas:   map[string]Replica{},
		strategies: map[string]ReplicationStrategy{},
	}
}
// initializeSchedules registers the built-in daily, weekly and monthly
// backup schedules. LastRun is left zero (never run) and NextRun is one
// full interval from now.
func (bm *BackupManager) initializeSchedules() {
	// Daily backup schedule
	bm.schedules["daily"] = BackupSchedule{
		ID:          "daily",
		Name:        "Daily Backup",
		Description: "Daily backup of critical data",
		Type:        "full",
		Interval:    24 * time.Hour,
		LastRun:     time.Time{},
		NextRun:     time.Now().Add(24 * time.Hour),
		Enabled:     true,
	}
	// Weekly backup schedule
	bm.schedules["weekly"] = BackupSchedule{
		ID:          "weekly",
		Name:        "Weekly Backup",
		Description: "Weekly full backup with retention",
		Type:        "full",
		Interval:    7 * 24 * time.Hour,
		LastRun:     time.Time{},
		NextRun:     time.Now().Add(7 * 24 * time.Hour),
		Enabled:     true,
	}
	// Monthly backup schedule (approximated as 30 days)
	bm.schedules["monthly"] = BackupSchedule{
		ID:          "monthly",
		Name:        "Monthly Backup",
		Description: "Monthly archival backup",
		Type:        "archival",
		Interval:    30 * 24 * time.Hour,
		LastRun:     time.Time{},
		NextRun:     time.Now().Add(30 * 24 * time.Hour),
		Enabled:     true,
	}
}
// initializeStrategies registers the built-in backup strategies: a full
// backup of all product paths, an incremental backup of the data
// directory, and an encrypted configuration-only backup.
func (bm *BackupManager) initializeStrategies() {
	// Full backup strategy
	bm.strategies["full"] = BackupStrategy{
		ID:          "full",
		Name:        "Full Backup",
		Description: "Complete backup of all data",
		Type:        "full",
		Paths:       []string{"/var/lib/debian-forge", "/etc/debian-forge", "/opt/debian-forge"},
		Exclude:     []string{"*.tmp", "*.log", "*.cache"},
		Compression: true,
		Encryption:  false,
		Enabled:     true,
	}
	// Incremental backup strategy
	bm.strategies["incremental"] = BackupStrategy{
		ID:          "incremental",
		Name:        "Incremental Backup",
		Description: "Backup of changed files only",
		Type:        "incremental",
		Paths:       []string{"/var/lib/debian-forge"},
		Exclude:     []string{"*.tmp", "*.log"},
		Compression: true,
		Encryption:  false,
		Enabled:     true,
	}
	// Configuration backup strategy (encrypted: may hold credentials)
	bm.strategies["config"] = BackupStrategy{
		ID:          "config",
		Name:        "Configuration Backup",
		Description: "Backup of configuration files only",
		Type:        "config",
		Paths:       []string{"/etc/debian-forge"},
		Exclude:     []string{},
		Compression: true,
		Encryption:  true,
		Enabled:     true,
	}
}
// initializeProcedures registers the built-in recovery procedures:
// database restore from backup and file-system restore from a backup
// volume. Step Command/Args are declarative metadata; actual execution
// is delegated to executeStep.
func (rm *RecoveryManager) initializeProcedures() {
	// Database recovery procedure
	rm.procedures["database_recovery"] = RecoveryProcedure{
		ID:          "database_recovery",
		Name:        "Database Recovery",
		Description: "Recover database from backup",
		Type:        "database",
		Steps: []RecoveryStep{
			{
				ID:          "stop_services",
				Name:        "Stop Services",
				Description: "Stop all services that use the database",
				Command:     "systemctl",
				Args:        []string{"stop", "debian-forge"},
				Timeout:     30 * time.Second,
				Rollback:    "systemctl start debian-forge",
			},
			{
				ID:          "restore_database",
				Name:        "Restore Database",
				Description: "Restore database from backup file",
				Command:     "pg_restore",
				Args:        []string{"--clean", "--if-exists", "--dbname=debian_forge"},
				Timeout:     300 * time.Second,
				Rollback:    "restore_previous_database",
			},
			{
				ID:          "start_services",
				Name:        "Start Services",
				Description: "Start all services",
				Command:     "systemctl",
				Args:        []string{"start", "debian-forge"},
				Timeout:     60 * time.Second,
				Rollback:    "systemctl stop debian-forge",
			},
		},
		Prerequisites: []string{"backup_file_exists", "database_stopped"},
		EstimatedTime: 10 * time.Minute,
		RiskLevel:     "medium",
		Enabled:       true,
	}
	// File system recovery procedure
	rm.procedures["filesystem_recovery"] = RecoveryProcedure{
		ID:          "filesystem_recovery",
		Name:        "File System Recovery",
		Description: "Recover file system from backup",
		Type:        "filesystem",
		Steps: []RecoveryStep{
			{
				ID:          "mount_backup",
				Name:        "Mount Backup",
				Description: "Mount backup volume",
				Command:     "mount",
				Args:        []string{"/dev/backup", "/mnt/backup"},
				Timeout:     30 * time.Second,
				Rollback:    "umount /mnt/backup",
			},
			{
				ID:          "restore_files",
				Name:        "Restore Files",
				Description: "Restore files from backup",
				Command:     "rsync",
				Args:        []string{"-av", "--delete", "/mnt/backup/", "/var/lib/debian-forge/"},
				Timeout:     600 * time.Second,
				Rollback:    "restore_from_previous_backup",
			},
		},
		Prerequisites: []string{"backup_volume_available", "sufficient_space"},
		EstimatedTime: 15 * time.Minute,
		RiskLevel:     "low",
		Enabled:       true,
	}
}
// initializePlans registers the built-in recovery plans: a critical
// plan (tight RTO/RPO, full restore) and a standard plan (filesystem
// restore only).
func (rm *RecoveryManager) initializePlans() {
	// Critical recovery plan
	rm.plans["critical"] = RecoveryPlan{
		ID:          "critical",
		Name:        "Critical Recovery Plan",
		Description: "Recovery plan for critical system failures",
		Procedures:  []string{"database_recovery", "filesystem_recovery"},
		Priority:    "critical",
		RTO:         1 * time.Hour,    // restore service within one hour
		RPO:         15 * time.Minute, // tolerate at most 15 minutes of data loss
		Enabled:     true,
	}
	// Standard recovery plan
	rm.plans["standard"] = RecoveryPlan{
		ID:          "standard",
		Name:        "Standard Recovery Plan",
		Description: "Standard recovery plan for normal operations",
		Procedures:  []string{"filesystem_recovery"},
		Priority:    "normal",
		RTO:         4 * time.Hour,
		RPO:         1 * time.Hour,
		Enabled:     true,
	}
}
// initializeScenarios registers the built-in recovery-test scenarios.
// Currently a single scenario that seeds test data, simulates database
// corruption and exercises the database recovery procedure.
func (rt *RecoveryTesting) initializeScenarios() {
	// Database recovery test
	rt.scenarios["database_recovery_test"] = TestScenario{
		ID:          "database_recovery_test",
		Name:        "Database Recovery Test",
		Description: "Test database recovery procedure",
		Type:        "recovery",
		Steps: []TestStep{
			{
				ID:          "create_test_data",
				Name:        "Create Test Data",
				Description: "Create test data in database",
				Action:      "create_test_records",
				Parameters:  map[string]interface{}{"count": 100},
				Validation:  "verify_test_data_exists",
			},
			{
				ID:          "simulate_failure",
				Name:        "Simulate Failure",
				Description: "Simulate database failure",
				Action:      "corrupt_database",
				Parameters:  map[string]interface{}{"severity": "medium"},
				Validation:  "verify_database_corrupted",
			},
			{
				ID:          "execute_recovery",
				Name:        "Execute Recovery",
				Description: "Execute recovery procedure",
				Action:      "run_recovery_procedure",
				Parameters:  map[string]interface{}{"procedure": "database_recovery"},
				Validation:  "verify_database_recovered",
			},
		},
		// Expected outcomes the run is validated against.
		Expected: map[string]interface{}{
			"recovery_time":        "10m",
			"data_integrity":       "100%",
			"service_availability": "100%",
		},
		Enabled: true,
	}
}
// CreateBackup runs the named backup strategy and returns the resulting
// job record. On failure the job is still returned with status "failed"
// and its Error field populated, alongside the wrapped error.
func (bm *BackupManager) CreateBackup(strategyID string) (*BackupJob, error) {
	bm.logger.Infof("Creating backup using strategy: %s", strategyID)
	strat, ok := bm.strategies[strategyID]
	if !ok {
		return nil, fmt.Errorf("backup strategy not found: %s", strategyID)
	}
	if !strat.Enabled {
		return nil, fmt.Errorf("backup strategy is disabled: %s", strategyID)
	}
	job := &BackupJob{
		ID:         generateBackupID(),
		StrategyID: strategyID,
		Status:     "running",
		StartTime:  time.Now(),
		Metadata:   map[string]interface{}{},
	}
	execErr := bm.executeBackup(job, strat)
	job.EndTime = time.Now()
	job.Duration = job.EndTime.Sub(job.StartTime)
	if execErr != nil {
		job.Status = "failed"
		job.Error = execErr.Error()
		return job, fmt.Errorf("backup execution failed: %w", execErr)
	}
	job.Status = "completed"
	bm.logger.Infof("Backup completed successfully: %s", job.ID)
	return job, nil
}
// executeBackup creates the archive for job according to strategy,
// records size, checksum and path on the job, and persists the job
// record via the backup storage.
func (bm *BackupManager) executeBackup(job *BackupJob, strategy BackupStrategy) error {
	// Each job gets its own directory under the storage root.
	backupDir := filepath.Join(bm.storage.path, job.ID)
	if err := os.MkdirAll(backupDir, 0755); err != nil {
		return fmt.Errorf("failed to create backup directory: %w", err)
	}
	// The archive is a tar file, with a .gz suffix when compression is on.
	archivePath := filepath.Join(backupDir, "backup.tar")
	if strategy.Compression {
		archivePath += ".gz"
	}
	// Create archive
	if err := bm.createArchive(archivePath, strategy.Paths, strategy.Exclude, strategy.Compression); err != nil {
		return fmt.Errorf("failed to create archive: %w", err)
	}
	// Size and checksum are recorded best-effort: a stat or hashing
	// failure leaves the zero value rather than failing the backup.
	if fileInfo, err := os.Stat(archivePath); err == nil {
		job.Size = fileInfo.Size()
	}
	if checksum, err := bm.calculateChecksum(archivePath); err == nil {
		job.Checksum = checksum
	}
	job.Path = archivePath
	// Store backup job
	return bm.storage.storeBackupJob(job)
}
// createArchive writes a tar archive (gzip-compressed when compression
// is true) at archivePath containing every path in paths, skipping
// entries matched by exclude.
//
// BUG FIX: the previous version deferred all Close calls and dropped
// their errors. For gzip in particular the final flush happens on
// Close, so an ignored Close error could silently leave a truncated,
// unreadable archive on disk. Close errors are now propagated.
func (bm *BackupManager) createArchive(archivePath string, paths []string, exclude []string, compression bool) error {
	file, err := os.Create(archivePath)
	if err != nil {
		return fmt.Errorf("failed to create archive file: %w", err)
	}
	defer file.Close()
	var writer io.Writer = file
	var gzipWriter *gzip.Writer
	if compression {
		gzipWriter = gzip.NewWriter(file)
		writer = gzipWriter
	}
	tarWriter := tar.NewWriter(writer)
	// Add files to archive; on failure, best-effort close the writers
	// before reporting the original error.
	for _, path := range paths {
		if err := bm.addPathToArchive(tarWriter, path, exclude); err != nil {
			tarWriter.Close()
			if gzipWriter != nil {
				gzipWriter.Close()
			}
			return fmt.Errorf("failed to add path to archive: %w", err)
		}
	}
	// Finalize the archive; these flushes must succeed for the archive
	// to be valid, so their errors are returned rather than ignored.
	if err := tarWriter.Close(); err != nil {
		return fmt.Errorf("failed to finalize tar archive: %w", err)
	}
	if gzipWriter != nil {
		if err := gzipWriter.Close(); err != nil {
			return fmt.Errorf("failed to finalize gzip stream: %w", err)
		}
	}
	return nil
}
// addPathToArchive walks path and appends every non-excluded entry to
// tarWriter. Symlinks are stored as symlink entries with their target
// resolved via os.Readlink; only regular files have content copied.
//
// BUG FIX: the previous version passed the file path as the link target
// to tar.FileInfoHeader and then copied content for every non-directory
// entry. A symlink header has size 0, so copying its (followed) content
// made tar.Writer fail with a write-too-long error on any symlink.
func (bm *BackupManager) addPathToArchive(tarWriter *tar.Writer, path string, exclude []string) error {
	return filepath.Walk(path, func(filePath string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		// Check if file should be excluded
		if bm.shouldExclude(filePath, exclude) {
			return nil
		}
		// Resolve the link target only for symlinks; for all other entry
		// types the second FileInfoHeader argument is ignored.
		link := ""
		if info.Mode()&os.ModeSymlink != 0 {
			if link, err = os.Readlink(filePath); err != nil {
				return err
			}
		}
		header, err := tar.FileInfoHeader(info, link)
		if err != nil {
			return err
		}
		// Store entries relative to the filesystem root.
		header.Name = strings.TrimPrefix(filePath, "/")
		if err := tarWriter.WriteHeader(header); err != nil {
			return err
		}
		// Only regular files carry content; directories, symlinks and
		// special files are header-only entries.
		if !info.Mode().IsRegular() {
			return nil
		}
		file, err := os.Open(filePath)
		if err != nil {
			return err
		}
		defer file.Close()
		_, err = io.Copy(tarWriter, file)
		return err
	})
}
// shouldExclude reports whether filePath matches any exclude pattern.
// Glob patterns such as "*.tmp" are matched with filepath.Match against
// the file's base name; every pattern is also tried as a plain
// substring of the full path (the previous behavior).
//
// BUG FIX: the previous version used only strings.Contains, so the glob
// patterns configured in the built-in strategies ("*.tmp", "*.log",
// "*.cache") could never match any real path — nothing was excluded.
func (bm *BackupManager) shouldExclude(filePath string, exclude []string) bool {
	base := filepath.Base(filePath)
	for _, pattern := range exclude {
		// Glob match on the base name (Match errors mean a malformed
		// pattern; treat those as non-matching).
		if ok, err := filepath.Match(pattern, base); err == nil && ok {
			return true
		}
		// Substring fallback preserves the old matching behavior.
		if strings.Contains(filePath, pattern) {
			return true
		}
	}
	return false
}
// calculateChecksum returns the hex-encoded SHA-256 digest of the file
// at filePath.
func (bm *BackupManager) calculateChecksum(filePath string) (string, error) {
	f, err := os.Open(filePath)
	if err != nil {
		return "", err
	}
	defer f.Close()
	digest := sha256.New()
	if _, err = io.Copy(digest, f); err != nil {
		return "", err
	}
	return fmt.Sprintf("%x", digest.Sum(nil)), nil
}
// ExecuteRecovery runs every procedure of the named recovery plan in
// order, using the identified backup as the data source. Unknown
// procedure IDs are logged and skipped; the first failing procedure
// aborts the plan.
func (rm *RecoveryManager) ExecuteRecovery(planID string, backupID string) error {
	rm.logger.Infof("Executing recovery plan: %s with backup: %s", planID, backupID)
	plan, ok := rm.plans[planID]
	switch {
	case !ok:
		return fmt.Errorf("recovery plan not found: %s", planID)
	case !plan.Enabled:
		return fmt.Errorf("recovery plan is disabled: %s", planID)
	}
	for _, id := range plan.Procedures {
		proc, found := rm.procedures[id]
		if !found {
			rm.logger.Warnf("Recovery procedure not found: %s", id)
			continue
		}
		if err := rm.executeProcedure(proc, backupID); err != nil {
			return fmt.Errorf("recovery procedure failed: %w", err)
		}
	}
	rm.logger.Infof("Recovery plan completed successfully: %s", planID)
	return nil
}
// executeProcedure runs every step of a recovery procedure in order
// after verifying its prerequisites. backupID identifies the backup to
// restore from; it is not yet consumed by the placeholder step logic.
func (rm *RecoveryManager) executeProcedure(procedure RecoveryProcedure, backupID string) error {
	rm.logger.Infof("Executing recovery procedure: %s", procedure.ID)
	// Check prerequisites
	if err := rm.checkPrerequisites(procedure.Prerequisites); err != nil {
		return fmt.Errorf("prerequisites not met: %w", err)
	}
	// Execute each step
	for _, step := range procedure.Steps {
		if err := rm.executeStep(step); err != nil {
			return fmt.Errorf("step failed: %s - %w", step.ID, err)
		}
	}
	return nil
}
// checkPrerequisites validates that all named prerequisites hold.
// Placeholder: currently always succeeds.
func (rm *RecoveryManager) checkPrerequisites(prerequisites []string) error {
	// This is a placeholder for prerequisite checking
	// In production, implement actual prerequisite validation
	return nil
}
// executeStep runs a single recovery step. Placeholder: it only logs
// the step and reports success; the step's command is not executed.
func (rm *RecoveryManager) executeStep(step RecoveryStep) error {
	rm.logger.Infof("Executing recovery step: %s", step.ID)
	// This is a placeholder for step execution
	// In production, implement actual step execution logic
	rm.logger.Infof("Step %s completed: %s", step.ID, step.Description)
	return nil
}
// RunTest executes the named test scenario and records the outcome in
// rt.results so it can be inspected later via its result ID.
//
// BUG FIX: the previous version only stored results for successful
// runs, so a failed run could never be looked up afterwards (the CLI's
// result viewer reported "test result not found"). Failed results are
// now stored too, with status "failed" and Error populated.
func (rt *RecoveryTesting) RunTest(scenarioID string) (*TestResult, error) {
	rt.logger.Infof("Running recovery test scenario: %s", scenarioID)
	scenario, exists := rt.scenarios[scenarioID]
	if !exists {
		return nil, fmt.Errorf("test scenario not found: %s", scenarioID)
	}
	if !scenario.Enabled {
		return nil, fmt.Errorf("test scenario is disabled: %s", scenarioID)
	}
	// Create test result
	result := &TestResult{
		ID:         generateTestID(),
		ScenarioID: scenarioID,
		Status:     "running",
		StartTime:  time.Now(),
		Results:    make(map[string]interface{}),
		Metadata:   make(map[string]interface{}),
	}
	// Execute test scenario
	if err := rt.executeScenario(scenario, result); err != nil {
		result.Status = "failed"
		result.Error = err.Error()
		result.EndTime = time.Now()
		result.Duration = result.EndTime.Sub(result.StartTime)
		// Keep failed runs queryable as well.
		rt.results[result.ID] = *result
		return result, fmt.Errorf("test scenario failed: %w", err)
	}
	result.Status = "completed"
	result.EndTime = time.Now()
	result.Duration = result.EndTime.Sub(result.StartTime)
	// Store test result
	rt.results[result.ID] = *result
	rt.logger.Infof("Test scenario completed successfully: %s", scenarioID)
	return result, nil
}
// executeScenario runs every step of a test scenario in order, writing
// per-step outcomes into result.Results, then validates the collected
// results against the scenario's expected outcomes.
func (rt *RecoveryTesting) executeScenario(scenario TestScenario, result *TestResult) error {
	rt.logger.Infof("Executing test scenario: %s", scenario.ID)
	// Execute each test step
	for _, step := range scenario.Steps {
		if err := rt.executeTestStep(step, result); err != nil {
			return fmt.Errorf("test step failed: %s - %w", step.ID, err)
		}
	}
	// Validate results against expected outcomes
	if err := rt.validateResults(scenario.Expected, result.Results); err != nil {
		return fmt.Errorf("test validation failed: %w", err)
	}
	return nil
}
// executeTestStep runs a single test step. Placeholder: it records a
// "completed" entry for the step but does not perform the step's Action.
func (rt *RecoveryTesting) executeTestStep(step TestStep, result *TestResult) error {
	rt.logger.Infof("Executing test step: %s", step.ID)
	// This is a placeholder for test step execution
	// In production, implement actual test step execution logic
	result.Results[step.ID] = map[string]interface{}{
		"status":  "completed",
		"message": step.Description,
	}
	return nil
}
// validateResults compares actual step results against the scenario's
// expected outcomes. Placeholder: currently always succeeds.
func (rt *RecoveryTesting) validateResults(expected map[string]interface{}, actual map[string]interface{}) error {
	// This is a placeholder for result validation
	// In production, implement actual validation logic
	return nil
}
// BackupStorage methods

// storeBackupJob persists a backup job record as a timestamped JSON
// file under the storage path. Safe for concurrent use.
func (bs *BackupStorage) storeBackupJob(job *BackupJob) error {
	bs.mu.Lock()
	defer bs.mu.Unlock()
	// Ensure the storage root exists before writing.
	if err := os.MkdirAll(bs.path, 0755); err != nil {
		return fmt.Errorf("failed to create data directory: %w", err)
	}
	// File name carries both the job ID and its start timestamp.
	stamp := job.StartTime.Format("2006-01-02_15-04-05")
	target := filepath.Join(bs.path, fmt.Sprintf("backup_job_%s_%s.json", job.ID, stamp))
	payload, err := json.MarshalIndent(job, "", " ")
	if err != nil {
		return fmt.Errorf("failed to marshal backup job: %w", err)
	}
	if err := os.WriteFile(target, payload, 0644); err != nil {
		return fmt.Errorf("failed to write backup job: %w", err)
	}
	return nil
}
// Helper functions

// generateBackupID returns a backup job identifier of the form
// "backup-<unix-nanoseconds>".
func generateBackupID() string {
	return fmt.Sprint("backup-", time.Now().UnixNano())
}

// generateTestID returns a test result identifier of the form
// "test-<unix-nanoseconds>".
func generateTestID() string {
	return fmt.Sprint("test-", time.Now().UnixNano())
}

File diff suppressed because it is too large Load diff