apt-ostree/src/monitoring.rs
robojerk 9f02fe2d69 Build apt-ostree Debian package with libostree 2025.2 compatibility
- Fix compilation errors in src/main.rs and resolve import conflicts
- Add debian/compat file and ensure debian/rules is executable
- Downgrade Cargo.lock to version 3 for compatibility with system cargo
- Create working apt-ostree binary with basic CLI functionality
- Build apt-ostree_0.1.0-1_amd64.deb package (1.1MB)
- Package installs successfully and binary works correctly
- Ensure libostree-1-1 (>= 2025.2) dependency for bootc compatibility
- Test package installation and basic commands (status, version)
2025-07-22 05:45:32 +00:00

773 lines
No EOL
24 KiB
Rust

//! Comprehensive Monitoring and Logging for APT-OSTree
//!
//! This module provides structured logging, metrics collection, health checks,
//! and monitoring capabilities for the APT-OSTree system.
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tokio::sync::Mutex;
use serde::{Serialize, Deserialize};
use tracing::{info, error, debug, instrument, Level};
use tracing_subscriber::{
fmt::{self},
EnvFilter,
};
use tracing_subscriber::prelude::*;
use chrono::{DateTime, Utc};
use crate::error::{AptOstreeError, AptOstreeResult};
/// Monitoring configuration
///
/// Settings consumed by `MonitoringManager`. The struct derives serde's
/// `Serialize`/`Deserialize`, and `Default` supplies the stock values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitoringConfig {
/// Log level (trace, debug, info, warn, error)
///
/// Only consulted by `MonitoringManager::init_logging` when no filter can be
/// built from the environment (`EnvFilter::try_from_default_env`); any value
/// other than the five listed is treated as `info`.
pub log_level: String,
// NOTE(review): nothing in this module reads `log_file` — confirm whether it is consumed elsewhere.
/// Log file path (optional)
pub log_file: Option<String>,
// NOTE(review): `init_logging` does not consult this flag — confirm intended behaviour.
/// Enable structured logging (JSON format)
pub structured_logging: bool,
/// Enable metrics collection
///
/// When `false`, `record_system_metrics` returns immediately without sampling.
pub enable_metrics: bool,
// NOTE(review): not read in this module; presumably used by whoever schedules collection.
/// Metrics collection interval in seconds
pub metrics_interval: u64,
/// Enable health checks
///
/// When `false`, `run_health_checks` returns an empty list.
pub enable_health_checks: bool,
// NOTE(review): not read in this module; presumably used by whoever schedules health checks.
/// Health check interval in seconds
pub health_check_interval: u64,
/// Enable performance monitoring
///
/// When `false`, `record_performance_metrics` is a no-op.
pub enable_performance_monitoring: bool,
/// Enable transaction monitoring
///
/// When `false`, starting, updating and completing a transaction are all no-ops.
pub enable_transaction_monitoring: bool,
// NOTE(review): not read in this module — confirm where it takes effect.
/// Enable system resource monitoring
pub enable_system_monitoring: bool,
}
/// Stock configuration: `info` level, no log file, structured logging off,
/// and every collector enabled (metrics every 60 s, health checks every 300 s).
impl Default for MonitoringConfig {
    fn default() -> Self {
        Self {
            // Logging
            log_level: String::from("info"),
            log_file: None,
            structured_logging: false,
            // Feature switches
            enable_metrics: true,
            enable_health_checks: true,
            enable_performance_monitoring: true,
            enable_transaction_monitoring: true,
            enable_system_monitoring: true,
            // Intervals, in seconds
            metrics_interval: 60,
            health_check_interval: 300,
        }
    }
}
/// System metrics
///
/// One point-in-time sample of host and APT-OSTree state. Samples are kept in
/// the manager's bounded in-memory history and included in JSON exports.
// NOTE(review): `collect_system_metrics` currently fills most numeric fields
// with zero placeholders; treat them as unset until real collection is wired in.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
/// Timestamp of metrics collection (UTC)
pub timestamp: DateTime<Utc>,
/// CPU usage percentage
pub cpu_usage: f64,
/// Memory usage in bytes
pub memory_usage: u64,
/// Total memory in bytes
pub total_memory: u64,
/// Disk usage in bytes
pub disk_usage: u64,
/// Total disk space in bytes
pub total_disk: u64,
/// Number of active transactions
pub active_transactions: u32,
/// Number of pending deployments
pub pending_deployments: u32,
/// OSTree repository size in bytes
pub ostree_repo_size: u64,
/// APT cache size in bytes
pub apt_cache_size: u64,
/// System uptime in seconds
pub uptime: u64,
/// Load average, ordered as `[1-minute, 5-minute, 15-minute]`
pub load_average: [f64; 3],
}
/// Performance metrics
///
/// Record of a single timed operation, as built by
/// `MonitoringManager::record_performance_metrics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceMetrics {
/// Timestamp at which the record was created (UTC), i.e. when the
/// measurement was reported rather than when the operation began
pub timestamp: DateTime<Utc>,
/// Operation type (free-form label chosen by the caller)
pub operation_type: String,
/// Operation duration in milliseconds
pub duration_ms: u64,
/// Success status
pub success: bool,
/// Error message if failed; `None` is what `PerformanceMonitor::success` records
pub error_message: Option<String>,
/// Additional context: caller-supplied key/value pairs stored verbatim
pub context: HashMap<String, String>,
}
/// Transaction metrics
///
/// Lifecycle record of one monitored transaction. The manager keeps these in a
/// map keyed by `transaction_id`; completing a transaction updates the entry in
/// place rather than removing it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TransactionMetrics {
/// Transaction ID (also the key in the manager's map)
pub transaction_id: String,
/// Transaction type (free-form label chosen by the caller)
pub transaction_type: String,
/// Start time (UTC), set when monitoring starts
pub start_time: DateTime<Utc>,
/// End time (UTC); `None` until the transaction is completed
pub end_time: Option<DateTime<Utc>>,
/// Duration in milliseconds, derived from `end_time - start_time`;
/// `None` until the transaction is completed
pub duration_ms: Option<u64>,
/// Success status; initialised to `false` and only meaningful once `end_time` is set
pub success: bool,
/// Error message if failed
pub error_message: Option<String>,
/// Number of packages involved
pub packages_count: u32,
/// Total size of packages in bytes
pub packages_size: u64,
// NOTE(review): the value is stored exactly as passed in; the debug log in
// `update_transaction_progress` multiplies it by 100, which suggests a
// 0.0–1.0 fraction rather than a percentage — confirm with callers.
/// Progress percentage
pub progress: f64,
}
/// Health check result
///
/// Outcome of one named health probe. `run_health_checks` returns a list of
/// these and also appends them to the manager's bounded result history.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
/// Check name (stable identifier such as `ostree_repository`)
pub check_name: String,
/// Check status
pub status: HealthStatus,
/// Check message (human-readable summary of the outcome)
pub message: String,
/// Check timestamp (UTC), taken when the result is assembled
pub timestamp: DateTime<Utc>,
/// Check duration in milliseconds
pub duration_ms: u64,
/// Additional details as key/value pairs; the built-in checks in this
/// module leave it empty
pub details: HashMap<String, String>,
}
/// Health status
///
/// Severity reported by a health check. Derives `PartialEq`/`Eq` so callers
/// can compare statuses directly (`result.status == HealthStatus::Healthy`)
/// instead of pattern-matching for a simple equality test.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum HealthStatus {
    /// The check passed.
    Healthy,
    /// The check found a problem that does not yet require intervention.
    Warning,
    /// The check found a problem that requires intervention.
    Critical,
    /// The check could not determine a status.
    Unknown,
}
/// Monitoring manager
///
/// Owns the monitoring configuration and the in-memory stores for system
/// metrics, performance records, transaction records and health-check
/// results. Each store sits behind its own `Arc<Mutex<..>>` (tokio mutex), and
/// all methods take `&self`.
pub struct MonitoringManager {
/// Settings that gate each collector.
config: MonitoringConfig,
/// History of system-metric samples (trimmed to the newest 1000 on insert).
metrics: Arc<Mutex<Vec<SystemMetrics>>>,
/// History of performance records (trimmed to the newest 1000 on insert).
performance_metrics: Arc<Mutex<Vec<PerformanceMetrics>>>,
/// Transaction records keyed by transaction ID.
transaction_metrics: Arc<Mutex<HashMap<String, TransactionMetrics>>>,
/// History of health-check results (trimmed to the newest 100 on insert).
health_checks: Arc<Mutex<Vec<HealthCheckResult>>>,
/// Moment the manager was constructed; basis for `uptime_seconds` in statistics.
start_time: Instant,
}
impl MonitoringManager {
/// Create a new monitoring manager
pub fn new(config: MonitoringConfig) -> AptOstreeResult<Self> {
info!("Initializing monitoring manager with config: {:?}", config);
Ok(Self {
config,
metrics: Arc::new(Mutex::new(Vec::new())),
performance_metrics: Arc::new(Mutex::new(Vec::new())),
transaction_metrics: Arc::new(Mutex::new(HashMap::new())),
health_checks: Arc::new(Mutex::new(Vec::new())),
start_time: Instant::now(),
})
}
/// Install the process-wide `tracing` subscriber.
///
/// The filter is taken from the environment when one can be parsed
/// (`EnvFilter::try_from_default_env`); otherwise it is built from
/// `config.log_level`, scoped to the `apt_ostree` target, with unrecognised
/// level names treated as `info`. Output goes through a `fmt` layer that
/// prints the target, thread ID and thread name.
///
/// # Errors
/// Returns `AptOstreeError::Initialization` when the global default subscriber
/// cannot be set.
pub fn init_logging(&self) -> AptOstreeResult<()> {
    info!("Initializing logging system");

    // NOTE(review): `config.log_file` and `config.structured_logging` are not
    // consulted here — confirm whether that is intentional.

    // Prefer an environment-provided filter; fall back to the configured level.
    let filter = match EnvFilter::try_from_default_env() {
        Ok(from_env) => from_env,
        Err(_) => {
            let level = match self.config.log_level.as_str() {
                "trace" => Level::TRACE,
                "debug" => Level::DEBUG,
                "warn" => Level::WARN,
                "error" => Level::ERROR,
                // "info" and anything unrecognised
                _ => Level::INFO,
            };
            EnvFilter::new(format!("apt_ostree={}", level))
        }
    };

    let subscriber = tracing_subscriber::registry().with(filter).with(
        fmt::layer()
            .with_target(true)
            .with_thread_ids(true)
            .with_thread_names(true),
    );

    if let Err(e) = tracing::subscriber::set_global_default(subscriber) {
        return Err(AptOstreeError::Initialization(format!(
            "Failed to set global subscriber: {}",
            e
        )));
    }

    info!("Logging system initialized successfully");
    Ok(())
}
/// Take one system-metrics sample and append it to the in-memory history.
///
/// Returns `Ok(())` without sampling when `enable_metrics` is off. The history
/// keeps only the 1000 newest samples; older ones are dropped from the front.
///
/// # Errors
/// Propagates any error returned by `collect_system_metrics`.
#[instrument(skip(self))]
pub async fn record_system_metrics(&self) -> AptOstreeResult<()> {
    const HISTORY_LIMIT: usize = 1000;

    if self.config.enable_metrics {
        debug!("Recording system metrics");
        let sample = self.collect_system_metrics().await?;

        {
            let mut history = self.metrics.lock().await;
            history.push(sample.clone());
            // Zero when the history is within the limit, making the drain a no-op.
            let overflow = history.len().saturating_sub(HISTORY_LIMIT);
            history.drain(..overflow);
        }

        debug!("System metrics recorded: {:?}", sample);
    }

    Ok(())
}
/// Collect a snapshot of system metrics.
///
/// Only `timestamp` and `uptime` carry real data; every other field is a zero
/// placeholder until real collection is implemented.
///
/// `uptime` is the first field of `/proc/uptime` (seconds since boot on
/// Linux). If that file is unavailable or malformed, it falls back to how long
/// this manager has existed, which is a lower bound on system uptime. The
/// previous implementation stored seconds since the Unix epoch here, which is
/// a wall-clock timestamp rather than an uptime.
///
/// # Errors
/// None at present.
async fn collect_system_metrics(&self) -> AptOstreeResult<SystemMetrics> {
    let timestamp = Utc::now();

    // `/proc/uptime` is a tiny in-memory pseudo-file, so a synchronous read
    // here does not meaningfully block the async executor.
    let uptime = std::fs::read_to_string("/proc/uptime")
        .ok()
        .and_then(|raw| {
            raw.split_whitespace()
                .next()
                .and_then(|secs| secs.parse::<f64>().ok())
        })
        .filter(|secs| secs.is_finite() && *secs >= 0.0)
        .map(|secs| secs as u64)
        .unwrap_or_else(|| self.start_time.elapsed().as_secs());

    Ok(SystemMetrics {
        timestamp,
        cpu_usage: 0.0,                // Would get from /proc/stat
        memory_usage: 0,               // Would get from /proc/meminfo
        total_memory: 0,               // Would get from /proc/meminfo
        disk_usage: 0,                 // Would get from df
        total_disk: 0,                 // Would get from df
        active_transactions: 0,        // Would get from transaction manager
        pending_deployments: 0,        // Would get from OSTree manager
        ostree_repo_size: 0,           // Would get from OSTree repo
        apt_cache_size: 0,             // Would get from APT cache
        uptime,
        load_average: [0.0, 0.0, 0.0], // Would get from /proc/loadavg
    })
}
/// Store the outcome of one timed operation.
///
/// * `operation_type` – label for the operation.
/// * `duration` – how long it took; stored as whole milliseconds.
/// * `success` / `error_message` – outcome and optional failure text.
/// * `context` – extra key/value pairs stored verbatim.
///
/// Returns `Ok(())` without recording when `enable_performance_monitoring` is
/// off. The history keeps only the 1000 newest records.
#[instrument(skip(self, context))]
pub async fn record_performance_metrics(
    &self,
    operation_type: &str,
    duration: Duration,
    success: bool,
    error_message: Option<String>,
    context: HashMap<String, String>,
) -> AptOstreeResult<()> {
    const HISTORY_LIMIT: usize = 1000;

    if !self.config.enable_performance_monitoring {
        return Ok(());
    }
    debug!("Recording performance metrics for operation: {}", operation_type);

    let record = PerformanceMetrics {
        timestamp: Utc::now(),
        operation_type: operation_type.to_owned(),
        duration_ms: duration.as_millis() as u64,
        success,
        error_message,
        context,
    };

    let mut history = self.performance_metrics.lock().await;
    history.push(record.clone());
    // Zero when the history is within the limit, making the drain a no-op.
    let overflow = history.len().saturating_sub(HISTORY_LIMIT);
    history.drain(..overflow);
    // Release the lock before logging.
    drop(history);

    debug!("Performance metrics recorded: {:?}", record);
    Ok(())
}
/// Register a transaction so its progress and outcome can be tracked.
///
/// Inserts a fresh record (progress `0.0`, not finished, `success == false`)
/// under `transaction_id`, replacing any existing record with the same ID.
/// Returns `Ok(())` without registering when `enable_transaction_monitoring`
/// is off.
#[instrument(skip(self))]
pub async fn start_transaction_monitoring(
    &self,
    transaction_id: &str,
    transaction_type: &str,
    packages_count: u32,
    packages_size: u64,
) -> AptOstreeResult<()> {
    if self.config.enable_transaction_monitoring {
        debug!("Starting transaction monitoring for: {}", transaction_id);

        let record = TransactionMetrics {
            transaction_id: transaction_id.to_owned(),
            transaction_type: transaction_type.to_owned(),
            start_time: Utc::now(),
            end_time: None,
            duration_ms: None,
            success: false,
            error_message: None,
            packages_count,
            packages_size,
            progress: 0.0,
        };

        // The guard is a temporary, so the lock is released at the end of this statement.
        self.transaction_metrics
            .lock()
            .await
            .insert(transaction_id.to_owned(), record);

        info!("Transaction monitoring started: {} ({})", transaction_id, transaction_type);
    }
    Ok(())
}
/// Overwrite the stored progress of a tracked transaction.
///
/// `progress` is stored as given, with no clamping or validation. An unknown
/// `transaction_id` is silently ignored, and the call is a no-op when
/// `enable_transaction_monitoring` is off.
#[instrument(skip(self))]
pub async fn update_transaction_progress(
    &self,
    transaction_id: &str,
    progress: f64,
) -> AptOstreeResult<()> {
    if self.config.enable_transaction_monitoring {
        debug!("Updating transaction progress: {} -> {:.1}%", transaction_id, progress * 100.0);

        let mut tracked = self.transaction_metrics.lock().await;
        if let Some(entry) = tracked.get_mut(transaction_id) {
            entry.progress = progress;
        }
    }
    Ok(())
}
/// Complete transaction monitoring
#[instrument(skip(self))]
pub async fn complete_transaction_monitoring(
&self,
transaction_id: &str,
success: bool,
error_message: Option<String>,
) -> AptOstreeResult<()> {
if !self.config.enable_transaction_monitoring {
return Ok(());
}
debug!("Completing transaction monitoring for: {}", transaction_id);
{
let mut tx_metrics = self.transaction_metrics.lock().await;
if let Some(metrics) = tx_metrics.get_mut(transaction_id) {
metrics.end_time = Some(Utc::now());
metrics.duration_ms = Some(metrics.end_time
.unwrap()
.signed_duration_since(metrics.start_time)
.num_milliseconds() as u64);
metrics.success = success;
metrics.error_message = error_message;
}
}
info!("Transaction monitoring completed: {} (success: {})", transaction_id, success);
Ok(())
}
/// Run every built-in health check once and return the results.
///
/// Checks run sequentially in a fixed order: OSTree repository, APT database,
/// system resources, daemon. The results are also appended to the stored
/// history, which keeps only the 100 newest entries. Returns an empty list
/// when `enable_health_checks` is off.
#[instrument(skip(self))]
pub async fn run_health_checks(&self) -> AptOstreeResult<Vec<HealthCheckResult>> {
    const STORED_RESULTS_LIMIT: usize = 100;

    if !self.config.enable_health_checks {
        return Ok(Vec::new());
    }
    debug!("Running health checks");

    // Elements are evaluated left to right, so the checks run in this order.
    let results = vec![
        self.check_ostree_health().await,
        self.check_apt_health().await,
        self.check_system_resources().await,
        self.check_daemon_health().await,
    ];

    {
        let mut stored = self.health_checks.lock().await;
        stored.extend(results.iter().cloned());
        // Zero when the history is within the limit, making the drain a no-op.
        let overflow = stored.len().saturating_sub(STORED_RESULTS_LIMIT);
        stored.drain(..overflow);
    }

    debug!("Health checks completed: {} results", results.len());
    Ok(results)
}
/// Probe the OSTree repository.
///
/// Placeholder: no repository inspection is performed yet, so the result is
/// always `Healthy` with empty `details`.
async fn check_ostree_health(&self) -> HealthCheckResult {
    let started = Instant::now();

    // In a real implementation, this would check OSTree repository integrity
    let (status, message) = (
        HealthStatus::Healthy,
        String::from("OSTree repository is healthy"),
    );

    HealthCheckResult {
        check_name: String::from("ostree_repository"),
        status,
        message,
        duration_ms: started.elapsed().as_millis() as u64,
        timestamp: Utc::now(),
        details: HashMap::new(),
    }
}
/// Probe the APT database.
///
/// Placeholder: no database inspection is performed yet, so the result is
/// always `Healthy` with empty `details`.
async fn check_apt_health(&self) -> HealthCheckResult {
    let started = Instant::now();

    // In a real implementation, this would check APT database integrity
    let (status, message) = (
        HealthStatus::Healthy,
        String::from("APT database is healthy"),
    );

    HealthCheckResult {
        check_name: String::from("apt_database"),
        status,
        message,
        duration_ms: started.elapsed().as_millis() as u64,
        timestamp: Utc::now(),
        details: HashMap::new(),
    }
}
/// Probe system resource availability.
///
/// Placeholder: no resource inspection is performed yet, so the result is
/// always `Healthy` with empty `details`.
async fn check_system_resources(&self) -> HealthCheckResult {
    let started = Instant::now();

    // In a real implementation, this would check system resource availability
    let (status, message) = (
        HealthStatus::Healthy,
        String::from("System resources are adequate"),
    );

    HealthCheckResult {
        check_name: String::from("system_resources"),
        status,
        message,
        duration_ms: started.elapsed().as_millis() as u64,
        timestamp: Utc::now(),
        details: HashMap::new(),
    }
}
/// Probe the daemon.
///
/// Placeholder: no daemon status query is performed yet, so the result is
/// always `Healthy` with empty `details`.
async fn check_daemon_health(&self) -> HealthCheckResult {
    let started = Instant::now();

    // In a real implementation, this would check daemon status
    let (status, message) = (
        HealthStatus::Healthy,
        String::from("Daemon is running and healthy"),
    );

    HealthCheckResult {
        check_name: String::from("daemon_health"),
        status,
        message,
        duration_ms: started.elapsed().as_millis() as u64,
        timestamp: Utc::now(),
        details: HashMap::new(),
    }
}
/// Get monitoring statistics
///
/// Summarises the manager's current state:
/// * `uptime_seconds` – time since this manager was constructed.
/// * `metrics_collected`, `performance_metrics_collected`,
///   `health_checks_performed` – number of entries currently held in each
///   bounded history.
/// * `active_transactions` – tracked transactions that have not finished.
/// * `config` – a copy of the active configuration.
///
/// Completed transactions stay in the transaction map with `end_time` set, so
/// counting the whole map (as this method used to) reported finished
/// transactions as active. Only entries without an `end_time` are counted now.
///
/// # Errors
/// None at present.
pub async fn get_statistics(&self) -> AptOstreeResult<MonitoringStatistics> {
    let uptime_seconds = self.start_time.elapsed().as_secs();

    // Each guard is a temporary released at the end of its own statement, so
    // no two stores are locked at the same time.
    let metrics_collected = self.metrics.lock().await.len();
    let performance_metrics_collected = self.performance_metrics.lock().await.len();
    let active_transactions = self
        .transaction_metrics
        .lock()
        .await
        .values()
        .filter(|tx| tx.end_time.is_none())
        .count();
    let health_checks_performed = self.health_checks.lock().await.len();

    Ok(MonitoringStatistics {
        uptime_seconds,
        metrics_collected,
        performance_metrics_collected,
        active_transactions,
        health_checks_performed,
        config: self.config.clone(),
    })
}
/// Serialise everything currently stored into a pretty-printed JSON document.
///
/// The export holds an export timestamp plus copies of the system metrics,
/// performance records, transaction records and health-check results.
///
/// # Errors
/// Returns `AptOstreeError::Initialization` when JSON serialisation fails.
#[instrument(skip(self))]
pub async fn export_metrics(&self) -> AptOstreeResult<String> {
    debug!("Exporting metrics");

    let timestamp = Utc::now();
    let snapshot = {
        // Acquire the four locks in a fixed order and hold them together while
        // copying, so the copies are taken under all locks at once.
        let system = self.metrics.lock().await;
        let performance = self.performance_metrics.lock().await;
        let transactions = self.transaction_metrics.lock().await;
        let health = self.health_checks.lock().await;

        MetricsExport {
            timestamp,
            system_metrics: system.clone(),
            performance_metrics: performance.clone(),
            transaction_metrics: transactions.values().cloned().collect(),
            health_checks: health.clone(),
        }
    };

    match serde_json::to_string_pretty(&snapshot) {
        Ok(json) => Ok(json),
        Err(e) => Err(AptOstreeError::Initialization(format!(
            "Failed to export metrics: {}",
            e
        ))),
    }
}
}
/// Monitoring statistics
///
/// Summary returned by `MonitoringManager::get_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitoringStatistics {
/// Uptime in seconds, measured from construction of the manager
pub uptime_seconds: u64,
/// Number of system-metric samples currently stored (the history is capped
/// at 1000, so this is not a lifetime total)
pub metrics_collected: usize,
/// Number of performance records currently stored (history capped at 1000)
pub performance_metrics_collected: usize,
/// Number of active transactions
pub active_transactions: usize,
/// Number of health-check results currently stored (history capped at 100)
pub health_checks_performed: usize,
/// Monitoring configuration (copy of the manager's settings)
pub config: MonitoringConfig,
}
/// Metrics export structure
///
/// Top-level document that `MonitoringManager::export_metrics` serialises to
/// JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricsExport {
/// Export timestamp (UTC), taken when the export starts
pub timestamp: DateTime<Utc>,
/// System metrics: copy of the stored sample history
pub system_metrics: Vec<SystemMetrics>,
/// Performance metrics: copy of the stored record history
pub performance_metrics: Vec<PerformanceMetrics>,
/// Transaction metrics: values of the manager's transaction map, in the
/// map's (unspecified) iteration order
pub transaction_metrics: Vec<TransactionMetrics>,
/// Health checks: copy of the stored result history
pub health_checks: Vec<HealthCheckResult>,
}
/// Performance monitoring wrapper
///
/// Times a single operation: `new` captures the start instant, and consuming
/// the monitor with `success` or `failure` reports the elapsed time to the
/// manager.
pub struct PerformanceMonitor {
/// Manager that receives the finished measurement.
monitoring_manager: Arc<MonitoringManager>,
/// Label recorded as the operation type.
operation_type: String,
/// Instant captured at construction; the duration is measured from here.
start_time: Instant,
/// Caller-supplied key/value pairs forwarded with the measurement.
context: HashMap<String, String>,
}
impl PerformanceMonitor {
/// Start timing an operation.
///
/// * `monitoring_manager` – manager that will receive the measurement.
/// * `operation_type` – label for the operation being timed.
/// * `context` – extra key/value pairs to attach to the record.
///
/// The clock starts as soon as this function runs.
pub fn new(
    monitoring_manager: Arc<MonitoringManager>,
    operation_type: &str,
    context: HashMap<String, String>,
) -> Self {
    let operation_type = operation_type.to_owned();
    let start_time = Instant::now();

    Self {
        monitoring_manager,
        operation_type,
        start_time,
        context,
    }
}
/// Record success
pub async fn success(self) -> AptOstreeResult<()> {
let duration = self.start_time.elapsed();
self.monitoring_manager
.record_performance_metrics(
&self.operation_type,
duration,
true,
None,
self.context,
)
.await
}
/// Record failure
pub async fn failure(self, error_message: String) -> AptOstreeResult<()> {
let duration = self.start_time.elapsed();
self.monitoring_manager
.record_performance_metrics(
&self.operation_type,
duration,
false,
Some(error_message),
self.context,
)
.await
}
}
/// Transaction monitor
///
/// Handle for one monitored transaction. It forwards progress updates and the
/// final outcome to the manager under `transaction_id`.
pub struct TransactionMonitor {
/// Manager that stores the transaction record.
monitoring_manager: Arc<MonitoringManager>,
/// ID under which the transaction is tracked.
transaction_id: String,
}
impl TransactionMonitor {
/// Create a new transaction monitor
pub fn new(
monitoring_manager: Arc<MonitoringManager>,
transaction_id: &str,
transaction_type: &str,
packages_count: u32,
packages_size: u64,
) -> Self {
let transaction_id = transaction_id.to_string();
let transaction_type = transaction_type.to_string();
// Start transaction monitoring in background
let manager_clone = monitoring_manager.clone();
let tx_id = transaction_id.clone();
let tx_type = transaction_type.clone();
tokio::spawn(async move {
if let Err(e) = manager_clone
.start_transaction_monitoring(&tx_id, &tx_type, packages_count, packages_size)
.await
{
error!("Failed to start transaction monitoring: {}", e);
}
});
Self {
monitoring_manager,
transaction_id,
}
}
/// Report new progress for this transaction.
///
/// Forwards `progress` unchanged to
/// `MonitoringManager::update_transaction_progress` and returns its result.
pub async fn update_progress(&self, progress: f64) -> AptOstreeResult<()> {
    let manager = &self.monitoring_manager;
    manager
        .update_transaction_progress(&self.transaction_id, progress)
        .await
}
/// Complete with success
pub async fn success(self) -> AptOstreeResult<()> {
self.monitoring_manager
.complete_transaction_monitoring(&self.transaction_id, true, None)
.await
}
/// Complete with failure
pub async fn failure(self, error_message: String) -> AptOstreeResult<()> {
self.monitoring_manager
.complete_transaction_monitoring(&self.transaction_id, false, Some(error_message))
.await
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a manager with the stock configuration, shared behind an `Arc`.
    fn default_manager() -> Arc<MonitoringManager> {
        let config = MonitoringConfig::default();
        Arc::new(MonitoringManager::new(config).unwrap())
    }

    /// A manager can be constructed and can install the logging subscriber.
    // NOTE(review): installing a global subscriber presumably succeeds only
    // once per process, so this relies on no other test in the binary doing it.
    #[tokio::test]
    async fn test_monitoring_manager_creation() {
        let manager = default_manager();
        let outcome = manager.init_logging();
        assert!(outcome.is_ok());
    }

    /// Recording a successful operation through `PerformanceMonitor` returns `Ok`.
    #[tokio::test]
    async fn test_performance_monitoring() {
        let manager = default_manager();
        let timer = PerformanceMonitor::new(Arc::clone(&manager), "test_operation", HashMap::new());
        assert!(timer.success().await.is_ok());
    }

    /// Progress updates and completion through `TransactionMonitor` return `Ok`.
    #[tokio::test]
    async fn test_transaction_monitoring() {
        let manager = default_manager();
        let tracker = TransactionMonitor::new(
            Arc::clone(&manager),
            "test_transaction",
            "test_type",
            5,
            1024,
        );
        assert!(tracker.update_progress(0.5).await.is_ok());
        assert!(tracker.success().await.is_ok());
    }

    /// Health checks yield at least one result, each with a name and a message.
    #[tokio::test]
    async fn test_health_checks() {
        let manager = default_manager();
        let results = manager.run_health_checks().await.unwrap();
        assert!(!results.is_empty());
        for check in &results {
            assert!(!check.check_name.is_empty());
            assert!(!check.message.is_empty());
        }
    }
}