2025-08-12 07:21:41 +08:00

21 KiB
Raw Blame History

内存管理优化方案

当前问题分析

1. 内存拷贝问题

当前架构中存在的主要内存问题:

// Current implementation - every event delivery clones the entire frame buffer
pub struct FrameCapturedEvent {
    pub frame_data: Vec<u8>, // 640x480 RGB = ~900KB per frame
}

// Cost analysis:
// - 30 FPS = ~27MB/s of memory copies
// - When the event bus broadcasts, every subscriber clones the data
// - 3 subscribers = ~81MB/s of memory traffic

2. 内存分配压力

  • 每帧都需要新的内存分配
  • 频繁的堆分配与释放会导致延迟峰值(Rust 没有 GC,压力来自内存分配器)
  • 内存碎片化问题

3. 缓冲区管理

  • Detection模块维护独立的帧缓冲
  • Storage模块也有自己的缓冲
  • 重复存储相同数据

优化方案详细设计

方案1: 零拷贝架构 (Zero-Copy Architecture)

A. 使用Arc实现共享不可变数据

use std::sync::Arc;
use bytes::Bytes;

// New event structure - shares the frame payload through an Arc.
// Cloning the event clones the small header fields and bumps the Arc
// reference count; the pixel bytes themselves are not duplicated.
#[derive(Clone, Debug)]
pub struct FrameCapturedEvent {
    pub frame_id: u64,
    pub timestamp: chrono::DateTime<chrono::Utc>,
    pub metadata: FrameMetadata,
    pub frame_data: Arc<FrameData>, // shared reference; cloning only increments the reference count
}

// Frame payload wrapper: the raw bytes together with the information needed to interpret them
#[derive(Debug)]
pub struct FrameData {
    pub data: Bytes,        // `bytes` crate buffer, supports zero-copy slicing
    pub width: u32,
    pub height: u32,
    pub format: FrameFormat,
}

// Capture settings attached to every frame event.
//
// `Default` is derived because `FrameCapturedEvent::new_zero_copy` builds its
// metadata with `FrameMetadata::default()`; without the derive that call does
// not compile. The default value is all zeros with no temperature reading.
// `PartialEq` is derived so metadata can be compared in tests.
// NOTE(review): the units of `exposure_time`, `gain` and `temperature` are not
// stated anywhere in this document - confirm against the camera driver.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct FrameMetadata {
    pub camera_id: u32,
    pub exposure_time: f32,
    pub gain: f32,
    pub temperature: Option<f32>, // `None` when no reading is available
}

// Pixel / bitstream layout of a frame payload.
//
// The enum carries no data, so it is `Copy` (it is passed by value to
// `FramePool::new`) and comparable with `==`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FrameFormat {
    RGB888,
    YUV420,
    JPEG,
    H264Frame,
}

// Example implementation
impl FrameCapturedEvent {
    // Wraps an owned buffer in a shareable event.
    //
    // `data` is moved into a `Bytes` value, the format is fixed to RGB888, the
    // timestamp is taken from the current UTC time and the metadata is the
    // default value.
    pub fn new_zero_copy(
        frame_id: u64,
        data: Vec<u8>,
        width: u32,
        height: u32,
    ) -> Self {
        // Converting to `Bytes` up front is what later allows zero-copy slicing.
        let payload = FrameData {
            data: Bytes::from(data),
            width,
            height,
            format: FrameFormat::RGB888,
        };

        Self {
            frame_id,
            timestamp: chrono::Utc::now(),
            metadata: FrameMetadata::default(),
            frame_data: Arc::new(payload),
        }
    }

    // Read-only view of the frame bytes.
    pub fn data(&self) -> &[u8] {
        self.frame_data.data.as_ref()
    }

    // Zero-copy sub-slice `start..end` of the frame bytes.
    // The `bytes` crate documents that `Bytes::slice` panics when the range is
    // out of bounds or `start > end`.
    pub fn slice(&self, start: usize, end: usize) -> Bytes {
        let payload = &self.frame_data.data;
        payload.slice(start..end)
    }
}

B. 优化事件总线

use tokio::sync::broadcast;
use std::sync::Arc;

// Broadcast-based event bus whose messages are `Arc<SystemEvent>`, so each
// subscriber receives a reference-counted handle to the event.
pub struct OptimizedEventBus {
    // Sender kept behind an Arc so the bus can be shared without cloning the channel
    sender: Arc<broadcast::Sender<Arc<SystemEvent>>>,
    // Capacity the channel was created with
    capacity: usize,
}

impl OptimizedEventBus {
    // Creates a bus backed by a broadcast channel of the given capacity.
    // The initial receiver is discarded; subscribers obtain their own through `subscribe`.
    pub fn new(capacity: usize) -> Self {
        let (sender, _) = broadcast::channel(capacity);
        let sender = Arc::new(sender);
        Self { sender, capacity }
    }

    // Wraps the event in an Arc and broadcasts it.
    // A failed send is reported as the error "No subscribers".
    pub fn publish(&self, event: SystemEvent) -> Result<()> {
        match self.sender.send(Arc::new(event)) {
            Ok(_) => Ok(()),
            Err(_) => Err(anyhow::anyhow!("No subscribers")),
        }
    }

    // Returns a new receiver that yields the Arc-wrapped events.
    pub fn subscribe(&self) -> broadcast::Receiver<Arc<SystemEvent>> {
        let sender = &self.sender;
        sender.subscribe()
    }
}

方案2: 帧池化 (Frame Pooling)

A. 对象池实现

use std::sync::{Arc, Mutex};
use std::collections::VecDeque;

/// Pool of fixed-size frame buffers that recycles allocations between captures.
pub struct FramePool {
    /// Idle buffers waiting to be reused; shared with every outstanding [`PooledFrame`].
    pool: Arc<Mutex<VecDeque<Vec<u8>>>>,
    /// Size in bytes of every buffer handed out by this pool.
    frame_size: usize,
    /// Upper bound on the number of idle buffers kept for reuse.
    max_pool_size: usize,
    /// Number of fresh allocations made so far (incremented on every pool miss,
    /// never decremented).
    allocated_count: Arc<AtomicUsize>,
}

/// Locks the idle-buffer queue, recovering the guard if the mutex is poisoned.
///
/// The queue only stores plain byte buffers, so a panic in another thread
/// cannot leave it logically inconsistent. Recovering instead of unwrapping
/// also keeps `Drop for PooledFrame` from panicking while a thread is already
/// unwinding.
fn lock_frame_queue(
    pool: &Mutex<VecDeque<Vec<u8>>>,
) -> std::sync::MutexGuard<'_, VecDeque<Vec<u8>>> {
    pool.lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner)
}

impl FramePool {
    /// Creates an empty pool for frames of the given dimensions and format.
    ///
    /// No buffers are allocated up front; they are created lazily by
    /// [`FramePool::acquire`]. At most `max_pool_size` idle buffers are retained.
    pub fn new(width: u32, height: u32, format: FrameFormat, max_pool_size: usize) -> Self {
        let frame_size = Self::calculate_frame_size(width, height, format);

        Self {
            pool: Arc::new(Mutex::new(VecDeque::with_capacity(max_pool_size))),
            frame_size,
            max_pool_size,
            allocated_count: Arc::new(AtomicUsize::new(0)),
        }
    }

    /// Takes a zero-filled buffer from the pool, allocating a new one on a miss.
    pub fn acquire(&self) -> PooledFrame {
        // The guard is a temporary, so the lock is released before the
        // (potentially slow) zero-fill or allocation below.
        let recycled = lock_frame_queue(&self.pool).pop_front();

        let buffer = match recycled {
            Some(mut buf) => {
                // Reuse the allocation but never leak the previous frame's pixels.
                buf.clear();
                buf.resize(self.frame_size, 0);
                buf
            }
            None => {
                self.allocated_count.fetch_add(1, Ordering::Relaxed);
                vec![0u8; self.frame_size]
            }
        };

        PooledFrame {
            buffer,
            pool: Arc::clone(&self.pool),
            frame_size: self.frame_size,
            max_pool_size: self.max_pool_size,
        }
    }

    /// Computes the buffer size in bytes for one frame.
    ///
    /// The arithmetic is done in `usize` (saturating) rather than `u32`, so
    /// large dimensions such as 40000x40000 RGB no longer overflow. The JPEG
    /// and H264 sizes are estimates, not exact values.
    fn calculate_frame_size(width: u32, height: u32, format: FrameFormat) -> usize {
        let pixels = (width as usize).saturating_mul(height as usize);
        match format {
            FrameFormat::RGB888 => pixels.saturating_mul(3),
            FrameFormat::YUV420 => pixels.saturating_mul(3) / 2,
            FrameFormat::JPEG => pixels,          // estimate
            FrameFormat::H264Frame => pixels / 2, // estimate
        }
    }

    /// Returns a snapshot of the pool counters.
    pub fn stats(&self) -> PoolStats {
        let pooled = lock_frame_queue(&self.pool).len();
        PoolStats {
            pooled,
            allocated: self.allocated_count.load(Ordering::Relaxed),
            frame_size: self.frame_size,
        }
    }
}

/// Frame buffer borrowed from a [`FramePool`]; dropping it hands the buffer
/// back to the pool.
pub struct PooledFrame {
    buffer: Vec<u8>,
    pool: Arc<Mutex<VecDeque<Vec<u8>>>>,
    /// Size in bytes the buffer was created with (not read after construction).
    frame_size: usize,
    /// Copy of the owning pool's limit, used by `Drop` to cap the idle queue.
    max_pool_size: usize,
}

impl PooledFrame {
    /// Read-only view of the frame bytes.
    pub fn as_slice(&self) -> &[u8] {
        &self.buffer
    }

    /// Mutable view of the frame bytes, e.g. as a capture target.
    pub fn as_mut_slice(&mut self) -> &mut [u8] {
        &mut self.buffer
    }
}

impl Drop for PooledFrame {
    fn drop(&mut self) {
        // Declared before the guard so that, when the pool is full, the buffer
        // is freed only after the lock has been released.
        let buffer = std::mem::take(&mut self.buffer);
        let mut idle = lock_frame_queue(&self.pool);

        // Compare against the configured limit: `VecDeque::capacity()` is only
        // guaranteed to be *at least* the requested capacity, so using it as
        // the bound could retain more buffers than `max_pool_size`.
        if idle.len() < self.max_pool_size {
            idle.push_back(buffer);
        }
    }
}

/// Snapshot of [`FramePool`] counters.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PoolStats {
    /// Idle buffers currently held by the pool.
    pub pooled: usize,
    /// Total number of buffers allocated so far.
    pub allocated: usize,
    /// Size in bytes of each buffer.
    pub frame_size: usize,
}

B. Camera模块集成

// camera.rs - optimized version
// Camera controller that captures into pooled buffers and publishes frames on the event bus.
pub struct OptimizedCameraController {
    // Camera settings; `capture_loop` reads the optional `width` / `height` from it
    config: CameraConfig,
    // Bus on which `SystemEvent::FrameCaptured` events are published
    event_bus: EventBus,
    // Source of reusable capture buffers
    frame_pool: FramePool,
    // Counter that supplies the `frame_id` of each published frame
    frame_counter: AtomicU64,
}

impl OptimizedCameraController {
    // Captures frames at ~30 FPS and publishes each one as a
    // `SystemEvent::FrameCaptured` event.
    //
    // The loop only ends when capturing or publishing returns an error, which
    // is propagated to the caller.
    pub async fn capture_loop(&mut self) -> Result<()> {
        // A fixed-period ticker keeps the frame period at 33 ms. Sleeping for
        // 33 ms *after* each capture would add the capture time to every
        // period and drift below 30 FPS.
        let mut ticker = tokio::time::interval(Duration::from_millis(33));
        // If a capture overruns, continue from "now" instead of firing a burst
        // of catch-up ticks.
        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);

        loop {
            // The first tick completes immediately, so the first frame is not delayed.
            ticker.tick().await;

            // Take a capture buffer from the pool
            let mut pooled_frame = self.frame_pool.acquire();

            // Capture straight into the pooled buffer
            self.capture_to_buffer(pooled_frame.as_mut_slice()).await?;

            // Copy the pixels into a shareable `Bytes` value.
            // NOTE(review): this is still one copy per frame; the pool only
            // saves the capture scratch buffer, not the published payload.
            let data = Bytes::copy_from_slice(pooled_frame.as_slice());

            // Return the buffer to the pool right away. Without the explicit
            // drop it would stay checked out until the end of the loop body,
            // i.e. for the whole wait until the next frame.
            drop(pooled_frame);

            // Wrap the payload for Arc sharing
            let frame_data = Arc::new(FrameData {
                data,
                width: self.config.width.unwrap_or(640),
                height: self.config.height.unwrap_or(480),
                format: FrameFormat::RGB888,
            });

            // Build the event
            let event = FrameCapturedEvent {
                frame_id: self.frame_counter.fetch_add(1, Ordering::Relaxed),
                timestamp: chrono::Utc::now(),
                metadata: self.create_metadata(),
                frame_data,
            };

            // Publish the event
            self.event_bus.publish(SystemEvent::FrameCaptured(event))?;
        }
    }
}

方案3: 环形缓冲区 (Ring Buffer)

A. 内存映射环形缓冲

use memmap2::{MmapMut, MmapOptions};
use std::sync::atomic::{AtomicUsize, Ordering};

/// Memory-mapped ring buffer for fixed-size frames.
pub struct MmapRingBuffer {
    /// Mapping that backs all frame slots (`capacity * frame_size` bytes).
    mmap: Arc<MmapMut>,
    /// Size in bytes of one frame slot.
    frame_size: usize,
    /// Number of frame slots in the ring.
    capacity: usize,
    /// Monotonically increasing write counter; the slot index is this value modulo `capacity`.
    write_pos: Arc<AtomicUsize>,
    /// Read counter used by `available_frames`.
    read_pos: Arc<AtomicUsize>,
    /// Precomputed byte offset of every slot (`i * frame_size`).
    frame_offsets: Vec<usize>,
}

impl MmapRingBuffer {
    /// Creates a ring buffer with `capacity` slots of `frame_size` bytes each,
    /// backed by a memory-mapped temporary file.
    ///
    /// # Errors
    /// Fails when `capacity` or `frame_size` is zero, when their product
    /// overflows `usize`, or when the temporary file cannot be created, sized
    /// or mapped.
    pub fn new(capacity: usize, frame_size: usize) -> Result<Self> {
        // A zero capacity would make every `% self.capacity` below panic.
        if capacity == 0 || frame_size == 0 {
            return Err(anyhow::anyhow!(
                "Ring buffer capacity and frame size must be non-zero"
            ));
        }
        let total_size = capacity
            .checked_mul(frame_size)
            .ok_or_else(|| anyhow::anyhow!("Ring buffer size overflows usize"))?;

        // Temporary file that backs the mapping
        let temp_file = tempfile::tempfile()?;
        temp_file.set_len(total_size as u64)?;

        // SAFETY: the file was created just above and its handle is not handed
        // to anyone else in this function, so nothing visible here can
        // truncate or modify it behind the mapping.
        let mmap = unsafe {
            MmapOptions::new()
                .len(total_size)
                .map_mut(&temp_file)?
        };

        // Precompute the byte offset of every slot
        let frame_offsets: Vec<usize> = (0..capacity)
            .map(|i| i * frame_size)
            .collect();

        Ok(Self {
            mmap: Arc::new(mmap),
            frame_size,
            capacity,
            write_pos: Arc::new(AtomicUsize::new(0)),
            read_pos: Arc::new(AtomicUsize::new(0)),
            frame_offsets,
        })
    }

    /// Copies one frame into the next slot and returns that slot's index.
    ///
    /// # Errors
    /// Fails when `frame_data.len()` differs from the configured frame size.
    ///
    /// NOTE(review): the write counter is advanced before the bytes are
    /// copied, and nothing stops a reader from holding a slice returned by
    /// `read_frame` for the slot being overwritten. Callers must make sure a
    /// slot is not read while it is written; this type does not enforce it.
    pub fn write_frame(&self, frame_data: &[u8]) -> Result<usize> {
        if frame_data.len() != self.frame_size {
            return Err(anyhow::anyhow!("Frame size mismatch"));
        }

        let pos = self.write_pos.fetch_add(1, Ordering::AcqRel) % self.capacity;
        let offset = self.frame_offsets[pos];

        // The mapping lives behind an `Arc`, so a `&mut` slice cannot be
        // borrowed from it (the original `&mut self.mmap[..]` does not
        // compile). Write through the mapping's base pointer instead.
        //
        // SAFETY: `offset + frame_size <= capacity * frame_size`, which is the
        // length of the mapping, so the destination range is in bounds; the
        // mapping was created writable by `map_mut`. `ptr::copy` is used
        // rather than `copy_nonoverlapping` because `frame_data` may itself be
        // a slice of this buffer. Absence of concurrent access to the slot is
        // the caller's responsibility (see the note above).
        unsafe {
            let dst = (self.mmap.as_ptr() as *mut u8).add(offset);
            std::ptr::copy(frame_data.as_ptr(), dst, self.frame_size);
        }

        Ok(pos)
    }

    /// Returns the bytes of the slot that `position` maps to (zero-copy).
    ///
    /// `position` is reduced modulo the capacity, so any counter value is accepted.
    pub fn read_frame(&self, position: usize) -> &[u8] {
        let offset = self.frame_offsets[position % self.capacity];
        &self.mmap[offset..offset + self.frame_size]
    }

    /// Returns the slot index the next write will use.
    pub fn current_write_pos(&self) -> usize {
        self.write_pos.load(Ordering::Acquire) % self.capacity
    }

    /// Returns the number of frames written but not yet read, capped at the capacity.
    pub fn available_frames(&self) -> usize {
        let write = self.write_pos.load(Ordering::Acquire);
        let read = self.read_pos.load(Ordering::Acquire);
        write.saturating_sub(read).min(self.capacity)
    }
}

/// Read-only window over a range of positions in a [`MmapRingBuffer`].
pub struct RingBufferView {
    buffer: Arc<MmapRingBuffer>,
    start_pos: usize,
    end_pos: usize,
}

impl RingBufferView {
    /// Creates a view covering the half-open position range `start_pos..end_pos`.
    pub fn new(buffer: Arc<MmapRingBuffer>, start_pos: usize, end_pos: usize) -> Self {
        Self { buffer, start_pos, end_pos }
    }

    /// Yields the frame stored at each position of the view, in order.
    pub fn iter_frames(&self) -> impl Iterator<Item = &[u8]> {
        let ring = &self.buffer;
        let positions = self.start_pos..self.end_pos;
        positions.map(move |index| ring.read_frame(index))
    }
}

B. Detection模块集成

// detection.rs - optimized version
// Detection controller that reads frames from the shared ring buffer.
pub struct OptimizedDetectionController {
    config: DetectionConfig,
    // Bus on which `SystemEvent::MeteorDetected` events are published
    event_bus: EventBus,
    // Ring buffer polled by `detection_loop` for newly written frames
    ring_buffer: Arc<MmapRingBuffer>,
    // Per-frame metadata map (presumably keyed by ring position - TODO confirm)
    frame_metadata: Arc<RwLock<HashMap<usize, FrameMetadata>>>,
}

impl OptimizedDetectionController {
    // Polls the ring buffer every 100 ms, analyses newly written frames and
    // publishes a `SystemEvent::MeteorDetected` event for each detection.
    //
    // The loop only ends when analysis or publishing returns an error.
    pub async fn detection_loop(&mut self) -> Result<()> {
        let mut last_processed_pos = 0;

        loop {
            let current_pos = self.ring_buffer.current_write_pos();

            if current_pos != last_processed_pos {
                // `current_write_pos` is reduced modulo the ring capacity, so
                // after a wrap-around it is *smaller* than the last processed
                // position. The previous `>` test then never fired again and
                // detection stalled; on a wrap we now restart from slot 0.
                // NOTE(review): the frames between the old position and the
                // end of the ring are skipped on a wrap. Covering them needs
                // the ring capacity, which `MmapRingBuffer` does not expose.
                let start_pos = if current_pos > last_processed_pos {
                    last_processed_pos
                } else {
                    0
                };

                // Zero-copy view over the new frames
                let view = RingBufferView::new(
                    Arc::clone(&self.ring_buffer),
                    start_pos,
                    current_pos,
                );

                // Analyse the frame sequence
                if let Some(detection) = self.analyze_frames(view).await? {
                    // Publish the detection
                    self.event_bus.publish(SystemEvent::MeteorDetected(detection))?;
                }

                last_processed_pos = current_pos;
            }

            // Avoid busy-waiting
            tokio::time::sleep(Duration::from_millis(100)).await;
        }
    }

    // Computes the mean brightness of every frame in the view.
    // The detection algorithm itself is still a placeholder and always returns `None`.
    async fn analyze_frames(&self, view: RingBufferView) -> Result<Option<MeteorDetectedEvent>> {
        let brightness_values: Vec<f32> = view.iter_frames()
            .map(|frame| self.calculate_brightness_simd(frame))
            .collect();

        // Detection algorithm...
        Ok(None)
    }

    // Mean byte value of `frame`, computed with NEON. Returns 0.0 for an empty frame.
    //
    // The previous version widened only the low 4 of every 16 loaded bytes and
    // ignored the trailing `len % 16` bytes while still dividing by
    // `frame.len()`, so the result was roughly a quarter of the true mean.
    #[cfg(target_arch = "aarch64")]
    fn calculate_brightness_simd(&self, frame: &[u8]) -> f32 {
        use std::arch::aarch64::*;

        if frame.is_empty() {
            return 0.0;
        }

        let chunks = frame.chunks_exact(16);
        let tail = chunks.remainder();

        // SAFETY: every chunk is exactly 16 bytes, so `vld1q_u8` reads in
        // bounds. Assumes the target enables NEON, which is the default for
        // Rust's aarch64 targets.
        // NOTE(review): the four u32 lanes would wrap for frames larger than
        // roughly 64 MiB of 0xFF bytes.
        let simd_total = unsafe {
            let mut acc = vdupq_n_u32(0);
            for chunk in chunks {
                let bytes = vld1q_u8(chunk.as_ptr());
                // 16 x u8 -> 8 x u16 pairwise sums, covering all 16 bytes
                let pairs = vpaddlq_u8(bytes);
                // acc (4 x u32) += pairwise sums of the 8 x u16 lanes
                acc = vpadalq_u16(acc, pairs);
            }
            // Horizontal add of the four lanes
            u64::from(vaddvq_u32(acc))
        };

        let tail_total: u64 = tail.iter().map(|&b| u64::from(b)).sum();

        (simd_total + tail_total) as f32 / frame.len() as f32
    }

    // Portable fallback so the call in `analyze_frames` compiles on targets
    // other than aarch64. Same contract: mean byte value, 0.0 for an empty frame.
    #[cfg(not(target_arch = "aarch64"))]
    fn calculate_brightness_simd(&self, frame: &[u8]) -> f32 {
        if frame.is_empty() {
            return 0.0;
        }

        let total: u64 = frame.iter().map(|&b| u64::from(b)).sum();
        total as f32 / frame.len() as f32
    }
}

方案4: 分层内存管理

A. 内存层次结构

/// Tiered memory manager.
pub struct HierarchicalMemoryManager {
    // L1: hot data - the most recent frames, kept in memory
    hot_cache: Arc<RwLock<LruCache<u64, Arc<FrameData>>>>,

    // L2: warm data - backed by a memory-mapped file
    warm_storage: Arc<MmapRingBuffer>,

    // L3: cold data - stored compressed on disk
    cold_storage: Arc<ColdStorage>,

    // Statistics
    stats: Arc<MemoryStats>,
}

impl HierarchicalMemoryManager {
    /// Builds the three tiers from `config`.
    ///
    /// # Errors
    /// Fails when the warm ring buffer or the cold storage cannot be created.
    pub fn new(config: MemoryConfig) -> Result<Self> {
        Ok(Self {
            hot_cache: Arc::new(RwLock::new(
                LruCache::new(config.hot_cache_frames)
            )),
            warm_storage: Arc::new(MmapRingBuffer::new(
                config.warm_storage_frames,
                config.frame_size,
            )?),
            cold_storage: Arc::new(ColdStorage::new(config.cold_storage_path)?),
            stats: Arc::new(MemoryStats::default()),
        })
    }

    /// Stores a frame in the hot cache and schedules a copy into warm storage.
    ///
    /// The warm-storage write is best effort: it runs in the background and a
    /// failure does not fail this call, but it is reported on stderr instead
    /// of being silently discarded.
    pub async fn store_frame(&self, frame_id: u64, data: Arc<FrameData>) -> Result<()> {
        // Update the hot cache; the block releases the write lock before the work below
        {
            let mut cache = self.hot_cache.write().await;
            cache.put(frame_id, Arc::clone(&data));
        }

        // Copy into warm storage off the async executor: `write_frame` is a
        // synchronous copy of a whole frame into a file-backed mapping.
        let warm_storage = Arc::clone(&self.warm_storage);
        let data_clone = Arc::clone(&data);
        tokio::task::spawn_blocking(move || {
            if let Err(err) = warm_storage.write_frame(&data_clone.data) {
                eprintln!("warm storage write failed for frame {}: {}", frame_id, err);
            }
        });

        // Update statistics
        self.stats.record_store(data.data.len());

        Ok(())
    }

    /// Looks a frame up in L1, then L2, then L3, promoting it towards L1 on a hit.
    ///
    /// # Errors
    /// Fails when the frame is in neither L1 nor L2 and loading it from cold
    /// storage fails.
    pub async fn get_frame(&self, frame_id: u64) -> Result<Arc<FrameData>> {
        // L1: hot cache. `peek` is used because only a read lock is held here.
        {
            let cache = self.hot_cache.read().await;
            if let Some(data) = cache.peek(&frame_id) {
                self.stats.record_hit(CacheLevel::L1);
                return Ok(Arc::clone(data));
            }
        }

        // L2: warm storage
        // NOTE(review): `get_frame_by_id` and `FrameData::from_bytes` are not
        // defined in this document; they must be provided elsewhere.
        if let Some(data) = self.warm_storage.get_frame_by_id(frame_id) {
            self.stats.record_hit(CacheLevel::L2);
            let frame_data = Arc::new(FrameData::from_bytes(data));

            // Promote to L1
            self.promote_to_hot(frame_id, Arc::clone(&frame_data)).await;

            return Ok(frame_data);
        }

        // L3: cold storage
        let data = self.cold_storage.load_frame(frame_id).await?;
        self.stats.record_hit(CacheLevel::L3);

        // Promote to L1 and L2
        self.promote_to_hot(frame_id, Arc::clone(&data)).await;
        self.promote_to_warm(frame_id, &data).await;

        Ok(data)
    }

    /// Reacts to system memory pressure: above 80% used, data is evicted to
    /// cold storage; above 60%, the hot cache is trimmed.
    pub async fn handle_memory_pressure(&self) -> Result<()> {
        let memory_info = sys_info::mem_info()?;

        // Guard the percentage maths: a zero total would divide by zero and
        // `avail > total` would underflow the unsigned subtraction.
        if memory_info.total == 0 {
            return Ok(());
        }
        let used = memory_info.total.saturating_sub(memory_info.avail);
        let used_percent = used * 100 / memory_info.total;

        if used_percent > 80 {
            // High pressure: move data down to the next tier
            self.evict_to_cold().await?;
        } else if used_percent > 60 {
            // Moderate pressure: trim the hot cache
            self.trim_hot_cache().await?;
        }

        Ok(())
    }
}

// Counters for the tiered memory manager.
// Atomic fields let them be updated through the shared `Arc<MemoryStats>`.
#[derive(Debug, Default)]
struct MemoryStats {
    l1_hits: AtomicU64,
    l2_hits: AtomicU64,
    l3_hits: AtomicU64,
    total_requests: AtomicU64,
    bytes_stored: AtomicU64,
}

// Tier that served a lookup; passed to `MemoryStats::record_hit`.
enum CacheLevel {
    L1, // hot cache
    L2, // warm storage
    L3, // cold storage
}

方案5: 内存监控与调优

A. 实时内存监控

use prometheus::{Gauge, Histogram, Counter};

// Background monitor that exports memory figures as Prometheus metrics.
pub struct MemoryMonitor {
    // Prometheus metrics
    memory_usage: Gauge,
    allocation_rate: Counter,
    gc_pause_time: Histogram,
    frame_pool_usage: Gauge,

    // Handle of the monitoring task; `None` until `start` has been called
    monitor_handle: Option<JoinHandle<()>>,
}

impl MemoryMonitor {
    /// Starts the background task that refreshes the metrics once per second.
    ///
    /// Calling `start` again replaces the running task: the previous one is
    /// aborted first, so two tasks never update the same metrics (which would
    /// double-count the allocation counter).
    pub fn start(&mut self) -> Result<()> {
        if let Some(previous) = self.monitor_handle.take() {
            previous.abort();
        }

        let memory_usage = self.memory_usage.clone();
        let allocation_rate = self.allocation_rate.clone();

        let handle = tokio::spawn(async move {
            let mut interval = tokio::time::interval(Duration::from_secs(1));

            loop {
                interval.tick().await;

                // Refresh memory usage; a failed read simply skips this tick.
                if let Ok(info) = sys_info::mem_info() {
                    // `saturating_sub` guards against `avail > total`.
                    let used_mb = info.total.saturating_sub(info.avail) / 1024;
                    memory_usage.set(used_mb as f64);
                }

                // Track allocations
                // NOTE(review): if `bytes_allocated` is a cumulative total
                // rather than a per-interval delta, adding it every second
                // over-counts - confirm against the allocator's API.
                let allocator_stats = ALLOCATOR.stats();
                allocation_rate.inc_by(allocator_stats.bytes_allocated);
            }
        });

        self.monitor_handle = Some(handle);
        Ok(())
    }

    /// Builds a memory report from the current metric values.
    ///
    /// NOTE(review): `allocation_rate_mb_s` is the counter's total divided by
    /// 1e6, i.e. megabytes allocated so far, not a per-second rate.
    pub fn generate_report(&self) -> MemoryReport {
        MemoryReport {
            current_usage_mb: self.memory_usage.get() as usize,
            allocation_rate_mb_s: self.allocation_rate.get() / 1_000_000.0,
            frame_pool_efficiency: self.calculate_pool_efficiency(),
            recommendations: self.generate_recommendations(),
        }
    }
}

实施步骤

第一阶段:基础优化(1周)

  1. 实现Arc共享帧数据
  2. 优化事件总线避免数据拷贝
  3. 添加基础内存监控

第二阶段:池化管理(1周)

  1. 实现帧对象池
  2. 集成到Camera模块
  3. 添加池统计和调优

第三阶段:高级优化(2周)

  1. 实现内存映射环形缓冲
  2. 添加分层内存管理
  3. SIMD优化关键路径

第四阶段:监控与调优(1周)

  1. 完整的内存监控系统
  2. 自动内存压力管理
  3. 性能基准测试

预期效果

内存使用降低

  • 帧数据拷贝:降低 90%
  • 整体内存使用:降低 60%
  • 内存分配压力:降低 80%

性能提升

  • 帧处理延迟:降低 50%
  • CPU使用率:降低 30%
  • 吞吐量:提升 2-3倍

系统稳定性

  • 内存泄漏:完全避免
  • OOM风险:显著降低
  • 长期运行:稳定可靠

测试验证

#[cfg(test)]
mod memory_tests {
    use super::*;
    use std::hint::black_box;
    use std::sync::Arc;
    use std::time::Instant;

    // Compares deep-cloning a frame with sharing it through an Arc.
    //
    // The source buffer is allocated once, outside the timed loops. Previously
    // both loops also timed a fresh 900 KB allocation per iteration, which
    // dominated the Arc case and made the 10x assertion flaky.
    #[test]
    fn test_zero_copy_performance() {
        let frame_size = 640 * 480 * 3;
        let iterations = 1000;

        // Traditional approach: every consumer gets its own copy
        let data = vec![0u8; frame_size];
        let start = Instant::now();
        for _ in 0..iterations {
            // `black_box` keeps the optimizer from eliding the unused clones.
            let clone1 = black_box(&data).clone();
            let clone2 = black_box(&data).clone();
            black_box((clone1, clone2));
        }
        let traditional_time = start.elapsed();

        // Zero-copy approach: consumers share one allocation
        let shared = Arc::new(data);
        let start = Instant::now();
        for _ in 0..iterations {
            let ref1 = Arc::clone(black_box(&shared));
            let ref2 = Arc::clone(black_box(&shared));
            black_box((ref1, ref2));
        }
        let zero_copy_time = start.elapsed();

        println!("Traditional: {:?}, Zero-copy: {:?}",
                 traditional_time, zero_copy_time);
        assert!(zero_copy_time < traditional_time / 10);
    }

    // A buffer returned to the pool must be handed out again by the next `acquire`.
    #[test]
    fn test_frame_pool_efficiency() {
        let pool = FramePool::new(640, 480, FrameFormat::RGB888, 10);

        // `PooledFrame` has no `as_ptr` method; take the address of its slice instead.
        let frame1 = pool.acquire();
        let addr1 = frame1.as_slice().as_ptr();
        drop(frame1);

        let frame2 = pool.acquire();
        let addr2 = frame2.as_slice().as_ptr();

        // Same address means the allocation was reused
        assert_eq!(addr1, addr2);
    }
}

这个内存优化方案将显著提升边缘设备的性能和稳定性,特别适合资源受限的树莓派环境。