warpgate/src/supervisor.rs
grabbit 00df439bc5 Add cache operation logging and fix log viewer to show latest entries
- supervisor: emit structured events for dirty_count/errored_files
  changes, cache size changes >10%, transfer active/idle transitions,
  cache ≥80%/≥95% warnings, and 60s periodic stats snapshots
- supervisor: add parse_size_bytes() helper; structured BwLimit log
- warmup: add per-file debug/info/warn logging with 100-file milestones
  and rule-complete summary
- web/api: fix /api/logs initial load to return most recent entries
  (tail behaviour) instead of oldest entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-19 14:39:41 +08:00

1594 lines
55 KiB
Rust

//! `warpgate run` — single-process supervisor for all services.
//!
//! Manages rclone mount processes (one per share) + protocol services in one
//! process tree with coordinated startup and shutdown. Spawns a built-in web
//! server for status monitoring and config hot-reload.
use std::collections::HashMap;
use std::os::unix::process::CommandExt;
use std::path::PathBuf;
use std::process::{Child, Command};
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::mpsc::{self, RecvTimeoutError};
use std::sync::{Arc, RwLock};
use std::thread;
use std::time::{Duration, Instant, SystemTime};
use anyhow::{Context, Result};
use tracing::{error, info, warn};
use crate::config::Config;
use crate::config_diff::{self, ChangeTier};
use crate::daemon::{DaemonStatus, ShareHealth, SupervisorCmd, WarmupRuleState, WarmupRuleStatus};
use crate::scheduler::ScheduledTask;
use crate::rclone::mount::{build_mount_args, is_mounted};
use crate::rclone::rc;
use crate::services::{nfs, samba, webdav};
/// Mount ready timeout — how long to wait for a FUSE mount to appear.
const MOUNT_TIMEOUT: Duration = Duration::from_secs(30);
/// Supervision loop poll interval (also the command-channel recv timeout).
const POLL_INTERVAL: Duration = Duration::from_secs(2);
/// Grace period for SIGTERM before escalating to SIGKILL.
const SIGTERM_GRACE: Duration = Duration::from_secs(3);
/// Max restart attempts before giving up on a protocol service.
const MAX_RESTARTS: u32 = 3;
/// Reset restart counter after this period of stable running.
const RESTART_STABLE_PERIOD: Duration = Duration::from_secs(300);
/// Max time to wait for write-back queue to drain on shutdown.
const WRITEBACK_DRAIN_TIMEOUT: Duration = Duration::from_secs(300);
/// Poll interval when waiting for write-back drain.
const WRITEBACK_POLL_INTERVAL: Duration = Duration::from_secs(2);
/// Transfer speed below this value is considered idle (bytes/sec).
const SPEED_ACTIVE_THRESHOLD: f64 = 10_240.0; // 10 KiB/s
/// Interval for periodic cache stats snapshots.
const STATS_SNAPSHOT_INTERVAL: Duration = Duration::from_secs(60);
/// Cache usage WARN threshold (fraction of max_size).
const CACHE_WARN_THRESHOLD: f64 = 0.80;
/// Cache usage CRIT threshold (fraction of max_size).
const CACHE_CRITICAL_THRESHOLD: f64 = 0.95;
/// Per-share state from the previous poll cycle, used for change detection.
struct SharePrevState {
    /// Write-back queue depth last cycle (uploads in progress + queued).
    dirty_count: u64,
    /// Number of files whose upload had errored last cycle.
    errored_files: u64,
    /// On-disk VFS cache usage in bytes last cycle.
    cache_bytes: u64,
    /// Whether transfers were considered active last cycle
    /// (presumably speed vs `SPEED_ACTIVE_THRESHOLD` — consumer is
    /// `log_cache_events`, defined later in this file).
    is_active: bool,
    /// 0 = normal, 1 = ≥80%, 2 = ≥95%
    cache_warn_level: u8,
}
/// Tracks restart attempts for a supervised child process.
struct RestartTracker {
count: u32,
last_restart: Option<Instant>,
}
impl RestartTracker {
fn new() -> Self {
Self {
count: 0,
last_restart: None,
}
}
/// Returns true if another restart is allowed. Resets counter if the
/// service has been stable for `RESTART_STABLE_PERIOD`.
fn can_restart(&mut self) -> bool {
if let Some(last) = self.last_restart
&& last.elapsed() >= RESTART_STABLE_PERIOD
{
self.count = 0;
}
self.count < MAX_RESTARTS
}
fn record_restart(&mut self) {
self.count += 1;
self.last_restart = Some(Instant::now());
}
}
/// A named rclone mount child process for a single share.
struct MountChild {
    /// Share name; used to match against `ShareConfig::name` and status entries.
    name: String,
    /// The spawned `rclone mount` process.
    child: Child,
    /// Port of this mount's rclone remote-control (RC) API.
    rc_port: u16,
}
/// Child processes for protocol servers managed by the supervisor.
///
/// Implements `Drop` to kill any spawned children — prevents orphaned
/// processes if startup fails partway through `start_protocols()`.
struct ProtocolChildren {
    /// Samba `smbd` daemon, if SMB is enabled.
    smbd: Option<Child>,
    /// `rclone serve webdav` process, if WebDAV is enabled.
    webdav: Option<Child>,
}
impl Drop for ProtocolChildren {
    fn drop(&mut self) {
        // Flatten the two Options so only children that actually exist are
        // visited; graceful_kill sends SIGTERM, escalating to SIGKILL after
        // SIGTERM_GRACE (see constant above).
        for child in [&mut self.smbd, &mut self.webdav].into_iter().flatten() {
            graceful_kill(child);
        }
    }
}
/// Entry point — called from main.rs for `warpgate run`.
///
/// Phases: preflight → probe remotes → write protocol configs (healthy
/// shares only) → start mounts → start protocol services → background
/// warmup + dir-refresh → supervision loop → teardown. Teardown always
/// runs, regardless of how the supervision loop exits.
pub fn run(config: &Config, config_path: PathBuf) -> Result<()> {
    let shutdown = Arc::new(AtomicBool::new(false));
    // Install signal handler (SIGTERM + SIGINT)
    let shutdown_flag = Arc::clone(&shutdown);
    ctrlc::set_handler(move || {
        info!("Signal received, shutting down...");
        shutdown_flag.store(true, Ordering::SeqCst);
    })
    .context("Failed to set signal handler")?;
    // Set up shared state for web server integration
    let shared_config = Arc::new(RwLock::new(config.clone()));
    let share_names: Vec<String> = config.shares.iter().map(|s| s.name.clone()).collect();
    let shared_status = Arc::new(RwLock::new(DaemonStatus::new(&share_names)));
    let (cmd_tx, cmd_rx) = mpsc::channel::<SupervisorCmd>();
    // Create SSE broadcast channel (supervisor → web clients)
    let (sse_tx, _) = tokio::sync::broadcast::channel::<()>(16);
    // Spawn the web UI server in a background thread
    let _web_handle = crate::web::spawn_web_server(
        Arc::clone(&shared_config),
        Arc::clone(&shared_status),
        cmd_tx.clone(),
        config_path,
        sse_tx.clone(),
    );
    // Also wire shutdown signal to the command channel
    let shutdown_tx = cmd_tx;
    let shutdown_for_cmd = Arc::clone(&shutdown);
    thread::spawn(move || {
        // Poll the AtomicBool and forward to cmd channel when set
        loop {
            if shutdown_for_cmd.load(Ordering::SeqCst) {
                let _ = shutdown_tx.send(SupervisorCmd::Shutdown);
                return;
            }
            thread::sleep(Duration::from_millis(200));
        }
    });
    // Phase 1: Preflight — create dirs, write rclone.conf
    info!("Preflight checks...");
    preflight(config)?;
    // Phase 1.5: Probe remote paths in parallel
    info!("Probing remote paths...");
    let healthy_names = probe_all_shares(config, &shared_status, &shutdown)?;
    if healthy_names.is_empty() {
        anyhow::bail!("All shares failed probe — no healthy mounts to start");
    }
    // Build a config containing only healthy shares for protocol configs
    let mut healthy_config = config.clone();
    healthy_config
        .shares
        .retain(|s| healthy_names.contains(&s.name));
    // Phase 1.75: Generate protocol configs with only healthy shares
    write_protocol_configs(&healthy_config)?;
    // Phase 2: Start rclone mounts only for healthy shares
    info!("Starting rclone mounts...");
    let mut mount_children = start_and_wait_mounts(&healthy_config, &shutdown)?;
    for share in &healthy_config.shares {
        info!(" Mount ready at {}", share.mount_point.display());
    }
    // Update status: mounts are ready (match by name, not index)
    {
        let mut status = shared_status.write().unwrap();
        for mc in &mount_children {
            if let Some(ss) = status.shares.iter_mut().find(|s| s.name == mc.name) {
                ss.mounted = true;
                ss.rc_port = mc.rc_port;
            }
        }
    }
    // Phase 3: Start protocol services
    if shutdown.load(Ordering::SeqCst) {
        // Shutdown arrived while mounting: kill and reap the mounts we
        // already spawned, then exit cleanly without starting protocols.
        info!("Shutdown signal received during mount.");
        for mc in &mut mount_children {
            let _ = mc.child.kill();
            let _ = mc.child.wait();
        }
        return Ok(());
    }
    info!("Starting protocol services...");
    let mut protocols = start_protocols(&healthy_config)?;
    // Update status: protocols running
    {
        let mut status = shared_status.write().unwrap();
        status.smbd_running = protocols.smbd.is_some();
        status.webdav_running = protocols.webdav.is_some();
        status.nfs_exported = healthy_config.protocols.enable_nfs;
    }
    // Phase 3.5: Auto-warmup in background thread (non-blocking)
    spawn_warmup(config, &shared_status, &shutdown);
    // Phase 3.6: Dir-refresh background threads (non-blocking)
    spawn_dir_refresh(config, &shared_status, &shutdown);
    // Phase 4: Supervision loop with command channel
    info!("Supervision active. Web UI at http://localhost:8090. Press Ctrl+C to stop.");
    let result = supervise(
        &shared_config,
        &shared_status,
        &cmd_rx,
        &mut mount_children,
        &mut protocols,
        Arc::clone(&shutdown),
        &sse_tx,
    );
    // Phase 5: Teardown (always runs)
    info!("Shutting down...");
    // Re-read the config: a reload during supervision may have changed it.
    let config = shared_config.read().unwrap().clone();
    shutdown_services(&config, &mut mount_children, &mut protocols);
    result
}
/// Spawn a background warmup thread for all configured warmup rules.
///
/// Increments the warmup generation counter so any previous warmup thread
/// will detect the change and exit. Each rule is processed sequentially
/// with progress reported into `shared_status.warmup`.
fn spawn_warmup(
    config: &Config,
    shared_status: &Arc<RwLock<DaemonStatus>>,
    shutdown: &Arc<AtomicBool>,
) {
    if config.warmup.rules.is_empty() || !config.warmup.auto {
        // Clear stale warmup status when rules are removed or auto is disabled
        let mut status = shared_status.write().unwrap();
        status.warmup.clear();
        return;
    }
    // Pre-populate warmup status entries and bump generation
    let generation = {
        let mut status = shared_status.write().unwrap();
        status.warmup_generation += 1;
        status.warmup = config
            .warmup
            .rules
            .iter()
            .map(|rule| WarmupRuleStatus {
                share: rule.share.clone(),
                path: rule.path.clone(),
                newer_than: rule.newer_than.clone(),
                state: WarmupRuleState::Pending,
                total_files: 0,
                skipped: 0,
                cached: 0,
                errors: 0,
            })
            .collect();
        status.warmup_generation
    };
    let warmup_config = config.clone();
    let warmup_status = Arc::clone(shared_status);
    let warmup_shutdown = Arc::clone(shutdown);
    thread::spawn(move || {
        info!("Auto-warmup started (background, generation {generation})...");
        for (i, rule) in warmup_config.warmup.rules.iter().enumerate() {
            if warmup_shutdown.load(Ordering::SeqCst) {
                info!("Auto-warmup interrupted by shutdown.");
                break;
            }
            // Check if our generation is still current — a config reload may
            // have spawned a newer warmup thread that supersedes this one.
            {
                let status = warmup_status.read().unwrap();
                if status.warmup_generation != generation {
                    info!("Auto-warmup superseded by newer generation.");
                    return;
                }
            }
            // Rule failures are logged but do not abort the remaining rules.
            if let Err(e) = crate::cli::warmup::run_tracked(
                &warmup_config,
                &rule.share,
                &rule.path,
                rule.newer_than.as_deref(),
                &warmup_status,
                i,
                generation,
                &warmup_shutdown,
            ) {
                warn!("Warmup warning: {e}");
            }
        }
        info!("Auto-warmup complete.");
    });
}
/// Spawn per-share background threads that periodically call `vfs/refresh` to
/// keep the rclone directory listing cache warm.
///
/// Bumps the dir-refresh generation counter so any previous threads detect
/// that they've been superseded and exit cleanly. Each share whose effective
/// interval is non-zero gets its own `ScheduledTask` thread.
fn spawn_dir_refresh(
    config: &Config,
    shared_status: &Arc<RwLock<DaemonStatus>>,
    shutdown: &Arc<AtomicBool>,
) {
    // Quick check: skip entirely if no share will actually refresh.
    let any_active = config
        .shares
        .iter()
        .any(|s| config.effective_dir_refresh_interval(s).is_some());
    if !any_active {
        return;
    }
    // Bump generation and clone the shared Arc<AtomicU64> for threads.
    let gen_arc: Arc<AtomicU64> = {
        let mut s = shared_status.write().unwrap();
        s.dir_refresh_generation += 1;
        s.dir_refresh_running = true;
        let g = s.dir_refresh_generation;
        s.dir_refresh_gen_arc.store(g, Ordering::SeqCst);
        Arc::clone(&s.dir_refresh_gen_arc)
    };
    let generation = gen_arc.load(Ordering::SeqCst);
    for (i, share) in config.shares.iter().enumerate() {
        let interval = match config.effective_dir_refresh_interval(share) {
            Some(d) => d,
            None => continue,
        };
        // Clone everything the scheduled closure captures by move.
        let share_name = share.name.clone();
        let mount_point = share.mount_point.clone();
        let recursive = config.dir_refresh.recursive;
        let rc_port = config.rc_port(i);
        let status = Arc::clone(shared_status);
        let gen_arc2 = Arc::clone(&gen_arc);
        let sd = Arc::clone(shutdown);
        info!(
            " dir-refresh: scheduling '{}' every {}s",
            share_name,
            interval.as_secs()
        );
        ScheduledTask {
            name: "dir-refresh",
            interval,
        }
        .spawn(generation, gen_arc2, sd, move || {
            // Enumerate top-level subdirectories by reading the FUSE mount point.
            // The VFS root itself is not a valid vfs/refresh target in rclone.
            let dirs: Vec<String> = std::fs::read_dir(&mount_point)
                .with_context(|| format!("dir-refresh: failed to read mount point for '{share_name}'"))?
                .filter_map(|entry| {
                    let entry = entry.ok()?;
                    if entry.file_type().ok()?.is_dir() {
                        entry.file_name().into_string().ok()
                    } else {
                        None
                    }
                })
                .collect();
            if dirs.is_empty() {
                tracing::warn!(share = %share_name, "dir-refresh: no subdirs found, skipping");
                return Ok(());
            }
            // Refresh each subdir individually, tallying successes/failures.
            let mut ok = 0usize;
            let mut failed = 0usize;
            for dir in &dirs {
                match rc::vfs_refresh(rc_port, dir, recursive) {
                    Ok(()) => {
                        tracing::info!(share = %share_name, dir = %dir, "dir-refresh OK");
                        ok += 1;
                    }
                    Err(e) => {
                        tracing::warn!(share = %share_name, dir = %dir, error = %e, "dir-refresh failed");
                        failed += 1;
                    }
                }
            }
            tracing::info!(
                share = %share_name,
                dirs_ok = ok,
                dirs_failed = failed,
                "dir-refresh cycle complete"
            );
            // Record cycle results for the status API / web UI.
            let mut s = status.write().unwrap();
            s.last_dir_refresh.insert(share_name.clone(), SystemTime::now());
            s.dir_refresh_dirs_ok.insert(share_name.clone(), ok);
            s.dir_refresh_dirs_failed.insert(share_name.clone(), failed);
            Ok(())
        });
    }
}
/// Write rclone config and create directories (protocol configs generated after probe).
fn preflight(config: &Config) -> Result<()> {
// Ensure mount points exist for each share
for share in &config.shares {
std::fs::create_dir_all(&share.mount_point).with_context(|| {
format!(
"Failed to create mount point: {}",
share.mount_point.display()
)
})?;
}
// Ensure cache directory exists
std::fs::create_dir_all(&config.cache.dir).with_context(|| {
format!(
"Failed to create cache dir: {}",
config.cache.dir.display()
)
})?;
// Generate rclone config
crate::rclone::config::write_config(config)?;
Ok(())
}
/// Generate protocol configs (SMB/NFS) for the given config.
///
/// Called after probing so only healthy shares are included.
fn write_protocol_configs(config: &Config) -> Result<()> {
    // SMB: write smb.conf; provision the Samba user only when auth is on.
    match (config.protocols.enable_smb, config.smb_auth.enabled) {
        (true, true) => {
            samba::write_config(config)?;
            samba::setup_user(config)?;
        }
        (true, false) => samba::write_config(config)?,
        (false, _) => {}
    }
    // NFS: exports file only; the kernel server picks it up via exportfs.
    if config.protocols.enable_nfs {
        nfs::write_config(config)?;
    }
    Ok(())
}
/// Probe all shares in parallel and return the set of healthy share names.
///
/// Updates `shared_status` with probe results as they complete.
fn probe_all_shares(
    config: &Config,
    shared_status: &Arc<RwLock<DaemonStatus>>,
    shutdown: &AtomicBool,
) -> Result<Vec<String>> {
    use std::collections::HashSet;
    let shares: Vec<_> = config.shares.clone();
    let config_clone = config.clone();
    // Mark all shares as Probing
    {
        let mut status = shared_status.write().unwrap();
        for ss in &mut status.shares {
            ss.health = ShareHealth::Probing;
        }
    }
    // Spawn one thread per share; each thread gets its own config clone
    // because probe_remote_path needs the full Config.
    let handles: Vec<_> = shares
        .into_iter()
        .map(|share| {
            let cfg = config_clone.clone();
            let name = share.name.clone();
            thread::spawn(move || {
                let result = crate::rclone::probe::probe_remote_path(&cfg, &share);
                (name, result)
            })
        })
        .collect();
    // Collect results. Joins happen in spawn order, not completion order;
    // the shutdown check between joins bails early without waiting for the
    // remaining probes (their threads are simply detached).
    let mut healthy = HashSet::new();
    for handle in handles {
        if shutdown.load(Ordering::SeqCst) {
            anyhow::bail!("Interrupted during probe");
        }
        match handle.join() {
            Ok((name, Ok(()))) => {
                info!(" Probe OK: {name}");
                let mut status = shared_status.write().unwrap();
                if let Some(ss) = status.shares.iter_mut().find(|s| s.name == name) {
                    ss.health = ShareHealth::Healthy;
                }
                healthy.insert(name);
            }
            Ok((name, Err(e))) => {
                // Probe failure marks the share Failed but does not abort the
                // run — the caller simply excludes it from mounts/exports.
                let msg = format!("{e}");
                error!(" Probe FAILED: {name} — {msg}");
                let mut status = shared_status.write().unwrap();
                if let Some(ss) = status.shares.iter_mut().find(|s| s.name == name) {
                    ss.health = ShareHealth::Failed(msg);
                }
            }
            Err(_) => {
                error!(" Probe thread panicked");
            }
        }
    }
    Ok(healthy.into_iter().collect())
}
/// Spawn rclone mount processes for all shares and poll until each FUSE mount appears.
///
/// On any failure (shutdown, timeout, early rclone exit) all already-spawned
/// children are killed and reaped before bailing.
fn start_and_wait_mounts(config: &Config, shutdown: &AtomicBool) -> Result<Vec<MountChild>> {
    let mut children = Vec::new();
    // Spawn all mounts first, then wait for readiness collectively.
    for (i, share) in config.shares.iter().enumerate() {
        let rc_port = config.rc_port(i);
        let args = build_mount_args(config, share, rc_port);
        let child = Command::new("rclone")
            .args(&args)
            .process_group(0)
            .spawn()
            .with_context(|| format!("Failed to spawn rclone mount for share '{}'", share.name))?;
        children.push(MountChild {
            name: share.name.clone(),
            child,
            rc_port,
        });
    }
    // Poll for all mounts to become ready
    let deadline = Instant::now() + MOUNT_TIMEOUT;
    let mut ready = vec![false; config.shares.len()];
    loop {
        if shutdown.load(Ordering::SeqCst) {
            for mc in &mut children {
                let _ = mc.child.kill();
                let _ = mc.child.wait();
            }
            anyhow::bail!("Interrupted while waiting for mounts");
        }
        if Instant::now() > deadline {
            for mc in &mut children {
                let _ = mc.child.kill();
                let _ = mc.child.wait();
            }
            // Name the shares that never became ready for the error message.
            let pending: Vec<&str> = config.shares.iter()
                .zip(ready.iter())
                .filter(|(_, r)| !**r)
                .map(|(s, _)| s.name.as_str())
                .collect();
            anyhow::bail!(
                "Timed out waiting for mounts ({}s). Still pending: {}",
                MOUNT_TIMEOUT.as_secs(),
                pending.join(", ")
            );
        }
        // Check for early exits — a dead rclone will never produce a mount.
        for (i, mc) in children.iter_mut().enumerate() {
            if ready[i] {
                continue;
            }
            match mc.child.try_wait() {
                Ok(Some(status)) => {
                    anyhow::bail!(
                        "rclone mount for '{}' exited immediately ({status}). Check remote/auth config.",
                        mc.name
                    );
                }
                Ok(None) => {}
                Err(e) => {
                    anyhow::bail!("Failed to check rclone mount status for '{}': {e}", mc.name);
                }
            }
        }
        // Check mount readiness
        let mut all_ready = true;
        for (i, share) in config.shares.iter().enumerate() {
            if ready[i] {
                continue;
            }
            match is_mounted(&share.mount_point) {
                Ok(true) => ready[i] = true,
                Ok(false) => all_ready = false,
                Err(e) => {
                    warn!("Warning: mount check failed for '{}': {e}", share.name);
                    all_ready = false;
                }
            }
        }
        if all_ready {
            break;
        }
        thread::sleep(Duration::from_millis(500));
    }
    Ok(children)
}
/// Spawn smbd as a foreground child process.
fn spawn_smbd() -> Result<Child> {
Command::new("smbd")
.args(["--foreground", "--debug-stdout", "--no-process-group",
"--configfile", samba::SMB_CONF_PATH])
.process_group(0)
.spawn()
.context("Failed to spawn smbd")
}
/// Start protocol services after the mount is ready.
///
/// Children are stored in the `ProtocolChildren` struct *as they are
/// spawned*, so if a later step fails (e.g. `exportfs` or the WebDAV
/// spawn) the struct's `Drop` impl kills the already-started services
/// instead of leaking them. The original built the struct only at the end,
/// which orphaned smbd on a mid-startup failure despite `ProtocolChildren`
/// existing precisely to prevent that.
fn start_protocols(config: &Config) -> Result<ProtocolChildren> {
    let mut children = ProtocolChildren {
        smbd: None,
        webdav: None,
    };
    // SMB daemon (optional).
    if config.protocols.enable_smb {
        children.smbd = Some(spawn_smbd()?);
        info!(" SMB: started");
    }
    // NFS: kernel-space server — just (re)apply the exports file.
    if config.protocols.enable_nfs {
        let status = Command::new("exportfs")
            .arg("-ra")
            .status()
            .context("Failed to run exportfs -ra")?;
        if !status.success() {
            // Bailing here drops `children`, killing smbd if it was started.
            anyhow::bail!("exportfs -ra failed: {status}");
        }
        info!(" NFS: exported");
    }
    // WebDAV server (optional).
    if config.protocols.enable_webdav {
        children.webdav = Some(spawn_webdav(config)?);
        info!(" WebDAV: started");
    }
    Ok(children)
}
/// Spawn a `rclone serve webdav` child process.
fn spawn_webdav(config: &Config) -> Result<Child> {
let args = webdav::build_serve_args(config);
Command::new("rclone")
.args(&args)
.process_group(0)
.spawn()
.context("Failed to spawn rclone serve webdav")
}
/// Main supervision loop with command channel.
///
/// Uses `recv_timeout` on the command channel so it can both respond to
/// commands from the web UI and poll child processes every POLL_INTERVAL.
///
/// - If any rclone mount dies → full shutdown (data safety).
/// - If smbd/WebDAV dies → restart up to 3 times.
fn supervise(
    shared_config: &Arc<RwLock<Config>>,
    shared_status: &Arc<RwLock<DaemonStatus>>,
    cmd_rx: &mpsc::Receiver<SupervisorCmd>,
    mounts: &mut Vec<MountChild>,
    protocols: &mut ProtocolChildren,
    shutdown: Arc<AtomicBool>,
    sse_tx: &tokio::sync::broadcast::Sender<()>,
) -> Result<()> {
    let mut smbd_tracker = RestartTracker::new();
    let mut webdav_tracker = RestartTracker::new();
    // Previous-cycle per-share readings, for change detection in
    // log_cache_events (defined later in this file).
    let mut prev_states: HashMap<String, SharePrevState> = HashMap::new();
    let mut last_stats_snapshot = Instant::now();
    loop {
        // Check for commands (non-blocking with timeout = POLL_INTERVAL)
        match cmd_rx.recv_timeout(POLL_INTERVAL) {
            Ok(SupervisorCmd::Shutdown) => {
                info!("Shutdown command received.");
                return Ok(());
            }
            Ok(SupervisorCmd::BwLimit { up, down }) => {
                info!(bw_limit_up = %up, bw_limit_down = %down, "bandwidth limit applied");
                apply_bwlimit(mounts, &up, &down);
            }
            Ok(SupervisorCmd::Reload(new_config)) => {
                info!("Config reload requested...");
                handle_reload(
                    shared_config,
                    shared_status,
                    mounts,
                    protocols,
                    &mut smbd_tracker,
                    &mut webdav_tracker,
                    new_config,
                    &shutdown,
                )?;
                info!("Config reload complete.");
            }
            Err(RecvTimeoutError::Timeout) => {} // normal poll cycle
            Err(RecvTimeoutError::Disconnected) => {
                info!("Command channel disconnected, shutting down.");
                return Ok(());
            }
        }
        // Check for shutdown signal
        if shutdown.load(Ordering::SeqCst) {
            info!("Shutdown signal received.");
            return Ok(());
        }
        // Check all rclone mount processes — any mount death aborts the whole
        // supervisor (data safety: protocols must not serve a dead mount).
        for mc in mounts.iter_mut() {
            match mc.child.try_wait() {
                Ok(Some(status)) => {
                    anyhow::bail!(
                        "rclone mount for '{}' exited unexpectedly ({}). Initiating full shutdown for data safety.",
                        mc.name,
                        status
                    );
                }
                Ok(None) => {}
                Err(e) => {
                    anyhow::bail!("Failed to check rclone mount status for '{}': {e}", mc.name);
                }
            }
        }
        // Check smbd process (if enabled)
        if let Some(child) = &mut protocols.smbd {
            match child.try_wait() {
                Ok(Some(status)) => {
                    warn!("smbd exited ({status}).");
                    if smbd_tracker.can_restart() {
                        smbd_tracker.record_restart();
                        // Linear backoff: 2s, 4s, 6s.
                        // NOTE(review): this sleep blocks the whole loop,
                        // including command handling — confirm acceptable.
                        let delay = smbd_tracker.count * 2;
                        warn!(
                            "Restarting smbd in {delay}s ({}/{MAX_RESTARTS})...",
                            smbd_tracker.count,
                        );
                        thread::sleep(Duration::from_secs(delay.into()));
                        match spawn_smbd() {
                            Ok(new_child) => *child = new_child,
                            Err(e) => {
                                error!("Failed to restart smbd: {e}");
                                protocols.smbd = None;
                            }
                        }
                    } else {
                        error!(
                            "smbd exceeded max restarts ({MAX_RESTARTS}), giving up."
                        );
                        protocols.smbd = None;
                    }
                }
                Ok(None) => {}
                Err(e) => warn!("Warning: failed to check smbd status: {e}"),
            }
        }
        // Check WebDAV process (if enabled)
        let config = shared_config.read().unwrap().clone();
        if let Some(child) = &mut protocols.webdav {
            match child.try_wait() {
                Ok(Some(status)) => {
                    warn!("WebDAV exited ({status}).");
                    if webdav_tracker.can_restart() {
                        webdav_tracker.record_restart();
                        let delay = webdav_tracker.count * 2;
                        warn!(
                            "Restarting WebDAV in {delay}s ({}/{MAX_RESTARTS})...",
                            webdav_tracker.count,
                        );
                        thread::sleep(Duration::from_secs(delay.into()));
                        match spawn_webdav(&config) {
                            Ok(new_child) => *child = new_child,
                            Err(e) => {
                                error!("Failed to restart WebDAV: {e}");
                                protocols.webdav = None;
                            }
                        }
                    } else {
                        error!(
                            "WebDAV exceeded max restarts ({MAX_RESTARTS}), giving up."
                        );
                        protocols.webdav = None;
                    }
                }
                Ok(None) => {}
                Err(e) => warn!("Warning: failed to check WebDAV status: {e}"),
            }
        }
        // Update shared status with fresh RC stats
        update_status(shared_status, mounts, protocols, &config);
        // Log cache state changes and periodic snapshots
        log_cache_events(shared_status, &config, &mut prev_states, &mut last_stats_snapshot);
        // Notify SSE subscribers that status was refreshed
        let _ = sse_tx.send(());
    }
}
/// Poll RC API for each share and update the shared DaemonStatus.
///
/// Matches mounts to status entries by name (not index) so the mapping
/// stays correct after dynamic PerShare add/remove/modify reloads.
///
/// Uses a two-phase approach to avoid holding the write lock during HTTP IO:
/// Phase 1 collects all data without any lock; Phase 2 applies it under a
/// short-lived write lock (pure memory writes, no IO).
fn update_status(
    shared_status: &Arc<RwLock<DaemonStatus>>,
    mounts: &[MountChild],
    protocols: &ProtocolChildren,
    config: &Config,
) {
    // Plain data carrier between the two phases.
    struct ShareSnapshot {
        name: String,
        rc_port: u16,
        mounted: bool,
        cache_bytes: u64,
        dirty_count: u64,
        errored_files: u64,
        speed: f64,
        transfers: u64,
        errors: u64,
    }
    // Phase 1: collect all data WITHOUT holding any lock.
    let snapshots: Vec<ShareSnapshot> = mounts
        .iter()
        .map(|mc| {
            let mount_point = config
                .shares
                .iter()
                .find(|s| s.name == mc.name)
                .map(|s| s.mount_point.clone())
                .unwrap_or_default();
            let mounted = is_mounted(&mount_point).unwrap_or(false);
            // VFS stats: any RC/parse failure degrades to zeros rather than
            // propagating an error. dirty_count = in-progress + queued uploads.
            let (cache_bytes, dirty_count, errored_files) = rc::vfs_stats(mc.rc_port)
                .ok()
                .and_then(|v| v.disk_cache)
                .map(|dc| (dc.bytes_used, dc.uploads_in_progress + dc.uploads_queued, dc.errored_files))
                .unwrap_or((0, 0, 0));
            // Core stats: report speed only while transfers are active, so an
            // idle share shows 0.0 instead of a stale rolling average.
            let (speed, transfers, errors) = rc::core_stats(mc.rc_port)
                .map(|core| {
                    let active = core.transferring.len() as u64;
                    (if active > 0 { core.speed } else { 0.0 }, active, core.errors)
                })
                .unwrap_or((0.0, 0, 0));
            ShareSnapshot {
                name: mc.name.clone(),
                rc_port: mc.rc_port,
                mounted,
                cache_bytes,
                dirty_count,
                errored_files,
                speed,
                transfers,
                errors,
            }
        })
        .collect();
    // Phase 2: apply collected data under write lock — no IO here.
    let mut status = shared_status.write().unwrap();
    for snap in snapshots {
        if let Some(ss) = status.shares.iter_mut().find(|s| s.name == snap.name) {
            ss.mounted = snap.mounted;
            ss.rc_port = snap.rc_port;
            ss.cache_bytes = snap.cache_bytes;
            ss.dirty_count = snap.dirty_count;
            ss.errored_files = snap.errored_files;
            ss.speed = snap.speed;
            ss.transfers = snap.transfers;
            ss.errors = snap.errors;
        }
    }
    status.smbd_running = protocols.smbd.is_some();
    status.webdav_running = protocols.webdav.is_some();
    status.nfs_exported = config.protocols.enable_nfs;
}
/// Apply bandwidth limits to all rclone mounts via RC API (Tier A — no restart).
///
/// Each mount is updated independently; a failure on one share is logged
/// and does not prevent the others from being updated.
fn apply_bwlimit(mounts: &[MountChild], up: &str, down: &str) {
    for mc in mounts {
        let result = rc::bwlimit(mc.rc_port, Some(up), Some(down));
        if let Err(e) = result {
            warn!(" bwlimit failed for '{}': {e}", mc.name);
        } else {
            info!(" bwlimit applied to '{}'", mc.name);
        }
    }
}
/// Handle a config reload using the tiered change strategy.
///
/// Tiers (from config_diff): Live (RC API only) → Protocol (restart protocol
/// services) → PerShare (drain/remount affected shares) → Global (full
/// drain + restart of everything). After the tiered work, shared config and
/// status are rewritten, and warmup/dir-refresh are re-triggered if their
/// settings changed.
fn handle_reload(
    shared_config: &Arc<RwLock<Config>>,
    shared_status: &Arc<RwLock<DaemonStatus>>,
    mounts: &mut Vec<MountChild>,
    protocols: &mut ProtocolChildren,
    smbd_tracker: &mut RestartTracker,
    webdav_tracker: &mut RestartTracker,
    new_config: Config,
    shutdown: &Arc<AtomicBool>,
) -> Result<()> {
    let old_config = shared_config.read().unwrap().clone();
    let diff = config_diff::diff(&old_config, &new_config);
    if diff.is_empty() {
        info!(" No changes detected.");
        return Ok(());
    }
    info!(" Changes: {}", diff.summary());
    match diff.highest_tier() {
        ChangeTier::None => {}
        ChangeTier::Live => {
            // Tier A: bandwidth only — RC API call, no restart
            info!(" Tier A: applying bandwidth limits via RC API...");
            apply_bwlimit(mounts, &new_config.bandwidth.limit_up, &new_config.bandwidth.limit_down);
        }
        ChangeTier::Protocol => {
            // Tier B: protocol-only changes — regen configs, restart protocol services
            // Also apply bandwidth if changed
            if diff.bandwidth_changed {
                apply_bwlimit(mounts, &new_config.bandwidth.limit_up, &new_config.bandwidth.limit_down);
            }
            info!(" Tier B: restarting protocol services...");
            restart_protocols(protocols, smbd_tracker, webdav_tracker, &new_config)?;
        }
        ChangeTier::PerShare => {
            // Tier C: per-share changes — drain affected, unmount, remount
            if diff.bandwidth_changed {
                apply_bwlimit(mounts, &new_config.bandwidth.limit_up, &new_config.bandwidth.limit_down);
            }
            // Regenerate rclone.conf if connections changed
            if !diff.connections_added.is_empty()
                || !diff.connections_removed.is_empty()
                || !diff.connections_modified.is_empty()
            {
                info!(" Regenerating rclone.conf (connections changed)...");
                crate::rclone::config::write_config(&new_config)?;
            }
            // Handle removed shares: drain → unmount → kill
            for name in &diff.shares_removed {
                info!(" Removing share '{name}'...");
                if let Some(idx) = mounts.iter().position(|mc| mc.name == *name) {
                    let mc = &mounts[idx];
                    wait_writeback_drain(mc.rc_port);
                    unmount_share(&old_config, &mc.name);
                    let mut mc = mounts.remove(idx);
                    graceful_kill(&mut mc.child);
                }
            }
            // Handle modified shares: treat as remove + add
            for name in &diff.shares_modified {
                info!(" Restarting modified share '{name}'...");
                // Remove old
                if let Some(idx) = mounts.iter().position(|mc| mc.name == *name) {
                    let mc = &mounts[idx];
                    wait_writeback_drain(mc.rc_port);
                    unmount_share(&old_config, &mc.name);
                    let mut mc = mounts.remove(idx);
                    graceful_kill(&mut mc.child);
                }
                // Add new — spawn_mount failure is tolerated here (the share
                // simply ends up not mounted; status reflects it below).
                if let Some((i, share)) = new_config.shares.iter().enumerate().find(|(_, s)| s.name == *name) {
                    let rc_port = new_config.rc_port(i);
                    if let Ok(mc) = spawn_mount(&new_config, share, rc_port) {
                        mounts.push(mc);
                    }
                }
            }
            // Handle added shares: spawn new mount
            for name in &diff.shares_added {
                info!(" Adding share '{name}'...");
                if let Some((i, share)) = new_config.shares.iter().enumerate().find(|(_, s)| s.name == *name) {
                    let rc_port = new_config.rc_port(i);
                    std::fs::create_dir_all(&share.mount_point).ok();
                    if let Ok(mc) = spawn_mount(&new_config, share, rc_port) {
                        mounts.push(mc);
                    }
                }
            }
            // Update protocol configs to reflect share changes
            if diff.protocols_changed {
                // Protocol settings changed too — full restart needed
                restart_protocols(protocols, smbd_tracker, webdav_tracker, &new_config)?;
            } else if !diff.shares_removed.is_empty() || !diff.shares_added.is_empty() || !diff.shares_modified.is_empty() {
                // Only shares changed — live reload is sufficient
                reload_protocol_configs(protocols, &new_config)?;
            }
        }
        ChangeTier::Global => {
            // Tier D: global restart — drain all → stop everything → restart
            info!(" Tier D: full restart (global settings changed)...");
            // Drain all write-back queues
            for mc in mounts.iter() {
                wait_writeback_drain(mc.rc_port);
            }
            // Stop all protocol services
            stop_protocols(protocols, &old_config);
            // Unmount and kill all rclone instances
            for mc in mounts.iter_mut() {
                unmount_share(&old_config, &mc.name);
                graceful_kill(&mut mc.child);
            }
            mounts.clear();
            // Re-preflight with new config
            preflight(&new_config)?;
            // Re-probe all shares.
            // NOTE(review): a fresh AtomicBool means this probe/mount phase
            // ignores the outer shutdown signal — confirm intentional.
            let shutdown_flag = AtomicBool::new(false);
            let healthy_names =
                probe_all_shares(&new_config, shared_status, &shutdown_flag)?;
            // Build healthy-only config for mounts and protocols
            let mut healthy_config = new_config.clone();
            healthy_config
                .shares
                .retain(|s| healthy_names.contains(&s.name));
            write_protocol_configs(&healthy_config)?;
            // Re-start mounts (healthy only)
            let mut new_mounts = start_and_wait_mounts(&healthy_config, &shutdown_flag)?;
            mounts.append(&mut new_mounts);
            // Re-start protocols
            let new_protocols = start_protocols(&healthy_config)?;
            // Replace old protocol children (Drop will handle any leftover)
            *protocols = new_protocols;
            *smbd_tracker = RestartTracker::new();
            *webdav_tracker = RestartTracker::new();
        }
    }
    // Update shared config
    {
        let mut cfg = shared_config.write().unwrap();
        *cfg = new_config.clone();
    }
    // Update shared status with new share list
    {
        let mut status = shared_status.write().unwrap();
        let new_shares: Vec<crate::daemon::ShareStatus> = new_config
            .shares
            .iter()
            .enumerate()
            .map(|(i, s)| {
                // Preserve existing stats if share still exists
                let existing = status.shares.iter().find(|ss| ss.name == s.name);
                crate::daemon::ShareStatus {
                    name: s.name.clone(),
                    mounted: existing.map(|e| e.mounted).unwrap_or(false),
                    rc_port: new_config.rc_port(i),
                    cache_bytes: existing.map(|e| e.cache_bytes).unwrap_or(0),
                    dirty_count: existing.map(|e| e.dirty_count).unwrap_or(0),
                    errored_files: existing.map(|e| e.errored_files).unwrap_or(0),
                    speed: existing.map(|e| e.speed).unwrap_or(0.0),
                    transfers: existing.map(|e| e.transfers).unwrap_or(0),
                    errors: existing.map(|e| e.errors).unwrap_or(0),
                    health: existing
                        .map(|e| e.health.clone())
                        .unwrap_or_else(|| {
                            // New share: if mount succeeded, it's healthy
                            if mounts.iter().any(|mc| mc.name == s.name) {
                                ShareHealth::Healthy
                            } else {
                                ShareHealth::Pending
                            }
                        }),
                }
            })
            .collect();
        status.shares = new_shares;
        status.smbd_running = protocols.smbd.is_some();
        status.webdav_running = protocols.webdav.is_some();
        status.nfs_exported = new_config.protocols.enable_nfs;
    }
    // Re-trigger warmup if settings changed
    if diff.warmup_changed {
        info!(" Warmup settings changed, re-triggering...");
        spawn_warmup(&new_config, shared_status, shutdown);
    }
    // Re-trigger dir-refresh if settings changed
    if diff.dir_refresh_changed {
        info!(" Dir-refresh settings changed, re-triggering...");
        spawn_dir_refresh(&new_config, shared_status, shutdown);
    }
    Ok(())
}
/// Spawn a single rclone mount for a share and wait until it is ready.
///
/// Used by the per-share reload paths. Two fixes over the original:
/// detects an immediately-exiting rclone via `try_wait()` instead of
/// silently waiting out the full `MOUNT_TIMEOUT` (mirrors
/// `start_and_wait_mounts`), and kills + reaps the child on every failure
/// path so no unmanaged/zombie process is left behind.
fn spawn_mount(config: &Config, share: &crate::config::ShareConfig, rc_port: u16) -> Result<MountChild> {
    let args = build_mount_args(config, share, rc_port);
    let mut child = Command::new("rclone")
        .args(&args)
        .process_group(0)
        .spawn()
        .with_context(|| format!("Failed to spawn rclone mount for share '{}'", share.name))?;
    // Wait for mount to appear
    let deadline = Instant::now() + MOUNT_TIMEOUT;
    loop {
        // Early-exit check: a dead rclone will never produce a mount.
        match child.try_wait() {
            Ok(Some(status)) => {
                anyhow::bail!(
                    "rclone mount for '{}' exited immediately ({status}). Check remote/auth config.",
                    share.name
                );
            }
            Ok(None) => {}
            Err(e) => {
                let _ = child.kill();
                let _ = child.wait();
                anyhow::bail!("Failed to check rclone mount status for '{}': {e}", share.name);
            }
        }
        if Instant::now() > deadline {
            // Reap the never-ready child before bailing.
            let _ = child.kill();
            let _ = child.wait();
            anyhow::bail!("Timed out waiting for mount '{}'", share.name);
        }
        match is_mounted(&share.mount_point) {
            Ok(true) => break,
            _ => thread::sleep(Duration::from_millis(500)),
        }
    }
    info!(" Mount ready: {} at {}", share.name, share.mount_point.display());
    Ok(MountChild {
        name: share.name.clone(),
        child,
        rc_port,
    })
}
/// Unmount a single share's FUSE mount.
///
/// Looks the share up by name; does nothing if the share is unknown or the
/// mount point is not currently mounted. Performs a lazy unmount via
/// `fusermount3 -uz`, falling back to legacy `fusermount` when that fails.
/// All failures are swallowed — teardown is best-effort.
fn unmount_share(config: &Config, share_name: &str) {
    let Some(share) = config.find_share(share_name) else {
        return;
    };
    if !is_mounted(&share.mount_point).unwrap_or(false) {
        return;
    }
    let target = share.mount_point.display().to_string();
    let ok = Command::new("fusermount3")
        .args(["-uz", &target])
        .status()
        .map(|st| st.success())
        .unwrap_or(false);
    if !ok {
        // Older systems ship the FUSE 2 binary under the unsuffixed name.
        let _ = Command::new("fusermount").args(["-uz", &target]).status();
    }
}
/// Stop protocol services only (without touching mounts).
///
/// Kills smbd and the WebDAV server (if running) and unexports all NFS
/// exports when NFS is enabled. Child handles are taken out of `protocols`
/// so the struct never retains a dead process.
fn stop_protocols(protocols: &mut ProtocolChildren, config: &Config) {
    if let Some(mut smbd) = protocols.smbd.take() {
        graceful_kill(&mut smbd);
        info!(" SMB: stopped");
    }
    if config.protocols.enable_nfs {
        // `exportfs -ua` unexports everything; best-effort.
        let _ = Command::new("exportfs").arg("-ua").status();
        info!(" NFS: unexported");
    }
    if let Some(mut dav) = protocols.webdav.take() {
        graceful_kill(&mut dav);
        info!(" WebDAV: stopped");
    }
}
/// Reload protocol configs without full restart (share add/remove/modify).
///
/// Regenerates smb.conf / the NFS exports file, then tells the running
/// services to pick up the new contents:
/// - smbd: SIGHUP triggers an smb.conf reload (added shares become visible,
///   removed shares disappear for new connections).
/// - NFS: `exportfs -ra` re-reads the exports file.
/// - WebDAV: nothing to do — it serves straight from the FUSE mount.
fn reload_protocol_configs(protocols: &ProtocolChildren, config: &Config) -> Result<()> {
    if config.protocols.enable_smb {
        samba::write_config(config)?;
        if let Some(smbd) = protocols.smbd.as_ref() {
            // SAFETY: sending SIGHUP to a known child PID is safe.
            unsafe { libc::kill(smbd.id() as i32, libc::SIGHUP) };
            info!(" SMB: config reloaded (SIGHUP)");
        }
    }
    if config.protocols.enable_nfs {
        nfs::write_config(config)?;
        let _ = Command::new("exportfs").arg("-ra").status();
        info!(" NFS: re-exported");
    }
    Ok(())
}
/// Restart protocol services (Tier B). Regen configs and restart smbd/NFS/WebDAV.
///
/// Tears down the running services, rewrites their configuration files, and
/// launches fresh children. Both restart trackers are reset because the new
/// children start with a clean crash-loop history.
fn restart_protocols(
    protocols: &mut ProtocolChildren,
    smbd_tracker: &mut RestartTracker,
    webdav_tracker: &mut RestartTracker,
    config: &Config,
) -> Result<()> {
    // Tear down whatever is currently running.
    stop_protocols(protocols, config);
    // Write fresh service configs before bringing anything back up.
    if config.protocols.enable_smb {
        samba::write_config(config)?;
        if config.smb_auth.enabled {
            samba::setup_user(config)?;
        }
    }
    if config.protocols.enable_nfs {
        nfs::write_config(config)?;
    }
    // Relaunch and reset crash-loop accounting for the new children.
    *protocols = start_protocols(config)?;
    *smbd_tracker = RestartTracker::new();
    *webdav_tracker = RestartTracker::new();
    Ok(())
}
/// Send SIGTERM to the entire process group, wait up to `SIGTERM_GRACE`,
/// then SIGKILL if still alive.
///
/// All children are spawned with `.process_group(0)` so the child PID equals
/// the process group ID. Using `-pid` ensures forked workers (e.g. smbd
/// per-client forks) are also terminated — otherwise orphaned workers hold
/// the listening socket and prevent the new process from binding.
fn graceful_kill(child: &mut Child) {
    let pgid = child.id() as i32;
    // SAFETY: signalling a process group we created ourselves is safe.
    unsafe { libc::kill(-pgid, libc::SIGTERM) };
    let give_up_at = Instant::now() + SIGTERM_GRACE;
    loop {
        match child.try_wait() {
            // Reaped within the grace period — nothing more to do.
            Ok(Some(_)) => return,
            // Still running; keep polling until the deadline.
            Ok(None) => {}
            // try_wait failed; fall through to the hard kill.
            Err(_) => break,
        }
        if Instant::now() > give_up_at {
            break;
        }
        thread::sleep(Duration::from_millis(100));
    }
    // Grace period expired (or wait errored): SIGKILL the whole group
    // and reap the direct child.
    // SAFETY: same process-group argument as above.
    unsafe { libc::kill(-pgid, libc::SIGKILL) };
    let _ = child.wait();
}
/// Wait for rclone VFS write-back queue to drain on a specific RC port.
///
/// Polls `vfs/stats` until no uploads are in progress or queued, logging
/// progress along the way. Gives up (with a warning) after
/// `WRITEBACK_DRAIN_TIMEOUT`. Any RC error or absent disk-cache section ends
/// the wait silently — draining is best-effort.
fn wait_writeback_drain(port: u16) {
    let give_up_at = Instant::now() + WRITEBACK_DRAIN_TIMEOUT;
    // Tracks whether we already announced that we are waiting, so the
    // "drained" message only appears when something was actually pending.
    let mut announced = false;
    loop {
        // RC unreachable (mount already gone, port closed): stop waiting.
        let Ok(vfs) = rc::vfs_stats(port) else { return };
        // No disk cache section means nothing can be pending.
        let Some(dc) = vfs.disk_cache.as_ref() else { return };
        let pending = dc.uploads_in_progress + dc.uploads_queued;
        if pending == 0 {
            if announced {
                info!(" Write-back queue drained.");
            }
            return;
        }
        if announced {
            info!(" Write-back: {pending} files remaining...");
        } else {
            info!(
                " Waiting for write-back queue ({pending} files pending)..."
            );
            announced = true;
        }
        if Instant::now() > give_up_at {
            warn!(
                " Warning: write-back drain timed out after {}s, proceeding.",
                WRITEBACK_DRAIN_TIMEOUT.as_secs()
            );
            return;
        }
        thread::sleep(WRITEBACK_POLL_INTERVAL);
    }
}
/// Reverse-order teardown of all services.
fn shutdown_services(config: &Config, mounts: &mut Vec<MountChild>, protocols: &mut ProtocolChildren) {
// Stop protocol services
stop_protocols(protocols, config);
// Wait for write-back queues to drain on each mount
for mc in mounts.iter() {
wait_writeback_drain(mc.rc_port);
}
// Lazy unmount each share's FUSE mount
for share in &config.shares {
if is_mounted(&share.mount_point).unwrap_or(false) {
let mp = share.mount_point.display().to_string();
let unmounted = Command::new("fusermount3")
.args(["-uz", &mp])
.status()
.map(|s| s.success())
.unwrap_or(false);
if !unmounted {
let _ = Command::new("fusermount")
.args(["-uz", &mp])
.status();
}
}
}
info!(" FUSE: unmounted");
// Gracefully stop all rclone mount processes
for mc in mounts.iter_mut() {
graceful_kill(&mut mc.child);
}
info!(" rclone: stopped");
}
/// Detect cache state changes and emit structured log events; also emit
/// periodic stats snapshots. Called after every `update_status()` cycle.
///
/// Events emitted per mounted share:
/// - `dirty_count` changes (info)
/// - `errored_files` increases (warn) / decreases (info)
/// - cache size swings of more than 10%, plus first population (info)
/// - transfer active <-> idle transitions, gated on `SPEED_ACTIVE_THRESHOLD`
/// - cache fill crossing `CACHE_WARN_THRESHOLD` / `CACHE_CRITICAL_THRESHOLD`
///   (warn, also re-emitted on each snapshot while still over threshold)
/// - a full "stats snapshot" (info) every `STATS_SNAPSHOT_INTERVAL`
///
/// `prev_states` carries the last observed values per share between calls so
/// transitions can be detected; a share seen for the first time establishes a
/// baseline and emits no change events on that cycle. Entries for shares that
/// no longer exist (removed via config reload) are pruned.
fn log_cache_events(
    shared_status: &Arc<RwLock<DaemonStatus>>,
    config: &Config,
    prev_states: &mut HashMap<String, SharePrevState>,
    last_snapshot: &mut Instant,
) {
    let status = shared_status.read().unwrap();
    // Treat a zero (or unparseable) configured max as "no limit" — avoids a
    // divide-by-zero fraction (inf) that would falsely flag the cache as
    // critically full.
    let max_bytes = parse_size_bytes(&config.cache.max_size).filter(|&m| m > 0);
    let emit_snapshot = last_snapshot.elapsed() >= STATS_SNAPSHOT_INTERVAL;
    // Drop state for shares that were removed by a config reload so the map
    // cannot grow without bound over the daemon's lifetime.
    prev_states.retain(|name, _| status.shares.iter().any(|ss| &ss.name == name));
    for ss in &status.shares {
        if !ss.mounted {
            continue;
        }
        let now_active = ss.speed > SPEED_ACTIVE_THRESHOLD;
        // 0 = ok, 1 = >=80% full, 2 = >=95% full (only meaningful with a max).
        let now_warn_level = if let Some(max) = max_bytes {
            let frac = ss.cache_bytes as f64 / max as f64;
            if frac >= CACHE_CRITICAL_THRESHOLD {
                2
            } else if frac >= CACHE_WARN_THRESHOLD {
                1
            } else {
                0
            }
        } else {
            0
        };
        // First sighting seeds the baseline with current values, so the
        // comparisons below are no-ops on that cycle.
        let prev = prev_states.entry(ss.name.clone()).or_insert(SharePrevState {
            dirty_count: ss.dirty_count,
            errored_files: ss.errored_files,
            cache_bytes: ss.cache_bytes,
            is_active: now_active,
            cache_warn_level: now_warn_level,
        });
        // dirty_count change
        if ss.dirty_count != prev.dirty_count {
            info!(share = %ss.name, dirty_count = ss.dirty_count,
                  prev = prev.dirty_count, "cache dirty_count changed");
        }
        // errored_files change: increases are warnings, decreases are
        // informational ("cleared").
        if ss.errored_files != prev.errored_files {
            if ss.errored_files > prev.errored_files {
                warn!(share = %ss.name, errored_files = ss.errored_files,
                      prev = prev.errored_files, "cache errored_files increased");
            } else {
                info!(share = %ss.name, errored_files = ss.errored_files,
                      prev = prev.errored_files, "cache errored_files cleared");
            }
        }
        // cache_bytes: first population, then only swings >10% (relative to
        // the previous observation) to keep the log quiet.
        if prev.cache_bytes == 0 && ss.cache_bytes > 0 {
            info!(share = %ss.name, cache_bytes = ss.cache_bytes, "cache population started");
        } else if prev.cache_bytes > 0 {
            let delta = (ss.cache_bytes as f64 - prev.cache_bytes as f64).abs()
                / prev.cache_bytes as f64;
            if delta > 0.10 {
                info!(share = %ss.name, cache_bytes = ss.cache_bytes,
                      prev = prev.cache_bytes, "cache size changed >10%");
            }
        }
        // transfer idle/active transition
        if now_active != prev.is_active {
            if now_active {
                info!(share = %ss.name, speed_bps = ss.speed as u64, "transfer became active");
            } else {
                info!(share = %ss.name, "transfer became idle");
            }
        }
        // cache warn level change or periodic re-emission while over threshold
        if now_warn_level != prev.cache_warn_level || emit_snapshot {
            if now_warn_level == 2 {
                if let Some(max) = max_bytes {
                    warn!(share = %ss.name, cache_bytes = ss.cache_bytes, cache_max = max,
                          "cache critically full (>=95%)");
                }
            } else if now_warn_level == 1 {
                if let Some(max) = max_bytes {
                    warn!(share = %ss.name, cache_bytes = ss.cache_bytes, cache_max = max,
                          "cache nearly full (>=80%)");
                }
            }
        }
        // periodic stats snapshot
        if emit_snapshot {
            info!(share = %ss.name,
                  cache_bytes = ss.cache_bytes, dirty_count = ss.dirty_count,
                  errored_files = ss.errored_files, speed_bps = ss.speed as u64,
                  transfers = ss.transfers, errors = ss.errors,
                  "stats snapshot");
        }
        // Roll the baseline forward for the next cycle.
        prev.dirty_count = ss.dirty_count;
        prev.errored_files = ss.errored_files;
        prev.cache_bytes = ss.cache_bytes;
        prev.is_active = now_active;
        prev.cache_warn_level = now_warn_level;
    }
    if emit_snapshot {
        *last_snapshot = Instant::now();
    }
}
/// Parse a human-readable size string (e.g. "200G", "1.5T", "512M") into bytes.
///
/// Accepts SI-style suffixes with or without a trailing "B" ("G", "GB") and
/// IEC forms ("Gi", "GiB"), case-insensitively; all multipliers are powers
/// of 1024, matching rclone's size notation. A bare number is taken as bytes.
///
/// Returns `None` for unrecognized suffixes and for negative or non-finite
/// numeric parts (previously "-5G" silently saturated to `Some(0)`).
fn parse_size_bytes(s: &str) -> Option<u64> {
    let s = s.trim();
    // Split at the first alphabetic character: numeric part | unit suffix.
    let (num_part, suffix) = s
        .find(|c: char| c.is_alphabetic())
        .map(|i| s.split_at(i))
        .unwrap_or((s, ""));
    let n: f64 = num_part.trim().parse().ok()?;
    if !n.is_finite() || n < 0.0 {
        return None;
    }
    let mult: f64 = match suffix.trim().to_uppercase().as_str() {
        "" | "B" => 1.0,
        "K" | "KB" | "KI" | "KIB" => 1024.0,
        "M" | "MB" | "MI" | "MIB" => 1024.0_f64.powi(2),
        "G" | "GB" | "GI" | "GIB" => 1024.0_f64.powi(3),
        "T" | "TB" | "TI" | "TIB" => 1024.0_f64.powi(4),
        "P" | "PB" | "PI" | "PIB" => 1024.0_f64.powi(5),
        _ => return None,
    };
    Some((n * mult) as u64)
}
#[cfg(test)]
mod tests {
    use super::*;
    /// A fresh tracker starts with no restarts recorded.
    #[test]
    fn test_restart_tracker_new() {
        let tracker = RestartTracker::new();
        assert_eq!(tracker.count, 0);
        assert!(tracker.last_restart.is_none());
    }
    /// Recording a restart bumps the count and stamps the time.
    #[test]
    fn test_restart_tracker_record_restart() {
        let mut tracker = RestartTracker::new();
        tracker.record_restart();
        assert_eq!(tracker.count, 1);
        assert!(tracker.last_restart.is_some());
    }
    /// Restarts are allowed while count is below MAX_RESTARTS.
    #[test]
    fn test_restart_tracker_can_restart_under_max() {
        let mut tracker = RestartTracker::new();
        assert!(tracker.can_restart());
        tracker.record_restart();
        assert!(tracker.can_restart()); // count = 1
        tracker.record_restart();
        assert!(tracker.can_restart()); // count = 2
    }
    /// Restarts are refused once the count reaches MAX_RESTARTS.
    #[test]
    fn test_restart_tracker_cannot_restart_at_max() {
        let mut tracker = RestartTracker::new();
        for _ in 0..MAX_RESTARTS {
            tracker.record_restart();
        }
        assert!(!tracker.can_restart()); // count = 3 = MAX_RESTARTS
    }
    /// The linear backoff formula (count * 2 seconds) grows per restart.
    #[test]
    fn test_restart_tracker_backoff_delay() {
        let mut tracker = RestartTracker::new();
        tracker.record_restart();
        assert_eq!(tracker.count * 2, 2); // 2s delay
        tracker.record_restart();
        assert_eq!(tracker.count * 2, 4); // 4s delay
        tracker.record_restart();
        assert_eq!(tracker.count * 2, 6); // 6s delay
    }
    /// Repeated restarts accumulate and eventually exhaust the budget.
    #[test]
    fn test_restart_tracker_multiple_record() {
        let mut tracker = RestartTracker::new();
        tracker.record_restart();
        tracker.record_restart();
        tracker.record_restart();
        assert_eq!(tracker.count, 3);
        assert!(!tracker.can_restart());
    }
    /// Pin the supervisor timing constants so accidental edits are caught.
    #[test]
    fn test_constants() {
        assert_eq!(MOUNT_TIMEOUT, Duration::from_secs(30));
        assert_eq!(POLL_INTERVAL, Duration::from_secs(2));
        assert_eq!(SIGTERM_GRACE, Duration::from_secs(3));
        assert_eq!(MAX_RESTARTS, 3);
        assert_eq!(RESTART_STABLE_PERIOD, Duration::from_secs(300));
        assert_eq!(WRITEBACK_DRAIN_TIMEOUT, Duration::from_secs(300));
        assert_eq!(WRITEBACK_POLL_INTERVAL, Duration::from_secs(2));
    }
    /// Happy-path suffix parsing.
    #[test]
    fn test_parse_size_bytes() {
        assert_eq!(parse_size_bytes("200G"), Some(200 * 1024 * 1024 * 1024));
        assert_eq!(parse_size_bytes("1T"), Some(1024 * 1024 * 1024 * 1024));
        assert_eq!(parse_size_bytes("512M"), Some(512 * 1024 * 1024));
        assert_eq!(parse_size_bytes("1024K"), Some(1024 * 1024));
        assert_eq!(parse_size_bytes("1024"), Some(1024));
        assert_eq!(parse_size_bytes("200GB"), Some(200 * 1024 * 1024 * 1024));
        assert_eq!(parse_size_bytes("bogus"), None);
    }
    /// Fractional, lowercase, padded, and empty inputs.
    #[test]
    fn test_parse_size_bytes_edge_cases() {
        assert_eq!(parse_size_bytes("1.5G"), Some(1_610_612_736));
        assert_eq!(parse_size_bytes("200g"), Some(200 * 1024 * 1024 * 1024));
        assert_eq!(parse_size_bytes("  512M  "), Some(512 * 1024 * 1024));
        assert_eq!(parse_size_bytes(""), None);
    }
}