//! `warpgate run` — single-process supervisor for all services. //! //! Manages rclone mount processes (one per share) + protocol services in one //! process tree with coordinated startup and shutdown. Spawns a built-in web //! server for status monitoring and config hot-reload. use std::collections::HashMap; use std::os::unix::process::CommandExt; use std::path::PathBuf; use std::process::{Child, Command}; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::mpsc::{self, RecvTimeoutError}; use std::sync::{Arc, RwLock}; use std::thread; use std::time::{Duration, Instant, SystemTime}; use anyhow::{Context, Result}; use tracing::{error, info, warn}; use crate::config::Config; use crate::config_diff::{self, ChangeTier}; use crate::daemon::{DaemonStatus, ShareHealth, SupervisorCmd, WarmupRuleState, WarmupRuleStatus}; use crate::scheduler::ScheduledTask; use crate::rclone::mount::{build_mount_args, is_mounted}; use crate::rclone::rc; use crate::services::{nfs, samba, webdav}; /// Mount ready timeout. const MOUNT_TIMEOUT: Duration = Duration::from_secs(30); /// Supervision loop poll interval. const POLL_INTERVAL: Duration = Duration::from_secs(2); /// Grace period for SIGTERM before escalating to SIGKILL. const SIGTERM_GRACE: Duration = Duration::from_secs(3); /// Max restart attempts before giving up on a protocol service. const MAX_RESTARTS: u32 = 3; /// Reset restart counter after this period of stable running. const RESTART_STABLE_PERIOD: Duration = Duration::from_secs(300); /// Max time to wait for write-back queue to drain on shutdown. const WRITEBACK_DRAIN_TIMEOUT: Duration = Duration::from_secs(300); /// Poll interval when waiting for write-back drain. const WRITEBACK_POLL_INTERVAL: Duration = Duration::from_secs(2); /// Transfer speed below this value is considered idle (bytes/sec). const SPEED_ACTIVE_THRESHOLD: f64 = 10_240.0; // 10 KiB/s /// Interval for periodic cache stats snapshots. 
const STATS_SNAPSHOT_INTERVAL: Duration = Duration::from_secs(60); /// Cache usage WARN threshold (fraction of max_size). const CACHE_WARN_THRESHOLD: f64 = 0.80; /// Cache usage CRIT threshold. const CACHE_CRITICAL_THRESHOLD: f64 = 0.95; /// Per-share state from the previous poll cycle, used for change detection. struct SharePrevState { dirty_count: u64, errored_files: u64, cache_bytes: u64, is_active: bool, /// 0 = normal, 1 = ≥80%, 2 = ≥95% cache_warn_level: u8, } /// Tracks restart attempts for a supervised child process. struct RestartTracker { count: u32, last_restart: Option, } impl RestartTracker { fn new() -> Self { Self { count: 0, last_restart: None, } } /// Returns true if another restart is allowed. Resets counter if the /// service has been stable for `RESTART_STABLE_PERIOD`. fn can_restart(&mut self) -> bool { if let Some(last) = self.last_restart && last.elapsed() >= RESTART_STABLE_PERIOD { self.count = 0; } self.count < MAX_RESTARTS } fn record_restart(&mut self) { self.count += 1; self.last_restart = Some(Instant::now()); } } /// A named rclone mount child process for a single share. struct MountChild { name: String, child: Child, rc_port: u16, } /// Child processes for protocol servers managed by the supervisor. /// /// Implements `Drop` to kill any spawned children — prevents orphaned /// processes if startup fails partway through `start_protocols()`. struct ProtocolChildren { smbd: Option, webdav: Option, } impl Drop for ProtocolChildren { fn drop(&mut self) { for child in [&mut self.smbd, &mut self.webdav].into_iter().flatten() { graceful_kill(child); } } } /// Entry point — called from main.rs for `warpgate run`. 
pub fn run(config: &Config, config_path: PathBuf) -> Result<()> { let shutdown = Arc::new(AtomicBool::new(false)); // Install signal handler (SIGTERM + SIGINT) let shutdown_flag = Arc::clone(&shutdown); ctrlc::set_handler(move || { info!("Signal received, shutting down..."); shutdown_flag.store(true, Ordering::SeqCst); }) .context("Failed to set signal handler")?; // Set up shared state for web server integration let shared_config = Arc::new(RwLock::new(config.clone())); let share_names: Vec = config.shares.iter().map(|s| s.name.clone()).collect(); let shared_status = Arc::new(RwLock::new(DaemonStatus::new(&share_names))); let (cmd_tx, cmd_rx) = mpsc::channel::(); // Create SSE broadcast channel (supervisor → web clients) let (sse_tx, _) = tokio::sync::broadcast::channel::<()>(16); // Spawn the web UI server in a background thread let _web_handle = crate::web::spawn_web_server( Arc::clone(&shared_config), Arc::clone(&shared_status), cmd_tx.clone(), config_path, sse_tx.clone(), ); // Also wire shutdown signal to the command channel let shutdown_tx = cmd_tx; let shutdown_for_cmd = Arc::clone(&shutdown); thread::spawn(move || { // Poll the AtomicBool and forward to cmd channel when set loop { if shutdown_for_cmd.load(Ordering::SeqCst) { let _ = shutdown_tx.send(SupervisorCmd::Shutdown); return; } thread::sleep(Duration::from_millis(200)); } }); // Phase 1: Preflight — create dirs, write rclone.conf info!("Preflight checks..."); preflight(config)?; // Phase 1.5: Probe remote paths in parallel info!("Probing remote paths..."); let healthy_names = probe_all_shares(config, &shared_status, &shutdown)?; if healthy_names.is_empty() { anyhow::bail!("All shares failed probe — no healthy mounts to start"); } // Build a config containing only healthy shares for protocol configs let mut healthy_config = config.clone(); healthy_config .shares .retain(|s| healthy_names.contains(&s.name)); // Phase 1.75: Generate protocol configs with only healthy shares 
write_protocol_configs(&healthy_config)?; // Phase 2: Start rclone mounts only for healthy shares info!("Starting rclone mounts..."); let mut mount_children = start_and_wait_mounts(&healthy_config, &shutdown)?; for share in &healthy_config.shares { info!(" Mount ready at {}", share.mount_point.display()); } // Update status: mounts are ready (match by name, not index) { let mut status = shared_status.write().unwrap(); for mc in &mount_children { if let Some(ss) = status.shares.iter_mut().find(|s| s.name == mc.name) { ss.mounted = true; ss.rc_port = mc.rc_port; } } } // Phase 3: Start protocol services if shutdown.load(Ordering::SeqCst) { info!("Shutdown signal received during mount."); for mc in &mut mount_children { let _ = mc.child.kill(); let _ = mc.child.wait(); } return Ok(()); } info!("Starting protocol services..."); let mut protocols = start_protocols(&healthy_config)?; // Update status: protocols running { let mut status = shared_status.write().unwrap(); status.smbd_running = protocols.smbd.is_some(); status.webdav_running = protocols.webdav.is_some(); status.nfs_exported = healthy_config.protocols.enable_nfs; } // Phase 3.5: Auto-warmup in background thread (non-blocking) spawn_warmup(config, &shared_status, &shutdown); // Phase 3.6: Dir-refresh background threads (non-blocking) spawn_dir_refresh(config, &shared_status, &shutdown); // Phase 4: Supervision loop with command channel info!("Supervision active. Web UI at http://localhost:8090. Press Ctrl+C to stop."); let result = supervise( &shared_config, &shared_status, &cmd_rx, &mut mount_children, &mut protocols, Arc::clone(&shutdown), &sse_tx, ); // Phase 5: Teardown (always runs) info!("Shutting down..."); let config = shared_config.read().unwrap().clone(); shutdown_services(&config, &mut mount_children, &mut protocols); result } /// Spawn a background warmup thread for all configured warmup rules. 
///
/// Increments the warmup generation counter so any previous warmup thread
/// will detect the change and exit. Each rule is processed sequentially
/// with progress reported into `shared_status.warmup`.
fn spawn_warmup(
    config: &Config,
    shared_status: &Arc<RwLock<DaemonStatus>>,
    shutdown: &Arc<AtomicBool>,
) {
    if config.warmup.rules.is_empty() || !config.warmup.auto {
        // Clear stale warmup status when rules are removed or auto is disabled
        let mut status = shared_status.write().unwrap();
        status.warmup.clear();
        return;
    }

    // Pre-populate warmup status entries and bump generation
    let generation = {
        let mut status = shared_status.write().unwrap();
        status.warmup_generation += 1;
        status.warmup = config
            .warmup
            .rules
            .iter()
            .map(|rule| WarmupRuleStatus {
                share: rule.share.clone(),
                path: rule.path.clone(),
                newer_than: rule.newer_than.clone(),
                state: WarmupRuleState::Pending,
                total_files: 0,
                skipped: 0,
                cached: 0,
                errors: 0,
            })
            .collect();
        status.warmup_generation
    };

    let warmup_config = config.clone();
    let warmup_status = Arc::clone(shared_status);
    let warmup_shutdown = Arc::clone(shutdown);
    thread::spawn(move || {
        info!("Auto-warmup started (background, generation {generation})...");
        for (i, rule) in warmup_config.warmup.rules.iter().enumerate() {
            if warmup_shutdown.load(Ordering::SeqCst) {
                info!("Auto-warmup interrupted by shutdown.");
                break;
            }
            // Check if our generation is still current
            {
                let status = warmup_status.read().unwrap();
                if status.warmup_generation != generation {
                    info!("Auto-warmup superseded by newer generation.");
                    return;
                }
            }
            // Best-effort: a failed rule is logged, remaining rules still run.
            if let Err(e) = crate::cli::warmup::run_tracked(
                &warmup_config,
                &rule.share,
                &rule.path,
                rule.newer_than.as_deref(),
                &warmup_status,
                i,
                generation,
                &warmup_shutdown,
            ) {
                warn!("Warmup warning: {e}");
            }
        }
        info!("Auto-warmup complete.");
    });
}

/// Spawn per-share background threads that periodically call `vfs/refresh` to
/// keep the rclone directory listing cache warm.
/// /// Bumps the dir-refresh generation counter so any previous threads detect /// that they've been superseded and exit cleanly. Each share whose effective /// interval is non-zero gets its own `ScheduledTask` thread. fn spawn_dir_refresh( config: &Config, shared_status: &Arc>, shutdown: &Arc, ) { // Quick check: skip entirely if no share will actually refresh. let any_active = config .shares .iter() .any(|s| config.effective_dir_refresh_interval(s).is_some()); if !any_active { return; } // Bump generation and clone the shared Arc for threads. let gen_arc: Arc = { let mut s = shared_status.write().unwrap(); s.dir_refresh_generation += 1; s.dir_refresh_running = true; let g = s.dir_refresh_generation; s.dir_refresh_gen_arc.store(g, Ordering::SeqCst); Arc::clone(&s.dir_refresh_gen_arc) }; let generation = gen_arc.load(Ordering::SeqCst); for (i, share) in config.shares.iter().enumerate() { let interval = match config.effective_dir_refresh_interval(share) { Some(d) => d, None => continue, }; let share_name = share.name.clone(); let mount_point = share.mount_point.clone(); let recursive = config.dir_refresh.recursive; let rc_port = config.rc_port(i); let status = Arc::clone(shared_status); let gen_arc2 = Arc::clone(&gen_arc); let sd = Arc::clone(shutdown); info!( " dir-refresh: scheduling '{}' every {}s", share_name, interval.as_secs() ); ScheduledTask { name: "dir-refresh", interval, } .spawn(generation, gen_arc2, sd, move || { // Enumerate top-level subdirectories by reading the FUSE mount point. // The VFS root itself is not a valid vfs/refresh target in rclone. let dirs: Vec = std::fs::read_dir(&mount_point) .with_context(|| format!("dir-refresh: failed to read mount point for '{share_name}'"))? 
.filter_map(|entry| { let entry = entry.ok()?; if entry.file_type().ok()?.is_dir() { entry.file_name().into_string().ok() } else { None } }) .collect(); if dirs.is_empty() { tracing::warn!(share = %share_name, "dir-refresh: no subdirs found, skipping"); return Ok(()); } let mut ok = 0usize; let mut failed = 0usize; for dir in &dirs { match rc::vfs_refresh(rc_port, dir, recursive) { Ok(()) => { tracing::info!(share = %share_name, dir = %dir, "dir-refresh OK"); ok += 1; } Err(e) => { tracing::warn!(share = %share_name, dir = %dir, error = %e, "dir-refresh failed"); failed += 1; } } } tracing::info!( share = %share_name, dirs_ok = ok, dirs_failed = failed, "dir-refresh cycle complete" ); let mut s = status.write().unwrap(); s.last_dir_refresh.insert(share_name.clone(), SystemTime::now()); s.dir_refresh_dirs_ok.insert(share_name.clone(), ok); s.dir_refresh_dirs_failed.insert(share_name.clone(), failed); Ok(()) }); } } /// Write rclone config and create directories (protocol configs generated after probe). fn preflight(config: &Config) -> Result<()> { // Ensure mount points exist for each share for share in &config.shares { std::fs::create_dir_all(&share.mount_point).with_context(|| { format!( "Failed to create mount point: {}", share.mount_point.display() ) })?; } // Ensure cache directory exists std::fs::create_dir_all(&config.cache.dir).with_context(|| { format!( "Failed to create cache dir: {}", config.cache.dir.display() ) })?; // Generate rclone config crate::rclone::config::write_config(config)?; Ok(()) } /// Generate protocol configs (SMB/NFS) for the given config. /// /// Called after probing so only healthy shares are included. fn write_protocol_configs(config: &Config) -> Result<()> { if config.protocols.enable_smb { samba::write_config(config)?; if config.smb_auth.enabled { samba::setup_user(config)?; } } if config.protocols.enable_nfs { nfs::write_config(config)?; } Ok(()) } /// Probe all shares in parallel and return the set of healthy share names. 
///
/// Updates `shared_status` with probe results as they complete.
fn probe_all_shares(
    config: &Config,
    shared_status: &Arc<RwLock<DaemonStatus>>,
    shutdown: &AtomicBool,
) -> Result<Vec<String>> {
    use std::collections::HashSet;

    let shares: Vec<_> = config.shares.clone();
    let config_clone = config.clone();

    // Mark all shares as Probing
    {
        let mut status = shared_status.write().unwrap();
        for ss in &mut status.shares {
            ss.health = ShareHealth::Probing;
        }
    }

    // Spawn one thread per share
    let handles: Vec<_> = shares
        .into_iter()
        .map(|share| {
            let cfg = config_clone.clone();
            let name = share.name.clone();
            thread::spawn(move || {
                let result = crate::rclone::probe::probe_remote_path(&cfg, &share);
                (name, result)
            })
        })
        .collect();

    // Collect results
    let mut healthy = HashSet::new();
    for handle in handles {
        if shutdown.load(Ordering::SeqCst) {
            anyhow::bail!("Interrupted during probe");
        }
        match handle.join() {
            Ok((name, Ok(()))) => {
                info!(" Probe OK: {name}");
                let mut status = shared_status.write().unwrap();
                if let Some(ss) = status.shares.iter_mut().find(|s| s.name == name) {
                    ss.health = ShareHealth::Healthy;
                }
                healthy.insert(name);
            }
            Ok((name, Err(e))) => {
                let msg = format!("{e}");
                error!(" Probe FAILED: {name} — {msg}");
                let mut status = shared_status.write().unwrap();
                if let Some(ss) = status.shares.iter_mut().find(|s| s.name == name) {
                    ss.health = ShareHealth::Failed(msg);
                }
            }
            Err(_) => {
                // Thread panicked: the share simply never lands in `healthy`.
                error!(" Probe thread panicked");
            }
        }
    }

    Ok(healthy.into_iter().collect())
}

/// Spawn rclone mount processes for all shares and poll until each FUSE mount appears.
fn start_and_wait_mounts(config: &Config, shutdown: &AtomicBool) -> Result> { let mut children = Vec::new(); for (i, share) in config.shares.iter().enumerate() { let rc_port = config.rc_port(i); let args = build_mount_args(config, share, rc_port); let child = Command::new("rclone") .args(&args) .process_group(0) .spawn() .with_context(|| format!("Failed to spawn rclone mount for share '{}'", share.name))?; children.push(MountChild { name: share.name.clone(), child, rc_port, }); } // Poll for all mounts to become ready let deadline = Instant::now() + MOUNT_TIMEOUT; let mut ready = vec![false; config.shares.len()]; loop { if shutdown.load(Ordering::SeqCst) { for mc in &mut children { let _ = mc.child.kill(); let _ = mc.child.wait(); } anyhow::bail!("Interrupted while waiting for mounts"); } if Instant::now() > deadline { for mc in &mut children { let _ = mc.child.kill(); let _ = mc.child.wait(); } let pending: Vec<&str> = config.shares.iter() .zip(ready.iter()) .filter(|(_, r)| !**r) .map(|(s, _)| s.name.as_str()) .collect(); anyhow::bail!( "Timed out waiting for mounts ({}s). Still pending: {}", MOUNT_TIMEOUT.as_secs(), pending.join(", ") ); } // Check for early exits for (i, mc) in children.iter_mut().enumerate() { if ready[i] { continue; } match mc.child.try_wait() { Ok(Some(status)) => { anyhow::bail!( "rclone mount for '{}' exited immediately ({status}). Check remote/auth config.", mc.name ); } Ok(None) => {} Err(e) => { anyhow::bail!("Failed to check rclone mount status for '{}': {e}", mc.name); } } } // Check mount readiness let mut all_ready = true; for (i, share) in config.shares.iter().enumerate() { if ready[i] { continue; } match is_mounted(&share.mount_point) { Ok(true) => ready[i] = true, Ok(false) => all_ready = false, Err(e) => { warn!("Warning: mount check failed for '{}': {e}", share.name); all_ready = false; } } } if all_ready { break; } thread::sleep(Duration::from_millis(500)); } Ok(children) } /// Spawn smbd as a foreground child process. 
fn spawn_smbd() -> Result<Child> {
    Command::new("smbd")
        .args([
            "--foreground",
            "--debug-stdout",
            "--no-process-group",
            "--configfile",
            samba::SMB_CONF_PATH,
        ])
        .process_group(0)
        .spawn()
        .context("Failed to spawn smbd")
}

/// Start protocol services after the mount is ready.
fn start_protocols(config: &Config) -> Result<ProtocolChildren> {
    let smbd = if config.protocols.enable_smb {
        let child = spawn_smbd()?;
        info!(" SMB: started");
        Some(child)
    } else {
        None
    };

    if config.protocols.enable_nfs {
        let status = Command::new("exportfs")
            .arg("-ra")
            .status()
            .context("Failed to run exportfs -ra")?;
        if !status.success() {
            anyhow::bail!("exportfs -ra failed: {status}");
        }
        info!(" NFS: exported");
    }

    let webdav = if config.protocols.enable_webdav {
        let child = spawn_webdav(config)?;
        info!(" WebDAV: started");
        Some(child)
    } else {
        None
    };

    Ok(ProtocolChildren { smbd, webdav })
}

/// Spawn a `rclone serve webdav` child process.
fn spawn_webdav(config: &Config) -> Result<Child> {
    let args = webdav::build_serve_args(config);
    Command::new("rclone")
        .args(&args)
        .process_group(0)
        .spawn()
        .context("Failed to spawn rclone serve webdav")
}

/// Main supervision loop with command channel.
///
/// Uses `recv_timeout` on the command channel so it can both respond to
/// commands from the web UI and poll child processes every POLL_INTERVAL.
///
/// - If any rclone mount dies → full shutdown (data safety).
/// - If smbd/WebDAV dies → restart up to 3 times.
fn supervise( shared_config: &Arc>, shared_status: &Arc>, cmd_rx: &mpsc::Receiver, mounts: &mut Vec, protocols: &mut ProtocolChildren, shutdown: Arc, sse_tx: &tokio::sync::broadcast::Sender<()>, ) -> Result<()> { let mut smbd_tracker = RestartTracker::new(); let mut webdav_tracker = RestartTracker::new(); let mut prev_states: HashMap = HashMap::new(); let mut last_stats_snapshot = Instant::now(); loop { // Check for commands (non-blocking with timeout = POLL_INTERVAL) match cmd_rx.recv_timeout(POLL_INTERVAL) { Ok(SupervisorCmd::Shutdown) => { info!("Shutdown command received."); return Ok(()); } Ok(SupervisorCmd::BwLimit { up, down }) => { info!(bw_limit_up = %up, bw_limit_down = %down, "bandwidth limit applied"); apply_bwlimit(mounts, &up, &down); } Ok(SupervisorCmd::Reload(new_config)) => { info!("Config reload requested..."); handle_reload( shared_config, shared_status, mounts, protocols, &mut smbd_tracker, &mut webdav_tracker, new_config, &shutdown, )?; info!("Config reload complete."); } Err(RecvTimeoutError::Timeout) => {} // normal poll cycle Err(RecvTimeoutError::Disconnected) => { info!("Command channel disconnected, shutting down."); return Ok(()); } } // Check for shutdown signal if shutdown.load(Ordering::SeqCst) { info!("Shutdown signal received."); return Ok(()); } // Check all rclone mount processes for mc in mounts.iter_mut() { match mc.child.try_wait() { Ok(Some(status)) => { anyhow::bail!( "rclone mount for '{}' exited unexpectedly ({}). 
Initiating full shutdown for data safety.", mc.name, status ); } Ok(None) => {} Err(e) => { anyhow::bail!("Failed to check rclone mount status for '{}': {e}", mc.name); } } } // Check smbd process (if enabled) if let Some(child) = &mut protocols.smbd { match child.try_wait() { Ok(Some(status)) => { warn!("smbd exited ({status})."); if smbd_tracker.can_restart() { smbd_tracker.record_restart(); let delay = smbd_tracker.count * 2; warn!( "Restarting smbd in {delay}s ({}/{MAX_RESTARTS})...", smbd_tracker.count, ); thread::sleep(Duration::from_secs(delay.into())); match spawn_smbd() { Ok(new_child) => *child = new_child, Err(e) => { error!("Failed to restart smbd: {e}"); protocols.smbd = None; } } } else { error!( "smbd exceeded max restarts ({MAX_RESTARTS}), giving up." ); protocols.smbd = None; } } Ok(None) => {} Err(e) => warn!("Warning: failed to check smbd status: {e}"), } } // Check WebDAV process (if enabled) let config = shared_config.read().unwrap().clone(); if let Some(child) = &mut protocols.webdav { match child.try_wait() { Ok(Some(status)) => { warn!("WebDAV exited ({status})."); if webdav_tracker.can_restart() { webdav_tracker.record_restart(); let delay = webdav_tracker.count * 2; warn!( "Restarting WebDAV in {delay}s ({}/{MAX_RESTARTS})...", webdav_tracker.count, ); thread::sleep(Duration::from_secs(delay.into())); match spawn_webdav(&config) { Ok(new_child) => *child = new_child, Err(e) => { error!("Failed to restart WebDAV: {e}"); protocols.webdav = None; } } } else { error!( "WebDAV exceeded max restarts ({MAX_RESTARTS}), giving up." 
); protocols.webdav = None; } } Ok(None) => {} Err(e) => warn!("Warning: failed to check WebDAV status: {e}"), } } // Update shared status with fresh RC stats update_status(shared_status, mounts, protocols, &config); // Log cache state changes and periodic snapshots log_cache_events(shared_status, &config, &mut prev_states, &mut last_stats_snapshot); // Notify SSE subscribers that status was refreshed let _ = sse_tx.send(()); } } /// Poll RC API for each share and update the shared DaemonStatus. /// /// Matches mounts to status entries by name (not index) so the mapping /// stays correct after dynamic PerShare add/remove/modify reloads. /// /// Uses a two-phase approach to avoid holding the write lock during HTTP IO: /// Phase 1 collects all data without any lock; Phase 2 applies it under a /// short-lived write lock (pure memory writes, no IO). fn update_status( shared_status: &Arc>, mounts: &[MountChild], protocols: &ProtocolChildren, config: &Config, ) { struct ShareSnapshot { name: String, rc_port: u16, mounted: bool, cache_bytes: u64, dirty_count: u64, errored_files: u64, speed: f64, transfers: u64, errors: u64, } // Phase 1: collect all data WITHOUT holding any lock. 
let snapshots: Vec = mounts .iter() .map(|mc| { let mount_point = config .shares .iter() .find(|s| s.name == mc.name) .map(|s| s.mount_point.clone()) .unwrap_or_default(); let mounted = is_mounted(&mount_point).unwrap_or(false); let (cache_bytes, dirty_count, errored_files) = rc::vfs_stats(mc.rc_port) .ok() .and_then(|v| v.disk_cache) .map(|dc| (dc.bytes_used, dc.uploads_in_progress + dc.uploads_queued, dc.errored_files)) .unwrap_or((0, 0, 0)); let (speed, transfers, errors) = rc::core_stats(mc.rc_port) .map(|core| { let active = core.transferring.len() as u64; (if active > 0 { core.speed } else { 0.0 }, active, core.errors) }) .unwrap_or((0.0, 0, 0)); ShareSnapshot { name: mc.name.clone(), rc_port: mc.rc_port, mounted, cache_bytes, dirty_count, errored_files, speed, transfers, errors, } }) .collect(); // Phase 2: apply collected data under write lock — no IO here. let mut status = shared_status.write().unwrap(); for snap in snapshots { if let Some(ss) = status.shares.iter_mut().find(|s| s.name == snap.name) { ss.mounted = snap.mounted; ss.rc_port = snap.rc_port; ss.cache_bytes = snap.cache_bytes; ss.dirty_count = snap.dirty_count; ss.errored_files = snap.errored_files; ss.speed = snap.speed; ss.transfers = snap.transfers; ss.errors = snap.errors; } } status.smbd_running = protocols.smbd.is_some(); status.webdav_running = protocols.webdav.is_some(); status.nfs_exported = config.protocols.enable_nfs; } /// Apply bandwidth limits to all rclone mounts via RC API (Tier A — no restart). fn apply_bwlimit(mounts: &[MountChild], up: &str, down: &str) { for mc in mounts { match rc::bwlimit(mc.rc_port, Some(up), Some(down)) { Ok(_) => info!(" bwlimit applied to '{}'", mc.name), Err(e) => warn!(" bwlimit failed for '{}': {e}", mc.name), } } } /// Handle a config reload using the tiered change strategy. 
fn handle_reload( shared_config: &Arc>, shared_status: &Arc>, mounts: &mut Vec, protocols: &mut ProtocolChildren, smbd_tracker: &mut RestartTracker, webdav_tracker: &mut RestartTracker, new_config: Config, shutdown: &Arc, ) -> Result<()> { let old_config = shared_config.read().unwrap().clone(); let diff = config_diff::diff(&old_config, &new_config); if diff.is_empty() { info!(" No changes detected."); return Ok(()); } info!(" Changes: {}", diff.summary()); match diff.highest_tier() { ChangeTier::None => {} ChangeTier::Live => { // Tier A: bandwidth only — RC API call, no restart info!(" Tier A: applying bandwidth limits via RC API..."); apply_bwlimit(mounts, &new_config.bandwidth.limit_up, &new_config.bandwidth.limit_down); } ChangeTier::Protocol => { // Tier B: protocol-only changes — regen configs, restart protocol services // Also apply bandwidth if changed if diff.bandwidth_changed { apply_bwlimit(mounts, &new_config.bandwidth.limit_up, &new_config.bandwidth.limit_down); } info!(" Tier B: restarting protocol services..."); restart_protocols(protocols, smbd_tracker, webdav_tracker, &new_config)?; } ChangeTier::PerShare => { // Tier C: per-share changes — drain affected, unmount, remount if diff.bandwidth_changed { apply_bwlimit(mounts, &new_config.bandwidth.limit_up, &new_config.bandwidth.limit_down); } // Regenerate rclone.conf if connections changed if !diff.connections_added.is_empty() || !diff.connections_removed.is_empty() || !diff.connections_modified.is_empty() { info!(" Regenerating rclone.conf (connections changed)..."); crate::rclone::config::write_config(&new_config)?; } // Handle removed shares: drain → unmount → kill for name in &diff.shares_removed { info!(" Removing share '{name}'..."); if let Some(idx) = mounts.iter().position(|mc| mc.name == *name) { let mc = &mounts[idx]; wait_writeback_drain(mc.rc_port); unmount_share(&old_config, &mc.name); let mut mc = mounts.remove(idx); graceful_kill(&mut mc.child); } } // Handle modified shares: treat as 
remove + add for name in &diff.shares_modified { info!(" Restarting modified share '{name}'..."); // Remove old if let Some(idx) = mounts.iter().position(|mc| mc.name == *name) { let mc = &mounts[idx]; wait_writeback_drain(mc.rc_port); unmount_share(&old_config, &mc.name); let mut mc = mounts.remove(idx); graceful_kill(&mut mc.child); } // Add new if let Some((i, share)) = new_config.shares.iter().enumerate().find(|(_, s)| s.name == *name) { let rc_port = new_config.rc_port(i); if let Ok(mc) = spawn_mount(&new_config, share, rc_port) { mounts.push(mc); } } } // Handle added shares: spawn new mount for name in &diff.shares_added { info!(" Adding share '{name}'..."); if let Some((i, share)) = new_config.shares.iter().enumerate().find(|(_, s)| s.name == *name) { let rc_port = new_config.rc_port(i); std::fs::create_dir_all(&share.mount_point).ok(); if let Ok(mc) = spawn_mount(&new_config, share, rc_port) { mounts.push(mc); } } } // Update protocol configs to reflect share changes if diff.protocols_changed { // Protocol settings changed too — full restart needed restart_protocols(protocols, smbd_tracker, webdav_tracker, &new_config)?; } else if !diff.shares_removed.is_empty() || !diff.shares_added.is_empty() || !diff.shares_modified.is_empty() { // Only shares changed — live reload is sufficient reload_protocol_configs(protocols, &new_config)?; } } ChangeTier::Global => { // Tier D: global restart — drain all → stop everything → restart info!(" Tier D: full restart (global settings changed)..."); // Drain all write-back queues for mc in mounts.iter() { wait_writeback_drain(mc.rc_port); } // Stop all protocol services stop_protocols(protocols, &old_config); // Unmount and kill all rclone instances for mc in mounts.iter_mut() { unmount_share(&old_config, &mc.name); graceful_kill(&mut mc.child); } mounts.clear(); // Re-preflight with new config preflight(&new_config)?; // Re-probe all shares let shutdown_flag = AtomicBool::new(false); let healthy_names = 
probe_all_shares(&new_config, shared_status, &shutdown_flag)?; // Build healthy-only config for mounts and protocols let mut healthy_config = new_config.clone(); healthy_config .shares .retain(|s| healthy_names.contains(&s.name)); write_protocol_configs(&healthy_config)?; // Re-start mounts (healthy only) let mut new_mounts = start_and_wait_mounts(&healthy_config, &shutdown_flag)?; mounts.append(&mut new_mounts); // Re-start protocols let new_protocols = start_protocols(&healthy_config)?; // Replace old protocol children (Drop will handle any leftover) *protocols = new_protocols; *smbd_tracker = RestartTracker::new(); *webdav_tracker = RestartTracker::new(); } } // Update shared config { let mut cfg = shared_config.write().unwrap(); *cfg = new_config.clone(); } // Update shared status with new share list { let mut status = shared_status.write().unwrap(); let new_shares: Vec = new_config .shares .iter() .enumerate() .map(|(i, s)| { // Preserve existing stats if share still exists let existing = status.shares.iter().find(|ss| ss.name == s.name); crate::daemon::ShareStatus { name: s.name.clone(), mounted: existing.map(|e| e.mounted).unwrap_or(false), rc_port: new_config.rc_port(i), cache_bytes: existing.map(|e| e.cache_bytes).unwrap_or(0), dirty_count: existing.map(|e| e.dirty_count).unwrap_or(0), errored_files: existing.map(|e| e.errored_files).unwrap_or(0), speed: existing.map(|e| e.speed).unwrap_or(0.0), transfers: existing.map(|e| e.transfers).unwrap_or(0), errors: existing.map(|e| e.errors).unwrap_or(0), health: existing .map(|e| e.health.clone()) .unwrap_or_else(|| { // New share: if mount succeeded, it's healthy if mounts.iter().any(|mc| mc.name == s.name) { ShareHealth::Healthy } else { ShareHealth::Pending } }), } }) .collect(); status.shares = new_shares; status.smbd_running = protocols.smbd.is_some(); status.webdav_running = protocols.webdav.is_some(); status.nfs_exported = new_config.protocols.enable_nfs; } // Re-trigger warmup if settings changed if 
diff.warmup_changed { info!(" Warmup settings changed, re-triggering..."); spawn_warmup(&new_config, shared_status, shutdown); } // Re-trigger dir-refresh if settings changed if diff.dir_refresh_changed { info!(" Dir-refresh settings changed, re-triggering..."); spawn_dir_refresh(&new_config, shared_status, shutdown); } Ok(()) } /// Spawn a single rclone mount for a share. fn spawn_mount(config: &Config, share: &crate::config::ShareConfig, rc_port: u16) -> Result { let args = build_mount_args(config, share, rc_port); let child = Command::new("rclone") .args(&args) .process_group(0) .spawn() .with_context(|| format!("Failed to spawn rclone mount for share '{}'", share.name))?; // Wait for mount to appear let deadline = Instant::now() + MOUNT_TIMEOUT; loop { if Instant::now() > deadline { anyhow::bail!("Timed out waiting for mount '{}'", share.name); } match is_mounted(&share.mount_point) { Ok(true) => break, _ => thread::sleep(Duration::from_millis(500)), } } info!(" Mount ready: {} at {}", share.name, share.mount_point.display()); Ok(MountChild { name: share.name.clone(), child, rc_port, }) } /// Unmount a single share's FUSE mount. fn unmount_share(config: &Config, share_name: &str) { if let Some(share) = config.find_share(share_name) { if is_mounted(&share.mount_point).unwrap_or(false) { let mp = share.mount_point.display().to_string(); let unmounted = Command::new("fusermount3") .args(["-uz", &mp]) .status() .map(|s| s.success()) .unwrap_or(false); if !unmounted { let _ = Command::new("fusermount") .args(["-uz", &mp]) .status(); } } } } /// Stop protocol services only (without touching mounts). 
fn stop_protocols(protocols: &mut ProtocolChildren, config: &Config) {
    if let Some(child) = &mut protocols.smbd {
        graceful_kill(child);
        info!(" SMB: stopped");
    }
    protocols.smbd = None;

    if config.protocols.enable_nfs {
        let _ = Command::new("exportfs").arg("-ua").status();
        info!(" NFS: unexported");
    }

    if let Some(child) = &mut protocols.webdav {
        graceful_kill(child);
        info!(" WebDAV: stopped");
    }
    protocols.webdav = None;
}

/// Reload protocol configs without full restart (share add/remove/modify).
///
/// Writes updated smb.conf / NFS exports, then signals the running services
/// to re-read them:
/// - smbd: SIGHUP causes it to reload smb.conf (new shares appear, removed
///   shares disappear for new connections).
/// - NFS: `exportfs -ra` re-reads the exports file.
/// - WebDAV: no action needed (serves from FUSE mount directly).
fn reload_protocol_configs(protocols: &ProtocolChildren, config: &Config) -> Result<()> {
    if config.protocols.enable_smb {
        samba::write_config(config)?;
        if let Some(child) = &protocols.smbd {
            let pid = child.id() as i32;
            // SAFETY: sending SIGHUP to a known child PID is safe.
            unsafe { libc::kill(pid, libc::SIGHUP) };
            info!(" SMB: config reloaded (SIGHUP)");
        }
    }
    if config.protocols.enable_nfs {
        nfs::write_config(config)?;
        let _ = Command::new("exportfs").arg("-ra").status();
        info!(" NFS: re-exported");
    }
    Ok(())
}

/// Restart protocol services (Tier B). Regen configs and restart smbd/NFS/WebDAV.
fn restart_protocols( protocols: &mut ProtocolChildren, smbd_tracker: &mut RestartTracker, webdav_tracker: &mut RestartTracker, config: &Config, ) -> Result<()> { // Stop existing stop_protocols(protocols, config); // Regenerate configs if config.protocols.enable_smb { samba::write_config(config)?; if config.smb_auth.enabled { samba::setup_user(config)?; } } if config.protocols.enable_nfs { nfs::write_config(config)?; } // Start fresh let new_protocols = start_protocols(config)?; *protocols = new_protocols; *smbd_tracker = RestartTracker::new(); *webdav_tracker = RestartTracker::new(); Ok(()) } /// Send SIGTERM to the entire process group, wait up to `SIGTERM_GRACE`, /// then SIGKILL if still alive. /// /// All children are spawned with `.process_group(0)` so the child PID equals /// the process group ID. Using `-pid` ensures forked workers (e.g. smbd /// per-client forks) are also terminated — otherwise orphaned workers hold /// the listening socket and prevent the new process from binding. fn graceful_kill(child: &mut Child) { let pid = child.id() as i32; // SAFETY: sending a signal to a known child process group is safe. unsafe { libc::kill(-pid, libc::SIGTERM) }; let deadline = Instant::now() + SIGTERM_GRACE; loop { match child.try_wait() { Ok(Some(_)) => return, Ok(None) => {} Err(_) => break, } if Instant::now() > deadline { break; } thread::sleep(Duration::from_millis(100)); } // Escalate: SIGKILL the entire process group unsafe { libc::kill(-pid, libc::SIGKILL) }; let _ = child.wait(); } /// Wait for rclone VFS write-back queue to drain on a specific RC port. 
fn wait_writeback_drain(port: u16) {
    let deadline = Instant::now() + WRITEBACK_DRAIN_TIMEOUT;
    // Whether we have already logged the initial "waiting" message.
    let mut announced = false;
    loop {
        // If the RC endpoint is unreachable or reports no disk cache, there
        // is nothing to drain — best-effort, give up silently.
        let Ok(vfs) = rc::vfs_stats(port) else { return };
        let Some(dc) = &vfs.disk_cache else { return };

        let pending = dc.uploads_in_progress + dc.uploads_queued;
        if pending == 0 {
            // Only report success if we previously reported waiting.
            if announced {
                info!(" Write-back queue drained.");
            }
            return;
        }
        if announced {
            info!(" Write-back: {pending} files remaining...");
        } else {
            info!(" Waiting for write-back queue ({pending} files pending)...");
            announced = true;
        }

        if Instant::now() > deadline {
            warn!(
                " Warning: write-back drain timed out after {}s, proceeding.",
                WRITEBACK_DRAIN_TIMEOUT.as_secs()
            );
            return;
        }
        thread::sleep(WRITEBACK_POLL_INTERVAL);
    }
}

/// Reverse-order teardown of all services.
fn shutdown_services(config: &Config, mounts: &mut Vec<MountChild>, protocols: &mut ProtocolChildren) {
    // 1. Protocol servers first, so no new client writes arrive.
    stop_protocols(protocols, config);

    // 2. Let each mount flush its pending uploads.
    for mc in mounts.iter() {
        wait_writeback_drain(mc.rc_port);
    }

    // 3. Lazy-unmount every mounted share (fusermount3, then fusermount).
    for share in config
        .shares
        .iter()
        .filter(|s| is_mounted(&s.mount_point).unwrap_or(false))
    {
        let mp = share.mount_point.display().to_string();
        let ok = Command::new("fusermount3")
            .args(["-uz", &mp])
            .status()
            .map(|s| s.success())
            .unwrap_or(false);
        if !ok {
            let _ = Command::new("fusermount").args(["-uz", &mp]).status();
        }
    }
    info!(" FUSE: unmounted");

    // 4. Finally terminate the rclone processes themselves.
    for mc in mounts.iter_mut() {
        graceful_kill(&mut mc.child);
    }
    info!(" rclone: stopped");
}

/// Detect cache state changes and emit structured log events; also emit
/// periodic stats snapshots. Called after every `update_status()` cycle.
fn log_cache_events(
    shared_status: &Arc<RwLock<DaemonStatus>>,
    config: &Config,
    prev_states: &mut HashMap<String, SharePrevState>,
    last_snapshot: &mut Instant,
) {
    let status = shared_status.read().unwrap();
    // None if max_size isn't parseable — warn-level checks are skipped then.
    let max_bytes = parse_size_bytes(&config.cache.max_size);
    let emit_snapshot = last_snapshot.elapsed() >= STATS_SNAPSHOT_INTERVAL;
    for ss in &status.shares {
        // Unmounted shares report no meaningful cache stats.
        if !ss.mounted {
            continue;
        }
        let now_active = ss.speed > SPEED_ACTIVE_THRESHOLD;
        // 0 = normal, 1 = >=80% full, 2 = >=95% full (see SharePrevState).
        let now_warn_level = if let Some(max) = max_bytes {
            let frac = ss.cache_bytes as f64 / max as f64;
            if frac >= CACHE_CRITICAL_THRESHOLD {
                2
            } else if frac >= CACHE_WARN_THRESHOLD {
                1
            } else {
                0
            }
        } else {
            0
        };
        // First time a share is seen, prev is seeded with the CURRENT values,
        // so no change events fire on the first cycle (snapshots still do).
        let prev = prev_states.entry(ss.name.clone()).or_insert(SharePrevState {
            dirty_count: ss.dirty_count,
            errored_files: ss.errored_files,
            cache_bytes: ss.cache_bytes,
            is_active: now_active,
            cache_warn_level: now_warn_level,
        });
        // dirty_count change
        if ss.dirty_count != prev.dirty_count {
            info!(share = %ss.name, dirty_count = ss.dirty_count, prev = prev.dirty_count, "cache dirty_count changed");
        }
        // errored_files change: increases are WARN, decreases (recovery) INFO.
        if ss.errored_files != prev.errored_files {
            if ss.errored_files > prev.errored_files {
                warn!(share = %ss.name, errored_files = ss.errored_files, prev = prev.errored_files, "cache errored_files increased");
            } else {
                info!(share = %ss.name, errored_files = ss.errored_files, prev = prev.errored_files, "cache errored_files cleared");
            }
        }
        // cache_bytes change >10% (relative to previous; the zero->nonzero
        // transition is reported separately to avoid dividing by zero).
        if prev.cache_bytes == 0 && ss.cache_bytes > 0 {
            info!(share = %ss.name, cache_bytes = ss.cache_bytes, "cache population started");
        } else if prev.cache_bytes > 0 {
            let delta = (ss.cache_bytes as f64 - prev.cache_bytes as f64).abs() / prev.cache_bytes as f64;
            if delta > 0.10 {
                info!(share = %ss.name, cache_bytes = ss.cache_bytes, prev = prev.cache_bytes, "cache size changed >10%");
            }
        }
        // transfer idle/active transition
        if now_active != prev.is_active {
            if now_active {
                info!(share = %ss.name, speed_bps = ss.speed as u64, "transfer became active");
            } else {
                info!(share = %ss.name, "transfer became idle");
            }
        }
        // cache warn level change or periodic snapshot — note the warning is
        // re-emitted on every snapshot interval while the level stays high,
        // not only on the transition.
        if now_warn_level != prev.cache_warn_level || emit_snapshot {
            if now_warn_level == 2 {
                if let Some(max) = max_bytes {
                    warn!(share = %ss.name, cache_bytes = ss.cache_bytes, cache_max = max, "cache critically full (>=95%)");
                }
            } else if now_warn_level == 1 {
                if let Some(max) = max_bytes {
                    warn!(share = %ss.name, cache_bytes = ss.cache_bytes, cache_max = max, "cache nearly full (>=80%)");
                }
            }
        }
        // periodic stats snapshot
        if emit_snapshot {
            info!(share = %ss.name, cache_bytes = ss.cache_bytes, dirty_count = ss.dirty_count, errored_files = ss.errored_files, speed_bps = ss.speed as u64, transfers = ss.transfers, errors = ss.errors, "stats snapshot");
        }
        // Carry current observations into the next poll cycle.
        prev.dirty_count = ss.dirty_count;
        prev.errored_files = ss.errored_files;
        prev.cache_bytes = ss.cache_bytes;
        prev.is_active = now_active;
        prev.cache_warn_level = now_warn_level;
    }
    // Reset the snapshot timer only after a snapshot cycle actually ran.
    if emit_snapshot {
        *last_snapshot = Instant::now();
    }
}

/// Parse a human-readable size string (e.g. "200G", "1.5T", "512M") into bytes.
/// Accepted suffixes: K, M, G, T, P (binary multiples, case-insensitive),
/// optionally followed by a trailing `B` (e.g. "200GB"). No suffix means
/// plain bytes. Returns `None` for unparseable input, unknown suffixes, or
/// non-finite / negative values.
fn parse_size_bytes(s: &str) -> Option<u64> {
    let s = s.trim();
    // Split at the first alphabetic character: "1.5T" -> ("1.5", "T").
    let (num_part, suffix) = s
        .find(|c: char| c.is_alphabetic())
        .map(|i| s.split_at(i))
        .unwrap_or((s, ""));
    let n: f64 = num_part.trim().parse().ok()?;
    // Reject NaN/inf and negative sizes: casting those to u64 would silently
    // saturate to 0 and masquerade as a valid (zero-byte) limit.
    if !n.is_finite() || n < 0.0 {
        return None;
    }
    let mult: f64 = match suffix.to_uppercase().trim_end_matches('B') {
        "" => 1.0,
        "K" => 1024.0,
        "M" => 1024.0_f64.powi(2),
        "G" => 1024.0_f64.powi(3),
        "T" => 1024.0_f64.powi(4),
        "P" => 1024.0_f64.powi(5),
        _ => return None,
    };
    Some((n * mult) as u64)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_restart_tracker_new() {
        let tracker = RestartTracker::new();
        assert_eq!(tracker.count, 0);
        assert!(tracker.last_restart.is_none());
    }

    #[test]
    fn test_restart_tracker_record_restart() {
        let mut tracker = RestartTracker::new();
        tracker.record_restart();
        assert_eq!(tracker.count, 1);
        assert!(tracker.last_restart.is_some());
    }

    #[test]
    fn test_restart_tracker_can_restart_under_max() {
        let mut tracker = RestartTracker::new();
        assert!(tracker.can_restart());
        tracker.record_restart();
        assert!(tracker.can_restart()); // count = 1
        tracker.record_restart();
        assert!(tracker.can_restart()); // count = 2
    }

    #[test]
    fn test_restart_tracker_cannot_restart_at_max() {
        let mut tracker = RestartTracker::new();
        for _ in 0..MAX_RESTARTS {
            tracker.record_restart();
        }
        assert!(!tracker.can_restart()); // count = 3 = MAX_RESTARTS
    }

    #[test]
    fn test_restart_tracker_backoff_delay() {
        let mut tracker = RestartTracker::new();
        tracker.record_restart();
        assert_eq!(tracker.count * 2, 2); // 2s delay
        tracker.record_restart();
        assert_eq!(tracker.count * 2, 4); // 4s delay
        tracker.record_restart();
        assert_eq!(tracker.count * 2, 6); // 6s delay
    }

    #[test]
    fn test_restart_tracker_multiple_record() {
        let mut tracker = RestartTracker::new();
        tracker.record_restart();
        tracker.record_restart();
        tracker.record_restart();
        assert_eq!(tracker.count, 3);
        assert!(!tracker.can_restart());
    }

    #[test]
    fn test_constants() {
        assert_eq!(MOUNT_TIMEOUT, Duration::from_secs(30));
        assert_eq!(POLL_INTERVAL, Duration::from_secs(2));
        assert_eq!(SIGTERM_GRACE, Duration::from_secs(3));
        assert_eq!(MAX_RESTARTS, 3);
        assert_eq!(RESTART_STABLE_PERIOD, Duration::from_secs(300));
        assert_eq!(WRITEBACK_DRAIN_TIMEOUT, Duration::from_secs(300));
        assert_eq!(WRITEBACK_POLL_INTERVAL, Duration::from_secs(2));
    }

    #[test]
    fn test_parse_size_bytes() {
        assert_eq!(parse_size_bytes("200G"), Some(200 * 1024 * 1024 * 1024));
        assert_eq!(parse_size_bytes("1T"), Some(1024 * 1024 * 1024 * 1024));
        assert_eq!(parse_size_bytes("512M"), Some(512 * 1024 * 1024));
        assert_eq!(parse_size_bytes("1024K"), Some(1024 * 1024));
        assert_eq!(parse_size_bytes("1024"), Some(1024));
        assert_eq!(parse_size_bytes("200GB"), Some(200 * 1024 * 1024 * 1024));
        assert_eq!(parse_size_bytes("bogus"), None);
        // Petabyte suffix support.
        assert_eq!(parse_size_bytes("1P"), Some(1u64 << 50));
        // Fractional values.
        assert_eq!(parse_size_bytes("1.5K"), Some(1536));
        // Negative sizes are rejected rather than saturating to Some(0).
        assert_eq!(parse_size_bytes("-1G"), None);
        // Empty / whitespace-only input.
        assert_eq!(parse_size_bytes(""), None);
        assert_eq!(parse_size_bytes("   "), None);
    }
}