From e6c48c9bd94f0dc0dfd06ee0e965546d45dc892f Mon Sep 17 00:00:00 2001 From: grabbit Date: Wed, 18 Feb 2026 09:56:09 +0800 Subject: [PATCH] Harden supervisor shutdown: process group isolation, write-back drain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Spawn all children (rclone, smbd, webdav) in isolated process groups so Ctrl+C doesn't reach them directly — supervisor controls shutdown order - Wait for rclone VFS write-back queue to drain before unmounting (5min cap) - Prefer fusermount3 over fusermount, skip if already unmounted Co-Authored-By: Claude Opus 4.6 --- warpgate/src/supervisor.rs | 79 +++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 5 deletions(-) diff --git a/warpgate/src/supervisor.rs b/warpgate/src/supervisor.rs index 137fa2b..3d85b37 100644 --- a/warpgate/src/supervisor.rs +++ b/warpgate/src/supervisor.rs @@ -4,6 +4,7 @@ //! coordinated startup and shutdown. Designed to run as a systemd unit //! or standalone (Docker-friendly). +use std::os::unix::process::CommandExt; use std::process::{Child, Command}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; @@ -26,6 +27,10 @@ const SIGTERM_GRACE: Duration = Duration::from_secs(3); const MAX_RESTARTS: u32 = 3; /// Reset restart counter after this period of stable running. const RESTART_STABLE_PERIOD: Duration = Duration::from_secs(300); +/// Max time to wait for write-back queue to drain on shutdown. +const WRITEBACK_DRAIN_TIMEOUT: Duration = Duration::from_secs(300); +/// Poll interval when waiting for write-back drain. +const WRITEBACK_POLL_INTERVAL: Duration = Duration::from_secs(2); /// Tracks restart attempts for a supervised child process. struct RestartTracker { @@ -170,6 +175,7 @@ fn start_and_wait_mount(config: &Config, shutdown: &AtomicBool) -> Result let mut child = Command::new("rclone") .args(&args) + .process_group(0) // isolate from terminal SIGINT .spawn() .context("Failed to spawn rclone mount")?; @@ -221,6 +227,7 @@ fn spawn_smbd() -> Result { Command::new("smbd") .args(["--foreground", "--debug-stdout", "--no-process-group", "--configfile", samba::SMB_CONF_PATH]) + .process_group(0) .spawn() .context("Failed to spawn smbd") } @@ -266,6 +273,7 @@ fn spawn_webdav(config: &Config) -> Result { let args = webdav::build_serve_args(config); Command::new("rclone") .args(&args) + .process_group(0) .spawn() .context("Failed to spawn rclone serve webdav") } @@ -401,6 +409,55 @@ fn graceful_kill(child: &mut Child) { let _ = child.wait(); } +/// Wait for rclone VFS write-back queue to drain. +/// +/// Polls `vfs/stats` every 2s. Exits when uploads_in_progress + uploads_queued +/// reaches 0, or after 5 minutes (safety cap to avoid hanging forever). +fn wait_writeback_drain() { + use crate::rclone::rc; + + let deadline = Instant::now() + WRITEBACK_DRAIN_TIMEOUT; + let mut first = true; + + loop { + match rc::vfs_stats() { + Ok(vfs) => { + if let Some(dc) = &vfs.disk_cache { + let pending = dc.uploads_in_progress + dc.uploads_queued; + if pending == 0 { + if !first { + println!(" Write-back queue drained."); + } + return; + } + if first { + println!( + " Waiting for write-back queue ({pending} files pending)..." + ); + first = false; + } else { + eprint!("\r Write-back: {pending} files remaining... "); + } + } else { + return; // no cache info → nothing to wait for + } + } + Err(_) => return, // RC API unavailable → rclone already gone + } + + if Instant::now() > deadline { + eprintln!(); + eprintln!( + " Warning: write-back drain timed out after {}s, proceeding with shutdown.", + WRITEBACK_DRAIN_TIMEOUT.as_secs() + ); + return; + } + + thread::sleep(WRITEBACK_POLL_INTERVAL); + } +} + /// Reverse-order teardown of all services. /// /// Order: stop smbd → unexport NFS → kill WebDAV → unmount FUSE → kill rclone. @@ -423,11 +480,23 @@ fn shutdown_services(config: &Config, mount: &mut Child, protocols: &mut Protoco println!(" WebDAV: stopped"); } - // Lazy unmount FUSE - let mount_point = config.mount.point.display().to_string(); - let _ = Command::new("fusermount") - .args(["-uz", &mount_point]) - .status(); + // Wait for write-back queue to drain before unmounting + wait_writeback_drain(); + + // Lazy unmount FUSE (skip if rclone already unmounted on signal) + if is_mounted(config).unwrap_or(false) { + let mount_point = config.mount.point.display().to_string(); + let unmounted = Command::new("fusermount3") + .args(["-uz", &mount_point]) + .status() + .map(|s| s.success()) + .unwrap_or(false); + if !unmounted { + let _ = Command::new("fusermount") + .args(["-uz", &mount_point]) + .status(); + } + } println!(" FUSE: unmounted"); // Gracefully stop rclone