Harden supervisor shutdown: process group isolation, write-back drain

- Spawn all children (rclone, smbd, webdav) in isolated process groups
  so Ctrl+C doesn't reach them directly — supervisor controls shutdown order
- Wait for rclone VFS write-back queue to drain before unmounting (5min cap)
- Prefer fusermount3 over fusermount, skip if already unmounted

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
grabbit 2026-02-18 09:56:09 +08:00
parent 960ddd20ce
commit e6c48c9bd9

View File

@ -4,6 +4,7 @@
//! coordinated startup and shutdown. Designed to run as a systemd unit
//! or standalone (Docker-friendly).
use std::os::unix::process::CommandExt;
use std::process::{Child, Command};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
@ -26,6 +27,10 @@ const SIGTERM_GRACE: Duration = Duration::from_secs(3);
const MAX_RESTARTS: u32 = 3;
/// Reset restart counter after this period of stable running.
const RESTART_STABLE_PERIOD: Duration = Duration::from_secs(300);
/// Max time to wait for write-back queue to drain on shutdown.
///
/// Safety cap (300 s = 5 minutes): once it has elapsed, `wait_writeback_drain`
/// prints a warning and lets shutdown proceed even if uploads are still pending.
const WRITEBACK_DRAIN_TIMEOUT: Duration = Duration::from_secs(300);
/// Poll interval when waiting for write-back drain.
///
/// `wait_writeback_drain` sleeps this long between successive `rc::vfs_stats()`
/// queries.
const WRITEBACK_POLL_INTERVAL: Duration = Duration::from_secs(2);
/// Tracks restart attempts for a supervised child process.
struct RestartTracker {
@ -170,6 +175,7 @@ fn start_and_wait_mount(config: &Config, shutdown: &AtomicBool) -> Result<Child>
let mut child = Command::new("rclone")
.args(&args)
.process_group(0) // isolate from terminal SIGINT
.spawn()
.context("Failed to spawn rclone mount")?;
@ -221,6 +227,7 @@ fn spawn_smbd() -> Result<Child> {
Command::new("smbd")
.args(["--foreground", "--debug-stdout", "--no-process-group",
"--configfile", samba::SMB_CONF_PATH])
.process_group(0)
.spawn()
.context("Failed to spawn smbd")
}
@ -266,6 +273,7 @@ fn spawn_webdav(config: &Config) -> Result<Child> {
let args = webdav::build_serve_args(config);
Command::new("rclone")
.args(&args)
.process_group(0)
.spawn()
.context("Failed to spawn rclone serve webdav")
}
@ -401,6 +409,55 @@ fn graceful_kill(child: &mut Child) {
let _ = child.wait();
}
/// Wait for rclone VFS write-back queue to drain.
///
/// Polls `rc::vfs_stats()` every [`WRITEBACK_POLL_INTERVAL`] and returns as soon as
/// one of the following holds:
///
/// - `uploads_in_progress + uploads_queued` reaches 0 (queue drained);
/// - the stats carry no disk-cache section (nothing to wait for);
/// - the RC call fails (treated as "rclone is already gone");
/// - [`WRITEBACK_DRAIN_TIMEOUT`] has elapsed (safety cap to avoid hanging forever).
///
/// The initial notice and the "drained" message go to stdout; the in-place
/// progress counter and the timeout warning go to stderr. The progress counter is
/// rewritten with `\r` and no trailing newline, so every exit path terminates
/// that line first to keep later output from being glued onto it.
fn wait_writeback_drain() {
    use crate::rclone::rc;

    /// Ends the in-place progress line on stderr, if one is currently open.
    fn finish_progress_line(open: bool) {
        if open {
            eprintln!();
        }
    }

    let deadline = Instant::now() + WRITEBACK_DRAIN_TIMEOUT;
    let mut first = true;
    // True while an unterminated `\r` progress line is sitting on stderr.
    let mut progress_open = false;

    loop {
        let pending = match rc::vfs_stats() {
            Ok(vfs) => match &vfs.disk_cache {
                Some(dc) => dc.uploads_in_progress + dc.uploads_queued,
                None => {
                    // no cache info → nothing to wait for
                    finish_progress_line(progress_open);
                    return;
                }
            },
            Err(_) => {
                // RC API unavailable → rclone already gone
                finish_progress_line(progress_open);
                return;
            }
        };

        if pending == 0 {
            // Stay silent when the queue was already empty on the first poll.
            if !first {
                finish_progress_line(progress_open);
                println!(" Write-back queue drained.");
            }
            return;
        }

        if first {
            println!(" Waiting for write-back queue ({pending} files pending)...");
            first = false;
        } else {
            eprint!("\r Write-back: {pending} files remaining... ");
            progress_open = true;
        }

        // The deadline is checked after polling so that at least one stats query
        // always runs, even if the RC call itself was slow.
        if Instant::now() > deadline {
            finish_progress_line(progress_open);
            eprintln!(
                " Warning: write-back drain timed out after {}s, proceeding with shutdown.",
                WRITEBACK_DRAIN_TIMEOUT.as_secs()
            );
            return;
        }

        thread::sleep(WRITEBACK_POLL_INTERVAL);
    }
}
/// Reverse-order teardown of all services.
///
/// Order: stop smbd → unexport NFS → kill WebDAV → drain write-back queue → unmount FUSE → kill rclone.
@ -423,11 +480,23 @@ fn shutdown_services(config: &Config, mount: &mut Child, protocols: &mut Protoco
println!(" WebDAV: stopped");
}
// Lazy unmount FUSE
let mount_point = config.mount.point.display().to_string();
let _ = Command::new("fusermount")
.args(["-uz", &mount_point])
.status();
// Wait for write-back queue to drain before unmounting
wait_writeback_drain();
// Lazy unmount FUSE (skip if rclone already unmounted on signal)
if is_mounted(config).unwrap_or(false) {
let mount_point = config.mount.point.display().to_string();
let unmounted = Command::new("fusermount3")
.args(["-uz", &mount_point])
.status()
.map(|s| s.success())
.unwrap_or(false);
if !unmounted {
let _ = Command::new("fusermount")
.args(["-uz", &mount_point])
.status();
}
}
println!(" FUSE: unmounted");
// Gracefully stop rclone