Harden supervisor shutdown: process group isolation, write-back drain
- Spawn all children (rclone, smbd, webdav) in isolated process groups so Ctrl+C doesn't reach them directly — supervisor controls shutdown order
- Wait for rclone VFS write-back queue to drain before unmounting (5min cap)
- Prefer fusermount3 over fusermount, skip if already unmounted

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
960ddd20ce
commit
e6c48c9bd9
@ -4,6 +4,7 @@
|
||||
//! coordinated startup and shutdown. Designed to run as a systemd unit
|
||||
//! or standalone (Docker-friendly).
|
||||
|
||||
use std::os::unix::process::CommandExt;
|
||||
use std::process::{Child, Command};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
@ -26,6 +27,10 @@ const SIGTERM_GRACE: Duration = Duration::from_secs(3);
|
||||
const MAX_RESTARTS: u32 = 3;
|
||||
/// Reset restart counter after this period of stable running.
|
||||
const RESTART_STABLE_PERIOD: Duration = Duration::from_secs(300);
|
||||
/// Max time to wait for write-back queue to drain on shutdown.
|
||||
const WRITEBACK_DRAIN_TIMEOUT: Duration = Duration::from_secs(300);
|
||||
/// Poll interval when waiting for write-back drain.
|
||||
const WRITEBACK_POLL_INTERVAL: Duration = Duration::from_secs(2);
|
||||
|
||||
/// Tracks restart attempts for a supervised child process.
|
||||
struct RestartTracker {
|
||||
@ -170,6 +175,7 @@ fn start_and_wait_mount(config: &Config, shutdown: &AtomicBool) -> Result<Child>
|
||||
|
||||
let mut child = Command::new("rclone")
|
||||
.args(&args)
|
||||
.process_group(0) // isolate from terminal SIGINT
|
||||
.spawn()
|
||||
.context("Failed to spawn rclone mount")?;
|
||||
|
||||
@ -221,6 +227,7 @@ fn spawn_smbd() -> Result<Child> {
|
||||
Command::new("smbd")
|
||||
.args(["--foreground", "--debug-stdout", "--no-process-group",
|
||||
"--configfile", samba::SMB_CONF_PATH])
|
||||
.process_group(0)
|
||||
.spawn()
|
||||
.context("Failed to spawn smbd")
|
||||
}
|
||||
@ -266,6 +273,7 @@ fn spawn_webdav(config: &Config) -> Result<Child> {
|
||||
let args = webdav::build_serve_args(config);
|
||||
Command::new("rclone")
|
||||
.args(&args)
|
||||
.process_group(0)
|
||||
.spawn()
|
||||
.context("Failed to spawn rclone serve webdav")
|
||||
}
|
||||
@ -401,6 +409,55 @@ fn graceful_kill(child: &mut Child) {
|
||||
let _ = child.wait();
|
||||
}
|
||||
|
||||
/// Wait for rclone VFS write-back queue to drain.
|
||||
///
|
||||
/// Polls `vfs/stats` every 2s. Exits when uploads_in_progress + uploads_queued
|
||||
/// reaches 0, or after 5 minutes (safety cap to avoid hanging forever).
|
||||
fn wait_writeback_drain() {
|
||||
use crate::rclone::rc;
|
||||
|
||||
let deadline = Instant::now() + WRITEBACK_DRAIN_TIMEOUT;
|
||||
let mut first = true;
|
||||
|
||||
loop {
|
||||
match rc::vfs_stats() {
|
||||
Ok(vfs) => {
|
||||
if let Some(dc) = &vfs.disk_cache {
|
||||
let pending = dc.uploads_in_progress + dc.uploads_queued;
|
||||
if pending == 0 {
|
||||
if !first {
|
||||
println!(" Write-back queue drained.");
|
||||
}
|
||||
return;
|
||||
}
|
||||
if first {
|
||||
println!(
|
||||
" Waiting for write-back queue ({pending} files pending)..."
|
||||
);
|
||||
first = false;
|
||||
} else {
|
||||
eprint!("\r Write-back: {pending} files remaining... ");
|
||||
}
|
||||
} else {
|
||||
return; // no cache info → nothing to wait for
|
||||
}
|
||||
}
|
||||
Err(_) => return, // RC API unavailable → rclone already gone
|
||||
}
|
||||
|
||||
if Instant::now() > deadline {
|
||||
eprintln!();
|
||||
eprintln!(
|
||||
" Warning: write-back drain timed out after {}s, proceeding with shutdown.",
|
||||
WRITEBACK_DRAIN_TIMEOUT.as_secs()
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
thread::sleep(WRITEBACK_POLL_INTERVAL);
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverse-order teardown of all services.
|
||||
///
|
||||
/// Order: stop smbd → unexport NFS → kill WebDAV → drain write-back queue → unmount FUSE → kill rclone.
|
||||
@ -423,11 +480,23 @@ fn shutdown_services(config: &Config, mount: &mut Child, protocols: &mut Protoco
|
||||
println!(" WebDAV: stopped");
|
||||
}
|
||||
|
||||
// Lazy unmount FUSE
|
||||
// Wait for write-back queue to drain before unmounting
|
||||
wait_writeback_drain();
|
||||
|
||||
// Lazy unmount FUSE (skip if rclone already unmounted on signal)
|
||||
if is_mounted(config).unwrap_or(false) {
|
||||
let mount_point = config.mount.point.display().to_string();
|
||||
let unmounted = Command::new("fusermount3")
|
||||
.args(["-uz", &mount_point])
|
||||
.status()
|
||||
.map(|s| s.success())
|
||||
.unwrap_or(false);
|
||||
if !unmounted {
|
||||
let _ = Command::new("fusermount")
|
||||
.args(["-uz", &mount_point])
|
||||
.status();
|
||||
}
|
||||
}
|
||||
println!(" FUSE: unmounted");
|
||||
|
||||
// Gracefully stop rclone
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user