warpgate/tests/08-crash-recovery/test-write-interrupt-recovery.sh
grabbit a2d49137f9 Add comprehensive test suite: 63 integration tests + 110 Rust unit tests
Integration tests (tests/):
- 9 categories covering config, lifecycle, signals, supervision,
  cache, writeback, network faults, crash recovery, and CLI
- Shell-based harness with mock NAS (network namespace + SFTP),
  fault injection (tc netem), and power loss simulation
- TAP format runner (run-all.sh) with proper SKIP detection

Rust unit tests (warpgate/src/):
- 110 tests across 14 modules, all passing in 0.01s
- Config parsing, defaults validation, RestartTracker logic,
  RC API response parsing, rclone arg generation, service
  config generation, CLI output formatting, warmup path logic

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 11:21:35 +08:00

187 lines
6.3 KiB
Bash
Executable File

#!/usr/bin/env bash
# Test: large write interrupted by power loss — recovery behavior
#
# Verifies what happens when a large file write (5 MB) is interrupted
# mid-way by a simulated power loss (SIGKILL). After restarting warpgate,
# documents whether the file is partially present, fully recovered, or
# missing on the NAS.
#
# With rclone VFS write-back=2s, the VFS may have begun uploading or may
# have the file cached locally waiting for write-back. After power loss,
# the partial/complete file should persist in the VFS cache and be
# re-uploaded on restart.
#
# Sequence:
# 1. Start warpgate with write_back=2s.
# 2. Begin writing a 5 MB file in the background.
# 3. Sleep 1s (let the write start but possibly not complete).
# 4. simulate_power_loss.
# 5. Start a fresh warpgate instance.
# 6. Wait for dirty count to reach zero.
# 7. Document what happened to the file on the NAS.
# Strict mode: stop on command failure, unset variables, and failed
# pipeline stages.
set -euo pipefail

# Directory containing this script, resolved to an absolute path so the
# harness can be found regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Load the shared harness: generic helpers first, then the mock NAS support.
for harness_lib in helpers.sh mock-nas.sh; do
  source "${SCRIPT_DIR}/../harness/${harness_lib}"
done
unset harness_lib

# Root check and per-test environment; teardown_test_env is registered as
# the EXIT handler so it runs when the script exits.
require_root
setup_test_env
trap teardown_test_env EXIT

# Bring up the mock NAS.
start_mock_nas

# Write a config that uses a short (2s) write-back delay.
gen_config write_back=2s

# Launch warpgate, then wait for the mount and the RC API to become ready.
start_warpgate
wait_for_mount
wait_for_rc_api
# Kick off a 5 MB write through the mount as a background job. Its PID is
# appended to _BG_PIDS (not defined in this script; presumably tracked by the
# harness — confirm in helpers.sh).
dd if=/dev/urandom of="${TEST_MOUNT}/bigwrite.dat" bs=1M count=5 2>/dev/null &
dd_pid=$!
_BG_PIDS+=("${dd_pid}")
printf 'INFO: started 5 MB write in background (PID %s)\n' "${dd_pid}"

# Give the writer one second of head start before interrupting it.
sleep 1

# Snapshot the cache entry for the file just before the simulated power loss.
cache_file="${CACHE_DIR}/vfs/nas/bigwrite.dat"
if [[ ! -f "${cache_file}" ]]; then
  printf '%s\n' "INFO: cache file not yet created at time of power loss"
else
  # Try GNU stat, then BSD stat; report 0 if neither yields a size.
  cache_size=$(
    stat -c%s "${cache_file}" 2>/dev/null \
      || stat -f%z "${cache_file}" 2>/dev/null \
      || echo 0
  )
  printf 'INFO: cache file exists before power loss, size: %s bytes\n' "${cache_size}"
fi

# Simulate power loss — kills everything including the dd
simulate_power_loss

# With the writer gone, look at what is left in the cache.
if [[ ! -f "${cache_file}" ]]; then
  printf '%s\n' "INFO: no cache file found after power loss"
else
  cache_size_after=$(
    stat -c%s "${cache_file}" 2>/dev/null \
      || stat -f%z "${cache_file}" 2>/dev/null \
      || echo 0
  )
  printf 'INFO: cache file persists after power loss, size: %s bytes\n' "${cache_size_after}"
fi
# If the mount point is still registered as a mount, detach it lazily.
# Prefer fusermount3, fall back to fusermount, and ignore failure of both.
if mountpoint -q "${TEST_MOUNT}" 2>/dev/null; then
  if ! fusermount3 -uz "${TEST_MOUNT}" 2>/dev/null; then
    fusermount -uz "${TEST_MOUNT}" 2>/dev/null || true
  fi
fi

# Bring up a fresh warpgate instance and wait for the mount and the RC API
# (arguments 60 and 30 are passed through to the harness wait helpers).
start_warpgate
wait_for_mount 60
wait_for_rc_api 30

# Wait for the dirty-file count to reach zero (argument: 120).
wait_for_dirty_zero 120
# Document what happened to the file. This section only logs observations;
# none of the outcomes below is treated as a failure here.
echo "INFO: --- Recovery results ---"

# Size of the file the background dd was asked to write (5 x 1 MiB).
expected_size=$((5 * 1024 * 1024))

if nas_file_exists "bigwrite.dat"; then
  # GNU stat first, BSD stat second; 0 if the size cannot be read.
  nas_size=$(stat -c%s "$NAS_ROOT/bigwrite.dat" 2>/dev/null || stat -f%z "$NAS_ROOT/bigwrite.dat" 2>/dev/null || echo 0)
  echo "INFO: bigwrite.dat exists on NAS, size: $nas_size bytes"
  if [[ "$nas_size" -eq "$expected_size" ]]; then
    echo "INFO: file is complete (5 MB) — write finished before power loss"
  elif [[ "$nas_size" -gt 0 ]]; then
    echo "INFO: file is partial ($nas_size / $expected_size bytes)"
    echo "INFO: this is expected — write was interrupted mid-stream"
  else
    # Previously this case produced no classification line at all.
    echo "INFO: file is empty or its size could not be read (0 / $expected_size bytes)"
  fi
else
  echo "INFO: bigwrite.dat NOT found on NAS"
  echo "INFO: the write may not have committed to cache before power loss"
fi

# Also check if the file is visible through the mount
if [[ -f "$TEST_MOUNT/bigwrite.dat" ]]; then
  mount_size=$(stat -c%s "$TEST_MOUNT/bigwrite.dat" 2>/dev/null || stat -f%z "$TEST_MOUNT/bigwrite.dat" 2>/dev/null || echo 0)
  echo "INFO: bigwrite.dat visible through mount, size: $mount_size bytes"
else
  echo "INFO: bigwrite.dat not visible through mount"
fi
# Stop the current warpgate instance before the optional btrfs test
stop_warpgate

# --- Optional btrfs test path ---
# If WARPGATE_TEST_BTRFS is set to a block device, repeat the interrupted-write
# scenario with the cache directory on a freshly formatted btrfs filesystem,
# so the result can be compared with the run above (which used the default
# cache directory).
# WARNING: the device is reformatted with `mkfs.btrfs -f`; its contents are lost.
if [[ -n "${WARPGATE_TEST_BTRFS:-}" ]]; then
  require_command mkfs.btrfs
  echo "INFO: --- btrfs test path (device: $WARPGATE_TEST_BTRFS) ---"

  btrfs_cache="$TEST_DIR/btrfs-cache"

  # Unmount the btrfs cache filesystem if it is currently mounted.
  # Falls back to a lazy unmount when the filesystem is busy, and never fails,
  # so it is safe to call both on the normal path and from the EXIT trap.
  unmount_btrfs_cache() {
    if mountpoint -q "$btrfs_cache" 2>/dev/null; then
      umount "$btrfs_cache" 2>/dev/null \
        || umount -l "$btrfs_cache" 2>/dev/null \
        || true
    fi
    return 0
  }

  # Any failing step below aborts the script (set -e). Chain the unmount in
  # front of the existing teardown handler so the device is not left mounted
  # under $TEST_DIR when that happens.
  trap 'unmount_btrfs_cache; teardown_test_env' EXIT

  # Format the device as btrfs; keep the tool's output so a failure is
  # reported instead of aborting silently.
  if ! mkfs_output=$(mkfs.btrfs -f "$WARPGATE_TEST_BTRFS" 2>&1); then
    echo "ERROR: mkfs.btrfs failed on $WARPGATE_TEST_BTRFS:" >&2
    printf '%s\n' "$mkfs_output" >&2
    exit 1
  fi

  # Create a btrfs mount point and mount
  mkdir -p "$btrfs_cache"
  mount "$WARPGATE_TEST_BTRFS" "$btrfs_cache"

  # Re-generate config with the btrfs cache dir
  gen_config write_back=2s cache_dir="$btrfs_cache"

  # Start warpgate on btrfs cache
  start_warpgate
  wait_for_mount 60
  wait_for_rc_api 30

  # Write a 5 MB file in the background
  dd if=/dev/urandom of="$TEST_MOUNT/bigwrite-btrfs.dat" bs=1M count=5 2>/dev/null &
  btrfs_dd_pid=$!
  _BG_PIDS+=("$btrfs_dd_pid")
  sleep 1

  # Simulate power loss
  simulate_power_loss

  # Check what survived in the btrfs cache
  btrfs_cache_file="$btrfs_cache/vfs/nas/bigwrite-btrfs.dat"
  if [[ -f "$btrfs_cache_file" ]]; then
    btrfs_size=$(stat -c%s "$btrfs_cache_file" 2>/dev/null || stat -f%z "$btrfs_cache_file" 2>/dev/null || echo 0)
    echo "INFO: btrfs cache file persists after power loss, size: $btrfs_size bytes"
  else
    echo "INFO: no btrfs cache file found after power loss"
  fi

  # Clean up stale FUSE mount
  if mountpoint -q "$TEST_MOUNT" 2>/dev/null; then
    fusermount3 -uz "$TEST_MOUNT" 2>/dev/null || fusermount -uz "$TEST_MOUNT" 2>/dev/null || true
  fi

  # Restart warpgate on btrfs cache and wait for recovery
  start_warpgate
  wait_for_mount 60
  wait_for_rc_api 30
  wait_for_dirty_zero 120

  # Document btrfs recovery result
  if nas_file_exists "bigwrite-btrfs.dat"; then
    btrfs_nas_size=$(stat -c%s "$NAS_ROOT/bigwrite-btrfs.dat" 2>/dev/null || stat -f%z "$NAS_ROOT/bigwrite-btrfs.dat" 2>/dev/null || echo 0)
    echo "INFO: btrfs recovery: bigwrite-btrfs.dat on NAS, size: $btrfs_nas_size bytes"
  else
    echo "INFO: btrfs recovery: bigwrite-btrfs.dat NOT found on NAS"
  fi

  stop_warpgate

  # Unmount btrfs (the EXIT trap's later call then becomes a no-op)
  unmount_btrfs_cache
else
  echo "INFO: skipping btrfs test (set WARPGATE_TEST_BTRFS=/dev/sdX to enable)"
fi

echo "PASS: $(basename "$0" .sh)"