Skip to content

Commit

Permalink
Too many vdev probe errors should suspend pool
Browse files Browse the repository at this point in the history
Similar to what we saw in #16569, we need to consider that a
replacing vdev should not be considered as fully contributing
to the redundancy of a raidz vdev even though current IO has
enough redundancy.

When a failed vdev_probe() is faulting a disk, it now checks
if that disk is required, and if so it suspends the pool until
the admin can return the missing disks.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.

Signed-off-by: Don Brady <[email protected]>
  • Loading branch information
don-brady committed Dec 14, 2024
1 parent 6c9b4f1 commit 5645a2f
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 8 deletions.
27 changes: 21 additions & 6 deletions module/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -8948,16 +8948,26 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}

static void
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend)
{
if (vd->vdev_fault_wanted) {
vdev_state_t newstate = VDEV_STATE_FAULTED;
vd->vdev_fault_wanted = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
VDEV_AUX_ERR_EXCEEDED);
}

/*
* If this device has the only valid copy of the data, then
* back off and simply mark the vdev as degraded instead.
*/
if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL &&
vdev_dtl_required(vd)) {
newstate = VDEV_STATE_DEGRADED;
/* A required disk is missing so suspend the pool */
*suspend = B_TRUE;
}
vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED);
}
for (int c = 0; c < vd->vdev_children; c++)
spa_async_fault_vdev(spa, vd->vdev_child[c]);
spa_async_fault_vdev(vd->vdev_child[c], suspend);
}

static void
Expand Down Expand Up @@ -9049,8 +9059,13 @@ spa_async_thread(void *arg)
*/
if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_fault_vdev(spa, spa->spa_root_vdev);
boolean_t suspend = B_FALSE;
spa_async_fault_vdev(spa->spa_root_vdev, &suspend);
(void) spa_vdev_state_exit(spa, NULL, 0);
if (suspend) {
zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
zio_resume_wait(spa);
}
}

/*
Expand Down
4 changes: 2 additions & 2 deletions tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
'fault_limits', 'scrub_after_resilver', 'suspend_resume_single',
'zpool_status_-s']
'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors',
'suspend_resume_single', 'zpool_status_-s']
tags = ['functional', 'fault']

[tests/functional/features/large_dnode:Linux]
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1531,6 +1531,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/fault/decrypt_fault.ksh \
functional/fault/fault_limits.ksh \
functional/fault/scrub_after_resilver.ksh \
functional/fault/suspend_on_probe_errors.ksh \
functional/fault/suspend_resume_single.ksh \
functional/fault/setup.ksh \
functional/fault/zpool_status_-s.ksh \
Expand Down
149 changes: 149 additions & 0 deletions tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2024, Klara Inc.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/include/blkdev.shlib

#
# DESCRIPTION: Verify that 4 disks removed from a raidz3 will suspend the pool
#
# STRATEGY:
# 1. Disable ZED -- this test is focused on vdev_probe errors
# 2. Create a raidz3 pool where 4 disks can be removed (i.e., using scsi_debug)
# 3. Add some data to it for a resilver workload
# 4. Replace one of the child vdevs to start a replacing vdev
# 5. During the resilver, remove 4 disks including one from the replacing vdev
# 6. Verify that the pool is suspended (it used to remain online)
#

DEV_SIZE_MB=1024

FILE_VDEV_CNT=8
FILE_VDEV_SIZ=256M

function cleanup
{
destroy_pool $TESTPOOL
if [[ "$(cat /sys/block/$sd/device/state)" == "offline" ]]; then
log_must eval "echo running > /sys/block/$sd/device/state"
fi
unload_scsi_debug
rm -f $DATA_FILE
for i in {0..$((FILE_VDEV_CNT - 1))}; do
log_must rm -f "$TEST_BASE_DIR/dev-$i"
done
zed_start
}

log_onexit cleanup

log_assert "VDEV probe errors for more disks than parity should suspend a pool"

log_note "Stoping ZED process"
zed_stop
zpool events -c

# Make a debug device that we can "unplug" and lose 4 drives at once
unload_scsi_debug
load_scsi_debug $DEV_SIZE_MB 1 1 1 '512b'
sd=$(get_debug_device)

# Create 4 partitions that match the FILE_VDEV_SIZ
parted "/dev/${sd}" --script mklabel gpt
parted "/dev/${sd}" --script mkpart primary 0% 25%
parted "/dev/${sd}" --script mkpart primary 25% 50%
parted "/dev/${sd}" --script mkpart primary 50% 75%
parted "/dev/${sd}" --script mkpart primary 75% 100%
block_device_wait "/dev/${sd}"
blkdevs="/dev/${sd}1 /dev/${sd}2 /dev/${sd}3 /dev/${sd}4"

# Create 8 file vdevs
typeset -a filedevs
for i in {0..$((FILE_VDEV_CNT - 1))}; do
device=$TEST_BASE_DIR/dev-$i
log_must truncate -s $FILE_VDEV_SIZ $device
# Use all but the last one for pool create
if [[ $i -lt "7" ]]; then
filedevs[${#filedevs[*]}+1]=$device
fi
done

# Create a raidz-3 pool that we can pull 4 disks from
log_must zpool create -f $TESTPOOL raidz3 ${filedevs[@]} $blkdevs
sync_pool $TESTPOOL

# Add some data to the pool
log_must zfs create $TESTPOOL/fs
MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)"
SECONDS=0
log_must fill_fs $MNTPOINT 3 200 32768 300 Z
log_note "fill_fs took $SECONDS seconds"
sync_pool $TESTPOOL

# Start a replacing vdev
log_must zpool replace -f $TESTPOOL /dev/${sd}4 $TEST_BASE_DIR/dev-7

# Remove 4 disks all at once
log_must eval "echo offline > /sys/block/${sd}/device/state"

# Add some writes to drive the vdev probe errors
log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1

# Wait until sync starts, and the pool suspends
log_note "waiting for pool to suspend"
typeset -i tries=30
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
if ((tries-- == 0)); then
zpool status -s
log_fail "UNEXPECTED -- pool did not suspend"
fi
sleep 1
done

log_must zpool status $TESTPOOL

# Put the missing disks back into service
log_must eval "echo running > /sys/block/$sd/device/state"

# Clear the vdev error states, which will reopen the vdevs and resume the pool
log_must zpool clear $TESTPOOL

# Wait until the pool resumes
log_note "waiting for pool to resume"
tries=30
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) != "SUSPENDED" ]] ; do
if ((tries-- == 0)); then
log_fail "pool did not resume"
fi
sleep 1
done

# Make sure a pool scrub comes back clean
log_must zpool scrub -w $TESTPOOL
log_must zpool status -v $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" "No known data errors"

log_pass "VDEV probe errors for more disks than parity should suspend a pool"

0 comments on commit 5645a2f

Please sign in to comment.