Skip to content

Commit

Permalink
vdev probe to slow disk can stall mmp write checker
Browse files Browse the repository at this point in the history
Simplify vdev probes in the zio_vdev_io_done context to
avoid holding the spa config lock for a long duration.

Also allow zpool clear if no evidence of another host
is using the pool.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Olaf Faaland <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Closes openzfs#15839
  • Loading branch information
Don Brady committed Apr 29, 2024
1 parent b28461b commit c3f2f1a
Show file tree
Hide file tree
Showing 16 changed files with 242 additions and 52 deletions.
2 changes: 1 addition & 1 deletion cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -9050,7 9050,7 @@ status_callback(zpool_handle_t *zhp, void *data)
printf_color(ANSI_BOLD, gettext("action: "));
printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
" are connected, then reboot your system and\n\timport the "
"pool.\n"));
"pool or run 'zpool clear' to resume the pool.\n"));
break;

case ZPOOL_STATUS_IO_FAILURE_WAIT:
Expand Down
4 changes: 3 additions & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 770,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);

#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_FAULT_VDEV 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
Expand Down Expand Up @@ -1123,6 1123,8 @@ extern uint32_t spa_get_hostid(spa_t *spa);
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
extern boolean_t spa_livelist_delete_check(spa_t *spa);

extern boolean_t spa_mmp_remote_host_activity(spa_t *spa);

extern spa_mode_t spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);

Expand Down
16 changes: 8 additions & 8 deletions include/sys/uberblock_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 50,20 @@ extern "C" {
#define MMP_SEQ_VALID_BIT 0x02
#define MMP_FAIL_INT_VALID_BIT 0x04

#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
ubp->ub_mmp_magic == MMP_MAGIC)
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \
(ubp)->ub_mmp_magic == MMP_MAGIC)
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_INTERVAL_VALID_BIT))
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_SEQ_VALID_BIT))
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_FAIL_INT_VALID_BIT))

#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \
>> 8)
#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \
>> 32)
#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \
>> 48)

#define MMP_INTERVAL_SET(write) \
Expand Down
2 changes: 1 addition & 1 deletion include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 273,7 @@ struct vdev {
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
boolean_t vdev_fault_wanted; /* async faulted wanted? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
Expand Down
7 changes: 4 additions & 3 deletions man/man8/zpool-clear.8
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 50,10 @@ If the pool was suspended it will be brought back online provided the
devices can be accessed.
Pools with
.Sy multihost
enabled which have been suspended cannot be resumed.
While the pool was suspended, it may have been imported on
another host, and resuming I/O could result in pool damage.
enabled which have been suspended cannot be resumed when there is evidence
that the pool was imported by another host.
The same checks performed during an import will be applied before the clear
proceeds.
.Bl -tag -width Ds
.It Fl -power
Power on the devices's slot in the storage enclosure and wait for the device
Expand Down
5 changes: 3 additions & 2 deletions module/zfs/mmp.c
Original file line number Diff line number Diff line change
Expand Up @@ -664,12 664,13 @@ mmp_thread(void *arg)
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
"mmp_last_write %llu mmp_interval %llu "
"mmp_fail_intervals %llu mmp_fail_ns %llu",
"mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)mmp->mmp_last_write,
(u_longlong_t)mmp_interval,
(u_longlong_t)mmp_fail_intervals,
(u_longlong_t)mmp_fail_ns);
(u_longlong_t)mmp_fail_ns,
(u_longlong_t)spa->spa_uberblock.ub_txg);
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llu ms; suspending pool. "
"Hrtime %llu",
Expand Down
102 changes: 84 additions & 18 deletions module/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -3863,11 3863,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
}

/*
* Perform the import activity check. If the user canceled the import or
* we detected activity then fail.
* Remote host activity check.
*
* error results:
* 0 - no activity detected
* EREMOTEIO - remote activity detected
* EINTR - user canceled the operation
*/
static int
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
boolean_t importing)
{
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
Expand Down Expand Up @@ -3643,19 3648,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)

import_expire = gethrtime() import_delay;

spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
"%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
if (importing) {
spa_import_progress_set_notes(spa, "Checking MMP activity, "
"waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
}

int interations = 0;
int iterations = 0;
while ((now = gethrtime()) < import_expire) {
if (interations % 30 == 0) {
if (importing && iterations % 30 == 0) {
spa_import_progress_set_notes(spa, "Checking MMP "
"activity, %llu ms remaining",
(u_longlong_t)NSEC2MSEC(import_expire - now));
}

(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
if (importing) {
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
}

vdev_uberblock_load(rvd, ub, &mmp_label);

Expand Down Expand Up @@ -3737,6 3746,61 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
return (error);
}

/*
* Called from zfs_ioc_clear for a pool that was suspended
* after failing mmp write checks.
*/
boolean_t
spa_mmp_remote_host_activity(spa_t *spa)
{
ASSERT(spa_multihost(spa) && spa_suspended(spa));

nvlist_t *best_label;
uberblock_t best_ub;

/*
* Locate the best uberblock on disk
*/
vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
if (best_label) {
/*
* confirm that the best hostid matches our hostid
*/
if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
spa_get_hostid(spa) !=
fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
nvlist_free(best_label);
return (B_TRUE);
}
nvlist_free(best_label);
} else {
return (B_TRUE);
}

if (!MMP_VALID(&best_ub) ||
!MMP_FAIL_INT_VALID(&best_ub) ||
MMP_FAIL_INT(&best_ub) == 0) {
return (B_TRUE);
}

if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
zfs_dbgmsg("txg mismatch detected during pool clear "
"txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
(u_longlong_t)spa->spa_uberblock.ub_txg,
(u_longlong_t)best_ub.ub_txg,
(u_longlong_t)spa->spa_uberblock.ub_timestamp,
(u_longlong_t)best_ub.ub_timestamp);
return (B_TRUE);
}

/*
* Perform an activity check looking for any remote writer
*/
return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
B_FALSE) != 0);
}

static int
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
{
Expand Down Expand Up @@ -4063,7 4127,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}

int error = spa_activity_check(spa, ub, spa->spa_config);
int error =
spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
if (error) {
nvlist_free(label);
return (error);
Expand Down Expand Up @@ -8771,15 8836,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}

static void
spa_async_probe(spa_t *spa, vdev_t *vd)
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_probe_wanted) {
vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
if (vd->vdev_fault_wanted) {
vd->vdev_fault_wanted = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
VDEV_AUX_ERR_EXCEEDED);
}

for (int c = 0; c < vd->vdev_children; c )
spa_async_probe(spa, vd->vdev_child[c]);
spa_async_fault_vdev(spa, vd->vdev_child[c]);
}

static void
Expand Down Expand Up @@ -8867,11 8933,11 @@ spa_async_thread(void *arg)
}

/*
* See if any devices need to be probed.
* See if any devices need to be marked faulted.
*/
if (tasks & SPA_ASYNC_PROBE) {
if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}

Expand Down
9 changes: 9 additions & 0 deletions module/zfs/txg.c
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 550,15 @@ txg_sync_thread(void *arg)
timer = (delta > timeout ? 0 : timeout - delta);
}

/*
* When we're suspended, nothing should be changing and for
* MMP we don't want to bump anything that would make it
* harder to detect if another host is changing it when
* resuming after a MMP suspend.
*/
if (spa_suspended(spa))
continue;

/*
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.
Expand Down
22 changes: 13 additions & 9 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -1664,6 1664,7 @@ vdev_metaslab_fini(vdev_t *vd)
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
boolean_t vps_zio_done_probe;
int vps_flags;
} vdev_probe_stats_t;

Expand Down Expand Up @@ -1709,6 1710,17 @@ vdev_probe_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);

/*
* If this probe was initiated from zio pipeline, then
* change the state in a spa_async_request. Probes that
* were initiated from a vdev_open can change the state
* as part of the open call.
*/
if (vps->vps_zio_done_probe) {
vd->vdev_fault_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
}
}

mutex_enter(&vd->vdev_probe_lock);
Expand Down Expand Up @@ -1759,6 1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)

vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
vps->vps_zio_done_probe = (zio != NULL);

if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
Expand All @@ -1785,15 1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

/*
* We can't change the vdev state in this context, so we
* kick off an async task to do it on our behalf.
*/
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
}
}

if (zio != NULL)
Expand Down
4 changes: 3 additions & 1 deletion module/zfs/vdev_label.c
Original file line number Diff line number Diff line change
Expand Up @@ -2027,14 2027,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
/*
* If this isn't a resync due to I/O errors,
* and nothing changed in this transaction group,
* and multihost protection isn't enabled,
* and the vdev configuration hasn't changed,
* then there's nothing to do.
*/
if (ub->ub_txg < txg) {
boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
txg, spa->spa_mmp.mmp_delay);

if (!changed && list_is_empty(&spa->spa_config_dirty_list))
if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
!spa_multihost(spa))
return (0);
}

Expand Down
9 changes: 6 additions & 3 deletions module/zfs/zfs_ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -5823,10 5823,13 @@ zfs_ioc_clear(zfs_cmd_t *zc)

/*
* If multihost is enabled, resuming I/O is unsafe as another
* host may have imported the pool.
* host may have imported the pool. Check for remote activity.
*/
if (spa_multihost(spa) && spa_suspended(spa))
return (SET_ERROR(EINVAL));
if (spa_multihost(spa) && spa_suspended(spa) &&
spa_mmp_remote_host_activity(spa)) {
spa_close(spa, FTAG);
return (SET_ERROR(EREMOTEIO));
}

spa_vdev_state_enter(spa, SCL_NONE);

Expand Down
6 changes: 4 additions & 2 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -2532,8 2532,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));

cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
"failure and has been suspended.\n", spa_name(spa));
if (reason != ZIO_SUSPEND_MMP) {
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
"I/O failure and has been suspended.\n", spa_name(spa));
}

(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0);
Expand Down
6 changes: 4 additions & 2 deletions module/zfs/zio_inject.c
Original file line number Diff line number Diff line change
Expand Up @@ -607,9 607,11 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;

/* also match on I/O type (e.g., -T read) */
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
handler->zi_record.zi_iotype != zio->io_type)
continue;
handler->zi_record.zi_iotype != zio->io_type) {
continue;
}

/*
* Defensive; should never happen as the array allocation
Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 146,7 @@ tags = ['functional', 'mmap']
tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid']
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk']
tags = ['functional', 'mmp']

[tests/functional/mount:Linux]
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1593,6 1593,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS = \
functional/mmp/mmp_on_zdb.ksh \
functional/mmp/mmp_reset_interval.ksh \
functional/mmp/mmp_write_distribution.ksh \
functional/mmp/mmp_write_slow_disk.ksh \
functional/mmp/mmp_write_uberblocks.ksh \
functional/mmp/multihost_history.ksh \
functional/mmp/setup.ksh \
Expand Down
Loading

0 comments on commit c3f2f1a

Please sign in to comment.