From 13f5a08a91ffccda7b26afef451ff85001bbbea2 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 20 Feb 2025 12:01:59 -0800 Subject: [PATCH 1/2] zpool: Allow lockless zpool status Add a new `ZPOOL_LOCK_BEHAVIOR` envvar to control `zpool status` lock behavior. `ZPOOL_LOCK_BEHAVIOR` can have one of these values: "lockless": Try for a short amount of time to get the spa_namespace lock. If that doesn't work, then do the zpool status locklessly. This is dangerous and can crash your system if the pools configs are being modified while zpool status is running. This setting requires `zpool status` to be run as root. "trylock": Try for a short amount of time to get the spa_namespace lock. If that doesn't work then simply abort 'zpool status'. "wait": Wait forever for the lock. This is the default. These options allow users to view the zpool status when the pool gets stuck while holding the spa_namespace lock. Signed-off-by: Tony Hutter --- cmd/zdb/zdb.c | 3 +- cmd/zpool/zpool_iter.c | 7 +- cmd/zpool/zpool_main.c | 28 ++++ include/libzfs.h | 2 + include/os/freebsd/spl/sys/mutex.h | 15 +++ include/os/linux/spl/sys/mutex.h | 16 +++ include/sys/fs/zfs.h | 27 ++++ include/sys/spa.h | 12 +- include/sys/spa_impl.h | 1 + include/sys/zfs_context.h | 1 + lib/libzfs/libzfs_config.c | 36 +++++- lib/libzfs/libzfs_impl.h | 2 + lib/libzfs/libzfs_pool.c | 11 +- lib/libzfs/libzfs_util.c | 13 ++ lib/libzpool/kernel.c | 16 +++ lib/libzpool/util.c | 3 +- man/man4/zfs.4 | 29 ++++- man/man8/zpool.8 | 43 ++++++ module/zfs/spa.c | 117 ++++++++++++++++- module/zfs/spa_config.c | 78 +++++++++-- module/zfs/spa_misc.c | 53 +++++--- module/zfs/zfs_ioctl.c | 122 +++++++++++++++++- tests/runfiles/common.run | 2 +- tests/zfs-tests/include/libtest.shlib | 10 ++ tests/zfs-tests/include/tunables.cfg | 2 + tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_status/zpool_status_lockless.ksh | 111 ++++++++++++++++ 27 files changed, 697 insertions(+), 64 deletions(-) create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index d655fa715e15..f11ea85f31ce 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -7815,7 +7815,8 @@ import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa, if (cfg == NULL) { zdb_set_skip_mmp(poolname); - error = spa_get_stats(poolname, &cfg, NULL, 0); + error = spa_get_stats(poolname, &cfg, NULL, 0, + ZPOOL_LOCK_BEHAVIOR_DEFAULT); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 2eec9a95e24c..ef0c1ccb7a9e 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -118,6 +118,7 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, boolean_t literal, int *err) { zpool_list_t *zlp; + int rc; zlp = safe_malloc(sizeof (zpool_list_t)); @@ -137,7 +138,11 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, zlp->zl_literal = literal; if (argc == 0) { - (void) zpool_iter(g_zfs, add_pool, zlp); + rc = zpool_iter(g_zfs, add_pool, zlp); + if (rc != 0) { + free(zlp); + return (NULL); + } zlp->zl_findall = B_TRUE; } else { int i; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 2c46ad0df895..775cefd5bae4 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -11048,6 +11048,32 @@ status_callback(zpool_handle_t *zhp, void *data) return (0); } +/* + * Set the zpool status lock behavior based off of the ZPOOL_LOCK_BEHAVIOR + * envvar. If the var is not set, or an unknown value, then set the lock + * behavior to ZPOOL_LOCK_BEHAVIOR_DEFAULT. 
+ */ +static void +zpool_set_lock_behavior(void) +{ + char *str; + zpool_lock_behavior_t zpool_lock_behavior; + + str = getenv("ZPOOL_LOCK_BEHAVIOR"); + if (str == NULL) + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT; + else if (strcmp(str, "wait") == 0) + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_WAIT; + else if (strcmp(str, "trylock") == 0) + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_TRYLOCK; + else if (strcmp(str, "lockless") == 0) + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_LOCKLESS; + else + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT; + + libzfs_set_lock_behavior(g_zfs, zpool_lock_behavior); +} + /* * zpool status [-dDegiLpPstvx] [-c [script1,script2,...]] ... * [-j|--json [--json-flat-vdevs] [--json-int] ... @@ -11223,6 +11249,8 @@ zpool_do_status(int argc, char **argv) usage(B_FALSE); } + zpool_set_lock_behavior(); + for (;;) { if (cb.cb_json) { cb.cb_jsobj = zpool_json_schema(0, 1); diff --git a/include/libzfs.h b/include/libzfs.h index 3fcdc176a621..312e30e8f90a 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -265,6 +265,8 @@ _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); _LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift); +_LIBZFS_H void libzfs_set_lock_behavior(libzfs_handle_t *, + zpool_lock_behavior_t); typedef struct splitflags { /* do not split, but return the config that would be split off */ diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h index 546a4681a291..f91831fcefde 100644 --- a/include/os/freebsd/spl/sys/mutex.h +++ b/include/os/freebsd/spl/sys/mutex.h @@ -72,4 +72,19 @@ typedef enum { #define mutex_owned(lock) sx_xlocked(lock) #define mutex_owner(lock) sx_xholder(lock) +/* + * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex + * for 'ns' number of nanoseconds. 
Returns 0 if mutex was acquired or ETIME + * if timeout occurred. + */ +static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns) +{ + hrtime_t end = gethrtime() + ns; + while (gethrtime() < end) { + if (mutex_tryenter(mutex)) + return (0); /* success */ + } + return (ETIME); +} + #endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/include/os/linux/spl/sys/mutex.h b/include/os/linux/spl/sys/mutex.h index 4eca2414fc5b..101bd1281be6 100644 --- a/include/os/linux/spl/sys/mutex.h +++ b/include/os/linux/spl/sys/mutex.h @@ -26,6 +26,7 @@ #define _SPL_MUTEX_H #include +#include #include #include #include @@ -187,4 +188,19 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ /* NOTE: do not dereference mp after this point */ \ } +/* + * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex + * for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ETIME + * if timeout occurred. + */ +static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns) +{ + hrtime_t end = gethrtime() + ns; + while (gethrtime() < end) { + if (mutex_tryenter(mutex)) + return (0); /* success */ + } + return (ETIME); +} + #endif /* _SPL_MUTEX_H */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 49ab9d3db795..3dc03290be59 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -874,6 +874,9 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" #define ZPOOL_CONFIG_COMPATIBILITY "compatibility" +/* ZFS_IOC_POOL_STATS argument to for spa_namespace locking behavior */ +#define ZPOOL_CONFIG_LOCK_BEHAVIOR "lock_behavior" /* not stored on disk */ + /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such @@ -1999,6 +2002,30 @@ enum zio_encrypt { ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name)) +/* + * Set locking behavior for zpool commands. 
+ */ +typedef enum { + /* Wait to acquire the lock on the zpool config */ + ZPOOL_LOCK_BEHAVIOR_WAIT = 0, + ZPOOL_LOCK_BEHAVIOR_DEFAULT = ZPOOL_LOCK_BEHAVIOR_WAIT, + /* + * Return an error if it's taking an unnecessarily long time to + * acquire the lock on the pool config (default 100ms) + */ + ZPOOL_LOCK_BEHAVIOR_TRYLOCK = 1, + + /* + * DANGER: THIS CAN CRASH YOUR SYSTEM + * + * If you can't acquire the pool config lock after 100ms then do a + * a lockless lookup. This should only be done in emergencies, as it + * can crash the kernel module! + */ + ZPOOL_LOCK_BEHAVIOR_LOCKLESS = 2, + ZPOOL_LOCK_BEHAVIOR_END = 3 /* last entry marker */ +} zpool_lock_behavior_t; + #ifdef __cplusplus } #endif diff --git a/include/sys/spa.h b/include/sys/spa.h index 66db16b33c51..ea5cd3a5a505 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -772,10 +772,13 @@ typedef enum trim_type { /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, const void *tag); +extern int spa_open_common_lock_behavior(const char *pool, spa_t **spapp, + const void *tag, nvlist_t *nvpolicy, nvlist_t **config, + zpool_lock_behavior_t zpool_lock_behavior); extern int spa_open_rewind(const char *pool, spa_t **, const void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, - size_t buflen); + size_t buflen, zpool_lock_behavior_t zpool_lock_behavior); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, @@ -880,10 +883,13 @@ extern kcondvar_t spa_namespace_cv; #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); -extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); +extern int spa_all_configs(uint64_t *generation, nvlist_t **pools, + zpool_lock_behavior_t zpool_lock_behavior); extern void 
spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); +extern nvlist_t *spa_config_generate_lock_behavior(spa_t *spa, vdev_t *vd, + uint64_t txg, int getstats, zpool_lock_behavior_t zpool_lock_behavior); extern void spa_config_update(spa_t *spa, int what); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); @@ -895,9 +901,11 @@ extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_lookup_lockless(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); +extern spa_t *spa_next_lockless(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, const void *tag); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 07a959db3447..00d22b482b24 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -500,6 +500,7 @@ extern void spa_set_deadman_synctime(hrtime_t ns); extern void spa_set_deadman_ziotime(hrtime_t ns); extern const char *spa_history_zone(void); extern const char *zfs_active_allocator; +extern unsigned int spa_namespace_trylock_ms; extern int param_set_active_allocator_common(const char *val); #ifdef __cplusplus diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 7112d3ef5c99..7bcc2ced7303 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -270,6 +270,7 @@ extern void mutex_enter(kmutex_t *mp); extern int mutex_enter_check_return(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); +extern int mutex_enter_timeout(kmutex_t *mp, uint64_t ns); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) diff --git a/lib/libzfs/libzfs_config.c 
b/lib/libzfs/libzfs_config.c index 0d2102191389..009986635c1a 100644 --- a/lib/libzfs/libzfs_config.c +++ b/lib/libzfs/libzfs_config.c @@ -109,6 +109,7 @@ namespace_reload(libzfs_handle_t *hdl) nvpair_t *elem; zfs_cmd_t zc = {"\0"}; void *cookie; + nvlist_t *nv_args = NULL; if (hdl->libzfs_ns_gen == 0) { /* @@ -129,21 +130,35 @@ namespace_reload(libzfs_handle_t *hdl) zcmd_alloc_dst_nvlist(hdl, &zc, 0); + + VERIFY0(nvlist_alloc(&nv_args, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_uint64(nv_args, ZPOOL_CONFIG_LOCK_BEHAVIOR, + hdl->zpool_lock_behavior)); + zcmd_write_src_nvlist(hdl, &zc, nv_args); + for (;;) { zc.zc_cookie = hdl->libzfs_ns_gen; if (zfs_ioctl(hdl, ZFS_IOC_POOL_CONFIGS, &zc) != 0) { switch (errno) { + case EAGAIN: + return (zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, "Unable to get the " + "pool config lock in a reasonable amount of" + " time"))); case EEXIST: /* * The namespace hasn't changed. */ zcmd_free_nvlists(&zc); return (0); - case ENOMEM: zcmd_expand_dst_nvlist(hdl, &zc); break; - + case ENOTSUP: + return (zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, + "ZPOOL_LOCK_BEHAVIOR=lockless requires" + " root"))); default: zcmd_free_nvlists(&zc); return (zfs_standard_error(hdl, errno, @@ -253,6 +268,7 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) int error; nvlist_t *config; libzfs_handle_t *hdl = zhp->zpool_hdl; + nvlist_t *nv_args = NULL; *missing = B_FALSE; (void) strcpy(zc.zc_name, zhp->zpool_name); @@ -262,9 +278,15 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size); + VERIFY0(nvlist_alloc(&nv_args, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_uint64(nv_args, ZPOOL_CONFIG_LOCK_BEHAVIOR, + hdl->zpool_lock_behavior)); + zcmd_write_src_nvlist(hdl, &zc, nv_args); + for (;;) { - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS, - &zc) == 0) { + int rc; + rc = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS, &zc); + if (rc == 0) { /* * The real error is 
returned in the zc_cookie field. */ @@ -276,15 +298,18 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) zcmd_expand_dst_nvlist(hdl, &zc); else { zcmd_free_nvlists(&zc); - if (errno == ENOENT || errno == EINVAL) + + if (errno == ENOENT || errno == EINVAL || rc) *missing = B_TRUE; zhp->zpool_state = POOL_STATE_UNAVAIL; + nvlist_free(nv_args); return (0); } } if (zcmd_read_dst_nvlist(hdl, &zc, &config) != 0) { zcmd_free_nvlists(&zc); + nvlist_free(nv_args); return (-1); } @@ -304,6 +329,7 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) else zhp->zpool_state = POOL_STATE_ACTIVE; + nvlist_free(nv_args); return (0); } diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h index 9053740ec2f0..cbc36eb871ec 100644 --- a/lib/libzfs/libzfs_impl.h +++ b/lib/libzfs/libzfs_impl.h @@ -73,6 +73,8 @@ struct libzfs_handle { uint64_t libzfs_max_nvlist; void *libfetch; char *libfetch_load_error; + + zpool_lock_behavior_t zpool_lock_behavior; }; struct zfs_handle { diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ce154ae1a4cd..664aaf4283c9 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -81,16 +81,15 @@ zpool_get_all_props(zpool_handle_t *zhp) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + nvlist_t *innvl = fnvlist_alloc(); if (zhp->zpool_n_propnames > 0) { - nvlist_t *innvl = fnvlist_alloc(); fnvlist_add_string_array(innvl, ZPOOL_GET_PROPS_NAMES, zhp->zpool_propnames, zhp->zpool_n_propnames); - zcmd_write_src_nvlist(hdl, &zc, innvl); - fnvlist_free(innvl); } - - zcmd_alloc_dst_nvlist(hdl, &zc, 0); - + VERIFY0(nvlist_add_uint64(innvl, ZPOOL_CONFIG_LOCK_BEHAVIOR, + hdl->zpool_lock_behavior)); + zcmd_write_src_nvlist(hdl, &zc, innvl); + fnvlist_free(innvl); while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 26f5135dff62..bc5393d02577 100644 --- 
a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1037,6 +1037,18 @@ libzfs_free_str_array(char **strs, int count) free(strs); } +/* + * Set spa_namespace locking behavior for zpools. For example you may want to + * error out if you can't acquire the lock, or skip the lock entirely (which + * can be dangerous). + */ +void +libzfs_set_lock_behavior(libzfs_handle_t *hdl, + zpool_lock_behavior_t zpool_lock_behavior) +{ + hdl->zpool_lock_behavior = zpool_lock_behavior; +} + /* * Returns 1 if environment variable is set to "YES", "yes", "ON", "on", or * a non-zero number. @@ -1122,6 +1134,7 @@ libzfs_init(void) ftbl[SPA_FEATURE_LARGE_BLOCKS].fi_zfs_mod_supported = B_FALSE; } + libzfs_set_lock_behavior(hdl, ZPOOL_LOCK_BEHAVIOR_DEFAULT); return (hdl); } diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 8ed374627264..b78d8aab26f7 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -239,6 +239,22 @@ mutex_exit(kmutex_t *mp) VERIFY0(pthread_mutex_unlock(&mp->m_lock)); } +/* + * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex + * for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ENOLCK + * if timeout occurred. 
+ */
+int
+mutex_enter_timeout(kmutex_t *mutex, uint64_t ns)
+{
+	hrtime_t end = gethrtime() + ns;
+	while (gethrtime() < end) {
+		if (mutex_tryenter(mutex))
+			return (0); /* success */
+	}
+	return (ENOLCK);
+}
+
 /*
  * =========================================================================
  * rwlocks
diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c
index 66d6f43967d5..b3a978935723 100644
--- a/lib/libzpool/util.c
+++ b/lib/libzpool/util.c
@@ -137,7 +137,8 @@ show_pool_stats(spa_t *spa)
 	nvlist_t *config, *nvroot;
 	const char *name;
 
-	VERIFY0(spa_get_stats(spa_name(spa), &config, NULL, 0));
+	VERIFY0(spa_get_stats(spa_name(spa), &config, NULL, 0,
+	    ZPOOL_LOCK_BEHAVIOR_DEFAULT));
 
 	VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot));
 	VERIFY0(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name));
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 11bcbf430210..e790fb43fe61 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2789,8 +2789,35 @@ Defines zvol block devices behavior when
 .It Sy zvol_enforce_quotas Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Enable strict ZVOL quota enforcement.
 The strict quota enforcement may have a performance impact.
+.It Sy zfs_debug_namespace_lock_delay
+WARNING, DEBUG USE ONLY!
+If set, add an artificial
+.Sy zfs_debug_namespace_lock_delay
+milliseconds delay after taking the spa_namespace lock.
+This is used to simulate a hung pool.
+If
+.Sy zfs_debug_namespace_lock_delay
+is set to -1 then crash the kernel on the next spa_namespace lock hold.
+If set to 0 (default) then take no action.
+This option is used by the test suite.
+.It Sy spa_namespace_trylock_ms
+If the user has set
+.Sy ZPOOL_LOCK_BEHAVIOR=lockless|trylock
+then try for this many milliseconds to get the spa_namespace lock before
+moving on.
+.Sy zpool status
+This is the internal lock that synchronizes changes to the pool configuration.
+Note that
+.Sy zpool status
+will try to get the lock in multiple places (like getting the pool config,
+stats, props) before returning.
+That means the total wall clock time for
+.Sy zpool status
+could be a multiple of
+.Sy spa_namespace_trylock_ms
+if the lock is already held.
+Default is 100 (milliseconds).
 .El
-.
 .Sh ZFS I/O SCHEDULER
 ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations.
 The scheduler determines when and in what order those operations are issued.
diff --git a/man/man8/zpool.8 b/man/man8/zpool.8
index 3bfef780b298..7fac0ddc2fe2 100644
--- a/man/man8/zpool.8
+++ b/man/man8/zpool.8
@@ -576,6 +576,49 @@ or by setting
 .Sy ZFS_VDEV_DEVID_OPT_OUT .
 .Pp
+.It Sy ZPOOL_LOCK_BEHAVIOR
+Change the
+.Sy zpool status
+pool lock acquisition behavior.
+This can be useful for running
+.Sy zpool status
+even when the pool is hung.
+Valid
+.Sy ZPOOL_LOCK_BEHAVIOR
+values are:
+.Bl -tag -compact -offset 4n -width "lockless"
+.It Sy trylock
+Try for a short amount of time to get the pool config lock (spa_namespace lock).
+If that doesn't work then simply abort
+.Sy zpool status .
+The amount of time to wait for the lock is controlled by the
+.Sy spa_namespace_trylock_ms
+module parameter.
+.It Sy lockless
+Same as
+.Sy trylock
+but if
+.Sy zpool status
+cannot get the lock, then read the status information locklessly.
+.Sy WARNING: this is dangerous and can crash your system if pool configs are being modified while zpool status is running.
+As a safeguard,
+.Sy zpool status
+with
+.Sy ZPOOL_LOCK_BEHAVIOR=lockless
+can only be run by root.
+.Sy lockless
+should only be used in emergency/debug situations.
+.It Sy wait
+Wait forever for the lock.
+This is the default behavior.
+.El
+.Pp
+If
+.Sy ZPOOL_LOCK_BEHAVIOR
+is set to any other value, or not set, then
+.Sy zpool status
+will wait forever for the pool config lock.
+.Pp
 .It Sy ZPOOL_SCRIPTS_AS_ROOT
 Allow a privileged user to run
 .Nm zpool Cm status Ns / Ns Cm iostat Fl c .
diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b3bb46da263b..02f2b2fa71dc 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -311,6 +311,27 @@ static int zfs_livelist_condense_zthr_cancel = 0; */ static int zfs_livelist_condense_new_alloc = 0; + +/* + * WARNING, FOR TEST USE ONLY! + * + * Wait zfs_debug_namespace_lock_delay milliseconds after taking the + * spa_namespace_lock to simulate lock contention. + * If set to -1, then *panic* to simulate a pool failure. + * If set to 0 then take no action (default) + */ +static int zfs_debug_namespace_lock_delay = 0; + +/* + * When zpool_lock_behavior is set to ZPOOL_LOCK_BEHAVIOR_TRYLOCK or + * ZPOOL_LOCK_BEHAVIOR_LOCKLESS, then try to acquire the the spa_namespace_ lock + * for this many milliseconds before moving on to the next step. + * + * Note that 'zpool status' will try to get the lock in multiple places (like + * getting the pool config, stats, props) before returning. + */ +unsigned int spa_namespace_trylock_ms = 100; + /* * Time variable to decide how often the txg should be added into the * database (in seconds). @@ -5943,8 +5964,9 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, * ambiguous state. */ static int -spa_open_common(const char *pool, spa_t **spapp, const void *tag, - nvlist_t *nvpolicy, nvlist_t **config) +spa_open_common_impl(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config, + zpool_lock_behavior_t zpool_lock_behavior) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; @@ -5961,8 +5983,45 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, * avoid dsl_dir_open() calling this in the first place. 
*/ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { - mutex_enter(&spa_namespace_lock); - locked = B_TRUE; + + /* + * Special debug case: we've selected a pool for lockless + * operation + */ + if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) || + (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK)) { + /* Make a cursory effort to acquire the lock in 100ms */ + if (mutex_enter_timeout(&spa_namespace_lock, + MSEC2NSEC(spa_namespace_trylock_ms)) != 0) { + if (zpool_lock_behavior == + ZPOOL_LOCK_BEHAVIOR_LOCKLESS) { + + /* + * Danger: we're doing spa_lookup() + * without holding the lock + */ + spa = spa_lookup_lockless(pool); + if (spa == NULL) { + return (SET_ERROR(ENOENT)); + } + + goto skip_the_locks; + } else { + /* + * We had ZPOOL_LOCK_BEHAVIOR_TRYLOCK + * set and cant get the lock in 100ms. + */ + return (SET_ERROR(EAGAIN)); + } + } else { + /* We got the lock from mutex_enter_timeout() */ + locked = B_TRUE; + } + } else { + /* Normal case of getting the lock */ + mutex_enter(&spa_namespace_lock); + locked = B_TRUE; + } } if ((spa = spa_lookup(pool)) == NULL) { @@ -5971,6 +6030,19 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, return (SET_ERROR(ENOENT)); } + /* + * Simulate holding the spa_namespace lock for a long time, or forcing + * a crash with the lock held. This is for testing purposes only. 
+ */ + if (zfs_debug_namespace_lock_delay == -1) { +#if _KERNEL && defined(__linux__) + BUG(); +#endif + } else if (zfs_debug_namespace_lock_delay > 0) { + zfs_sleep_until(gethrtime() + + MSEC2NSEC(zfs_debug_namespace_lock_delay)); + } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; @@ -6030,10 +6102,12 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, } } +skip_the_locks: spa_open_ref(spa, tag); if (config != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + *config = spa_config_generate_lock_behavior(spa, NULL, -1ULL, + B_TRUE, zpool_lock_behavior); /* * If we've recovered the pool, pass back any information we @@ -6059,6 +6133,23 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, return (0); } +static int +spa_open_common(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config) +{ + return (spa_open_common_impl(pool, spapp, tag, nvpolicy, config, + ZPOOL_LOCK_BEHAVIOR_DEFAULT)); +} + +int +spa_open_common_lock_behavior(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config, zpool_lock_behavior_t + zpool_lock_behavior) +{ + return (spa_open_common_impl(pool, spapp, tag, nvpolicy, config, + zpool_lock_behavior)); +} + int spa_open_rewind(const char *name, spa_t **spapp, const void *tag, nvlist_t *policy, nvlist_t **config) @@ -6292,13 +6383,18 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config) int spa_get_stats(const char *name, nvlist_t **config, - char *altroot, size_t buflen) + char *altroot, size_t buflen, zpool_lock_behavior_t zpool_lock_behavior) { int error; spa_t *spa; *config = NULL; - error = spa_open_common(name, &spa, FTAG, NULL, config); + + error = spa_open_common_lock_behavior(name, &spa, FTAG, NULL, config, + zpool_lock_behavior); + if (error != 0) { + return (error); + } if (spa != NULL) { /* @@ -11239,6 +11335,13 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow 
importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); +ZFS_MODULE_PARAM(zfs, zfs_, debug_namespace_lock_delay, INT, ZMOD_RW, + "Do not use - for ZFS test suite only."); + +ZFS_MODULE_PARAM(zfs_spa, spa_, namespace_trylock_ms, UINT, ZMOD_RW, + "Try for this many milliseconds to get the spa_namespace lock when " + "running 'zpool status --trylock|--lockless'."); + ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index cf28955b0c50..cb23b1d123f9 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -280,19 +280,42 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, * information for all pool visible within the zone. */ int -spa_all_configs(uint64_t *generation, nvlist_t **pools) +spa_all_configs(uint64_t *generation, nvlist_t **pools, zpool_lock_behavior_t + zpool_lock_behavior) { + int error = 0; spa_t *spa = NULL; + boolean_t use_lock = B_TRUE; if (*generation == spa_config_generation) return (SET_ERROR(EEXIST)); - int error = mutex_enter_interruptible(&spa_namespace_lock); - if (error) - return (SET_ERROR(EINTR)); + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_WAIT) { + error = mutex_enter_interruptible(&spa_namespace_lock); + } else { + error = mutex_enter_timeout(&spa_namespace_lock, + MSEC2NSEC(spa_namespace_trylock_ms)); + } + if (error) { + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK) + return (SET_ERROR(EAGAIN)); + else if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) + use_lock = B_FALSE; + else + return (SET_ERROR(EINTR)); + } *pools = fnvlist_alloc(); - while ((spa = spa_next(spa)) != NULL) { + + do { + if (use_lock) + spa = spa_next(spa); + else + spa = spa_next_lockless(spa); + + if (spa == NULL) + break; + if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { 
mutex_enter(&spa->spa_props_lock); @@ -300,9 +323,11 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools) spa->spa_config); mutex_exit(&spa->spa_props_lock); } - } + } while (1); *generation = spa_config_generation; - mutex_exit(&spa_namespace_lock); + + if (use_lock) + mutex_exit(&spa_namespace_lock); return (0); } @@ -323,8 +348,9 @@ spa_config_set(spa_t *spa, nvlist_t *config) * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ -nvlist_t * -spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +static nvlist_t * +spa_config_generate_impl(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats, + zpool_lock_behavior_t zpool_lock_behavior) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; @@ -332,15 +358,26 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) boolean_t locked = B_FALSE; uint64_t split_guid; const char *pool_name; + boolean_t skip_locks = B_FALSE; + + /* Special debug case: we've selected a pool for lockless operation */ + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) { + skip_locks = B_TRUE; + } if (vd == NULL) { vd = rvd; locked = B_TRUE; - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + + if (!skip_locks) + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, + RW_READER); } - ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == - (SCL_CONFIG | SCL_STATE)); + if (!skip_locks) + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, + RW_READER) == (SCL_CONFIG | SCL_STATE)); + /* * If txg is -1, report the current value of spa->spa_config_txg. 
@@ -463,12 +500,27 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) kmem_free(dds, sizeof (ddt_stat_t)); } - if (locked) + if (locked && !skip_locks) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } +nvlist_t * +spa_config_generate_lock_behavior(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats, zpool_lock_behavior_t zpool_lock_behavior) +{ + return (spa_config_generate_impl(spa, vd, txg, getstats, + zpool_lock_behavior)); +} + +nvlist_t * +spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +{ + return (spa_config_generate_impl(spa, vd, txg, getstats, + ZPOOL_LOCK_BEHAVIOR_DEFAULT)); +} + /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 6f7c060f97f8..ac157b307453 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -608,21 +608,14 @@ spa_config_held(spa_t *spa, int locks, krw_t rw) * ========================================================================== */ -/* - * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. - * Returns NULL if no matching spa_t is found. - */ spa_t * -spa_lookup(const char *name) +spa_lookup_lockless(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ - spa_t *spa; avl_index_t where; + spa_t *spa; char *cp; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - -retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* @@ -634,7 +627,24 @@ spa_lookup(const char *name) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); - if (spa == NULL) + + return (spa); +} + +/* + * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. + * Returns NULL if no matching spa_t is found. 
+ */ +spa_t * +spa_lookup(const char *name) +{ + spa_t *spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + +retry: + spa = spa_lookup_lockless(name); + if (!spa) return (NULL); /* @@ -842,7 +852,6 @@ spa_remove(spa_t *spa) ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); - avl_remove(&spa_namespace_avl, spa); if (spa->spa_root) @@ -908,6 +917,15 @@ spa_remove(spa_t *spa) kmem_free(spa, sizeof (spa_t)); } +spa_t * +spa_next_lockless(spa_t *prev) +{ + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. @@ -916,11 +934,7 @@ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - if (prev) - return (AVL_NEXT(&spa_namespace_avl, prev)); - else - return (avl_first(&spa_namespace_avl)); + return (spa_next_lockless(prev)); } /* @@ -1585,8 +1599,9 @@ spa_load_guid_exists(uint64_t guid) ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { - if (spa_load_guid(spa) == guid) + if (spa_load_guid(spa) == guid) { return (B_TRUE); + } } return (arc_async_flush_guid_inuse(guid)); @@ -3049,6 +3064,7 @@ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); +EXPORT_SYMBOL(spa_next_lockless); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); @@ -3176,6 +3192,3 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, "Number of allocators per spa"); - -ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, - "Minimum number of CPUs per allocators"); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 5ca7c2320c4e..d076db36fcc3 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1598,13 +1598,82 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) return 
(error); } +/* + * Given a zc with only ZPOOL_CONFIG_LOCK_BEHAVIOR set in its ioctl source + * nvlist, OR the source nvlist itself (innvl), extract and return the + * zpool_lock_behavior_t value. Additionally, if the user is trying to use + * lockless behavior, verify that the user has the correct permissions. + * + * NOTE: Only one of 'zc' or 'innvl' should be set, not both. + * + * Return value: + * + * If zc and innvl are NULL, then return ZPOOL_LOCK_BEHAVIOR_DEFAULT. + * + * If the user has permission to do the ZPOOL_CONFIG_LOCK_BEHAVIOR specified + * then return the correct zpool_lock_behavior_t. + * + * If ZPOOL_CONFIG_LOCK_BEHAVIOR is not specified then return + * ZPOOL_LOCK_BEHAVIOR_DEFAULT. + * + * If the user has specified ZPOOL_LOCK_BEHAVIOR_LOCKLESS and does not have + * permission to do so, then return -EPERM. + */ +static int +zfs_get_zpool_lock_behavior_helper(zfs_cmd_t *zc, nvlist_t *innvl) +{ + int error = 0; + uint64_t zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT; + nvlist_t *nvl = NULL; /* args being passed in */ + + /* Only 'zc' or 'innvl' should be set (or neither), not both */ + ASSERT(!(zc != NULL && innvl != NULL)); + if (zc) { + /* + * Our zfs_cmd_t can include an nvlist with optional input + * arguments. 
+ */ + error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl); + } else { + if (nvlist_dup(innvl, &nvl, KM_SLEEP) != 0) + return (ZPOOL_LOCK_BEHAVIOR_DEFAULT); + } + + if (error == 0) { + if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_LOCK_BEHAVIOR, + &zpool_lock_behavior) == 0) { + if (zpool_lock_behavior >= ZPOOL_LOCK_BEHAVIOR_END) { + nvlist_free(nvl); + /* Unrecognized zpool_lock_behavior value */ + return (ZPOOL_LOCK_BEHAVIOR_DEFAULT); + } + } + nvlist_free(nvl); + } + + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) + if (secpolicy_sys_config(CRED(), B_FALSE) != 0) + return (-EPERM); + + return ((int)zpool_lock_behavior); +} + static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; + int zpool_lock_behavior; + + zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(zc, NULL); + if (zpool_lock_behavior == -EPERM) { + zc->zc_cookie = EPERM; + return (EPERM); + } + + error = spa_all_configs(&zc->zc_cookie, &configs, zpool_lock_behavior); - error = spa_all_configs(&zc->zc_cookie, &configs); if (error) return (error); @@ -1630,9 +1699,16 @@ zfs_ioc_pool_stats(zfs_cmd_t *zc) nvlist_t *config; int error; int ret = 0; + zpool_lock_behavior_t zpool_lock_behavior; + + zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(zc, NULL); + if (zpool_lock_behavior == -EPERM) { + zc->zc_cookie = EPERM; + return (EPERM); + } error = spa_get_stats(zc->zc_name, &config, zc->zc_value, - sizeof (zc->zc_value)); + sizeof (zc->zc_value), zpool_lock_behavior); if (config != NULL) { ret = put_nvlist(zc, config); @@ -3155,6 +3231,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) static const zfs_ioc_key_t zfs_keys_get_props[] = { { ZPOOL_GET_PROPS_NAMES, DATA_TYPE_STRING_ARRAY, ZK_OPTIONAL }, + { ZPOOL_CONFIG_LOCK_BEHAVIOR, DATA_TYPE_UINT64, ZK_OPTIONAL }, }; static int @@ -3164,26 +3241,59 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) char **props = NULL; unsigned int n_props = 0; int error; + 
int rc; + uint64_t zpool_lock_behavior; + boolean_t is_locked; if (nvlist_lookup_string_array(innvl, ZPOOL_GET_PROPS_NAMES, &props, &n_props) != 0) { props = NULL; } - if ((error = spa_open(pool, &spa, FTAG)) != 0) { + zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(NULL, innvl); + if (zpool_lock_behavior == -EPERM) { + return (EPERM); + } + + if ((error = spa_open_common_lock_behavior(pool, &spa, FTAG, NULL, NULL, + zpool_lock_behavior)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) != NULL) { + + is_locked = B_FALSE; + if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK) || + (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS)) { + rc = mutex_enter_timeout(&spa_namespace_lock, + MSEC2NSEC(spa_namespace_trylock_ms)); + if (rc == 0) + is_locked = B_TRUE; + } else { + mutex_enter(&spa_namespace_lock); + is_locked = B_TRUE; + } + + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK && + !is_locked) { + return (EAGAIN); + } + + if (is_locked) + spa = spa_lookup(pool); + else + spa = spa_lookup_lockless(pool); + + if (spa != NULL) { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, outnvl); } - mutex_exit(&spa_namespace_lock); + + if (is_locked) + mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 2b002830c82f..c6e1383a901d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -572,7 +572,7 @@ tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', 'zpool_status_007_pos', 'zpool_status_008_pos', - 'zpool_status_features_001_pos'] + 'zpool_status_features_001_pos', 
'zpool_status_lockless'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 8b30b9b91641..14ecc8c1b5f2 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3945,4 +3945,14 @@ function pop_coredump_pattern esac } +# Get the Unix timestamp as expressed in milliseconds. This can be useful for +# measuring how long something takes. This works on both Linux and +# FreeBSD. +function get_unix_timestamp_in_ms +{ + # We can't just use 'date +%s%N' here since FreeBSD 'date' only supports + # one-second resolution. + python3 -c 'import time; print(round(time.time() * 1000))' +} + . ${STF_SUITE}/include/kstat.shlib diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 54b50c9dba77..8151969fddda 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -39,6 +39,7 @@ DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_secon DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms +DEBUG_NAMESPACE_LOCK_DELAY debug_namespace_lock_delay zfs_debug_namespace_lock_delay DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms @@ -90,6 +91,7 @@ SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata +SPA_NAMESPACE_TRYLOCK_MS spa.namespace_trylock_ms spa_namespace_trylock_ms SPA_NOTE_TXG_TIME spa.note_txg_time 
spa_note_txg_time SPA_FLUSH_TXG_TIME spa.flush_txg_time spa_flush_txg_time TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 1517f90e99a5..71fd79411fe6 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1291,6 +1291,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_lockless.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh new file mode 100755 index 000000000000..1971432ee511 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'ZPOOL_LOCK_BEHAVIOR=lockless|trylock|wait' works correctly with +# zpool status. 
+ +DISK=$TEST_BASE_DIR/file-vdev-lockless +ZFS_USER=zfs_user +ZFS_GROUP=zfs_group + +function cleanup +{ + unset ZPOOL_LOCK_BEHAVIOR + log_must restore_tunable DEBUG_NAMESPACE_LOCK_DELAY + log_must restore_tunable SPA_NAMESPACE_TRYLOCK_MS + log_must del_user $ZFS_USER + log_must del_group $ZFS_GROUP + log_must zpool destroy testpool2 + log_must rm -f "$DISK" +} + +log_assert "Verify ZPOOL_LOCK_BEHAVIOR values with zpool status" + +log_onexit cleanup + +log_must save_tunable DEBUG_NAMESPACE_LOCK_DELAY +log_must save_tunable SPA_NAMESPACE_TRYLOCK_MS + +log_must add_group $ZFS_GROUP +log_must add_user $ZFS_GROUP $ZFS_USER + +log_mustnot test -e $DISK +log_must truncate -s 100M $DISK +log_must zpool create testpool2 $DISK + +# Normal, safe, zpool status variants. These should all work. +log_must user_run $ZFS_USER zpool status +export ZPOOL_LOCK_BEHAVIOR="" +log_must user_run $ZFS_USER zpool status +export ZPOOL_LOCK_BEHAVIOR=wait +log_must user_run $ZFS_USER "zpool status" +export ZPOOL_LOCK_BEHAVIOR=trylock +log_must user_run $ZFS_USER "zpool status" + +# We should not have permission as an ordinary user to run lockless +export ZPOOL_LOCK_BEHAVIOR=lockless +log_mustnot user_run $ZFS_USER "zpool status" +log_must zpool status +export ZPOOL_LOCK_BEHAVIOR="" + +# Add an artificial 500ms delay after taking the spa_namespace lock +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 500 +zpool status & +sleep 0.1 + +# This should fail since the previously spawned off 'zpool status' is holding +# the lock for at least 500ms, and "trylock" only tries for 100ms. +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 100 +export ZPOOL_LOCK_BEHAVIOR=trylock +log_mustnot user_run $ZFS_USER "zpool status" +wait + +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 100 +zpool status & +sleep 0.05 + +# This should succeed since the previously spawned off 'zpool status' is holding +# the lock for only 100ms, and "trylock" tries for 300ms. 
+set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 300 +export ZPOOL_LOCK_BEHAVIOR=trylock +log_must user_run $ZFS_USER "zpool status" +wait + +# Now we artificially hold the lock for 500ms, and check that "lockless" +# returns in a shorter amount of time, and without error. +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 500 +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 10 +zpool status & +sleep 0.05 +before=$(get_unix_timestamp_in_ms) +export ZPOOL_LOCK_BEHAVIOR=lockless +log_must zpool status +after=$(get_unix_timestamp_in_ms) +d=$(($after - $before)) +wait +log_note "'lockless' zpool status took $d milliseconds" +log_must test $d -le 300 + +log_pass "ZPOOL_LOCK_BEHAVIOR=lockless|trylock|wait zpool status works" From 909b76e14fea76e90c5393e71144aa989b42c020 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 26 Mar 2025 11:36:37 -0700 Subject: [PATCH 2/2] Update ABI files after lockless zpool status change Signed-off-by: Tony Hutter --- lib/libzfs/libzfs.abi | 56 ++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 184ea4a55b43..fb2098a76054 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -218,6 +218,7 @@ + @@ -2272,6 +2273,15 @@ + + + + + + + + + @@ -2411,7 +2421,7 @@ - + @@ -2469,6 +2479,9 @@ + + + @@ -3164,6 +3177,12 @@ + + + + + + @@ -3174,6 +3193,12 @@ + + + + + + @@ -3269,6 +3294,12 @@ + + + + + + @@ -3778,12 +3809,6 @@ - - - - - - @@ -4542,12 +4567,6 @@ - - - - - - @@ -5160,12 +5179,6 @@ - - - - - - @@ -8804,6 +8817,11 @@ + + + + +