diff --git a/include/os/linux/spl/sys/rwlock.h b/include/os/linux/spl/sys/rwlock.h
index c883836c2f83..912e6ead6524 100644
--- a/include/os/linux/spl/sys/rwlock.h
+++ b/include/os/linux/spl/sys/rwlock.h
@@ -63,8 +63,8 @@ spl_rw_clear_owner(krwlock_t *rwp)
 	rwp->rw_owner = NULL;
 }
 
-static inline kthread_t *
-rw_owner(krwlock_t *rwp)
+static inline const kthread_t *
+rw_owner(const krwlock_t *rwp)
 {
 	return (rwp->rw_owner);
 }
@@ -100,7 +100,7 @@ RW_LOCK_HELD(krwlock_t *rwp)
 }
 
 static inline int
-RW_WRITE_HELD(krwlock_t *rwp)
+RW_WRITE_HELD(const krwlock_t *rwp)
 {
 	return (rw_owner(rwp) == current);
 }
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index baf3b1508335..9c549f0647cf 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -329,6 +329,22 @@ typedef struct dmu_buf_impl {
 	dmu_buf_user_t *db_user;
 } dmu_buf_impl_t;
 
+/*
+ * Assert that the value of db.db_data cannot currently be changed. Either
+ * it's locked, or it's in an immutable state.
+ */
+void assert_db_data_addr_locked(const dmu_buf_impl_t *db);
+/*
+ * Assert that the provided dbuf's contents cannot change underneath the
+ * caller: either the dbuf is locked (as writer when 'wr' is set), or it is
+ * in a state where locking is not required.
+ */
+#ifdef __linux__
+void assert_db_data_contents_locked(dmu_buf_impl_t *db, boolean_t wr);
+#else
+void assert_db_data_contents_locked(const dmu_buf_impl_t *db, boolean_t wr);
+#endif
+
 #define	DBUF_HASH_MUTEX(h, idx) \
 	(&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])
 
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index fccc4c5b5b94..a399499200de 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -286,6 +286,38 @@ static unsigned long dbuf_metadata_cache_target_bytes(void);
 static uint_t dbuf_cache_hiwater_pct = 10;
 static uint_t dbuf_cache_lowater_pct = 10;
 
+void
+assert_db_data_addr_locked(const dmu_buf_impl_t *db)
+{
+	if (db->db_level > 0)
+		return;
+	else if (db->db.db_object == DMU_META_DNODE_OBJECT)
+		return;
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+}
+
+void
+#ifdef __linux__
+assert_db_data_contents_locked(dmu_buf_impl_t *db, boolean_t writer)
+#else
+assert_db_data_contents_locked(const dmu_buf_impl_t *db, boolean_t writer)
+#endif
+{
+	/*
+	 * db_rwlock protects indirect blocks and the data block of the meta
+	 * dnode.
+	 */
+	if (db->db_level == 0 && db->db.db_object != DMU_META_DNODE_OBJECT)
+		return;
+	/* Bonus and Spill blocks should only exist at level 0 */
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+	ASSERT(db->db_blkid != DMU_SPILL_BLKID);
+	if (writer)
+		ASSERT(RW_WRITE_HELD(&db->db_rwlock));
+	else
+		ASSERT(RW_LOCK_HELD(&db->db_rwlock));
+}
+
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {
@@ -1214,10 +1246,12 @@ dbuf_verify(dmu_buf_impl_t *db)
 		 * partially fill in a hole.
 		 */
 		if (db->db_dirtycnt == 0) {
+			rw_enter(&db->db_rwlock, RW_READER);
 			if (db->db_level == 0) {
-				uint64_t *buf = db->db.db_data;
+				uint64_t *buf;
 				int i;
 
+				buf = db->db.db_data;
 				for (i = 0; i < db->db.db_size >> 3; i++) {
 					ASSERT0(buf[i]);
 				}
@@ -1248,6 +1282,7 @@ dbuf_verify(dmu_buf_impl_t *db)
 					ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
 				}
 			}
+			rw_exit(&db->db_rwlock);
 		}
 	}
 	DB_DNODE_EXIT(db);
@@ -1703,6 +1738,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 		int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 		dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
 		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
+		assert_db_data_contents_locked(db, FALSE);
 		memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
 	} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		dnode_t *dn = DB_DNODE(db);
@@ -1733,6 +1769,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 		} else {
 			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
 		}
+		assert_db_data_contents_locked(db, FALSE);
 		memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
 	} else {
 		db->db_buf = NULL;
@@ -3023,6 +3060,7 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		/* we were freed while filling */
 		/* XXX dbuf_undirty? */
+		assert_db_data_contents_locked(db, TRUE);
 		memset(db->db.db_data, 0, db->db.db_size);
 		db->db_freed_in_flight = FALSE;
 		db->db_state = DB_CACHED;
@@ -3155,6 +3193,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
 		ASSERT(!arc_is_encrypted(buf));
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_dirty(db, tx);
+		assert_db_data_contents_locked(db, TRUE);
 		memcpy(db->db.db_data, buf->b_data, db->db.db_size);
 		arc_buf_destroy(buf, db);
 		return;
@@ -3398,6 +3437,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 			*parentp = NULL;
 			return (err);
 		}
+		assert_db_data_addr_locked(*parentp);
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
 		return (0);
@@ -4584,10 +4624,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
 		return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
 	} else {
 		dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
+		assert_db_data_addr_locked(parent_db);
 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 		VERIFY3U(parent_db->db_level, ==, 1);
 		VERIFY3P(DB_DNODE(parent_db), ==, dn);
 		VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
+		assert_db_data_contents_locked(parent_db, FALSE);
 		blkptr_t *bp = parent_db->db.db_data;
 		return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
 	}
@@ -4598,12 +4640,22 @@ dbuf_lightweight_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
+	dmu_buf_impl_t *parent_db = NULL;
 
 	if (zio->io_error != 0)
 		return;
 
 	dnode_t *dn = dr->dr_dnode;
+	EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
+	if (dr->dr_parent == NULL) {
+		parent_db = dn->dn_dbuf;
+	} else {
+		parent_db = dr->dr_parent->dr_dbuf;
+	}
+
+	assert_db_data_addr_locked(parent_db);
+	rw_enter(&parent_db->db_rwlock, RW_WRITER);
 
 	blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
 	int64_t delta = bp_get_dsize_sync(spa, bp) -
@@ -4623,14 +4675,6 @@ dbuf_lightweight_ready(zio_t *zio)
 		BP_SET_FILL(bp, fill);
 	}
 
-	dmu_buf_impl_t *parent_db;
-	EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
-	if (dr->dr_parent == NULL) {
-		parent_db = dn->dn_dbuf;
-	} else {
-		parent_db = dr->dr_parent->dr_dbuf;
-	}
-	rw_enter(&parent_db->db_rwlock, RW_WRITER);
 	*bp_orig = *bp;
 	rw_exit(&parent_db->db_rwlock);
 }
@@ -4664,6 +4708,7 @@ noinline static void
 dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dnode_t *dn = dr->dr_dnode;
+	dmu_buf_impl_t *parent_db = NULL;
 	zio_t *pio;
 	if (dn->dn_phys->dn_nlevels == 1) {
 		pio = dn->dn_zio;
@@ -4682,6 +4727,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	 * See comment in dbuf_write(). This is so that zio->io_bp_orig
 	 * will have the old BP in dbuf_lightweight_done().
 	 */
+	if (dr->dr_dnode->dn_phys->dn_nlevels != 1) {
+		parent_db = dr->dr_parent->dr_dbuf;
+		assert_db_data_addr_locked(parent_db);
+		rw_enter(&parent_db->db_rwlock, RW_READER);
+	}
 	dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
 
 	dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
@@ -4691,6 +4741,9 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	    dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
 	    ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
 
+	if (parent_db)
+		rw_exit(&parent_db->db_rwlock);
+
 	zio_nowait(dr->dr_zio);
 }
 
@@ -4847,6 +4900,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		} else {
 			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
 		}
+		assert_db_data_contents_locked(db, FALSE);
 		memcpy((*datap)->b_data, db->db.db_data, psize);
 	}
 	db->db_data_pending = dr;
@@ -4953,6 +5007,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			i = 0;
+			rw_enter(&db->db_rwlock, RW_READER);
 			while (i < db->db.db_size) {
 				dnode_phys_t *dnp =
 				    (void *)(((char *)db->db.db_data) + i);
@@ -4978,6 +5033,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 					    DNODE_MIN_SIZE;
 				}
 			}
+			rw_exit(&db->db_rwlock);
 		} else {
 			if (BP_IS_HOLE(bp)) {
 				fill = 0;
@@ -4986,6 +5042,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 			}
 		}
 	} else {
+		rw_enter(&db->db_rwlock, RW_READER);
 		blkptr_t *ibp = db->db.db_data;
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
@@ -4995,6 +5052,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 			    BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
 			fill += BP_GET_FILL(ibp);
 		}
+		rw_exit(&db->db_rwlock);
 	}
 	DB_DNODE_EXIT(db);
 
@@ -5029,6 +5087,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 	DB_DNODE_EXIT(db);
 	ASSERT3U(epbs, <, 31);
 
+	assert_db_data_addr_locked(db);
+	rw_enter(&db->db_rwlock, RW_READER);
 	/* Determine if all our children are holes */
 	for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
 		if (!BP_IS_HOLE(bp))
@@ -5045,10 +5105,13 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 		 * anybody from reading the blocks we're about to
 		 * zero out.
 		 */
-		rw_enter(&db->db_rwlock, RW_WRITER);
+		if (!rw_tryupgrade(&db->db_rwlock)) {
+			rw_exit(&db->db_rwlock);
+			rw_enter(&db->db_rwlock, RW_WRITER);
+		}
 		memset(db->db.db_data, 0, db->db.db_size);
-		rw_exit(&db->db_rwlock);
 	}
+	rw_exit(&db->db_rwlock);
 }
 
 static void
@@ -5243,11 +5306,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 		 * avoid lock contention, only grab it when we are actually
 		 * changing the BP.
 		 */
-		if (rw != NULL)
+		if (rw != NULL && !rw_tryupgrade(rw)) {
+			rw_exit(rw);
 			rw_enter(rw, RW_WRITER);
+		}
 		*bp = bp_copy;
-		if (rw != NULL)
-			rw_exit(rw);
 	}
 }
 
@@ -5263,6 +5326,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
 		return;
 
+	assert_db_data_addr_locked(db);
+	rw_enter(&db->db_rwlock, RW_READER);
 	if (db->db_level > 0) {
 		blkptr_t *bp = db->db.db_data;
 		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
@@ -5281,6 +5346,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 			}
 		}
 	}
+	rw_exit(&db->db_rwlock);
 }
 
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 8e6b569c2100..5eb3a4fe6700 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -2234,8 +2234,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 		    FTAG, (dmu_buf_t **)&db);
 		ASSERT0(error);
 		mutex_enter(&db->db_mtx);
-		data = (before) ? db->db.db_data :
-		    dmu_objset_userquota_find_data(db, tx);
+		if (before) {
+			assert_db_data_contents_locked(db, FALSE);
+			data = db->db.db_data;
+		} else {
+			data = dmu_objset_userquota_find_data(db, tx);
+		}
 		have_spill = B_TRUE;
 	} else {
 		mutex_enter(&dn->dn_mtx);
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 6c150d31c669..1a07511bc5de 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -445,11 +445,14 @@ dnode_verify(dnode_t *dn)
 		if (dn->dn_phys->dn_type != DMU_OT_NONE)
 			ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
 		ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
+#ifdef DEBUG
 		if (dn->dn_dbuf != NULL) {
+			assert_db_data_addr_locked(dn->dn_dbuf);
 			ASSERT3P(dn->dn_phys, ==,
 			    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
 			    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
 		}
+#endif
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}
@@ -1522,6 +1525,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 
 	epb = db->db.db_size >> DNODE_SHIFT;
 	idx = object & (epb - 1);
+	assert_db_data_addr_locked(db);
 	dn_block = (dnode_phys_t *)db->db.db_data;
 
 	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
@@ -1537,6 +1541,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		dnh = &dnc->dnc_children[0];
 
 		/* Initialize dnode slot status from dnode_phys_t */
+		rw_enter(&db->db_rwlock, RW_READER);
 		for (int i = 0; i < epb; i++) {
 			zrl_init(&dnh[i].dnh_zrlock);
 
@@ -1557,6 +1562,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 				skip = 0;
 			}
 		}
+		rw_exit(&db->db_rwlock);
 
 		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
 		    dnode_buf_evict_async, NULL);
@@ -1608,8 +1614,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 			DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
 			dn = dnh->dnh_dnode;
 		} else {
+			rw_enter(&db->db_rwlock, RW_READER);
 			dn = dnode_create(os, dn_block + idx, db,
 			    object, dnh);
+			rw_exit(&db->db_rwlock);
 			dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
 		}
 
@@ -1681,8 +1689,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
 			dn = dnh->dnh_dnode;
 		} else {
+			rw_enter(&db->db_rwlock, RW_READER);
 			dn = dnode_create(os, dn_block + idx, db,
 			    object, dnh);
+			rw_exit(&db->db_rwlock);
 			dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
 		}
 
@@ -2564,6 +2574,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 			dbuf_rele(db, FTAG);
 			return (error);
 		}
+		assert_db_data_addr_locked(db);
 		data = db->db.db_data;
 		rw_enter(&db->db_rwlock, RW_READER);
 	}
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 046ceddb3609..5776573d51b6 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -79,6 +79,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
 	if (dn->dn_dbuf != NULL)
 		rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
+	assert_db_data_addr_locked(db);
 	rw_enter(&db->db_rwlock, RW_WRITER);
 	ASSERT(db->db.db_data);
 	ASSERT(arc_released(db->db_buf));
@@ -233,6 +234,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 		 * future txg.
 		 */
 		mutex_enter(&child->db_mtx);
+		assert_db_data_contents_locked(child, FALSE);
 		buf = child->db.db_data;
 		if (buf != NULL && child->db_state != DB_FILL &&
 		    list_is_empty(&child->db_dirty_records)) {
@@ -310,6 +312,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 	dmu_buf_unlock_parent(db, dblt, FTAG);
 
 	dbuf_release_bp(db);
+	assert_db_data_addr_locked(db);
 	bp = db->db.db_data;
 
 	DB_DNODE_ENTER(db);
diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh
index 04f3b6f32cb8..986ccd133901 100755
--- a/scripts/zfs-tests.sh
+++ b/scripts/zfs-tests.sh
@@ -44,6 +44,7 @@ FILESIZE="4G"
 DEFAULT_RUNFILES="common.run,$(uname | tr '[:upper:]' '[:lower:]').run"
 RUNFILES=${RUNFILES:-$DEFAULT_RUNFILES}
 FILEDIR=${FILEDIR:-/var/tmp}
+DISKS="/dev/vtbd1 /dev/vtbd2 /dev/vtbd3 /dev/vtbd4 /dev/vtbd5"
 DISKS=${DISKS:-""}
 SINGLETEST=""
 SINGLETESTUSER="root"
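
For reference, a minimal sketch of the usage pattern this patch establishes (compare the dbuf_remap() and dbuf_write_children_ready() hunks above): take db_rwlock as reader, assert the expected protection, and only then dereference db.db_data. Indirect blocks are covered by db_rwlock, while level-0 data blocks are covered by db_mtx in assert_db_data_addr_locked(). The function below, example_count_holes, is hypothetical and written only for illustration; it is not part of the patch.

```c
/*
 * Hypothetical illustration only -- not part of the patch. Walks the block
 * pointers of an indirect dbuf under the locking that the new assertions
 * document: db_rwlock held at least as reader.
 */
static int
example_count_holes(dmu_buf_impl_t *db)
{
	int holes = 0;

	ASSERT(db->db_level > 0);

	/* Indirect block contents are protected by db_rwlock. */
	rw_enter(&db->db_rwlock, RW_READER);
	assert_db_data_addr_locked(db);		/* no-op above level 0 */
	assert_db_data_contents_locked(db, FALSE); /* reader hold suffices */

	blkptr_t *bp = db->db.db_data;
	for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
		if (BP_IS_HOLE(&bp[i]))
			holes++;
	}
	rw_exit(&db->db_rwlock);

	return (holes);
}
```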