Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions include/os/linux/spl/sys/rwlock.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ spl_rw_clear_owner(krwlock_t *rwp)
rwp->rw_owner = NULL;
}

static inline kthread_t *
rw_owner(krwlock_t *rwp)
static inline const kthread_t *
rw_owner(const krwlock_t *rwp)
{
return (rwp->rw_owner);
}
Expand Down Expand Up @@ -100,7 +100,7 @@ RW_LOCK_HELD(krwlock_t *rwp)
}

static inline int
RW_WRITE_HELD(krwlock_t *rwp)
RW_WRITE_HELD(const krwlock_t *rwp)
{
return (rw_owner(rwp) == current);
}
Expand Down
16 changes: 16 additions & 0 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,22 @@ typedef struct dmu_buf_impl {
dmu_buf_user_t *db_user;
} dmu_buf_impl_t;

/*
* Assert that the value of db.db_data cannot currently be changed. Either
* it's locked, or it's in an immutable state.
*/
void assert_db_data_addr_locked(const dmu_buf_impl_t *db);
/*
* Assert that the provided dbuf's contents can only be accessed by the caller,
* and by no other thread. Either it must be locked, or in a state where
* locking is not required.
*/
#ifdef __linux__
void assert_db_data_contents_locked(dmu_buf_impl_t *db, boolean_t wr);
#else
void assert_db_data_contents_locked(const dmu_buf_impl_t *db, boolean_t wr);
#endif

#define DBUF_HASH_MUTEX(h, idx) \
(&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])

Expand Down
96 changes: 82 additions & 14 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,40 @@ static unsigned long dbuf_metadata_cache_target_bytes(void);
static uint_t dbuf_cache_hiwater_pct = 10;
static uint_t dbuf_cache_lowater_pct = 10;

void
assert_db_data_addr_locked(const dmu_buf_impl_t *db)
{
	/*
	 * Per the contract in dbuf.h: the address held in db.db_data is
	 * immutable for indirect blocks and for the meta dnode object,
	 * so no lock is needed there.  For everything else the caller
	 * must hold db_mtx to pin the pointer.
	 */
	if (db->db_level > 0 || db->db.db_object == DMU_META_DNODE_OBJECT)
		return;
	ASSERT(MUTEX_HELD(&db->db_mtx));
}

void
#ifdef __linux__
assert_db_data_contents_locked(dmu_buf_impl_t *db, boolean_t writer)
#else
assert_db_data_contents_locked(const dmu_buf_impl_t *db, boolean_t writer)
#endif
{
/*
* db_rwlock protects indirect blocks and the data block of the meta
* dnode.
*/
if (db->db_dirtycnt == 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure what this is doing here. If we really need to check it, then we'd need to know that we hold db_mtx.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because of your comment "I don't believe this condition is needed (or even correct). If the db_dirtycnt below is zero (and it should be protected by db_mtx), then the buffer must be empty.". There, you told me not to bother locking db_rwlock if db_dirtycnt were below zero (but I think you meant "equal to zero").

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My comment was specifically about dbuf_verify(), where the code first asserts that db_mtx is held and then verifies that block that must never have any data does not really have any. It does not require locking there after all the conditions met.

return;
if (db->db_level == 0 && db->db.db_object != DMU_META_DNODE_OBJECT)
return;
/* Bonus and Spill blocks should only exist at level 0 */
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db_blkid != DMU_SPILL_BLKID);
if (writer)
ASSERT(RW_WRITE_HELD(&db->db_rwlock));
else
ASSERT(RW_LOCK_HELD(&db->db_rwlock));
}

static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
Expand Down Expand Up @@ -1215,13 +1249,16 @@ dbuf_verify(dmu_buf_impl_t *db)
*/
if (db->db_dirtycnt == 0) {
if (db->db_level == 0) {
uint64_t *buf = db->db.db_data;
uint64_t *buf;
int i;

assert_db_data_contents_locked(db, FALSE);
buf = db->db.db_data;
for (i = 0; i < db->db.db_size >> 3; i++) {
ASSERT0(buf[i]);
}
} else {
assert_db_data_contents_locked(db, FALSE);
blkptr_t *bps = db->db.db_data;
ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
db->db.db_size);
Expand Down Expand Up @@ -1703,6 +1740,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
assert_db_data_contents_locked(db, FALSE);
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
dnode_t *dn = DB_DNODE(db);
Expand Down Expand Up @@ -1733,6 +1771,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
assert_db_data_contents_locked(db, FALSE);
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
} else {
db->db_buf = NULL;
Expand Down Expand Up @@ -3023,6 +3062,7 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
assert_db_data_contents_locked(db, TRUE);
memset(db->db.db_data, 0, db->db.db_size);
db->db_freed_in_flight = FALSE;
db->db_state = DB_CACHED;
Expand Down Expand Up @@ -3155,6 +3195,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
ASSERT(!arc_is_encrypted(buf));
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
assert_db_data_contents_locked(db, TRUE);
memcpy(db->db.db_data, buf->b_data, db->db.db_size);
arc_buf_destroy(buf, db);
return;
Expand Down Expand Up @@ -3398,6 +3439,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
*parentp = NULL;
return (err);
}
assert_db_data_addr_locked(*parentp);
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
(blkid & ((1ULL << epbs) - 1));
return (0);
Expand Down Expand Up @@ -4584,10 +4626,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
} else {
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
assert_db_data_addr_locked(parent_db);
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
VERIFY3U(parent_db->db_level, ==, 1);
VERIFY3P(DB_DNODE(parent_db), ==, dn);
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
assert_db_data_contents_locked(parent_db, FALSE);
blkptr_t *bp = parent_db->db.db_data;
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
}
Expand All @@ -4598,12 +4642,22 @@ dbuf_lightweight_ready(zio_t *zio)
{
dbuf_dirty_record_t *dr = zio->io_private;
blkptr_t *bp = zio->io_bp;
dmu_buf_impl_t *parent_db = NULL;

if (zio->io_error != 0)
return;

dnode_t *dn = dr->dr_dnode;

EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
if (dr->dr_parent == NULL) {
parent_db = dn->dn_dbuf;
} else {
parent_db = dr->dr_parent->dr_dbuf;
}

assert_db_data_addr_locked(parent_db);
rw_enter(&parent_db->db_rwlock, RW_WRITER);
blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
spa_t *spa = dmu_objset_spa(dn->dn_objset);
int64_t delta = bp_get_dsize_sync(spa, bp) -
Expand All @@ -4623,14 +4677,6 @@ dbuf_lightweight_ready(zio_t *zio)
BP_SET_FILL(bp, fill);
}

dmu_buf_impl_t *parent_db;
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
if (dr->dr_parent == NULL) {
parent_db = dn->dn_dbuf;
} else {
parent_db = dr->dr_parent->dr_dbuf;
}
rw_enter(&parent_db->db_rwlock, RW_WRITER);
*bp_orig = *bp;
rw_exit(&parent_db->db_rwlock);
}
Expand Down Expand Up @@ -4664,6 +4710,7 @@ noinline static void
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dnode_t *dn = dr->dr_dnode;
dmu_buf_impl_t *parent_db = NULL;
zio_t *pio;
if (dn->dn_phys->dn_nlevels == 1) {
pio = dn->dn_zio;
Expand All @@ -4682,6 +4729,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* See comment in dbuf_write(). This is so that zio->io_bp_orig
* will have the old BP in dbuf_lightweight_done().
*/
if (dr->dr_dnode->dn_phys->dn_nlevels != 1) {
parent_db = dr->dr_parent->dr_dbuf;
assert_db_data_addr_locked(parent_db);
rw_enter(&parent_db->db_rwlock, RW_READER);
Comment on lines +4732 to +4735
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you think this lock is needed, shouldn't this block be similar to one in dbuf_lightweight_ready() above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Similar" how?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Locking dr->dr_parent->dr_dbuf if it is present or dn->dn_dbuf otherwise, as you've done two chunks above? I see dbuf_lightweight_bp() does check for dn_nlevels == 1 to decide what to access, but I wonder if it is equivalent.

}
dr->dr_bp_copy = *dbuf_lightweight_bp(dr);

dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
Expand All @@ -4691,6 +4743,9 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);

if (parent_db)
rw_exit(&parent_db->db_rwlock);

zio_nowait(dr->dr_zio);
}

Expand Down Expand Up @@ -4847,6 +4902,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
} else {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
assert_db_data_contents_locked(db, FALSE);
memcpy((*datap)->b_data, db->db.db_data, psize);
}
db->db_data_pending = dr;
Expand Down Expand Up @@ -4953,6 +5009,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)

if (dn->dn_type == DMU_OT_DNODE) {
i = 0;
rw_enter(&db->db_rwlock, RW_READER);
while (i < db->db.db_size) {
dnode_phys_t *dnp =
(void *)(((char *)db->db.db_data) + i);
Expand All @@ -4978,6 +5035,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DNODE_MIN_SIZE;
}
}
rw_exit(&db->db_rwlock);
} else {
if (BP_IS_HOLE(bp)) {
fill = 0;
Expand All @@ -4986,6 +5044,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
}
}
} else {
rw_enter(&db->db_rwlock, RW_READER);
blkptr_t *ibp = db->db.db_data;
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
Expand All @@ -4995,6 +5054,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
fill += BP_GET_FILL(ibp);
}
rw_exit(&db->db_rwlock);
}
DB_DNODE_EXIT(db);

Expand Down Expand Up @@ -5029,6 +5089,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);
ASSERT3U(epbs, <, 31);

assert_db_data_addr_locked(db);
rw_enter(&db->db_rwlock, RW_READER);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need db_mtx lock here, while db_rwlock not sure, since at this point all modifications must be completed, since the block is going to be written. I think previous db_rwlock below should be sufficient.

/* Determine if all our children are holes */
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
if (!BP_IS_HOLE(bp))
Expand All @@ -5045,10 +5107,13 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
* anybody from reading the blocks we're about to
* zero out.
*/
rw_enter(&db->db_rwlock, RW_WRITER);
if (!rw_tryupgrade(&db->db_rwlock)) {
rw_exit(&db->db_rwlock);
rw_enter(&db->db_rwlock, RW_WRITER);
}
memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
}
rw_exit(&db->db_rwlock);
}

static void
Expand Down Expand Up @@ -5243,11 +5308,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
* avoid lock contention, only grab it when we are actually
* changing the BP.
*/
if (rw != NULL)
if (rw != NULL && !rw_tryupgrade(rw)) {
rw_exit(rw);
rw_enter(rw, RW_WRITER);
}
*bp = bp_copy;
if (rw != NULL)
rw_exit(rw);
}
}

Expand All @@ -5263,6 +5328,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
return;

assert_db_data_addr_locked(db);
rw_enter(&db->db_rwlock, RW_READER);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is called only for meta-dnode or indirects, so we should not need the db_mtx. About db_rwlock I am not sure. I think by this time modifications of this block should be completed, and old locking inside dbuf_remap_impl() should cover our modifications from other readers.

if (db->db_level > 0) {
blkptr_t *bp = db->db.db_data;
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
Expand All @@ -5281,6 +5348,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
}
}
}
rw_exit(&db->db_rwlock);
}


Expand Down
8 changes: 6 additions & 2 deletions module/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -2234,8 +2234,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
FTAG, (dmu_buf_t **)&db);
ASSERT0(error);
mutex_enter(&db->db_mtx);
data = (before) ? db->db.db_data :
dmu_objset_userquota_find_data(db, tx);
if (before) {
assert_db_data_contents_locked(db, FALSE);
data = db->db.db_data;
} else {
data = dmu_objset_userquota_find_data(db, tx);
}
have_spill = B_TRUE;
} else {
mutex_enter(&dn->dn_mtx);
Expand Down
9 changes: 9 additions & 0 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -445,11 +445,14 @@ dnode_verify(dnode_t *dn)
if (dn->dn_phys->dn_type != DMU_OT_NONE)
ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
#ifdef DEBUG
if (dn->dn_dbuf != NULL) {
assert_db_data_addr_locked(dn->dn_dbuf);
ASSERT3P(dn->dn_phys, ==,
(dnode_phys_t *)dn->dn_dbuf->db.db_data +
(dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
}
#endif
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
}
Expand Down Expand Up @@ -1522,6 +1525,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
epb = db->db.db_size >> DNODE_SHIFT;

idx = object & (epb - 1);
assert_db_data_addr_locked(db);
dn_block = (dnode_phys_t *)db->db.db_data;

ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
Expand All @@ -1537,6 +1541,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
dnh = &dnc->dnc_children[0];

/* Initialize dnode slot status from dnode_phys_t */
rw_enter(&db->db_rwlock, RW_READER);
for (int i = 0; i < epb; i++) {
zrl_init(&dnh[i].dnh_zrlock);

Expand All @@ -1557,6 +1562,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
skip = 0;
}
}
rw_exit(&db->db_rwlock);

dmu_buf_init_user(&dnc->dnc_dbu, NULL,
dnode_buf_evict_async, NULL);
Expand Down Expand Up @@ -1608,6 +1614,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
dn = dnh->dnh_dnode;
} else {
assert_db_data_contents_locked(db, FALSE);
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
dmu_buf_add_user_size(&db->db,
Expand Down Expand Up @@ -1681,6 +1688,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
dn = dnh->dnh_dnode;
} else {
assert_db_data_contents_locked(db, FALSE);
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
Expand Down Expand Up @@ -2564,6 +2572,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
dbuf_rele(db, FTAG);
return (error);
}
assert_db_data_addr_locked(db);
data = db->db.db_data;
rw_enter(&db->db_rwlock, RW_READER);
}
Expand Down
Loading
Loading