-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Update dnode_next_offset_level to accept blkid instead of offset #17792
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2496,26 +2496,27 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) | |
} | ||
|
||
/* | ||
* Scans a block at the indicated "level" looking for a hole or data, | ||
* depending on 'flags'. | ||
* Scans the block at the indicated "level" looking for a hole or data, | ||
* depending on 'flags' starting from array position given by *index. | ||
* | ||
* If level > 0, then we are scanning an indirect block looking at its | ||
* pointers. If level == 0, then we are looking at a block of dnodes. | ||
* If lvl > 0, then we are scanning an indirect block looking at its | ||
* pointers. If lvl == 0, then we are looking at a block of dnodes. | ||
* | ||
* If we don't find what we are looking for in the block, we return ESRCH. | ||
* Otherwise, return with *offset pointing to the beginning (if searching | ||
* forwards) or end (if searching backwards) of the range covered by the | ||
* block pointer we matched on (or dnode). | ||
* Otherwise, return with *index set to the matching array position. | ||
* | ||
* The basic search algorithm used below by dnode_next_offset() is to | ||
* use this function to search up the block tree (widen the search) until | ||
* we find something (i.e., we don't return ESRCH) and then search back | ||
* down the tree (narrow the search) until we reach our original search | ||
* level. | ||
* In both cases, *offset is updated to point at the matched BP/dnode or | ||
* the next offset to search (unless at the limit of possible offsets). | ||
* | ||
* The basic search algorithm used below by dnode_next_offset() uses this | ||
* function to perform a block-order tree traversal. We search up the block | ||
* tree (widen the search) until we find something (i.e., we don't return | ||
* ESRCH) and then search back down the tree (narrow the search) until we | ||
* reach our original search level or backtrack up because nothing matches. | ||
*/ | ||
static int | ||
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | ||
int lvl, uint64_t blkfill, uint64_t txg) | ||
dnode_next_offset_level(dnode_t *dn, int flags, int lvl, uint64_t blkid, | ||
int *index, uint64_t blkfill, uint64_t txg, uint64_t *offset) | ||
{ | ||
dmu_buf_impl_t *db = NULL; | ||
void *data = NULL; | ||
|
@@ -2541,20 +2542,12 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | |
rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock, | ||
RW_READER, FTAG); | ||
} else { | ||
uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); | ||
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); | ||
if (error) { | ||
if (error != ENOENT) | ||
return (error); | ||
if (hole) | ||
return (0); | ||
/* | ||
* This can only happen when we are searching up | ||
* the block tree for data. We don't really need to | ||
* adjust the offset, as we will just end up looking | ||
* at the pointer to this block in its parent, and its | ||
* going to be unallocated, so we will skip over it. | ||
*/ | ||
return (SET_ERROR(ESRCH)); | ||
} | ||
error = dbuf_read(db, NULL, | ||
|
@@ -2582,20 +2575,19 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | |
ASSERT(dn->dn_type == DMU_OT_DNODE); | ||
ASSERT(!(flags & DNODE_FIND_BACKWARDS)); | ||
|
||
for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); | ||
i < blkfill; i += dnp[i].dn_extra_slots + 1) { | ||
for (i = *index; i < blkfill; i += dnp[i].dn_extra_slots + 1) { | ||
if ((dnp[i].dn_type == DMU_OT_NONE) == hole) | ||
break; | ||
} | ||
|
||
if (i == blkfill) | ||
error = SET_ERROR(ESRCH); | ||
|
||
*index = i; | ||
*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + | ||
(i << DNODE_SHIFT); | ||
} else { | ||
blkptr_t *bp = data; | ||
uint64_t start = *offset; | ||
span = (lvl - 1) * epbs + dn->dn_datablkshift; | ||
minfill = 0; | ||
maxfill = blkfill << ((lvl - 1) * epbs); | ||
|
@@ -2605,40 +2597,27 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | |
else | ||
minfill++; | ||
|
||
if (span >= 8 * sizeof (*offset)) { | ||
/* This only happens on the highest indirection level */ | ||
ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1); | ||
*offset = 0; | ||
} else { | ||
*offset = *offset >> span; | ||
} | ||
|
||
for (i = BF64_GET(*offset, 0, epbs); | ||
i >= 0 && i < epb; i += inc) { | ||
for (i = *index; i >= 0 && i < epb; i += inc) { | ||
if (BP_GET_FILL(&bp[i]) >= minfill && | ||
BP_GET_FILL(&bp[i]) <= maxfill && | ||
(hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg)) | ||
break; | ||
if (inc > 0 || *offset > 0) | ||
*offset += inc; | ||
} | ||
|
||
if (span >= 8 * sizeof (*offset)) { | ||
*offset = start; | ||
} else { | ||
*offset = *offset << span; | ||
} | ||
|
||
if (inc < 0) { | ||
/* traversing backwards; position offset at the end */ | ||
if (span < 8 * sizeof (*offset)) | ||
*offset = MIN(*offset + (1ULL << span) - 1, | ||
start); | ||
} else if (*offset < start) { | ||
*offset = start; | ||
} | ||
if (i < 0 || i >= epb) | ||
error = SET_ERROR(ESRCH); | ||
|
||
*index = i; | ||
if (span < 8 * sizeof (*offset)) { | ||
uint64_t nblk = blkid << epbs; | ||
if (i >= 0 || blkid != 0) | ||
nblk += i; | ||
if ((nblk >> (8 * sizeof (*offset) - span)) == 0) | ||
*offset = (flags & DNODE_FIND_BACKWARDS) ? | ||
/* backwards: position offset at the end */ | ||
MIN(*offset, ((nblk + 1) << span) - 1) : | ||
MAX(*offset, nblk << span); | ||
Comment on lines
+2615
to
+2619
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we are searching for a hole forward from the last block of a file with size close to 2^^64, and this level of indirection does not end at 2^^64, I suppose the code above will produce There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, you're right. This is also a problem before this PR (and before 16025). I can write a separate PR to address it. This is the case where there are no holes from the starting offset up to the end of the object, the object ends at 264 - 1, and there's an unallocated BP at L0 offset 264 in the indirect block at f.e. This happens with datablkshift=17, indblkshift=17, at L5 block 0 which covers [0, 267). Only BPs at indices [0,127] in that block ever get used each one covering 257 L0 offsets. In your scenario, we're at That case indeed should do something else, but note that this has always been broken. Before this would compute The fix I've got in mind for this is to limit Meanwhile I think in practice this does not have any practical effect since VFS layers don't allow files >= 263 bytes long because signed offsets. Note this case arises when all of the physical BPs are allocated (virtual hole case). |
||
} | ||
} | ||
|
||
if (db != NULL) { | ||
|
@@ -2655,39 +2634,25 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | |
return (error); | ||
} | ||
|
||
/* | ||
* Adjust *offset to the next (or previous) block byte offset at lvl. | ||
* Returns FALSE if *offset would overflow or underflow. | ||
*/ | ||
static boolean_t | ||
dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl) | ||
{ | ||
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; | ||
int span = lvl * epbs + dn->dn_datablkshift; | ||
uint64_t blkid, maxblkid; | ||
|
||
if (span >= 8 * sizeof (uint64_t)) | ||
return (B_FALSE); | ||
|
||
blkid = *offset >> span; | ||
maxblkid = 1ULL << (8 * sizeof (*offset) - span); | ||
if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid) | ||
*offset = (blkid + 1) << span; | ||
else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0) | ||
*offset = (blkid << span) - 1; | ||
else | ||
return (B_FALSE); | ||
|
||
return (B_TRUE); | ||
} | ||
|
||
/* | ||
* Find the next hole, data, or sparse region at or after *offset. | ||
* The value 'blkfill' tells us how many items we expect to find | ||
* in an L0 data block; this value is 1 for normal objects, | ||
* DNODES_PER_BLOCK for the meta dnode, and some fraction of | ||
* DNODES_PER_BLOCK when searching for sparse regions thereof. | ||
* | ||
* If minlvl == 0, this searches for dnodes or unallocated dnodes. | ||
* If found, *offset points to the first offset of the matched dnode. | ||
* Backwards search is not allowed for dnodes. | ||
* | ||
* If minlvl > 0, this searches for blocks at the given level. | ||
* If found, *offset points to the first L0 offset of the block | ||
* (or for backwards search, the last offset, inclusive). | ||
* | ||
* If not found, in both cases, *offset is set to the first (or last) | ||
* offset of the unallocated indirect block where the search ended or | ||
* the initial offset if no such block was encountered. | ||
* | ||
* Examples: | ||
* | ||
* dnode_next_offset(dn, flags, offset, 1, 1, 0); | ||
|
@@ -2708,7 +2673,8 @@ int | |
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, | ||
int minlvl, uint64_t blkfill, uint64_t txg) | ||
{ | ||
uint64_t matched = *offset; | ||
uint64_t blkid; | ||
int index, epbs; | ||
int lvl, maxlvl; | ||
int error = 0; | ||
|
||
|
@@ -2730,18 +2696,31 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, | |
goto out; | ||
} | ||
|
||
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; | ||
maxlvl = dn->dn_phys->dn_nlevels; | ||
|
||
if (minlvl > 0) { | ||
uint64_t n = dbuf_whichblock(dn, minlvl - 1, *offset); | ||
blkid = n >> epbs; | ||
index = BF64_GET(n, 0, epbs); | ||
} else { | ||
blkid = dbuf_whichblock(dn, 0, *offset); | ||
index = (*offset >> DNODE_SHIFT) & (blkfill - 1); | ||
ASSERT3U(BF64_GET(*offset, 0, DNODE_SHIFT), ==, 0); | ||
} | ||
|
||
for (lvl = minlvl; lvl <= maxlvl; ) { | ||
error = dnode_next_offset_level(dn, | ||
flags, offset, lvl, blkfill, txg); | ||
flags, lvl, blkid, &index, blkfill, txg, offset); | ||
|
||
if (error == 0 && lvl > minlvl) { | ||
/* Continue search at matched block in lvl-1. */ | ||
blkid = (blkid << epbs) + index; | ||
index = 0; | ||
--lvl; | ||
matched = *offset; | ||
} else if (error == ESRCH && lvl < maxlvl && | ||
dnode_next_block(dn, flags, &matched, lvl)) { | ||
} else if (error == ESRCH && lvl < maxlvl) { | ||
/* | ||
* Continue search at next/prev offset in lvl+1 block. | ||
* Continue search at next/prev index in lvl+1 block. | ||
* | ||
* Usually we only search upwards at the start of the | ||
* search as higher level blocks point at a matching | ||
|
@@ -2752,13 +2731,14 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, | |
* happens if we are still syncing out the tree, and | ||
* some BP's at higher levels are not updated yet. | ||
* | ||
* We must adjust offset to avoid coming back to the | ||
* same offset and getting stuck looping forever. This | ||
* also deals with the case where offset is already at | ||
* the beginning or end of the object. | ||
* We must adjust index to avoid coming back to the | ||
* same offset and getting stuck looping forever. The | ||
* next loop goes up again if index is -1 or (1<<epbs). | ||
*/ | ||
index = BF64_GET(blkid, 0, epbs) + | ||
((flags & DNODE_FIND_BACKWARDS) ? -1 : 1); | ||
blkid = blkid >> epbs; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I understand, when searching backwards, once it reach There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed. After this PR search always ends at maxlvl (error == The previous code had to break to prevent a loop for all the cases where |
||
++lvl; | ||
*offset = matched; | ||
} else { | ||
break; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess if
i < 0
, then we returnESRCH
and offset does not matter. Am I wrong?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're not wrong. If
i < 0
thendnode_next_offset_level
returnsESRCH
, anddnode_next_offset
doesn't use the offset to find the next block after this PR.However there is some offset returned even when the entire search ends with
ESRCH
fromdnode_next_offset
. That final offset depends on which blocks exist in the tree since each level updates the offset only for blocks that exist with some value if greater or smaller than the last (or initial) offset.None of the current kernel callers use the
ESRCH
result fromdnode_next_offset
as far as I can tell (especially for backwards search). That said, I'm preserving the behavior for now for the sake of a smaller PR.I'm open to feedback if you think we want to do more here. The returned offset in the
ESRCH
case is pretty dubious overall.