Skip to content

Commit cc8a093

Browse files
committed
Merge tag 'for-6.14-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:

 - extent map shrinker fixes:

     - fix potential use after free accessing an inode to reach fs_info,
       the shrinker could do iput() in the meantime

     - skip unnecessary scanning of inodes without extent maps

     - do direct iput(), no need for indirection via workqueue

 - in block < page mode, fix race when extending i_size in buffered mode

 - fix minor memory leak in selftests

 - print descriptive error message when seeding device is not found

* tag 'for-6.14-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix data overwriting bug during buffered write when block size < page size
  btrfs: output an error message if btrfs failed to find the seed fsid
  btrfs: do regular iput instead of delayed iput during extent map shrinking
  btrfs: skip inodes without loaded extent maps when shrinking extent maps
  btrfs: fix use-after-free on inode when scanning root during em shrinking
  btrfs: selftests: fix btrfs_test_delayed_refs() leak of transaction
2 parents 3d85d6c + efa11fd commit cc8a093

File tree

4 files changed

+73
-26
lines changed

4 files changed

+73
-26
lines changed

fs/btrfs/extent_map.c

+59-24
Original file line numberDiff line numberDiff line change
@@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
11281128
long nr_dropped = 0;
11291129
struct rb_node *node;
11301130

1131+
lockdep_assert_held_write(&tree->lock);
1132+
11311133
/*
11321134
* Take the mmap lock so that we serialize with the inode logging phase
11331135
* of fsync because we may need to set the full sync flag on the inode,
@@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
11391141
* to find new extents, which may not be there yet because ordered
11401142
* extents haven't completed yet.
11411143
*
1142-
* We also do a try lock because otherwise we could deadlock. This is
1143-
* because the shrinker for this filesystem may be invoked while we are
1144-
* in a path that is holding the mmap lock in write mode. For example in
1145-
* a reflink operation while COWing an extent buffer, when allocating
1146-
* pages for a new extent buffer and under memory pressure, the shrinker
1147-
* may be invoked, and therefore we would deadlock by attempting to read
1148-
* lock the mmap lock while we are holding already a write lock on it.
1144+
* We also do a try lock because we don't want to block for too long and
1145+
* we are holding the extent map tree's lock in write mode.
11491146
*/
11501147
if (!down_read_trylock(&inode->i_mmap_lock))
11511148
return 0;
11521149

1153-
/*
1154-
* We want to be fast so if the lock is busy we don't want to spend time
1155-
* waiting for it - either some task is about to do IO for the inode or
1156-
* we may have another task shrinking extent maps, here in this code, so
1157-
* skip this inode.
1158-
*/
1159-
if (!write_trylock(&tree->lock)) {
1160-
up_read(&inode->i_mmap_lock);
1161-
return 0;
1162-
}
1163-
11641150
node = rb_first(&tree->root);
11651151
while (node) {
11661152
struct rb_node *next = rb_next(node);
@@ -1201,34 +1187,83 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
12011187
break;
12021188
node = next;
12031189
}
1204-
write_unlock(&tree->lock);
12051190
up_read(&inode->i_mmap_lock);
12061191

12071192
return nr_dropped;
12081193
}
12091194

1195+
static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
1196+
u64 min_ino)
1197+
{
1198+
struct btrfs_inode *inode;
1199+
unsigned long from = min_ino;
1200+
1201+
xa_lock(&root->inodes);
1202+
while (true) {
1203+
struct extent_map_tree *tree;
1204+
1205+
inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
1206+
if (!inode)
1207+
break;
1208+
1209+
tree = &inode->extent_tree;
1210+
1211+
/*
1212+
* We want to be fast so if the lock is busy we don't want to
1213+
* spend time waiting for it (some task is about to do IO for
1214+
* the inode).
1215+
*/
1216+
if (!write_trylock(&tree->lock))
1217+
goto next;
1218+
1219+
/*
1220+
* Skip inode if it doesn't have loaded extent maps, so we avoid
1221+
* getting a reference and doing an iput later. This includes
1222+
* cases like files that were opened for things like stat(2), or
1223+
* files with all extent maps previously released through the
1224+
* release folio callback (btrfs_release_folio()) or released in
1225+
* a previous run, or directories which never have extent maps.
1226+
*/
1227+
if (RB_EMPTY_ROOT(&tree->root)) {
1228+
write_unlock(&tree->lock);
1229+
goto next;
1230+
}
1231+
1232+
if (igrab(&inode->vfs_inode))
1233+
break;
1234+
1235+
write_unlock(&tree->lock);
1236+
next:
1237+
from = btrfs_ino(inode) + 1;
1238+
cond_resched_lock(&root->inodes.xa_lock);
1239+
}
1240+
xa_unlock(&root->inodes);
1241+
1242+
return inode;
1243+
}
1244+
12101245
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
12111246
{
12121247
struct btrfs_fs_info *fs_info = root->fs_info;
12131248
struct btrfs_inode *inode;
12141249
long nr_dropped = 0;
12151250
u64 min_ino = fs_info->em_shrinker_last_ino + 1;
12161251

1217-
inode = btrfs_find_first_inode(root, min_ino);
1252+
inode = find_first_inode_to_shrink(root, min_ino);
12181253
while (inode) {
12191254
nr_dropped += btrfs_scan_inode(inode, ctx);
1255+
write_unlock(&inode->extent_tree.lock);
12201256

12211257
min_ino = btrfs_ino(inode) + 1;
12221258
fs_info->em_shrinker_last_ino = btrfs_ino(inode);
1223-
btrfs_add_delayed_iput(inode);
1259+
iput(&inode->vfs_inode);
12241260

1225-
if (ctx->scanned >= ctx->nr_to_scan ||
1226-
btrfs_fs_closing(inode->root->fs_info))
1261+
if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info))
12271262
break;
12281263

12291264
cond_resched();
12301265

1231-
inode = btrfs_find_first_inode(root, min_ino);
1266+
inode = find_first_inode_to_shrink(root, min_ino);
12321267
}
12331268

12341269
if (inode) {

fs/btrfs/file.c

+8-1
Original file line numberDiff line numberDiff line change
@@ -1090,7 +1090,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
10901090
u64 lockend;
10911091
size_t num_written = 0;
10921092
ssize_t ret;
1093-
loff_t old_isize = i_size_read(inode);
1093+
loff_t old_isize;
10941094
unsigned int ilock_flags = 0;
10951095
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
10961096
unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
@@ -1103,6 +1103,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
11031103
if (ret < 0)
11041104
return ret;
11051105

1106+
/*
1107+
* We can only trust the isize with inode lock held, or it can race with
1108+
* other buffered writes and cause incorrect call of
1109+
* pagecache_isize_extended() to overwrite existing data.
1110+
*/
1111+
old_isize = i_size_read(inode);
1112+
11061113
ret = generic_write_checks(iocb, i);
11071114
if (ret <= 0)
11081115
goto out;

fs/btrfs/tests/delayed-refs-tests.c

+1
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
10091009
if (!ret)
10101010
ret = select_delayed_refs_test(&trans);
10111011

1012+
kfree(transaction);
10121013
out_free_fs_info:
10131014
btrfs_free_dummy_fs_info(fs_info);
10141015
return ret;

fs/btrfs/volumes.c

+5-1
Original file line numberDiff line numberDiff line change
@@ -7200,8 +7200,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
72007200

72017201
fs_devices = find_fsid(fsid, NULL);
72027202
if (!fs_devices) {
7203-
if (!btrfs_test_opt(fs_info, DEGRADED))
7203+
if (!btrfs_test_opt(fs_info, DEGRADED)) {
7204+
btrfs_err(fs_info,
7205+
"failed to find fsid %pU when attempting to open seed devices",
7206+
fsid);
72047207
return ERR_PTR(-ENOENT);
7208+
}
72057209

72067210
fs_devices = alloc_fs_devices(fsid);
72077211
if (IS_ERR(fs_devices))

0 commit comments

Comments
 (0)