From 6afaed53cc9adde69d8a76ff5b4d740d5efbc54c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Jan 2023 14:56:39 +0000 Subject: [PATCH 1/3] btrfs: simplify update of last_dir_index_offset when logging a directory When logging a directory, we always set the inode's last_dir_index_offset to the offset of the last dir index item we found. This is using an extra field in the log context structure, and it makes more sense to update it only after we insert dir index items, and we could directly update the inode's last_dir_index_offset field instead. So make this simpler by updating the inode's last_dir_index_offset only when we actually insert dir index keys in the log tree, and getting rid of the last_dir_item_offset field in the log context structure. Reported-by: David Arendt Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/ Reported-by: Maxim Mikityanskiy Link: https://lore.kernel.org/linux-btrfs/Y8voyTXdnPDz8xwY@mail.gmail.com/ Reported-by: Hunter Wardlaw Link: https://bugzilla.suse.com/show_bug.cgi?id=1207231 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216851 CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 23 +++++++++++++++++------ fs/btrfs/tree-log.h | 2 -- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d43261545264..58599189bd18 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3576,17 +3576,19 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, } static int flush_dir_items_batch(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct btrfs_inode *inode, struct extent_buffer *src, struct btrfs_path *dst_path, int start_slot, int count) { + struct btrfs_root *log = inode->root->log_root; char *ins_data = NULL; struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; unsigned long dst_offset; + u64 last_index; struct btrfs_key key; u32 item_size; int ret; @@ -3644,6 +3646,19 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); btrfs_release_path(dst_path); + + last_index = batch.keys[count - 1].offset; + ASSERT(last_index > inode->last_dir_index_offset); + + /* + * If for some unexpected reason the last item's index is not greater + * than the last index we logged, warn and return an error to fallback + * to a transaction commit. + */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) + ret = -EUCLEAN; + else + inode->last_dir_index_offset = last_index; out: kfree(ins_data); @@ -3693,7 +3708,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, } di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - ctx->last_dir_item_offset = key.offset; /* * Skip ranges of items that consist only of dir item keys created @@ -3756,7 +3770,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, if (batch_size > 0) { int ret; - ret = flush_dir_items_batch(trans, log, src, dst_path, + ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) return ret; @@ -4044,7 +4058,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = BTRFS_DIR_START_INDEX; max_key = 0; - ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { ret = log_dir_items(trans, inode, path, dst_path, @@ -4056,8 +4069,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = max_key + 1; } - inode->last_dir_index_offset = ctx->last_dir_item_offset; - return 0; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85b43075ac58..85cd24cb0540 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -24,8 +24,6 @@ struct btrfs_log_ctx { bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; - /* Tracks the last logged dir item/index key offset. */ - u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ From 519b7e13b5ae8dd38da1e52275705343be6bb508 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 23 Jan 2023 16:54:46 +0000 Subject: [PATCH 2/3] btrfs: lock the inode in shared mode before starting fiemap Currently fiemap does not take the inode's lock (VFS lock), it only locks a file range in the inode's io tree. This however can lead to a deadlock if we have a concurrent fsync on the file and fiemap code triggers a fault when accessing the user space buffer with fiemap_fill_next_extent(). The deadlock happens on the inode's i_mmap_lock semaphore, which is taken both by fsync and btrfs_page_mkwrite(). This deadlock was recently reported by syzbot and triggers a trace like the following: task:syz-executor361 state:D stack:20264 pid:5668 ppid:5119 flags:0x00004004 Call Trace: context_switch kernel/sched/core.c:5293 [inline] __schedule+0x995/0xe20 kernel/sched/core.c:6606 schedule+0xcb/0x190 kernel/sched/core.c:6682 wait_on_state fs/btrfs/extent-io-tree.c:707 [inline] wait_extent_bit+0x577/0x6f0 fs/btrfs/extent-io-tree.c:751 lock_extent+0x1c2/0x280 fs/btrfs/extent-io-tree.c:1742 find_lock_delalloc_range+0x4e6/0x9c0 fs/btrfs/extent_io.c:488 writepage_delalloc+0x1ef/0x540 fs/btrfs/extent_io.c:1863 __extent_writepage+0x736/0x14e0 fs/btrfs/extent_io.c:2174 extent_write_cache_pages+0x983/0x1220 fs/btrfs/extent_io.c:3091 extent_writepages+0x219/0x540 fs/btrfs/extent_io.c:3211 do_writepages+0x3c3/0x680 mm/page-writeback.c:2581 filemap_fdatawrite_wbc+0x11e/0x170 mm/filemap.c:388 __filemap_fdatawrite_range mm/filemap.c:421 [inline] filemap_fdatawrite_range+0x175/0x200 mm/filemap.c:439 btrfs_fdatawrite_range fs/btrfs/file.c:3850 [inline] start_ordered_ops fs/btrfs/file.c:1737 [inline] btrfs_sync_file+0x4ff/0x1190 fs/btrfs/file.c:1839 generic_write_sync include/linux/fs.h:2885 [inline] btrfs_do_write_iter+0xcd3/0x1280 fs/btrfs/file.c:1684 call_write_iter include/linux/fs.h:2189 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x7dc/0xc50 fs/read_write.c:584 ksys_write+0x177/0x2a0 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f7d4054e9b9 RSP: 002b:00007f7d404fa2f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007f7d405d87a0 RCX: 00007f7d4054e9b9 RDX: 0000000000000090 RSI: 0000000020000000 RDI: 0000000000000006 RBP: 00007f7d405a51d0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 61635f65646f6e69 R13: 65646f7475616f6e R14: 7261637369646f6e R15: 00007f7d405d87a8 INFO: task syz-executor361:5697 blocked for more than 145 seconds. Not tainted 6.2.0-rc3-syzkaller-00376-g7c6984405241 #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor361 state:D stack:21216 pid:5697 ppid:5119 flags:0x00004004 Call Trace: context_switch kernel/sched/core.c:5293 [inline] __schedule+0x995/0xe20 kernel/sched/core.c:6606 schedule+0xcb/0x190 kernel/sched/core.c:6682 rwsem_down_read_slowpath+0x5f9/0x930 kernel/locking/rwsem.c:1095 __down_read_common+0x54/0x2a0 kernel/locking/rwsem.c:1260 btrfs_page_mkwrite+0x417/0xc80 fs/btrfs/inode.c:8526 do_page_mkwrite+0x19e/0x5e0 mm/memory.c:2947 wp_page_shared+0x15e/0x380 mm/memory.c:3295 handle_pte_fault mm/memory.c:4949 [inline] __handle_mm_fault mm/memory.c:5073 [inline] handle_mm_fault+0x1b79/0x26b0 mm/memory.c:5219 do_user_addr_fault+0x69b/0xcb0 arch/x86/mm/fault.c:1428 handle_page_fault arch/x86/mm/fault.c:1519 [inline] exc_page_fault+0x7a/0x110 arch/x86/mm/fault.c:1575 asm_exc_page_fault+0x22/0x30 arch/x86/include/asm/idtentry.h:570 RIP: 0010:copy_user_short_string+0xd/0x40 arch/x86/lib/copy_user_64.S:233 Code: 74 0a 89 (...) RSP: 0018:ffffc9000570f330 EFLAGS: 00050202 RAX: ffffffff843e6601 RBX: 00007fffffffefc8 RCX: 0000000000000007 RDX: 0000000000000000 RSI: ffffc9000570f3e0 RDI: 0000000020000120 RBP: ffffc9000570f490 R08: 0000000000000000 R09: fffff52000ae1e83 R10: fffff52000ae1e83 R11: 1ffff92000ae1e7c R12: 0000000000000038 R13: ffffc9000570f3e0 R14: 0000000020000120 R15: ffffc9000570f3e0 copy_user_generic arch/x86/include/asm/uaccess_64.h:37 [inline] raw_copy_to_user arch/x86/include/asm/uaccess_64.h:58 [inline] _copy_to_user+0xe9/0x130 lib/usercopy.c:34 copy_to_user include/linux/uaccess.h:169 [inline] fiemap_fill_next_extent+0x22e/0x410 fs/ioctl.c:144 emit_fiemap_extent+0x22d/0x3c0 fs/btrfs/extent_io.c:3458 fiemap_process_hole+0xa00/0xad0 fs/btrfs/extent_io.c:3716 extent_fiemap+0xe27/0x2100 fs/btrfs/extent_io.c:3922 btrfs_fiemap+0x172/0x1e0 fs/btrfs/inode.c:8209 ioctl_fiemap fs/ioctl.c:219 [inline] do_vfs_ioctl+0x185b/0x2980 fs/ioctl.c:810 __do_sys_ioctl fs/ioctl.c:868 [inline] __se_sys_ioctl+0x83/0x170 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f7d4054e9b9 RSP: 002b:00007f7d390d92f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f7d405d87b0 RCX: 00007f7d4054e9b9 RDX: 0000000020000100 RSI: 00000000c020660b RDI: 0000000000000005 RBP: 00007f7d405a51d0 R08: 00007f7d390d9700 R09: 0000000000000000 R10: 00007f7d390d9700 R11: 0000000000000246 R12: 61635f65646f6e69 R13: 65646f7475616f6e R14: 7261637369646f6e R15: 00007f7d405d87b8 What happens is the following: 1) Task A is doing an fsync, enters btrfs_sync_file() and flushes delalloc before locking the inode and the i_mmap_lock semaphore, that is, before calling btrfs_inode_lock(); 2) After task A flushes delalloc and before it calls btrfs_inode_lock(), another task dirties a page; 3) Task B starts a fiemap without FIEMAP_FLAG_SYNC, so the page dirtied at step 2 remains dirty and unflushed. Then when it enters extent_fiemap() and it locks a file range that includes the range of the page dirtied in step 2; 4) Task A calls btrfs_inode_lock() and locks the inode (VFS lock) and the inode's i_mmap_lock semaphore in write mode. Then it tries to flush delalloc by calling start_ordered_ops(), which will block, at find_lock_delalloc_range(), when trying to lock the range of the page dirtied at step 2, since this range was locked by the fiemap task (at step 3); 5) Task B generates a page fault when accessing the user space fiemap buffer with a call to fiemap_fill_next_extent(). The fault handler needs to call btrfs_page_mkwrite() for some other page of our inode, and there we deadlock when trying to lock the inode's i_mmap_lock semaphore in read mode, since the fsync task locked it in write mode (step 4) and the fsync task can not progress because it's waiting to lock a file range that is currently locked by us (the fiemap task, step 3). Fix this by taking the inode's lock (VFS lock) in shared mode when entering fiemap. This effectively serializes fiemap with fsync (except the most expensive part of fsync, the log sync), preventing this deadlock. Reported-by: syzbot+cc35f55c41e34c30dcb5@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/00000000000032dc7305f2a66f46@google.com/ CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9bd32daa9b9a..3bbf8703db2a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3826,6 +3826,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, lockend = round_up(start + len, inode->root->fs_info->sectorsize); prev_extent_end = lockstart; + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); @@ -4019,6 +4020,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); From 5f58d783fd7823b2c2d5954d1126e702f94bfc4c Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 20 Jan 2023 21:47:16 +0800 Subject: [PATCH 3/3] btrfs: free device in btrfs_close_devices for a single device filesystem We have this check to make sure we don't accidentally add older devices that may have disappeared and re-appeared with an older generation from being added to an fs_devices (such as a replace source device). This makes sense, we don't want stale disks in our file system. However for single disks this doesn't really make sense. I've seen this in testing, but I was provided a reproducer from a project that builds btrfs images on loopback devices. The loopback device gets cached with the new generation, and then if it is re-used to generate a new file system we'll fail to mount it because the new fs is "older" than what we have in cache. Fix this by freeing the cache when closing the device for a single device filesystem. This will ensure that the mount command passed device path is scanned successfully during the next mount. CC: stable@vger.kernel.org # 5.10+ Reported-by: Daan De Meyer Signed-off-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4cdadf30163d..df43093b7a46 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -403,6 +403,7 @@ void btrfs_free_device(struct btrfs_device *device) static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; + WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next, @@ -1181,9 +1182,22 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) + if (!fs_devices->opened) { list_splice_init(&fs_devices->seed_list, &list); + /* + * If the struct btrfs_fs_devices is not assembled with any + * other device, it can be re-initialized during the next mount + * without the needing device-scan step. Therefore, it can be + * fully freed. + */ + if (fs_devices->num_devices == 1) { + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); + } + } + + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); list_del(&fs_devices->seed_list);