From a20d1cebb98bba75f2e34fddc768dd8712c1bded Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Mon, 3 May 2021 15:37:33 -0400 Subject: [PATCH 01/23] jbd2: fix portability problems caused by unaligned accesses This commit applies the e2fsck/recovery.c portions of commit 1e0c8ca7c08a ("e2fsck: fix portability problems caused by unaligned accesses) from the e2fsprogs git tree. The on-disk format for the ext4 journal can have unaigned 32-bit integers. This can happen when replaying a journal using a obsolete checksum format (which was never popularly used, since the v3 format replaced v2 while the metadata checksum feature was being stablized). Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/jbd2/recovery.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d47a0d96bf30..4c4209262437 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -196,7 +196,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) static int count_tags(journal_t *journal, struct buffer_head *bh) { char * tagp; - journal_block_tag_t * tag; + journal_block_tag_t tag; int nr = 0, size = journal->j_blocksize; int tag_bytes = journal_tag_bytes(journal); @@ -206,14 +206,14 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data + tag_bytes) <= size) { - tag = (journal_block_tag_t *) tagp; + memcpy(&tag, tagp, sizeof(tag)); nr++; tagp += tag_bytes; - if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) + if (!(tag.t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) tagp += 16; - if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) + if (tag.t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) break; } @@ -433,9 +433,9 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) } static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, + journal_block_tag3_t *tag3, void *buf, __u32 sequence) { - journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; __u32 csum32; __be32 seq; @@ -496,7 +496,7 @@ static int do_one_pass(journal_t *journal, while (1) { int flags; char * tagp; - journal_block_tag_t * tag; + journal_block_tag_t tag; struct buffer_head * obh; struct buffer_head * nbh; @@ -613,8 +613,8 @@ static int do_one_pass(journal_t *journal, <= journal->j_blocksize - descr_csum_size) { unsigned long io_block; - tag = (journal_block_tag_t *) tagp; - flags = be16_to_cpu(tag->t_flags); + memcpy(&tag, tagp, sizeof(tag)); + flags = be16_to_cpu(tag.t_flags); io_block = next_log_block++; wrap(journal, next_log_block); @@ -632,7 +632,7 @@ static int do_one_pass(journal_t *journal, J_ASSERT(obh != NULL); blocknr = read_tag_block(journal, - tag); + &tag); /* If the block has been * revoked, then we're all done @@ -647,8 +647,8 @@ static int do_one_pass(journal_t *journal, /* Look for block corruption */ if (!jbd2_block_tag_csum_verify( - journal, tag, obh->b_data, - be32_to_cpu(tmp->h_sequence))) { + journal, &tag, (journal_block_tag3_t *)tagp, + obh->b_data, be32_to_cpu(tmp->h_sequence))) { brelse(obh); success = -EFSBADCRC; printk(KERN_ERR "JBD2: Invalid " From 390add0cc9f4d7fda89cf3db7651717e82cf0afc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Tue, 10 Aug 2021 12:55:51 -0400 Subject: [PATCH 02/23] jbd2: fix clang warning in recovery.c Remove unused variable store which was never used. This fix is also in e2fsprogs commit 99a2294f85f0 ("e2fsck: value stored to err is never read"). 
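The pattern patch 01 switches to -- copying the on-disk tag into an aligned local with memcpy() instead of dereferencing a cast pointer -- can be shown with a minimal user-space sketch; struct disk_tag here is a hypothetical stand-in for journal_block_tag_t, not the real layout:

#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for journal_block_tag_t: an on-disk record that can
 * begin at any byte offset within the journal block. */
struct disk_tag {
    uint32_t t_blocknr;
    uint16_t t_checksum;
    uint16_t t_flags;
};

/* Old style: undefined behaviour (and a trap on strict-alignment CPUs)
 * whenever tagp is not suitably aligned. */
static uint16_t read_flags_cast(const char *tagp)
{
    return ((const struct disk_tag *)tagp)->t_flags;
}

/* Style used by the patch: memcpy() into an aligned stack copy, then read
 * the fields from the copy. */
static uint16_t read_flags_copy(const char *tagp)
{
    struct disk_tag tag;

    memcpy(&tag, tagp, sizeof(tag));
    return tag.t_flags;
}

On architectures that tolerate unaligned loads the small fixed-size memcpy() is typically folded into an ordinary load by the compiler, so the safe form costs essentially nothing.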
Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/jbd2/recovery.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 4c4209262437..ba979fcf1cd3 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -760,7 +760,6 @@ static int do_one_pass(journal_t *journal, */ jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); - err = 0; brelse(bh); goto done; } From 4009cc7ad6b5f8a260e46cdaabb3763f2e6ca2e0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Tue, 10 Aug 2021 14:02:33 -0400 Subject: [PATCH 03/23] jbd2: clean up two gcc -Wall warnings in recovery.c Fix a signed vs unsigned and a void * pointer arithmetic warning. This cleanup is also in e2fsprogs commit aec460db9a93 ("e2fsck: clean up two gcc -Wall warnings in recovery.c"). Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/jbd2/recovery.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index ba979fcf1cd3..8ca3527189f8 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -179,8 +179,8 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) if (!jbd2_journal_has_csum_v2or3(j)) return 1; - tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - - sizeof(struct jbd2_journal_block_tail)); + tail = (struct jbd2_journal_block_tail *)((char *)buf + + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); provided = tail->t_checksum; tail->t_checksum = 0; calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); @@ -896,7 +896,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, { jbd2_journal_revoke_header_t *header; int offset, max; - int csum_size = 0; + unsigned csum_size = 0; __u32 rcount; int record_len = 4; From bd2eea8d0a6b6a9aca22f20bf74f73b71d8808af Mon Sep 17 00:00:00 2001 From: Wang Jianchao <wangjianchao@kuaishou.com> Date: Sat, 24 Jul 2021 15:41:20 +0800 Subject: [PATCH 04/23] ext4: remove the 'group' parameter of ext4_trim_extent Get rid of the 'group' parameter of ext4_trim_extent as we can get it from the 'e4b'. Reviewed-by: Andreas Dilger <adilger@dilger.ca> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210724074124.25731-2-jianchao.wan9@gmail.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/mballoc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 089c958aa2c3..018d5d3c6eeb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6183,19 +6183,19 @@ error_return: * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM - * @group: alloc. group we are working with * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To assure that no * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. 
*/ -static int ext4_trim_extent(struct super_block *sb, int start, int count, - ext4_group_t group, struct ext4_buddy *e4b) +static int ext4_trim_extent(struct super_block *sb, + int start, int count, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; + ext4_group_t group = e4b->bd_group; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -6271,8 +6271,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, next - start, &e4b); if (ret && ret != -EOPNOTSUPP) break; ret = 0; From 6920b3913235f517728bb69abe9b39047a987113 Mon Sep 17 00:00:00 2001 From: Wang Jianchao <wangjianchao@kuaishou.com> Date: Sat, 24 Jul 2021 15:41:21 +0800 Subject: [PATCH 05/23] ext4: add new helper interface ext4_try_to_trim_range() There is no functional change in this patch but just split the codes, which serachs free block and does trim, into a new function ext4_try_to_trim_range. This is preparing for the following async backgroup discard. Reviewed-by: Andreas Dilger <adilger@dilger.ca> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210724074124.25731-3-jianchao.wan9@gmail.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/mballoc.c | 102 ++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 45 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 018d5d3c6eeb..e3844152a643 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6218,6 +6218,54 @@ __acquires(bitlock) return ret; } +static int ext4_try_to_trim_range(struct super_block *sb, + struct ext4_buddy *e4b, ext4_grpblk_t start, + ext4_grpblk_t max, ext4_grpblk_t minblocks) +{ + ext4_grpblk_t next, count, free_count; + void *bitmap; + int ret = 0; + + bitmap = e4b->bd_bitmap; + start = (e4b->bd_info->bb_first_free > start) ? + e4b->bd_info->bb_first_free : start; + count = 0; + free_count = 0; + + while (start <= max) { + start = mb_find_next_zero_bit(bitmap, max + 1, start); + if (start > max) + break; + next = mb_find_next_bit(bitmap, max + 1, start); + + if ((next - start) >= minblocks) { + ret = ext4_trim_extent(sb, start, next - start, e4b); + if (ret && ret != -EOPNOTSUPP) + break; + ret = 0; + count += next - start; + } + free_count += next - start; + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if (need_resched()) { + ext4_unlock_group(sb, e4b->bd_group); + cond_resched(); + ext4_lock_group(sb, e4b->bd_group); + } + + if ((e4b->bd_info->bb_free - free_count) < minblocks) + break; + } + + return count; +} + /** * ext4_trim_all_free -- function to trim all free space in alloc. 
group * @sb: super block for file system @@ -6241,10 +6289,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { - void *bitmap; - ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; - int ret = 0; + int ret; trace_ext4_trim_all_free(sb, group, start, max); @@ -6254,57 +6300,23 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ret, group); return ret; } - bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); - if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && - minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) - goto out; - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; - - while (start <= max) { - start = mb_find_next_zero_bit(bitmap, max + 1, start); - if (start > max) - break; - next = mb_find_next_bit(bitmap, max + 1, start); - - if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, next - start, &e4b); - if (ret && ret != -EOPNOTSUPP) - break; - ret = 0; - count += next - start; - } - free_count += next - start; - start = next + 1; - - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } - - if (need_resched()) { - ext4_unlock_group(sb, group); - cond_resched(); - ext4_lock_group(sb, group); - } - - if ((e4b.bd_info->bb_free - free_count) < minblocks) - break; + if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || + minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) { + ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); + if (ret >= 0) + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } else { + ret = 0; } - if (!ret) { - ret = count; - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } -out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", - count, group); + ret, group); return ret; } From b6f5558c304a17d4dfad096da980de6e222a462a Mon Sep 17 00:00:00 2001 From: Wang Jianchao <wangjianchao@kuaishou.com> Date: Sat, 24 Jul 2021 15:41:22 +0800 Subject: [PATCH 06/23] ext4: remove the repeated comment of ext4_trim_all_free Reviewed-by: Andreas Dilger <adilger@dilger.ca> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210724074124.25731-4-jianchao.wan9@gmail.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/mballoc.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e3844152a643..34be2f07449d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6274,15 +6274,10 @@ static int ext4_try_to_trim_range(struct super_block *sb, * @max: last group block to examine * @minblocks: minimum extent block count * - * ext4_trim_all_free walks through group's buddy bitmap searching for free - * extents. When the free block is found, ext4_trim_extent is called to TRIM - * the extent. - * - * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy * bitmap. Then issue a TRIM command on this extent and free the extent in - * the group buddy bitmap. This is done until whole group is scanned. + * the group buddy bitmap. 
*/ static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, From 55cdd0af2bc5ffc92a2deb745627755aecd5db33 Mon Sep 17 00:00:00 2001 From: Wang Jianchao <wangjianchao@kuaishou.com> Date: Sat, 24 Jul 2021 15:41:23 +0800 Subject: [PATCH 07/23] ext4: get discard out of jbd2 commit kthread contex Right now, discard is issued and waited to be completed in jbd2 commit kthread context after the logs are committed. When large amount of files are deleted and discard is flooding, jbd2 commit kthread can be blocked for long time. Then all of the metadata operations can be blocked to wait the log space. One case is the page fault path with read mm->mmap_sem held, which wants to update the file time but has to wait for the log space. When other threads in the task wants to do mmap, then write mmap_sem is blocked. Finally all of the following read mmap_sem requirements are blocked, even the ps command which need to read the /proc/pid/ -cmdline. Our monitor service which needs to read /proc/pid/cmdline used to be blocked for 5 mins. This patch frees the blocks back to buddy after commit and then do discard in a async kworker context in fstrim fashion, namely, - mark blocks to be discarded as used if they have not been allocated - do discard - mark them free After this, jbd2 commit kthread won't be blocked any more by discard and we won't get NOSPC even if the discard is slow or throttled. Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2 Suggested-by: Theodore Ts'o <tytso@mit.edu> Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com> Link: https://lore.kernel.org/r/20210830075246.12516-5-jianchao.wan9@gmail.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/ext4.h | 2 + fs/ext4/mballoc.c | 103 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 79 insertions(+), 26 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c51e243450d..6b678b968d84 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1536,6 +1536,8 @@ struct ext4_sb_info { unsigned int s_mb_free_pending; struct list_head s_freed_data_list; /* List of blocks to be freed after commit completed */ + struct list_head s_discard_list; + struct work_struct s_discard_work; struct rb_root s_mb_avg_fragment_size_root; rwlock_t s_mb_rb_lock; struct list_head *s_mb_largest_free_orders; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 34be2f07449d..907b3577988c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -408,6 +408,10 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr); +static int ext4_try_to_trim_range(struct super_block *sb, + struct ext4_buddy *e4b, ext4_grpblk_t start, + ext4_grpblk_t max, ext4_grpblk_t minblocks); + /* * The algorithm using this percpu seq counter goes below: * 1. 
We sample the percpu discard_pa_seq counter before trying for block @@ -3308,6 +3312,55 @@ static int ext4_groupinfo_create_slab(size_t size) return 0; } +static void ext4_discard_work(struct work_struct *work) +{ + struct ext4_sb_info *sbi = container_of(work, + struct ext4_sb_info, s_discard_work); + struct super_block *sb = sbi->s_sb; + struct ext4_free_data *fd, *nfd; + struct ext4_buddy e4b; + struct list_head discard_list; + ext4_group_t grp, load_grp; + int err = 0; + + INIT_LIST_HEAD(&discard_list); + spin_lock(&sbi->s_md_lock); + list_splice_init(&sbi->s_discard_list, &discard_list); + spin_unlock(&sbi->s_md_lock); + + load_grp = UINT_MAX; + list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { + /* + * If filesystem is umounting or no memory, give up the discard + */ + if ((sb->s_flags & SB_ACTIVE) && !err) { + grp = fd->efd_group; + if (grp != load_grp) { + if (load_grp != UINT_MAX) + ext4_mb_unload_buddy(&e4b); + + err = ext4_mb_load_buddy(sb, grp, &e4b); + if (err) { + kmem_cache_free(ext4_free_data_cachep, fd); + load_grp = UINT_MAX; + continue; + } else { + load_grp = grp; + } + } + + ext4_lock_group(sb, grp); + ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, + fd->efd_start_cluster + fd->efd_count - 1, 1); + ext4_unlock_group(sb, grp); + } + kmem_cache_free(ext4_free_data_cachep, fd); + } + + if (load_grp != UINT_MAX) + ext4_mb_unload_buddy(&e4b); +} + int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3376,6 +3429,8 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list); + INIT_LIST_HEAD(&sbi->s_discard_list); + INIT_WORK(&sbi->s_discard_work, ext4_discard_work); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; @@ -3474,6 +3529,14 @@ int ext4_mb_release(struct super_block *sb) struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; + if (test_opt(sb, DISCARD)) { + /* + * wait the discard work to drain all of ext4_free_data + */ + flush_work(&sbi->s_discard_work); + WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); + } + if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); @@ -3596,7 +3659,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb, put_page(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->efd_group); - kmem_cache_free(ext4_free_data_cachep, entry); ext4_mb_unload_buddy(&e4b); mb_debug(sb, "freed %d blocks in %d structures\n", count, @@ -3611,10 +3673,9 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; - struct bio *discard_bio = NULL; struct list_head freed_data_list; struct list_head *cut_pos = NULL; - int err; + bool wake; INIT_LIST_HEAD(&freed_data_list); @@ -3629,30 +3690,20 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) cut_pos); spin_unlock(&sbi->s_md_lock); - if (test_opt(sb, DISCARD)) { - list_for_each_entry(entry, &freed_data_list, efd_list) { - err = ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, - &discard_bio); - if (err && err != -EOPNOTSUPP) { - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%d failed" - " with %d", entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, err); - } else if (err == -EOPNOTSUPP) - break; - } - - if (discard_bio) { - submit_bio_wait(discard_bio); - bio_put(discard_bio); - 
} - } - - list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) + list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); + + if (test_opt(sb, DISCARD)) { + spin_lock(&sbi->s_md_lock); + wake = list_empty(&sbi->s_discard_list); + list_splice_tail(&freed_data_list, &sbi->s_discard_list); + spin_unlock(&sbi->s_md_lock); + if (wake) + queue_work(system_unbound_wq, &sbi->s_discard_work); + } else { + list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) + kmem_cache_free(ext4_free_data_cachep, entry); + } } int __init ext4_init_mballoc(void) From 5036ab8df278f9879d8958679bd043e32515a3e4 Mon Sep 17 00:00:00 2001 From: Wang Jianchao <wangjianchao@kuaishou.com> Date: Mon, 30 Aug 2021 15:52:46 +0800 Subject: [PATCH 08/23] ext4: flush background discard kwork when retry allocation The background discard kwork tries to mark blocks used and issue discard. This can make filesystem suffer from NOSPC error, xfstest generic/371 can fail due to it. Fix it by flushing discard kwork in ext4_should_retry_alloc. At the same time, give up discard at the moment. Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com> Link: https://lore.kernel.org/r/20210830075246.12516-6-jianchao.wan9@gmail.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/balloc.c | 8 +++++++- fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 7 +++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9dc6e74b265c..a0fb0c4bdc7c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -652,8 +652,14 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * possible we just missed a transaction commit that did so */ smp_mb(); - if (sbi->s_mb_free_pending == 0) + if (sbi->s_mb_free_pending == 0) { + if (test_opt(sb, DISCARD)) { + atomic_inc(&sbi->s_retry_alloc_pending); + flush_work(&sbi->s_discard_work); + atomic_dec(&sbi->s_retry_alloc_pending); + } return ext4_has_free_clusters(sbi, 1, 0); + } /* * it's possible we've just missed a transaction commit here, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6b678b968d84..d71dcac3b97f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1538,6 +1538,7 @@ struct ext4_sb_info { after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; + atomic_t s_retry_alloc_pending; struct rb_root s_mb_avg_fragment_size_root; rwlock_t s_mb_rb_lock; struct list_head *s_mb_largest_free_orders; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 907b3577988c..34670cb63588 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3331,9 +3331,11 @@ static void ext4_discard_work(struct work_struct *work) load_grp = UINT_MAX; list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { /* - * If filesystem is umounting or no memory, give up the discard + * If filesystem is umounting or no memory or suffering + * from no space, give up the discard */ - if ((sb->s_flags & SB_ACTIVE) && !err) { + if ((sb->s_flags & SB_ACTIVE) && !err && + !atomic_read(&sbi->s_retry_alloc_pending)) { grp = fd->efd_group; if (grp != load_grp) { if (load_grp != UINT_MAX) @@ -3431,6 +3433,7 @@ int ext4_mb_init(struct super_block *sb) INIT_LIST_HEAD(&sbi->s_freed_data_list); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); + atomic_set(&sbi->s_retry_alloc_pending, 0); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; From facec450a8243cd3310ff8a8b9bb2f71d35df9e9 Mon Sep 17 00:00:00 2001 
From: Guoqing Jiang <jiangguoqing@kylinos.cn> Date: Tue, 27 Jul 2021 16:07:08 +0800 Subject: [PATCH 09/23] ext4: reduce arguments of ext4_fc_add_dentry_tlv Let's pass fc_dentry directly since those arguments (tag, parent_ino and ino etc) can be deferenced from it. Signed-off-by: Guoqing Jiang <jiangguoqing@kylinos.cn> Reviewed-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Link: https://lore.kernel.org/r/20210727080708.3708814-1-guoqing.jiang@linux.dev --- fs/ext4/fast_commit.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index e8195229c252..8e610a381862 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -775,28 +775,27 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, } /* Same as above, but adds dentry tlv. */ -static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag, - int parent_ino, int ino, int dlen, - const unsigned char *dname, - u32 *crc) +static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, + struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; + int dlen = fc_dentry->fcd_name.len; u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, crc); if (!dst) return false; - fcd.fc_parent_ino = cpu_to_le32(parent_ino); - fcd.fc_ino = cpu_to_le32(ino); - tl.fc_tag = cpu_to_le16(tag); + fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); + fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); + tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); dst += sizeof(tl); ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); dst += sizeof(fcd); - ext4_fc_memcpy(sb, dst, dname, dlen, crc); + ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); dst += dlen; return true; @@ -992,11 +991,7 @@ __releases(&sbi->s_fc_lock) &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv( - sb, fc_dentry->fcd_op, - fc_dentry->fcd_parent, fc_dentry->fcd_ino, - fc_dentry->fcd_name.len, - fc_dentry->fcd_name.name, crc)) { + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } @@ -1035,11 +1030,7 @@ __releases(&sbi->s_fc_lock) if (ret) goto lock_and_exit; - if (!ext4_fc_add_dentry_tlv( - sb, fc_dentry->fcd_op, - fc_dentry->fcd_parent, fc_dentry->fcd_ino, - fc_dentry->fcd_name.len, - fc_dentry->fcd_name.name, crc)) { + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } From 308c57ccf4318236be75dfa251c84713e694457b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Fri, 13 Aug 2021 11:20:48 -0400 Subject: [PATCH 10/23] ext4: if zeroout fails fall back to splitting the extent node If the underlying storage device is using thin-provisioning, it's possible for a zeroout operation to return ENOSPC. Commit df22291ff0fd ("ext4: Retry block allocation if we have free blocks left") added logic to retry block allocation since we might get free block after we commit a transaction. But the ENOSPC from thin-provisioning will confuse ext4, and lead to an infinite loop. Since using zeroout instead of splitting the extent node is an optimization, if it fails, we might as well fall back to splitting the extent node. 
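The control-flow change in patch 10 amounts to treating a failed zeroout as "optimization unavailable" rather than as a fatal error. A compressed sketch of that shape, with try_zeroout() and split_extent() as illustrative stubs for ext4_ext_zeroout() and ext4_split_extent():

#include <errno.h>

/* Illustrative stubs; the real functions live in fs/ext4/extents.c. */
static int try_zeroout(void)  { return -ENOSPC; } /* e.g. thin-provisioned device */
static int split_extent(void) { return 0; }

static int convert_extent(void)
{
    int err = try_zeroout();

    if (!err)
        return 0;
    /* Before the patch this was effectively "goto out" and the ENOSPC
     * escaped to the caller, triggering the retry loop; now the slower
     * but always-correct split path runs instead. */
    return split_extent();
}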
Reported-by: yangerkun <yangerkun@huawei.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/extents.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 92ad64b89d9b..501516cadc1b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3569,7 +3569,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) - goto out; + goto fallback; split_map.m_len = allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < @@ -3583,7 +3583,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex2); if (err) - goto out; + goto fallback; } split_map.m_len += split_map.m_lblk - ee_block; @@ -3592,6 +3592,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } } +fallback: err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (err > 0) From b2bbb92f7042e8075fb036bf97043339576330c3 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 12 Aug 2021 14:47:37 +0200 Subject: [PATCH 11/23] ext4: fix e2fsprogs checksum failure for mounted filesystem Commit 81414b4dd48 ("ext4: remove redundant sb checksum recomputation") removed checksum recalculation after updating superblock free space / inode counters in ext4_fill_super() based on the fact that we will recalculate the checksum on superblock writeout. That is correct assumption but until the writeout happens (which can take a long time) the checksum is incorrect in the buffer cache and if programs such as tune2fs or resize2fs is called shortly after a file system is mounted can fail. So return back the checksum recalculation and add a comment explaining why. Fixes: 81414b4dd48f ("ext4: remove redundant sb checksum recomputation") Cc: stable@kernel.org Reported-by: Boyang Xue <bxue@redhat.com> Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Link: https://lore.kernel.org/r/20210812124737.21981-1-jack@suse.cz --- fs/ext4/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dfa09a277b56..970013c93d3e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5032,6 +5032,14 @@ no_journal: err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } + /* + * Update the checksum after updating free space/inode + * counters. Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail. + */ + ext4_superblock_csum_set(sb); if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb), GFP_KERNEL); From bd2c38cf1726ea913024393a0d11f2e2a3f4c180 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 12 Aug 2021 15:31:22 +0200 Subject: [PATCH 12/23] ext4: Make sure quota files are not grabbed accidentally If ext4 filesystem is corrupted so that quota files are linked from directory hirerarchy, bad things can happen. E.g. quota files can get corrupted or deleted. Make sure we are not grabbing quota file inodes when we expect normal inodes. 
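The guard patch 12 adds to __ext4_iget() reduces to a predicate over the inode number: reserved quota inodes are only reachable when the caller passes EXT4_IGET_SPECIAL. A self-contained sketch of that predicate, with simplified parameter names:

#include <stdbool.h>
#include <stdint.h>

#define IGET_SPECIAL 0x1    /* caller explicitly asked for a reserved inode */

/* Simplified form of the check added in patch 12: an ordinary lookup must
 * never resolve to one of the quota file inodes recorded in the superblock. */
static bool iget_ino_ok(uint32_t ino, uint32_t usr_quota_ino,
                        uint32_t grp_quota_ino, uint32_t prj_quota_ino,
                        unsigned int flags)
{
    if (flags & IGET_SPECIAL)
        return true;
    return ino != usr_quota_ino &&
           ino != grp_quota_ino &&
           ino != prj_quota_ino;
}

A corrupted directory entry pointing at a quota file then fails the lookup (the real code returns -ESTALE or reports EFSCORRUPTED) instead of handing the quota inode out as a regular file.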
Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Link: https://lore.kernel.org/r/20210812133122.26360-1-jack@suse.cz --- fs/ext4/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d8de607849df..2c33c795c4a7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4603,6 +4603,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; @@ -4613,9 +4614,12 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && - (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || + ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum))) || (ino < EXT4_ROOT_INO) || - (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { + (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, From a5fda11338180db13f3e9eec20c9deda1f7bad72 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Sat, 14 Aug 2021 10:41:30 -0400 Subject: [PATCH 13/23] ext4: fix sparse warnings Add sparse annotations to suppress false positive context imbalance warnings, and use NULL instead of 0 in EXT_MAX_{EXTENT,INDEX}. Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/ext4_extents.h | 5 +++-- fs/ext4/mballoc.c | 26 ++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 44e59881a1f0..26435f3a3094 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -173,10 +173,11 @@ struct partial_cluster { #define EXT_MAX_EXTENT(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ - : 0) + : NULL) #define EXT_MAX_INDEX(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ - ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0) + ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ + : NULL) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 34670cb63588..665646a12e01 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2478,6 +2478,12 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, * This could return negative error code if something goes wrong * during ext4_mb_init_group(). This should not be called with * ext4_lock_group() held. + * + * Note: because we are conditionally operating with the group lock in + * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this + * function using __acquire and __release. This means we need to be + * super careful before messing with the error path handling via "goto + * out"! 
*/ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, int cr) @@ -2491,8 +2497,10 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); - if (should_lock) + if (should_lock) { ext4_lock_group(sb, group); + __release(ext4_group_lock_ptr(sb, group)); + } free = grp->bb_free; if (free == 0) goto out; @@ -2500,8 +2508,10 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; - if (should_lock) + if (should_lock) { + __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); + } /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { @@ -2528,12 +2538,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, return ret; } - if (should_lock) + if (should_lock) { ext4_lock_group(sb, group); + __release(ext4_group_lock_ptr(sb, group)); + } ret = ext4_mb_good_group(ac, group, cr); out: - if (should_lock) + if (should_lock) { + __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); + } return ret; } @@ -2969,6 +2983,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) +__acquires(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = PDE_DATA(file_inode(seq->file)); unsigned long position; @@ -3041,6 +3056,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +__releases(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = PDE_DATA(file_inode(seq->file)); @@ -6275,6 +6291,8 @@ __acquires(bitlock) static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) +__acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) +__releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count; void *bitmap; From b33d9f5909c8d30f1429fb9aefbb32760901a023 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Sat, 14 Aug 2021 10:54:09 -0400 Subject: [PATCH 14/23] jbd2: add sparse annotations for add_transaction_credits() Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/jbd2/transaction.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8804e126805f..5347411ae13e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -223,9 +223,15 @@ static void sub_reserved_credits(journal_t *journal, int blocks) * with j_state_lock held for reading. Returns 0 if handle joined the running * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and * caller must retry. + * + * Note: because j_state_lock may be dropped depending on the return + * value, we need to fake out sparse so ti doesn't complain about a + * locking imbalance. Callers of add_transaction_credits will need to + * make a similar accomodation. 
*/ static int add_transaction_credits(journal_t *journal, int blocks, int rsv_blocks) +__must_hold(&journal->j_state_lock) { transaction_t *t = journal->j_running_transaction; int needed; @@ -238,6 +244,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (t->t_state != T_RUNNING) { WARN_ON_ONCE(t->t_state >= T_FLUSH); wait_transaction_locked(journal); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -266,10 +273,12 @@ static int add_transaction_credits(journal_t *journal, int blocks, wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= journal->j_max_transaction_buffers); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } wait_transaction_locked(journal); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -293,6 +302,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, journal->j_max_transaction_buffers) __jbd2_log_wait_for_space(journal); write_unlock(&journal->j_state_lock); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -310,6 +320,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks <= journal->j_max_transaction_buffers / 2); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } return 0; @@ -413,8 +424,14 @@ repeat: if (!handle->h_reserved) { /* We may have dropped j_state_lock - restart in that case */ - if (add_transaction_credits(journal, blocks, rsv_blocks)) + if (add_transaction_credits(journal, blocks, rsv_blocks)) { + /* + * add_transaction_credits releases + * j_state_lock on a non-zero return + */ + __release(&journal->j_state_lock); goto repeat; + } } else { /* * We have handle reserved so we are allowed to join T_LOCKED From a54c4613dac1500b40e4ab55199f7c51f028e848 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Fri, 20 Aug 2021 23:44:17 -0400 Subject: [PATCH 15/23] ext4: fix race writing to an inline_data file while its xattrs are changing The location of the system.data extended attribute can change whenever xattr_sem is not taken. So we need to recalculate the i_inline_off field since it mgiht have changed between ext4_write_begin() and ext4_write_end(). This means that caching i_inline_off is probably not helpful, so in the long run we should probably get rid of it and shrink the in-memory ext4 inode slightly, but let's fix the race the simple way for now. 
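The race patch 15 closes is the classic "cached location goes stale across a lock drop" problem: i_inline_off is only meaningful while xattr_sem is held, so it has to be re-derived after the lock is retaken. A minimal user-space model of the same rule; the names here are illustrative, not ext4's:

#include <pthread.h>

struct obj {
    pthread_mutex_t lock;
    int data_off;       /* only stable while 'lock' is held */
};

/* Recompute the cached offset; stands in for ext4_find_inline_data_nolock(). */
static void refresh_offset(struct obj *o)
{
    o->data_off = 0;    /* placeholder: re-scan the metadata area here */
}

static void write_inline(struct obj *o)
{
    pthread_mutex_lock(&o->lock);
    /* The fix: never trust a value cached before the lock was dropped. */
    refresh_offset(o);
    /* ... write at o->data_off ... */
    pthread_mutex_unlock(&o->lock);
}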
Cc: stable@kernel.org Fixes: f19d5870cbf72 ("ext4: add normal write support for inline data") Reported-by: syzbot+13146364637c7363a7de@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/inline.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 70cb64db33f7..24e994e75f5c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -750,6 +750,12 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); + /* + * ei->i_inline_off may have changed since ext4_write_begin() + * called ext4_try_to_write_inline_data() + */ + (void) ext4_find_inline_data_nolock(inode); + kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, pos, len); kunmap_atomic(kaddr); From 188c299e2a26cc33747187f87c9e044dfd85a782 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Mon, 16 Aug 2021 11:57:04 +0200 Subject: [PATCH 16/23] ext4: Support for checksumming from journal triggers JBD2 layer support triggers which are called when journaling layer moves buffer to a certain state. We can use the frozen trigger, which gets called when buffer data is frozen and about to be written out to the journal, to compute block checksums for some buffer types (similarly as does ocfs2). This avoids unnecessary repeated recomputation of the checksum (at the cost of larger window where memory corruption won't be caught by checksumming) and is even necessary when there are unsynchronized updaters of the checksummed data. So add superblock and journal trigger type arguments to ext4_journal_get_write_access() and ext4_journal_get_create_access() so that frozen triggers can be set accordingly. Also add inode argument to ext4_walk_page_buffers() and all the callbacks used with that function for the same purpose. This patch is mostly only a change of prototype of the above mentioned functions and a few small helpers. Real checksumming will come later. Reviewed-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210816095713.16537-1-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/ext4.h | 26 ++++++++++++-- fs/ext4/ext4_jbd2.c | 43 +++++++++++++++------- fs/ext4/ext4_jbd2.h | 18 ++++++---- fs/ext4/extents.c | 12 ++++--- fs/ext4/file.c | 3 +- fs/ext4/ialloc.c | 19 ++++++---- fs/ext4/indirect.c | 15 +++++--- fs/ext4/inline.c | 26 +++++++++----- fs/ext4/inode.c | 84 +++++++++++++++++++++++++------------------ fs/ext4/ioctl.c | 4 ++- fs/ext4/mballoc.c | 15 ++++---- fs/ext4/namei.c | 40 +++++++++++++-------- fs/ext4/resize.c | 38 ++++++++++++-------- fs/ext4/super.c | 16 ++++++++- fs/ext4/xattr.c | 26 +++++++++----- fs/jbd2/transaction.c | 2 +- 16 files changed, 259 insertions(+), 128 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d71dcac3b97f..27b101fc7c86 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1447,6 +1447,24 @@ struct ext4_super_block { #define EXT4_ENC_UTF8_12_1 1 +/* Types of ext4 journal triggers */ +enum ext4_journal_trigger_type { + EXT4_JTR_NONE /* This must be the last entry for indexing to work! 
*/ +}; + +#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE + +struct ext4_journal_trigger { + struct jbd2_buffer_trigger_type tr_triggers; + struct super_block *sb; +}; + +static inline struct ext4_journal_trigger *EXT4_TRIGGER( + struct jbd2_buffer_trigger_type *trigger) +{ + return container_of(trigger, struct ext4_journal_trigger, tr_triggers); +} + /* * fourth extended-fs super-block data in memory */ @@ -1628,6 +1646,9 @@ struct ext4_sb_info { struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; + /* Journal triggers for checksum computation */ + struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; + /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; @@ -2923,13 +2944,14 @@ int ext4_get_block(struct inode *inode, sector_t iblock, int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, + struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, - int (*fn)(handle_t *handle, + int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)); -int do_journal_get_write_access(handle_t *handle, +int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b60f0152ea57..6def7339056d 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -218,9 +218,11 @@ static void ext4_check_bdev_write_error(struct super_block *sb) } int __ext4_journal_get_write_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh) + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type) { - int err = 0; + int err; might_sleep(); @@ -229,11 +231,18 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); - if (err) + if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + return err; + } } - return err; + if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + return 0; + BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); + jbd2_journal_set_triggers(bh, + &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); + return 0; } /* @@ -301,17 +310,27 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, } int __ext4_journal_get_create_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh) + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type) { - int err = 0; + int err; - if (ext4_handle_valid(handle)) { - err = jbd2_journal_get_create_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); + if (!ext4_handle_valid(handle)) + return 0; + + err = jbd2_journal_get_create_access(handle, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, bh, handle, + err); + return err; } - return err; + if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + return 0; + BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); + jbd2_journal_set_triggers(bh, + &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); + return 0; } int __ext4_handle_dirty_metadata(const char *where, unsigned int line, 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0d2fa423b7ad..0e4fa644df01 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -231,26 +231,32 @@ int ext4_expand_extra_isize(struct inode *inode, * Wrapper functions with which ext4 calls into JBD. */ int __ext4_journal_get_write_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type); int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type); int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh); -#define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) +#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \ + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \ + (bh), (trigger_type)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ (bh), (block_nr)) -#define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) +#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \ + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \ + (bh), (trigger_type)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 501516cadc1b..eb1dd4f024f2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -139,7 +139,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, if (path->p_bh) { /* path points to block */ BUFFER_TRACE(path->p_bh, "get_write_access"); - return ext4_journal_get_write_access(handle, path->p_bh); + return ext4_journal_get_write_access(handle, inode->i_sb, + path->p_bh, EXT4_JTR_NONE); } /* path points to leaf/index in inode body */ /* we use in-core data, no need to protect them */ @@ -1082,7 +1083,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } lock_buffer(bh); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) goto cleanup; @@ -1160,7 +1162,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } lock_buffer(bh); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) goto cleanup; @@ -1286,7 +1289,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, return -ENOMEM; lock_buffer(bh); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto out; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 816dedcbd541..eda12bc50592 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -822,7 +822,8 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (IS_ERR(handle)) goto out; 
BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto out_journal; lock_buffer(sbi->s_sbh); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e89fc0f770b0..f73e5eb43eae 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -300,7 +300,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } BUFFER_TRACE(bitmap_bh, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, bitmap_bh); + fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (fatal) goto error_return; @@ -308,7 +309,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) gdp = ext4_get_group_desc(sb, block_group, &bh2); if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, bh2); + fatal = ext4_journal_get_write_access(handle, sb, bh2, + EXT4_JTR_NONE); } ext4_lock_group(sb, block_group); cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); @@ -1085,7 +1087,8 @@ repeat_in_this_group: } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, + EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; @@ -1127,7 +1130,8 @@ got: } BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); + err = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; @@ -1144,7 +1148,8 @@ got: goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); - err = ext4_journal_get_write_access(handle, block_bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, + EXT4_JTR_NONE); if (err) { brelse(block_bitmap_bh); ext4_std_error(sb, err); @@ -1583,8 +1588,8 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, num = sbi->s_itb_per_group - used_blks; BUFFER_TRACE(group_desc_bh, "get_write_access"); - ret = ext4_journal_get_write_access(handle, - group_desc_bh); + ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); if (ret) goto err_out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a7bc6ad656a9..89efa78ed4b2 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -354,7 +354,8 @@ static int ext4_alloc_branch(handle_t *handle, } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, ar->inode->i_sb, + bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto failed; @@ -429,7 +430,8 @@ static int ext4_splice_branch(handle_t *handle, */ if (where->bh) { BUFFER_TRACE(where->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); + err = ext4_journal_get_write_access(handle, ar->inode->i_sb, + where->bh, EXT4_JTR_NONE); if (err) goto err_out; } @@ -728,7 +730,8 @@ static int ext4_ind_truncate_ensure_credits(handle_t *handle, return ret; if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ret = ext4_journal_get_write_access(handle, bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (unlikely(ret)) return ret; } @@ -916,7 +919,8 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); - err = 
ext4_journal_get_write_access(handle, this_bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, + this_bh, EXT4_JTR_NONE); /* Important: if we can't update the indirect pointers * to the blocks, we can't free them. */ if (err) @@ -1079,7 +1083,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, */ BUFFER_TRACE(parent_bh, "get_write_access"); if (!ext4_journal_get_write_access(handle, - parent_bh)){ + inode->i_sb, parent_bh, + EXT4_JTR_NONE)) { *p = 0; BUFFER_TRACE(parent_bh, "call ext4_handle_dirty_metadata"); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 24e994e75f5c..82bf4ff6be28 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -264,7 +264,8 @@ static int ext4_create_inline_data(handle_t *handle, return error; BUFFER_TRACE(is.iloc.bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); if (error) goto out; @@ -350,7 +351,8 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); if (error) goto out; @@ -427,7 +429,8 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); if (error) goto out; @@ -593,7 +596,7 @@ retry: ret = __block_write_begin(page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), + ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), from, to, NULL, do_journal_get_write_access); } @@ -682,7 +685,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, goto convert; } - ret = ext4_journal_get_write_access(handle, iloc.bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, + EXT4_JTR_NONE); if (ret) goto out; @@ -929,7 +933,8 @@ retry_journal: if (ret < 0) goto out_release_page; } - ret = ext4_journal_get_write_access(handle, iloc.bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, + EXT4_JTR_NONE); if (ret) goto out_release_page; @@ -1034,7 +1039,8 @@ static int ext4_add_dirent_to_inline(handle_t *handle, return err; BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh, + EXT4_JTR_NONE); if (err) return err; ext4_insert_dentry(dir, inode, de, inline_size, fname); @@ -1229,7 +1235,8 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, } lock_buffer(data_bh); - error = ext4_journal_get_create_access(handle, data_bh); + error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh, + EXT4_JTR_NONE); if (error) { unlock_buffer(data_bh); error = -EIO; @@ -1713,7 +1720,8 @@ int ext4_delete_inline_entry(handle_t *handle, } BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (err) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2c33c795c4a7..d47a57f3d8de 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -139,7 +139,6 @@ static inline int 
ext4_begin_ordered_truncate(struct inode *inode, static void ext4_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -869,7 +868,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; @@ -954,12 +954,12 @@ out_brelse: return err; } -int ext4_walk_page_buffers(handle_t *handle, +int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, - int (*fn)(handle_t *handle, + int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; @@ -978,7 +978,7 @@ int ext4_walk_page_buffers(handle_t *handle, *partial = 1; continue; } - err = (*fn)(handle, bh); + err = (*fn)(handle, inode, bh); if (!ret) ret = err; } @@ -1009,7 +1009,7 @@ int ext4_walk_page_buffers(handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. */ -int do_journal_get_write_access(handle_t *handle, +int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int dirty = buffer_dirty(bh); @@ -1028,7 +1028,8 @@ int do_journal_get_write_access(handle_t *handle, if (dirty) clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); - ret = ext4_journal_get_write_access(handle, bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (!ret && dirty) ret = ext4_handle_dirty_metadata(handle, NULL, bh); return ret; @@ -1208,8 +1209,8 @@ retry_journal: ret = __block_write_begin(page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), - from, to, NULL, + ret = ext4_walk_page_buffers(handle, inode, + page_buffers(page), from, to, NULL, do_journal_get_write_access); } @@ -1253,7 +1254,8 @@ retry_journal: } /* For write_end() in data=journal mode */ -static int write_end_fn(handle_t *handle, struct buffer_head *bh) +static int write_end_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) @@ -1352,6 +1354,7 @@ errout: * to call ext4_handle_dirty_metadata() instead. 
*/ static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, struct page *page, unsigned from, unsigned to) { @@ -1370,7 +1373,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, size = min(to, block_end) - start; zero_user(page, start, size); - write_end_fn(handle, bh); + write_end_fn(handle, inode, bh); } clear_buffer_new(bh); } @@ -1412,13 +1415,13 @@ static int ext4_journalled_write_end(struct file *file, copied = ret; } else if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; - ext4_journalled_zero_new_buffers(handle, page, from, to); + ext4_journalled_zero_new_buffers(handle, inode, page, from, to); } else { if (unlikely(copied < len)) - ext4_journalled_zero_new_buffers(handle, page, + ext4_journalled_zero_new_buffers(handle, inode, page, from + copied, to); - ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - from + copied, &partial, + ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), + from, from + copied, &partial, write_end_fn); if (!partial) SetPageUptodate(page); @@ -1619,7 +1622,8 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } @@ -1851,13 +1855,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int bget_one(handle_t *handle, struct buffer_head *bh) +static int bget_one(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { get_bh(bh); return 0; } -static int bput_one(handle_t *handle, struct buffer_head *bh) +static int bput_one(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { put_bh(bh); return 0; @@ -1888,7 +1894,7 @@ static int __ext4_journalled_writepage(struct page *page, BUG(); goto out; } - ext4_walk_page_buffers(handle, page_bufs, 0, len, + ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, bget_one); } /* @@ -1920,11 +1926,11 @@ static int __ext4_journalled_writepage(struct page *page, if (inline_data) { ret = ext4_mark_inode_dirty(handle, inode); } else { - ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, do_journal_get_write_access); - err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); + err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, write_end_fn); } if (ret == 0) ret = err; @@ -1941,7 +1947,7 @@ out: unlock_page(page); out_no_pagelock: if (!inline_data && page_bufs) - ext4_walk_page_buffers(NULL, page_bufs, 0, len, + ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, bput_one); brelse(inode_bh); return ret; @@ -2031,7 +2037,7 @@ static int ext4_writepage(struct page *page, * for the extremely common case, this is an optimization that * skips a useless round trip through ext4_bio_write_page(). 
*/ - if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); if ((current->flags & PF_MEMALLOC) || @@ -3794,7 +3800,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) goto unlock; } @@ -5146,7 +5153,9 @@ static int ext4_do_update_inode(handle_t *handle, ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + err = ext4_journal_get_write_access(handle, sb, + EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); if (err) goto out_brelse; lock_buffer(EXT4_SB(sb)->s_sbh); @@ -5747,7 +5756,8 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, + iloc->bh, EXT4_JTR_NONE); if (err) { brelse(iloc->bh); iloc->bh = NULL; @@ -5870,7 +5880,8 @@ int ext4_expand_extra_isize(struct inode *inode, ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, iloc->bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, + EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; @@ -6041,7 +6052,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } -static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { return !buffer_mapped(bh); } @@ -6114,7 +6126,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) * inode to the transaction's list to writeprotect pages on commit. 
*/ if (page_has_buffers(page)) { - if (!ext4_walk_page_buffers(NULL, page_buffers(page), + if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ @@ -6160,11 +6172,13 @@ retry_alloc: err = __block_write_begin(page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_walk_page_buffers(handle, page_buffers(page), - 0, len, NULL, do_journal_get_write_access)) + if (ext4_walk_page_buffers(handle, inode, + page_buffers(page), 0, len, NULL, + do_journal_get_write_access)) goto out_error; - if (ext4_walk_page_buffers(handle, page_buffers(page), - 0, len, NULL, write_end_fn)) + if (ext4_walk_page_buffers(handle, inode, + page_buffers(page), 0, len, NULL, + write_end_fn)) goto out_error; if (ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6eed6170aded..20aeff88cab6 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1154,7 +1154,9 @@ resizefs_out: err = PTR_ERR(handle); goto pwsalt_err_exit; } - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, + sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto pwsalt_err_journal; lock_buffer(sbi->s_sbh); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 665646a12e01..72bfac2d6dce 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3796,7 +3796,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (err) goto out_err; @@ -3809,7 +3810,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_free_group_clusters(sb, gdp)); BUFFER_TRACE(gdp_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdp_bh); + err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); if (err) goto out_err; @@ -5986,7 +5987,8 @@ do_more: } BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (err) goto error_return; @@ -5996,7 +5998,7 @@ do_more: * using it */ BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); + err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); if (err) goto error_return; #ifdef AGGRESSIVE_CHECK @@ -6177,7 +6179,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, } BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (err) goto error_return; @@ -6187,7 +6190,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * using it */ BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); + err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); if (err) goto error_return; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f3bbcd4efb56..dfb3ae61dfe8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -70,7 +70,8 @@ static struct buffer_head *ext4_append(handle_t *handle, inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = 
ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) { brelse(bh); ext4_std_error(inode->i_sb, err); @@ -1927,12 +1928,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, } BUFFER_TRACE(*bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, *bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, *bh, + EXT4_JTR_NONE); if (err) goto journal_error; BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, frame->bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh, + EXT4_JTR_NONE); if (err) goto journal_error; @@ -2109,7 +2112,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, return err; } BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (err) { ext4_std_error(dir->i_sb, err); return err; @@ -2167,7 +2171,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); BUFFER_TRACE(bh, "get_write_access"); - retval = ext4_journal_get_write_access(handle, bh); + retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (retval) { ext4_std_error(dir->i_sb, retval); brelse(bh); @@ -2419,7 +2424,7 @@ again: } BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto journal_error; @@ -2476,7 +2481,8 @@ again: node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, frame->bh); + err = ext4_journal_get_write_access(handle, sb, frame->bh, + EXT4_JTR_NONE); if (err) goto journal_error; if (!add_level) { @@ -2486,8 +2492,9 @@ again: icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ - err = ext4_journal_get_write_access(handle, - (frame - 1)->bh); + err = ext4_journal_get_write_access(handle, sb, + (frame - 1)->bh, + EXT4_JTR_NONE); if (err) goto journal_error; @@ -2636,7 +2643,8 @@ static int ext4_delete_entry(handle_t *handle, csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (unlikely(err)) goto out; @@ -3088,7 +3096,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto out; @@ -3186,7 +3195,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (prev == &sbi->s_orphan) { jbd_debug(4, "superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, inode->i_sb, + sbi->s_sbh, EXT4_JTR_NONE); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; @@ -3675,7 +3685,8 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) return -EFSCORRUPTED; 
BUFFER_TRACE(ent->dir_bh, "get_write_access"); - return ext4_journal_get_write_access(handle, ent->dir_bh); + return ext4_journal_get_write_access(handle, ent->dir->i_sb, + ent->dir_bh, EXT4_JTR_NONE); } static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, @@ -3710,7 +3721,8 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); - retval = ext4_journal_get_write_access(handle, ent->bh); + retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh, + EXT4_JTR_NONE); if (retval) return retval; ent->de->inode = cpu_to_le32(ino); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 7a9f1adef679..b63cb88ccdae 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -409,7 +409,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, if (unlikely(!bh)) return ERR_PTR(-ENOMEM); BUFFER_TRACE(bh, "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, bh))) { + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); + if (err) { brelse(bh); bh = ERR_PTR(err); } else { @@ -474,7 +475,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, return -ENOMEM; BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, sb, bh, + EXT4_JTR_NONE); if (err) { brelse(bh); return err; @@ -569,7 +571,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, } BUFFER_TRACE(gdb, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb); + err = ext4_journal_get_write_access(handle, sb, gdb, + EXT4_JTR_NONE); if (err) { brelse(gdb); goto out; @@ -837,17 +840,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(dind, "get_write_access"); - err = ext4_journal_get_write_access(handle, dind); + err = ext4_journal_get_write_access(handle, sb, dind, EXT4_JTR_NONE); if (unlikely(err)) { ext4_std_error(sb, err); goto errout; @@ -956,7 +960,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, n_group_desc[gdb_num] = gdb_bh; BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (err) { kvfree(n_group_desc); brelse(gdb_bh); @@ -1042,7 +1046,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (i = 0; i < reserved_gdb; i++) { BUFFER_TRACE(primary[i], "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, primary[i]))) + if ((err = ext4_journal_get_write_access(handle, sb, primary[i], + EXT4_JTR_NONE))) goto exit_bh; } @@ -1149,10 +1154,9 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, backup_block, backup_block - ext4_group_first_block_no(sb, group)); BUFFER_TRACE(bh, "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, bh))) { - brelse(bh); + if ((err = ext4_journal_get_write_access(handle, sb, bh, + EXT4_JTR_NONE))) break; - } lock_buffer(bh); 
memcpy(bh->b_data, data, size); if (rest) @@ -1232,7 +1236,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, + EXT4_JTR_NONE); if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) err = reserve_backup_gdb(handle, resize_inode, group); @@ -1509,7 +1514,8 @@ static int ext4_flex_group_add(struct super_block *sb, } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto exit_journal; @@ -1722,7 +1728,8 @@ static int ext4_group_extend_no_check(struct super_block *sb, } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); if (err) { ext4_warning(sb, "error %d on journal write access", err); goto errout; @@ -1884,7 +1891,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) return PTR_ERR(handle); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto errout; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 970013c93d3e..9d4e13ef5f56 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4014,6 +4014,20 @@ static const char *ext4_quota_mode(struct super_block *sb) #endif } +static void ext4_setup_csum_trigger(struct super_block *sb, + enum ext4_journal_trigger_type type, + void (*trigger)( + struct jbd2_buffer_trigger_type *type, + struct buffer_head *bh, + void *mapped_data, + size_t size)) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + sbi->s_journal_triggers[type].sb = sb; + sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); @@ -6617,7 +6631,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6dd5c05c444a..1e0fc1ed845b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -791,7 +791,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { + if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE) == 0) { lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); @@ -1169,7 +1170,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, continue; } if (err > 0) { - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, + parent->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_warning_inode(ea_inode, "Re-get write access err=%d", @@ -1230,7 +1232,8 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, int error = 0; BUFFER_TRACE(bh, "get_write_access"); - error = 
ext4_journal_get_write_access(handle, bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (error) goto out; @@ -1371,7 +1374,8 @@ retry: "ext4_getblk() return bh = NULL"); return -EFSCORRUPTED; } - ret = ext4_journal_get_write_access(handle, bh); + ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh, + EXT4_JTR_NONE); if (ret) goto out; @@ -1855,7 +1859,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (s->base) { BUFFER_TRACE(bs->bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, bs->bh); + error = ext4_journal_get_write_access(handle, sb, bs->bh, + EXT4_JTR_NONE); if (error) goto cleanup; lock_buffer(bs->bh); @@ -1987,8 +1992,9 @@ inserted: if (error) goto cleanup; BUFFER_TRACE(new_bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, - new_bh); + error = ext4_journal_get_write_access( + handle, sb, new_bh, + EXT4_JTR_NONE); if (error) goto cleanup_dquot; lock_buffer(new_bh); @@ -2092,7 +2098,8 @@ getblk_failed: } lock_buffer(new_bh); - error = ext4_journal_get_create_access(handle, new_bh); + error = ext4_journal_get_create_access(handle, sb, + new_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(new_bh); error = -EIO; @@ -2848,7 +2855,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, goto cleanup; } - error = ext4_journal_get_write_access(handle, iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, + iloc.bh, EXT4_JTR_NONE); if (error) { EXT4_ERROR_INODE(inode, "write access (error %d)", error); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5347411ae13e..6a3caedd2285 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1421,7 +1421,7 @@ void jbd2_journal_set_triggers(struct buffer_head *bh, { struct journal_head *jh = jbd2_journal_grab_journal_head(bh); - if (WARN_ON(!jh)) + if (WARN_ON_ONCE(!jh)) return; jh->b_triggers = type; jbd2_journal_put_journal_head(jh); From 25c6d98fc4c245d164cf688815a7b259257ead2a Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Mon, 16 Aug 2021 11:57:05 +0200 Subject: [PATCH 17/23] ext4: Move orphan inode handling into a separate file Move functions for handling orphan inodes into a new file fs/ext4/orphan.c to have them in one place and somewhat reduce size of other files. No code changes. 
Reviewed-by: Andreas Dilger <adilger@dilger.ca> Reviewed-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210816095713.16537-2-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/Makefile | 2 +- fs/ext4/ext4.h | 11 +- fs/ext4/namei.c | 182 ------------------------ fs/ext4/orphan.c | 363 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/super.c | 180 +---------------------- 5 files changed, 375 insertions(+), 363 deletions(-) create mode 100644 fs/ext4/orphan.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 49e7af6cc93f..7d89142e1421 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -10,7 +10,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ - xattr_user.o fast_commit.o + xattr_user.o fast_commit.o orphan.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 27b101fc7c86..e93eedb8b284 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2171,6 +2171,8 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); } +extern int ext4_feature_set_ok(struct super_block *sb, int readonly); + /* * Superblock flags */ @@ -3031,8 +3033,6 @@ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode); extern int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh); -extern int ext4_orphan_add(handle_t *, struct inode *); -extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); extern int ext4_search_dir(struct buffer_head *bh, @@ -3501,6 +3501,7 @@ static inline bool ext4_is_quota_journalled(struct super_block *sb) return (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); } +int ext4_enable_quotas(struct super_block *sb); #endif /* @@ -3762,6 +3763,12 @@ extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations ext4_verityops; +/* orphan.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es); + /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index dfb3ae61dfe8..da7698341d7d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3054,188 +3054,6 @@ bool ext4_empty_dir(struct inode *inode) return true; } -/* - * ext4_orphan_add() links an unlinked or truncated inode into a list of - * such inodes, starting at the superblock, in case we crash before the - * file is closed/deleted, or in case the inode truncate spans multiple - * transactions and the last transaction is not recovered after a crash. - * - * At filesystem recovery time, we walk this list deleting unlinked - * inodes and truncating linked inodes in ext4_orphan_cleanup(). - * - * Orphan list manipulation functions must be called under i_mutex unless - * we are just creating the inode or deleting it. 
- */ -int ext4_orphan_add(handle_t *handle, struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_iloc iloc; - int err = 0, rc; - bool dirty = false; - - if (!sbi->s_journal || is_bad_inode(inode)) - return 0; - - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !inode_is_locked(inode)); - /* - * Exit early if inode already is on orphan list. This is a big speedup - * since we don't have to contend on the global s_orphan_lock. - */ - if (!list_empty(&EXT4_I(inode)->i_orphan)) - return 0; - - /* - * Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. Note that we either - * hold i_mutex, or the inode can not be referenced from outside, - * so i_nlink should not be bumped due to race - */ - ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, - EXT4_JTR_NONE); - if (err) - goto out; - - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out; - - mutex_lock(&sbi->s_orphan_lock); - /* - * Due to previous errors inode may be already a part of on-disk - * orphan list. If so skip on-disk list modification. - */ - if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > - (le32_to_cpu(sbi->s_es->s_inodes_count))) { - /* Insert this inode at the head of the on-disk orphan list */ - NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); - lock_buffer(sbi->s_sbh); - sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - ext4_superblock_csum_set(sb); - unlock_buffer(sbi->s_sbh); - dirty = true; - } - list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); - mutex_unlock(&sbi->s_orphan_lock); - - if (dirty) { - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - rc = ext4_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - if (err) { - /* - * We have to remove inode from in-memory list if - * addition to on disk orphan list failed. Stray orphan - * list entries can cause panics at unmount time. - */ - mutex_lock(&sbi->s_orphan_lock); - list_del_init(&EXT4_I(inode)->i_orphan); - mutex_unlock(&sbi->s_orphan_lock); - } - } else - brelse(iloc.bh); - - jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); - jbd_debug(4, "orphan inode %lu will point to %d\n", - inode->i_ino, NEXT_ORPHAN(inode)); -out: - ext4_std_error(sb, err); - return err; -} - -/* - * ext4_orphan_del() removes an unlinked or truncated inode from the list - * of such inodes stored on disk, because it is finally being cleaned up. - */ -int ext4_orphan_del(handle_t *handle, struct inode *inode) -{ - struct list_head *prev; - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u32 ino_next; - struct ext4_iloc iloc; - int err = 0; - - if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) - return 0; - - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !inode_is_locked(inode)); - /* Do this quick check before taking global s_orphan_lock. 
*/ - if (list_empty(&ei->i_orphan)) - return 0; - - if (handle) { - /* Grab inode buffer early before taking global s_orphan_lock */ - err = ext4_reserve_inode_write(handle, inode, &iloc); - } - - mutex_lock(&sbi->s_orphan_lock); - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); - - prev = ei->i_orphan.prev; - list_del_init(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on - * disk, but we still need to remove the inode from the linked - * list in memory. */ - if (!handle || err) { - mutex_unlock(&sbi->s_orphan_lock); - goto out_err; - } - - ino_next = NEXT_ORPHAN(inode); - if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %u\n", ino_next); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, inode->i_sb, - sbi->s_sbh, EXT4_JTR_NONE); - if (err) { - mutex_unlock(&sbi->s_orphan_lock); - goto out_brelse; - } - lock_buffer(sbi->s_sbh); - sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - ext4_superblock_csum_set(inode->i_sb); - unlock_buffer(sbi->s_sbh); - mutex_unlock(&sbi->s_orphan_lock); - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - } else { - struct ext4_iloc iloc2; - struct inode *i_prev = - &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - - jbd_debug(4, "orphan inode %lu will point to %u\n", - i_prev->i_ino, ino_next); - err = ext4_reserve_inode_write(handle, i_prev, &iloc2); - if (err) { - mutex_unlock(&sbi->s_orphan_lock); - goto out_brelse; - } - NEXT_ORPHAN(i_prev) = ino_next; - err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); - mutex_unlock(&sbi->s_orphan_lock); - } - if (err) - goto out_brelse; - NEXT_ORPHAN(inode) = 0; - err = ext4_mark_iloc_dirty(handle, inode, &iloc); -out_err: - ext4_std_error(inode->i_sb, err); - return err; - -out_brelse: - brelse(iloc.bh); - goto out_err; -} - static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c new file mode 100644 index 000000000000..1f2fa2ef53bd --- /dev/null +++ b/fs/ext4/orphan.c @@ -0,0 +1,363 @@ +/* + * Ext4 orphan inode handling + */ +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> + +#include "ext4.h" +#include "ext4_jbd2.h" + +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_mutex unless + * we are just creating the inode or deleting it. + */ +int ext4_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_iloc iloc; + int err = 0, rc; + bool dirty = false; + + if (!sbi->s_journal || is_bad_inode(inode)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !inode_is_locked(inode)); + /* + * Exit early if inode already is on orphan list. This is a big speedup + * since we don't have to contend on the global s_orphan_lock. 
+ */ + if (!list_empty(&EXT4_I(inode)->i_orphan)) + return 0; + + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race + */ + ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto out; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out; + + mutex_lock(&sbi->s_orphan_lock); + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification. + */ + if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > + (le32_to_cpu(sbi->s_es->s_inodes_count))) { + /* Insert this inode at the head of the on-disk orphan list */ + NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + lock_buffer(sbi->s_sbh); + sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + dirty = true; + } + list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); + mutex_unlock(&sbi->s_orphan_lock); + + if (dirty) { + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + if (err) { + /* + * We have to remove inode from in-memory list if + * addition to on disk orphan list failed. Stray orphan + * list entries can cause panics at unmount time. + */ + mutex_lock(&sbi->s_orphan_lock); + list_del_init(&EXT4_I(inode)->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + } + } else + brelse(iloc.bh); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out: + ext4_std_error(sb, err); + return err; +} + +/* + * ext4_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext4_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 ino_next; + struct ext4_iloc iloc; + int err = 0; + + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !inode_is_locked(inode)); + /* Do this quick check before taking global s_orphan_lock. */ + if (list_empty(&ei->i_orphan)) + return 0; + + if (handle) { + /* Grab inode buffer early before taking global s_orphan_lock */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + } + + mutex_lock(&sbi->s_orphan_lock); + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + prev = ei->i_orphan.prev; + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. 
*/ + if (!handle || err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_err; + } + + ino_next = NEXT_ORPHAN(inode); + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %u\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, + sbi->s_sbh, EXT4_JTR_NONE); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + lock_buffer(sbi->s_sbh); + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + ext4_superblock_csum_set(inode->i_sb); + unlock_buffer(sbi->s_sbh); + mutex_unlock(&sbi->s_orphan_lock); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + } else { + struct ext4_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %u\n", + i_prev->i_ino, ino_next); + err = ext4_reserve_inode_write(handle, i_prev, &iloc2); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + NEXT_ORPHAN(i_prev) = ino_next; + err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + mutex_unlock(&sbi->s_orphan_lock); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +out_err: + ext4_std_error(inode->i_sb, err); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +#ifdef CONFIG_QUOTA +static int ext4_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, + rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], + lockdep_is_held(&sb->s_umount)), + EXT4_SB(sb)->s_jquota_fmt, type); +} +#endif + +/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext4_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. 
+ */ +void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) +{ + unsigned int s_flags = sb->s_flags; + int ret, nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int quota_update = 0; + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); + return; + } + + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { + ext4_msg(sb, KERN_INFO, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + } + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & SB_RDONLY) { + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~SB_RDONLY; + } +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (EXT4_SB(sb)->s_qf_names[i]) { + int ret = ext4_quota_on_mount(sb, i); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: type %d: error %d", i, ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + /* + * We may have encountered an error during cleanup; if + * so, skip the rest. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + es->s_last_orphan = 0; + break; + } + + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + dquot_initialize(inode); + if (inode->i_nlink) { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + inode_lock(inode); + truncate_inode_pages(inode->i_mapping, inode->i_size); + ret = ext4_truncate(inode); + if (ret) { + /* + * We need to clean up the in-core orphan list + * manually if ext4_truncate() failed to get a + * transaction handle. + */ + ext4_orphan_del(NULL, inode); + ext4_std_error(inode->i_sb, ret); + } + inode_unlock(inode); + nr_truncates++; + } else { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" + + if (nr_orphans) + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } + } +#endif + sb->s_flags = s_flags; /* Restore SB_RDONLY status */ +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9d4e13ef5f56..e5efb8403f50 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -80,7 +80,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); -static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); @@ -1585,14 +1584,12 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); -static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); -static int ext4_enable_quotas(struct super_block *sb); static struct dquot **ext4_get_dquots(struct inode *inode) { @@ -2970,169 +2967,6 @@ static int ext4_check_descriptors(struct super_block *sb, return 1; } -/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at - * the superblock) which were deleted from all directories, but held open by - * a process at the time of a crash. We walk the list and try to delete these - * inodes at recovery time (only with a read-write filesystem). - * - * In order to keep the orphan inode chain consistent during traversal (in - * case of crash during recovery), we link each inode into the superblock - * orphan list_head and handle it the same way as an inode deletion during - * normal operation (which journals the operations for us). - * - * We only do an iget() and an iput() on each inode, which is very safe if we - * accidentally point at an in-use or already deleted inode. The worst that - * can happen in this case is that we get a "bit already cleared" message from - * ext4_free_inode(). The only reason we would point at a wrong inode is if - * e2fsck was run on this filesystem, and it must have already done the orphan - * inode cleanup for us, so we can safely abort without any further action. 
- */ -static void ext4_orphan_cleanup(struct super_block *sb, - struct ext4_super_block *es) -{ - unsigned int s_flags = sb->s_flags; - int ret, nr_orphans = 0, nr_truncates = 0; -#ifdef CONFIG_QUOTA - int quota_update = 0; - int i; -#endif - if (!es->s_last_orphan) { - jbd_debug(4, "no orphan inodes to clean up\n"); - return; - } - - if (bdev_read_only(sb->s_bdev)) { - ext4_msg(sb, KERN_ERR, "write access " - "unavailable, skipping orphan cleanup"); - return; - } - - /* Check if feature set would not allow a r/w mount */ - if (!ext4_feature_set_ok(sb, 0)) { - ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " - "unknown ROCOMPAT features"); - return; - } - - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { - ext4_msg(sb, KERN_INFO, "Errors on filesystem, " - "clearing orphan list.\n"); - es->s_last_orphan = 0; - } - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); - return; - } - - if (s_flags & SB_RDONLY) { - ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~SB_RDONLY; - } -#ifdef CONFIG_QUOTA - /* - * Turn on quotas which were not enabled for read-only mounts if - * filesystem has quota feature, so that they are updated correctly. - */ - if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { - int ret = ext4_enable_quotas(sb); - - if (!ret) - quota_update = 1; - else - ext4_msg(sb, KERN_ERR, - "Cannot turn on quotas: error %d", ret); - } - - /* Turn on journaled quotas used for old sytle */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (EXT4_SB(sb)->s_qf_names[i]) { - int ret = ext4_quota_on_mount(sb, i); - - if (!ret) - quota_update = 1; - else - ext4_msg(sb, KERN_ERR, - "Cannot turn on journaled " - "quota: type %d: error %d", i, ret); - } - } -#endif - - while (es->s_last_orphan) { - struct inode *inode; - - /* - * We may have encountered an error during cleanup; if - * so, skip the rest. - */ - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); - es->s_last_orphan = 0; - break; - } - - inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); - if (IS_ERR(inode)) { - es->s_last_orphan = 0; - break; - } - - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - dquot_initialize(inode); - if (inode->i_nlink) { - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", - __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %lld bytes\n", - inode->i_ino, inode->i_size); - inode_lock(inode); - truncate_inode_pages(inode->i_mapping, inode->i_size); - ret = ext4_truncate(inode); - if (ret) { - /* - * We need to clean up the in-core orphan list - * manually if ext4_truncate() failed to get a - * transaction handle. - */ - ext4_orphan_del(NULL, inode); - ext4_std_error(inode->i_sb, ret); - } - inode_unlock(inode); - nr_truncates++; - } else { - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", - __func__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %lu\n", - inode->i_ino); - nr_orphans++; - } - iput(inode); /* The delete magic happens here! */ - } - -#define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" - - if (nr_orphans) - ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", - PLURAL(nr_orphans)); - if (nr_truncates) - ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", - PLURAL(nr_truncates)); -#ifdef CONFIG_QUOTA - /* Turn off quotas if they were enabled for orphan cleanup */ - if (quota_update) { - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); - } - } -#endif - sb->s_flags = s_flags; /* Restore SB_RDONLY status */ -} - /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk @@ -3312,7 +3146,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ -static int ext4_feature_set_ok(struct super_block *sb, int readonly) +int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, @@ -6334,16 +6168,6 @@ static int ext4_write_info(struct super_block *sb, int type) return ret; } -/* - * Turn on quotas during mount time - we need to find - * the quota file and such... - */ -static int ext4_quota_on_mount(struct super_block *sb, int type) -{ - return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), - EXT4_SB(sb)->s_jquota_fmt, type); -} - static void lockdep_set_quota_inode(struct inode *inode, int subclass) { struct ext4_inode_info *ei = EXT4_I(inode); @@ -6473,7 +6297,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, } /* Enable usage tracking for all quota types. */ -static int ext4_enable_quotas(struct super_block *sb) +int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { From 02f310fcf47fa9311d6ba2946a8d19e7d7d11f37 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Mon, 16 Aug 2021 11:57:06 +0200 Subject: [PATCH 18/23] ext4: Speedup ext4 orphan inode handling Ext4 orphan inode handling is a bottleneck for workloads which heavily truncate / unlink small files since it contends on the global s_orphan_mutex lock (and generally it's difficult to improve scalability of the ondisk linked list of orphaned inodes). This patch implements new way of handling orphan inodes. Instead of linking orphaned inode into a linked list, we store it's inode number in a new special file which we call "orphan file". Only if there's no more space in the orphan file (too many inodes are currently orphaned) we fall back to using old style linked list. Currently we protect operations in the orphan file with a spinlock for simplicity but even in this setting we can substantially reduce the length of the critical section and thus speedup some workloads. In the next patch we improve this by making orphan handling lockless. Note that the change is backwards compatible when the filesystem is clean - the existence of the orphan file is a compat feature, we set another ro-compat feature indicating orphan file needs scanning for orphaned inodes when mounting filesystem read-write. This ro-compat feature gets cleared on unmount / remount read-only. 
Some performance data from 80 CPU Xeon Server with 512 GB of RAM, filesystem located on SSD, average of 5 runs: stress-orphan (microbenchmark truncating files byte-by-byte from N processes in parallel) Threads Time Time Vanilla Patched 1 1.057200 0.945600 2 1.680400 1.331800 4 2.547000 1.995000 8 7.049400 6.424200 16 14.827800 14.937600 32 40.948200 33.038200 64 87.787400 60.823600 128 206.504000 122.941400 So we can see significant wins all over the board. Reviewed-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210816095713.16537-3-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/ext4.h | 69 +++++++++- fs/ext4/inode.c | 3 +- fs/ext4/orphan.c | 340 +++++++++++++++++++++++++++++++++++++++++------ fs/ext4/super.c | 34 ++++- 4 files changed, 394 insertions(+), 52 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e93eedb8b284..ae79797b9d60 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1034,7 +1034,14 @@ struct ext4_inode_info { */ struct rw_semaphore xattr_sem; - struct list_head i_orphan; /* unlinked but open inodes */ + /* + * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise + * i_orphan is used. + */ + union { + struct list_head i_orphan; /* unlinked but open inodes */ + unsigned int i_orphan_idx; /* Index in orphan file */ + }; /* Fast commit related info */ @@ -1428,7 +1435,8 @@ struct ext4_super_block { __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ - __le32 s_reserved[95]; /* Padding to the end of the block */ + __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ + __le32 s_reserved[94]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1449,6 +1457,7 @@ struct ext4_super_block { /* Types of ext4 journal triggers */ enum ext4_journal_trigger_type { + EXT4_JTR_ORPHAN_FILE, EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ }; @@ -1465,6 +1474,36 @@ static inline struct ext4_journal_trigger *EXT4_TRIGGER( return container_of(trigger, struct ext4_journal_trigger, tr_triggers); } +#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 + +/* Structure at the tail of orphan block */ +struct ext4_orphan_block_tail { + __le32 ob_magic; + __le32 ob_checksum; +}; + +static inline int ext4_inodes_per_orphan_block(struct super_block *sb) +{ + return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / + sizeof(u32); +} + +struct ext4_orphan_block { + int ob_free_entries; /* Number of free orphan entries in block */ + struct buffer_head *ob_bh; /* Buffer for orphan block */ +}; + +/* + * Info about orphan file. 
+ */ +struct ext4_orphan_info { + spinlock_t of_lock; + int of_blocks; /* Number of orphan blocks in a file */ + __u32 of_csum_seed; /* Checksum seed for orphan file */ + struct ext4_orphan_block *of_binfo; /* Array with info about orphan + * file blocks */ +}; + /* * fourth extended-fs super-block data in memory */ @@ -1519,9 +1558,11 @@ struct ext4_sb_info { /* Journaling */ struct journal_s *s_journal; - struct list_head s_orphan; - struct mutex s_orphan_lock; unsigned long s_ext4_flags; /* Ext4 superblock flags */ + struct mutex s_orphan_lock; /* Protects on disk list changes */ + struct list_head s_orphan; /* List of orphaned inodes in on disk + list */ + struct ext4_orphan_info s_orphan_info; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; @@ -1859,6 +1900,7 @@ enum { EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1960,6 +2002,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode) */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 +#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 @@ -1980,6 +2023,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 +#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be + non-empty */ #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -2063,6 +2108,7 @@ EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) +EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) @@ -2077,6 +2123,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) +EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) @@ -2110,7 +2157,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) -#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ + EXT4_FEATURE_COMPAT_ORPHAN_FILE) #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ @@ -2135,7 +2183,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ EXT4_FEATURE_RO_COMPAT_PROJECT |\ - EXT4_FEATURE_RO_COMPAT_VERITY) + EXT4_FEATURE_RO_COMPAT_VERITY |\ + EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -2185,7 +2234,6 @@ static inline int 
ext4_forced_shutdown(struct ext4_sb_info *sbi) return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); } - /* * Default values for user and/or group using reserved blocks */ @@ -3768,6 +3816,13 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es); +extern void ext4_release_orphan_info(struct super_block *sb); +extern int ext4_init_orphan_info(struct super_block *sb); +extern int ext4_orphan_file_empty(struct super_block *sb); +extern void ext4_orphan_file_block_trigger( + struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size); /* * Add new method to test whether block and inode bitmaps are properly diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d47a57f3d8de..8fc0dad9fa62 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4624,7 +4624,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || - ino == le32_to_cpu(es->s_prj_quota_inum))) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum))) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 1f2fa2ef53bd..ac928f60234a 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -8,6 +8,52 @@ #include "ext4.h" #include "ext4_jbd2.h" +static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) +{ + int i, j; + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; + int ret = 0; + __le32 *bdata; + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); + + spin_lock(&oi->of_lock); + for (i = 0; i < oi->of_blocks && !oi->of_binfo[i].ob_free_entries; i++); + if (i == oi->of_blocks) { + spin_unlock(&oi->of_lock); + /* + * For now we don't grow or shrink orphan file. We just use + * whatever was allocated at mke2fs time. The additional + * credits we would have to reserve for each orphan inode + * operation just don't seem worth it. + */ + return -ENOSPC; + } + oi->of_binfo[i].ob_free_entries--; + spin_unlock(&oi->of_lock); + + /* + * Get access to orphan block. We have dropped of_lock but since we + * have decremented number of free entries we are guaranteed free entry + * in our block. + */ + ret = ext4_journal_get_write_access(handle, inode->i_sb, + oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); + if (ret) + return ret; + + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + spin_lock(&oi->of_lock); + /* Find empty slot in a block */ + for (j = 0; j < inodes_per_ob && bdata[j]; j++); + BUG_ON(j == inodes_per_ob); + bdata[j] = cpu_to_le32(inode->i_ino); + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + spin_unlock(&oi->of_lock); + + return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); +} + /* * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the @@ -34,10 +80,10 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); /* - * Exit early if inode already is on orphan list. This is a big speedup - * since we don't have to contend on the global s_orphan_lock. 
+ * Inode orphaned in orphan file or in orphan list? */ - if (!list_empty(&EXT4_I(inode)->i_orphan)) + if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || + !list_empty(&EXT4_I(inode)->i_orphan)) return 0; /* @@ -49,6 +95,16 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + if (sbi->s_orphan_info.of_blocks) { + err = ext4_orphan_file_add(handle, inode); + /* + * Fallback to normal orphan list of orphan file is + * out of space + */ + if (err != -ENOSPC) + return err; + } + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); @@ -103,6 +159,39 @@ out: return err; } +static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) +{ + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; + __le32 *bdata; + int blk, off; + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); + int ret = 0; + + if (!handle) + goto out; + blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; + off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; + if (WARN_ON_ONCE(blk >= oi->of_blocks)) + goto out; + + ret = ext4_journal_get_write_access(handle, inode->i_sb, + oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); + if (ret) + goto out; + + bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); + spin_lock(&oi->of_lock); + bdata[off] = 0; + oi->of_binfo[blk].ob_free_entries++; + spin_unlock(&oi->of_lock); + ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); +out: + ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); + + return ret; +} + /* * ext4_orphan_del() removes an unlinked or truncated inode from the list * of such inodes stored on disk, because it is finally being cleaned up. @@ -121,6 +210,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); + if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) + return ext4_orphan_file_del(handle, inode); + /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; @@ -200,6 +292,46 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) } #endif +static void ext4_process_orphan(struct inode *inode, + int *nr_truncates, int *nr_orphans) +{ + struct super_block *sb = inode->i_sb; + int ret; + + dquot_initialize(inode); + if (inode->i_nlink) { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + inode_lock(inode); + truncate_inode_pages(inode->i_mapping, inode->i_size); + ret = ext4_truncate(inode); + if (ret) { + /* + * We need to clean up the in-core orphan list + * manually if ext4_truncate() failed to get a + * transaction handle. + */ + ext4_orphan_del(NULL, inode); + ext4_std_error(inode->i_sb, ret); + } + inode_unlock(inode); + (*nr_truncates)++; + } else { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + (*nr_orphans)++; + } + iput(inode); /* The delete magic happens here! 
*/ +} + /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at * the superblock) which were deleted from all directories, but held open by * a process at the time of a crash. We walk the list and try to delete these @@ -220,12 +352,17 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; - int ret, nr_orphans = 0, nr_truncates = 0; + int nr_orphans = 0, nr_truncates = 0; + struct inode *inode; + int i, j; #ifdef CONFIG_QUOTA int quota_update = 0; - int i; #endif - if (!es->s_last_orphan) { + __le32 *bdata; + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + + if (!es->s_last_orphan && !oi->of_blocks) { jbd_debug(4, "no orphan inodes to clean up\n"); return; } @@ -289,8 +426,6 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) #endif while (es->s_last_orphan) { - struct inode *inode; - /* * We may have encountered an error during cleanup; if * so, skip the rest. @@ -308,38 +443,21 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - dquot_initialize(inode); - if (inode->i_nlink) { - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", - __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %lld bytes\n", - inode->i_ino, inode->i_size); - inode_lock(inode); - truncate_inode_pages(inode->i_mapping, inode->i_size); - ret = ext4_truncate(inode); - if (ret) { - /* - * We need to clean up the in-core orphan list - * manually if ext4_truncate() failed to get a - * transaction handle. - */ - ext4_orphan_del(NULL, inode); - ext4_std_error(inode->i_sb, ret); - } - inode_unlock(inode); - nr_truncates++; - } else { - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", - __func__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %lu\n", - inode->i_ino); - nr_orphans++; + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); + } + + for (i = 0; i < oi->of_blocks; i++) { + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + for (j = 0; j < inodes_per_ob; j++) { + if (!bdata[j]) + continue; + inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); + if (IS_ERR(inode)) + continue; + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } - iput(inode); /* The delete magic happens here! */ } #define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" @@ -361,3 +479,147 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) #endif sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } + +void ext4_release_orphan_info(struct super_block *sb) +{ + int i; + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + + if (!oi->of_blocks) + return; + for (i = 0; i < oi->of_blocks; i++) + brelse(oi->of_binfo[i].ob_bh); + kfree(oi->of_binfo); +} + +static struct ext4_orphan_block_tail *ext4_orphan_block_tail( + struct super_block *sb, + struct buffer_head *bh) +{ + return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - + sizeof(struct ext4_orphan_block_tail)); +} + +static int ext4_orphan_file_block_csum_verify(struct super_block *sb, + struct buffer_head *bh) +{ + __u32 calculated; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct ext4_orphan_block_tail *ot; + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); + + if (!ext4_has_metadata_csum(sb)) + return 1; + + ot = ext4_orphan_block_tail(sb, bh); + calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, + (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); + calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + inodes_per_ob * sizeof(__u32)); + return le32_to_cpu(ot->ob_checksum) == calculated; +} + +/* This gets called only when checksumming is enabled */ +void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct super_block *sb = EXT4_TRIGGER(triggers)->sb; + __u32 csum; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct ext4_orphan_block_tail *ot; + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); + + csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, + (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); + csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, + inodes_per_ob * sizeof(__u32)); + ot = ext4_orphan_block_tail(sb, bh); + ot->ob_checksum = cpu_to_le32(csum); +} + +int ext4_init_orphan_info(struct super_block *sb) +{ + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct inode *inode; + int i, j; + int ret; + int free; + __le32 *bdata; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_block_tail *ot; + ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); + + spin_lock_init(&oi->of_lock); + + if (!ext4_has_feature_orphan_file(sb)) + return 0; + + inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); + if (IS_ERR(inode)) { + ext4_msg(sb, KERN_ERR, "get orphan inode failed"); + return PTR_ERR(inode); + } + oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; + oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; + oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), + GFP_KERNEL); + if (!oi->of_binfo) { + ret = -ENOMEM; + goto out_put; + } + for (i = 0; i < oi->of_blocks; i++) { + oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); + if (IS_ERR(oi->of_binfo[i].ob_bh)) { + ret = PTR_ERR(oi->of_binfo[i].ob_bh); + goto out_free; + } + if (!oi->of_binfo[i].ob_bh) { + ret = -EIO; + goto out_free; + } + ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); + if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { + ext4_error(sb, "orphan file block %d: bad magic", i); + ret = -EIO; + goto out_free; + } + if (!ext4_orphan_file_block_csum_verify(sb, + oi->of_binfo[i].ob_bh)) { + ext4_error(sb, "orphan file block %d: 
bad checksum", i); + ret = -EIO; + goto out_free; + } + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + free = 0; + for (j = 0; j < inodes_per_ob; j++) + if (bdata[j] == 0) + free++; + oi->of_binfo[i].ob_free_entries = free; + } + iput(inode); + return 0; +out_free: + for (i--; i >= 0; i--) + brelse(oi->of_binfo[i].ob_bh); + kfree(oi->of_binfo); +out_put: + iput(inode); + return ret; +} + +int ext4_orphan_file_empty(struct super_block *sb) +{ + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + int i; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + + if (!ext4_has_feature_orphan_file(sb)) + return 1; + for (i = 0; i < oi->of_blocks; i++) + if (oi->of_binfo[i].ob_free_entries != inodes_per_ob) + return 0; + return 1; +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5efb8403f50..feca816b6bf3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1174,6 +1174,7 @@ static void ext4_put_super(struct super_block *sb) flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); + ext4_release_orphan_info(sb); /* * Unregister sysfs before destroying jbd2 journal. @@ -1199,6 +1200,7 @@ static void ext4_put_super(struct super_block *sb) if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!sb_rdonly(sb)) @@ -2684,8 +2686,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - if (sbi->s_journal) + if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); + if (ext4_has_feature_orphan_file(sb)) + ext4_set_feature_orphan_present(sb); + } err = ext4_commit_super(sb); done: @@ -3960,6 +3965,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) silent = 1; goto cantfind_ext4; } + ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, + ext4_orphan_file_block_trigger); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); @@ -4624,6 +4631,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || + ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) @@ -4922,12 +4930,15 @@ no_journal: if (err) goto failed_mount7; + err = ext4_init_orphan_info(sb); + if (err) + goto failed_mount8; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. 
*/ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) - goto failed_mount8; + goto failed_mount9; } #endif /* CONFIG_QUOTA */ @@ -4946,7 +4957,7 @@ no_journal: ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount8; + goto failed_mount9; } if (EXT4_SB(sb)->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -4992,6 +5003,8 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +failed_mount9: + ext4_release_orphan_info(sb); failed_mount8: ext4_unregister_sysfs(sb); kobject_put(&sbi->s_kobj); @@ -5502,8 +5515,15 @@ static int ext4_mark_recovery_complete(struct super_block *sb, if (err < 0) goto out; - if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { + if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || + ext4_has_feature_orphan_present(sb))) { + if (!ext4_orphan_file_empty(sb)) { + ext4_error(sb, "Orphan file not empty on read-only fs."); + err = -EFSCORRUPTED; + goto out; + } ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: @@ -5646,6 +5666,8 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); + if (ext4_orphan_file_empty(sb)) + ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); @@ -5668,6 +5690,8 @@ static int ext4_unfreeze(struct super_block *sb) if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ ext4_set_feature_journal_needs_recovery(sb); + if (ext4_has_feature_orphan_file(sb)) + ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); @@ -5871,7 +5895,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * around from a previously readonly bdev mount, * require a full umount/remount for now. */ - if (es->s_last_orphan) { + if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " From 3a6541e97c035dba90cdf37169d73b2d8057e55d Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Mon, 16 Aug 2021 11:57:07 +0200 Subject: [PATCH 19/23] ext4: Orphan file documentation Add documentation about the orphan file feature. Reviewed-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210816095713.16537-4-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- Documentation/filesystems/ext4/globals.rst | 1 + Documentation/filesystems/ext4/inodes.rst | 10 ++-- Documentation/filesystems/ext4/orphan.rst | 52 +++++++++++++++++++ .../filesystems/ext4/special_inodes.rst | 17 ++++++ Documentation/filesystems/ext4/super.rst | 15 +++++- 5 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 Documentation/filesystems/ext4/orphan.rst diff --git a/Documentation/filesystems/ext4/globals.rst b/Documentation/filesystems/ext4/globals.rst index 368bf7662b96..b17418974fd3 100644 --- a/Documentation/filesystems/ext4/globals.rst +++ b/Documentation/filesystems/ext4/globals.rst @@ -11,3 +11,4 @@ have static metadata at fixed locations. .. include:: bitmaps.rst .. include:: mmp.rst .. include:: journal.rst +.. 
include:: orphan.rst diff --git a/Documentation/filesystems/ext4/inodes.rst b/Documentation/filesystems/ext4/inodes.rst index a65baffb4ebf..6c5ce666e63f 100644 --- a/Documentation/filesystems/ext4/inodes.rst +++ b/Documentation/filesystems/ext4/inodes.rst @@ -498,11 +498,11 @@ structure -- inode change time (ctime), access time (atime), data modification time (mtime), and deletion time (dtime). The four fields are 32-bit signed integers that represent seconds since the Unix epoch (1970-01-01 00:00:00 GMT), which means that the fields will overflow in -January 2038. For inodes that are not linked from any directory but are -still open (orphan inodes), the dtime field is overloaded for use with -the orphan list. The superblock field ``s_last_orphan`` points to the -first inode in the orphan list; dtime is then the number of the next -orphaned inode, or zero if there are no more orphans. +January 2038. If the filesystem does not have the orphan_file feature, inodes +that are not linked from any directory but are still open (orphan inodes) have +the dtime field overloaded for use with the orphan list. The superblock field +``s_last_orphan`` points to the first inode in the orphan list; dtime is then +the number of the next orphaned inode, or zero if there are no more orphans. If the inode structure size ``sb->s_inode_size`` is larger than 128 bytes and the ``i_inode_extra`` field is large enough to encompass the diff --git a/Documentation/filesystems/ext4/orphan.rst b/Documentation/filesystems/ext4/orphan.rst new file mode 100644 index 000000000000..bb19ecd1b626 --- /dev/null +++ b/Documentation/filesystems/ext4/orphan.rst @@ -0,0 +1,52 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Orphan file +----------- + +In Unix there can be inodes that are unlinked from the directory hierarchy but +are still alive because they are open. In case of a crash the filesystem has to +clean up these inodes, as otherwise they (and the blocks referenced from them) +would leak. Similarly, if we truncate or extend a file, we may not be able +to perform the operation in a single journalling transaction. In such cases we +track the inode as an orphan so that after a crash the extra blocks allocated to +the file get truncated. + +Traditionally ext4 tracks orphan inodes in the form of a singly linked list where +the superblock contains the inode number of the last orphaned inode (the +s\_last\_orphan field) and each inode contains the inode number of the previously +orphaned inode (we overload the i\_dtime inode field for this). However, this +filesystem-global singly linked list is a scalability bottleneck for workloads +that result in heavy creation of orphan inodes. When the orphan file feature +(COMPAT\_ORPHAN\_FILE) is enabled, the filesystem has a special inode +(referenced from the superblock through s\_orphan\_file\_inum) with several +blocks. Each of these blocks has the following structure: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - Array of \_\_le32 entries + - Orphan inode entries + - Each \_\_le32 entry is either empty (0) or it contains the inode number of + an orphan inode. + * - blocksize - 8 + - \_\_le32 + - ob\_magic + - Magic value stored in the orphan block tail (0x0b10ca04). + * - blocksize - 4 + - \_\_le32 + - ob\_checksum + - Checksum of the orphan block. + +When a filesystem with the orphan file feature is mounted read-write, we set +the RO\_COMPAT\_ORPHAN\_PRESENT feature in the superblock to indicate there may +be valid orphan entries.
If we see this feature when mounting the +filesystem, we read the whole orphan file and process all orphan inodes found +there as usual. When cleanly unmounting the filesystem we remove the +RO\_COMPAT\_ORPHAN\_PRESENT feature to avoid unnecessary scanning of the orphan +file and also to make the filesystem fully compatible with older kernels. diff --git a/Documentation/filesystems/ext4/special_inodes.rst b/Documentation/filesystems/ext4/special_inodes.rst index 9061aabba827..94f304e3a0a7 100644 --- a/Documentation/filesystems/ext4/special_inodes.rst +++ b/Documentation/filesystems/ext4/special_inodes.rst @@ -36,3 +36,20 @@ ext4 reserves some inode for special features, as follows: * - 11 - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock. +Note that there are also some inodes allocated from non-reserved inode numbers +for other filesystem features which are not referenced from the standard directory +hierarchy. These are generally referenced from the superblock. They are: + +.. list-table:: + :widths: 20 50 + :header-rows: 1 + + * - Superblock field + - Description + + * - s\_lpf\_ino + - Inode number of the lost+found directory. + * - s\_prj\_quota\_inum + - Inode number of the quota file tracking project quotas. + * - s\_orphan\_file\_inum + - Inode number of the file tracking orphan inodes. diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst index 2eb1ab20498d..f6a548e957bb 100644 --- a/Documentation/filesystems/ext4/super.rst +++ b/Documentation/filesystems/ext4/super.rst @@ -479,7 +479,11 @@ The ext4 superblock is laid out as follows in - Filename charset encoding flags. * - 0x280 - \_\_le32 - - s\_reserved[95] + - s\_orphan\_file\_inum + - Orphan file inode number. + * - 0x284 + - \_\_le32 + - s\_reserved[94] - Padding to the end of the block. * - 0x3FC - \_\_le32 @@ -603,6 +607,11 @@ following: the journal, JBD2 incompat feature (JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT) gets set (COMPAT\_FAST\_COMMIT). + * - 0x1000 + - Orphan file allocated. This is the special file for more efficient + tracking of unlinked but still open inodes. When there may be any + entries in the file, we additionally set the corresponding rocompat feature + (RO\_COMPAT\_ORPHAN\_PRESENT). .. _super_incompat: @@ -713,6 +722,10 @@ the following: - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT) * - 0x8000 - Verity inodes may be present on the filesystem. (RO\_COMPAT\_VERITY) + * - 0x10000 + - Indicates that the orphan file may have valid orphan entries and thus we need + to clean them up when mounting the filesystem + (RO\_COMPAT\_ORPHAN\_PRESENT). .. _super_def_hash: From 4a79a98c7b19dd3d4cfe9200fbc384ba9119e039 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Mon, 16 Aug 2021 11:57:08 +0200 Subject: [PATCH 20/23] ext4: Improve scalability of ext4 orphan file handling Even though the length of the critical section when adding / removing orphaned inodes was significantly reduced by using the orphan file, contention on the lock protecting the orphan file still appears high in profiles of truncate / unlink intensive workloads with a high number of threads. This patch makes handling of the orphan file completely lockless. Also, to reduce conflicts between CPUs, different CPUs start searching for an empty slot in the orphan file at different blocks.
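To illustrate the lockless reservation pattern described above, here is a minimal user-space C sketch. It is an illustration only, not the kernel code from the diff below: the block and slot counts, the structure and function names, and the hard-coded start block are assumptions. A writer first reserves a free entry by atomically decrementing a per-block counter ("dec if positive"), then claims an empty slot with compare-and-swap, so no lock is needed.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NBLOCKS 4
#define SLOTS_PER_BLOCK 8

struct orphan_block {
	atomic_int free_entries;                 /* empty slots left in this block */
	_Atomic uint32_t slot[SLOTS_PER_BLOCK];  /* 0 means "slot is empty" */
};

static struct orphan_block blocks[NBLOCKS];

/* Decrement *v only if it is positive; return the value seen before. */
static int dec_if_positive(atomic_int *v)
{
	int old = atomic_load(v);

	while (old > 0 && !atomic_compare_exchange_weak(v, &old, old - 1))
		;
	return old;
}

/* Record @ino as an orphan; @start spreads contending callers across blocks. */
static bool orphan_add(uint32_t ino, int start)
{
	int i = start;

	do {
		if (dec_if_positive(&blocks[i].free_entries) > 0) {
			/*
			 * The reservation guarantees at least one empty slot
			 * in block i stays available for us; claim it by CAS.
			 */
			for (int j = 0; ; j = (j + 1) % SLOTS_PER_BLOCK) {
				uint32_t expected = 0;

				if (atomic_compare_exchange_strong(
						&blocks[i].slot[j], &expected, ino))
					return true;
			}
		}
		i = (i + 1) % NBLOCKS;
	} while (i != start);

	return false;	/* file full: caller falls back to the linked list */
}

int main(void)
{
	for (int i = 0; i < NBLOCKS; i++)
		atomic_init(&blocks[i].free_entries, SLOTS_PER_BLOCK);

	/* The kernel derives the start block from the CPU id; hard-code it here. */
	printf("slot reserved: %s\n", orphan_add(42, 1) ? "yes" : "no");
	return 0;
}

The decrement-before-claim ordering is the key invariant: once the per-block counter has been successfully decremented, at least one empty slot in that block is guaranteed to remain for the caller, which is why the claim loop can spin without a lock.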
Performance comparison of locked orphan file handling, lockless orphan file handling, and completely disabled orphan inode handling from 80 CPU Xeon Server with 526 GB of RAM, filesystem located on SAS SSD disk, average of 5 runs: stress-orphan (microbenchmark truncating files byte-by-byte from N processes in parallel) Threads Time Time Time Orphan locked Orphan lockless No orphan 1 0.945600 0.939400 0.891200 2 1.331800 1.246600 1.174400 4 1.995000 1.780600 1.713200 8 6.424200 4.900000 4.106000 16 14.937600 8.516400 8.138000 32 33.038200 24.565600 24.002200 64 60.823600 39.844600 38.440200 128 122.941400 70.950400 69.315000 So we can see that with lockless orphan file handling, addition / deletion of orphaned inodes got almost completely out of picture even for a microbenchmark stressing it. For reaim creat_clo workload on ramdisk there are also noticeable gains (average of 5 runs): Clients Vanilla (ops/s) Patched (ops/s) creat_clo-1 14705.88 ( 0.00%) 14354.07 * -2.39%* creat_clo-3 27108.43 ( 0.00%) 28301.89 ( 4.40%) creat_clo-5 37406.48 ( 0.00%) 45180.73 * 20.78%* creat_clo-7 41338.58 ( 0.00%) 54687.50 * 32.29%* creat_clo-9 45226.13 ( 0.00%) 62937.07 * 39.16%* creat_clo-11 44000.00 ( 0.00%) 65088.76 * 47.93%* creat_clo-13 36516.85 ( 0.00%) 68661.97 * 88.03%* creat_clo-15 30864.20 ( 0.00%) 69551.78 * 125.35%* creat_clo-17 27478.45 ( 0.00%) 67729.08 * 146.48%* creat_clo-19 25000.00 ( 0.00%) 61621.62 * 146.49%* creat_clo-21 18772.35 ( 0.00%) 63829.79 * 240.02%* creat_clo-23 16698.94 ( 0.00%) 61938.96 * 270.92%* creat_clo-25 14973.05 ( 0.00%) 56947.61 * 280.33%* creat_clo-27 16436.69 ( 0.00%) 65008.03 * 295.51%* creat_clo-29 13949.01 ( 0.00%) 69047.62 * 395.00%* creat_clo-31 14283.52 ( 0.00%) 67982.45 * 375.95%* Reviewed-by: Theodore Ts'o <tytso@mit.edu> Reviewed-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210816095713.16537-5-jack@suse.cz Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/ext4.h | 3 +- fs/ext4/orphan.c | 77 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ae79797b9d60..98758e8ea7fc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1489,7 +1489,7 @@ static inline int ext4_inodes_per_orphan_block(struct super_block *sb) } struct ext4_orphan_block { - int ob_free_entries; /* Number of free orphan entries in block */ + atomic_t ob_free_entries; /* Number of free orphan entries in block */ struct buffer_head *ob_bh; /* Buffer for orphan block */ }; @@ -1497,7 +1497,6 @@ struct ext4_orphan_block { * Info about orphan file. */ struct ext4_orphan_info { - spinlock_t of_lock; int of_blocks; /* Number of orphan blocks in a file */ __u32 of_csum_seed; /* Checksum seed for orphan file */ struct ext4_orphan_block *of_binfo; /* Array with info about orphan diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index ac928f60234a..53adc8f570a3 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -10,16 +10,31 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { - int i, j; + int i, j, start; struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; int ret = 0; + bool found = false; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); + int looped = 0; - spin_lock(&oi->of_lock); - for (i = 0; i < oi->of_blocks && !oi->of_binfo[i].ob_free_entries; i++); - if (i == oi->of_blocks) { - spin_unlock(&oi->of_lock); + /* + * Find block with free orphan entry. 
Use CPU number for a naive hash + * for a search start in the orphan file + */ + start = raw_smp_processor_id()*13 % oi->of_blocks; + i = start; + do { + if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) + >= 0) { + found = true; + break; + } + if (++i >= oi->of_blocks) + i = 0; + } while (i != start); + + if (!found) { /* * For now we don't grow or shrink orphan file. We just use * whatever was allocated at mke2fs time. The additional @@ -28,28 +43,43 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) */ return -ENOSPC; } - oi->of_binfo[i].ob_free_entries--; - spin_unlock(&oi->of_lock); - /* - * Get access to orphan block. We have dropped of_lock but since we - * have decremented number of free entries we are guaranteed free entry - * in our block. - */ ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); - if (ret) + if (ret) { + atomic_inc(&oi->of_binfo[i].ob_free_entries); return ret; + } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); - spin_lock(&oi->of_lock); /* Find empty slot in a block */ - for (j = 0; j < inodes_per_ob && bdata[j]; j++); - BUG_ON(j == inodes_per_ob); - bdata[j] = cpu_to_le32(inode->i_ino); + j = 0; + do { + if (looped) { + /* + * Did we walk through the block several times without + * finding free entry? It is theoretically possible + * if entries get constantly allocated and freed or + * if the block is corrupted. Avoid indefinite looping + * and bail. We'll use orphan list instead. + */ + if (looped > 3) { + atomic_inc(&oi->of_binfo[i].ob_free_entries); + return -ENOSPC; + } + cond_resched(); + } + while (bdata[j]) { + if (++j >= inodes_per_ob) { + j = 0; + looped++; + } + } + } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != + (__le32)0); + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); - spin_unlock(&oi->of_lock); return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); } @@ -180,10 +210,8 @@ static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) goto out; bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); - spin_lock(&oi->of_lock); bdata[off] = 0; - oi->of_binfo[blk].ob_free_entries++; - spin_unlock(&oi->of_lock); + atomic_inc(&oi->of_binfo[blk].ob_free_entries); ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); out: ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); @@ -552,8 +580,6 @@ int ext4_init_orphan_info(struct super_block *sb) struct ext4_orphan_block_tail *ot; ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); - spin_lock_init(&oi->of_lock); - if (!ext4_has_feature_orphan_file(sb)) return 0; @@ -597,7 +623,7 @@ int ext4_init_orphan_info(struct super_block *sb) for (j = 0; j < inodes_per_ob; j++) if (bdata[j] == 0) free++; - oi->of_binfo[i].ob_free_entries = free; + atomic_set(&oi->of_binfo[i].ob_free_entries, free); } iput(inode); return 0; @@ -619,7 +645,8 @@ int ext4_orphan_file_empty(struct super_block *sb) if (!ext4_has_feature_orphan_file(sb)) return 1; for (i = 0; i < oi->of_blocks; i++) - if (oi->of_binfo[i].ob_free_entries != inodes_per_ob) + if (atomic_read(&oi->of_binfo[i].ob_free_entries) != + inodes_per_ob) return 0; return 1; } From 0904c9ae3465c7acc066a564a76b75c0af83e6c7 Mon Sep 17 00:00:00 2001 From: Zhang Yi <yi.zhang@huawei.com> Date: Thu, 26 Aug 2021 21:04:07 +0800 Subject: [PATCH 21/23] ext4: move inode eio simulation behind io completeion No EIO simulation is required if the buffer is 
uptodate, so move the simulation behind read bio completeion just like inode/block bitmap simulation does. Signed-off-by: Zhang Yi <yi.zhang@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210826130412.3921207-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8fc0dad9fa62..532443f89015 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4337,8 +4337,6 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; - if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) - goto simulate_eio; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -4425,8 +4423,8 @@ make_io: ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); blk_finish_plug(&plug); wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { - simulate_eio: if (ret_block) *ret_block = block; brelse(bh); From 8e33fadf945a918c50d92fab6a769661df484156 Mon Sep 17 00:00:00 2001 From: Zhang Yi <yi.zhang@huawei.com> Date: Thu, 26 Aug 2021 21:04:08 +0800 Subject: [PATCH 22/23] ext4: remove an unnecessary if statement in __ext4_get_inode_loc() The "if (!buffer_uptodate(bh))" hunk covered almost the whole code after getting buffer in __ext4_get_inode_loc() which seems unnecessary, remove it and switch to check ext4_buffer_uptodate(), it simplify code and make it more readable. Signed-off-by: Zhang Yi <yi.zhang@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210826130412.3921207-3-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/inode.c | 164 +++++++++++++++++++++++------------------------- 1 file changed, 79 insertions(+), 85 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 532443f89015..c26f946cf04a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4337,99 +4337,93 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; - if (!buffer_uptodate(bh)) { - lock_buffer(bh); + if (ext4_buffer_uptodate(bh)) + goto has_buffer; - if (ext4_buffer_uptodate(bh)) { - /* someone brought it uptodate while we waited */ + lock_buffer(bh); + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + int i, start; + + start = inode_offset & ~(inodes_per_block - 1); + + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); + if (unlikely(!bitmap_bh)) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_block; i++) { + if (i == inode_offset) + continue; + if (ext4_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_block) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } - - /* - * If we have all information of the inode in memory and this - * is the only valid inode in the block, we need not read the - * block. 
- */ - if (in_mem) { - struct buffer_head *bitmap_bh; - int i, start; - - start = inode_offset & ~(inodes_per_block - 1); - - /* Is the inode bitmap in cache? */ - bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); - if (unlikely(!bitmap_bh)) - goto make_io; - - /* - * If the inode bitmap isn't in cache then the - * optimisation may end up performing two reads instead - * of one, so skip it. - */ - if (!buffer_uptodate(bitmap_bh)) { - brelse(bitmap_bh); - goto make_io; - } - for (i = start; i < start + inodes_per_block; i++) { - if (i == inode_offset) - continue; - if (ext4_test_bit(i, bitmap_bh->b_data)) - break; - } - brelse(bitmap_bh); - if (i == start + inodes_per_block) { - /* all other inodes are free, so skip I/O */ - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - unlock_buffer(bh); - goto has_buffer; - } - } + } make_io: - /* - * If we need to do any I/O, try to pre-readahead extra - * blocks from the inode table. - */ - blk_start_plug(&plug); - if (EXT4_SB(sb)->s_inode_readahead_blks) { - ext4_fsblk_t b, end, table; - unsigned num; - __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; + /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + blk_start_plug(&plug); + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; - table = ext4_inode_table(sb, gdp); - /* s_inode_readahead_blks is always a power of 2 */ - b = block & ~((ext4_fsblk_t) ra_blks - 1); - if (table > b) - b = table; - end = b + ra_blks; - num = EXT4_INODES_PER_GROUP(sb); - if (ext4_has_group_desc_csum(sb)) - num -= ext4_itable_unused_count(sb, gdp); - table += num / inodes_per_block; - if (end > table) - end = table; - while (b <= end) - ext4_sb_breadahead_unmovable(sb, b++); - } + table = ext4_inode_table(sb, gdp); + /* s_inode_readahead_blks is always a power of 2 */ + b = block & ~((ext4_fsblk_t) ra_blks - 1); + if (table > b) + b = table; + end = b + ra_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (ext4_has_group_desc_csum(sb)) + num -= ext4_itable_unused_count(sb, gdp); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + ext4_sb_breadahead_unmovable(sb, b++); + } - /* - * There are other valid inodes in the buffer, this inode - * has in-inode xattrs, or we don't have this inode in memory. - * Read the block from disk. - */ - trace_ext4_load_inode(sb, ino); - ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); - blk_finish_plug(&plug); - wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); - if (!buffer_uptodate(bh)) { - if (ret_block) - *ret_block = block; - brelse(bh); - return -EIO; - } + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. 
+ */ + trace_ext4_load_inode(sb, ino); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); + blk_finish_plug(&plug); + wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); + if (!buffer_uptodate(bh)) { + if (ret_block) + *ret_block = block; + brelse(bh); + return -EIO; } has_buffer: iloc->bh = bh; From baaae979b112642a41b71c71c599d875c067d257 Mon Sep 17 00:00:00 2001 From: Zhang Yi <yi.zhang@huawei.com> Date: Thu, 26 Aug 2021 21:04:09 +0800 Subject: [PATCH 23/23] ext4: make the updating inode data procedure atomic Now that ext4_do_update_inode() return error before filling the whole inode data if we fail to set inode blocks in ext4_inode_blocks_set(). This error should never happen in theory since sb->s_maxbytes should not have allowed this, we have already init sb->s_maxbytes according to this feature in ext4_fill_super(). So even through that could only happen due to the filesystem corruption, we'd better to return after we finish updating the inode because it may left an uninitialized buffer and we could read this buffer later in "errors=continue" mode. This patch make the updating inode data procedure atomic, call EXT4_ERROR_INODE() after we dropping i_raw_lock after something bad happened, make sure that the inode is integrated, and also drop a BUG_ON and do some small cleanups. Signed-off-by: Zhang Yi <yi.zhang@huawei.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20210826130412.3921207-4-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> --- fs/ext4/inode.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c26f946cf04a..62e9165bc69c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4932,8 +4932,14 @@ static int ext4_inode_blocks_set(handle_t *handle, ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } + + /* + * This should never happen since sb->s_maxbytes should not have + * allowed this, sb->s_maxbytes was set according to the huge_file + * feature in ext4_fill_super(). + */ if (!ext4_has_feature_huge_file(sb)) - return -EFBIG; + return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* @@ -5036,16 +5042,14 @@ static int ext4_do_update_inode(handle_t *handle, spin_lock(&ei->i_raw_lock); - /* For fields not tracked in the in-memory inode, - * initialise them to zero for new inodes. */ + /* + * For fields not tracked in the in-memory inode, initialise them + * to zero for new inodes. + */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); err = ext4_inode_blocks_set(handle, raw_inode, ei); - if (err) { - spin_unlock(&ei->i_raw_lock); - goto out_brelse; - } raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); @@ -5054,10 +5058,11 @@ static int ext4_do_update_inode(handle_t *handle, if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); -/* - * Fix up interoperability with old kernels. Otherwise, old inodes get - * re-used with the upper 16 bits of the uid/gid intact - */ + /* + * Fix up interoperability with old kernels. Otherwise, + * old inodes get re-used with the upper 16 bits of the + * uid/gid intact. 
+ */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; @@ -5126,8 +5131,9 @@ static int ext4_do_update_inode(handle_t *handle, } } - BUG_ON(!ext4_has_feature_project(inode->i_sb) && - i_projid != EXT4_DEF_PROJID); + if (i_projid != EXT4_DEF_PROJID && + !ext4_has_feature_project(inode->i_sb)) + err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) @@ -5135,6 +5141,11 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); + if (err) { + EXT4_ERROR_INODE(inode, "corrupted inode contents"); + goto out_brelse; + } + if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); @@ -5142,7 +5153,7 @@ static int ext4_do_update_inode(handle_t *handle, BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) - goto out_brelse; + goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); @@ -5150,7 +5161,7 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) - goto out_brelse; + goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); @@ -5160,9 +5171,10 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); +out_error: + ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); - ext4_std_error(inode->i_sb, err); return err; }
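To close, here is a minimal user-space C sketch of the pattern the last patch applies to ext4_do_update_inode(): record any validation error, finish filling the on-disk structure anyway, drop the lock, and only then report the error. The structure layout, field names, and the plain -1 error code are assumptions for illustration, not the actual ext4 code.

#include <pthread.h>
#include <stdio.h>

struct raw_inode {
	unsigned long blocks;
	unsigned int mode;
};

struct mem_inode {
	pthread_spinlock_t raw_lock;
	unsigned long blocks;
	unsigned int mode;
	int huge_file_ok;	/* stands in for the huge_file feature check */
};

static int update_raw_inode(struct mem_inode *in, struct raw_inode *raw)
{
	int err = 0;

	pthread_spin_lock(&in->raw_lock);

	/* Validate, but do not bail out mid-update: just remember the error. */
	if (in->blocks > 0xffffffffUL && !in->huge_file_ok)
		err = -1;	/* stands in for -EFSCORRUPTED */

	/* Always write out a fully initialised on-disk structure. */
	raw->blocks = in->blocks;
	raw->mode = in->mode;

	pthread_spin_unlock(&in->raw_lock);

	/* Report the problem only after the lock has been dropped. */
	if (err)
		fprintf(stderr, "inode update: corrupted inode contents\n");
	return err;
}

int main(void)
{
	struct mem_inode in = { .blocks = 12, .mode = 0644, .huge_file_ok = 1 };
	struct raw_inode raw;

	pthread_spin_init(&in.raw_lock, PTHREAD_PROCESS_PRIVATE);
	return update_raw_inode(&in, &raw) ? 1 : 0;
}

Deferring the report keeps the on-disk buffer fully initialised even on the error path, which is the point of the patch: a later read in errors=continue mode never sees a half-written inode.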