From ad431025aecda85d3ebef5e4a3aca5c1c681d0c7 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:10:39 -0400 Subject: [PATCH 01/21] ext4: generalize extents status tree search functions Ext4 contains a few functions that are used to search for delayed extents or blocks in the extents status tree. Rather than duplicate code to add new functions to search for extents with different status values, such as written or a combination of delayed and unwritten, generalize the existing code to search for caller-specified extents status values. Also, move this code into extents_status.c where it is better associated with the data structures it operates upon, and where it can be more readily used to implement new extents status tree functions that might want a broader scope for i_es_lock. Three missing static specifiers in RFC version of patch reported and fixed by Fengguang Wu . Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 - fs/ext4/extents.c | 52 +++---------- fs/ext4/extents_status.c | 149 +++++++++++++++++++++++++++++++----- fs/ext4/extents_status.h | 13 +++- fs/ext4/inode.c | 17 ++-- include/trace/events/ext4.h | 4 +- 6 files changed, 165 insertions(+), 74 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index caff935fbeb8..ad2c215720be 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3142,10 +3142,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 72a361d5ef74..95796f00e4e6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2351,8 +2351,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, { struct extent_status es; - ext4_es_find_delayed_extent_range(inode, hole_start, - hole_start + hole_len - 1, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, + hole_start + hole_len - 1, &es); if (es.es_len) { /* There's delayed extent containing lblock? */ if (es.es_lblk <= hole_start) @@ -3819,39 +3819,6 @@ out: return ext4_mark_inode_dirty(handle, inode); } -/** - * ext4_find_delalloc_range: find delayed allocated block in the given range. - * - * Return 1 if there is a delalloc block in the range, otherwise 0. - */ -int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end) -{ - struct extent_status es; - - ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); - if (es.es_len == 0) - return 0; /* there is no delay extent in this tree */ - else if (es.es_lblk <= lblk_start && - lblk_start < es.es_lblk + es.es_len) - return 1; - else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) - return 1; - else - return 0; -} - -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t lblk_start, lblk_end; - lblk_start = EXT4_LBLK_CMASK(sbi, lblk); - lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - - return ext4_find_delalloc_range(inode, lblk_start, lblk_end); -} - /** * Determines how many complete clusters (out of those specified by the 'map') * are under delalloc and were reserved quota for. @@ -3910,7 +3877,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from, + lblk_to)) allocated_clusters--; } @@ -3920,7 +3888,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from, + lblk_to)) allocated_clusters--; } @@ -5075,8 +5044,10 @@ static int ext4_find_delayed_extent(struct inode *inode, ext4_lblk_t block, next_del; if (newes->es_pblk == 0) { - ext4_es_find_delayed_extent_range(inode, newes->es_lblk, - newes->es_lblk + newes->es_len - 1, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + newes->es_lblk, + newes->es_lblk + newes->es_len - 1, + &es); /* * No extent in extent-tree contains block @newes->es_pblk, @@ -5097,7 +5068,8 @@ static int ext4_find_delayed_extent(struct inode *inode, } block = newes->es_lblk + newes->es_len; - ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block, + EXT_MAX_BLOCKS, &es); if (es.es_len == 0) next_del = EXT_MAX_BLOCKS; else diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c4e6fb15101b..8530fbd3012d 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -233,30 +233,38 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering - * @es->lblk if it exists, otherwise, the next extent after @es->lblk. + * ext4_es_find_extent_range - find extent with specified status within block + * range or next extent following block range in + * extents status tree * - * @inode: the inode which owns delayed extents - * @lblk: the offset where we start to search - * @end: the offset where we stop to search - * @es: delayed extent that we found + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * @es - extent found, if any + * + * Find the first extent within the block range specified by @lblk and @end + * in the extents status tree that satisfies @matching_fn. If a match + * is found, it's returned in @es. If not, and a matching extent is found + * beyond the block range, it's returned in @es. If no match is found, an + * extent is returned in @es whose es_lblk, es_len, and es_pblk components + * are 0. */ -void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es) +static void __es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; - BUG_ON(es == NULL); - BUG_ON(end < lblk); - trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); + WARN_ON(es == NULL); + WARN_ON(end < lblk); - read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; - /* find extent in cache firstly */ + /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; if (tree->cache_es) { es1 = tree->cache_es; @@ -271,28 +279,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, es1 = __es_tree_search(&tree->root, lblk); out: - if (es1 && !ext4_es_is_delayed(es1)) { + if (es1 && !matching_fn(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) { es1 = NULL; break; } - if (ext4_es_is_delayed(es1)) + if (matching_fn(es1)) break; } } - if (es1 && ext4_es_is_delayed(es1)) { + if (es1 && matching_fn(es1)) { tree->cache_es = es1; es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; } +} + +/* + * Locking for __es_find_extent_range() for external use + */ +void ext4_es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + trace_ext4_es_find_extent_range_enter(inode, lblk); + + read_lock(&EXT4_I(inode)->i_es_lock); + __es_find_extent_range(inode, matching_fn, lblk, end, es); read_unlock(&EXT4_I(inode)->i_es_lock); - trace_ext4_es_find_delayed_extent_range_exit(inode, es); + trace_ext4_es_find_extent_range_exit(inode, es); +} + +/* + * __es_scan_range - search block range for block with specified status + * in extents status tree + * + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * + * Returns true if at least one block in the specified block range satisfies + * the criterion specified by @matching_fn, and false if not. If at least + * one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. + */ +static bool __es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t start, ext4_lblk_t end) +{ + struct extent_status es; + + __es_find_extent_range(inode, matching_fn, start, end, &es); + if (es.es_len == 0) + return false; /* no matching extent in the tree */ + else if (es.es_lblk <= start && + start < es.es_lblk + es.es_len) + return true; + else if (start <= es.es_lblk && es.es_lblk <= end) + return true; + else + return false; +} +/* + * Locking for __es_scan_range() for external use + */ +bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end) +{ + bool ret; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_range(inode, matching_fn, lblk, end); + read_unlock(&EXT4_I(inode)->i_es_lock); + + return ret; +} + +/* + * __es_scan_clu - search cluster for block with specified status in + * extents status tree + * + * @inode - file containing the cluster + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block in cluster to be searched + * + * Returns true if at least one extent in the cluster containing @lblk + * satisfies the criterion specified by @matching_fn, and false if not. If at + * least one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. + */ +static bool __es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); +} + +/* + * Locking for __es_scan_clu() for external use + */ +bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + bool ret; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_clu(inode, matching_fn, lblk); + read_unlock(&EXT4_I(inode)->i_es_lock); + + return ret; } static void ext4_es_list_add(struct inode *inode) diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8efdeb903d6b..df9628c3ec3b 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -90,11 +90,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, unsigned int status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d767e993591d..b83bf3308b5e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -577,8 +577,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@ -701,8 +701,8 @@ found: EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@ -1681,7 +1681,7 @@ static void ext4_da_page_release_reservation(struct page *page, lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk)) + !ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) ext4_da_release_space(inode, 1); num_clusters--; @@ -1859,6 +1859,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, add_delayed: if (retval == 0) { int ret; + /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? @@ -1869,7 +1870,8 @@ add_delayed: * to reserve metadata for every block we're going to write. */ if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, map->m_lblk)) { + !ext4_es_scan_clu(inode, + &ext4_es_is_delayed, map->m_lblk)) { ret = ext4_da_reserve_space(inode); if (ret) { /* not enough space to reserve */ @@ -3450,7 +3452,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, ext4_lblk_t end = map.m_lblk + map.m_len - 1; struct extent_status es; - ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map.m_lblk, end, &es); if (!es.es_len || es.es_lblk > end) { /* entire range is a hole */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0e31eb136c57..7849b7f8fd9d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2270,7 +2270,7 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->lblk, __entry->len) ); -TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, +TRACE_EVENT(ext4_es_find_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), @@ -2292,7 +2292,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, (unsigned long) __entry->ino, __entry->lblk) ); -TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, +TRACE_EVENT(ext4_es_find_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), From 1dc0aa46e74a3366e12f426b7caaca477853e9c3 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:17:41 -0400 Subject: [PATCH 02/21] ext4: add new pending reservation mechanism Add new pending reservation mechanism to help manage reserved cluster accounting. Its primary function is to avoid the need to read extents from the disk when invalidating pages as a result of a truncate, punch hole, or collapse range operation. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 + fs/ext4/extents_status.c | 187 +++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 51 +++++++++++ fs/ext4/super.c | 8 ++ 4 files changed, 249 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ad2c215720be..fc0f41dbf90b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1030,6 +1030,9 @@ struct ext4_inode_info { ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + /* on-disk additional length */ __u16 i_extra_isize; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 8530fbd3012d..194785ce890a 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -142,6 +142,7 @@ */ static struct kmem_cache *ext4_es_cachep; +static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, @@ -1365,3 +1366,189 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) ei->i_es_tree.cache_es = NULL; return nr_shrunk; } + +#ifdef ES_DEBUG__ +static void ext4_print_pending_tree(struct inode *inode) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr; + + printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_pending_tree; + node = rb_first(&tree->root); + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + printk(KERN_DEBUG " %u", pr->lclu); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_print_pending_tree(inode) +#endif + +int __init ext4_init_pending(void) +{ + ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", + sizeof(struct pending_reservation), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); + if (ext4_pending_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_pending(void) +{ + kmem_cache_destroy(ext4_pending_cachep); +} + +void ext4_init_pending_tree(struct ext4_pending_tree *tree) +{ + tree->root = RB_ROOT; +} + +/* + * __get_pending - retrieve a pointer to a pending reservation + * + * @inode - file containing the pending cluster reservation + * @lclu - logical cluster of interest + * + * Returns a pointer to a pending reservation if it's a member of + * the set, and NULL if not. Must be called holding i_es_lock. + */ +static struct pending_reservation *__get_pending(struct inode *inode, + ext4_lblk_t lclu) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr = NULL; + + tree = &EXT4_I(inode)->i_pending_tree; + node = (&tree->root)->rb_node; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else if (lclu == pr->lclu) + return pr; + } + return NULL; +} + +/* + * __insert_pending - adds a pending cluster reservation to the set of + * pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster to be added + * + * Returns 0 on successful insertion and -ENOMEM on failure. If the + * pending reservation is already in the set, returns successfully. + */ +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct pending_reservation *pr; + ext4_lblk_t lclu; + int ret = 0; + + lclu = EXT4_B2C(sbi, lblk); + /* search to find parent for insertion */ + while (*p) { + parent = *p; + pr = rb_entry(parent, struct pending_reservation, rb_node); + + if (lclu < pr->lclu) { + p = &(*p)->rb_left; + } else if (lclu > pr->lclu) { + p = &(*p)->rb_right; + } else { + /* pending reservation already inserted */ + goto out; + } + } + + pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); + if (pr == NULL) { + ret = -ENOMEM; + goto out; + } + pr->lclu = lclu; + + rb_link_node(&pr->rb_node, parent, p); + rb_insert_color(&pr->rb_node, &tree->root); + +out: + return ret; +} + +/* + * __remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Returns successfully if pending reservation is not a member of the set. + */ +static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + } +} + +/* + * ext4_remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Locking for external use of __remove_pending. + */ +void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + write_lock(&ei->i_es_lock); + __remove_pending(inode, lblk); + write_unlock(&ei->i_es_lock); +} + +/* + * ext4_is_pending - determine whether a cluster has a pending reservation + * on it + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster + * + * Returns true if there's a pending reservation for the cluster in the + * set of pending reservations, and false if not. + */ +bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + bool ret; + + read_lock(&ei->i_es_lock); + ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); + read_unlock(&ei->i_es_lock); + + return ret; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index df9628c3ec3b..379b7171c67c 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -78,6 +78,51 @@ struct ext4_es_stats { struct percpu_counter es_stats_shk_cnt; }; +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. + * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. + */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); @@ -182,4 +227,10 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); + #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1145109968ef..faf293ed8060 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1040,6 +1040,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_da_metadata_calc_len = 0; ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); @@ -5954,6 +5955,10 @@ static int __init ext4_init_fs(void) if (err) return err; + err = ext4_init_pending(); + if (err) + goto out6; + err = ext4_init_pageio(); if (err) goto out5; @@ -5992,6 +5997,8 @@ out3: out4: ext4_exit_pageio(); out5: + ext4_exit_pending(); +out6: ext4_exit_es(); return err; @@ -6009,6 +6016,7 @@ static void __exit ext4_exit_fs(void) ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_es(); + ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); From 0b02f4c0d6d9e2c611dfbdd4317193e9dca740e6 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:19:37 -0400 Subject: [PATCH 03/21] ext4: fix reserved cluster accounting at delayed write time The code in ext4_da_map_blocks sometimes reserves space for more delayed allocated clusters than it should, resulting in premature ENOSPC, exceeded quota, and inaccurate free space reporting. Fix this by checking for written and unwritten blocks shared in the same cluster with the newly delayed allocated block. A cluster reservation should not be made for a cluster for which physical space has already been allocated. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 79 +++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.c | 53 +++++++++++++++++++++++++ fs/ext4/extents_status.h | 12 ++++++ fs/ext4/inode.c | 79 ++++++++++++++++++++++++++++--------- include/trace/events/ext4.h | 35 ++++++++++++++++ 6 files changed, 241 insertions(+), 18 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fc0f41dbf90b..d85fd5c8a2c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3155,6 +3155,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 95796f00e4e6..26481e543312 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5930,3 +5930,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, } return replaced_count; } + +/* + * ext4_clu_mapped - determine whether any block in a logical cluster has + * been mapped to a physical cluster + * + * @inode - file containing the logical cluster + * @lclu - logical cluster of interest + * + * Returns 1 if any block in the logical cluster is mapped, signifying + * that a physical cluster has been allocated for it. Otherwise, + * returns 0. Can also return negative error codes. Derived from + * ext4_ext_map_blocks(). + */ +int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_ext_path *path; + int depth, mapped = 0, err = 0; + struct ext4_extent *extent; + ext4_lblk_t first_lblk, first_lclu, last_lclu; + + /* search for the extent closest to the first block in the cluster */ + path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out; + } + + depth = ext_depth(inode); + + /* + * A consistent leaf must not be empty. This situation is possible, + * though, _during_ tree modification, and it's why an assert can't + * be put in ext4_find_extent(). + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, + "bad extent address - lblock: %lu, depth: %d, pblock: %lld", + (unsigned long) EXT4_C2B(sbi, lclu), + depth, path[depth].p_block); + err = -EFSCORRUPTED; + goto out; + } + + extent = path[depth].p_ext; + + /* can't be mapped if the extent tree is empty */ + if (extent == NULL) + goto out; + + first_lblk = le32_to_cpu(extent->ee_block); + first_lclu = EXT4_B2C(sbi, first_lblk); + + /* + * Three possible outcomes at this point - found extent spanning + * the target cluster, to the left of the target cluster, or to the + * right of the target cluster. The first two cases are handled here. + * The last case indicates the target cluster is not mapped. + */ + if (lclu >= first_lclu) { + last_lclu = EXT4_B2C(sbi, first_lblk + + ext4_ext_get_actual_len(extent) - 1); + if (lclu <= last_lclu) { + mapped = 1; + } else { + first_lblk = ext4_ext_next_allocated_block(path); + first_lclu = EXT4_B2C(sbi, first_lblk); + if (lclu == first_lclu) + mapped = 1; + } + } + +out: + ext4_ext_drop_refs(path); + kfree(path); + + return err ? err : mapped; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 194785ce890a..c5d456e12062 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1552,3 +1552,56 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) return ret; } + +/* + * ext4_es_insert_delayed_block - adds a delayed block to the extents status + * tree, adding a pending reservation where + * needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * @allocated - indicates whether a physical cluster has been allocated for + * the logical cluster that contains the block + * + * Returns 0 on success, negative error code on failure. + */ +int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated) +{ + struct extent_status newes; + int err = 0; + + es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", + lblk, inode->i_ino); + + newes.es_lblk = lblk; + newes.es_len = 1; + ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); + trace_ext4_es_insert_delayed_block(inode, &newes, allocated); + + ext4_es_insert_extent_check(inode, &newes); + + write_lock(&EXT4_I(inode)->i_es_lock); + + err = __es_remove_extent(inode, lblk, lblk); + if (err != 0) + goto error; +retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) + goto retry; + if (err != 0) + goto error; + + if (allocated) + __insert_pending(inode, lblk); + +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + ext4_print_pending_tree(inode); + + return err; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 379b7171c67c..9d3c676ec623 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -178,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es) return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; @@ -232,5 +242,7 @@ extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b83bf3308b5e..57c6dd38f071 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1780,6 +1780,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } +/* + * ext4_insert_delayed_block - adds a delayed block to the extents status + * tree, incrementing the reserved cluster/block + * count or making a pending reservation + * where needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * + * Returns 0 on success, negative error code on failure. + */ +static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + bool allocated = false; + + /* + * If the cluster containing lblk is shared with a delayed, + * written, or unwritten extent in a bigalloc file system, it's + * already been accounted for and does not need to be reserved. + * A pending reservation must be made for the cluster if it's + * shared with a written or unwritten extent and doesn't already + * have one. Written and unwritten extents can be purged from the + * extents status tree if the system is under memory pressure, so + * it's necessary to examine the extent tree if a search of the + * extents status tree doesn't get a match. + */ + if (sbi->s_cluster_ratio == 1) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { /* bigalloc */ + if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { + if (!ext4_es_scan_clu(inode, + &ext4_es_is_mapped, lblk)) { + ret = ext4_clu_mapped(inode, + EXT4_B2C(sbi, lblk)); + if (ret < 0) + goto errout; + if (ret == 0) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { + allocated = true; + } + } else { + allocated = true; + } + } + } + + ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + +errout: + return ret; +} + /* * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write @@ -1864,25 +1923,9 @@ add_delayed: * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* - * If the block was allocated from previously allocated cluster, - * then we don't need to reserve it again. However we still need - * to reserve metadata for every block we're going to write. - */ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || - !ext4_es_scan_clu(inode, - &ext4_es_is_delayed, map->m_lblk)) { - ret = ext4_da_reserve_space(inode); - if (ret) { - /* not enough space to reserve */ - retval = ret; - goto out_unlock; - } - } - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - ~0, EXTENT_STATUS_DELAYED); - if (ret) { + ret = ext4_insert_delayed_block(inode, map->m_lblk); + if (ret != 0) { retval = ret; goto out_unlock; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 7849b7f8fd9d..6d7a943f849c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2512,6 +2512,41 @@ TRACE_EVENT(ext4_es_shrink, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); +TRACE_EVENT(ext4_es_insert_delayed_block, + TP_PROTO(struct inode *inode, struct extent_status *es, + bool allocated), + + TP_ARGS(inode, es, allocated), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, lblk ) + __field( ext4_lblk_t, len ) + __field( ext4_fsblk_t, pblk ) + __field( char, status ) + __field( bool, allocated ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->lblk = es->es_lblk; + __entry->len = es->es_len; + __entry->pblk = ext4_es_pblock(es); + __entry->status = ext4_es_status(es); + __entry->allocated = allocated; + ), + + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " + "allocated %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->lblk, __entry->len, + __entry->pblk, show_extent_status(__entry->status), + __entry->allocated) +); + /* fsmap traces */ DECLARE_EVENT_CLASS(ext4_fsmap_class, TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, From b6bf9171ef5c37b66d446378ba63af5339a56a97 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:24:08 -0400 Subject: [PATCH 04/21] ext4: reduce reserved cluster count by number of allocated clusters Ext4 does not always reduce the reserved cluster count by the number of clusters allocated when mapping a delayed extent. It sometimes adds back one or more clusters after allocation if delalloc blocks adjacent to the range allocated by ext4_ext_map_blocks() share the clusters newly allocated for that range. However, this overcounts the number of clusters needed to satisfy future mapping requests (holding one or more reservations for clusters that have already been allocated) and premature ENOSPC and quota failures, etc., result. Ext4 also does not reduce the reserved cluster count when allocating clusters for non-delayed allocated writes that have previously been reserved for delayed writes. This also results in overcounts. To make it possible to handle reserved cluster accounting for fallocated regions in the same manner as used for other non-delayed writes, do the reserved cluster accounting for them at the time of allocation. In the current code, this is only done later when a delayed extent sharing the fallocated region is finally mapped. Address comment correcting handling of unsigned long long constant from Jan Kara's review of RFC version of this patch. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 188 ++++++--------------------------------- fs/ext4/extents_status.c | 175 ++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 4 + 3 files changed, 207 insertions(+), 160 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 26481e543312..b52ac813ca20 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3819,83 +3819,6 @@ out: return ext4_mark_inode_dirty(handle, inode); } -/** - * Determines how many complete clusters (out of those specified by the 'map') - * are under delalloc and were reserved quota for. - * This function is called when we are writing out the blocks that were - * originally written with their allocation delayed, but then the space was - * allocated using fallocate() before the delayed allocation could be resolved. - * The cases to look for are: - * ('=' indicated delayed allocated blocks - * '-' indicates non-delayed allocated blocks) - * (a) partial clusters towards beginning and/or end outside of allocated range - * are not delalloc'ed. - * Ex: - * |----c---=|====c====|====c====|===-c----| - * |++++++ allocated ++++++| - * ==> 4 complete clusters in above example - * - * (b) partial cluster (outside of allocated range) towards either end is - * marked for delayed allocation. In this case, we will exclude that - * cluster. - * Ex: - * |----====c========|========c========| - * |++++++ allocated ++++++| - * ==> 1 complete clusters in above example - * - * Ex: - * |================c================| - * |++++++ allocated ++++++| - * ==> 0 complete clusters in above example - * - * The ext4_da_update_reserve_space will be called only if we - * determine here that there were some "entire" clusters that span - * this 'allocated' range. - * In the non-bigalloc case, this function will just end up returning num_blks - * without ever calling ext4_find_delalloc_range. - */ -static unsigned int -get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, - unsigned int num_blks) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t alloc_cluster_start, alloc_cluster_end; - ext4_lblk_t lblk_from, lblk_to, c_offset; - unsigned int allocated_clusters = 0; - - alloc_cluster_start = EXT4_B2C(sbi, lblk_start); - alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); - - /* max possible clusters for this allocation */ - allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; - - trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); - - /* Check towards left side */ - c_offset = EXT4_LBLK_COFF(sbi, lblk_start); - if (c_offset) { - lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); - lblk_to = lblk_from + c_offset - 1; - - if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from, - lblk_to)) - allocated_clusters--; - } - - /* Now check towards right. */ - c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); - if (allocated_clusters && c_offset) { - lblk_from = lblk_start + num_blks; - lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - - if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from, - lblk_to)) - allocated_clusters--; - } - - return allocated_clusters; -} - static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -4077,23 +4000,6 @@ out: } map->m_len = allocated; - /* - * If we have done fallocate with the offset that is already - * delayed allocated, we would have block reservation - * and quota reservation done in the delayed write path. - * But fallocate would have already updated quota and block - * count for this offset. So cancel these reservation - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, map->m_len); - if (reserved_clusters) - ext4_da_update_reserve_space(inode, - reserved_clusters, - 0); - } - map_out: map->m_flags |= EXT4_MAP_MAPPED; if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { @@ -4482,77 +4388,39 @@ got_allocated_blocks: map->m_flags |= EXT4_MAP_NEW; /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. + * Reduce the reserved cluster count to reflect successful deferred + * allocation of delayed allocated clusters or direct allocation of + * clusters discovered to be delayed allocated. Once allocated, a + * cluster is not included in the reserved count. */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - /* - * Check how many clusters we had reserved this allocated range - */ - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, allocated); - if (!map_from_cluster) { - BUG_ON(allocated_clusters < reserved_clusters); - if (reserved_clusters < allocated_clusters) { - struct ext4_inode_info *ei = EXT4_I(inode); - int reservation = allocated_clusters - - reserved_clusters; - /* - * It seems we claimed few clusters outside of - * the range of this allocation. We should give - * it back to the reservation pool. This can - * happen in the following case: - * - * * Suppose s_cluster_ratio is 4 (i.e., each - * cluster has 4 blocks. Thus, the clusters - * are [0-3],[4-7],[8-11]... - * * First comes delayed allocation write for - * logical blocks 10 & 11. Since there were no - * previous delayed allocated blocks in the - * range [8-11], we would reserve 1 cluster - * for this write. - * * Next comes write for logical blocks 3 to 8. - * In this case, we will reserve 2 clusters - * (for [0-3] and [4-7]; and not for [8-11] as - * that range has a delayed allocated blocks. - * Thus total reserved clusters now becomes 3. - * * Now, during the delayed allocation writeout - * time, we will first write blocks [3-8] and - * allocate 3 clusters for writing these - * blocks. Also, we would claim all these - * three clusters above. - * * Now when we come here to writeout the - * blocks [10-11], we would expect to claim - * the reservation of 1 cluster we had made - * (and we would claim it since there are no - * more delayed allocated blocks in the range - * [8-11]. But our reserved cluster count had - * already gone to 0. - * - * Thus, at the step 4 above when we determine - * that there are still some unwritten delayed - * allocated blocks outside of our current - * block range, we should increment the - * reserved clusters count so that when the - * remaining blocks finally gets written, we - * could claim them. - */ - dquot_reserve_block(inode, - EXT4_C2B(sbi, reservation)); - spin_lock(&ei->i_block_reservation_lock); - ei->i_reserved_data_blocks += reservation; - spin_unlock(&ei->i_block_reservation_lock); - } + if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { /* - * We will claim quota for all newly allocated blocks. - * We're updating the reserved space *after* the - * correction above so we do not accidentally free - * all the metadata reservation because we might - * actually need it later on. + * When allocating delayed allocated clusters, simply + * reduce the reserved cluster count and claim quota */ ext4_da_update_reserve_space(inode, allocated_clusters, 1); + } else { + ext4_lblk_t lblk, len; + unsigned int n; + + /* + * When allocating non-delayed allocated clusters + * (from fallocate, filemap, DIO, or clusters + * allocated when delalloc has been disabled by + * ext4_nonda_switch), reduce the reserved cluster + * count by the number of allocated clusters that + * have previously been delayed allocated. Quota + * has been claimed by ext4_mb_new_blocks() above, + * so release the quota reservations made for any + * previously delayed allocated clusters. + */ + lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); + len = allocated_clusters << sbi->s_cluster_bits; + n = ext4_es_delayed_clu(inode, lblk, len); + if (n > 0) + ext4_da_update_reserve_space(inode, (int) n, 0); } } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c5d456e12062..c92fbf444d08 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -150,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); int __init ext4_init_es(void) { @@ -808,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); @@ -844,6 +847,11 @@ retry: if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) err = 0; + if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && + (status & EXTENT_STATUS_WRITTEN || + status & EXTENT_STATUS_UNWRITTEN)) + __revise_pending(inode, lblk, len); + error: write_unlock(&EXT4_I(inode)->i_es_lock); @@ -1605,3 +1613,170 @@ error: return err; } + +/* + * __es_delayed_clu - count number of clusters containing blocks that + * are delayed only + * + * @inode - file containing block range + * @start - logical block defining start of range + * @end - logical block defining end of range + * + * Returns the number of clusters containing only delayed (not delayed + * and unwritten) blocks in the range specified by @start and @end. Any + * cluster or part of a cluster within the range and containing a delayed + * and not unwritten block within the range is counted as a whole cluster. + */ +static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + unsigned long long last_counted_lclu; + unsigned int n = 0; + + /* guaranteed to be unequal to any ext4_lblk_t value */ + last_counted_lclu = ~0ULL; + + es = __es_tree_search(&tree->root, start); + + while (es && (es->es_lblk <= end)) { + if (ext4_es_is_delonly(es)) { + if (es->es_lblk <= start) + first_lclu = EXT4_B2C(sbi, start); + else + first_lclu = EXT4_B2C(sbi, es->es_lblk); + + if (ext4_es_end(es) >= end) + last_lclu = EXT4_B2C(sbi, end); + else + last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); + + if (first_lclu == last_counted_lclu) + n += last_lclu - first_lclu; + else + n += last_lclu - first_lclu + 1; + last_counted_lclu = last_lclu; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + + return n; +} + +/* + * ext4_es_delayed_clu - count number of clusters containing blocks that + * are both delayed and unwritten + * + * @inode - file containing block range + * @lblk - logical block defining start of range + * @len - number of blocks in range + * + * Locking for external use of __es_delayed_clu(). + */ +unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_lblk_t end; + unsigned int n; + + if (len == 0) + return 0; + + end = lblk + len - 1; + WARN_ON(end < lblk); + + read_lock(&ei->i_es_lock); + + n = __es_delayed_clu(inode, lblk, end); + + read_unlock(&ei->i_es_lock); + + return n; +} + +/* + * __revise_pending - makes, cancels, or leaves unchanged pending cluster + * reservations for a specified block range depending + * upon the presence or absence of delayed blocks + * outside the range within clusters at the ends of the + * range + * + * @inode - file containing the range + * @lblk - logical block defining the start of range + * @len - length of range in blocks + * + * Used after a newly allocated extent is added to the extents status tree. + * Requires that the extents in the range have either written or unwritten + * status. Must be called while holding i_es_lock. + */ +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t end = lblk + len - 1; + ext4_lblk_t first, last; + bool f_del = false, l_del = false; + + if (len == 0) + return; + + /* + * Two cases - block range within single cluster and block range + * spanning two or more clusters. Note that a cluster belonging + * to a range starting and/or ending on a cluster boundary is treated + * as if it does not contain a delayed extent. The new range may + * have allocated space for previously delayed blocks out to the + * cluster boundary, requiring that any pre-existing pending + * reservation be canceled. Because this code only looks at blocks + * outside the range, it should revise pending reservations + * correctly even if the extent represented by the range can't be + * inserted in the extents status tree due to ENOSPC. + */ + + if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) { + __insert_pending(inode, first); + } else { + last = EXT4_LBLK_CMASK(sbi, end) + + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, + &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } + } else { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) + __insert_pending(inode, first); + else + __remove_pending(inode, first); + + last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 9d3c676ec623..131a8b7df265 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -244,5 +244,9 @@ extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); #endif /* _EXT4_EXTENTS_STATUS_H */ From 9fe671496b6c286f9033aedfc1718d67721da0ae Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:25:08 -0400 Subject: [PATCH 05/21] ext4: adjust reserved cluster count when removing extents Modify ext4_ext_remove_space() and the code it calls to correct the reserved cluster count for pending reservations (delayed allocated clusters shared with allocated blocks) when a block range is removed from the extent tree. Pending reservations may be found for the clusters at the ends of written or unwritten extents when a block range is removed. If a physical cluster at the end of an extent is freed, it's necessary to increment the reserved cluster count to maintain correct accounting if the corresponding logical cluster is shared with at least one delayed and unwritten extent as found in the extents status tree. Add a new function, ext4_rereserve_cluster(), to reapply a reservation on a delayed allocated cluster sharing blocks with a freed allocated cluster. To avoid ENOSPC on reservation, a flag is applied to ext4_free_blocks() to briefly defer updating the freeclusters counter when an allocated cluster is freed. This prevents another thread from allocating the freed block before the reservation can be reapplied. Redefine the partial cluster object as a struct to carry more state information and to clarify the code using it. Adjust the conditional code structure in ext4_ext_remove_space to reduce the indentation level in the main body of the code to improve readability. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/ext4_extents.h | 13 ++ fs/ext4/extents.c | 286 ++++++++++++++++++++++-------------- fs/ext4/mballoc.c | 14 +- include/trace/events/ext4.h | 60 +++++--- 5 files changed, 239 insertions(+), 135 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d85fd5c8a2c4..0bdbbd151d2c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -628,6 +628,7 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 /* * ioctl commands diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index adf6668b596f..98bd0e9ee7df 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -119,6 +119,19 @@ struct ext4_ext_path { struct buffer_head *p_bh; }; +/* + * Used to record a portion of a cluster found at the beginning or end + * of an extent while traversing the extent tree during space removal. + * A partial cluster may be removed if it does not contain blocks shared + * with extents that aren't being deleted (tofree state). Otherwise, + * it cannot be removed (nofree state). + */ +struct partial_cluster { + ext4_fsblk_t pclu; /* physical cluster number */ + ext4_lblk_t lblk; /* logical block number within logical cluster */ + enum {initial, tofree, nofree} state; +}; + /* * structure for external API */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b52ac813ca20..240b6dea5441 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode) return 0; } +/* + * ext4_rereserve_cluster - increment the reserved cluster count when + * freeing a cluster with a pending reservation + * + * @inode - file containing the cluster + * @lblk - logical block in cluster to be reserved + * + * Increments the reserved cluster count and adjusts quota in a bigalloc + * file system when freeing a partial cluster containing at least one + * delayed and unwritten block. A partial cluster meeting that + * requirement will have a pending reservation. If so, the + * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to + * defer reserved and allocated space accounting to a subsequent call + * to this function. + */ +static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); + + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); + spin_unlock(&ei->i_block_reservation_lock); + + percpu_counter_add(&sbi->s_freeclusters_counter, 1); + ext4_remove_pending(inode, lblk); +} + static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); - ext4_fsblk_t pblk; - int flags = get_default_free_blocks_flags(inode); + ext4_fsblk_t last_pblk, pblk; + ext4_lblk_t num; + int flags; + + /* only extent tail removal is allowed */ + if (from < le32_to_cpu(ex->ee_block) || + to != le32_to_cpu(ex->ee_block) + ee_len - 1) { + ext4_error(sbi->s_sb, + "strange request: removal(2) %u-%u from %u:%u", + from, to, le32_to_cpu(ex->ee_block), ee_len); + return 0; + } + +#ifdef EXTENTS_STATS + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); +#endif + + trace_ext4_remove_blocks(inode, ex, from, to, partial); + + /* + * if we have a partial cluster, and it's different from the + * cluster of the last block in the extent, we free it + */ + last_pblk = ext4_ext_pblock(ex) + ee_len - 1; + + if (partial->state != initial && + partial->pclu != EXT4_B2C(sbi, last_pblk)) { + if (partial->state == tofree) { + flags = get_default_free_blocks_flags(inode); + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); + } + partial->state = initial; + } + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + pblk = ext4_ext_pblock(ex) + ee_len - num; + + /* + * We free the partial cluster at the end of the extent (if any), + * unless the cluster is used by another extent (partial_cluster + * state is nofree). If a partial cluster exists here, it must be + * shared with the last block in the extent. + */ + flags = get_default_free_blocks_flags(inode); + + /* partial, left end cluster aligned, right end unaligned */ + if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && + (EXT4_LBLK_CMASK(sbi, to) >= from) && + (partial->state != nofree)) { + if (ext4_is_pending(inode, to)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_PBLK_CMASK(sbi, last_pblk), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, to); + partial->state = initial; + flags = get_default_free_blocks_flags(inode); + } + + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; /* * For bigalloc file systems, we never free a partial cluster - * at the beginning of the extent. Instead, we make a note - * that we tried freeing the cluster, and check to see if we + * at the beginning of the extent. Instead, we check to see if we * need to free it on a subsequent call to ext4_remove_blocks, * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + + /* reset the partial cluster if we've freed past it */ + if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) + partial->state = initial; - trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); /* - * If we have a partial cluster, and it's different from the - * cluster of the last block, we need to explicitly free the - * partial cluster here. + * If we've freed the entire extent but the beginning is not left + * cluster aligned and is not marked as ineligible for freeing we + * record the partial cluster at the beginning of the extent. It + * wasn't freed by the preceding ext4_free_blocks() call, and we + * need to look farther to the left to determine if it's to be freed + * (not shared with another extent). Else, reset the partial + * cluster - we're either done freeing or the beginning of the + * extent is left cluster aligned. */ - pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster > 0 && - *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { - ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, flags); - *partial_cluster = 0; - } - -#ifdef EXTENTS_STATS - { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - spin_lock(&sbi->s_ext_stats_lock); - sbi->s_ext_blocks += ee_len; - sbi->s_ext_extents++; - if (ee_len < sbi->s_ext_min) - sbi->s_ext_min = ee_len; - if (ee_len > sbi->s_ext_max) - sbi->s_ext_max = ee_len; - if (ext_depth(inode) > sbi->s_depth_max) - sbi->s_depth_max = ext_depth(inode); - spin_unlock(&sbi->s_ext_stats_lock); - } -#endif - if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* tail removal */ - ext4_lblk_t num; - long long first_cluster; - - num = le32_to_cpu(ex->ee_block) + ee_len - from; - pblk = ext4_ext_pblock(ex) + ee_len - num; - /* - * Usually we want to free partial cluster at the end of the - * extent, except for the situation when the cluster is still - * used by any other extent (partial_cluster is negative). - */ - if (*partial_cluster < 0 && - *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) - flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; - - ext_debug("free last %u blocks starting %llu partial %lld\n", - num, pblk, *partial_cluster); - ext4_free_blocks(handle, inode, NULL, pblk, num, flags); - /* - * If the block range to be freed didn't start at the - * beginning of a cluster, and we removed the entire - * extent and the cluster is not used by any other extent, - * save the partial cluster here, since we might need to - * delete if we determine that the truncate or punch hole - * operation has removed all of the blocks in the cluster. - * If that cluster is used by another extent, preserve its - * negative value so it isn't freed later on. - * - * If the whole extent wasn't freed, we've reached the - * start of the truncated/punched region and have finished - * removing blocks. If there's a partial cluster here it's - * shared with the remainder of the extent and is no longer - * a candidate for removal. - */ - if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { - first_cluster = (long long) EXT4_B2C(sbi, pblk); - if (first_cluster != -*partial_cluster) - *partial_cluster = first_cluster; - } else { - *partial_cluster = 0; + if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { + if (partial->state == initial) { + partial->pclu = EXT4_B2C(sbi, pblk); + partial->lblk = from; + partial->state = tofree; } - } else - ext4_error(sbi->s_sb, "strange request: removal(2) " - "%u-%u from %u:%u", - from, to, le32_to_cpu(ex->ee_block), ee_len); + } else { + partial->state = initial; + } + return 0; } - /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". Both "start" @@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); - *partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial->pclu = EXT4_B2C(sbi, pblk); + partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, partial_cluster, - a, b); + err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; @@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent - * we zero partial_cluster because we've reached the start of the + * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. */ - if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; - if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + if (partial->pclu != EXT4_B2C(sbi, pblk)) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); } - *partial_cluster = 0; + partial->state = initial; } /* if this leaf is free, then we should @@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; - long long partial_cluster = 0; + struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + partial.pclu = 0; + partial.lblk = 0; + partial.state = initial; + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ @@ -2882,8 +2941,8 @@ again: */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 2; - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; } /* @@ -2911,9 +2970,10 @@ again: &ex); if (err) goto out; - if (pblk) - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + if (pblk) { + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; + } } } /* @@ -2948,8 +3008,7 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, start, - end); + &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3021,21 +3080,24 @@ again: } } - trace_ext4_ext_remove_space_done(inode, start, end, depth, - partial_cluster, path->p_hdr->eh_entries); + trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, + path->p_hdr->eh_entries); /* - * If we still have something in the partial cluster and we have removed - * even the first extent, then we should free the blocks in the partial - * cluster as well. (This code will only run when there are no leaves - * to the immediate left of the truncated/punched region.) + * if there's a partial cluster and we have removed the first extent + * in the file, then we also free the partial cluster, if any */ - if (partial_cluster > 0 && err == 0) { - /* don't zero partial_cluster since it's not used afterwards */ + if (partial.state == tofree && err == 0) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial.lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial.pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial.lblk); + partial.state = initial; } /* TODO: flexible tree reduction should be here */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..e2248083cdca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4915,9 +4915,17 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + /* + * on a bigalloc file system, defer the s_freeclusters_counter + * update to the caller (ext4_remove_space and friends) so they + * can determine if a cluster freed here should be rereserved + */ + if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, + count_clusters); + } ext4_mb_unload_buddy(&e4b); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6d7a943f849c..698e0d8a5ca4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -17,6 +17,7 @@ struct mpage_da_data; struct ext4_map_blocks; struct extent_status; struct ext4_fsmap; +struct partial_cluster; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -2035,21 +2036,23 @@ TRACE_EVENT(ext4_ext_show_extent, ); TRACE_EVENT(ext4_remove_blocks, - TP_PROTO(struct inode *inode, struct ext4_extent *ex, - ext4_lblk_t from, ext4_fsblk_t to, - long long partial_cluster), + TP_PROTO(struct inode *inode, struct ext4_extent *ex, + ext4_lblk_t from, ext4_fsblk_t to, + struct partial_cluster *pc), - TP_ARGS(inode, ex, from, to, partial_cluster), + TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) - __field( long long, partial ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state) ), TP_fast_assign( @@ -2057,14 +2060,16 @@ TRACE_EVENT(ext4_remove_blocks, __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; - __entry->partial = partial_cluster; __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" - "from %u to %u partial_cluster %lld", + "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->ee_lblk, @@ -2072,45 +2077,53 @@ TRACE_EVENT(ext4_remove_blocks, (unsigned short) __entry->ee_len, (unsigned) __entry->from, (unsigned) __entry->to, - (long long) __entry->partial) + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_leaf, TP_PROTO(struct inode *inode, ext4_lblk_t start, struct ext4_extent *ex, - long long partial_cluster), + struct partial_cluster *pc), - TP_ARGS(inode, start, ex, partial_cluster), + TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( long long, partial ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->partial = partial_cluster; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" - "partial_cluster %lld", + "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, - (long long) __entry->partial) + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_idx, @@ -2168,9 +2181,9 @@ TRACE_EVENT(ext4_ext_remove_space, TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, - int depth, long long partial, __le16 eh_entries), + int depth, struct partial_cluster *pc, __le16 eh_entries), - TP_ARGS(inode, start, end, depth, partial, eh_entries), + TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( __field( dev_t, dev ) @@ -2178,7 +2191,9 @@ TRACE_EVENT(ext4_ext_remove_space_done, __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) - __field( long long, partial ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state ) __field( unsigned short, eh_entries ) ), @@ -2188,18 +2203,23 @@ TRACE_EVENT(ext4_ext_remove_space_done, __entry->start = start; __entry->end = end; __entry->depth = depth; - __entry->partial = partial; + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; __entry->eh_entries = le16_to_cpu(eh_entries); ), - TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld " + TP_printk("dev %d,%d ino %lu since %u end %u depth %d " + "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, - (long long) __entry->partial, + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state, (unsigned short) __entry->eh_entries) ); From f456767d3391e9f7d9d25a2e7241d75676dc19da Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:33:24 -0400 Subject: [PATCH 06/21] ext4: fix reserved cluster accounting at page invalidation time Add new code to count canceled pending cluster reservations on bigalloc file systems and to reduce the cluster reservation count on all file systems using delayed allocation. This replaces old code in ext4_da_page_release_reservations that was incorrect. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents_status.c | 90 ++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 23 ++-------- 3 files changed, 95 insertions(+), 19 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0bdbbd151d2c..57cbc98d730f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2491,6 +2491,7 @@ extern int ext4_page_mkwrite(struct vm_fault *vmf); extern int ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c92fbf444d08..2b439afafe13 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1780,3 +1780,93 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, __remove_pending(inode, last); } } + +/* + * ext4_es_remove_blks - remove block range from extents status tree and + * reduce reservation count or cancel pending + * reservation as needed + * + * @inode - file containing range + * @lblk - first block in range + * @len - number of blocks to remove + * + */ +void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int clu_size, reserved = 0; + ext4_lblk_t last_lclu, first, length, remainder, last; + bool delonly; + int err = 0; + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + /* + * Process cluster by cluster for bigalloc - there may be up to + * two clusters in a 4k page with a 1k block size and two blocks + * per cluster. Also necessary for systems with larger page sizes + * and potentially larger block sizes. + */ + clu_size = sbi->s_cluster_ratio; + last_lclu = EXT4_B2C(sbi, lblk + len - 1); + + write_lock(&EXT4_I(inode)->i_es_lock); + + for (first = lblk, remainder = len; + remainder > 0; + first += length, remainder -= length) { + + if (EXT4_B2C(sbi, first) == last_lclu) + length = remainder; + else + length = clu_size - EXT4_LBLK_COFF(sbi, first); + + /* + * The BH_Delay flag, which triggers calls to this function, + * and the contents of the extents status tree can be + * inconsistent due to writepages activity. So, note whether + * the blocks to be removed actually belong to an extent with + * delayed only status. + */ + delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first); + + /* + * because of the writepages effect, written and unwritten + * blocks could be removed here + */ + last = first + length - 1; + err = __es_remove_extent(inode, first, last); + if (err) + ext4_warning(inode->i_sb, + "%s: couldn't remove page (err = %d)", + __func__, err); + + /* non-bigalloc case: simply count the cluster for release */ + if (sbi->s_cluster_ratio == 1 && delonly) { + reserved++; + continue; + } + + /* + * bigalloc case: if all delayed allocated only blocks have + * just been removed from a cluster, either cancel a pending + * reservation if it exists or count a cluster for release + */ + if (delonly && + !__es_scan_clu(inode, &ext4_es_is_delonly, first)) { + pr = __get_pending(inode, EXT4_B2C(sbi, first)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + } else { + reserved++; + } + } + } + + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_da_release_space(inode, reserved); +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 57c6dd38f071..9b69f88bdacc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode) return 0; /* success */ } -static void ext4_da_release_space(struct inode *inode, int to_free) +void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page, unsigned int offset, unsigned int length) { - int to_release = 0, contiguous_blks = 0; + int contiguous_blks = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int stop = offset + length; - int num_clusters; ext4_fsblk_t lblk; BUG_ON(stop > PAGE_SIZE || stop < length); @@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page, break; if ((offset <= curr_off) && (buffer_delay(bh))) { - to_release++; contiguous_blks++; clear_buffer_delay(bh); } else if (contiguous_blks) { @@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page, (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); contiguous_blks = 0; } curr_off = next_off; @@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page, if (contiguous_blks) { lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); } - /* If we have released all the blocks belonging to a cluster, then we - * need to release the reserved space for that cluster. */ - num_clusters = EXT4_NUM_B2C(sbi, to_release); - while (num_clusters > 0) { - lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + - ((num_clusters - 1) << sbi->s_cluster_bits); - if (sbi->s_cluster_ratio == 1 || - !ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) - ext4_da_release_space(inode, 1); - - num_clusters--; - } } /* From f18b2b83a727a3db208308057d2c7945f368e625 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 2 Oct 2018 01:34:44 -0400 Subject: [PATCH 07/21] ext4: fix argument checking in EXT4_IOC_MOVE_EXT If the starting block number of either the source or destination file exceeds the EOF, EXT4_IOC_MOVE_EXT should return EINVAL. Also fixed the helper function mext_check_coverage() so that if the logical block is beyond EOF, make it return immediately, instead of looping until the block number wraps all the away around. This takes long enough that if there are multiple threads trying to do pound on an the same inode doing non-sensical things, it can end up triggering the kernel's soft lockup detector. Reported-by: syzbot+c61979f6f2cba5cb3c06@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/move_extent.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a409ff70d67b..2f5be02fc6f6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_eof < orig_start + *len - 1) + if (orig_eof <= orig_start) + *len = 0; + else if (orig_eof < orig_start + *len - 1) *len = orig_eof - orig_start; - if (donor_eof < donor_start + *len - 1) + if (donor_eof <= donor_start) + *len = 0; + else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " From 799578ab16e86b074c184ec5abbda0bc698c7b0b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 2 Oct 2018 12:43:51 -0400 Subject: [PATCH 08/21] ext4: fix build error when DX_DEBUG is defined MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enabling DX_DEBUG triggers the build error below. info is an attribute of the dxroot structure. linux/fs/ext4/namei.c:2264:12: error: ‘info’ undeclared (first use in this function); did you mean ‘insl’? info->indirect_levels)); Fixes: e08ac99fa2a2 ("ext4: add largedir feature") Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Theodore Ts'o Reviewed-by: Lukas Czerner --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 377d516c475f..67a38532032a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2261,7 +2261,7 @@ again: dxroot->info.indirect_levels += 1; dxtrace(printk(KERN_DEBUG "Creating %d level index...\n", - info->indirect_levels)); + dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; From 18aded17492088962ef43f00825179598b3e8c58 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 2 Oct 2018 18:21:19 -0400 Subject: [PATCH 09/21] ext4: fix EXT4_IOC_SWAP_BOOT The code EXT4_IOC_SWAP_BOOT ioctl hasn't been updated in a while, and it's a bit broken with respect to more modern ext4 kernels, especially metadata checksums. Other problems fixed with this commit: * Don't allow installing a DAX, swap file, or an encrypted file as a boot loader. * Respect the immutable and append-only flags. * Wait until any DIO operations are finished *before* calling truncate_inode_pages(). * Don't swap inode->i_flags, since these flags have nothing to do with the inode blocks --- and it will give the IMA/audit code heartburn when the inode is evicted. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Reported-by: syzbot+e81ccd4744c6c4f71354@syzkaller.appspotmail.com --- fs/ext4/ioctl.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a7074115d6f6..d7ed7487e630 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); - swap(inode1->i_flags, inode2->i_flags); swap(inode1->i_version, inode2->i_version); swap(inode1->i_blocks, inode2->i_blocks); swap(inode1->i_bytes, inode2->i_bytes); @@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) i_size_write(inode2, isize); } +static void reset_inode_seed(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + __u32 csum; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); +} + /** * Swap the information from the given @inode and the inode * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other @@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb, struct inode *inode_bl; struct ext4_inode_info *ei_bl; - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + ext4_has_inline_data(inode)) return -EINVAL; - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) return -EPERM; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); @@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); - truncate_inode_pages(&inode->i_data, 0); - truncate_inode_pages(&inode_bl->i_data, 0); - /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; @@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_generation = prandom_u32(); inode_bl->i_generation = prandom_u32(); + reset_inode_seed(inode); + reset_inode_seed(inode_bl); ext4_discard_preallocations(inode); @@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); } else { err = ext4_mark_inode_dirty(handle, inode_bl); if (err < 0) { @@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); } } ext4_journal_stop(handle); From 625ef8a3acd111d5f496d190baf99d1a815bd03e Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 2 Oct 2018 21:18:45 -0400 Subject: [PATCH 10/21] ext4: initialize retries variable in ext4_da_write_inline_data_begin() Variable retries is not initialized in ext4_da_write_inline_data_begin() which can lead to nondeterministic number of retries in case we hit ENOSPC. Initialize retries to zero as we do everywhere else. Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Fixes: bc0ca9df3b2a ("ext4: retry allocation when inline->extent conversion failed") Cc: stable@kernel.org --- fs/ext4/inline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7b4736022761..9c4bac18cc6c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, handle_t *handle; struct page *page; struct ext4_iloc iloc; - int retries; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) From 401b25aa1a75e7fe4e3202a6336604269697d705 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 2 Oct 2018 22:20:50 -0400 Subject: [PATCH 11/21] ext4: convert fault handler to use vm_fault_t type Return type of ext4_page_mkwrite and ext4_filemap_fault are changed to use vm_fault_t type. With this patch all the callers of block_page_mkwrite_return() are changed to handle vm_fault_t. So converting the return type of block_page_mkwrite_return() to vm_fault_t. Signed-off-by: Souptick Joarder Signed-off-by: Theodore Ts'o Reviewed-by: Matthew Wilcox --- fs/ext4/ext4.h | 4 ++-- fs/ext4/inode.c | 29 +++++++++++++++-------------- include/linux/buffer_head.h | 2 +- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 57cbc98d730f..86e1bacac757 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2487,8 +2487,8 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); -extern int ext4_page_mkwrite(struct vm_fault *vmf); -extern int ext4_filemap_fault(struct vm_fault *vmf); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b69f88bdacc..c3d9a42c561e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6184,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_fault *vmf) +vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; loff_t size; unsigned long len; - int ret; + int err; + vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; @@ -6203,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf) down_read(&EXT4_I(inode)->i_mmap_sem); - ret = ext4_convert_inline_data(inode); - if (ret) + err = ext4_convert_inline_data(inode); + if (err) goto out_ret; /* Delalloc case is easy... */ @@ -6212,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf) !ext4_should_journal_data(inode) && !ext4_nonda_switch(inode->i_sb)) { do { - ret = block_page_mkwrite(vma, vmf, + err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); - } while (ret == -ENOSPC && + } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } @@ -6259,8 +6260,8 @@ retry_alloc: ret = VM_FAULT_SIGBUS; goto out; } - ret = block_page_mkwrite(vma, vmf, get_block); - if (!ret && ext4_should_journal_data(inode)) { + err = block_page_mkwrite(vma, vmf, get_block); + if (!err && ext4_should_journal_data(inode)) { if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); @@ -6271,24 +6272,24 @@ retry_alloc: ext4_set_inode_state(inode, EXT4_STATE_JDATA); } ext4_journal_stop(handle); - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(ret); + ret = block_page_mkwrite_return(err); out: up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } -int ext4_filemap_fault(struct vm_fault *vmf) +vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - int err; + vm_fault_t ret; down_read(&EXT4_I(inode)->i_mmap_sem); - err = filemap_fault(vmf); + ret = filemap_fault(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); - return err; + return ret; } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 96225a77c112..7b73ef7f902d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -242,7 +242,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); /* Convert errno to return value from ->page_mkwrite() call */ -static inline int block_page_mkwrite_return(int err) +static inline vm_fault_t block_page_mkwrite_return(int err) { if (err == 0) return VM_FAULT_LOCKED; From e5f0926115a4a40ed1cd0d3ce8b09bb88be73ab9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Oct 2018 22:40:32 -0400 Subject: [PATCH 12/21] docs: generate a separate ext4 pdf file from the documentation The documentation build scripts won't build a pdf for the ext4 documentation unless explicitly called for, so ask for a separate ext4.pdf to be generated with all the documentation. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- Documentation/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/conf.py b/Documentation/conf.py index b691af4831fa..05dad6bda787 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -383,6 +383,8 @@ latex_documents = [ 'The kernel development community', 'manual'), ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', 'The kernel development community', 'manual'), + ('filesystems/ext4/index', 'ext4.tex', 'ext4 Filesystem', + 'ext4 Filesystem Developers', 'manual'), ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', 'The kernel development community', 'manual'), ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', From de7abd7bbb73d67f90c6fb48d4b2debe54f6f46e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Oct 2018 22:43:40 -0400 Subject: [PATCH 13/21] docs: fix ext4 documentation table formatting problems It turns out that the latex table formatters lay out table columns with the exact proportional widths given in the table metadata, even if text overflows outside the box. This was not caught during the initial import because the HTML renderers are smart enough to fudge the table. Fix the table column width formatting problems in the data structures and algorithms documentation so that we don't have squashed columns. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- .../filesystems/ext4/ondisk/attributes.rst | 8 ++--- .../filesystems/ext4/ondisk/checksums.rst | 2 +- .../filesystems/ext4/ondisk/directory.rst | 18 +++++------ .../filesystems/ext4/ondisk/group_descr.rst | 4 +-- .../filesystems/ext4/ondisk/ifork.rst | 8 ++--- .../filesystems/ext4/ondisk/inodes.rst | 19 +++++------ .../filesystems/ext4/ondisk/journal.rst | 32 +++++++++---------- Documentation/filesystems/ext4/ondisk/mmp.rst | 2 +- .../ext4/ondisk/special_inodes.rst | 2 +- .../filesystems/ext4/ondisk/super.rst | 24 +++++++------- 10 files changed, 60 insertions(+), 59 deletions(-) diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/ondisk/attributes.rst index 0b01b67b81fe..54386a010a8d 100644 --- a/Documentation/filesystems/ext4/ondisk/attributes.rst +++ b/Documentation/filesystems/ext4/ondisk/attributes.rst @@ -30,7 +30,7 @@ Extended attributes, when stored after the inode, have a header ``ext4_xattr_ibody_header`` that is 4 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -47,7 +47,7 @@ The beginning of an extended attribute block is in ``struct ext4_xattr_header``, which is 32 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -92,7 +92,7 @@ entries must be stored in sorted order. The sort order is Attributes stored inside an inode do not need be stored in sorted order. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -157,7 +157,7 @@ attribute name index field is set, and matching string is removed from the key name. Here is a map of name index values to key prefixes: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Name Index diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/ondisk/checksums.rst index 9d6a793b2e03..5519e253810d 100644 --- a/Documentation/filesystems/ext4/ondisk/checksums.rst +++ b/Documentation/filesystems/ext4/ondisk/checksums.rst @@ -28,7 +28,7 @@ of checksum. The checksum function is whatever the superblock describes (crc32c as of October 2013) unless noted otherwise. .. list-table:: - :widths: 1 1 4 + :widths: 20 8 50 :header-rows: 1 * - Metadata diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/ondisk/directory.rst index 8fcba68c2884..614034e24669 100644 --- a/Documentation/filesystems/ext4/ondisk/directory.rst +++ b/Documentation/filesystems/ext4/ondisk/directory.rst @@ -34,7 +34,7 @@ is at most 263 bytes long, though on disk you'll need to reference ``dirent.rec_len`` to know for sure. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -66,7 +66,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most ``dirent.rec_len`` to know for sure. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -99,7 +99,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most The directory file type is one of the following values: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -130,7 +130,7 @@ in the place where the name normally goes. The structure is ``struct ext4_dir_entry_tail``: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -212,7 +212,7 @@ The root of the htree is in ``struct dx_root``, which is the full length of a data block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -305,7 +305,7 @@ of a data block: The directory hash is one of the following values: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -327,7 +327,7 @@ Interior nodes of an htree are recorded as ``struct dx_node``, which is also the full length of a data block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -375,7 +375,7 @@ The hash maps that exist in both ``struct dx_root`` and long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -405,7 +405,7 @@ directory index (which will ensure that there's space for the checksum. The dx\_tail structure is 8 bytes long and looks like this: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/ondisk/group_descr.rst index 759827e5d2cf..0f783ed88592 100644 --- a/Documentation/filesystems/ext4/ondisk/group_descr.rst +++ b/Documentation/filesystems/ext4/ondisk/group_descr.rst @@ -43,7 +43,7 @@ entire bitmap. The block group descriptor is laid out in ``struct ext4_group_desc``. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -157,7 +157,7 @@ The block group descriptor is laid out in ``struct ext4_group_desc``. Block group flags can be any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ondisk/ifork.rst index 5dbe3b2b121a..b9816d5a896b 100644 --- a/Documentation/filesystems/ext4/ondisk/ifork.rst +++ b/Documentation/filesystems/ext4/ondisk/ifork.rst @@ -68,7 +68,7 @@ The extent tree header is recorded in ``struct ext4_extent_header``, which is 12 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -104,7 +104,7 @@ Internal nodes of the extent tree, also known as index nodes, are recorded as ``struct ext4_extent_idx``, and are 12 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -134,7 +134,7 @@ Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, and are also 12 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -174,7 +174,7 @@ including) the checksum itself. ``struct ext4_extent_tail`` is 4 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/ondisk/inodes.rst index 655ce898f3f5..6bd35e506b6f 100644 --- a/Documentation/filesystems/ext4/ondisk/inodes.rst +++ b/Documentation/filesystems/ext4/ondisk/inodes.rst @@ -29,8 +29,9 @@ and the inode structure itself. The inode table entry is laid out in ``struct ext4_inode``. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 + :class: longtable * - Offset - Size @@ -176,7 +177,7 @@ The inode table entry is laid out in ``struct ext4_inode``. The ``i_mode`` value is a combination of the following flags: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -227,7 +228,7 @@ The ``i_mode`` value is a combination of the following flags: The ``i_flags`` field is a combination of these values: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -314,7 +315,7 @@ The ``osd1`` field has multiple meanings depending on the creator: Linux: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -331,7 +332,7 @@ Linux: Hurd: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -346,7 +347,7 @@ Hurd: Masix: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -365,7 +366,7 @@ The ``osd2`` field has multiple meanings depending on the filesystem creator: Linux: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -402,7 +403,7 @@ Linux: Hurd: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -433,7 +434,7 @@ Hurd: Masix: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/ondisk/journal.rst index e7031af86876..ea613ee701f5 100644 --- a/Documentation/filesystems/ext4/ondisk/journal.rst +++ b/Documentation/filesystems/ext4/ondisk/journal.rst @@ -48,7 +48,7 @@ Layout Generally speaking, the journal has this format: .. list-table:: - :widths: 1 1 78 + :widths: 16 48 16 :header-rows: 1 * - Superblock @@ -76,7 +76,7 @@ The journal superblock will be in the next full block after the superblock. .. list-table:: - :widths: 1 1 1 1 76 + :widths: 12 12 12 32 12 :header-rows: 1 * - 1024 bytes of padding @@ -98,7 +98,7 @@ Every block in the journal starts with a common 12-byte header ``struct journal_header_s``: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -124,7 +124,7 @@ Every block in the journal starts with a common 12-byte header The journal block type can be any one of: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -154,7 +154,7 @@ The journal superblock is recorded as ``struct journal_superblock_s``, which is 1024 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -264,7 +264,7 @@ which is 1024 bytes long: The journal compat features are any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -278,7 +278,7 @@ The journal compat features are any combination of the following: The journal incompat features are any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -306,7 +306,7 @@ Journal checksum type codes are one of the following. crc32 or crc32c are the most likely choices. .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -330,7 +330,7 @@ described by a data structure, but here is the block structure anyway. Descriptor blocks consume at least 36 bytes, but use a full block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -355,7 +355,7 @@ defined as ``struct journal_block_tag3_s``, which looks like the following. The size is 16 or 32 bytes. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -400,7 +400,7 @@ following. The size is 16 or 32 bytes. The journal tag flags are any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -421,7 +421,7 @@ is defined as ``struct journal_block_tag_s``, which looks like the following. The size is 8, 12, 24, or 28 bytes: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -471,7 +471,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a ``struct jbd2_journal_block_tail``, which looks like this: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -513,7 +513,7 @@ Revocation blocks are described in length, but use a full block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -543,7 +543,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation block is a ``struct jbd2_journal_revoke_tail``, which has this format: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -567,7 +567,7 @@ The commit block is described by ``struct commit_header``, which is 32 bytes long (but uses a full block): .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/ondisk/mmp.rst index b7d7a3137f80..25660981d93c 100644 --- a/Documentation/filesystems/ext4/ondisk/mmp.rst +++ b/Documentation/filesystems/ext4/ondisk/mmp.rst @@ -32,7 +32,7 @@ The checksum is calculated against the FS UUID and the MMP structure. The MMP structure (``struct mmp_struct``) is as follows: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 12 20 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/ondisk/special_inodes.rst index a82f70c9baeb..9061aabba827 100644 --- a/Documentation/filesystems/ext4/ondisk/special_inodes.rst +++ b/Documentation/filesystems/ext4/ondisk/special_inodes.rst @@ -6,7 +6,7 @@ Special inodes ext4 reserves some inode for special features, as follows: .. list-table:: - :widths: 1 79 + :widths: 6 70 :header-rows: 1 * - inode Number diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/ondisk/super.rst index 5f81dd87e0b9..04ff079a2acf 100644 --- a/Documentation/filesystems/ext4/ondisk/super.rst +++ b/Documentation/filesystems/ext4/ondisk/super.rst @@ -19,7 +19,7 @@ The ext4 superblock is laid out as follows in ``struct ext4_super_block``: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -483,7 +483,7 @@ The ext4 superblock is laid out as follows in The superblock state is some combination of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -500,7 +500,7 @@ The superblock state is some combination of the following: The superblock error policy is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -517,7 +517,7 @@ The superblock error policy is one of the following: The filesystem creator is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -538,7 +538,7 @@ The filesystem creator is one of the following: The superblock revision is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -556,7 +556,7 @@ The superblock compatible features field is a combination of any of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -595,7 +595,7 @@ The superblock incompatible features field is a combination of any of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -647,7 +647,7 @@ The superblock read-only compatible features field is a combination of any of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -702,7 +702,7 @@ the following: The ``s_def_hash_version`` field is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -725,7 +725,7 @@ The ``s_def_hash_version`` field is one of the following: The ``s_default_mount_opts`` field is any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -767,7 +767,7 @@ The ``s_default_mount_opts`` field is any combination of the following: The ``s_flags`` field is any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -784,7 +784,7 @@ The ``s_flags`` field is any combination of the following: The ``s_encrypt_algos`` list can contain any of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value From c0e3e0406a0c39044c7dc25f3386694542d50fcc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Oct 2018 22:45:25 -0400 Subject: [PATCH 14/21] docs: make ext4 readme tables readable The tables in the ext4 readme are not particularly space efficient in the text or html outputs, and they're totally broken in the pdf output. Convert them into titled paragraphs so that they render more nicely. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/ext4.rst | 699 +++++++++++------------- 1 file changed, 330 insertions(+), 369 deletions(-) diff --git a/Documentation/filesystems/ext4/ext4.rst b/Documentation/filesystems/ext4/ext4.rst index 9d4368d591fa..e2b6bb7c2730 100644 --- a/Documentation/filesystems/ext4/ext4.rst +++ b/Documentation/filesystems/ext4/ext4.rst @@ -101,269 +101,256 @@ Options When mounting an ext4 filesystem, the following option are accepted: (*) == default -======================= ======================================================= -Mount Option Description -======================= ======================================================= -ro Mount filesystem read only. Note that ext4 will - replay the journal (and thus write to the - partition) even when mounted "read only". The - mount options "ro,noload" can be used to prevent - writes to the filesystem. + ro + Mount filesystem read only. Note that ext4 will replay the journal (and + thus write to the partition) even when mounted "read only". The mount + options "ro,noload" can be used to prevent writes to the filesystem. -journal_checksum Enable checksumming of the journal transactions. - This will allow the recovery code in e2fsck and the - kernel to detect corruption in the kernel. It is a - compatible change and will be ignored by older kernels. + journal_checksum + Enable checksumming of the journal transactions. This will allow the + recovery code in e2fsck and the kernel to detect corruption in the + kernel. It is a compatible change and will be ignored by older + kernels. -journal_async_commit Commit block can be written to disk without waiting - for descriptor blocks. If enabled older kernels cannot - mount the device. This will enable 'journal_checksum' - internally. + journal_async_commit + Commit block can be written to disk without waiting for descriptor + blocks. If enabled older kernels cannot mount the device. This will + enable 'journal_checksum' internally. -journal_path=path -journal_dev=devnum When the external journal device's major/minor numbers - have changed, these options allow the user to specify - the new journal location. The journal device is - identified through either its new major/minor numbers - encoded in devnum, or via a path to the device. + journal_path=path, journal_dev=devnum + When the external journal device's major/minor numbers have changed, + these options allow the user to specify the new journal location. The + journal device is identified through either its new major/minor numbers + encoded in devnum, or via a path to the device. -norecovery Don't load the journal on mounting. Note that -noload if the filesystem was not unmounted cleanly, - skipping the journal replay will lead to the - filesystem containing inconsistencies that can - lead to any number of problems. + norecovery, noload + Don't load the journal on mounting. Note that if the filesystem was + not unmounted cleanly, skipping the journal replay will lead to the + filesystem containing inconsistencies that can lead to any number of + problems. -data=journal All data are committed into the journal prior to being - written into the main file system. Enabling - this mode will disable delayed allocation and - O_DIRECT support. + data=journal + All data are committed into the journal prior to being written into the + main file system. Enabling this mode will disable delayed allocation + and O_DIRECT support. -data=ordered (*) All data are forced directly out to the main file - system prior to its metadata being committed to the - journal. + data=ordered (*) + All data are forced directly out to the main file system prior to its + metadata being committed to the journal. -data=writeback Data ordering is not preserved, data may be written - into the main file system after its metadata has been - committed to the journal. + data=writeback + Data ordering is not preserved, data may be written into the main file + system after its metadata has been committed to the journal. -commit=nrsec (*) Ext4 can be told to sync all its data and metadata - every 'nrsec' seconds. The default value is 5 seconds. - This means that if you lose your power, you will lose - as much as the latest 5 seconds of work (your - filesystem will not be damaged though, thanks to the - journaling). This default value (or any low value) - will hurt performance, but it's good for data-safety. - Setting it to 0 will have the same effect as leaving - it at the default (5 seconds). - Setting it to very large values will improve - performance. + commit=nrsec (*) + Ext4 can be told to sync all its data and metadata every 'nrsec' + seconds. The default value is 5 seconds. This means that if you lose + your power, you will lose as much as the latest 5 seconds of work (your + filesystem will not be damaged though, thanks to the journaling). This + default value (or any low value) will hurt performance, but it's good + for data-safety. Setting it to 0 will have the same effect as leaving + it at the default (5 seconds). Setting it to very large values will + improve performance. -barrier=<0|1(*)> This enables/disables the use of write barriers in -barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. -nobarrier This also requires an IO stack which can support - barriers, and if jbd gets an error on a barrier - write, it will disable again with a warning. - Write barriers enforce proper on-disk ordering - of journal commits, making volatile disk write caches - safe to use, at some performance penalty. If - your disks are battery-backed in one way or another, - disabling barriers may safely improve performance. - The mount options "barrier" and "nobarrier" can - also be used to enable or disable barriers, for - consistency with other ext4 mount options. + barrier=<0|1(*)>, barrier(*), nobarrier + This enables/disables the use of write barriers in the jbd code. + barrier=0 disables, barrier=1 enables. This also requires an IO stack + which can support barriers, and if jbd gets an error on a barrier + write, it will disable again with a warning. Write barriers enforce + proper on-disk ordering of journal commits, making volatile disk write + caches safe to use, at some performance penalty. If your disks are + battery-backed in one way or another, disabling barriers may safely + improve performance. The mount options "barrier" and "nobarrier" can + also be used to enable or disable barriers, for consistency with other + ext4 mount options. -inode_readahead_blks=n This tuning parameter controls the maximum - number of inode table blocks that ext4's inode - table readahead algorithm will pre-read into - the buffer cache. The default value is 32 blocks. + inode_readahead_blks=n + This tuning parameter controls the maximum number of inode table blocks + that ext4's inode table readahead algorithm will pre-read into the + buffer cache. The default value is 32 blocks. -nouser_xattr Disables Extended User Attributes. See the - attr(5) manual page for more information about - extended attributes. + nouser_xattr + Disables Extended User Attributes. See the attr(5) manual page for + more information about extended attributes. -noacl This option disables POSIX Access Control List - support. If ACL support is enabled in the kernel - configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is - enabled by default on mount. See the acl(5) manual - page for more information about acl. + noacl + This option disables POSIX Access Control List support. If ACL support + is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL + is enabled by default on mount. See the acl(5) manual page for more + information about acl. -bsddf (*) Make 'df' act like BSD. -minixdf Make 'df' act like Minix. + bsddf (*) + Make 'df' act like BSD. -debug Extra debugging information is sent to syslog. + minixdf + Make 'df' act like Minix. -abort Simulate the effects of calling ext4_abort() for - debugging purposes. This is normally used while - remounting a filesystem which is already mounted. + debug + Extra debugging information is sent to syslog. -errors=remount-ro Remount the filesystem read-only on an error. -errors=continue Keep going on a filesystem error. -errors=panic Panic and halt the machine if an error occurs. - (These mount options override the errors behavior - specified in the superblock, which can be configured - using tune2fs) + abort + Simulate the effects of calling ext4_abort() for debugging purposes. + This is normally used while remounting a filesystem which is already + mounted. -data_err=ignore(*) Just print an error message if an error occurs - in a file data buffer in ordered mode. -data_err=abort Abort the journal if an error occurs in a file - data buffer in ordered mode. + errors=remount-ro + Remount the filesystem read-only on an error. -grpid New objects have the group ID of their parent. -bsdgroups + errors=continue + Keep going on a filesystem error. -nogrpid (*) New objects have the group ID of their creator. -sysvgroups + errors=panic + Panic and halt the machine if an error occurs. (These mount options + override the errors behavior specified in the superblock, which can be + configured using tune2fs) -resgid=n The group ID which may use the reserved blocks. + data_err=ignore(*) + Just print an error message if an error occurs in a file data buffer in + ordered mode. + data_err=abort + Abort the journal if an error occurs in a file data buffer in ordered + mode. -resuid=n The user ID which may use the reserved blocks. + grpid | bsdgroups + New objects have the group ID of their parent. -sb=n Use alternate superblock at this location. + nogrpid (*) | sysvgroups + New objects have the group ID of their creator. -quota These options are ignored by the filesystem. They -noquota are used only by quota tools to recognize volumes -grpquota where quota should be turned on. See documentation -usrquota in the quota-tools package for more details - (http://sourceforge.net/projects/linuxquota). + resgid=n + The group ID which may use the reserved blocks. -jqfmt= These options tell filesystem details about quota -usrjquota= so that quota information can be properly updated -grpjquota= during journal replay. They replace the above - quota options. See documentation in the quota-tools - package for more details - (http://sourceforge.net/projects/linuxquota). + resuid=n + The user ID which may use the reserved blocks. -stripe=n Number of filesystem blocks that mballoc will try - to use for allocation size and alignment. For RAID5/6 - systems this should be the number of data - disks * RAID chunk size in file system blocks. + sb= + Use alternate superblock at this location. -delalloc (*) Defer block allocation until just before ext4 - writes out the block(s) in question. This - allows ext4 to better allocation decisions - more efficiently. -nodelalloc Disable delayed allocation. Blocks are allocated - when the data is copied from userspace to the - page cache, either via the write(2) system call - or when an mmap'ed page which was previously - unallocated is written for the first time. + quota, noquota, grpquota, usrquota + These options are ignored by the filesystem. They are used only by + quota tools to recognize volumes where quota should be turned on. See + documentation in the quota-tools package for more details + (http://sourceforge.net/projects/linuxquota). -max_batch_time=usec Maximum amount of time ext4 should wait for - additional filesystem operations to be batch - together with a synchronous write operation. - Since a synchronous write operation is going to - force a commit and then a wait for the I/O - complete, it doesn't cost much, and can be a - huge throughput win, we wait for a small amount - of time to see if any other transactions can - piggyback on the synchronous write. The - algorithm used is designed to automatically tune - for the speed of the disk, by measuring the - amount of time (on average) that it takes to - finish committing a transaction. Call this time - the "commit time". If the time that the - transaction has been running is less than the - commit time, ext4 will try sleeping for the - commit time to see if other operations will join - the transaction. The commit time is capped by - the max_batch_time, which defaults to 15000us - (15ms). This optimization can be turned off - entirely by setting max_batch_time to 0. + jqfmt=, usrjquota=, grpjquota= + These options tell filesystem details about quota so that quota + information can be properly updated during journal replay. They replace + the above quota options. See documentation in the quota-tools package + for more details (http://sourceforge.net/projects/linuxquota). -min_batch_time=usec This parameter sets the commit time (as - described above) to be at least min_batch_time. - It defaults to zero microseconds. Increasing - this parameter may improve the throughput of - multi-threaded, synchronous workloads on very - fast disks, at the cost of increasing latency. + stripe=n + Number of filesystem blocks that mballoc will try to use for allocation + size and alignment. For RAID5/6 systems this should be the number of + data disks * RAID chunk size in file system blocks. -journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the - highest priority) which should be used for I/O - operations submitted by kjournald2 during a - commit operation. This defaults to 3, which is - a slightly higher priority than the default I/O - priority. + delalloc (*) + Defer block allocation until just before ext4 writes out the block(s) + in question. This allows ext4 to better allocation decisions more + efficiently. -auto_da_alloc(*) Many broken applications don't use fsync() when -noauto_da_alloc replacing existing files via patterns such as - fd = open("foo.new")/write(fd,..)/close(fd)/ - rename("foo.new", "foo"), or worse yet, - fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). - If auto_da_alloc is enabled, ext4 will detect - the replace-via-rename and replace-via-truncate - patterns and force that any delayed allocation - blocks are allocated such that at the next - journal commit, in the default data=ordered - mode, the data blocks of the new file are forced - to disk before the rename() operation is - committed. This provides roughly the same level - of guarantees as ext3, and avoids the - "zero-length" problem that can happen when a - system crashes before the delayed allocation - blocks are forced to disk. + nodelalloc + Disable delayed allocation. Blocks are allocated when the data is + copied from userspace to the page cache, either via the write(2) system + call or when an mmap'ed page which was previously unallocated is + written for the first time. -noinit_itable Do not initialize any uninitialized inode table - blocks in the background. This feature may be - used by installation CD's so that the install - process can complete as quickly as possible; the - inode table initialization process would then be - deferred until the next time the file system - is unmounted. + max_batch_time=usec + Maximum amount of time ext4 should wait for additional filesystem + operations to be batch together with a synchronous write operation. + Since a synchronous write operation is going to force a commit and then + a wait for the I/O complete, it doesn't cost much, and can be a huge + throughput win, we wait for a small amount of time to see if any other + transactions can piggyback on the synchronous write. The algorithm + used is designed to automatically tune for the speed of the disk, by + measuring the amount of time (on average) that it takes to finish + committing a transaction. Call this time the "commit time". If the + time that the transaction has been running is less than the commit + time, ext4 will try sleeping for the commit time to see if other + operations will join the transaction. The commit time is capped by + the max_batch_time, which defaults to 15000us (15ms). This + optimization can be turned off entirely by setting max_batch_time to 0. -init_itable=n The lazy itable init code will wait n times the - number of milliseconds it took to zero out the - previous block group's inode table. This - minimizes the impact on the system performance - while file system's inode table is being initialized. + min_batch_time=usec + This parameter sets the commit time (as described above) to be at least + min_batch_time. It defaults to zero microseconds. Increasing this + parameter may improve the throughput of multi-threaded, synchronous + workloads on very fast disks, at the cost of increasing latency. -discard Controls whether ext4 should issue discard/TRIM -nodiscard(*) commands to the underlying block device when - blocks are freed. This is useful for SSD devices - and sparse/thinly-provisioned LUNs, but it is off - by default until sufficient testing has been done. + journal_ioprio=prio + The I/O priority (from 0 to 7, where 0 is the highest priority) which + should be used for I/O operations submitted by kjournald2 during a + commit operation. This defaults to 3, which is a slightly higher + priority than the default I/O priority. -nouid32 Disables 32-bit UIDs and GIDs. This is for - interoperability with older kernels which only - store and expect 16-bit values. + auto_da_alloc(*), noauto_da_alloc + Many broken applications don't use fsync() when replacing existing + files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/ + rename("foo.new", "foo"), or worse yet, fd = open("foo", + O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4 + will detect the replace-via-rename and replace-via-truncate patterns + and force that any delayed allocation blocks are allocated such that at + the next journal commit, in the default data=ordered mode, the data + blocks of the new file are forced to disk before the rename() operation + is committed. This provides roughly the same level of guarantees as + ext3, and avoids the "zero-length" problem that can happen when a + system crashes before the delayed allocation blocks are forced to disk. -block_validity(*) These options enable or disable the in-kernel -noblock_validity facility for tracking filesystem metadata blocks - within internal data structures. This allows multi- - block allocator and other routines to notice - bugs or corrupted allocation bitmaps which cause - blocks to be allocated which overlap with - filesystem metadata blocks. + noinit_itable + Do not initialize any uninitialized inode table blocks in the + background. This feature may be used by installation CD's so that the + install process can complete as quickly as possible; the inode table + initialization process would then be deferred until the next time the + file system is unmounted. -dioread_lock Controls whether or not ext4 should use the DIO read -dioread_nolock locking. If the dioread_nolock option is specified - ext4 will allocate uninitialized extent before buffer - write and convert the extent to initialized after IO - completes. This approach allows ext4 code to avoid - using inode mutex, which improves scalability on high - speed storages. However this does not work with - data journaling and dioread_nolock option will be - ignored with kernel warning. Note that dioread_nolock - code path is only used for extent-based files. - Because of the restrictions this options comprises - it is off by default (e.g. dioread_lock). + init_itable=n + The lazy itable init code will wait n times the number of milliseconds + it took to zero out the previous block group's inode table. This + minimizes the impact on the system performance while file system's + inode table is being initialized. -max_dir_size_kb=n This limits the size of directories so that any - attempt to expand them beyond the specified - limit in kilobytes will cause an ENOSPC error. - This is useful in memory constrained - environments, where a very large directory can - cause severe performance problems or even - provoke the Out Of Memory killer. (For example, - if there is only 512mb memory available, a 176mb - directory may seriously cramp the system's style.) + discard, nodiscard(*) + Controls whether ext4 should issue discard/TRIM commands to the + underlying block device when blocks are freed. This is useful for SSD + devices and sparse/thinly-provisioned LUNs, but it is off by default + until sufficient testing has been done. -i_version Enable 64-bit inode version support. This option is - off by default. + nouid32 + Disables 32-bit UIDs and GIDs. This is for interoperability with + older kernels which only store and expect 16-bit values. -dax Use direct access (no page cache). See - Documentation/filesystems/dax.txt. Note that - this option is incompatible with data=journal. -======================= ======================================================= + block_validity(*), noblock_validity + These options enable or disable the in-kernel facility for tracking + filesystem metadata blocks within internal data structures. This + allows multi- block allocator and other routines to notice bugs or + corrupted allocation bitmaps which cause blocks to be allocated which + overlap with filesystem metadata blocks. + + dioread_lock, dioread_nolock + Controls whether or not ext4 should use the DIO read locking. If the + dioread_nolock option is specified ext4 will allocate uninitialized + extent before buffer write and convert the extent to initialized after + IO completes. This approach allows ext4 code to avoid using inode + mutex, which improves scalability on high speed storages. However this + does not work with data journaling and dioread_nolock option will be + ignored with kernel warning. Note that dioread_nolock code path is only + used for extent-based files. Because of the restrictions this options + comprises it is off by default (e.g. dioread_lock). + + max_dir_size_kb=n + This limits the size of directories so that any attempt to expand them + beyond the specified limit in kilobytes will cause an ENOSPC error. + This is useful in memory constrained environments, where a very large + directory can cause severe performance problems or even provoke the Out + Of Memory killer. (For example, if there is only 512mb memory + available, a 176mb directory may seriously cramp the system's style.) + + i_version + Enable 64-bit inode version support. This option is off by default. + + dax + Use direct access (no page cache). See + Documentation/filesystems/dax.txt. Note that this option is + incompatible with data=journal. Data Mode ========= @@ -407,11 +394,8 @@ in table below. Files in /proc/fs/ext4/ -================ ======= - File Content -================ ======= - mb_groups details of multiblock allocator buddy cache of free blocks -================ ======= + mb_groups + details of multiblock allocator buddy cache of free blocks /sys entries ============ @@ -426,74 +410,71 @@ Files in /sys/fs/ext4/: (see also Documentation/ABI/testing/sysfs-fs-ext4) -============================= ================================================= -File Content -============================= ================================================= - delayed_allocation_blocks This file is read-only and shows the number of - blocks that are dirty in the page cache, but - which do not have their location in the - filesystem allocated yet. + delayed_allocation_blocks + This file is read-only and shows the number of blocks that are dirty in + the page cache, but which do not have their location in the filesystem + allocated yet. -inode_goal Tuning parameter which (if non-zero) controls - the goal inode used by the inode allocator in - preference to all other allocation heuristics. - This is intended for debugging use only, and - should be 0 on production systems. + inode_goal + Tuning parameter which (if non-zero) controls the goal inode used by + the inode allocator in preference to all other allocation heuristics. + This is intended for debugging use only, and should be 0 on production + systems. -inode_readahead_blks Tuning parameter which controls the maximum - number of inode table blocks that ext4's inode - table readahead algorithm will pre-read into - the buffer cache + inode_readahead_blks + Tuning parameter which controls the maximum number of inode table + blocks that ext4's inode table readahead algorithm will pre-read into + the buffer cache. -lifetime_write_kbytes This file is read-only and shows the number of - kilobytes of data that have been written to this - filesystem since it was created. + lifetime_write_kbytes + This file is read-only and shows the number of kilobytes of data that + have been written to this filesystem since it was created. - max_writeback_mb_bump The maximum number of megabytes the writeback - code will try to write out before move on to - another inode. + max_writeback_mb_bump + The maximum number of megabytes the writeback code will try to write + out before move on to another inode. - mb_group_prealloc The multiblock allocator will round up allocation - requests to a multiple of this tuning parameter if - the stripe size is not set in the ext4 superblock + mb_group_prealloc + The multiblock allocator will round up allocation requests to a + multiple of this tuning parameter if the stripe size is not set in the + ext4 superblock - mb_max_to_scan The maximum number of extents the multiblock - allocator will search to find the best extent + mb_max_to_scan + The maximum number of extents the multiblock allocator will search to + find the best extent. - mb_min_to_scan The minimum number of extents the multiblock - allocator will search to find the best extent + mb_min_to_scan + The minimum number of extents the multiblock allocator will search to + find the best extent. - mb_order2_req Tuning parameter which controls the minimum size - for requests (as a power of 2) where the buddy - cache is used + mb_order2_req + Tuning parameter which controls the minimum size for requests (as a + power of 2) where the buddy cache is used. - mb_stats Controls whether the multiblock allocator should - collect statistics, which are shown during the - unmount. 1 means to collect statistics, 0 means - not to collect statistics + mb_stats + Controls whether the multiblock allocator should collect statistics, + which are shown during the unmount. 1 means to collect statistics, 0 + means not to collect statistics. - mb_stream_req Files which have fewer blocks than this tunable - parameter will have their blocks allocated out - of a block group specific preallocation pool, so - that small files are packed closely together. - Each large file will have its blocks allocated - out of its own unique preallocation pool. + mb_stream_req + Files which have fewer blocks than this tunable parameter will have + their blocks allocated out of a block group specific preallocation + pool, so that small files are packed closely together. Each large file + will have its blocks allocated out of its own unique preallocation + pool. - session_write_kbytes This file is read-only and shows the number of - kilobytes of data that have been written to this - filesystem since it was mounted. + session_write_kbytes + This file is read-only and shows the number of kilobytes of data that + have been written to this filesystem since it was mounted. - reserved_clusters This is RW file and contains number of reserved - clusters in the file system which will be used - in the specific situations to avoid costly - zeroout, unexpected ENOSPC, or possible data - loss. The default is 2% or 4096 clusters, - whichever is smaller and this can be changed - however it can never exceed number of clusters - in the file system. If there is not enough space - for the reserved space when mounting the file - mount will _not_ fail. -============================= ================================================= + reserved_clusters + This is RW file and contains number of reserved clusters in the file + system which will be used in the specific situations to avoid costly + zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or + 4096 clusters, whichever is smaller and this can be changed however it + can never exceed number of clusters in the file system. If there is not + enough space for the reserved space when mounting the file mount will + _not_ fail. Ioctls ====== @@ -504,100 +485,80 @@ shown in the table below. Table of Ext4 specific ioctls -============================= ================================================= -Ioctl Description -============================= ================================================= - EXT4_IOC_GETFLAGS Get additional attributes associated with inode. - The ioctl argument is an integer bitfield, with - bit values described in ext4.h. This ioctl is an - alias for FS_IOC_GETFLAGS. + EXT4_IOC_GETFLAGS + Get additional attributes associated with inode. The ioctl argument is + an integer bitfield, with bit values described in ext4.h. This ioctl is + an alias for FS_IOC_GETFLAGS. - EXT4_IOC_SETFLAGS Set additional attributes associated with inode. - The ioctl argument is an integer bitfield, with - bit values described in ext4.h. This ioctl is an - alias for FS_IOC_SETFLAGS. + EXT4_IOC_SETFLAGS + Set additional attributes associated with inode. The ioctl argument is + an integer bitfield, with bit values described in ext4.h. This ioctl is + an alias for FS_IOC_SETFLAGS. - EXT4_IOC_GETVERSION - EXT4_IOC_GETVERSION_OLD - Get the inode i_generation number stored for - each inode. The i_generation number is normally - changed only when new inode is created and it is - particularly useful for network filesystems. The - '_OLD' version of this ioctl is an alias for - FS_IOC_GETVERSION. + EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD + Get the inode i_generation number stored for each inode. The + i_generation number is normally changed only when new inode is created + and it is particularly useful for network filesystems. The '_OLD' + version of this ioctl is an alias for FS_IOC_GETVERSION. - EXT4_IOC_SETVERSION - EXT4_IOC_SETVERSION_OLD - Set the inode i_generation number stored for - each inode. The '_OLD' version of this ioctl - is an alias for FS_IOC_SETVERSION. + EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD + Set the inode i_generation number stored for each inode. The '_OLD' + version of this ioctl is an alias for FS_IOC_SETVERSION. - EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize - mount option. It allows to resize filesystem - to the end of the last existing block group, - further resize has to be done with resize2fs, - either online, or offline. The argument points - to the unsigned logn number representing the - filesystem new block count. + EXT4_IOC_GROUP_EXTEND + This ioctl has the same purpose as the resize mount option. It allows + to resize filesystem to the end of the last existing block group, + further resize has to be done with resize2fs, either online, or + offline. The argument points to the unsigned logn number representing + the filesystem new block count. - EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one - this ioctl is pointing to) to the donor_fd (the - one specified in move_extent structure passed - as an argument to this ioctl). Then, exchange - inode metadata between orig_fd and donor_fd. - This is especially useful for online - defragmentation, because the allocator has the - opportunity to allocate moved blocks better, - ideally into one contiguous extent. + EXT4_IOC_MOVE_EXT + Move the block extents from orig_fd (the one this ioctl is pointing to) + to the donor_fd (the one specified in move_extent structure passed as + an argument to this ioctl). Then, exchange inode metadata between + orig_fd and donor_fd. This is especially useful for online + defragmentation, because the allocator has the opportunity to allocate + moved blocks better, ideally into one contiguous extent. - EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or - new group descriptor block. The new group - descriptor is described by ext4_new_group_input - structure, which is passed as an argument to - this ioctl. This is especially useful in - conjunction with EXT4_IOC_GROUP_EXTEND, - which allows online resize of the filesystem - to the end of the last existing block group. - Those two ioctls combined is used in userspace - online resize tool (e.g. resize2fs). + EXT4_IOC_GROUP_ADD + Add a new group descriptor to an existing or new group descriptor + block. The new group descriptor is described by ext4_new_group_input + structure, which is passed as an argument to this ioctl. This is + especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which + allows online resize of the filesystem to the end of the last existing + block group. Those two ioctls combined is used in userspace online + resize tool (e.g. resize2fs). - EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself. - It converts (migrates) ext3 indirect block mapped - inode to ext4 extent mapped inode by walking - through indirect block mapping of the original - inode and converting contiguous block ranges - into ext4 extents of the temporary inode. Then, - inodes are swapped. This ioctl might help, when - migrating from ext3 to ext4 filesystem, however - suggestion is to create fresh ext4 filesystem - and copy data from the backup. Note, that - filesystem has to support extents for this ioctl - to work. + EXT4_IOC_MIGRATE + This ioctl operates on the filesystem itself. It converts (migrates) + ext3 indirect block mapped inode to ext4 extent mapped inode by walking + through indirect block mapping of the original inode and converting + contiguous block ranges into ext4 extents of the temporary inode. Then, + inodes are swapped. This ioctl might help, when migrating from ext3 to + ext4 filesystem, however suggestion is to create fresh ext4 filesystem + and copy data from the backup. Note, that filesystem has to support + extents for this ioctl to work. - EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be - allocated to preserve application-expected ext3 - behaviour. Note that this will also start - triggering a write of the data blocks, but this - behaviour may change in the future as it is - not necessary and has been done this way only - for sake of simplicity. + EXT4_IOC_ALLOC_DA_BLKS + Force all of the delay allocated blocks to be allocated to preserve + application-expected ext3 behaviour. Note that this will also start + triggering a write of the data blocks, but this behaviour may change in + the future as it is not necessary and has been done this way only for + sake of simplicity. - EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number - of blocks of resized filesystem is passed in via - 64 bit integer argument. The kernel allocates - bitmaps and inode table, the userspace tool thus - just passes the new number of blocks. + EXT4_IOC_RESIZE_FS + Resize the filesystem to a new size. The number of blocks of resized + filesystem is passed in via 64 bit integer argument. The kernel + allocates bitmaps and inode table, the userspace tool thus just passes + the new number of blocks. - EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes - (like i_blocks, i_size, i_flags, ...) from - the specified inode with inode - EXT4_BOOT_LOADER_INO (#5). This is typically - used to store a boot loader in a secure part of - the filesystem, where it can't be changed by a - normal user by accident. - The data blocks of the previous boot loader - will be associated with the given inode. -============================= ================================================= + EXT4_IOC_SWAP_BOOT + Swap i_blocks and associated attributes (like i_blocks, i_size, + i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO + (#5). This is typically used to store a boot loader in a secure part of + the filesystem, where it can't be changed by a normal user by accident. + The data blocks of the previous boot loader will be associated with the + given inode. References ========== From dc7ac6c4cae3b58724c2f1e21a7c05ce19ecd5a8 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 3 Oct 2018 10:33:32 -0400 Subject: [PATCH 15/21] ext4: fix setattr project check in fssetxattr ioctl Currently, project quota could be changed by fssetxattr ioctl, and existed permission check inode_owner_or_capable() is obviously not enough, just think that common users could change project id of file, that could make users to break project quota easily. This patch try to follow same regular of xfs project quota: "Project Quota ID state is only allowed to change from within the init namespace. Enforce that restriction only if we are trying to change the quota ID state. Everything else is allowed in user namespaces." Besides that, check and set project id'state should be an atomic operation, protect whole operation with inode lock, ext4_ioctl_setproject() is only used for ioctl EXT4_IOC_FSSETXATTR, we have held mnt_want_write_file() before ext4_ioctl_setflags(), and ext4_ioctl_setproject() is called after ext4_ioctl_setflags(), we could share codes, so remove it inside ext4_ioctl_setproject(). Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@kernel.org --- fs/ext4/ioctl.c | 60 ++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d7ed7487e630..0b3e2486f988 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -360,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) return 0; - err = mnt_want_write_file(filp); - if (err) - return err; - err = -EPERM; - inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) - goto out_unlock; + return err; err = ext4_get_inode_loc(inode, &iloc); if (err) - goto out_unlock; + return err; raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { @@ -380,7 +375,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) EXT4_SB(sb)->s_want_extra_isize, &iloc); if (err) - goto out_unlock; + return err; } else { brelse(iloc.bh); } @@ -390,10 +385,8 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto out_unlock; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) @@ -421,9 +414,6 @@ out_dirty: err = rc; out_stop: ext4_journal_stop(handle); -out_unlock: - inode_unlock(inode); - mnt_drop_write_file(filp); return err; } #else @@ -647,6 +637,30 @@ group_add_out: return err; } +static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid) + return -EINVAL; + + if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1046,19 +1060,19 @@ resizefs_out: return err; inode_lock(inode); + err = ext4_ioctl_check_project(inode, &fa); + if (err) + goto out; flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); err = ext4_ioctl_setflags(inode, flags); + if (err) + goto out; + err = ext4_ioctl_setproject(filp, fa.fsx_projid); +out: inode_unlock(inode); mnt_drop_write_file(filp); - if (err) - return err; - - err = ext4_ioctl_setproject(filp, fa.fsx_projid); - if (err) - return err; - - return 0; + return err; } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); From 182a79e0c17147d2c2d3990a9a7b6b58a1561c7a Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 3 Oct 2018 12:19:21 -0400 Subject: [PATCH 16/21] ext4: propagate error from dquot_initialize() in EXT4_IOC_FSSETXATTR We return most failure of dquota_initialize() except inode evict, this could make a bit sense, for example we allow file removal even quota files are broken? But it dosen't make sense to allow setting project if quota files etc are broken. Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ioctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0b3e2486f988..0edee31913d1 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -380,7 +380,9 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) brelse(iloc.bh); } - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) + return err; handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + From ccd3c4373eacb044eb3832966299d13d2631f66f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Oct 2018 18:44:40 -0400 Subject: [PATCH 17/21] jbd2: fix use after free in jbd2_log_do_checkpoint() The code cleaning transaction's lists of checkpoint buffers has a bug where it increases bh refcount only after releasing journal->j_list_lock. Thus the following race is possible: CPU0 CPU1 jbd2_log_do_checkpoint() jbd2_journal_try_to_free_buffers() __journal_try_to_free_buffer(bh) ... while (transaction->t_checkpoint_io_list) ... if (buffer_locked(bh)) { <-- IO completes now, buffer gets unlocked --> spin_unlock(&journal->j_list_lock); spin_lock(&journal->j_list_lock); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); try_to_free_buffers(page); get_bh(bh) <-- accesses freed bh Fix the problem by grabbing bh reference before unlocking journal->j_list_lock. Fixes: dc6e8d669cf5 ("jbd2: don't call get_bh() before calling __jbd2_journal_remove_checkpoint()") Fixes: be1158cc615f ("jbd2: fold __process_buffer() into jbd2_log_do_checkpoint()") Reported-by: syzbot+7f4a27091759e2fe7453@syzkaller.appspotmail.com CC: stable@vger.kernel.org Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c125d662777c..26f8d7e46462 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -251,8 +251,8 @@ restart: bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -333,8 +333,8 @@ restart2: jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); From d3091215921bd4b8fdf3129bf8f733b8ca48dc80 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 5 Oct 2018 19:11:59 -0400 Subject: [PATCH 18/21] docs: move ext4 administrative docs to admin-guide/ Move the ext4 mount option and other administrative stuff to the Linux administrator's guide. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- Documentation/{filesystems/ext4 => admin-guide}/ext4.rst | 2 +- Documentation/admin-guide/index.rst | 1 + Documentation/conf.py | 2 ++ Documentation/filesystems/ext4/index.rst | 1 - 4 files changed, 4 insertions(+), 2 deletions(-) rename Documentation/{filesystems/ext4 => admin-guide}/ext4.rst (99%) diff --git a/Documentation/filesystems/ext4/ext4.rst b/Documentation/admin-guide/ext4.rst similarity index 99% rename from Documentation/filesystems/ext4/ext4.rst rename to Documentation/admin-guide/ext4.rst index e2b6bb7c2730..e506d3dae510 100644 --- a/Documentation/filesystems/ext4/ext4.rst +++ b/Documentation/admin-guide/ext4.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 ======================== -General Information +ext4 General Information ======================== Ext4 is an advanced level of the ext3 filesystem which incorporates diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 0873685bab0f..965745d5fb9a 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -71,6 +71,7 @@ configure specific aspects of kernel behavior to your liking. java ras bcache + ext4 pm/index thunderbolt LSM/index diff --git a/Documentation/conf.py b/Documentation/conf.py index 05dad6bda787..4d32c01e1e16 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -383,6 +383,8 @@ latex_documents = [ 'The kernel development community', 'manual'), ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', 'The kernel development community', 'manual'), + ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide', + 'ext4 Community', 'manual'), ('filesystems/ext4/index', 'ext4.tex', 'ext4 Filesystem', 'ext4 Filesystem Developers', 'manual'), ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst index 71121605558c..427bc115012e 100644 --- a/Documentation/filesystems/ext4/index.rst +++ b/Documentation/filesystems/ext4/index.rst @@ -13,5 +13,4 @@ the ext4 community. :maxdepth: 5 :numbered: - ext4 ondisk/index From 8a98ec7c7b3901330a036af0f62f523c31d763da Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 5 Oct 2018 19:20:08 -0400 Subject: [PATCH 19/21] docs: promote the ext4 data structures book to top level Move the ext4 data structures book to Documentation/filesystems/ext4/ since the administrative information moved elsewhere. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- Documentation/conf.py | 4 ++-- .../filesystems/ext4/{ondisk => }/about.rst | 0 .../ext4/{ondisk => }/allocators.rst | 0 .../ext4/{ondisk => }/attributes.rst | 0 .../filesystems/ext4/{ondisk => }/bigalloc.rst | 0 .../filesystems/ext4/{ondisk => }/bitmaps.rst | 0 .../ext4/{ondisk => }/blockgroup.rst | 0 .../filesystems/ext4/{ondisk => }/blockmap.rst | 0 .../filesystems/ext4/{ondisk => }/blocks.rst | 0 .../ext4/{ondisk => }/checksums.rst | 0 .../ext4/{ondisk => }/directory.rst | 0 .../filesystems/ext4/{ondisk => }/dynamic.rst | 0 .../filesystems/ext4/{ondisk => }/eainode.rst | 0 .../filesystems/ext4/{ondisk => }/globals.rst | 0 .../ext4/{ondisk => }/group_descr.rst | 0 .../filesystems/ext4/{ondisk => }/ifork.rst | 0 Documentation/filesystems/ext4/index.rst | 18 ++++++++---------- .../ext4/{ondisk => }/inlinedata.rst | 0 .../filesystems/ext4/{ondisk => }/inodes.rst | 0 .../filesystems/ext4/{ondisk => }/journal.rst | 0 .../filesystems/ext4/{ondisk => }/mmp.rst | 0 .../filesystems/ext4/ondisk/index.rst | 9 --------- .../filesystems/ext4/{ondisk => }/overview.rst | 0 .../ext4/{ondisk => }/special_inodes.rst | 0 .../filesystems/ext4/{ondisk => }/super.rst | 0 25 files changed, 10 insertions(+), 21 deletions(-) rename Documentation/filesystems/ext4/{ondisk => }/about.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/allocators.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/attributes.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/bigalloc.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/bitmaps.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/blockgroup.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/blockmap.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/blocks.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/checksums.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/directory.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/dynamic.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/eainode.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/globals.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/group_descr.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/ifork.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/inlinedata.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/inodes.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/journal.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/mmp.rst (100%) delete mode 100644 Documentation/filesystems/ext4/ondisk/index.rst rename Documentation/filesystems/ext4/{ondisk => }/overview.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/special_inodes.rst (100%) rename Documentation/filesystems/ext4/{ondisk => }/super.rst (100%) diff --git a/Documentation/conf.py b/Documentation/conf.py index 4d32c01e1e16..ede67ccafc29 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -385,8 +385,8 @@ latex_documents = [ 'The kernel development community', 'manual'), ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide', 'ext4 Community', 'manual'), - ('filesystems/ext4/index', 'ext4.tex', 'ext4 Filesystem', - 'ext4 Filesystem Developers', 'manual'), + ('filesystems/ext4/index', 'ext4-data-structures.tex', + 'ext4 Data Structures and Algorithms', 'ext4 Community', 'manual'), ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', 'The kernel development community', 'manual'), ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/about.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/about.rst rename to Documentation/filesystems/ext4/about.rst diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/allocators.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/allocators.rst rename to Documentation/filesystems/ext4/allocators.rst diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/attributes.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/attributes.rst rename to Documentation/filesystems/ext4/attributes.rst diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/bigalloc.rst rename to Documentation/filesystems/ext4/bigalloc.rst diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/bitmaps.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/bitmaps.rst rename to Documentation/filesystems/ext4/bitmaps.rst diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/blockgroup.rst rename to Documentation/filesystems/ext4/blockgroup.rst diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/blockmap.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/blockmap.rst rename to Documentation/filesystems/ext4/blockmap.rst diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/blocks.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/blocks.rst rename to Documentation/filesystems/ext4/blocks.rst diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/checksums.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/checksums.rst rename to Documentation/filesystems/ext4/checksums.rst diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/directory.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/directory.rst rename to Documentation/filesystems/ext4/directory.rst diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/dynamic.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/dynamic.rst rename to Documentation/filesystems/ext4/dynamic.rst diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/eainode.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/eainode.rst rename to Documentation/filesystems/ext4/eainode.rst diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/globals.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/globals.rst rename to Documentation/filesystems/ext4/globals.rst diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/group_descr.rst rename to Documentation/filesystems/ext4/group_descr.rst diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ifork.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/ifork.rst rename to Documentation/filesystems/ext4/ifork.rst diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst index 427bc115012e..3be3e54d480d 100644 --- a/Documentation/filesystems/ext4/index.rst +++ b/Documentation/filesystems/ext4/index.rst @@ -1,16 +1,14 @@ .. SPDX-License-Identifier: GPL-2.0 -=============== -ext4 Filesystem -=============== - -General usage and on-disk artifacts writen by ext4. More documentation may -be ported from the wiki as time permits. This should be considered the -canonical source of information as the details here have been reviewed by -the ext4 community. +=================================== +ext4 Data Structures and Algorithms +=================================== .. toctree:: - :maxdepth: 5 + :maxdepth: 6 :numbered: - ondisk/index + about.rst + overview.rst + globals.rst + dynamic.rst diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/inlinedata.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/inlinedata.rst rename to Documentation/filesystems/ext4/inlinedata.rst diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/inodes.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/inodes.rst rename to Documentation/filesystems/ext4/inodes.rst diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/journal.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/journal.rst rename to Documentation/filesystems/ext4/journal.rst diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/mmp.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/mmp.rst rename to Documentation/filesystems/ext4/mmp.rst diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst deleted file mode 100644 index f7d082c3a435..000000000000 --- a/Documentation/filesystems/ext4/ondisk/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============================== -Data Structures and Algorithms -============================== -.. include:: about.rst -.. include:: overview.rst -.. include:: globals.rst -.. include:: dynamic.rst diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/overview.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/overview.rst rename to Documentation/filesystems/ext4/overview.rst diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/special_inodes.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/special_inodes.rst rename to Documentation/filesystems/ext4/special_inodes.rst diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/super.rst similarity index 100% rename from Documentation/filesystems/ext4/ondisk/super.rst rename to Documentation/filesystems/ext4/super.rst From 6fd941784b8ac3e74313f7112f0586076dc36544 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sat, 6 Oct 2018 22:40:34 -0400 Subject: [PATCH 20/21] ext4: cache NULL when both default_acl and acl are NULL default_acl and acl of newly created inode will be initiated as ACL_NOT_CACHED in vfs function inode_init_always() and later will be updated by calling xxx_init_acl() in specific filesystems. However, when default_acl and acl are NULL then they keep the value of ACL_NOT_CACHED. This patch changes the code to cache NULL for acl / default_acl in this case to save unnecessary ACL lookup attempt. Signed-off-by: Chengguang Xu Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/acl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fb50f9aa6ead..c1d570ee1d9f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, default_acl, XATTR_CREATE); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl, XATTR_CREATE); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return error; } From 33458eaba4dfe778a426df6a19b7aad2ff9f7eec Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 12 Oct 2018 09:28:09 -0400 Subject: [PATCH 21/21] ext4: fix use-after-free race in ext4_remount()'s error path It's possible for ext4_show_quota_options() to try reading s_qf_names[i] while it is being modified by ext4_remount() --- most notably, in ext4_remount's error path when the original values of the quota file name gets restored. Reported-by: syzbot+a2872d6feea6918008a9@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 3.2+ --- fs/ext4/ext4.h | 3 +- fs/ext4/super.c | 73 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 86e1bacac757..12f90d48ba61 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1405,7 +1405,8 @@ struct ext4_sb_info { u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_QUOTA - char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index faf293ed8060..a221f1cdf704 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb) for (type = 0; type < EXT4_MAXQUOTAS; type++) ext4_quota_off(sb, type); } + +/* + * This is a helper function which is used in the mount/remount + * codepaths (which holds s_umount) to fetch the quota file name. + */ +static inline char *get_qf_name(struct super_block *sb, + struct ext4_sb_info *sbi, + int type) +{ + return rcu_dereference_protected(sbi->s_qf_names[type], + lockdep_is_held(&sb->s_umount)); +} #else static inline void ext4_quota_off_umount(struct super_block *sb) { @@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb) percpu_free_rwsem(&sbi->s_journal_flag_rwsem); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list @@ -1531,11 +1543,10 @@ static const char deprecated_msg[] = static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *qname; + char *qname, *old_qname = get_qf_name(sb, sbi, qtype); int ret = -1; - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && !old_qname) { ext4_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); @@ -1552,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "Not enough memory for storing quotafile name"); return -1; } - if (sbi->s_qf_names[qtype]) { - if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + if (old_qname) { + if (strcmp(old_qname, qname) == 0) ret = 1; else ext4_msg(sb, KERN_ERR, @@ -1566,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "quotafile must be on filesystem root"); goto errout; } - sbi->s_qf_names[qtype] = qname; + rcu_assign_pointer(sbi->s_qf_names[qtype], qname); set_opt(sb, QUOTA); return 1; errout: @@ -1578,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype) { struct ext4_sb_info *sbi = EXT4_SB(sb); + char *old_qname = get_qf_name(sb, sbi, qtype); - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && old_qname) { ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" " when quota turned on"); return -1; } - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + rcu_assign_pointer(sbi->s_qf_names[qtype], NULL); + synchronize_rcu(); + kfree(old_qname); return 1; } #endif @@ -1961,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *p; + char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; substring_t args[MAX_OPT_ARGS]; int token; @@ -1992,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb, "Cannot enable project quota enforcement."); return 0; } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + usr_qf_name = get_qf_name(sb, sbi, USRQUOTA); + grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA); + if (usr_qf_name || grp_qf_name) { + if (test_opt(sb, USRQUOTA) && usr_qf_name) clear_opt(sb, USRQUOTA); - if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && grp_qf_name) clear_opt(sb, GRPQUOTA); if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { @@ -2030,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); + char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; @@ -2048,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq, seq_printf(seq, ",jqfmt=%s", fmtname); } - if (sbi->s_qf_names[USRQUOTA]) - seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + rcu_read_lock(); + usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); + grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); + if (usr_qf_name) + seq_show_option(seq, "usrjquota", usr_qf_name); + if (grp_qf_name) + seq_show_option(seq, "grpjquota", grp_qf_name); + rcu_read_unlock(); #endif } @@ -5104,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int err = 0; #ifdef CONFIG_QUOTA int i, j; + char *to_free[EXT4_MAXQUOTAS]; #endif char *orig_data = kstrdup(data, GFP_KERNEL); @@ -5123,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { - old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); + char *qf_name = get_qf_name(sb, sbi, i); + + old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); @@ -5353,9 +5373,12 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + to_free[i] = get_qf_name(sb, sbi, i); + rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } + synchronize_rcu(); + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(to_free[i]); #endif kfree(orig_data); return err; @@ -5546,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type) */ static int ext4_quota_on_mount(struct super_block *sb, int type) { - return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), EXT4_SB(sb)->s_jquota_fmt, type); }