From ffa09b3bd02427ab631f0c1b64714ce6fc885f61 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 10 Aug 2023 23:48:59 +0800 Subject: [PATCH 01/17] erofs: DEFLATE compression support Add DEFLATE compression as the 3rd supported algorithm. DEFLATE is a popular generic-purpose compression algorithm for quite long time (many advanced formats like gzip, zlib, zip, png are all based on that) as Apple documentation written "If you require interoperability with non-Apple devices, use COMPRESSION_ZLIB. [1]". Due to its popularity, there are several hardware on-market DEFLATE accelerators, such as (s390) DFLTCC, (Intel) IAA/QAT, (HiSilicon) ZIP accelerator, etc. In addition, there are also several high-performence IP cores and even open-source FPGA approches available for DEFLATE. Therefore, it's useful to support DEFLATE compression in order to find a way to utilize these accelerators for asynchronous I/Os and get benefits from these later. Besides, it's a good choice to trade off between compression ratios and performance compared to LZ4 and LZMA. The DEFLATE core format is simple as well as easy to understand, therefore the code size of its decompressor is small even for the bootloader use cases. The runtime memory consumption is quite limited too (e.g. 32K + ~7K for each zlib stream). As usual, EROFS ourperforms similar approaches too. Alternatively, DEFLATE could still be used for some specific files since EROFS supports multiple compression algorithms in one image. [1] https://developer.apple.com/documentation/compression/compression_algorithm Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230810154859.118330-1-hsiangkao@linux.alibaba.com --- fs/erofs/Kconfig | 15 ++ fs/erofs/Makefile | 1 + fs/erofs/compress.h | 2 + fs/erofs/decompressor.c | 6 + fs/erofs/decompressor_deflate.c | 247 ++++++++++++++++++++++++++++++++ fs/erofs/erofs_fs.h | 7 + fs/erofs/internal.h | 20 +++ fs/erofs/super.c | 10 ++ fs/erofs/zmap.c | 5 +- 9 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 fs/erofs/decompressor_deflate.c diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index f259d92c9720..56a99ba8ce22 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -99,6 +99,21 @@ config EROFS_FS_ZIP_LZMA If unsure, say N. +config EROFS_FS_ZIP_DEFLATE + bool "EROFS DEFLATE compressed data support" + depends on EROFS_FS_ZIP + select ZLIB_INFLATE + help + Saying Y here includes support for reading EROFS file systems + containing DEFLATE compressed data. It gives better compression + ratios than the default LZ4 format, while it costs more CPU + overhead. + + DEFLATE support is an experimental feature for now and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support" depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index a3a98fc3e481..994d0b9deddf 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -5,4 +5,5 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o +erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index b1b846504027..349c3316ae6b 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -94,4 +94,6 @@ extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool); +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); #endif diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index cfad1eac7fd9..332ec5f74002 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -379,4 +379,10 @@ const struct z_erofs_decompressor erofs_decompressors[] = { .name = "lzma" }, #endif +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE + [Z_EROFS_COMPRESSION_DEFLATE] = { + .decompress = z_erofs_deflate_decompress, + .name = "deflate" + }, +#endif }; diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c new file mode 100644 index 000000000000..19e5bdeb30b6 --- /dev/null +++ b/fs/erofs/decompressor_deflate.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include "compress.h" + +struct z_erofs_deflate { + struct z_erofs_deflate *next; + struct z_stream_s z; + u8 bounce[PAGE_SIZE]; +}; + +static DEFINE_SPINLOCK(z_erofs_deflate_lock); +static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms; +static struct z_erofs_deflate *z_erofs_deflate_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq); + +module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444); + +void z_erofs_deflate_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + continue; + } + z_erofs_deflate_head = NULL; + spin_unlock(&z_erofs_deflate_lock); + + while (strm) { + struct z_erofs_deflate *n = strm->next; + + vfree(strm->z.workspace); + kfree(strm); + --z_erofs_deflate_avail_strms; + strm = n; + } + } +} + +int __init z_erofs_deflate_init(void) +{ + /* by default, use # of possible CPUs instead */ + if (!z_erofs_deflate_nstrms) + z_erofs_deflate_nstrms = num_possible_cpus(); + + for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms; + ++z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + strm = kzalloc(sizeof(*strm), GFP_KERNEL); + if (!strm) + goto out_failed; + + /* XXX: in-kernel zlib cannot shrink windowbits currently */ + strm->z.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!strm->z.workspace) { + kfree(strm); + goto out_failed; + } + + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + } + return 0; + +out_failed: + pr_err("failed to allocate zlib workspace\n"); + z_erofs_deflate_exit(); + return -ENOMEM; +} + +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_deflate_cfgs *dfl, int size) +{ + if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) { + erofs_err(sb, "invalid deflate cfgs, size=%u", size); + return -EINVAL; + } + + if (dfl->windowbits > MAX_WBITS) { + erofs_err(sb, "unsupported windowbits %u", dfl->windowbits); + return -EOPNOTSUPP; + } + + erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!"); + return 0; +} + +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + struct super_block *sb = rq->sb; + unsigned int insz, outsz, pofs; + struct z_erofs_deflate *strm; + u8 *kin, *kout = NULL; + bool bounced = false; + int no = -1, ni = 0, j = 0, zerr, err; + + /* 1. get the exact DEFLATE compressed size */ + kin = kmap_local_page(*rq->in); + err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + sb->s_blocksize - rq->pageofs_in)); + if (err) { + kunmap_local(kin); + return err; + } + + /* 2. get an available DEFLATE context */ +again: + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head)); + goto again; + } + z_erofs_deflate_head = strm->next; + spin_unlock(&z_erofs_deflate_lock); + + /* 3. multi-call decompress */ + insz = rq->inputsize; + outsz = rq->outputsize; + zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS); + if (zerr != Z_OK) { + err = -EIO; + goto failed_zinit; + } + + pofs = rq->pageofs_out; + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in); + insz -= strm->z.avail_in; + strm->z.next_in = kin + rq->pageofs_in; + strm->z.avail_out = 0; + + while (1) { + if (!strm->z.avail_out) { + if (++no >= nrpages_out || !outsz) { + erofs_err(sb, "insufficient space for decompressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) + kunmap_local(kout); + strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); + outsz -= strm->z.avail_out; + if (!rq->out[no]) { + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + pofs; + pofs = 0; + } + + if (!strm->z.avail_in && insz) { + if (++ni >= nrpages_in) { + erofs_err(sb, "invalid compressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) { /* unlike kmap(), take care of the orders */ + j = strm->z.next_out - kout; + kunmap_local(kout); + } + kunmap_local(kin); + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE); + insz -= strm->z.avail_in; + kin = kmap_local_page(rq->in[ni]); + strm->z.next_in = kin; + bounced = false; + if (kout) { + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + j; + } + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Or use short-lived pages from the + * on-stack pagepool where pages share among the same request + * and not _all_ inplace I/O pages are needed to be doubled. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in); + strm->z.next_in = strm->bounce; + bounced = true; + } + + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + + zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH); + if (zerr != Z_OK || !(outsz + strm->z.avail_out)) { + if (zerr == Z_OK && rq->partial_decoding) + break; + if (zerr == Z_STREAM_END && !outsz) + break; + erofs_err(sb, "failed to decompress %d in[%u] out[%u]", + zerr, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + + if (zlib_inflateEnd(&strm->z) != Z_OK && !err) + err = -EIO; + if (kout) + kunmap_local(kout); +failed_zinit: + kunmap_local(kin); + /* 4. push back DEFLATE stream context to the global list */ + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + wake_up(&z_erofs_deflate_wq); + return err; +} diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 2c7b16e340fe..364f51171c13 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -289,6 +289,7 @@ struct erofs_dirent { enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_LZMA = 1, + Z_EROFS_COMPRESSION_DEFLATE = 2, Z_EROFS_COMPRESSION_MAX }; #define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) @@ -309,6 +310,12 @@ struct z_erofs_lzma_cfgs { #define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) +/* 6 bytes (+ length field = 8 bytes) */ +struct z_erofs_deflate_cfgs { + u8 windowbits; /* 8..15 for DEFLATE */ + u8 reserved[5]; +} __packed; + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 36e32fa542f0..fb45855cfd5d 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -519,6 +519,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP_LZMA */ +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE +int __init z_erofs_deflate_init(void); +void z_erofs_deflate_exit(void); +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_deflate_cfgs *dfl, int size); +#else +static inline int z_erofs_deflate_init(void) { return 0; } +static inline int z_erofs_deflate_exit(void) { return 0; } +static inline int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_deflate_cfgs *dfl, int size) { + if (dfl) { + erofs_err(sb, "deflate algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} +#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */ + #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 566f68ddfa36..3275505059cc 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -201,6 +201,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, case Z_EROFS_COMPRESSION_LZMA: ret = z_erofs_load_lzma_config(sb, dsb, data, size); break; + case Z_EROFS_COMPRESSION_DEFLATE: + ret = z_erofs_load_deflate_config(sb, dsb, data, size); + break; default: DBG_BUGON(1); ret = -EFAULT; @@ -964,6 +967,10 @@ static int __init erofs_module_init(void) if (err) goto lzma_err; + err = z_erofs_deflate_init(); + if (err) + goto deflate_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) @@ -984,6 +991,8 @@ fs_err: sysfs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_deflate_exit(); +deflate_err: z_erofs_lzma_exit(); lzma_err: erofs_exit_shrinker(); @@ -1001,6 +1010,7 @@ static void __exit erofs_module_exit(void) erofs_exit_sysfs(); z_erofs_exit_zip_subsystem(); + z_erofs_deflate_exit(); z_erofs_lzma_exit(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 1909ddafd9c7..7b55111fd533 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -561,8 +561,9 @@ static int z_erofs_do_map_blocks(struct inode *inode, if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && - map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && - map->m_llen >= i_blocksize(inode))) { + (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA || + map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) && + map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; From 3f339920175c871ac63b4ea179117da7518618fd Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Sat, 22 Jul 2023 17:45:37 +0800 Subject: [PATCH 02/17] erofs: update on-disk format for xattr name filter The xattr name bloom filter feature is going to be introduced to speed up the negative xattr lookup, e.g. system.posix_acl_[access|default] lookup when running "ls -lR" workload. There are some commonly used extended attributes (n) and the total number of these is approximately 30. trusted.overlay.opaque trusted.overlay.redirect trusted.overlay.origin trusted.overlay.impure trusted.overlay.nlink trusted.overlay.upper trusted.overlay.metacopy trusted.overlay.protattr user.overlay.opaque user.overlay.redirect user.overlay.origin user.overlay.impure user.overlay.nlink user.overlay.upper user.overlay.metacopy user.overlay.protattr security.evm security.ima security.selinux security.SMACK64 security.SMACK64IPIN security.SMACK64IPOUT security.SMACK64EXEC security.SMACK64TRANSMUTE security.SMACK64MMAP security.apparmor security.capability system.posix_acl_access system.posix_acl_default user.mime_type Given the number of bits of the bloom filter (m) is 32, the optimal value for the number of the hash functions (k) is 1 (ln2 * m/n = 0.74). The single hash function is implemented as: xxh32(name, strlen(name), EROFS_XATTR_FILTER_SEED + index) where `index` represents the index of corresponding predefined short name prefix, while `name` represents the name string after stripping the above predefined name prefix. The constant magic number EROFS_XATTR_FILTER_SEED, i.e. 0x25BBE08F, is used to give a better spread when mapping these 30 extended attributes into 32-bit bloom filter as: bit 0: security.ima bit 1: bit 2: trusted.overlay.nlink bit 3: bit 4: user.overlay.nlink bit 5: trusted.overlay.upper bit 6: user.overlay.origin bit 7: trusted.overlay.protattr bit 8: security.apparmor bit 9: user.overlay.protattr bit 10: user.overlay.opaque bit 11: security.selinux bit 12: security.SMACK64TRANSMUTE bit 13: security.SMACK64 bit 14: security.SMACK64MMAP bit 15: user.overlay.impure bit 16: security.SMACK64IPIN bit 17: trusted.overlay.redirect bit 18: trusted.overlay.origin bit 19: security.SMACK64IPOUT bit 20: trusted.overlay.opaque bit 21: system.posix_acl_default bit 22: bit 23: user.mime_type bit 24: trusted.overlay.impure bit 25: security.SMACK64EXEC bit 26: user.overlay.redirect bit 27: user.overlay.upper bit 28: security.evm bit 29: security.capability bit 30: system.posix_acl_access bit 31: trusted.overlay.metacopy, user.overlay.metacopy h_name_filter is introduced to the on-disk per-inode xattr header to place the corresponding xattr name filter, where bit value 1 indicates non-existence for compatibility. This feature is indicated by EROFS_FEATURE_COMPAT_XATTR_FILTER compatible feature bit. Reserve one byte in on-disk superblock as the on-disk format for xattr name filter may change in the future. With this flag we don't need bothering these compatible bits again at that time. Suggested-by: Alexander Larsson Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230722094538.11754-2-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/erofs_fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 364f51171c13..a03ec70ba6f2 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -13,6 +13,7 @@ #define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 #define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should @@ -81,7 +82,8 @@ struct erofs_super_block { __u8 xattr_prefix_count; /* # of long xattr name prefixes */ __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ - __u8 reserved2[24]; + __u8 xattr_filter_reserved; /* reserved for xattr name filter */ + __u8 reserved2[23]; }; /* @@ -200,7 +202,7 @@ struct erofs_inode_extended { * for read-only fs, no need to introduce h_refcount */ struct erofs_xattr_ibody_header { - __le32 h_reserved; + __le32 h_name_filter; /* bit value 1 indicates not-present */ __u8 h_shared_count; __u8 h_reserved2[7]; __le32 h_shared_xattrs[]; /* shared xattr id array */ @@ -221,6 +223,10 @@ struct erofs_xattr_ibody_header { #define EROFS_XATTR_LONG_PREFIX 0x80 #define EROFS_XATTR_LONG_PREFIX_MASK 0x7f +#define EROFS_XATTR_FILTER_BITS 32 +#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX +#define EROFS_XATTR_FILTER_SEED 0x25BBE08F + /* xattr entry (for both inline & shared xattrs) */ struct erofs_xattr_entry { __u8 e_name_len; /* length of name */ From fd73a4395d477ae134f319f7368a9f8a6264fd8b Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Sat, 22 Jul 2023 17:45:38 +0800 Subject: [PATCH 03/17] erofs: boost negative xattr lookup with bloom filter Optimise the negative xattr lookup with bloom filter. The bit value for the bloom filter map has a reverse semantics for compatibility. That is, the bit value of 0 indicates existence, while the bit value of 1 indicates the absence of corresponding xattr. The initial version is _only_ enabled when xattr_filter_reserved is zero. The filter map internals may change in the future, in which case the reserved flag will be set non-zero and we don't need bothering the compatible bits again at that time. For now disable the optimization if this reserved flag is non-zero. Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230722094538.11754-3-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/Kconfig | 1 + fs/erofs/internal.h | 3 +++ fs/erofs/super.c | 1 + fs/erofs/xattr.c | 14 ++++++++++++++ 4 files changed, 19 insertions(+) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 56a99ba8ce22..f6dc961e6c2b 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -38,6 +38,7 @@ config EROFS_FS_DEBUG config EROFS_FS_XATTR bool "EROFS extended attributes" depends on EROFS_FS + select XXHASH default y help Extended attributes are name:value pairs associated with inodes by diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index fb45855cfd5d..4ff88d0dd980 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -151,6 +151,7 @@ struct erofs_sb_info { u32 xattr_prefix_start; u8 xattr_prefix_count; struct erofs_xattr_prefix_item *xattr_prefixes; + unsigned int xattr_filter_reserved; #endif u16 device_id_mask; /* valid bits of device id to be used */ @@ -251,6 +252,7 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) +EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 @@ -270,6 +272,7 @@ struct erofs_inode { unsigned char inode_isize; unsigned int xattr_isize; + unsigned int xattr_name_filter; unsigned int xattr_shared_count; unsigned int *xattr_shared_xattrs; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 3275505059cc..3d04cc33a6b8 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -391,6 +391,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); sbi->xattr_prefix_count = dsb->xattr_prefix_count; + sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 40178b6e0688..09d341675e89 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -5,6 +5,7 @@ * Copyright (C) 2021-2022, Alibaba Cloud */ #include +#include #include "xattr.h" struct erofs_xattr_iter { @@ -87,6 +88,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) } ih = it.kaddr + erofs_blkoff(sb, it.pos); + vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); @@ -392,7 +394,10 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { int ret; + unsigned int hashbit; struct erofs_xattr_iter it; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!name) return -EINVAL; @@ -401,6 +406,15 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, if (ret) return ret; + /* reserved flag is non-zero if there's any change of on-disk format */ + if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) { + hashbit = xxh32(name, strlen(name), + EROFS_XATTR_FILTER_SEED + index); + hashbit &= EROFS_XATTR_FILTER_BITS - 1; + if (vi->xattr_name_filter & (1U << hashbit)) + return -ENOATTR; + } + it.index = index; it.name = (struct qstr)QSTR_INIT(name, strlen(name)); if (it.name.len > EROFS_NAME_LEN) From e3157bb55d3ed81ab3e21d1e2a07527d674556b4 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Wed, 9 Aug 2023 14:06:37 +0800 Subject: [PATCH 04/17] erofs: refine warning messages for zdata I/Os Don't warn users since -EINTR can be returned due to user interruption. Also suppress warning messages of readmore. Signed-off-by: Ferry Meng Reviewed-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230809060637.21311-1-mengferry@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index de4f12152b62..53820271e538 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1848,15 +1848,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, page = erofs_grab_cache_page_nowait(inode->i_mapping, index); if (page) { - if (PageUptodate(page)) { + if (PageUptodate(page)) unlock_page(page); - } else { - err = z_erofs_do_read_page(f, page); - if (err) - erofs_err(inode->i_sb, - "readmore error at page %lu @ nid %llu", - index, EROFS_I(inode)->nid); - } + else + (void)z_erofs_do_read_page(f, page); put_page(page); } @@ -1885,8 +1880,9 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); - if (err) - erofs_err(inode->i_sb, "failed to read, err [%d]", err); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", + err, folio->index, EROFS_I(inode)->nid); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); @@ -1920,10 +1916,9 @@ static void z_erofs_readahead(struct readahead_control *rac) head = (void *)page_private(page); err = z_erofs_do_read_page(&f, page); - if (err) - erofs_err(inode->i_sb, - "readahead error at page %lu @ nid %llu", - page->index, EROFS_I(inode)->nid); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error %d @ %lu of nid %llu", + err, page->index, EROFS_I(inode)->nid); put_page(page); } z_erofs_pcluster_readmore(&f, rac, false); From 428f27cc8d3202ebbc5013fe53e8596949955d85 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Tue, 15 Aug 2023 17:48:47 +0800 Subject: [PATCH 05/17] erofs: clean up redundant comment and adjust code alignment Remove some redundant comments in erofs/super.c, and avoid unncessary line breaks for cleanup. Signed-off-by: Ferry Meng Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230815094849.53249-1-mengferry@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 3d04cc33a6b8..f3b33332a446 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -21,8 +21,7 @@ static struct kmem_cache *erofs_inode_cachep __read_mostly; struct file_system_type erofs_fs_type; -void _erofs_err(struct super_block *sb, const char *function, - const char *fmt, ...) +void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -32,12 +31,11 @@ void _erofs_err(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf); + pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); va_end(args); } -void _erofs_info(struct super_block *sb, const char *function, - const char *fmt, ...) +void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -102,11 +100,9 @@ static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - /* be careful of RCU symlink path */ if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); - kmem_cache_free(erofs_inode_cachep, vi); } @@ -119,8 +115,7 @@ static bool check_layout_compatibility(struct super_block *sb, /* check if current kernel meets all mandatory requirements */ if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, - "unidentified incompatible feature %x, please upgrade kernel version", + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", feature & ~EROFS_ALL_FEATURE_INCOMPAT); return false; } @@ -433,7 +428,6 @@ out: return ret; } -/* set up default EROFS parameters */ static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP @@ -735,7 +729,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) xa_init(&sbi->managed_pslots); #endif - /* get the root inode */ inode = erofs_iget(sb, ROOT_NID(sbi)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -752,7 +745,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; erofs_shrinker_register(sb); - /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); if (IS_ERR(sbi->packed_inode)) { @@ -885,10 +877,6 @@ static int erofs_init_fs_context(struct fs_context *fc) return 0; } -/* - * could be triggered after deactivate_locked_super() - * is called, thus including umount and failed to initialize. - */ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -917,7 +905,6 @@ static void erofs_kill_sb(struct super_block *sb) sb->s_fs_info = NULL; } -/* called when ->s_root is non-NULL */ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); @@ -1018,7 +1005,6 @@ static void __exit erofs_module_exit(void) erofs_pcpubuf_exit(); } -/* get filesystem statistics */ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; From 8ec9a25258244c4b3955ad052cd4c74ccf3d5d6a Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Tue, 15 Aug 2023 17:48:48 +0800 Subject: [PATCH 06/17] erofs: add necessary kmem_cache_create flags for erofs inode cache To improve memory access efficiency and enable statistics functionality, add SLAB_MEM_SPREAD and SLAB_ACCOUNT flag during erofs_inode_cachep's allocation time. Signed-off-by: Ferry Meng Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230815094849.53249-2-mengferry@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index f3b33332a446..e5fb1775dd02 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -941,9 +941,9 @@ static int __init erofs_module_init(void) erofs_check_ondisk_layout_definitions(); erofs_inode_cachep = kmem_cache_create("erofs_inode", - sizeof(struct erofs_inode), 0, - SLAB_RECLAIM_ACCOUNT, - erofs_inode_init_once); + sizeof(struct erofs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + erofs_inode_init_once); if (!erofs_inode_cachep) return -ENOMEM; From d442495c9676b6587f0058c5843b2d053dd1765b Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Tue, 15 Aug 2023 17:48:49 +0800 Subject: [PATCH 07/17] erofs: remove redundant erofs_fs_type declaration in super.c As erofs_fs_type has been declared in internal.h, there is no use to declare repeatedly in super.c. Signed-off-by: Ferry Meng Reviewed-by: Gao Xiang eviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230815094849.53249-3-mengferry@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index e5fb1775dd02..91ed1460613b 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -19,7 +19,6 @@ #include static struct kmem_cache *erofs_inode_cachep __read_mostly; -struct file_system_type erofs_fs_type; void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) { From 8b00be163f7b57cbf957b3d27b5a7ca1e2495cfa Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:06 +0800 Subject: [PATCH 08/17] erofs: simplify z_erofs_read_fragment() A trivial cleanup to make the fragment handling logic more clear. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 53820271e538..dc104add0a99 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -932,34 +932,27 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) return true; } -static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, - struct page *page, unsigned int pageofs, - unsigned int len) +static int z_erofs_read_fragment(struct super_block *sb, struct page *page, + unsigned int cur, unsigned int end, erofs_off_t pos) { - struct super_block *sb = inode->i_sb; - struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u8 *src, *dst; - unsigned int i, cnt; + unsigned int cnt; + u8 *src; if (!packed_inode) return -EFSCORRUPTED; buf.inode = packed_inode; - pos += EROFS_I(inode)->z_fragmentoff; - for (i = 0; i < len; i += cnt) { - cnt = min_t(unsigned int, len - i, + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } - - dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); - kunmap_local(dst); - pos += cnt; + memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); } erofs_put_metabuf(&buf); return 0; @@ -972,7 +965,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, spiltted; + unsigned int cur, end, len, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -1041,17 +1034,11 @@ hitted: goto next_part; } if (map->m_flags & EROFS_MAP_FRAGMENT) { - unsigned int pageofs, skip, len; + erofs_off_t fpos = offset + cur - map->m_la; - if (offset > map->m_la) { - pageofs = 0; - skip = offset - map->m_la; - } else { - pageofs = map->m_la & ~PAGE_MASK; - skip = 0; - } - len = min_t(unsigned int, map->m_llen - skip, end - cur); - err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + len = min_t(unsigned int, map->m_llen - fpos, end - cur); + err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, + EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; ++spiltted; From dcba1b232e26ebadbd215728199455d38a59253e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:07 +0800 Subject: [PATCH 09/17] erofs: avoid obsolete {collector,collection} terms {collector,collection} were once reserved in order to indicate different runtime logical extent instance of multi-reference pclusters. However, de-duplicated decompression has been landed in a more flexable way, thus `struct z_erofs_collection` was formally removed in commit 87ca34a7065d ("erofs: get rid of `struct z_erofs_collection'"). Let's handle the remaining leftovers, for example: `z_erofs_collector_begin` => `z_erofs_pcluster_begin` `z_erofs_collector_end` => `z_erofs_pcluster_end` as well as some comments. No logic changes. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-2-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index dc104add0a99..4ed99346c4e1 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -507,19 +507,17 @@ enum z_erofs_pclustermode { */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The current collection has been linked with the owned chain, and - * could also be linked with the remaining collections, which means - * if the processing page is the tail page of the collection, thus - * the current collection can safely use the whole page (since - * the previous collection is under control) for in-place I/O, as - * illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current cl) | (of the previous collection) | - * | | | - * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| + * The pcluster was just linked to a decompression chain by us. It can + * also be linked with the remaining pclusters, which means if the + * processing page is the tail page of a pcluster, this pcluster can + * safely use the whole page (since the previous pcluster is within the + * same chain) for in-place I/O, as illustrated below: + * ___________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current pcl) | (of the previous pcl) | + * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| * - * [ (*) the above page can be used as inplace I/O. ] + * [ (*) the page above can be used as inplace I/O. ] */ Z_EROFS_PCLUSTER_FOLLOWED, }; @@ -851,7 +849,7 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct erofs_workgroup *grp = NULL; @@ -908,12 +906,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) - return false; + return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); @@ -929,7 +927,7 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; - return true; + fe->backmost = false; } static int z_erofs_read_fragment(struct super_block *sb, struct page *page, @@ -978,8 +976,7 @@ repeat: if (offset + cur < map->m_la || offset + cur >= map->m_la + map->m_llen) { - if (z_erofs_collector_end(fe)) - fe->backmost = false; + z_erofs_pcluster_end(fe); map->m_la = offset + cur; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); @@ -995,7 +992,7 @@ repeat: map->m_flags & EROFS_MAP_FRAGMENT) goto hitted; - err = z_erofs_collector_begin(fe); + err = z_erofs_pcluster_begin(fe); if (err) goto out; @@ -1862,7 +1859,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_do_read_page(&f, page); z_erofs_pcluster_readmore(&f, NULL, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); @@ -1909,7 +1906,7 @@ static void z_erofs_readahead(struct readahead_control *rac) put_page(page); } z_erofs_pcluster_readmore(&f, rac, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); From aeebae9d77217709f8ae3edb0cd7858ec8c7a9d6 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:08 +0800 Subject: [PATCH 10/17] erofs: move preparation logic into z_erofs_pcluster_begin() Some preparation logic should be part of z_erofs_pcluster_begin() instead of z_erofs_do_read_page(). Let's move now. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-3-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 60 ++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 4ed99346c4e1..9b67991edc7b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -852,6 +852,8 @@ err_out: static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct erofs_workgroup *grp = NULL; int ret; @@ -861,8 +863,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(fe->inode->i_sb, - map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(sb, blknr); } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; @@ -881,9 +882,26 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - /* since file-backed online pages are traversed in reverse order */ + if (!z_erofs_is_inline_pcluster(fe->pcl)) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + void *mptr; + + mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + if (IS_ERR(mptr)) { + ret = PTR_ERR(mptr); + erofs_err(sb, "failed to get inline data %d", ret); + return ret; + } + get_page(map->buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -982,39 +1000,15 @@ repeat: err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else { - if (fe->pcl) - goto hitted; - /* didn't get a valid pcluster previously (very rare) */ + } else if (fe->pcl) { + goto hitted; } - if (!(map->m_flags & EROFS_MAP_MAPPED) || - map->m_flags & EROFS_MAP_FRAGMENT) - goto hitted; - - err = z_erofs_pcluster_begin(fe); - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(fe->pcl)) { - void *mp; - - mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(inode->i_sb, map->m_pa), - EROFS_NO_KMAP); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); - erofs_err(inode->i_sb, - "failed to get inline page, err %d", err); + if ((map->m_flags & EROFS_MAP_MAPPED) && + !(map->m_flags & EROFS_MAP_FRAGMENT)) { + err = z_erofs_pcluster_begin(fe); + if (err) goto out; - } - get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, - fe->map.buf.page); - fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; - } else { - /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe); } hitted: /* From e4c1cf523d820730a86cae2c6d55924833b6f7ac Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:09 +0800 Subject: [PATCH 11/17] erofs: tidy up z_erofs_do_read_page() - Fix a typo: spiltted => split; - Move !EROFS_MAP_MAPPED and EROFS_MAP_FRAGMENT upwards; - Increase `split` in advance to avoid unnecessary repeats. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-4-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 53 ++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 9b67991edc7b..6c78566efb92 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -981,49 +981,34 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, len, spiltted; + unsigned int cur, end, len, split; int err = 0; - /* register locked file pages as online pages in pack */ z_erofs_onlinepage_init(page); - spiltted = 0; + split = 0; end = PAGE_SIZE; repeat: - cur = end - 1; - - if (offset + cur < map->m_la || - offset + cur >= map->m_la + map->m_llen) { + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { z_erofs_pcluster_end(fe); - map->m_la = offset + cur; + map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else if (fe->pcl) { - goto hitted; } - if ((map->m_flags & EROFS_MAP_MAPPED) && - !(map->m_flags & EROFS_MAP_FRAGMENT)) { - err = z_erofs_pcluster_begin(fe); - if (err) - goto out; - } -hitted: - /* - * Ensure the current partial page belongs to this submit chain rather - * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or bvpage (should be processed in a strict order.) - */ - tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + cur = offset > map->m_la ? 0 : map->m_la - offset; + /* bump split parts first to avoid several separate cases */ + ++split; - cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); + tight = false; goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { erofs_off_t fpos = offset + cur - map->m_la; @@ -1032,12 +1017,24 @@ hitted: EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; - ++spiltted; tight = false; goto next_part; } - exclusive = (!cur && (!spiltted || tight)); + if (!fe->pcl) { + err = z_erofs_pcluster_begin(fe); + if (err) + goto out; + } + + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or bvpage (should be processed in a strict order.) + */ + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + exclusive = (!cur && ((split <= 1) || tight)); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); @@ -1050,8 +1047,6 @@ hitted: goto out; z_erofs_onlinepage_split(page); - /* bump up the number of spiltted parts of a page */ - ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { From 9a05c6a8bc26138d34e87b39e6a815603bc2a66c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:10 +0800 Subject: [PATCH 12/17] erofs: drop z_erofs_page_mark_eio() It can be folded into z_erofs_onlinepage_endio() to simplify the code. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-5-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6c78566efb92..82834e302fb4 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -143,22 +143,17 @@ static inline void z_erofs_onlinepage_split(struct page *page) atomic_inc((atomic_t *)&page->private); } -static inline void z_erofs_page_mark_eio(struct page *page) +static void z_erofs_onlinepage_endio(struct page *page, int err) { - int orig; + int orig, v; + + DBG_BUGON(!PagePrivate(page)); do { orig = atomic_read((atomic_t *)&page->private); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, - orig | Z_EROFS_PAGE_EIO) != orig); -} + v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); -static inline void z_erofs_onlinepage_endio(struct page *page) -{ - unsigned int v; - - DBG_BUGON(!PagePrivate(page)); - v = atomic_dec_return((atomic_t *)&page->private); if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); @@ -1067,9 +1062,7 @@ next_part: goto repeat; out: - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + z_erofs_onlinepage_endio(page, err); return err; } @@ -1172,9 +1165,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - if (err) - z_erofs_page_mark_eio(bvi->bvec.page); - z_erofs_onlinepage_endio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page, err); list_del(p); kfree(bvi); } @@ -1345,9 +1336,7 @@ out: /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + z_erofs_onlinepage_endio(page, err); } if (be->decompressed_pages != be->onstack_pages) From 06ec03660d819fdb88692d62a86464daf6ddb0a5 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:11 +0800 Subject: [PATCH 13/17] erofs: get rid of fe->backmost for cache decompression EROFS_MAP_FULL_MAPPED is more accurate to decide if caching the last incomplete pcluster for later read or not. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-6-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 82834e302fb4..7e55e95bc33a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -528,8 +528,6 @@ struct z_erofs_decompress_frontend { z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - /* used for applying cache strategy on the fly */ - bool backmost; erofs_off_t headoffset; /* a pointer used to pick up inplace I/O pages */ @@ -538,7 +536,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } + .mode = Z_EROFS_PCLUSTER_FOLLOWED } static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) { @@ -547,7 +545,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) return false; - if (fe->backmost) + if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) return true; if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && @@ -940,7 +938,6 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; - fe->backmost = false; } static int z_erofs_read_fragment(struct super_block *sb, struct page *page, From 491b1105a8e25d9d26ea321839e057a752e31383 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:12 +0800 Subject: [PATCH 14/17] erofs: adapt folios for z_erofs_readahead() It's a straight-forward conversion except that readahead_folio() will do folio_put() in advance but it doesn't matter since folios are still locked. As before, since file-backed folios (pages for now) are locked, so we could temporarily use folio->private as an internal counter to indicate split parts of each folio for the corresponding pclusters to decompress. When such counter becomes zero, the folio will be finally unlocked (see compress.h and z_erofs_onlinepage_endio()). Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-7-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 7e55e95bc33a..78631d113d98 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1853,37 +1853,35 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *head = NULL, *page; - unsigned int nr_pages; + struct folio *head = NULL, *folio; + unsigned int nr_folios; + int err; f.headoffset = readahead_pos(rac); z_erofs_pcluster_readmore(&f, rac, true); - nr_pages = readahead_count(rac); - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + nr_folios = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false); - while ((page = readahead_page(rac))) { - set_page_private(page, (unsigned long)head); - head = page; + while ((folio = readahead_folio(rac))) { + folio->private = head; + head = folio; } + /* traverse in reverse order for best metadata I/O performance */ while (head) { - struct page *page = head; - int err; + folio = head; + head = folio_get_private(folio); - /* traversal in reverse order */ - head = (void *)page_private(page); - - err = z_erofs_do_read_page(&f, page); + err = z_erofs_do_read_page(&f, &folio->page); if (err && err != -EINTR) - erofs_err(inode->i_sb, "readahead error %d @ %lu of nid %llu", - err, page->index, EROFS_I(inode)->nid); - put_page(page); + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); } z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } From c33ad3b2b7100ac944aad38d3ebc89cb5f654e94 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:39:42 +0800 Subject: [PATCH 15/17] erofs: adapt folios for z_erofs_read_folio() It's a straight-forward conversion and no logic changes (except that it renames the corresponding tracepoint.) Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817083942.103303-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 9 ++++----- include/trace/events/erofs.h | 16 ++++++++-------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 78631d113d98..da52b1cccd38 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1822,17 +1822,16 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *const inode = page->mapping->host; + struct inode *const inode = folio->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); int err; - trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + trace_erofs_read_folio(folio, false); + f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, page); + err = z_erofs_do_read_page(&f, &folio->page); z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 71dbe8bfa7db..e18684b02c3d 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -80,11 +80,11 @@ TRACE_EVENT(erofs_fill_inode, __entry->blkaddr, __entry->ofs) ); -TRACE_EVENT(erofs_readpage, +TRACE_EVENT(erofs_read_folio, - TP_PROTO(struct page *page, bool raw), + TP_PROTO(struct folio *folio, bool raw), - TP_ARGS(page, raw), + TP_ARGS(folio, raw), TP_STRUCT__entry( __field(dev_t, dev ) @@ -96,11 +96,11 @@ TRACE_EVENT(erofs_readpage, ), TP_fast_assign( - __entry->dev = page->mapping->host->i_sb->s_dev; - __entry->nid = EROFS_I(page->mapping->host)->nid; - __entry->dir = S_ISDIR(page->mapping->host->i_mode); - __entry->index = page->index; - __entry->uptodate = PageUptodate(page); + __entry->dev = folio->mapping->host->i_sb->s_dev; + __entry->nid = EROFS_I(folio->mapping->host)->nid; + __entry->dir = S_ISDIR(folio->mapping->host->i_mode); + __entry->index = folio->index; + __entry->uptodate = folio_test_uptodate(folio); __entry->raw = raw; ), From 5ec693ca70dd88db20094cab9aa045852363b93f Mon Sep 17 00:00:00 2001 From: sunshijie Date: Mon, 21 Aug 2023 23:21:16 +0800 Subject: [PATCH 16/17] erofs: don't warn dedupe and fragments features anymore The `dedupe` and `fragments` features have been merged for a year. They are mostly stable now. Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: sunshijie Link: https://lore.kernel.org/r/20230821041737.2673401-1-sunshijie@xiaomi.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 91ed1460613b..44a24d573f1f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -418,10 +418,6 @@ static int erofs_read_superblock(struct super_block *sb) if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); - if (erofs_sb_has_fragments(sbi)) - erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); - if (erofs_sb_has_dedupe(sbi)) - erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; From 91b1ad0815fbb1095c8b9e8a2bf4201186afe304 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 22 Aug 2023 19:05:30 +0800 Subject: [PATCH 17/17] erofs: release ztailpacking pclusters properly Currently ztailpacking pclusters are chained with FOLLOWED_NOINPLACE and not recorded into the managed_pslots XArray. After commit 7674a42f35ea ("erofs: use struct lockref to replace handcrafted approach"), ztailpacking pclusters won't be freed with erofs_workgroup_put() anymore, which will cause the following issue: BUG erofs_pcluster-1 (Tainted: G OE ): Objects remaining in erofs_pcluster-1 on __kmem_cache_shutdown() Use z_erofs_free_pcluster() directly to free ztailpacking pclusters. Fixes: 7674a42f35ea ("erofs: use struct lockref to replace handcrafted approach") Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230822110530.96831-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index da52b1cccd38..036f610e044b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1369,7 +1369,10 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, owned = READ_ONCE(be.pcl->next); z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); - erofs_workgroup_put(&be.pcl->obj); + if (z_erofs_is_inline_pcluster(be.pcl)) + z_erofs_free_pcluster(be.pcl); + else + erofs_workgroup_put(&be.pcl->obj); } }