From 1eb5fa849f2bf9186a618e85bea23f02e527540a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 28 Feb 2018 15:59:59 -0500 Subject: [PATCH 01/28] dm: allow targets to return output from messages they are sent Could be useful for a target to return stats or other information. If a target does DMEMIT() anything to @result from its .message method then it must return 1 to the caller. Signed-off-By: Mike Snitzer --- drivers/md/dm-cache-target.c | 3 ++- drivers/md/dm-crypt.c | 3 ++- drivers/md/dm-era-target.c | 3 ++- drivers/md/dm-ioctl.c | 2 +- drivers/md/dm-log-writes.c | 3 ++- drivers/md/dm-mpath.c | 3 ++- drivers/md/dm-raid.c | 3 ++- drivers/md/dm-switch.c | 3 ++- drivers/md/dm-thin.c | 3 ++- include/linux/device-mapper.h | 3 ++- include/uapi/linux/dm-ioctl.h | 4 ++-- 11 files changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 47407e43b96a..da208638fba4 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3387,7 +3387,8 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun * * The key migration_threshold is supported by the cache target core. */ -static int cache_message(struct dm_target *ti, unsigned argc, char **argv) +static int cache_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { struct cache *cache = ti->private; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8168f737590e..dd9b5332b39c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2942,7 +2942,8 @@ static void crypt_resume(struct dm_target *ti) * key set * key wipe */ -static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) +static int crypt_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { struct crypt_config *cc = ti->private; int key_size, ret = -EINVAL; diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 73a5c198113a..8e48920a3ffa 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1635,7 +1635,8 @@ err: DMEMIT("Error"); } -static int era_message(struct dm_target *ti, unsigned argc, char **argv) +static int era_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { struct era *era = ti->private; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a89fd8f44453..5acf77de5945 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1595,7 +1595,7 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para DMWARN("Target message sector outside device."); r = -EINVAL; } else if (ti->type->message) - r = ti->type->message(ti, argc, argv); + r = ti->type->message(ti, argc, argv, result, maxlen); else { DMWARN("Target type does not support messages"); r = -EINVAL; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 3362d866793b..e4c015dfef43 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -887,7 +887,8 @@ static int log_writes_iterate_devices(struct dm_target *ti, * Messages supported: * mark - specify the marked data. 
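 */

/*
 * A minimal sketch (not part of this patch) of the new message convention
 * this patch introduces: a hypothetical target's .message method DMEMIT()s
 * into @result and returns 1 so the caller knows output was produced.  The
 * "stats" keyword and the counter are made up for illustration; DMEMIT()
 * relies on the local sz/result/maxlen names, per <linux/device-mapper.h>.
 */
static int example_message(struct dm_target *ti, unsigned argc, char **argv,
			   char *result, unsigned maxlen)
{
	unsigned sz = 0;

	if (argc == 1 && !strcasecmp(argv[0], "stats")) {
		DMEMIT("io_count=%llu", 0ULL);	/* hypothetical counter */
		return 1;	/* output was written to @result */
	}

	return -EINVAL;
}

/*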
*/ -static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv) +static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { int r = -EINVAL; struct log_writes_c *lc = ti->private; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index a6b7baf31cdd..1e60ec44942e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1811,7 +1811,8 @@ static void multipath_status(struct dm_target *ti, status_type_t type, spin_unlock_irqrestore(&m->lock, flags); } -static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) +static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { int r = -EINVAL; struct dm_dev *dev; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c1d1034ff7b7..b7a9c710ebec 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3663,7 +3663,8 @@ static void raid_status(struct dm_target *ti, status_type_t type, } } -static int raid_message(struct dm_target *ti, unsigned int argc, char **argv) +static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned maxlen) { struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 8d0ba879777e..8f9208c2d2e3 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -466,7 +466,8 @@ static int process_set_region_mappings(struct switch_ctx *sctx, * * Only set_region_mappings is supported. */ -static int switch_message(struct dm_target *ti, unsigned argc, char **argv) +static int switch_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { static DEFINE_MUTEX(message_mutex); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 629c555890c1..b11107497d2e 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3705,7 +3705,8 @@ static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct * reserve_metadata_snap * release_metadata_snap */ -static int pool_message(struct dm_target *ti, unsigned argc, char **argv) +static int pool_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { int r = -EINVAL; struct pool_c *pt = ti->private; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index da83f64952e7..1e2426c18eb4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -87,7 +87,8 @@ typedef void (*dm_resume_fn) (struct dm_target *ti); typedef void (*dm_status_fn) (struct dm_target *ti, status_type_t status_type, unsigned status_flags, char *result, unsigned maxlen); -typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv); +typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen); typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev, fmode_t *mode); diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 14c44ec8b622..5108da02cd32 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -270,9 +270,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 37 +#define DM_VERSION_MINOR 38 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2017-09-20)" +#define DM_VERSION_EXTRA "-ioctl (2018-02-28)" /* Status bits 
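 *
 * Note on message output: when a target's .message method DMEMIT()s into
 * @result and returns 1, the ioctl layer is expected to copy that text
 * back to userspace and mark the payload with DM_DATA_OUT_FLAG (defined
 * further down in this header), which is how "dmsetup message" prints it.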
*/ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ From 5059353df86e2573ccd9d43fd9d9396dcec47ca2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 13 Aug 2017 22:45:08 -0400 Subject: [PATCH 02/28] dm crypt: limit the number of allocated pages dm-crypt consumes an excessive amount memory when the user attempts to zero a dm-crypt device with "blkdiscard -z". The command "blkdiscard -z" calls the BLKZEROOUT ioctl, it goes to the function __blkdev_issue_zeroout, __blkdev_issue_zeroout sends a large amount of write bios that contain the zero page as their payload. For each incoming page, dm-crypt allocates another page that holds the encrypted data, so when processing "blkdiscard -z", dm-crypt tries to allocate the amount of memory that is equal to the size of the device. This can trigger OOM killer or cause system crash. Fix this by limiting the amount of memory that dm-crypt allocates to 2% of total system memory. This limit is system-wide and is divided by the number of active dm-crypt devices and each device receives an equal share. Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 66 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index dd9b5332b39c..44ff473dab3e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -148,6 +148,8 @@ struct crypt_config { mempool_t *tag_pool; unsigned tag_pool_max_sectors; + struct percpu_counter n_allocated_pages; + struct bio_set *bs; struct mutex bio_alloc_lock; @@ -219,6 +221,12 @@ struct crypt_config { #define MAX_TAG_SIZE 480 #define POOL_ENTRY_SIZE 512 +static DEFINE_SPINLOCK(dm_crypt_clients_lock); +static unsigned dm_crypt_clients_n = 0; +static volatile unsigned long dm_crypt_pages_per_client; +#define DM_CRYPT_MEMORY_PERCENT 2 +#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_PAGES * 16) + static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, @@ -2155,6 +2163,43 @@ static int crypt_wipe_key(struct crypt_config *cc) return r; } +static void crypt_calculate_pages_per_client(void) +{ + unsigned long pages = (totalram_pages - totalhigh_pages) * DM_CRYPT_MEMORY_PERCENT / 100; + + if (!dm_crypt_clients_n) + return; + + pages /= dm_crypt_clients_n; + if (pages < DM_CRYPT_MIN_PAGES_PER_CLIENT) + pages = DM_CRYPT_MIN_PAGES_PER_CLIENT; + dm_crypt_pages_per_client = pages; +} + +static void *crypt_page_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct crypt_config *cc = pool_data; + struct page *page; + + if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) && + likely(gfp_mask & __GFP_NORETRY)) + return NULL; + + page = alloc_page(gfp_mask); + if (likely(page != NULL)) + percpu_counter_add(&cc->n_allocated_pages, 1); + + return page; +} + +static void crypt_page_free(void *page, void *pool_data) +{ + struct crypt_config *cc = pool_data; + + __free_page(page); + percpu_counter_sub(&cc->n_allocated_pages, 1); +} + static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; @@ -2181,6 +2226,10 @@ static void crypt_dtr(struct dm_target *ti) mempool_destroy(cc->req_pool); mempool_destroy(cc->tag_pool); + if (cc->page_pool) + WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0); + percpu_counter_destroy(&cc->n_allocated_pages); + if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 
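		/* the IV generator may hold private state (e.g. the ESSIV
		 * hash tfm); release it before the key material is wiped
		 * further below */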
cc->iv_gen_ops->dtr(cc); @@ -2197,6 +2246,12 @@ static void crypt_dtr(struct dm_target *ti) /* Must zero key material before freeing */ kzfree(cc); + + spin_lock(&dm_crypt_clients_lock); + WARN_ON(!dm_crypt_clients_n); + dm_crypt_clients_n--; + crypt_calculate_pages_per_client(); + spin_unlock(&dm_crypt_clients_lock); } static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) @@ -2644,6 +2699,15 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = cc; + spin_lock(&dm_crypt_clients_lock); + dm_crypt_clients_n++; + crypt_calculate_pages_per_client(); + spin_unlock(&dm_crypt_clients_lock); + + ret = percpu_counter_init(&cc->n_allocated_pages, 0, GFP_KERNEL); + if (ret < 0) + goto bad; + /* Optional parameters need to be read before cipher constructor */ if (argc > 5) { ret = crypt_ctr_optional(ti, argc - 5, &argv[5]); @@ -2698,7 +2762,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, ARCH_KMALLOC_MINALIGN); - cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0); + cc->page_pool = mempool_create(BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; goto bad; From 2ae600cd15a7cce7f2c26d24cfbd9c1bc9e1810d Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 1 Feb 2018 19:06:09 +0100 Subject: [PATCH 03/28] dm unstripe: support non-power-of-2 chunk size Address "FIXME: must support non power of 2 chunk_size, dm-stripe.c does". Bump target version to indicate change. Signed-off-by: Heinz Mauelshagen Tested-by: Scott Bauer Reviewed-by: Scott Bauer Signed-off-by: Mike Snitzer --- drivers/md/dm-unstripe.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index 65f838fa2e99..05d76f337838 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -69,12 +69,6 @@ static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err; } - // FIXME: must support non power of 2 chunk_size, dm-stripe.c does - if (!is_power_of_2(uc->chunk_size)) { - ti->error = "Non power of 2 chunk_size is not supported yet"; - goto err; - } - if (kstrtouint(argv[2], 10, &uc->unstripe)) { ti->error = "Invalid stripe number"; goto err; @@ -98,7 +92,7 @@ static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) uc->unstripe_offset = uc->unstripe * uc->chunk_size; uc->unstripe_width = (uc->stripes - 1) * uc->chunk_size; - uc->chunk_shift = fls(uc->chunk_size) - 1; + uc->chunk_shift = is_power_of_2(uc->chunk_size) ? 
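+			/* log2(chunk_size) when it is a power of two; 0 here
+			 * doubles as "not a power of two", making map_to_core()
+			 * fall back to sector_div() instead of shifting: */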
fls(uc->chunk_size) - 1 : 0; tmp_len = ti->len; if (sector_div(tmp_len, uc->chunk_size)) { @@ -129,14 +123,18 @@ static sector_t map_to_core(struct dm_target *ti, struct bio *bio) { struct unstripe_c *uc = ti->private; sector_t sector = bio->bi_iter.bi_sector; + sector_t tmp_sector = sector; /* Shift us up to the right "row" on the stripe */ - sector += uc->unstripe_width * (sector >> uc->chunk_shift); + if (uc->chunk_shift) + tmp_sector >>= uc->chunk_shift; + else + sector_div(tmp_sector, uc->chunk_size); + + sector += uc->unstripe_width * tmp_sector; /* Account for what stripe we're operating on */ - sector += uc->unstripe_offset; - - return sector; + return sector + uc->unstripe_offset; } static int unstripe_map(struct dm_target *ti, struct bio *bio) @@ -185,7 +183,7 @@ static void unstripe_io_hints(struct dm_target *ti, static struct target_type unstripe_target = { .name = "unstriped", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = unstripe_ctr, .dtr = unstripe_dtr, From ba5dfbb712e72002c4e4b02def4df4a020849ce6 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 1 Feb 2018 19:06:09 +0100 Subject: [PATCH 04/28] dm unstripe: add "dm-unstriped" module alias This target's kernel module being named dm-unstripe.ko doesn't allow lvm2's DM module autoload capability to load the dm-unstripe.ko because lvm2 looks for dm-unstriped.ko due to the target name being "unstriped". Add the "dm-unstriped" module alias to resolve this oversight. NOTE: this isn't needed for the "striped" target, despite its source file being named dm-stripe.c, because it is part of dm-mod.ko. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-unstripe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index 05d76f337838..28ce7e57d981 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -213,5 +213,6 @@ module_init(dm_unstripe_init); module_exit(dm_unstripe_exit); MODULE_DESCRIPTION(DM_NAME " unstriped target"); +MODULE_ALIAS("dm-unstriped"); MODULE_AUTHOR("Scott Bauer "); MODULE_LICENSE("GPL"); From 91e065d8f2354f25246d5c6a0ee270ab74c43dd8 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 1 Feb 2018 19:06:10 +0100 Subject: [PATCH 05/28] dm unstripe: remove superfluous module init error path message Signed-off-by: Heinz Mauelshagen Reviewed-by: Scott Bauer Signed-off-by: Mike Snitzer --- drivers/md/dm-unstripe.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index 28ce7e57d981..cf7ac073d840 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -195,13 +195,7 @@ static struct target_type unstripe_target = { static int __init dm_unstripe_init(void) { - int r; - - r = dm_register_target(&unstripe_target); - if (r < 0) - DMERR("target registration failed"); - - return r; + return dm_register_target(&unstripe_target); } static void __exit dm_unstripe_exit(void) From afac6bd6d1a3ea38c800aa82d6d0ceda518b3aec Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 1 Feb 2018 19:06:12 +0100 Subject: [PATCH 06/28] dm unstripe: remove unnecessary header includes Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-unstripe.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index cf7ac073d840..954b7ab4e684 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -7,12 +7,6 @@ #include "dm.h" 
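/* "dm.h" already pulls in the core device-mapper declarations (via
 * <linux/device-mapper.h>), which is what makes the extra includes
 * removed below redundant. */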
#include -#include -#include -#include -#include -#include -#include struct unstripe_c { struct dm_dev *dev; From 2d77dafe23b6c0cc9e501bda0e3f138b96ecd811 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Mon, 5 Feb 2018 10:25:44 +0800 Subject: [PATCH 07/28] dm: remove unused macro DM_MOD_NAME_SIZE Signed-off-by: Wang Sheng-Hui Signed-off-by: Mike Snitzer --- drivers/md/dm-target.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index c0d7e60820c4..314d17ca6466 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -16,8 +16,6 @@ static LIST_HEAD(_targets); static DECLARE_RWSEM(_lock); -#define DM_MOD_NAME_SIZE 32 - static inline struct target_type *__find_target_type(const char *name) { struct target_type *tt; From e16b4f99f0f79682a7efe191a8ce694d87ca9fc8 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Tue, 13 Feb 2018 14:50:50 +0100 Subject: [PATCH 08/28] dm integrity: fail early if required HMAC key is not available Since crypto API commit 9fa68f62004 ("crypto: hash - prevent using keyed hashes without setting key") dm-integrity cannot use keyed algorithms without the key being set. The dm-integrity recognizes this too late (during use of HMAC), so it allows creation and formatting of superblock, but the device is in fact unusable. Fix it by detecting the key requirement in integrity table constructor. Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 46d7c8749222..6c81b11d0521 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2548,6 +2548,9 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, *error = error_key; return r; } + } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) { + *error = error_key; + return -ENOKEY; } } From e5c4cb9b1b78edb5bd42a9bd7315a0d7b842ac71 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 28 Feb 2018 15:32:47 +0800 Subject: [PATCH 09/28] dm log writes: record metadata flag for better flags record So developer could distinguish data and metadata bios easier. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Mike Snitzer --- drivers/md/dm-log-writes.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index e4c015dfef43..fefe6719a64d 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -52,10 +52,11 @@ * in fact we want to do the data and the discard in the order that they * completed. 
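 *
 * Each logged entry carries a flags word built by OR-ing the LOG_*_FLAG
 * bits below; with this patch a metadata FUA write, for example, is
 * recorded as (LOG_FUA_FLAG | LOG_METADATA_FLAG), so replay tools can
 * test each bit independently.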
*/ -#define LOG_FLUSH_FLAG (1 << 0) -#define LOG_FUA_FLAG (1 << 1) -#define LOG_DISCARD_FLAG (1 << 2) -#define LOG_MARK_FLAG (1 << 3) +#define LOG_FLUSH_FLAG (1 << 0) +#define LOG_FUA_FLAG (1 << 1) +#define LOG_DISCARD_FLAG (1 << 2) +#define LOG_MARK_FLAG (1 << 3) +#define LOG_METADATA_FLAG (1 << 4) #define WRITE_LOG_VERSION 1ULL #define WRITE_LOG_MAGIC 0x6a736677736872ULL @@ -699,6 +700,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) bool flush_bio = (bio->bi_opf & REQ_PREFLUSH); bool fua_bio = (bio->bi_opf & REQ_FUA); bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD); + bool meta_bio = (bio->bi_opf & REQ_META); pb->block = NULL; @@ -743,6 +745,8 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) block->flags |= LOG_FUA_FLAG; if (discard_bio) block->flags |= LOG_DISCARD_FLAG; + if (meta_bio) + block->flags |= LOG_METADATA_FLAG; block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector); block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio)); From 706dd22f12628b70b342bada819ffc1e80337f3f Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Thu, 8 Mar 2018 20:35:44 -0700 Subject: [PATCH 10/28] dm stripe: get rid of a Variable Length Array (VLA) Ideally, we'd like to get rid of all VLAs in the kernel and add -Wvla to the build args: https://lkml.org/lkml/2018/3/7/621 This one is a simple case, since we don't actually need the VLA at all: we can just iterate over the stripes twice, once to emit their names, and the second time to emit status (i.e. trade memory for time). Since the number of stripes is probably low, this is hopefully not that expensive. Signed-off-by: Tycho Andersen Reviewed-by: Kees Cook Signed-off-by: Mike Snitzer --- drivers/md/dm-stripe.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index b5e892149c54..cd209e8902a6 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -368,7 +368,6 @@ static void stripe_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct stripe_c *sc = (struct stripe_c *) ti->private; - char buffer[sc->stripes + 1]; unsigned int sz = 0; unsigned int i; @@ -377,11 +376,12 @@ static void stripe_status(struct dm_target *ti, status_type_t type, DMEMIT("%d ", sc->stripes); for (i = 0; i < sc->stripes; i++) { DMEMIT("%s ", sc->stripe[i].dev->name); - buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ? - 'D' : 'A'; } - buffer[i] = '\0'; - DMEMIT("1 %s", buffer); + DMEMIT("1 "); + for (i = 0; i < sc->stripes; i++) { + DMEMIT("%c", atomic_read(&(sc->stripe[i].error_count)) ? 
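+			/* 'D'ead if any error was ever seen on the stripe,
+			 * 'A'live otherwise -- the same per-stripe health
+			 * characters as before, now emitted directly rather
+			 * than staged in an on-stack VLA: */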
+ 'D' : 'A'); + } break; case STATUSTYPE_TABLE: From 8192a0cd7648f222d8ba73f302df0f03f07d5c4f Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Sat, 10 Mar 2018 10:18:55 +0800 Subject: [PATCH 11/28] dm mpath: use DM_MAPIO_SUBMITTED instead of magic number 0 in process_queued_bios() Signed-off-by: Wang Sheng-Hui Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 1e60ec44942e..746dd8a75b4a 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -714,7 +714,7 @@ static void process_queued_bios(struct work_struct *work) case DM_MAPIO_REMAPPED: generic_make_request(bio); break; - case 0: + case DM_MAPIO_SUBMITTED: break; default: WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r); From 880bcce0dcc3172fe865352b492c41d85290cb8d Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Fri, 16 Mar 2018 23:01:59 +0100 Subject: [PATCH 12/28] dm raid: fix nosync status Fix a race for "nosync" activations providing "aa.." device health characters and "0/N" sync ratio rather than "AA..." and "N/N". Occurs when status for the raid set is retrieved during resume before the MD sync thread starts and clears the MD_RECOVERY_NEEDED flag. Cc: stable@vger.kernel.org # 4.16+ Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b7a9c710ebec..598c9e3e41a5 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3408,7 +3408,8 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - if (!test_bit(MD_RECOVERY_INTR, &recovery) && + if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) && + !test_bit(MD_RECOVERY_INTR, &recovery) && (test_bit(MD_RECOVERY_NEEDED, &recovery) || test_bit(MD_RECOVERY_RESHAPE, &recovery) || test_bit(MD_RECOVERY_RUNNING, &recovery))) From 0519c71e8d461ac3ef9a555bb7339243c9128d37 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 26 Mar 2018 11:49:16 -0400 Subject: [PATCH 13/28] dm: backfill abnormal IO support to non-splitting IO submission Otherwise, these abnormal IOs would be sent to the DM target regardless of whether the target advertised support for them. Factor out __process_abnormal_io() from __split_and_process_non_flush() so that discards, write same, etc may be conditionally processed. Fixes: 978e51ba3 ("dm: optimize bio-based NVMe IO submission") Cc: stable@vger.kernel.org # 4.16 Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 353ea0ede091..038c7572fdd4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1477,6 +1477,23 @@ static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti) return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL); } +static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, + int *result) +{ + struct bio *bio = ci->bio; + + if (bio_op(bio) == REQ_OP_DISCARD) + *result = __send_discard(ci, ti); + else if (bio_op(bio) == REQ_OP_WRITE_SAME) + *result = __send_write_same(ci, ti); + else if (bio_op(bio) == REQ_OP_WRITE_ZEROES) + *result = __send_write_zeroes(ci, ti); + else + return false; + + return true; +} + /* * Select the correct strategy for processing a non-flush bio. 
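 *
 * __process_abnormal_io() above is consulted first: it returns true and
 * fills *result for discard, write-same and write-zeroes bios, so only
 * ordinary reads and writes reach the cloning and splitting logic below.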
*/ @@ -1491,12 +1508,8 @@ static int __split_and_process_non_flush(struct clone_info *ci) if (!dm_target_is_valid(ti)) return -EIO; - if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) - return __send_discard(ci, ti); - else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - return __send_write_same(ci, ti); - else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) - return __send_write_zeroes(ci, ti); + if (unlikely(__process_abnormal_io(ci, ti, &r))) + return r; if (bio_op(bio) == REQ_OP_ZONE_REPORT) len = ci->sector_count; @@ -1617,9 +1630,12 @@ static blk_qc_t __process_bio(struct mapped_device *md, goto out; } - tio = alloc_tio(&ci, ti, 0, GFP_NOIO); ci.bio = bio; ci.sector_count = bio_sectors(bio); + if (unlikely(__process_abnormal_io(&ci, ti, &error))) + goto out; + + tio = alloc_tio(&ci, ti, 0, GFP_NOIO); ret = __clone_and_map_simple_bio(&ci, tio, NULL); } out: From 00716545c894fc464e00612809d9cb836b180c99 Mon Sep 17 00:00:00 2001 From: Denis Semakin Date: Tue, 13 Mar 2018 13:23:45 +0400 Subject: [PATCH 14/28] dm: add support for secure erase forwarding Set QUEUE_FLAG_SECERASE in DM device's queue_flags if a DM table's data devices support secure erase. Also, add support for secure erase to both the linear and striped targets. Signed-off-by: Denis Semakin Signed-off-by: Mike Snitzer --- drivers/md/dm-linear.c | 1 + drivers/md/dm-stripe.c | 2 ++ drivers/md/dm-table.c | 31 +++++++++++++++++++++++++++++++ drivers/md/dm.c | 12 ++++++++++++ include/linux/device-mapper.h | 6 ++++++ 5 files changed, 52 insertions(+) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index d5f8eff7c11d..ff751b00aacd 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; + ti->num_secure_erase_bios = 1; ti->num_write_same_bios = 1; ti->num_write_zeroes_bios = 1; ti->private = lc; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index cd209e8902a6..bb907cb3e60d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = stripes; ti->num_discard_bios = stripes; + ti->num_secure_erase_bios = stripes; ti->num_write_same_bios = stripes; ti->num_write_zeroes_bios = stripes; @@ -295,6 +296,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } if (unlikely(bio_op(bio) == REQ_OP_DISCARD) || + unlikely(bio_op(bio) == REQ_OP_SECURE_ERASE) || unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) || unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) { target_bio_nr = dm_bio_get_target_bio_nr(bio); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7eb3e2a3c07d..f20fbc96a805 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1846,6 +1846,34 @@ static bool dm_table_supports_discards(struct dm_table *t) return true; } +static int device_not_secure_erase_capable(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !blk_queue_secure_erase(q); +} + +static bool dm_table_supports_secure_erase(struct dm_table *t) +{ + struct dm_target *ti; + unsigned int i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (!ti->num_secure_erase_bios) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, 
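+		    /* the callback returns nonzero for a device that lacks
+		     * secure-erase support, so a nonzero return from
+		     * iterate_devices() means at least one data device
+		     * cannot honour the request: */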
device_not_secure_erase_capable, NULL)) + return false; + } + + return true; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { @@ -1867,6 +1895,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } else queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + if (dm_table_supports_secure_erase(t)) + queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q); + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 038c7572fdd4..3b3cbd1a1659 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1414,6 +1414,11 @@ static unsigned get_num_discard_bios(struct dm_target *ti) return ti->num_discard_bios; } +static unsigned get_num_secure_erase_bios(struct dm_target *ti) +{ + return ti->num_secure_erase_bios; +} + static unsigned get_num_write_same_bios(struct dm_target *ti) { return ti->num_write_same_bios; @@ -1467,6 +1472,11 @@ static int __send_discard(struct clone_info *ci, struct dm_target *ti) is_split_required_for_discard); } +static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti) +{ + return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL); +} + static int __send_write_same(struct clone_info *ci, struct dm_target *ti) { return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL); @@ -1484,6 +1494,8 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, if (bio_op(bio) == REQ_OP_DISCARD) *result = __send_discard(ci, ti); + else if (bio_op(bio) == REQ_OP_SECURE_ERASE) + *result = __send_secure_erase(ci, ti); else if (bio_op(bio) == REQ_OP_WRITE_SAME) *result = __send_write_same(ci, ti); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 1e2426c18eb4..019e2efc6c25 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -267,6 +267,12 @@ struct dm_target { */ unsigned num_discard_bios; + /* + * The number of secure erase bios that will be submitted to the target. + * The bio number can be accessed with dm_bio_get_target_bio_nr. + */ + unsigned num_secure_erase_bios; + /* * The number of WRITE SAME bios that will be submitted to the target. * The bio number can be accessed with dm_bio_get_target_bio_nr. From 1f013174b352057557e47321f23a33e39e752bd4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 4 Mar 2018 01:53:00 -0500 Subject: [PATCH 15/28] dm bufio: delete outdated comment This comment was true when dm-bufio was written but, since 4.3, bios can now have arbitrary size and the driver splits them. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index aa2032fa80d4..f5360a6260ff 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -540,10 +540,6 @@ static void __relink_lru(struct dm_buffer *b, int dirty) * * the memory must be direct-mapped, not vmalloced; * - * the I/O driver can reject requests spuriously if it thinks that - * the requests are too big for the device or if they cross a - * controller-defined memory boundary. - * * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and * it is not vmalloced, try using the bio interface. 
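 *
 * (bio_add_page() needs struct page pointers from the direct mapping;
 * vmalloc'ed buffers always take the dm-io path instead, which accepts
 * a vmalloc address through its DM_IO_VMA memory type.)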
* From afa53df869121fd4f6f1265cbe794d64387890ae Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 15 Mar 2018 16:02:31 -0400 Subject: [PATCH 16/28] dm bufio: move dm-bufio.h to include/linux/ Move dm-bufio.h to include/linux/ so that external GPL'd DM target modules can use it. It is better to allow the use of dm-bufio than force external modules to implement the equivalent buffered IO mechanism in some new way. The hope is this will encourage the use of dm-bufio; which will then make it easier for a GPL'd external DM target module to be included upstream. A couple dm-bufio EXPORT_SYMBOL exports have also been updated to use EXPORT_SYMBOL_GPL. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 8 ++++---- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-snap-persistent.c | 2 +- drivers/md/dm-verity.h | 2 +- drivers/md/persistent-data/dm-block-manager.c | 2 +- {drivers/md => include/linux}/dm-bufio.h | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) rename {drivers/md => include/linux}/dm-bufio.h (98%) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index f5360a6260ff..87795a361d09 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -6,7 +6,7 @@ * This file is released under the GPL. */ -#include "dm-bufio.h" +#include #include #include @@ -1478,13 +1478,13 @@ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) dm_bufio_unlock(c); } -EXPORT_SYMBOL(dm_bufio_forget); +EXPORT_SYMBOL_GPL(dm_bufio_forget); void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) { c->minimum_buffers = n; } -EXPORT_SYMBOL(dm_bufio_set_minimum_buffers); +EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers); unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) { @@ -1690,7 +1690,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign INIT_LIST_HEAD(&c->reserved_buffers); c->need_reserved_buffers = reserved_buffers; - c->minimum_buffers = DM_BUFIO_MIN_BUFFERS; + dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS); init_waitqueue_head(&c->free_buffer_wait); c->async_write_error = 0; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 6c81b11d0521..77d9fe58dae2 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -18,7 +18,7 @@ #include #include #include -#include "dm-bufio.h" +#include #define DM_MSG_PREFIX "integrity" diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index c5534d294773..3c50c4e4da8f 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -14,7 +14,7 @@ #include #include #include -#include "dm-bufio.h" +#include #define DM_MSG_PREFIX "persistent snapshot" #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index b675bc015512..69f9a298f8c9 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -12,7 +12,7 @@ #ifndef DM_VERITY_H #define DM_VERITY_H -#include "dm-bufio.h" +#include #include #include diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index ea15d220ced7..492a3f8ac119 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -5,8 +5,8 @@ */ #include "dm-block-manager.h" #include "dm-persistent-data-internal.h" -#include "../dm-bufio.h" +#include #include #include #include diff --git a/drivers/md/dm-bufio.h b/include/linux/dm-bufio.h similarity index 98% rename from 
drivers/md/dm-bufio.h rename to include/linux/dm-bufio.h index be732d3f8611..3c8b7d274bd9 100644 --- a/drivers/md/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -6,8 +6,8 @@ * This file is released under the GPL. */ -#ifndef DM_BUFIO_H -#define DM_BUFIO_H +#ifndef _LINUX_DM_BUFIO_H +#define _LINUX_DM_BUFIO_H #include #include From eeb67a0ba04dfb80ec1a956fa6345f4ca5b9c937 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 15 Mar 2018 17:16:07 -0400 Subject: [PATCH 17/28] dm bufio: get rid of slab cache name allocations dm-bufio keeps the dm_bufio_cache_names array that holds names of the slab caches. Since the commit db265eca7700 ("mm/sl[aou]b: Move duping of slab name to slab_common.c"), the kernel automatically duplicates the slab cache name when creating the slab cache, so we no longer have to keep the name allocated. Remove the code that allocates the slab names and keeps them around. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 87795a361d09..6dd465fa90a7 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -173,7 +173,6 @@ struct dm_buffer { /*----------------------------------------------------------------*/ static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT]; -static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT]; static inline int dm_bufio_cache_index(struct dm_bufio_client *c) { @@ -185,7 +184,6 @@ static inline int dm_bufio_cache_index(struct dm_bufio_client *c) } #define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)]) -#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)]) #define dm_bufio_in_request() (!!current->bio_list) @@ -1703,19 +1701,10 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign mutex_lock(&dm_bufio_clients_lock); if (c->blocks_per_page_bits) { - if (!DM_BUFIO_CACHE_NAME(c)) { - DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size); - if (!DM_BUFIO_CACHE_NAME(c)) { - r = -ENOMEM; - mutex_unlock(&dm_bufio_clients_lock); - goto bad; - } - } - if (!DM_BUFIO_CACHE(c)) { - DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c), - c->block_size, - c->block_size, 0, NULL); + char name[26]; + snprintf(name, sizeof name, "dm_bufio_cache-%u", c->block_size); + DM_BUFIO_CACHE(c) = kmem_cache_create(name, c->block_size, c->block_size, 0, NULL); if (!DM_BUFIO_CACHE(c)) { r = -ENOMEM; mutex_unlock(&dm_bufio_clients_lock); @@ -1908,7 +1897,6 @@ static int __init dm_bufio_init(void) dm_bufio_current_allocated = 0; memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); - memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; @@ -1952,9 +1940,6 @@ static void __exit dm_bufio_exit(void) for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) kmem_cache_destroy(dm_bufio_caches[i]); - for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++) - kfree(dm_bufio_cache_names[i]); - if (dm_bufio_client_count) { DMCRIT("%s: dm_bufio_client_count leaked: %d", __func__, dm_bufio_client_count); From 21bb13276768da7925c0b532037e071877e4fb4d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Mar 2018 20:29:42 +0200 Subject: [PATCH 18/28] dm bufio: remove code that merges slab caches All slab allocators can merge duplicate caches. 
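As a sketch of what that merging looks like in practice (a hypothetical helper, not code from this patch, though the cache-name format matches the one used in this series):

static struct kmem_cache *example_client_cache(unsigned block_size)
{
	char name[26];

	/* Two clients calling this with the same block_size get two
	 * struct kmem_cache pointers, but SLAB/SLUB may transparently
	 * alias them to one backing cache (unless the kernel was booted
	 * with slab_nomerge). */
	snprintf(name, sizeof name, "dm_bufio_cache-%u", block_size);
	return kmem_cache_create(name, block_size, 0, 0, NULL);
}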
So dm-bufio doesn't need extra slab merging logic. Instead it can just allocate one slab cache per client and let the allocator merge them. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 53 ++++++++++++------------------------------- 1 file changed, 14 insertions(+), 39 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 6dd465fa90a7..ef6365ed2af0 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -57,10 +57,9 @@ #define DM_BUFIO_INLINE_VECS 16 /* - * Don't try to use kmem_cache_alloc for blocks larger than this. + * Don't try to use alloc_pages for blocks larger than this. * For explanation, see alloc_buffer_data below. */ -#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1) #define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1)) /* @@ -101,11 +100,11 @@ struct dm_bufio_client { unsigned block_size; unsigned char sectors_per_block_bits; unsigned char pages_per_block_bits; - unsigned char blocks_per_page_bits; unsigned aux_size; void (*alloc_callback)(struct dm_buffer *); void (*write_callback)(struct dm_buffer *); + struct kmem_cache *slab_cache; struct dm_io_client *dm_io; struct list_head reserved_buffers; @@ -172,19 +171,6 @@ struct dm_buffer { /*----------------------------------------------------------------*/ -static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT]; - -static inline int dm_bufio_cache_index(struct dm_bufio_client *c) -{ - unsigned ret = c->blocks_per_page_bits - 1; - - BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches)); - - return ret; -} - -#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)]) - #define dm_bufio_in_request() (!!current->bio_list) static void dm_bufio_lock(struct dm_bufio_client *c) @@ -384,9 +370,9 @@ static void __cache_size_refresh(void) static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, enum data_mode *data_mode) { - if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) { + if (unlikely(c->slab_cache != NULL)) { *data_mode = DATA_MODE_SLAB; - return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask); + return kmem_cache_alloc(c->slab_cache, gfp_mask); } if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT && @@ -426,7 +412,7 @@ static void free_buffer_data(struct dm_bufio_client *c, { switch (data_mode) { case DATA_MODE_SLAB: - kmem_cache_free(DM_BUFIO_CACHE(c), data); + kmem_cache_free(c->slab_cache, data); break; case DATA_MODE_GET_FREE_PAGES: @@ -1672,8 +1658,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT; c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ? __ffs(block_size) - PAGE_SHIFT : 0; - c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ? 
- PAGE_SHIFT - __ffs(block_size) : 0); c->aux_size = aux_size; c->alloc_callback = alloc_callback; @@ -1699,20 +1683,15 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign goto bad_dm_io; } - mutex_lock(&dm_bufio_clients_lock); - if (c->blocks_per_page_bits) { - if (!DM_BUFIO_CACHE(c)) { - char name[26]; - snprintf(name, sizeof name, "dm_bufio_cache-%u", c->block_size); - DM_BUFIO_CACHE(c) = kmem_cache_create(name, c->block_size, c->block_size, 0, NULL); - if (!DM_BUFIO_CACHE(c)) { - r = -ENOMEM; - mutex_unlock(&dm_bufio_clients_lock); - goto bad; - } + if (block_size < PAGE_SIZE) { + char name[26]; + snprintf(name, sizeof name, "dm_bufio_cache-%u", c->block_size); + c->slab_cache = kmem_cache_create(name, c->block_size, c->block_size, 0, NULL); + if (!c->slab_cache) { + r = -ENOMEM; + goto bad; } } - mutex_unlock(&dm_bufio_clients_lock); while (c->need_reserved_buffers) { struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL); @@ -1747,6 +1726,7 @@ bad: list_del(&b->lru_list); free_buffer(b); } + kmem_cache_destroy(c->slab_cache); dm_io_client_destroy(c->dm_io); bad_dm_io: mutex_destroy(&c->lock); @@ -1793,6 +1773,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) for (i = 0; i < LIST_SIZE; i++) BUG_ON(c->n_buffers[i]); + kmem_cache_destroy(c->slab_cache); dm_io_client_destroy(c->dm_io); mutex_destroy(&c->lock); kfree(c); @@ -1896,8 +1877,6 @@ static int __init dm_bufio_init(void) dm_bufio_allocated_vmalloc = 0; dm_bufio_current_allocated = 0; - memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); - mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; @@ -1932,14 +1911,10 @@ static int __init dm_bufio_init(void) static void __exit dm_bufio_exit(void) { int bug = 0; - int i; cancel_delayed_work_sync(&dm_bufio_work); destroy_workqueue(dm_bufio_wq); - for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) - kmem_cache_destroy(dm_bufio_caches[i]); - if (dm_bufio_client_count) { DMCRIT("%s: dm_bufio_client_count leaked: %d", __func__, dm_bufio_client_count); From 6b5e718cc138ef691e91685535e3ba776acc4893 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 15 Mar 2018 17:22:00 -0400 Subject: [PATCH 19/28] dm bufio: relax alignment constraint on slab cache The I/O buffer doesn't have to be aligned on block size granularity, relax alignment to ARCH_KMALLOC_MINALIGN (required to allow DMA from slab cache memory on some architectures). Also, set SLAB_RECLAIM_ACCOUNT so that the memory allocated from the cache is accounted as reclaimable and doesn't inflate the 'used' entry in the free command. 
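Expressed as the resulting call (this mirrors the one-line change in the diff below; the wrapper function is only for illustration):

static struct kmem_cache *example_buffer_cache(const char *name,
					       unsigned block_size)
{
	/* align only to the DMA-safe minimum rather than to block_size,
	 * and mark the memory reclaimable so it is reported under
	 * SReclaimable instead of inflating "used" in free(1) */
	return kmem_cache_create(name, block_size, ARCH_KMALLOC_MINALIGN,
				 SLAB_RECLAIM_ACCOUNT, NULL);
}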
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ef6365ed2af0..c57aefb3c643 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -618,7 +618,6 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len); if (!bio_add_page(&b->bio, virt_to_page(ptr), this_step, offset_in_page(ptr))) { - BUG_ON(b->c->block_size <= PAGE_SIZE); use_dmio(b, rw, sector, n_sectors, offset, end_io); return; } @@ -1686,7 +1685,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (block_size < PAGE_SIZE) { char name[26]; snprintf(name, sizeof name, "dm_bufio_cache-%u", c->block_size); - c->slab_cache = kmem_cache_create(name, c->block_size, c->block_size, 0, NULL); + c->slab_cache = kmem_cache_create(name, c->block_size, ARCH_KMALLOC_MINALIGN, + SLAB_RECLAIM_ACCOUNT, NULL); if (!c->slab_cache) { r = -ENOMEM; goto bad; From 03b02939593700caf391dc02b86b23b0d7aac2e5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Mar 2018 20:29:44 +0200 Subject: [PATCH 20/28] dm bufio: reorder fields in dm_buffer structure Reorder fields in dm_buffer structure to improve packing and reduce structure size. The compiler allocates 32-bit integer for field 'enum data_mode', so change it to unsigned char. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index c57aefb3c643..776075ccf775 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -147,11 +147,11 @@ struct dm_buffer { struct list_head lru_list; sector_t block; void *data; - enum data_mode data_mode; + unsigned char data_mode; /* DATA_MODE_* */ unsigned char list_mode; /* LIST_* */ - unsigned hold_count; blk_status_t read_error; blk_status_t write_error; + unsigned hold_count; unsigned long state; unsigned long last_accessed; unsigned dirty_start; @@ -303,7 +303,7 @@ static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) /*----------------------------------------------------------------*/ -static void adjust_total_allocated(enum data_mode data_mode, long diff) +static void adjust_total_allocated(unsigned char data_mode, long diff) { static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { &dm_bufio_allocated_kmem_cache, @@ -368,7 +368,7 @@ static void __cache_size_refresh(void) * space. */ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, - enum data_mode *data_mode) + unsigned char *data_mode) { if (unlikely(c->slab_cache != NULL)) { *data_mode = DATA_MODE_SLAB; @@ -408,7 +408,7 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, * Free buffer's data. */ static void free_buffer_data(struct dm_bufio_client *c, - void *data, enum data_mode data_mode) + void *data, unsigned char data_mode) { switch (data_mode) { case DATA_MODE_SLAB: From 359dbf19ab524652a2208a2a2cddccec2eede2ad Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Mar 2018 20:29:45 +0200 Subject: [PATCH 21/28] dm bufio: use slab cache for dm_buffer structure allocations kmalloc padded to the next power of two, using a slab cache avoids this. 
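Concretely (sizes are hypothetical; the real ones depend on kernel config and the client's aux_size):

/* kmalloc() would round, say, a 368-byte dm_buffer up to the
 * kmalloc-512 pool, wasting 144 bytes per buffer, whereas the dedicated
 * cache created by this patch packs objects at their exact size: */
static struct dm_buffer *example_buffer_alloc(struct dm_bufio_client *c,
					      gfp_t gfp_mask)
{
	return kmem_cache_alloc(c->slab_buffer, gfp_mask);
}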
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 776075ccf775..4a216926ee19 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -100,10 +100,10 @@ struct dm_bufio_client { unsigned block_size; unsigned char sectors_per_block_bits; unsigned char pages_per_block_bits; - unsigned aux_size; void (*alloc_callback)(struct dm_buffer *); void (*write_callback)(struct dm_buffer *); + struct kmem_cache *slab_buffer; struct kmem_cache *slab_cache; struct dm_io_client *dm_io; @@ -435,8 +435,7 @@ static void free_buffer_data(struct dm_bufio_client *c, */ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) { - struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size, - gfp_mask); + struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask); if (!b) return NULL; @@ -445,7 +444,7 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode); if (!b->data) { - kfree(b); + kmem_cache_free(c->slab_buffer, b); return NULL; } @@ -467,7 +466,7 @@ static void free_buffer(struct dm_buffer *b) adjust_total_allocated(b->data_mode, -(long)c->block_size); free_buffer_data(c, b->data, b->data_mode); - kfree(b); + kmem_cache_free(c->slab_buffer, b); } /* @@ -1641,6 +1640,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign int r; struct dm_bufio_client *c; unsigned i; + char slab_name[27]; BUG_ON(block_size < 1 << SECTOR_SHIFT || (block_size & (block_size - 1))); @@ -1658,7 +1658,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ? 
__ffs(block_size) - PAGE_SHIFT : 0; - c->aux_size = aux_size; c->alloc_callback = alloc_callback; c->write_callback = write_callback; @@ -1683,15 +1682,24 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign } if (block_size < PAGE_SIZE) { - char name[26]; - snprintf(name, sizeof name, "dm_bufio_cache-%u", c->block_size); - c->slab_cache = kmem_cache_create(name, c->block_size, ARCH_KMALLOC_MINALIGN, + snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", c->block_size); + c->slab_cache = kmem_cache_create(slab_name, c->block_size, ARCH_KMALLOC_MINALIGN, SLAB_RECLAIM_ACCOUNT, NULL); if (!c->slab_cache) { r = -ENOMEM; goto bad; } } + if (aux_size) + snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size); + else + snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer"); + c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size, + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (!c->slab_buffer) { + r = -ENOMEM; + goto bad; + } while (c->need_reserved_buffers) { struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL); @@ -1727,6 +1735,7 @@ bad: free_buffer(b); } kmem_cache_destroy(c->slab_cache); + kmem_cache_destroy(c->slab_buffer); dm_io_client_destroy(c->dm_io); bad_dm_io: mutex_destroy(&c->lock); @@ -1774,6 +1783,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) BUG_ON(c->n_buffers[i]); kmem_cache_destroy(c->slab_cache); + kmem_cache_destroy(c->slab_buffer); dm_io_client_destroy(c->dm_io); mutex_destroy(&c->lock); kfree(c); From f51f2e0a7fb17af57634c65687c985c9b3449263 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Mar 2018 20:29:46 +0200 Subject: [PATCH 22/28] dm bufio: support non-power-of-two block sizes Support block sizes that are not a power-of-two (but they must be a multiple of 512b). As always, a slab cache is used for allocations. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 64 ++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 4a216926ee19..e88797e04fcc 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -56,12 +56,6 @@ */ #define DM_BUFIO_INLINE_VECS 16 -/* - * Don't try to use alloc_pages for blocks larger than this. - * For explanation, see alloc_buffer_data below. - */ -#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1)) - /* * Align buffer writes to this boundary. * Tests show that SSDs have the highest IOPS when using 4k writes. 
@@ -98,8 +92,7 @@ struct dm_bufio_client { struct block_device *bdev; unsigned block_size; - unsigned char sectors_per_block_bits; - unsigned char pages_per_block_bits; + s8 sectors_per_block_bits; void (*alloc_callback)(struct dm_buffer *); void (*write_callback)(struct dm_buffer *); @@ -375,11 +368,11 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, return kmem_cache_alloc(c->slab_cache, gfp_mask); } - if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT && + if (c->block_size <= KMALLOC_MAX_SIZE && gfp_mask & __GFP_NORETRY) { *data_mode = DATA_MODE_GET_FREE_PAGES; return (void *)__get_free_pages(gfp_mask, - c->pages_per_block_bits); + c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT)); } *data_mode = DATA_MODE_VMALLOC; @@ -416,7 +409,8 @@ static void free_buffer_data(struct dm_bufio_client *c, break; case DATA_MODE_GET_FREE_PAGES: - free_pages((unsigned long)data, c->pages_per_block_bits); + free_pages((unsigned long)data, + c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT)); break; case DATA_MODE_VMALLOC: @@ -634,10 +628,14 @@ static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) sector_t sector; unsigned offset, end; - sector = (b->block << b->c->sectors_per_block_bits) + b->c->start; + if (likely(b->c->sectors_per_block_bits >= 0)) + sector = b->block << b->c->sectors_per_block_bits; + else + sector = b->block * (b->c->block_size >> SECTOR_SHIFT); + sector += b->c->start; if (rw != REQ_OP_WRITE) { - n_sectors = 1 << b->c->sectors_per_block_bits; + n_sectors = b->c->block_size >> SECTOR_SHIFT; offset = 0; } else { if (b->c->write_callback) @@ -941,8 +939,11 @@ static void __get_memory_limit(struct dm_bufio_client *c, } } - buffers = dm_bufio_cache_size_per_client >> - (c->sectors_per_block_bits + SECTOR_SHIFT); + buffers = dm_bufio_cache_size_per_client; + if (likely(c->sectors_per_block_bits >= 0)) + buffers >>= c->sectors_per_block_bits + SECTOR_SHIFT; + else + buffers /= c->block_size; if (buffers < c->minimum_buffers) buffers = c->minimum_buffers; @@ -1476,8 +1477,12 @@ EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) { - return i_size_read(c->bdev->bd_inode) >> - (SECTOR_SHIFT + c->sectors_per_block_bits); + sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT; + if (likely(c->sectors_per_block_bits >= 0)) + s >>= c->sectors_per_block_bits; + else + sector_div(s, c->block_size >> SECTOR_SHIFT); + return s; } EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); @@ -1575,8 +1580,12 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) static unsigned long get_retain_buffers(struct dm_bufio_client *c) { - unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); - return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT); + unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); + if (likely(c->sectors_per_block_bits >= 0)) + retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT; + else + retain_bytes /= c->block_size; + return retain_bytes; } static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, @@ -1642,8 +1651,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign unsigned i; char slab_name[27]; - BUG_ON(block_size < 1 << SECTOR_SHIFT || - (block_size & (block_size - 1))); + if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) { + DMERR("%s: block size not specified or is not multiple of 512b", __func__); + r = -EINVAL; + goto bad_client; + } c = kzalloc(sizeof(*c), 
GFP_KERNEL); if (!c) { @@ -1654,9 +1666,10 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->bdev = bdev; c->block_size = block_size; - c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT; - c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ? - __ffs(block_size) - PAGE_SHIFT : 0; + if (is_power_of_2(block_size)) + c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT; + else + c->sectors_per_block_bits = -1; c->alloc_callback = alloc_callback; c->write_callback = write_callback; @@ -1681,7 +1694,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign goto bad_dm_io; } - if (block_size < PAGE_SIZE) { + if (block_size <= KMALLOC_MAX_SIZE && + (block_size < PAGE_SIZE || !is_power_of_2(block_size))) { snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", c->block_size); c->slab_cache = kmem_cache_create(slab_name, c->block_size, ARCH_KMALLOC_MINALIGN, SLAB_RECLAIM_ACCOUNT, NULL); From 45354f1eb67224669a1de94dbfcb8bb226be1f74 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Mar 2018 20:29:47 +0200 Subject: [PATCH 23/28] dm bufio: don't embed a bio in the dm_buffer structure The bio structure consumes a substantial part of dm_buffer. The bio structure is only needed when doing I/O on the buffer, thus we don't have to embed it in the buffer. Allocate the bio structure only when doing I/O. We don't need to create a bio_set because, in case of allocation failure, dm-bufio falls back to using dm-io (which keeps its own bio_set). Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 105 ++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 60 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index e88797e04fcc..12aa9ca21d8c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -50,12 +50,6 @@ */ #define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) -/* - * The number of bvec entries that are embedded directly in the buffer. - * If the chunk size is larger, dm-io is used to do the io. - */ -#define DM_BUFIO_INLINE_VECS 16 - /* * Align buffer writes to this boundary. * Tests show that SSDs have the highest IOPS when using 4k writes. @@ -153,8 +147,7 @@ struct dm_buffer { unsigned write_end; struct dm_bufio_client *c; struct list_head write_list; - struct bio bio; - struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; + void (*end_io)(struct dm_buffer *, blk_status_t); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING #define MAX_STACK 10 struct stack_trace stack_trace; @@ -534,12 +527,11 @@ static void dmio_complete(unsigned long error, void *context) { struct dm_buffer *b = context; - b->bio.bi_status = error ? BLK_STS_IOERR : 0; - b->bio.bi_end_io(&b->bio); + b->end_io(b, unlikely(error != 0) ? 
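+			 /* dm-io reports errors as a bitset with one bit per
+			  * region; a single region is issued here, so any
+			  * nonzero value collapses to a plain I/O error: */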
BLK_STS_IOERR : 0); } static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, - unsigned n_sectors, unsigned offset, bio_end_io_t *end_io) + unsigned n_sectors, unsigned offset) { int r; struct dm_io_request io_req = { @@ -563,71 +555,69 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, io_req.mem.ptr.vma = (char *)b->data + offset; } - b->bio.bi_end_io = end_io; - r = dm_io(&io_req, 1, ®ion, NULL); - if (r) { - b->bio.bi_status = errno_to_blk_status(r); - end_io(&b->bio); - } + if (unlikely(r)) + b->end_io(b, errno_to_blk_status(r)); } -static void inline_endio(struct bio *bio) +static void bio_complete(struct bio *bio) { - bio_end_io_t *end_fn = bio->bi_private; + struct dm_buffer *b = bio->bi_private; blk_status_t status = bio->bi_status; - - /* - * Reset the bio to free any attached resources - * (e.g. bio integrity profiles). - */ - bio_reset(bio); - - bio->bi_status = status; - end_fn(bio); + bio_put(bio); + b->end_io(b, status); } -static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, - unsigned n_sectors, unsigned offset, bio_end_io_t *end_io) +static void use_bio(struct dm_buffer *b, int rw, sector_t sector, + unsigned n_sectors, unsigned offset) { + struct bio *bio; char *ptr; - unsigned len; + unsigned vec_size, len; - bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); - b->bio.bi_iter.bi_sector = sector; - bio_set_dev(&b->bio, b->c->bdev); - b->bio.bi_end_io = inline_endio; - /* - * Use of .bi_private isn't a problem here because - * the dm_buffer's inline bio is local to bufio. - */ - b->bio.bi_private = end_io; - bio_set_op_attrs(&b->bio, rw, 0); + vec_size = b->c->block_size >> PAGE_SHIFT; + if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT)) + vec_size += 2; + + bio = bio_kmalloc(GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN, vec_size); + if (!bio) { +dmio: + use_dmio(b, rw, sector, n_sectors, offset); + return; + } + + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, b->c->bdev); + bio_set_op_attrs(bio, rw, 0); + bio->bi_end_io = bio_complete; + bio->bi_private = b; ptr = (char *)b->data + offset; len = n_sectors << SECTOR_SHIFT; do { unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len); - if (!bio_add_page(&b->bio, virt_to_page(ptr), this_step, + if (!bio_add_page(bio, virt_to_page(ptr), this_step, offset_in_page(ptr))) { - use_dmio(b, rw, sector, n_sectors, offset, end_io); - return; + bio_put(bio); + goto dmio; } len -= this_step; ptr += this_step; } while (len > 0); - submit_bio(&b->bio); + submit_bio(bio); } -static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) +static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t)) { unsigned n_sectors; sector_t sector; unsigned offset, end; + b->end_io = end_io; + if (likely(b->c->sectors_per_block_bits >= 0)) sector = b->block << b->c->sectors_per_block_bits; else @@ -652,11 +642,10 @@ static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) n_sectors = (end - offset) >> SECTOR_SHIFT; } - if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) && - b->data_mode != DATA_MODE_VMALLOC) - use_inline_bio(b, rw, sector, n_sectors, offset, end_io); + if (b->data_mode != DATA_MODE_VMALLOC) + use_bio(b, rw, sector, n_sectors, offset); else - use_dmio(b, rw, sector, n_sectors, offset, end_io); + use_dmio(b, rw, sector, n_sectors, offset); } /*---------------------------------------------------------------- @@ -669,16 +658,14 @@ static void submit_io(struct 
dm_buffer *b, int rw, bio_end_io_t *end_io) * Set the error, clear B_WRITING bit and wake anyone who was waiting on * it. */ -static void write_endio(struct bio *bio) +static void write_endio(struct dm_buffer *b, blk_status_t status) { - struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - - b->write_error = bio->bi_status; - if (unlikely(bio->bi_status)) { + b->write_error = status; + if (unlikely(status)) { struct dm_bufio_client *c = b->c; (void)cmpxchg(&c->async_write_error, 0, - blk_status_to_errno(bio->bi_status)); + blk_status_to_errno(status)); } BUG_ON(!test_bit(B_WRITING, &b->state)); @@ -1055,11 +1042,9 @@ found_buffer: * The endio routine for reading: set the error, clear the bit and wake up * anyone waiting on the buffer. */ -static void read_endio(struct bio *bio) +static void read_endio(struct dm_buffer *b, blk_status_t status) { - struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - - b->read_error = bio->bi_status; + b->read_error = status; BUG_ON(!test_bit(B_READING, &b->state)); From 843f38d382b1ca2f6f4ae2ef7c35933e6319ffbb Mon Sep 17 00:00:00 2001 From: Patrik Torstensson Date: Thu, 22 Mar 2018 18:18:04 -0700 Subject: [PATCH 24/28] dm verity: add 'check_at_most_once' option to only validate hashes once This allows platforms that are CPU/memory constrained to verify data blocks only the first time they are read from the data device, rather than every time. As such, it provides a reduced level of security because only offline tampering of the data device's content will be detected, not online tampering. Hash blocks are still verified each time they are read from the hash device, since verification of hash blocks is less performance critical than data blocks, and a hash block will not be verified any more after all the data blocks it covers have been verified anyway. This option introduces a bitset that is used to check if a block has been validated before or not. A block can be validated more than once as there is no thread protection for the bitset. These changes were developed and tested on entry-level Android Go devices. Signed-off-by: Patrik Torstensson Signed-off-by: Mike Snitzer --- Documentation/device-mapper/verity.txt | 11 +++++ drivers/md/dm-verity-target.c | 64 ++++++++++++++++++++++++-- drivers/md/dm-verity.h | 1 + 3 files changed, 71 insertions(+), 5 deletions(-) diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index 89fd8f9a259f..b3d2e4a42255 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -109,6 +109,17 @@ fec_start This is the offset, in blocks, from the start of the FEC device to the beginning of the encoding data. +check_at_most_once + Verify data blocks only the first time they are read from the data device, + rather than every time. This reduces the overhead of dm-verity so that it + can be used on systems that are memory and/or CPU constrained. However, it + provides a reduced level of security because only offline tampering of the + data device's content will be detected, not online tampering. + + Hash blocks are still verified each time they are read from the hash device, + since verification of hash blocks is less performance critical than data + blocks, and a hash block will not be verified any more after all the data + blocks it covers have been verified anyway.
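As an illustrative sketch only (the device paths, sizes, root hash and salt are placeholders, not values taken from this patch), a verity mapping that enables this option as its single optional argument could be created with:

	dmsetup create vroot --readonly --table \
		"0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 \
		sha256 <root_hash> <salt> 1 check_at_most_once"

Here 262144 data blocks of 4096 bytes correspond to 2097152 512-byte sectors, and the trailing "1 check_at_most_once" is the optional-argument count followed by the option itself.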
Theory of operation =================== diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index aedb8222836b..14c620992794 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -32,6 +32,7 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" +#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" #define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) @@ -432,6 +433,18 @@ static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, return 0; } +/* + * Moves the bio iter one data block forward. + */ +static inline void verity_bv_skip_block(struct dm_verity *v, + struct dm_verity_io *io, + struct bvec_iter *iter) +{ + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + + bio_advance_iter(bio, iter, 1 << v->data_dev_block_bits); +} + /* * Verify one "dm_verity_io" structure. */ @@ -445,9 +458,16 @@ static int verity_verify_io(struct dm_verity_io *io) for (b = 0; b < io->n_blocks; b++) { int r; + sector_t cur_block = io->block + b; struct ahash_request *req = verity_io_hash_req(v, io); - r = verity_hash_for_block(v, io, io->block + b, + if (v->validated_blocks && + likely(test_bit(cur_block, v->validated_blocks))) { + verity_bv_skip_block(v, io, &io->iter); + continue; + } + + r = verity_hash_for_block(v, io, cur_block, verity_io_want_digest(v, io), &is_zero); if (unlikely(r < 0)) @@ -481,13 +501,16 @@ static int verity_verify_io(struct dm_verity_io *io) return r; if (likely(memcmp(verity_io_real_digest(v, io), - verity_io_want_digest(v, io), v->digest_size) == 0)) + verity_io_want_digest(v, io), v->digest_size) == 0)) { + if (v->validated_blocks) + set_bit(cur_block, v->validated_blocks); continue; + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b, NULL, &start) == 0) + cur_block, NULL, &start) == 0) continue; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) + cur_block)) return -EIO; } @@ -673,6 +696,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, args += DM_VERITY_OPTS_FEC; if (v->zero_digest) args++; + if (v->validated_blocks) + args++; if (!args) return; DMEMIT(" %u", args); @@ -691,6 +716,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, } if (v->zero_digest) DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); + if (v->validated_blocks) + DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE); sz = verity_fec_status_table(v, sz, result, maxlen); break; } @@ -740,6 +767,7 @@ static void verity_dtr(struct dm_target *ti) if (v->bufio) dm_bufio_client_destroy(v->bufio); + kvfree(v->validated_blocks); kfree(v->salt); kfree(v->root_digest); kfree(v->zero_digest); @@ -760,6 +788,26 @@ static void verity_dtr(struct dm_target *ti) kfree(v); } +static int verity_alloc_most_once(struct dm_verity *v) +{ + struct dm_target *ti = v->ti; + + /* the bitset can only handle INT_MAX blocks */ + if (v->data_blocks > INT_MAX) { + ti->error = "device too large to use check_at_most_once"; + return -E2BIG; + } + + v->validated_blocks = kvzalloc(BITS_TO_LONGS(v->data_blocks) * + sizeof(unsigned long), GFP_KERNEL); + if (!v->validated_blocks) { + ti->error = "failed to allocate bitset for check_at_most_once"; + return -ENOMEM; + } + + return 0; +} + static int verity_alloc_zero_digest(struct dm_verity *v) { int r = -ENOMEM; @@ -829,6 +877,12 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) } continue; + } else if 
(!strcasecmp(arg_name, DM_VERITY_OPT_AT_MOST_ONCE)) { + r = verity_alloc_most_once(v); + if (r) + return r; + continue; + } else if (verity_is_fec_opt_arg(arg_name)) { r = verity_fec_parse_opt_args(as, v, &argc, arg_name); if (r) @@ -1096,7 +1150,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 69f9a298f8c9..3441c10b840c 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -63,6 +63,7 @@ struct dm_verity { sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; struct dm_verity_fec *fec; /* forward error correction */ + unsigned long *validated_blocks; /* bitset blocks validated */ }; struct dm_verity_io { From d4b1aaf53c02e6440c49aeae06ba3a3a8ce9882a Mon Sep 17 00:00:00 2001 From: "weiyongjun (A)" Date: Wed, 28 Mar 2018 11:11:58 +0000 Subject: [PATCH 25/28] dm verity: make verity_for_io_block static Fixes the following sparse warning: drivers/md/dm-verity-target.c:375:6: warning: symbol 'verity_for_io_block' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-target.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 14c620992794..037ba17e3a3e 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -348,8 +348,8 @@ out: /* * Calculates the digest for the given bio */ -int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, struct crypto_wait *wait) +static int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, struct crypto_wait *wait) { unsigned int todo = 1 << v->data_dev_block_bits; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); From 13bc62d4a6c79b95ec299591df1bae0a505f2d07 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 28 Mar 2018 17:07:14 +0200 Subject: [PATCH 26/28] dm raid: fix parse_raid_params() variable range issue parse_raid_params() compares the variable "int value" with INT_MAX. E.g. a related Coverity report excerpt: CID 1364818 (#2 of 3): Operands don't affect result (CONSTANT_EXPRESSION_RESULT) [select issue] 1433 if (value > INT_MAX) { Fix by changing the checks to avoid INT_MAX. While at it, avoid unnecessary checks against constants and add a check for sane recovery speed min/max.
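For illustration, the tautology Coverity complains about reduces to this standalone sketch (distilled for clarity, not code taken from the driver):

	#include <errno.h>
	#include <limits.h>

	static int check_rate(int value)
	{
		/* Always false: an int cannot hold a value above INT_MAX. */
		if (value > INT_MAX)
			return -EINVAL;
		return 0;
	}

Testing "value < 0" instead, as the hunks below do, rejects the out-of-range inputs an int can actually represent.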
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 598c9e3e41a5..6f823f44b4aa 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1370,19 +1370,18 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, * In device-mapper, we specify things in sectors, but * MD records this value in kB */ - value /= 2; - if (value > COUNTER_MAX) { + if (value < 0 || value / 2 > COUNTER_MAX) { rs->ti->error = "Max write-behind limit out of range"; return -EINVAL; } - rs->md.bitmap_info.max_write_behind = value; + rs->md.bitmap_info.max_write_behind = value / 2; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP))) { if (test_and_set_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) { rs->ti->error = "Only one daemon_sleep argument pair allowed"; return -EINVAL; } - if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { + if (value < 0) { rs->ti->error = "daemon sleep period out of range"; return -EINVAL; } @@ -1424,27 +1423,33 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, return -EINVAL; } + if (value < 0) { + rs->ti->error = "Bogus stripe cache entries value"; + return -EINVAL; + } rs->stripe_cache_entries = value; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) { if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) { rs->ti->error = "Only one min_recovery_rate argument pair allowed"; return -EINVAL; } - if (value > INT_MAX) { + + if (value < 0) { rs->ti->error = "min_recovery_rate out of range"; return -EINVAL; } - rs->md.sync_speed_min = (int)value; + rs->md.sync_speed_min = value; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) { if (test_and_set_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) { rs->ti->error = "Only one max_recovery_rate argument pair allowed"; return -EINVAL; } - if (value > INT_MAX) { + + if (value < 0) { rs->ti->error = "max_recovery_rate out of range"; return -EINVAL; } - rs->md.sync_speed_max = (int)value; + rs->md.sync_speed_max = value; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE))) { if (test_and_set_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) { rs->ti->error = "Only one region_size argument pair allowed"; @@ -1490,6 +1495,12 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, return -EINVAL; } + if (rs->md.sync_speed_max && + rs->md.sync_speed_min > rs->md.sync_speed_max) { + rs->ti->error = "Bogus recovery rates"; + return -EINVAL; + } + if (validate_region_size(rs, region_size)) return -EINVAL; From 971888c46993f871f20d02d1fe43486a924fad11 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 3 Apr 2018 15:05:12 -0400 Subject: [PATCH 27/28] dm: hold DM table for duration of ioctl rather than use blkdev_get Commit 519049afead ("dm: use blkdev_get rather than bdgrab when issuing pass-through ioctl") inadvertently introduced a regression relative to users of device cgroups that issue ioctls (e.g. libvirt). Using blkdev_get() in DM's passthrough ioctl support implicitly introduced a cgroup permissions check that would fail unless care were taken to add all devices in the IO stack to the device cgroup. E.g. rather than just adding the top-level DM multipath device to the cgroup, all the underlying devices would need to be allowed.
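(As a sketch of the workaround this forced, with hypothetical device numbers: a device cgroup that previously only needed an entry such as "b 253:0 rw" for the multipath node would also have to allow every underlying path, e.g. "b 8:0 rw" and "b 8:16 rw".)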
Fix this, to no longer require allowing all underlying devices, by simply holding the live DM table (which includes the table's original blkdev_get() reference on the block device that the ioctl will be issued to) for the duration of the ioctl. Also, bump the DM ioctl version so a user can know that their device cgroup allow workaround is no longer needed. Reported-by: Michal Privoznik Suggested-by: Mikulas Patocka Fixes: 519049afead ("dm: use blkdev_get rather than bdgrab when issuing pass-through ioctl") Cc: stable@vger.kernel.org # 4.16 Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 97 ++++++++++++++++------------------- include/uapi/linux/dm-ioctl.h | 4 +- 2 files changed, 46 insertions(+), 55 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3b3cbd1a1659..d4438188d088 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -458,67 +458,56 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -static char *_dm_claim_ptr = "I belong to device-mapper"; - -static int dm_get_bdev_for_ioctl(struct mapped_device *md, - struct block_device **bdev, - fmode_t *mode) +static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, + struct block_device **bdev, fmode_t *mode) + __acquires(md->io_barrier) { struct dm_target *tgt; struct dm_table *map; - int srcu_idx, r, r2; + int r; retry: r = -ENOTTY; - map = dm_get_live_table(md, &srcu_idx); + map = dm_get_live_table(md, srcu_idx); if (!map || !dm_table_get_size(map)) - goto out; + return r; /* We only support devices that have a single target */ if (dm_table_get_num_targets(map) != 1) - goto out; + return r; tgt = dm_table_get_target(map, 0); if (!tgt->type->prepare_ioctl) - goto out; + return r; - if (dm_suspended_md(md)) { - r = -EAGAIN; - goto out; - } + if (dm_suspended_md(md)) + return -EAGAIN; r = tgt->type->prepare_ioctl(tgt, bdev, mode); - if (r < 0) - goto out; - - bdgrab(*bdev); - r2 = blkdev_get(*bdev, *mode, _dm_claim_ptr); - if (r2 < 0) { - r = r2; - goto out; - } - - dm_put_live_table(md, srcu_idx); - return r; - -out: - dm_put_live_table(md, srcu_idx); if (r == -ENOTCONN && !fatal_signal_pending(current)) { + dm_put_live_table(md, *srcu_idx); msleep(10); goto retry; } + return r; } +static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) + __releases(md->io_barrier) +{ + dm_put_live_table(md, srcu_idx); +} + static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); if (r < 0) - return r; + goto out; if (r > 0) { /* @@ -536,7 +525,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); out: - blkdev_put(bdev, mode); + dm_unprepare_ioctl(md, srcu_idx); return r; } @@ -710,6 +699,8 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) rcu_read_unlock(); } +static char *_dm_claim_ptr = "I belong to device-mapper"; + /* * Open a table device so we can use it as a map destination.
*/ @@ -3044,19 +3035,19 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; fmode_t mode; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); if (r < 0) - return r; + goto out; ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_reserve) r = ops->pr_reserve(bdev, key, type, flags); else r = -EOPNOTSUPP; - - blkdev_put(bdev, mode); +out: + dm_unprepare_ioctl(md, srcu_idx); return r; } @@ -3065,19 +3056,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; fmode_t mode; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); if (r < 0) - return r; + goto out; ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_release) r = ops->pr_release(bdev, key, type); else r = -EOPNOTSUPP; - - blkdev_put(bdev, mode); +out: + dm_unprepare_ioctl(md, srcu_idx); return r; } @@ -3087,19 +3078,19 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; fmode_t mode; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); if (r < 0) - return r; + goto out; ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_preempt) r = ops->pr_preempt(bdev, old_key, new_key, type, abort); else r = -EOPNOTSUPP; - - blkdev_put(bdev, mode); +out: + dm_unprepare_ioctl(md, srcu_idx); return r; } @@ -3108,19 +3099,19 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; fmode_t mode; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); if (r < 0) - return r; + goto out; ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_clear) r = ops->pr_clear(bdev, key); else r = -EOPNOTSUPP; - - blkdev_put(bdev, mode); +out: + dm_unprepare_ioctl(md, srcu_idx); return r; } diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 5108da02cd32..d1e49514977b 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -270,9 +270,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 38 +#define DM_VERSION_MINOR 39 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2018-02-28)" +#define DM_VERSION_EXTRA "-ioctl (2018-04-03)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ From 5bd5e8d891c1fd2d966a7e2c26f0452d22410683 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 3 Apr 2018 16:54:10 -0400 Subject: [PATCH 28/28] dm: remove fmode_t argument from .prepare_ioctl hook Use the fmode_t that is passed to dm_blk_ioctl() rather than inconsistently (it varies across targets) dropping it on the floor by overriding it with the fmode_t stored in 'struct dm_dev'. None of the persistent reservation functions were using the fmode_t they got back from .prepare_ioctl, so remove it.
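With the fmode_t parameter gone, a minimal .prepare_ioctl implementation reduces to the following sketch ("example_c" and its fields are hypothetical stand-ins; the dm-linear hunk below shows the real conversion):

	static int example_prepare_ioctl(struct dm_target *ti,
					 struct block_device **bdev)
	{
		struct example_c *ec = ti->private;	/* hypothetical per-target context */

		*bdev = ec->dev->bdev;

		/*
		 * A positive return tells DM core the target maps only part
		 * of the device, so passthrough ioctls are restricted.
		 */
		if (ec->start ||
		    ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
			return 1;
		return 0;
	}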
Signed-off-by: Mike Snitzer --- drivers/md/dm-flakey.c | 3 +-- drivers/md/dm-linear.c | 3 +-- drivers/md/dm-log-writes.c | 2 +- drivers/md/dm-mpath.c | 3 +-- drivers/md/dm-switch.c | 4 +--- drivers/md/dm-verity-target.c | 3 +-- drivers/md/dm-zoned-target.c | 3 +-- drivers/md/dm.c | 18 +++++++----------- include/linux/device-mapper.h | 3 +-- 9 files changed, 15 insertions(+), 27 deletions(-) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 1b907b15f5c3..21d126a5078c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -442,8 +442,7 @@ static void flakey_status(struct dm_target *ti, status_type_t type, } } -static int flakey_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct flakey_c *fc = ti->private; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index ff751b00aacd..99297212eeec 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -130,8 +130,7 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct linear_c *lc = (struct linear_c *) ti->private; struct dm_dev *dev = lc->dev; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index fefe6719a64d..9de072b7782a 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -864,7 +864,7 @@ static void log_writes_status(struct dm_target *ti, status_type_t type, } static int log_writes_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) + struct block_device **bdev) { struct log_writes_c *lc = ti->private; struct dm_dev *dev = lc->dev; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 746dd8a75b4a..203a0419d2b0 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1876,7 +1876,7 @@ out: } static int multipath_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) + struct block_device **bdev) { struct multipath *m = ti->private; struct pgpath *current_pgpath; @@ -1889,7 +1889,6 @@ static int multipath_prepare_ioctl(struct dm_target *ti, if (current_pgpath) { if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { *bdev = current_pgpath->path.dev->bdev; - *mode = current_pgpath->path.dev->mode; r = 0; } else { /* pg_init has not started or completed */ diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 8f9208c2d2e3..7924a6a33ddc 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -512,8 +512,7 @@ static void switch_status(struct dm_target *ti, status_type_t type, * * Passthrough all ioctls to the path for sector 0 */ -static int switch_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct switch_ctx *sctx = ti->private; unsigned path_nr; @@ -521,7 +520,6 @@ static int switch_prepare_ioctl(struct dm_target *ti, path_nr = switch_get_path_nr(sctx, 0); *bdev = sctx->path_list[path_nr].dmdev->bdev; - *mode = sctx->path_list[path_nr].dmdev->mode; /* * Only pass ioctls through if the device sizes match exactly. 
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 037ba17e3a3e..fc893f636a98 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -723,8 +723,7 @@ static void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dm_verity *v = ti->private; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index caff02caf083..e73b0776683c 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -898,8 +898,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) /* * Pass on ioctl to the backend device. */ -static int dmz_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d4438188d088..3af590e0d78c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -459,7 +459,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) } static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, - struct block_device **bdev, fmode_t *mode) + struct block_device **bdev) __acquires(md->io_barrier) { struct dm_target *tgt; @@ -483,7 +483,7 @@ retry: if (dm_suspended_md(md)) return -EAGAIN; - r = tgt->type->prepare_ioctl(tgt, bdev, mode); + r = tgt->type->prepare_ioctl(tgt, bdev); if (r == -ENOTCONN && !fatal_signal_pending(current)) { dm_put_live_table(md, *srcu_idx); msleep(10); @@ -505,7 +505,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, struct mapped_device *md = bdev->bd_disk->private_data; int r, srcu_idx; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) goto out; @@ -3034,10 +3034,9 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - fmode_t mode; int r, srcu_idx; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) goto out; @@ -3055,10 +3054,9 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - fmode_t mode; int r, srcu_idx; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) goto out; @@ -3077,10 +3075,9 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - fmode_t mode; int r, srcu_idx; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) goto out; @@ -3098,10 +3095,9 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - fmode_t mode; int r, srcu_idx; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) goto out; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 019e2efc6c25..ed038fbecd55 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -90,8 +90,7 @@ 
typedef void (*dm_status_fn) (struct dm_target *ti, status_type_t status_type, typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv, char *result, unsigned maxlen); -typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, - struct block_device **bdev, fmode_t *mode); +typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev); /* * These iteration functions are typically used to check (and combine)