- DM core fixes to ensure that bio submission follows a depth-first tree

walk; this is critical to allow forward progress without the need to use the bioset's BIOSET_NEED_RESCUER. - Remove DM core's BIOSET_NEED_RESCUER based dm_offload infrastructure. - DM core cleanups and improvements to make bio-based DM more efficient (e.g. reduced memory footprint as well leveraging per-bio-data more). - Introduce new bio-based mode (DM_TYPE_NVME_BIO_BASED) that leverages the more direct IO submission path in the block layer; this mode is used by DM multipath and also optimizes targets like DM thin-pool that stack directly on NVMe data device. - DM multipath improvements to factor out legacy SCSI-only (e.g. scsi_dh) code paths to allow for more optimized support for NVMe multipath. - A fix for DM multipath path selectors (service-time and queue-length) to select paths in a more balanced way; largely academic but doesn't hurt. - Numerous DM raid target fixes and improvements. - Add a new DM "unstriped" target that enables Intel to workaround firmware limitations in some NVMe drives that are striped internally (this target also works when stacked above the DM "striped" target). - Various Documentation fixes and improvements. - Misc. cleanups and fixes across various DM infrastructure and targets (e.g. bufio, flakey, log-writes, snapshot). -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQEcBAABAgAGBQJacgwPAAoJEMUj8QotnQNaEw0H/0XRTcg8/lRuGl46kdeI3PgR ZxUy4XgUrCLiACWO5yCU/nKipB32+3xTlTDTBcjmaBfX8HolH147Pasb1KdHqLVC dOWLMpjlFztb5fnuOMitJA05qQAbgRlZ52QdVk/FDo9yWicgWjQZduh8aYX53pHw 6XOYWzSFAXQcaduPdz6TLiPw479xBwIpXxQbrO09f4qt3Ub4bqknEhzFXc+6M7zl ejmW/bG2Qg6WmsfAuaAhFTV0LpTPSEzvaq9TfR7yqFU3DvDIAi7Yh8eQinIUDo4u txpOGoESRAMPAMKH0/UJdr/u7jTsfgJox4QEavWfnViPvkouah5KdjVOL1veZ5U= =R3dN -----END PGP SIGNATURE----- Merge tag 'for-4.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm Pull device mapper updates from Mike Snitzer: - DM core fixes to ensure that bio submission follows a depth-first tree walk; this is critical to allow forward progress without the need to use the bioset's BIOSET_NEED_RESCUER. - Remove DM core's BIOSET_NEED_RESCUER based dm_offload infrastructure. - DM core cleanups and improvements to make bio-based DM more efficient (e.g. reduced memory footprint as well leveraging per-bio-data more). - Introduce new bio-based mode (DM_TYPE_NVME_BIO_BASED) that leverages the more direct IO submission path in the block layer; this mode is used by DM multipath and also optimizes targets like DM thin-pool that stack directly on NVMe data device. - DM multipath improvements to factor out legacy SCSI-only (e.g. scsi_dh) code paths to allow for more optimized support for NVMe multipath. - A fix for DM multipath path selectors (service-time and queue-length) to select paths in a more balanced way; largely academic but doesn't hurt. - Numerous DM raid target fixes and improvements. - Add a new DM "unstriped" target that enables Intel to workaround firmware limitations in some NVMe drives that are striped internally (this target also works when stacked above the DM "striped" target). - Various Documentation fixes and improvements. - Misc cleanups and fixes across various DM infrastructure and targets (e.g. bufio, flakey, log-writes, snapshot). * tag 'for-4.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (69 commits) dm cache: Documentation: update default migration_throttling value dm mpath selector: more evenly distribute ties dm unstripe: fix target length versus number of stripes size check dm thin: fix trailing semicolon in __remap_and_issue_shared_cell dm table: fix NVMe bio-based dm_table_determine_type() validation dm: various cleanups to md->queue initialization code dm mpath: delay the retry of a request if the target responded as busy dm mpath: return DM_MAPIO_DELAY_REQUEUE if QUEUE_IO or PG_INIT_REQUIRED dm mpath: return DM_MAPIO_REQUEUE on blk-mq rq allocation failure dm log writes: fix max length used for kstrndup dm: backfill missing calls to mutex_destroy() dm snapshot: use mutex instead of rw_semaphore dm flakey: check for null arg_name in parse_features() dm thin: extend thinpool status format string with omitted fields dm thin: fixes in thin-provisioning.txt dm thin: document representation of <highest mapped sector> when there is none dm thin: fix documentation relative to low water mark threshold dm cache: be consistent in specifying sectors and SI units in cache.txt dm cache: delete obsoleted paragraph in cache.txt dm cache: fix grammar in cache-policies.txt ...
2025-07-23 23:32:14 +00:00 · 2018-01-31 11:05:47 -08:00 · 2018-01-31 11:05:47 -08:00 · 0be600a5ad
commit 0be600a5ad
parent 040639b7fc 9614e2ba91
31 changed files with 1410 additions and 672 deletions
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@ -29,6 +29,9 @@
 */
 #define	MIN_RAID456_JOURNAL_SPACE (4*2048)

+/* Global list of all raid sets */
+static LIST_HEAD(raid_sets);
+
 static bool devices_handle_discard_safely = false;

 /*
@ -105,8 +108,6 @@ struct raid_dev {
 #define CTR_FLAG_JOURNAL_DEV		(1 << __CTR_FLAG_JOURNAL_DEV)
 #define CTR_FLAG_JOURNAL_MODE		(1 << __CTR_FLAG_JOURNAL_MODE)

-#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
-
 /*
 * Definitions of various constructor flags to
 * be used in checks of valid / invalid flags
@ -209,6 +210,8 @@ struct raid_dev {
 #define RT_FLAG_UPDATE_SBS		3
 #define RT_FLAG_RESHAPE_RS		4
 #define RT_FLAG_RS_SUSPENDED		5
+#define RT_FLAG_RS_IN_SYNC		6
+#define RT_FLAG_RS_RESYNCING		7

 /* Array elements of 64 bit needed for rebuild/failed disk bits */
 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@ -224,8 +227,8 @@ struct rs_layout {

 struct raid_set {
 	struct dm_target *ti;
+	struct list_head list;

-	uint32_t bitmap_loaded;
 	uint32_t stripe_cache_entries;
 	unsigned long ctr_flags;
 	unsigned long runtime_flags;
@ -270,6 +273,19 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
 	mddev->new_chunk_sectors = l->new_chunk_sectors;
 }

+/* Find any raid_set in active slot for @rs on global list */
+static struct raid_set *rs_find_active(struct raid_set *rs)
+{
+	struct raid_set *r;
+	struct mapped_device *md = dm_table_get_md(rs->ti->table);
+
+	list_for_each_entry(r, &raid_sets, list)
+		if (r != rs && dm_table_get_md(r->ti->table) == md)
+			return r;
+
+	return NULL;
+}
+
 /* raid10 algorithms (i.e. formats) */
 #define	ALGORITHM_RAID10_DEFAULT	0
 #define	ALGORITHM_RAID10_NEAR		1
@ -572,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout)
 }

 /* Return md raid10 algorithm for @name */
-static int raid10_name_to_format(const char *name)
+static const int raid10_name_to_format(const char *name)
 {
 	if (!strcasecmp(name, "near"))
 		return ALGORITHM_RAID10_NEAR;
@ -675,15 +691,11 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
 	return NULL;
 }

-/*
- * Conditionally change bdev capacity of @rs
- * in case of a disk add/remove reshape
- */
-static void rs_set_capacity(struct raid_set *rs)
+/* Adjust rdev sectors */
+static void rs_set_rdev_sectors(struct raid_set *rs)
 {
 	struct mddev *mddev = &rs->md;
 	struct md_rdev *rdev;
-	struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));

 	/*
 	 * raid10 sets rdev->sector to the device size, which
@ -692,8 +704,16 @@ static void rs_set_capacity(struct raid_set *rs)
 	rdev_for_each(rdev, mddev)
 		if (!test_bit(Journal, &rdev->flags))
 			rdev->sectors = mddev->dev_sectors;
+}

-	set_capacity(gendisk, mddev->array_sectors);
+/*
+ * Change bdev capacity of @rs in case of a disk add/remove reshape
+ */
+static void rs_set_capacity(struct raid_set *rs)
+{
+	struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
+
+	set_capacity(gendisk, rs->md.array_sectors);
 	revalidate_disk(gendisk);
 }

@ -744,6 +764,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r

 	mddev_init(&rs->md);

+	INIT_LIST_HEAD(&rs->list);
 	rs->raid_disks = raid_devs;
 	rs->delta_disks = 0;

@ -761,6 +782,9 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 	for (i = 0; i < raid_devs; i++)
 		md_rdev_init(&rs->dev[i].rdev);

+	/* Add @rs to global list. */
+	list_add(&rs->list, &raid_sets);
+
 	/*
 	 * Remaining items to be initialized by further RAID params:
 	 *  rs->md.persistent
@ -773,6 +797,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 	return rs;
 }

+/* Free all @rs allocations and remove it from global list. */
 static void raid_set_free(struct raid_set *rs)
 {
 	int i;
@ -790,6 +815,8 @@ static void raid_set_free(struct raid_set *rs)
 			dm_put_device(rs->ti, rs->dev[i].data_dev);
 	}

+	list_del(&rs->list);
+
 	kfree(rs);
 }

@ -1002,7 +1029,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		    !rs->dev[i].rdev.sb_page)
 			rebuild_cnt++;

-	switch (rs->raid_type->level) {
+	switch (rs->md.level) {
 	case 0:
 		break;
 	case 1:
@ -1017,6 +1044,11 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		break;
 	case 10:
 		copies = raid10_md_layout_to_copies(rs->md.new_layout);
+		if (copies < 2) {
+			DMERR("Bogus raid10 data copies < 2!");
+			return -EINVAL;
+		}
+
 		if (rebuild_cnt < copies)
 			break;

@ -1576,6 +1608,24 @@ static sector_t __rdev_sectors(struct raid_set *rs)
 	return 0;
 }

+/* Check that calculated dev_sectors fits all component devices. */
+static int _check_data_dev_sectors(struct raid_set *rs)
+{
+	sector_t ds = ~0;
+	struct md_rdev *rdev;
+
+	rdev_for_each(rdev, &rs->md)
+		if (!test_bit(Journal, &rdev->flags) && rdev->bdev) {
+			ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode)));
+			if (ds < rs->md.dev_sectors) {
+				rs->ti->error = "Component device(s) too small";
+				return -EINVAL;
+			}
+		}
+
+	return 0;
+}
+
 /* Calculate the sectors per device and per array used for @rs */
 static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 {
@ -1625,7 +1675,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 	mddev->array_sectors = array_sectors;
 	mddev->dev_sectors = dev_sectors;

-	return 0;
+	return _check_data_dev_sectors(rs);
 bad:
 	rs->ti->error = "Target length not divisible by number of data devices";
 	return -EINVAL;
@ -1674,8 +1724,11 @@ static void do_table_event(struct work_struct *ws)
 	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);

 	smp_rmb(); /* Make sure we access most actual mddev properties */
-	if (!rs_is_reshaping(rs))
+	if (!rs_is_reshaping(rs)) {
+		if (rs_is_raid10(rs))
+			rs_set_rdev_sectors(rs);
 		rs_set_capacity(rs);
+	}
 	dm_table_event(rs->ti->table);
 }

@ -1860,7 +1913,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
 	if (rs_takeover_requested(rs))
 		return false;

-	if (!mddev->level)
+	if (rs_is_raid0(rs))
 		return false;

 	change = mddev->new_layout != mddev->layout ||
@ -1868,7 +1921,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
 		 rs->delta_disks;

 	/* Historical case to support raid1 reshape without delta disks */
-	if (mddev->level == 1) {
+	if (rs_is_raid1(rs)) {
 		if (rs->delta_disks)
 			return !!rs->delta_disks;

@ -1876,7 +1929,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
 		       mddev->raid_disks != rs->raid_disks;
 	}

-	if (mddev->level == 10)
+	if (rs_is_raid10(rs))
 		return change &&
 		       !__is_raid10_far(mddev->new_layout) &&
 		       rs->delta_disks >= 0;
@ -2340,7 +2393,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 			DMERR("new device%s provided without 'rebuild'",
 			      new_devs > 1 ? "s" : "");
 			return -EINVAL;
-		} else if (rs_is_recovering(rs)) {
+		} else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) {
 			DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
 			      (unsigned long long) mddev->recovery_cp);
 			return -EINVAL;
@ -2640,12 +2693,19 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
 	 * Make sure we got a minimum amount of free sectors per device
 	 */
 	if (rs->data_offset &&
-	    to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) {
+	    to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) {
 		rs->ti->error = data_offset ? "No space for forward reshape" :
 					      "No space for backward reshape";
 		return -ENOSPC;
 	}
 out:
+	/*
+	 * Raise recovery_cp in case data_offset != 0 to
+	 * avoid false recovery positives in the constructor.
+	 */
+	if (rs->md.recovery_cp < rs->md.dev_sectors)
+		rs->md.recovery_cp += rs->dev[0].rdev.data_offset;
+
 	/* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
 	rdev_for_each(rdev, &rs->md) {
 		if (!test_bit(Journal, &rdev->flags)) {
@ -2682,14 +2742,14 @@ static int rs_setup_takeover(struct raid_set *rs)
 	sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset;

 	if (rt_is_raid10(rs->raid_type)) {
-		if (mddev->level == 0) {
+		if (rs_is_raid0(rs)) {
 			/* Userpace reordered disks -> adjust raid_disk indexes */
 			__reorder_raid_disk_indexes(rs);

 			/* raid0 -> raid10_far layout */
 			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR,
 								   rs->raid10_copies);
-		} else if (mddev->level == 1)
+		} else if (rs_is_raid1(rs))
 			/* raid1 -> raid10_near layout */
 			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
 								   rs->raid_disks);
@ -2777,6 +2837,23 @@ static int rs_prepare_reshape(struct raid_set *rs)
 	return 0;
 }

+/* Get reshape sectors from data_offsets or raid set */
+static sector_t _get_reshape_sectors(struct raid_set *rs)
+{
+	struct md_rdev *rdev;
+	sector_t reshape_sectors = 0;
+
+	rdev_for_each(rdev, &rs->md)
+		if (!test_bit(Journal, &rdev->flags)) {
+			reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ?
+					rdev->data_offset - rdev->new_data_offset :
+					rdev->new_data_offset - rdev->data_offset;
+			break;
+		}
+
+	return max(reshape_sectors, (sector_t) rs->data_offset);
+}
+
 /*
 *
 * - change raid layout
@ -2788,6 +2865,7 @@ static int rs_setup_reshape(struct raid_set *rs)
 {
 	int r = 0;
 	unsigned int cur_raid_devs, d;
+	sector_t reshape_sectors = _get_reshape_sectors(rs);
 	struct mddev *mddev = &rs->md;
 	struct md_rdev *rdev;

@ -2804,13 +2882,13 @@ static int rs_setup_reshape(struct raid_set *rs)
 	/*
 	 * Adjust array size:
 	 *
-	 * - in case of adding disks, array size has
+	 * - in case of adding disk(s), array size has
 	 *   to grow after the disk adding reshape,
 	 *   which'll hapen in the event handler;
 	 *   reshape will happen forward, so space has to
 	 *   be available at the beginning of each disk
 	 *
-	 * - in case of removing disks, array size
+	 * - in case of removing disk(s), array size
 	 *   has to shrink before starting the reshape,
 	 *   which'll happen here;
 	 *   reshape will happen backward, so space has to
@ -2841,7 +2919,7 @@ static int rs_setup_reshape(struct raid_set *rs)
 			rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector;
 		}

-		mddev->reshape_backwards = 0; /* adding disks -> forward reshape */
+		mddev->reshape_backwards = 0; /* adding disk(s) -> forward reshape */

 	/* Remove disk(s) */
 	} else if (rs->delta_disks < 0) {
@ -2874,6 +2952,15 @@ static int rs_setup_reshape(struct raid_set *rs)
 		mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1;
 	}

+	/*
+	 * Adjust device size for forward reshape
+	 * because md_finish_reshape() reduces it.
+	 */
+	if (!mddev->reshape_backwards)
+		rdev_for_each(rdev, &rs->md)
+			if (!test_bit(Journal, &rdev->flags))
+				rdev->sectors += reshape_sectors;
+
 	return r;
 }

@ -2890,7 +2977,7 @@ static void configure_discard_support(struct raid_set *rs)
 	/*
 	 * XXX: RAID level 4,5,6 require zeroing for safety.
 	 */
-	raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
+	raid456 = rs_is_raid456(rs);

 	for (i = 0; i < rs->raid_disks; i++) {
 		struct request_queue *q;
@ -2915,7 +3002,7 @@ static void configure_discard_support(struct raid_set *rs)
 	 * RAID1 and RAID10 personalities require bio splitting,
 	 * RAID0/4/5/6 don't and process large discard bios properly.
 	 */
-	ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10);
+	ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs));
 	ti->num_discard_bios = 1;
 }

@ -2935,10 +3022,10 @@ static void configure_discard_support(struct raid_set *rs)
 static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	int r;
-	bool resize;
+	bool resize = false;
 	struct raid_type *rt;
 	unsigned int num_raid_params, num_raid_devs;
-	sector_t calculated_dev_sectors, rdev_sectors;
+	sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors;
 	struct raid_set *rs = NULL;
 	const char *arg;
 	struct rs_layout rs_layout;
@ -3021,7 +3108,10 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}

-	resize = calculated_dev_sectors != rdev_sectors;
+
+	reshape_sectors = _get_reshape_sectors(rs);
+	if (calculated_dev_sectors != rdev_sectors)
+		resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors);

 	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->private = rs;
@ -3105,19 +3195,22 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			goto bad;
 		}

-		/*
-		  * We can only prepare for a reshape here, because the
-		  * raid set needs to run to provide the repective reshape
-		  * check functions via its MD personality instance.
-		  *
-		  * So do the reshape check after md_run() succeeded.
-		  */
-		r = rs_prepare_reshape(rs);
-		if (r)
-			return r;
+		/* Out-of-place space has to be available to allow for a reshape unless raid1! */
+		if (reshape_sectors || rs_is_raid1(rs)) {
+			/*
+			  * We can only prepare for a reshape here, because the
+			  * raid set needs to run to provide the repective reshape
+			  * check functions via its MD personality instance.
+			  *
+			  * So do the reshape check after md_run() succeeded.
+			  */
+			r = rs_prepare_reshape(rs);
+			if (r)
+				return r;

-		/* Reshaping ain't recovery, so disable recovery */
-		rs_setup_recovery(rs, MaxSector);
+			/* Reshaping ain't recovery, so disable recovery */
+			rs_setup_recovery(rs, MaxSector);
+		}
 		rs_set_cur(rs);
 	} else {
 		/* May not set recovery when a device rebuild is requested */
@ -3144,7 +3237,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	mddev_lock_nointr(&rs->md);
 	r = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
-
 	if (r) {
 		ti->error = "Failed to run raid array";
 		mddev_unlock(&rs->md);
@ -3248,25 +3340,27 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
 }

 /* Return string describing the current sync action of @mddev */
-static const char *decipher_sync_action(struct mddev *mddev)
+static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery)
 {
-	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
 		return "frozen";

-	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
-		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+	/* The MD sync thread can be done with io but still be running */
+	if (!test_bit(MD_RECOVERY_DONE, &recovery) &&
+	    (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
+	     (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
+		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
 			return "reshape";

-		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
+			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
 				return "resync";
-			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
 				return "check";
 			return "repair";
 		}

-		if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_RECOVER, &recovery))
 			return "recover";
 	}

@ -3283,7 +3377,7 @@ static const char *decipher_sync_action(struct mddev *mddev)
 *  'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
 *  '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
 */
-static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
+static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev)
 {
 	if (!rdev->bdev)
 		return "-";
@ -3291,85 +3385,108 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev,
 		return "D";
 	else if (test_bit(Journal, &rdev->flags))
 		return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
-	else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
+	else if (test_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags) ||
+		 (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) &&
+		  !test_bit(In_sync, &rdev->flags)))
 		return "a";
 	else
 		return "A";
 }

-/* Helper to return resync/reshape progress for @rs and @array_in_sync */
-static sector_t rs_get_progress(struct raid_set *rs,
-				sector_t resync_max_sectors, bool *array_in_sync)
+/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */
+static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
+				sector_t resync_max_sectors)
 {
-	sector_t r, curr_resync_completed;
+	sector_t r;
 	struct mddev *mddev = &rs->md;

-	curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp;
-	*array_in_sync = false;
+	clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
+	clear_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);

 	if (rs_is_raid0(rs)) {
 		r = resync_max_sectors;
-		*array_in_sync = true;
+		set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);

 	} else {
-		r = mddev->reshape_position;
-
-		/* Reshape is relative to the array size */
-		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
-		    r != MaxSector) {
-			if (r == MaxSector) {
-				*array_in_sync = true;
-				r = resync_max_sectors;
-			} else {
-				/* Got to reverse on backward reshape */
-				if (mddev->reshape_backwards)
-					r = mddev->array_sectors - r;
-
-				/* Devide by # of data stripes */
-				sector_div(r, mddev_data_stripes(rs));
-			}
-
-		/* Sync is relative to the component device size */
-		} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
-			r = curr_resync_completed;
+		if (test_bit(MD_RECOVERY_NEEDED, &recovery) ||
+		    test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
+		    test_bit(MD_RECOVERY_RUNNING, &recovery))
+			r = mddev->curr_resync_completed;
 		else
 			r = mddev->recovery_cp;

-		if ((r == MaxSector) ||
-		    (test_bit(MD_RECOVERY_DONE, &mddev->recovery) &&
-		     (mddev->curr_resync_completed == resync_max_sectors))) {
+		if (r >= resync_max_sectors &&
+		    (!test_bit(MD_RECOVERY_REQUESTED, &recovery) ||
+		     (!test_bit(MD_RECOVERY_FROZEN, &recovery) &&
+		      !test_bit(MD_RECOVERY_NEEDED, &recovery) &&
+		      !test_bit(MD_RECOVERY_RUNNING, &recovery)))) {
 			/*
 			 * Sync complete.
 			 */
-			*array_in_sync = true;
-			r = resync_max_sectors;
-		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+			/* In case we have finished recovering, the array is in sync. */
+			if (test_bit(MD_RECOVERY_RECOVER, &recovery))
+				set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
+
+		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) {
+			/*
+			 * In case we are recovering, the array is not in sync
+			 * and health chars should show the recovering legs.
+			 */
+			;
+
+		} else if (test_bit(MD_RECOVERY_SYNC, &recovery) &&
+			   !test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
+			/*
+			 * If "resync" is occurring, the raid set
+			 * is or may be out of sync hence the health
+			 * characters shall be 'a'.
+			 */
+			set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+
+		} else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) &&
+			   !test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
+			/*
+			 * If "reshape" is occurring, the raid set
+			 * is or may be out of sync hence the health
+			 * characters shall be 'a'.
+			 */
+			set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+
+		} else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
 			/*
 			 * If "check" or "repair" is occurring, the raid set has
 			 * undergone an initial sync and the health characters
 			 * should not be 'a' anymore.
 			 */
-			*array_in_sync = true;
+			set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
+
 		} else {
 			struct md_rdev *rdev;

+			/*
+			 * We are idle and recovery is needed, prevent 'A' chars race
+			 * caused by components still set to in-sync by constrcuctor.
+			 */
+			if (test_bit(MD_RECOVERY_NEEDED, &recovery))
+				set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+
 			/*
 			 * The raid set may be doing an initial sync, or it may
 			 * be rebuilding individual components.	 If all the
 			 * devices are In_sync, then it is the raid set that is
 			 * being initialized.
 			 */
+			set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
 			rdev_for_each(rdev, mddev)
 				if (!test_bit(Journal, &rdev->flags) &&
-				    !test_bit(In_sync, &rdev->flags))
-					*array_in_sync = true;
-#if 0
-			r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
-#endif
+				    !test_bit(In_sync, &rdev->flags)) {
+					clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
+					break;
+				}
 		}
 	}

-	return r;
+	return min(r, resync_max_sectors);
 }

 /* Helper to return @dev name or "-" if !@dev */
@ -3385,7 +3502,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	struct mddev *mddev = &rs->md;
 	struct r5conf *conf = mddev->private;
 	int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0;
-	bool array_in_sync;
+	unsigned long recovery;
 	unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
 	unsigned int sz = 0;
 	unsigned int rebuild_disks;
@ -3405,17 +3522,18 @@ static void raid_status(struct dm_target *ti, status_type_t type,

 		/* Access most recent mddev properties for status output */
 		smp_rmb();
+		recovery = rs->md.recovery;
 		/* Get sensible max sectors even if raid set not yet started */
 		resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ?
 				      mddev->resync_max_sectors : mddev->dev_sectors;
-		progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync);
+		progress = rs_get_progress(rs, recovery, resync_max_sectors);
 		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
 				    atomic64_read(&mddev->resync_mismatches) : 0;
-		sync_action = decipher_sync_action(&rs->md);
+		sync_action = decipher_sync_action(&rs->md, recovery);

 		/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
 		for (i = 0; i < rs->raid_disks; i++)
-			DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
+			DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev));

 		/*
 		 * In-sync/Reshape ratio:
@ -3466,7 +3584,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		 * v1.10.0+:
 		 */
 		DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
-			      __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
+			      __raid_dev_status(rs, &rs->journal_dev.rdev) : "-");
 		break;

 	case STATUSTYPE_TABLE:
@ -3622,24 +3740,19 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
 }

-static void raid_presuspend(struct dm_target *ti)
-{
-	struct raid_set *rs = ti->private;
-
-	md_stop_writes(&rs->md);
-}
-
 static void raid_postsuspend(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;

 	if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
+		/* Writes have to be stopped before suspending to avoid deadlocks. */
+		if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
+			md_stop_writes(&rs->md);
+
 		mddev_lock_nointr(&rs->md);
 		mddev_suspend(&rs->md);
 		mddev_unlock(&rs->md);
 	}
-
-	rs->md.ro = 1;
 }

 static void attempt_restore_of_faulty_devices(struct raid_set *rs)
@ -3816,10 +3929,33 @@ static int raid_preresume(struct dm_target *ti)
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;

-	/* This is a resume after a suspend of the set -> it's already started */
+	/* This is a resume after a suspend of the set -> it's already started. */
 	if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
 		return 0;

+	if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
+		struct raid_set *rs_active = rs_find_active(rs);
+
+		if (rs_active) {
+			/*
+			 * In case no rebuilds have been requested
+			 * and an active table slot exists, copy
+			 * current resynchonization completed and
+			 * reshape position pointers across from
+			 * suspended raid set in the active slot.
+			 *
+			 * This resumes the new mapping at current
+			 * offsets to continue recover/reshape without
+			 * necessarily redoing a raid set partially or
+			 * causing data corruption in case of a reshape.
+			 */
+			if (rs_active->md.curr_resync_completed != MaxSector)
+				mddev->curr_resync_completed = rs_active->md.curr_resync_completed;
+			if (rs_active->md.reshape_position != MaxSector)
+				mddev->reshape_position = rs_active->md.reshape_position;
+		}
+	}
+
 	/*
 	 * The superblocks need to be updated on disk if the
 	 * array is new or new devices got added (thus zeroed
@ -3851,11 +3987,10 @@ static int raid_preresume(struct dm_target *ti)
 		mddev->resync_min = mddev->recovery_cp;
 	}

-	rs_set_capacity(rs);
-
 	/* Check for any reshape request unless new raid set */
-	if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
+	if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
 		/* Initiate a reshape. */
+		rs_set_rdev_sectors(rs);
 		mddev_lock_nointr(mddev);
 		r = rs_start_reshape(rs);
 		mddev_unlock(mddev);
@ -3881,21 +4016,15 @@ static void raid_resume(struct dm_target *ti)
 		attempt_restore_of_faulty_devices(rs);
 	}

-	mddev->ro = 0;
-	mddev->in_sync = 0;
-
-	/*
-	 * Keep the RAID set frozen if reshape/rebuild flags are set.
-	 * The RAID set is unfrozen once the next table load/resume,
-	 * which clears the reshape/rebuild flags, occurs.
-	 * This ensures that the constructor for the inactive table
-	 * retrieves an up-to-date reshape_position.
-	 */
-	if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
-		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-
 	if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
+		/* Only reduce raid set size before running a disk removing reshape. */
+		if (mddev->delta_disks < 0)
+			rs_set_capacity(rs);
+
 		mddev_lock_nointr(mddev);
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		mddev->ro = 0;
+		mddev->in_sync = 0;
 		mddev_resume(mddev);
 		mddev_unlock(mddev);
 	}
@ -3903,7 +4032,7 @@ static void raid_resume(struct dm_target *ti)

 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 13, 0},
+	.version = {1, 13, 2},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
@ -3912,7 +4041,6 @@ static struct target_type raid_target = {
 	.message = raid_message,
 	.iterate_devices = raid_iterate_devices,
 	.io_hints = raid_io_hints,
-	.presuspend = raid_presuspend,
 	.postsuspend = raid_postsuspend,
 	.preresume = raid_preresume,
 	.resume = raid_resume,