- DM core fixes to ensure that bio submission follows a depth-first tree

walk; this is critical to allow forward progress without the need to
   use the bioset's BIOSET_NEED_RESCUER.
 
 - Remove DM core's BIOSET_NEED_RESCUER based dm_offload infrastructure.
 
 - DM core cleanups and improvements to make bio-based DM more efficient
   (e.g. reduced memory footprint as well leveraging per-bio-data more).
 
 - Introduce new bio-based mode (DM_TYPE_NVME_BIO_BASED) that leverages
   the more direct IO submission path in the block layer; this mode is
   used by DM multipath and also optimizes targets like DM thin-pool that
   stack directly on NVMe data device.
 
 - DM multipath improvements to factor out legacy SCSI-only
   (e.g. scsi_dh) code paths to allow for more optimized support for NVMe
   multipath.
 
 - A fix for DM multipath path selectors (service-time and queue-length)
   to select paths in a more balanced way; largely academic but doesn't
   hurt.
 
 - Numerous DM raid target fixes and improvements.
 
 - Add a new DM "unstriped" target that enables Intel to workaround
   firmware limitations in some NVMe drives that are striped internally
   (this target also works when stacked above the DM "striped" target).
 
 - Various Documentation fixes and improvements.
 
 - Misc. cleanups and fixes across various DM infrastructure and targets
   (e.g. bufio, flakey, log-writes, snapshot).
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQEcBAABAgAGBQJacgwPAAoJEMUj8QotnQNaEw0H/0XRTcg8/lRuGl46kdeI3PgR
 ZxUy4XgUrCLiACWO5yCU/nKipB32+3xTlTDTBcjmaBfX8HolH147Pasb1KdHqLVC
 dOWLMpjlFztb5fnuOMitJA05qQAbgRlZ52QdVk/FDo9yWicgWjQZduh8aYX53pHw
 6XOYWzSFAXQcaduPdz6TLiPw479xBwIpXxQbrO09f4qt3Ub4bqknEhzFXc+6M7zl
 ejmW/bG2Qg6WmsfAuaAhFTV0LpTPSEzvaq9TfR7yqFU3DvDIAi7Yh8eQinIUDo4u
 txpOGoESRAMPAMKH0/UJdr/u7jTsfgJox4QEavWfnViPvkouah5KdjVOL1veZ5U=
 =R3dN
 -----END PGP SIGNATURE-----

Merge tag 'for-4.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - DM core fixes to ensure that bio submission follows a depth-first
   tree walk; this is critical to allow forward progress without the
   need to use the bioset's BIOSET_NEED_RESCUER.

 - Remove DM core's BIOSET_NEED_RESCUER based dm_offload infrastructure.

 - DM core cleanups and improvements to make bio-based DM more efficient
   (e.g. reduced memory footprint as well leveraging per-bio-data more).

 - Introduce new bio-based mode (DM_TYPE_NVME_BIO_BASED) that leverages
   the more direct IO submission path in the block layer; this mode is
   used by DM multipath and also optimizes targets like DM thin-pool
   that stack directly on NVMe data device.

 - DM multipath improvements to factor out legacy SCSI-only (e.g.
   scsi_dh) code paths to allow for more optimized support for NVMe
   multipath.

 - A fix for DM multipath path selectors (service-time and queue-length)
   to select paths in a more balanced way; largely academic but doesn't
   hurt.

 - Numerous DM raid target fixes and improvements.

 - Add a new DM "unstriped" target that enables Intel to workaround
   firmware limitations in some NVMe drives that are striped internally
   (this target also works when stacked above the DM "striped" target).

 - Various Documentation fixes and improvements.

 - Misc cleanups and fixes across various DM infrastructure and targets
   (e.g. bufio, flakey, log-writes, snapshot).

* tag 'for-4.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (69 commits)
  dm cache: Documentation: update default migration_throttling value
  dm mpath selector: more evenly distribute ties
  dm unstripe: fix target length versus number of stripes size check
  dm thin: fix trailing semicolon in __remap_and_issue_shared_cell
  dm table: fix NVMe bio-based dm_table_determine_type() validation
  dm: various cleanups to md->queue initialization code
  dm mpath: delay the retry of a request if the target responded as busy
  dm mpath: return DM_MAPIO_DELAY_REQUEUE if QUEUE_IO or PG_INIT_REQUIRED
  dm mpath: return DM_MAPIO_REQUEUE on blk-mq rq allocation failure
  dm log writes: fix max length used for kstrndup
  dm: backfill missing calls to mutex_destroy()
  dm snapshot: use mutex instead of rw_semaphore
  dm flakey: check for null arg_name in parse_features()
  dm thin: extend thinpool status format string with omitted fields
  dm thin: fixes in thin-provisioning.txt
  dm thin: document representation of <highest mapped sector> when there is none
  dm thin: fix documentation relative to low water mark threshold
  dm cache: be consistent in specifying sectors and SI units in cache.txt
  dm cache: delete obsoleted paragraph in cache.txt
  dm cache: fix grammar in cache-policies.txt
  ...
This commit is contained in:
Linus Torvalds 2018-01-31 11:05:47 -08:00
commit 0be600a5ad
31 changed files with 1410 additions and 672 deletions

View file

@ -29,6 +29,9 @@
*/
#define MIN_RAID456_JOURNAL_SPACE (4*2048)
/* Global list of all raid sets */
static LIST_HEAD(raid_sets);
static bool devices_handle_discard_safely = false;
/*
@ -105,8 +108,6 @@ struct raid_dev {
#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE)
#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
/*
* Definitions of various constructor flags to
* be used in checks of valid / invalid flags
@ -209,6 +210,8 @@ struct raid_dev {
#define RT_FLAG_UPDATE_SBS 3
#define RT_FLAG_RESHAPE_RS 4
#define RT_FLAG_RS_SUSPENDED 5
#define RT_FLAG_RS_IN_SYNC 6
#define RT_FLAG_RS_RESYNCING 7
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@ -224,8 +227,8 @@ struct rs_layout {
struct raid_set {
struct dm_target *ti;
struct list_head list;
uint32_t bitmap_loaded;
uint32_t stripe_cache_entries;
unsigned long ctr_flags;
unsigned long runtime_flags;
@ -270,6 +273,19 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
mddev->new_chunk_sectors = l->new_chunk_sectors;
}
/* Find any raid_set in active slot for @rs on global list */
static struct raid_set *rs_find_active(struct raid_set *rs)
{
struct raid_set *r;
struct mapped_device *md = dm_table_get_md(rs->ti->table);
list_for_each_entry(r, &raid_sets, list)
if (r != rs && dm_table_get_md(r->ti->table) == md)
return r;
return NULL;
}
/* raid10 algorithms (i.e. formats) */
#define ALGORITHM_RAID10_DEFAULT 0
#define ALGORITHM_RAID10_NEAR 1
@ -572,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout)
}
/* Return md raid10 algorithm for @name */
static int raid10_name_to_format(const char *name)
static const int raid10_name_to_format(const char *name)
{
if (!strcasecmp(name, "near"))
return ALGORITHM_RAID10_NEAR;
@ -675,15 +691,11 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
return NULL;
}
/*
* Conditionally change bdev capacity of @rs
* in case of a disk add/remove reshape
*/
static void rs_set_capacity(struct raid_set *rs)
/* Adjust rdev sectors */
static void rs_set_rdev_sectors(struct raid_set *rs)
{
struct mddev *mddev = &rs->md;
struct md_rdev *rdev;
struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
/*
* raid10 sets rdev->sector to the device size, which
@ -692,8 +704,16 @@ static void rs_set_capacity(struct raid_set *rs)
rdev_for_each(rdev, mddev)
if (!test_bit(Journal, &rdev->flags))
rdev->sectors = mddev->dev_sectors;
}
set_capacity(gendisk, mddev->array_sectors);
/*
* Change bdev capacity of @rs in case of a disk add/remove reshape
*/
static void rs_set_capacity(struct raid_set *rs)
{
struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
set_capacity(gendisk, rs->md.array_sectors);
revalidate_disk(gendisk);
}
@ -744,6 +764,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
mddev_init(&rs->md);
INIT_LIST_HEAD(&rs->list);
rs->raid_disks = raid_devs;
rs->delta_disks = 0;
@ -761,6 +782,9 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
for (i = 0; i < raid_devs; i++)
md_rdev_init(&rs->dev[i].rdev);
/* Add @rs to global list. */
list_add(&rs->list, &raid_sets);
/*
* Remaining items to be initialized by further RAID params:
* rs->md.persistent
@ -773,6 +797,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
return rs;
}
/* Free all @rs allocations and remove it from global list. */
static void raid_set_free(struct raid_set *rs)
{
int i;
@ -790,6 +815,8 @@ static void raid_set_free(struct raid_set *rs)
dm_put_device(rs->ti, rs->dev[i].data_dev);
}
list_del(&rs->list);
kfree(rs);
}
@ -1002,7 +1029,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
!rs->dev[i].rdev.sb_page)
rebuild_cnt++;
switch (rs->raid_type->level) {
switch (rs->md.level) {
case 0:
break;
case 1:
@ -1017,6 +1044,11 @@ static int validate_raid_redundancy(struct raid_set *rs)
break;
case 10:
copies = raid10_md_layout_to_copies(rs->md.new_layout);
if (copies < 2) {
DMERR("Bogus raid10 data copies < 2!");
return -EINVAL;
}
if (rebuild_cnt < copies)
break;
@ -1576,6 +1608,24 @@ static sector_t __rdev_sectors(struct raid_set *rs)
return 0;
}
/* Check that calculated dev_sectors fits all component devices. */
static int _check_data_dev_sectors(struct raid_set *rs)
{
sector_t ds = ~0;
struct md_rdev *rdev;
rdev_for_each(rdev, &rs->md)
if (!test_bit(Journal, &rdev->flags) && rdev->bdev) {
ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode)));
if (ds < rs->md.dev_sectors) {
rs->ti->error = "Component device(s) too small";
return -EINVAL;
}
}
return 0;
}
/* Calculate the sectors per device and per array used for @rs */
static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
{
@ -1625,7 +1675,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
mddev->array_sectors = array_sectors;
mddev->dev_sectors = dev_sectors;
return 0;
return _check_data_dev_sectors(rs);
bad:
rs->ti->error = "Target length not divisible by number of data devices";
return -EINVAL;
@ -1674,8 +1724,11 @@ static void do_table_event(struct work_struct *ws)
struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
smp_rmb(); /* Make sure we access most actual mddev properties */
if (!rs_is_reshaping(rs))
if (!rs_is_reshaping(rs)) {
if (rs_is_raid10(rs))
rs_set_rdev_sectors(rs);
rs_set_capacity(rs);
}
dm_table_event(rs->ti->table);
}
@ -1860,7 +1913,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
if (rs_takeover_requested(rs))
return false;
if (!mddev->level)
if (rs_is_raid0(rs))
return false;
change = mddev->new_layout != mddev->layout ||
@ -1868,7 +1921,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
rs->delta_disks;
/* Historical case to support raid1 reshape without delta disks */
if (mddev->level == 1) {
if (rs_is_raid1(rs)) {
if (rs->delta_disks)
return !!rs->delta_disks;
@ -1876,7 +1929,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
mddev->raid_disks != rs->raid_disks;
}
if (mddev->level == 10)
if (rs_is_raid10(rs))
return change &&
!__is_raid10_far(mddev->new_layout) &&
rs->delta_disks >= 0;
@ -2340,7 +2393,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
DMERR("new device%s provided without 'rebuild'",
new_devs > 1 ? "s" : "");
return -EINVAL;
} else if (rs_is_recovering(rs)) {
} else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) {
DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
(unsigned long long) mddev->recovery_cp);
return -EINVAL;
@ -2640,12 +2693,19 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
* Make sure we got a minimum amount of free sectors per device
*/
if (rs->data_offset &&
to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) {
to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) {
rs->ti->error = data_offset ? "No space for forward reshape" :
"No space for backward reshape";
return -ENOSPC;
}
out:
/*
* Raise recovery_cp in case data_offset != 0 to
* avoid false recovery positives in the constructor.
*/
if (rs->md.recovery_cp < rs->md.dev_sectors)
rs->md.recovery_cp += rs->dev[0].rdev.data_offset;
/* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
rdev_for_each(rdev, &rs->md) {
if (!test_bit(Journal, &rdev->flags)) {
@ -2682,14 +2742,14 @@ static int rs_setup_takeover(struct raid_set *rs)
sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset;
if (rt_is_raid10(rs->raid_type)) {
if (mddev->level == 0) {
if (rs_is_raid0(rs)) {
/* Userpace reordered disks -> adjust raid_disk indexes */
__reorder_raid_disk_indexes(rs);
/* raid0 -> raid10_far layout */
mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR,
rs->raid10_copies);
} else if (mddev->level == 1)
} else if (rs_is_raid1(rs))
/* raid1 -> raid10_near layout */
mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
rs->raid_disks);
@ -2777,6 +2837,23 @@ static int rs_prepare_reshape(struct raid_set *rs)
return 0;
}
/* Get reshape sectors from data_offsets or raid set */
static sector_t _get_reshape_sectors(struct raid_set *rs)
{
struct md_rdev *rdev;
sector_t reshape_sectors = 0;
rdev_for_each(rdev, &rs->md)
if (!test_bit(Journal, &rdev->flags)) {
reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ?
rdev->data_offset - rdev->new_data_offset :
rdev->new_data_offset - rdev->data_offset;
break;
}
return max(reshape_sectors, (sector_t) rs->data_offset);
}
/*
*
* - change raid layout
@ -2788,6 +2865,7 @@ static int rs_setup_reshape(struct raid_set *rs)
{
int r = 0;
unsigned int cur_raid_devs, d;
sector_t reshape_sectors = _get_reshape_sectors(rs);
struct mddev *mddev = &rs->md;
struct md_rdev *rdev;
@ -2804,13 +2882,13 @@ static int rs_setup_reshape(struct raid_set *rs)
/*
* Adjust array size:
*
* - in case of adding disks, array size has
* - in case of adding disk(s), array size has
* to grow after the disk adding reshape,
* which'll hapen in the event handler;
* reshape will happen forward, so space has to
* be available at the beginning of each disk
*
* - in case of removing disks, array size
* - in case of removing disk(s), array size
* has to shrink before starting the reshape,
* which'll happen here;
* reshape will happen backward, so space has to
@ -2841,7 +2919,7 @@ static int rs_setup_reshape(struct raid_set *rs)
rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector;
}
mddev->reshape_backwards = 0; /* adding disks -> forward reshape */
mddev->reshape_backwards = 0; /* adding disk(s) -> forward reshape */
/* Remove disk(s) */
} else if (rs->delta_disks < 0) {
@ -2874,6 +2952,15 @@ static int rs_setup_reshape(struct raid_set *rs)
mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1;
}
/*
* Adjust device size for forward reshape
* because md_finish_reshape() reduces it.
*/
if (!mddev->reshape_backwards)
rdev_for_each(rdev, &rs->md)
if (!test_bit(Journal, &rdev->flags))
rdev->sectors += reshape_sectors;
return r;
}
@ -2890,7 +2977,7 @@ static void configure_discard_support(struct raid_set *rs)
/*
* XXX: RAID level 4,5,6 require zeroing for safety.
*/
raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
raid456 = rs_is_raid456(rs);
for (i = 0; i < rs->raid_disks; i++) {
struct request_queue *q;
@ -2915,7 +3002,7 @@ static void configure_discard_support(struct raid_set *rs)
* RAID1 and RAID10 personalities require bio splitting,
* RAID0/4/5/6 don't and process large discard bios properly.
*/
ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10);
ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs));
ti->num_discard_bios = 1;
}
@ -2935,10 +3022,10 @@ static void configure_discard_support(struct raid_set *rs)
static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
int r;
bool resize;
bool resize = false;
struct raid_type *rt;
unsigned int num_raid_params, num_raid_devs;
sector_t calculated_dev_sectors, rdev_sectors;
sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors;
struct raid_set *rs = NULL;
const char *arg;
struct rs_layout rs_layout;
@ -3021,7 +3108,10 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
resize = calculated_dev_sectors != rdev_sectors;
reshape_sectors = _get_reshape_sectors(rs);
if (calculated_dev_sectors != rdev_sectors)
resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors);
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
@ -3105,19 +3195,22 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
/*
* We can only prepare for a reshape here, because the
* raid set needs to run to provide the repective reshape
* check functions via its MD personality instance.
*
* So do the reshape check after md_run() succeeded.
*/
r = rs_prepare_reshape(rs);
if (r)
return r;
/* Out-of-place space has to be available to allow for a reshape unless raid1! */
if (reshape_sectors || rs_is_raid1(rs)) {
/*
* We can only prepare for a reshape here, because the
* raid set needs to run to provide the repective reshape
* check functions via its MD personality instance.
*
* So do the reshape check after md_run() succeeded.
*/
r = rs_prepare_reshape(rs);
if (r)
return r;
/* Reshaping ain't recovery, so disable recovery */
rs_setup_recovery(rs, MaxSector);
/* Reshaping ain't recovery, so disable recovery */
rs_setup_recovery(rs, MaxSector);
}
rs_set_cur(rs);
} else {
/* May not set recovery when a device rebuild is requested */
@ -3144,7 +3237,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
mddev_lock_nointr(&rs->md);
r = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
if (r) {
ti->error = "Failed to run raid array";
mddev_unlock(&rs->md);
@ -3248,25 +3340,27 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
}
/* Return string describing the current sync action of @mddev */
static const char *decipher_sync_action(struct mddev *mddev)
static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery)
{
if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
if (test_bit(MD_RECOVERY_FROZEN, &recovery))
return "frozen";
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
(!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
/* The MD sync thread can be done with io but still be running */
if (!test_bit(MD_RECOVERY_DONE, &recovery) &&
(test_bit(MD_RECOVERY_RUNNING, &recovery) ||
(!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
return "reshape";
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
return "resync";
else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
else if (test_bit(MD_RECOVERY_CHECK, &recovery))
return "check";
return "repair";
}
if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
if (test_bit(MD_RECOVERY_RECOVER, &recovery))
return "recover";
}
@ -3283,7 +3377,7 @@ static const char *decipher_sync_action(struct mddev *mddev)
* 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
* '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
*/
static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev)
{
if (!rdev->bdev)
return "-";
@ -3291,85 +3385,108 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev,
return "D";
else if (test_bit(Journal, &rdev->flags))
return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
else if (test_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags) ||
(!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) &&
!test_bit(In_sync, &rdev->flags)))
return "a";
else
return "A";
}
/* Helper to return resync/reshape progress for @rs and @array_in_sync */
static sector_t rs_get_progress(struct raid_set *rs,
sector_t resync_max_sectors, bool *array_in_sync)
/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */
static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
sector_t resync_max_sectors)
{
sector_t r, curr_resync_completed;
sector_t r;
struct mddev *mddev = &rs->md;
curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp;
*array_in_sync = false;
clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
clear_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
if (rs_is_raid0(rs)) {
r = resync_max_sectors;
*array_in_sync = true;
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
} else {
r = mddev->reshape_position;
/* Reshape is relative to the array size */
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
r != MaxSector) {
if (r == MaxSector) {
*array_in_sync = true;
r = resync_max_sectors;
} else {
/* Got to reverse on backward reshape */
if (mddev->reshape_backwards)
r = mddev->array_sectors - r;
/* Devide by # of data stripes */
sector_div(r, mddev_data_stripes(rs));
}
/* Sync is relative to the component device size */
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
r = curr_resync_completed;
if (test_bit(MD_RECOVERY_NEEDED, &recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
test_bit(MD_RECOVERY_RUNNING, &recovery))
r = mddev->curr_resync_completed;
else
r = mddev->recovery_cp;
if ((r == MaxSector) ||
(test_bit(MD_RECOVERY_DONE, &mddev->recovery) &&
(mddev->curr_resync_completed == resync_max_sectors))) {
if (r >= resync_max_sectors &&
(!test_bit(MD_RECOVERY_REQUESTED, &recovery) ||
(!test_bit(MD_RECOVERY_FROZEN, &recovery) &&
!test_bit(MD_RECOVERY_NEEDED, &recovery) &&
!test_bit(MD_RECOVERY_RUNNING, &recovery)))) {
/*
* Sync complete.
*/
*array_in_sync = true;
r = resync_max_sectors;
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* In case we have finished recovering, the array is in sync. */
if (test_bit(MD_RECOVERY_RECOVER, &recovery))
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
} else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) {
/*
* In case we are recovering, the array is not in sync
* and health chars should show the recovering legs.
*/
;
} else if (test_bit(MD_RECOVERY_SYNC, &recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
/*
* If "resync" is occurring, the raid set
* is or may be out of sync hence the health
* characters shall be 'a'.
*/
set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
} else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
/*
* If "reshape" is occurring, the raid set
* is or may be out of sync hence the health
* characters shall be 'a'.
*/
set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
} else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
/*
* If "check" or "repair" is occurring, the raid set has
* undergone an initial sync and the health characters
* should not be 'a' anymore.
*/
*array_in_sync = true;
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
} else {
struct md_rdev *rdev;
/*
* We are idle and recovery is needed, prevent 'A' chars race
* caused by components still set to in-sync by constrcuctor.
*/
if (test_bit(MD_RECOVERY_NEEDED, &recovery))
set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
/*
* The raid set may be doing an initial sync, or it may
* be rebuilding individual components. If all the
* devices are In_sync, then it is the raid set that is
* being initialized.
*/
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
rdev_for_each(rdev, mddev)
if (!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags))
*array_in_sync = true;
#if 0
r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
#endif
!test_bit(In_sync, &rdev->flags)) {
clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
break;
}
}
}
return r;
return min(r, resync_max_sectors);
}
/* Helper to return @dev name or "-" if !@dev */
@ -3385,7 +3502,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
struct mddev *mddev = &rs->md;
struct r5conf *conf = mddev->private;
int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0;
bool array_in_sync;
unsigned long recovery;
unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
unsigned int sz = 0;
unsigned int rebuild_disks;
@ -3405,17 +3522,18 @@ static void raid_status(struct dm_target *ti, status_type_t type,
/* Access most recent mddev properties for status output */
smp_rmb();
recovery = rs->md.recovery;
/* Get sensible max sectors even if raid set not yet started */
resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ?
mddev->resync_max_sectors : mddev->dev_sectors;
progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync);
progress = rs_get_progress(rs, recovery, resync_max_sectors);
resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
atomic64_read(&mddev->resync_mismatches) : 0;
sync_action = decipher_sync_action(&rs->md);
sync_action = decipher_sync_action(&rs->md, recovery);
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
for (i = 0; i < rs->raid_disks; i++)
DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev));
/*
* In-sync/Reshape ratio:
@ -3466,7 +3584,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* v1.10.0+:
*/
DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
__raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
__raid_dev_status(rs, &rs->journal_dev.rdev) : "-");
break;
case STATUSTYPE_TABLE:
@ -3622,24 +3740,19 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
}
static void raid_presuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
md_stop_writes(&rs->md);
}
static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
/* Writes have to be stopped before suspending to avoid deadlocks. */
if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
md_stop_writes(&rs->md);
mddev_lock_nointr(&rs->md);
mddev_suspend(&rs->md);
mddev_unlock(&rs->md);
}
rs->md.ro = 1;
}
static void attempt_restore_of_faulty_devices(struct raid_set *rs)
@ -3816,10 +3929,33 @@ static int raid_preresume(struct dm_target *ti)
struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md;
/* This is a resume after a suspend of the set -> it's already started */
/* This is a resume after a suspend of the set -> it's already started. */
if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
return 0;
if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
struct raid_set *rs_active = rs_find_active(rs);
if (rs_active) {
/*
* In case no rebuilds have been requested
* and an active table slot exists, copy
* current resynchonization completed and
* reshape position pointers across from
* suspended raid set in the active slot.
*
* This resumes the new mapping at current
* offsets to continue recover/reshape without
* necessarily redoing a raid set partially or
* causing data corruption in case of a reshape.
*/
if (rs_active->md.curr_resync_completed != MaxSector)
mddev->curr_resync_completed = rs_active->md.curr_resync_completed;
if (rs_active->md.reshape_position != MaxSector)
mddev->reshape_position = rs_active->md.reshape_position;
}
}
/*
* The superblocks need to be updated on disk if the
* array is new or new devices got added (thus zeroed
@ -3851,11 +3987,10 @@ static int raid_preresume(struct dm_target *ti)
mddev->resync_min = mddev->recovery_cp;
}
rs_set_capacity(rs);
/* Check for any reshape request unless new raid set */
if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
/* Initiate a reshape. */
rs_set_rdev_sectors(rs);
mddev_lock_nointr(mddev);
r = rs_start_reshape(rs);
mddev_unlock(mddev);
@ -3881,21 +4016,15 @@ static void raid_resume(struct dm_target *ti)
attempt_restore_of_faulty_devices(rs);
}
mddev->ro = 0;
mddev->in_sync = 0;
/*
* Keep the RAID set frozen if reshape/rebuild flags are set.
* The RAID set is unfrozen once the next table load/resume,
* which clears the reshape/rebuild flags, occurs.
* This ensures that the constructor for the inactive table
* retrieves an up-to-date reshape_position.
*/
if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
/* Only reduce raid set size before running a disk removing reshape. */
if (mddev->delta_disks < 0)
rs_set_capacity(rs);
mddev_lock_nointr(mddev);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0;
mddev->in_sync = 0;
mddev_resume(mddev);
mddev_unlock(mddev);
}
@ -3903,7 +4032,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
.version = {1, 13, 0},
.version = {1, 13, 2},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
@ -3912,7 +4041,6 @@ static struct target_type raid_target = {
.message = raid_message,
.iterate_devices = raid_iterate_devices,
.io_hints = raid_io_hints,
.presuspend = raid_presuspend,
.postsuspend = raid_postsuspend,
.preresume = raid_preresume,
.resume = raid_resume,