mirror of https://github.com/Fishwaldo/Star64_linux.git
synced 2025-03-27 17:44:30 +00:00

commit d51e4fe6d6
Merge branch 'cluster' into for-next

12 changed files with 1709 additions and 82 deletions
Documentation/md-cluster.txt (new file, 176 lines)
@@ -0,0 +1,176 @@
The cluster MD is a shared-device RAID for a cluster.

1. On-disk format

Separate write-intent bitmaps are used for each cluster node.
The bitmaps record all writes that may have been started on that node,
and may not yet have finished. The on-disk layout is:

0                   4k                     8k                    12k
-------------------------------------------------------------------
| idle              | md super             | bm super [0] + bits |
| bm bits[0, contd] | bm super[1] + bits   | bm bits[1, contd]   |
| bm super[2] + bits| bm bits [2, contd]   | bm super[3] + bits  |
| bm bits [3, contd]|                      |                     |

During "normal" functioning we assume the filesystem ensures that only one
node writes to any given block at a time, so a write request will
(as sketched in code after this list):

 - set the appropriate bit (if not already set)
 - commit the write to all mirrors
 - schedule the bit to be cleared after a timeout.
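
A hedged sketch of that write path, expressed with the md bitmap hooks
that already exist in the tree (bitmap_startwrite()/bitmap_endwrite()
from drivers/md/bitmap.h; raid1's real request path is more involved,
and error handling is elided):

	/* set the bit (and dirty its page) before the write goes down */
	bitmap_startwrite(mddev->bitmap, bio->bi_iter.bi_sector,
			  bio_sectors(bio), 0);
	/* ... submit the write to every mirror ... */
	/* on completion: arrange for the bit to be cleared once the
	 * daemon's timeout expires */
	bitmap_endwrite(mddev->bitmap, bio->bi_iter.bi_sector,
			bio_sectors(bio), !error, 0);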

Reads are just handled normally. It is up to the filesystem to
ensure one node doesn't read from a location where another node (or the same
node) is writing.


2. DLM Locks for management

There are two locks for managing the device:

2.1 Bitmap lock resource (bm_lockres)

The bm_lockres protects individual node bitmaps. They are named in the
form bitmap001 for node 1, bitmap002 for node 2, and so on. When a node
joins the cluster, it acquires the lock in PW mode and it stays so
during the lifetime the node is part of the cluster. The lock resource
number is based on the slot number returned by the DLM subsystem. Since
DLM starts node count from one and bitmap slots start from zero, one is
subtracted from the DLM slot number to arrive at the bitmap slot number.
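
For illustration only, the slot arithmetic above reduces to a couple of
lines; this sketch mirrors what join() in drivers/md/md-cluster.c (added
below) actually does. Note that the implementation formats the name as
"bitmap%04d" counting from zero, so DLM slot 1 maps to "bitmap0000":

	char str[64];
	/* DLM slots count from 1, bitmap slots from 0 */
	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);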

3. Communication

Each node has to communicate with other nodes when starting or ending
resync, and for metadata superblock updates.

3.1 Message Types

There are 3 types of messages which are passed:

3.1.1 METADATA_UPDATED: informs other nodes that the metadata has been
updated, and the node must re-read the md superblock. This is performed
synchronously.

3.1.2 RESYNC: informs other nodes that a resync is initiated or ended
so that each node may suspend or resume the region.

3.2 Communication mechanism

The DLM LVB is used to communicate between the nodes of the cluster. There
are three resources used for the purpose:

3.2.1 Token: The resource which protects the entire communication
system. The node having the token resource is allowed to
communicate.

3.2.2 Message: The lock resource which carries the data to
communicate.

3.2.3 Ack: The resource, acquiring which means the message has been
acknowledged by all nodes in the cluster. The BAST of the resource
is used to inform the receiving node that a node wants to communicate.

The algorithm is:

1. receive status

   sender                     receiver                 receiver
   ACK:CR                     ACK:CR                   ACK:CR

2. sender get EX of TOKEN
   sender get EX of MESSAGE

   sender                     receiver                 receiver
   TOKEN:EX                   ACK:CR                   ACK:CR
   MESSAGE:EX
   ACK:CR

   Sender checks that it still needs to send a message. Messages received
   or other events that happened while waiting for the TOKEN may have made
   this message inappropriate or redundant.

3. sender write LVB.
   sender down-convert MESSAGE from EX to CR
   sender try to get EX of ACK
   [ wait until all receivers have *processed* the MESSAGE ]

   [ triggered by bast of ACK ]
   receiver get CR of MESSAGE
   receiver read LVB
   receiver processes the message
   [ wait finish ]
   receiver release ACK

   sender                     receiver                 receiver
   TOKEN:EX                   MESSAGE:CR               MESSAGE:CR
   MESSAGE:CR
   ACK:EX

4. triggered by grant of EX on ACK (indicating all receivers have processed
   message)
   sender down-convert ACK from EX to CR
   sender release MESSAGE
   sender release TOKEN
   receiver upconvert to EX of MESSAGE
   receiver get CR of ACK
   receiver release MESSAGE

   sender                     receiver                 receiver
   ACK:CR                     ACK:CR                   ACK:CR
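
A minimal sketch of the sender side of this handshake, written in terms
of the dlm_lock_sync()/dlm_unlock_sync() helpers this commit adds in
drivers/md/md-cluster.c (error handling elided; see __sendmsg() and
unlock_comm() there for the real sequence):

	dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);   /* 2. own TOKEN   */
	dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX); /* 2. own MESSAGE */
	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, cmsg,
	       sizeof(struct cluster_msg));                 /* 3. write LVB   */
	dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR); /* 3. publish     */
	dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);     /* 3. granted only
	                             once every receiver drops its ACK:CR     */
	dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);     /* 4. re-arm ACK  */
	dlm_unlock_sync(cinfo->message_lockres);            /* 4. release     */
	dlm_unlock_sync(cinfo->token_lockres);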

4. Handling Failures

4.1 Node Failure

When a node fails, the DLM informs the cluster with the slot. The node
starts a cluster recovery thread. The cluster recovery thread:

 - acquires the bitmap<number> lock of the failed node
 - opens the bitmap
 - reads the bitmap of the failed node
 - copies the set bitmap to local node
 - cleans the bitmap of the failed node
 - releases bitmap<number> lock of the failed node
 - initiates resync of the bitmap on the current node

The resync process is the regular md resync. However, in a clustered
environment, when a resync is performed, it needs to tell other nodes
about the areas which are suspended. Before a resync starts, the node
sends out RESYNC_START with the (lo,hi) range of the area which needs
to be suspended. Each node maintains a suspend_list, which contains
the list of ranges which are currently suspended. On receiving
RESYNC_START, the node adds the range to the suspend_list. Similarly,
when the node performing resync finishes, it sends RESYNC_FINISHED
to other nodes, and the other nodes remove the corresponding entry from
the suspend_list.

A helper function, should_suspend(), can be used to check whether a
particular I/O range should be suspended or not.
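
The helper itself is not spelled out in this document; a minimal sketch,
assuming it walks the per-node suspend_list of (slot, lo, hi) entries
that the suspend_info structure added below provides, might look like:

	/* Sketch: does [lo, hi) overlap any currently suspended range? */
	static bool should_suspend(struct md_cluster_info *cinfo,
				   sector_t lo, sector_t hi)
	{
		struct suspend_info *s;
		bool ret = false;

		spin_lock_irq(&cinfo->suspend_lock);
		list_for_each_entry(s, &cinfo->suspend_list, list)
			if (hi > s->lo && lo < s->hi) {
				ret = true;
				break;
			}
		spin_unlock_irq(&cinfo->suspend_lock);
		return ret;
	}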

4.2 Device Failure

Device failures are handled and communicated with the metadata update
routine.

5. Adding a new Device

For adding a new device, it is necessary that all nodes "see" the new
device to be added. For this, the following algorithm is used:

1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY, which issues
   ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
2. Node 1 sends NEWDISK with uuid and slot number
3. Other nodes issue kobject_uevent_env with uuid and slot number
   (Steps 4,5 could be a udev rule)
4. In userspace, the node searches for the disk, perhaps
   using blkid -t SUB_UUID=""
5. Other nodes issue either of the following depending on whether the disk
   was found:
   ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
   disc.number set to slot number)
   ioctl(CLUSTERED_DISK_NACK)
6. Other nodes drop lock on no-new-devs (CR) if device is found
7. Node 1 attempts EX lock on no-new-devs
8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk
   as SpareLocal
9. If not (it fails to get the no-new-dev lock), it fails the operation and
   sends METADATA_UPDATED
10. Other nodes get the information whether a disk is added or not
    by the following METADATA_UPDATED.

drivers/md/Kconfig
@@ -175,6 +175,22 @@ config MD_FAULTY
 
 	  In unsure, say N.
 
+config MD_CLUSTER
+	tristate "Cluster Support for MD (EXPERIMENTAL)"
+	depends on BLK_DEV_MD
+	depends on DLM
+	default n
+	---help---
+	Clustering support for MD devices. This enables locking and
+	synchronization across multiple systems on the cluster, so all
+	nodes in the cluster can access the MD devices simultaneously.
+
+	This brings the redundancy (and uptime) of RAID levels across the
+	nodes of the cluster.
+
+	If unsure, say N.
+
 source "drivers/md/bcache/Kconfig"
 
 config BLK_DEV_DM_BUILTIN
drivers/md/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
+obj-$(CONFIG_MD_CLUSTER)	+= md-cluster.o
 obj-$(CONFIG_BCACHE)		+= bcache/
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
drivers/md/bitmap.c
@@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
+	int node_offset = 0;
+
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * store->file_pages;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
@@ -433,6 +437,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* This might have been changed by a reshape */
 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
 	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
+	sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
 	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
 					   bitmap_info.space);
 	kunmap_atomic(sb);
@@ -544,6 +549,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long long events;
+	int nodes = 0;
 	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
@@ -562,6 +568,22 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		return -ENOMEM;
 	bitmap->storage.sb_page = sb_page;
 
+re_read:
+	/* If cluster_slot is set, the cluster is setup */
+	if (bitmap->cluster_slot >= 0) {
+		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
+
+		sector_div(bm_blocks,
+			   bitmap->mddev->bitmap_info.chunksize >> 9);
+		/* bits to bytes */
+		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
+		/* to 4k blocks */
+		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
+		bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+			bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+	}
+
 	if (bitmap->storage.file) {
 		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
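
As a worked example of the per-slot offset computed above (assumed
parameters, not taken from the patch): a 1 TB array has
resync_max_sectors = 2^31 512-byte sectors; with a 64 MB bitmap chunk,
chunksize >> 9 = 2^17, so bm_blocks starts as 2^31 / 2^17 = 16384 chunk
bits. Bits to bytes gives 16384/8 + 256 (sizeof(bitmap_super_t)) = 2304
bytes, which rounds up to one 4k block; bm_blocks << 3 converts that to
8 sectors, so slot n reads its bitmap at bitmap_info.offset + 8*n
sectors -- matching the one-4k-block-per-slot layout pictured in
Documentation/md-cluster.txt above.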
@@ -577,12 +599,15 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (err)
 		return err;
 
+	err = -EINVAL;
 	sb = kmap_atomic(sb_page);
 
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
 	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
+	nodes = le32_to_cpu(sb->nodes);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -619,7 +644,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		goto out;
 	}
 	events = le64_to_cpu(sb->events);
-	if (events < bitmap->mddev->events) {
+	if (!nodes && (events < bitmap->mddev->events)) {
 		printk(KERN_INFO
 		       "%s: bitmap file is out of date (%llu < %llu) "
 		       "-- forcing full recovery\n",
@@ -634,20 +659,40 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 	err = 0;
+
 out:
 	kunmap_atomic(sb);
+	/* Assigning chunksize is required for "re_read" */
+	bitmap->mddev->bitmap_info.chunksize = chunksize;
+	if (nodes && (bitmap->cluster_slot < 0)) {
+		err = md_setup_cluster(bitmap->mddev, nodes);
+		if (err) {
+			pr_err("%s: Could not setup cluster service (%d)\n",
+					bmname(bitmap), err);
+			goto out_no_sb;
+		}
+		bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+		goto re_read;
+	}
+
+
 out_no_sb:
 	if (test_bit(BITMAP_STALE, &bitmap->flags))
 		bitmap->events_cleared = bitmap->mddev->events;
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
 	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+	bitmap->mddev->bitmap_info.nodes = nodes;
 	if (bitmap->mddev->bitmap_info.space == 0 ||
 	    bitmap->mddev->bitmap_info.space > sectors_reserved)
 		bitmap->mddev->bitmap_info.space = sectors_reserved;
-	if (err)
+	if (err) {
 		bitmap_print_sb(bitmap);
+		if (bitmap->cluster_slot < 0)
+			md_cluster_stop(bitmap->mddev);
+	}
 	return err;
 }
 
@@ -692,9 +737,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store,
 }
 
 static int bitmap_storage_alloc(struct bitmap_storage *store,
-				unsigned long chunks, int with_super)
+				unsigned long chunks, int with_super,
+				int slot_number)
 {
-	int pnum;
+	int pnum, offset = 0;
 	unsigned long num_pages;
 	unsigned long bytes;
 
@@ -703,6 +749,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		bytes += sizeof(bitmap_super_t);
 
 	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
+	offset = slot_number * (num_pages - 1);
 
 	store->filemap = kmalloc(sizeof(struct page *)
 				 * num_pages, GFP_KERNEL);
@@ -713,20 +760,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (store->sb_page == NULL)
 			return -ENOMEM;
-		store->sb_page->index = 0;
 	}
+
 	pnum = 0;
 	if (store->sb_page) {
 		store->filemap[0] = store->sb_page;
 		pnum = 1;
+		store->sb_page->index = offset;
 	}
+
 	for ( ; pnum < num_pages; pnum++) {
 		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (!store->filemap[pnum]) {
 			store->file_pages = pnum;
 			return -ENOMEM;
 		}
-		store->filemap[pnum]->index = pnum;
+		store->filemap[pnum]->index = pnum + offset;
 	}
 	store->file_pages = pnum;
 
@@ -885,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	}
 }
 
+static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
+{
+	unsigned long bit;
+	struct page *page;
+	void *paddr;
+	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	int set = 0;
+
+	page = filemap_get_page(&bitmap->storage, chunk);
+	if (!page)
+		return -EINVAL;
+	bit = file_page_offset(&bitmap->storage, chunk);
+	paddr = kmap_atomic(page);
+	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+		set = test_bit(bit, paddr);
+	else
+		set = test_bit_le(bit, paddr);
+	kunmap_atomic(paddr);
+	return set;
+}
+
+
 /* this gets called when the md device is ready to unplug its underlying
  * (slave) device queues -- before we let any writes go down, we need to
  * sync the dirty pages of the bitmap file to disk */
@@ -935,7 +1006,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
  */
 static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 {
-	unsigned long i, chunks, index, oldindex, bit;
+	unsigned long i, chunks, index, oldindex, bit, node_offset = 0;
 	struct page *page = NULL;
 	unsigned long bit_cnt = 0;
 	struct file *file;
@@ -981,6 +1052,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		if (!bitmap->mddev->bitmap_info.external)
 			offset = sizeof(bitmap_super_t);
 
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
+
 	for (i = 0; i < chunks; i++) {
 		int b;
 		index = file_page_index(&bitmap->storage, i);
@@ -1001,7 +1075,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 					bitmap->mddev,
 					bitmap->mddev->bitmap_info.offset,
 					page,
-					index, count);
+					index + node_offset, count);
 
 			if (ret)
 				goto err;
@@ -1207,7 +1281,6 @@ void bitmap_daemon_work(struct mddev *mddev)
 	     j < bitmap->storage.file_pages
 		     && !test_bit(BITMAP_STALE, &bitmap->flags);
 	     j++) {
-
 		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_DIRTY))
 			/* bitmap_unplug will handle the rest */
@@ -1530,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
 		return;
 	}
 	if (!*bmc) {
-		*bmc = 2 | (needed ? NEEDED_MASK : 0);
+		*bmc = 2;
 		bitmap_count_page(&bitmap->counts, offset, 1);
 		bitmap_set_pending(&bitmap->counts, offset);
 		bitmap->allclean = 0;
 	}
+	if (needed)
+		*bmc |= NEEDED_MASK;
 	spin_unlock_irq(&bitmap->counts.lock);
 }
 
@@ -1591,6 +1666,10 @@ static void bitmap_free(struct bitmap *bitmap)
 	if (!bitmap) /* there was no bitmap */
 		return;
 
+	if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
+		bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
+		md_cluster_stop(bitmap->mddev);
+
 	/* Shouldn't be needed - but just in case.... */
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes) == 0);
@@ -1636,7 +1715,7 @@ void bitmap_destroy(struct mddev *mddev)
  * initialize the bitmap structure
  * if this returns an error, bitmap_destroy must be called to do clean up
  */
-int bitmap_create(struct mddev *mddev)
+struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 {
 	struct bitmap *bitmap;
 	sector_t blocks = mddev->resync_max_sectors;
@@ -1650,7 +1729,7 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 
 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&bitmap->counts.lock);
 	atomic_set(&bitmap->pending_writes, 0);
@@ -1659,6 +1738,7 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 	init_waitqueue_head(&bitmap->behind_wait);
 
 	bitmap->mddev = mddev;
+	bitmap->cluster_slot = slot;
 
 	if (mddev->kobj.sd)
 		bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
@@ -1706,12 +1786,14 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
 	       bitmap->counts.pages, bmname(bitmap));
 
-	mddev->bitmap = bitmap;
-	return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+	err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+	if (err)
+		goto error;
+
+	return bitmap;
 error:
 	bitmap_free(bitmap);
-	return err;
+	return ERR_PTR(err);
 }
 
 int bitmap_load(struct mddev *mddev)
@@ -1765,6 +1847,60 @@ out:
 }
 EXPORT_SYMBOL_GPL(bitmap_load);
 
+/* Loads the bitmap associated with slot and copies the resync information
+ * to our bitmap
+ */
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+		sector_t *low, sector_t *high, bool clear_bits)
+{
+	int rv = 0, i, j;
+	sector_t block, lo = 0, hi = 0;
+	struct bitmap_counts *counts;
+	struct bitmap *bitmap = bitmap_create(mddev, slot);
+
+	if (IS_ERR(bitmap))
+		return PTR_ERR(bitmap);
+
+	rv = bitmap_read_sb(bitmap);
+	if (rv)
+		goto err;
+
+	rv = bitmap_init_from_disk(bitmap, 0);
+	if (rv)
+		goto err;
+
+	counts = &bitmap->counts;
+	for (j = 0; j < counts->chunks; j++) {
+		block = (sector_t)j << counts->chunkshift;
+		if (bitmap_file_test_bit(bitmap, block)) {
+			if (!lo)
+				lo = block;
+			hi = block;
+			bitmap_file_clear_bit(bitmap, block);
+			bitmap_set_memory_bits(mddev->bitmap, block, 1);
+			bitmap_file_set_bit(mddev->bitmap, block);
+		}
+	}
+
+	if (clear_bits) {
+		bitmap_update_sb(bitmap);
+		/* Setting this for the ev_page should be enough.
+		 * And we do not require both write_all and PAGE_DIRT either
+		 */
+		for (i = 0; i < bitmap->storage.file_pages; i++)
+			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+		bitmap_write_all(bitmap);
+		bitmap_unplug(bitmap);
+	}
+	*low = lo;
+	*high = hi;
+err:
+	bitmap_free(bitmap);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
+
+
 void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 {
 	unsigned long chunk_kb;
@@ -1849,7 +1985,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 	memset(&store, 0, sizeof(store));
 	if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
 		ret = bitmap_storage_alloc(&store, chunks,
-					   !bitmap->mddev->bitmap_info.external);
+					   !bitmap->mddev->bitmap_info.external,
+					   bitmap->cluster_slot);
 	if (ret)
 		goto err;
 
@@ -2021,13 +2158,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 			return -EINVAL;
 		mddev->bitmap_info.offset = offset;
 		if (mddev->pers) {
+			struct bitmap *bitmap;
 			mddev->pers->quiesce(mddev, 1);
-			rv = bitmap_create(mddev);
-			if (!rv)
+			bitmap = bitmap_create(mddev, -1);
+			if (IS_ERR(bitmap))
+				rv = PTR_ERR(bitmap);
+			else {
+				mddev->bitmap = bitmap;
 				rv = bitmap_load(mddev);
-			if (rv) {
-				bitmap_destroy(mddev);
-				mddev->bitmap_info.offset = 0;
+				if (rv) {
+					bitmap_destroy(mddev);
+					mddev->bitmap_info.offset = 0;
+				}
 			}
 			mddev->pers->quiesce(mddev, 0);
 			if (rv)
@@ -2186,6 +2328,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
 
 static ssize_t metadata_show(struct mddev *mddev, char *page)
 {
+	if (mddev_is_clustered(mddev))
+		return sprintf(page, "clustered\n");
 	return sprintf(page, "%s\n", (mddev->bitmap_info.external
 				      ? "external" : "internal"));
 }
@@ -2198,7 +2342,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EBUSY;
 	if (strncmp(buf, "external", 8) == 0)
 		mddev->bitmap_info.external = 1;
-	else if (strncmp(buf, "internal", 8) == 0)
+	else if ((strncmp(buf, "internal", 8) == 0) ||
+			(strncmp(buf, "clustered", 9) == 0))
 		mddev->bitmap_info.external = 0;
 	else
 		return -EINVAL;
drivers/md/bitmap.h
@@ -130,8 +130,9 @@ typedef struct bitmap_super_s {
 	__le32 write_behind; /* 60  number of outstanding write-behind writes */
 	__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
 				  * reserved for the bitmap. */
-	__u8  pad[256 - 68]; /* set to zero */
+	__le32 nodes; /* 68 the maximum number of nodes in cluster. */
+	__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+	__u8  pad[256 - 136]; /* set to zero */
 } bitmap_super_t;
 
 /* notes:
@@ -226,12 +227,13 @@ struct bitmap {
 	wait_queue_head_t behind_wait;
 
 	struct kernfs_node *sysfs_can_clear;
+	int cluster_slot;		/* Slot offset for clustered env */
 };
 
 /* the bitmap API */
 
 /* these are used only by md/bitmap */
-int  bitmap_create(struct mddev *mddev);
+struct bitmap *bitmap_create(struct mddev *mddev, int slot);
 int bitmap_load(struct mddev *mddev);
 void bitmap_flush(struct mddev *mddev);
 void bitmap_destroy(struct mddev *mddev);
@@ -260,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev);
 
 int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		  int chunksize, int init);
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+				sector_t *lo, sector_t *hi, bool clear_bits);
 #endif
 
 #endif
drivers/md/md-cluster.c (new file, 965 lines)
@@ -0,0 +1,965 @@
/*
 * Copyright (C) 2015, SUSE
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 */


#include <linux/module.h>
#include <linux/dlm.h>
#include <linux/sched.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#define LVB_SIZE	64
#define NEW_DEV_TIMEOUT 5000

struct dlm_lock_resource {
	dlm_lockspace_t *ls;
	struct dlm_lksb lksb;
	char *name; /* lock name. */
	uint32_t flags; /* flags to pass to dlm_lock() */
	struct completion completion; /* completion for synchronized locking */
	void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
	struct mddev *mddev; /* pointing back to mddev. */
};

struct suspend_info {
	int slot;
	sector_t lo;
	sector_t hi;
	struct list_head list;
};

struct resync_info {
	__le64 lo;
	__le64 hi;
};

/* md_cluster_info flags */
#define		MD_CLUSTER_WAITING_FOR_NEWDISK		1


struct md_cluster_info {
	/* dlm lock space and resources for clustered raid. */
	dlm_lockspace_t *lockspace;
	int slot_number;
	struct completion completion;
	struct dlm_lock_resource *sb_lock;
	struct mutex sb_mutex;
	struct dlm_lock_resource *bitmap_lockres;
	struct list_head suspend_list;
	spinlock_t suspend_lock;
	struct md_thread *recovery_thread;
	unsigned long recovery_map;
	/* communication lock resources */
	struct dlm_lock_resource *ack_lockres;
	struct dlm_lock_resource *message_lockres;
	struct dlm_lock_resource *token_lockres;
	struct dlm_lock_resource *no_new_dev_lockres;
	struct md_thread *recv_thread;
	struct completion newdisk_completion;
	unsigned long state;
};

enum msg_type {
	METADATA_UPDATED = 0,
	RESYNCING,
	NEWDISK,
	REMOVE,
	RE_ADD,
};

struct cluster_msg {
	int type;
	int slot;
	/* TODO: Unionize this for smaller footprint */
	sector_t low;
	sector_t high;
	char uuid[16];
	int raid_slot;
};

static void sync_ast(void *arg)
{
	struct dlm_lock_resource *res;

	res = (struct dlm_lock_resource *) arg;
	complete(&res->completion);
}

static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
	int ret = 0;

	init_completion(&res->completion);
	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
	if (ret)
		return ret;
	wait_for_completion(&res->completion);
	return res->lksb.sb_status;
}

static int dlm_unlock_sync(struct dlm_lock_resource *res)
{
	return dlm_lock_sync(res, DLM_LOCK_NL);
}
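
/* Descriptive note: "unlock" here is a synchronous down-convert to NL
 * rather than a real dlm_unlock() -- the lock stays attached to the
 * resource, so later requests can reuse it as conversions (DLM_LKF_CONVERT
 * is set in lockres_init() below); the lkb is only dropped for good in
 * lockres_free(). */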
|
||||||
|
|
||||||
|
static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
|
||||||
|
char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
|
||||||
|
{
|
||||||
|
struct dlm_lock_resource *res = NULL;
|
||||||
|
int ret, namelen;
|
||||||
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||||
|
|
||||||
|
res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
|
||||||
|
if (!res)
|
||||||
|
return NULL;
|
||||||
|
res->ls = cinfo->lockspace;
|
||||||
|
res->mddev = mddev;
|
||||||
|
namelen = strlen(name);
|
||||||
|
res->name = kzalloc(namelen + 1, GFP_KERNEL);
|
||||||
|
if (!res->name) {
|
||||||
|
pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
|
||||||
|
goto out_err;
|
||||||
|
}
|
||||||
|
strlcpy(res->name, name, namelen + 1);
|
||||||
|
if (with_lvb) {
|
||||||
|
res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
|
||||||
|
if (!res->lksb.sb_lvbptr) {
|
||||||
|
pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
|
||||||
|
goto out_err;
|
||||||
|
}
|
||||||
|
res->flags = DLM_LKF_VALBLK;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bastfn)
|
||||||
|
res->bast = bastfn;
|
||||||
|
|
||||||
|
res->flags |= DLM_LKF_EXPEDITE;
|
||||||
|
|
||||||
|
ret = dlm_lock_sync(res, DLM_LOCK_NL);
|
||||||
|
if (ret) {
|
||||||
|
pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
|
||||||
|
goto out_err;
|
||||||
|
}
|
||||||
|
res->flags &= ~DLM_LKF_EXPEDITE;
|
||||||
|
res->flags |= DLM_LKF_CONVERT;
|
||||||
|
|
||||||
|
return res;
|
||||||
|
out_err:
|
||||||
|
kfree(res->lksb.sb_lvbptr);
|
||||||
|
kfree(res->name);
|
||||||
|
kfree(res);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void lockres_free(struct dlm_lock_resource *res)
|
||||||
|
{
|
||||||
|
if (!res)
|
||||||
|
return;
|
||||||
|
|
||||||
|
init_completion(&res->completion);
|
||||||
|
dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
|
||||||
|
wait_for_completion(&res->completion);
|
||||||
|
|
||||||
|
kfree(res->name);
|
||||||
|
kfree(res->lksb.sb_lvbptr);
|
||||||
|
kfree(res);
|
||||||
|
}
|
||||||
|
|
||||||
|
static char *pretty_uuid(char *dest, char *src)
|
||||||
|
{
|
||||||
|
int i, len = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < 16; i++) {
|
||||||
|
if (i == 4 || i == 6 || i == 8 || i == 10)
|
||||||
|
len += sprintf(dest + len, "-");
|
||||||
|
len += sprintf(dest + len, "%02x", (__u8)src[i]);
|
||||||
|
}
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
|
||||||
|
sector_t lo, sector_t hi)
|
||||||
|
{
|
||||||
|
struct resync_info *ri;
|
||||||
|
|
||||||
|
ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
|
||||||
|
ri->lo = cpu_to_le64(lo);
|
||||||
|
ri->hi = cpu_to_le64(hi);
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
|
||||||
|
{
|
||||||
|
struct resync_info ri;
|
||||||
|
struct suspend_info *s = NULL;
|
||||||
|
sector_t hi = 0;
|
||||||
|
|
||||||
|
dlm_lock_sync(lockres, DLM_LOCK_CR);
|
||||||
|
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
|
||||||
|
hi = le64_to_cpu(ri.hi);
|
||||||
|
if (ri.hi > 0) {
|
||||||
|
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
|
||||||
|
if (!s)
|
||||||
|
goto out;
|
||||||
|
s->hi = hi;
|
||||||
|
s->lo = le64_to_cpu(ri.lo);
|
||||||
|
}
|
||||||
|
dlm_unlock_sync(lockres);
|
||||||
|
out:
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void recover_bitmaps(struct md_thread *thread)
|
||||||
|
{
|
||||||
|
struct mddev *mddev = thread->mddev;
|
||||||
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||||
|
struct dlm_lock_resource *bm_lockres;
|
||||||
|
char str[64];
|
||||||
|
int slot, ret;
|
||||||
|
struct suspend_info *s, *tmp;
|
||||||
|
sector_t lo, hi;
|
||||||
|
|
||||||
|
while (cinfo->recovery_map) {
|
||||||
|
slot = fls64((u64)cinfo->recovery_map) - 1;
|
||||||
|
|
||||||
|
/* Clear suspend_area associated with the bitmap */
|
||||||
|
spin_lock_irq(&cinfo->suspend_lock);
|
||||||
|
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
|
||||||
|
if (slot == s->slot) {
|
||||||
|
list_del(&s->list);
|
||||||
|
kfree(s);
|
||||||
|
}
|
||||||
|
spin_unlock_irq(&cinfo->suspend_lock);
|
||||||
|
|
||||||
|
snprintf(str, 64, "bitmap%04d", slot);
|
||||||
|
bm_lockres = lockres_init(mddev, str, NULL, 1);
|
||||||
|
if (!bm_lockres) {
|
||||||
|
pr_err("md-cluster: Cannot initialize bitmaps\n");
|
||||||
|
goto clear_bit;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
|
||||||
|
if (ret) {
|
||||||
|
pr_err("md-cluster: Could not DLM lock %s: %d\n",
|
||||||
|
str, ret);
|
||||||
|
goto clear_bit;
|
||||||
|
}
|
||||||
|
ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
|
||||||
|
if (ret) {
|
||||||
|
pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
|
||||||
|
goto dlm_unlock;
|
||||||
|
}
|
||||||
|
if (hi > 0) {
|
||||||
|
/* TODO:Wait for current resync to get over */
|
||||||
|
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||||
|
if (lo < mddev->recovery_cp)
|
||||||
|
mddev->recovery_cp = lo;
|
||||||
|
md_check_recovery(mddev);
|
||||||
|
}
|
||||||
|
dlm_unlock:
|
||||||
|
dlm_unlock_sync(bm_lockres);
|
||||||
|
clear_bit:
|
||||||
|
clear_bit(slot, &cinfo->recovery_map);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void recover_prep(void *arg)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
static void recover_slot(void *arg, struct dlm_slot *slot)
|
||||||
|
{
|
||||||
|
struct mddev *mddev = arg;
|
||||||
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||||
|
|
||||||
|
pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
|
||||||
|
mddev->bitmap_info.cluster_name,
|
||||||
|
slot->nodeid, slot->slot,
|
||||||
|
cinfo->slot_number);
|
||||||
|
set_bit(slot->slot - 1, &cinfo->recovery_map);
|
||||||
|
if (!cinfo->recovery_thread) {
|
||||||
|
cinfo->recovery_thread = md_register_thread(recover_bitmaps,
|
||||||
|
mddev, "recover");
|
||||||
|
if (!cinfo->recovery_thread) {
|
||||||
|
pr_warn("md-cluster: Could not create recovery thread\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
md_wakeup_thread(cinfo->recovery_thread);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void recover_done(void *arg, struct dlm_slot *slots,
|
||||||
|
int num_slots, int our_slot,
|
||||||
|
uint32_t generation)
|
||||||
|
{
|
||||||
|
struct mddev *mddev = arg;
|
||||||
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||||
|
|
||||||
|
cinfo->slot_number = our_slot;
|
||||||
|
complete(&cinfo->completion);
|
||||||
|
}
|
||||||
|
|
||||||
|
static const struct dlm_lockspace_ops md_ls_ops = {
|
||||||
|
.recover_prep = recover_prep,
|
||||||
|
.recover_slot = recover_slot,
|
||||||
|
.recover_done = recover_done,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The BAST function for the ack lock resource
|
||||||
|
* This function wakes up the receive thread in
|
||||||
|
* order to receive and process the message.
|
||||||
|
*/
|
||||||
|
static void ack_bast(void *arg, int mode)
|
||||||
|
{
|
||||||
|
struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
|
||||||
|
struct md_cluster_info *cinfo = res->mddev->cluster_info;
|
||||||
|
|
||||||
|
if (mode == DLM_LOCK_EX)
|
||||||
|
md_wakeup_thread(cinfo->recv_thread);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
|
||||||
|
{
|
||||||
|
struct suspend_info *s, *tmp;
|
||||||
|
|
||||||
|
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
|
||||||
|
if (slot == s->slot) {
|
||||||
|
pr_info("%s:%d Deleting suspend_info: %d\n",
|
||||||
|
__func__, __LINE__, slot);
|
||||||
|
list_del(&s->list);
|
||||||
|
kfree(s);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
|
||||||
|
{
|
||||||
|
spin_lock_irq(&cinfo->suspend_lock);
|
||||||
|
__remove_suspend_info(cinfo, slot);
|
||||||
|
spin_unlock_irq(&cinfo->suspend_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void process_suspend_info(struct md_cluster_info *cinfo,
|
||||||
|
int slot, sector_t lo, sector_t hi)
|
||||||
|
{
|
||||||
|
struct suspend_info *s;
|
||||||
|
|
||||||
|
if (!hi) {
|
||||||
|
remove_suspend_info(cinfo, slot);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
|
||||||
|
if (!s)
|
||||||
|
return;
|
||||||
|
s->slot = slot;
|
||||||
|
s->lo = lo;
|
||||||
|
s->hi = hi;
|
||||||
|
spin_lock_irq(&cinfo->suspend_lock);
|
||||||
|
/* Remove existing entry (if exists) before adding */
|
||||||
|
__remove_suspend_info(cinfo, slot);
|
||||||
|
list_add(&s->list, &cinfo->suspend_list);
|
||||||
|
spin_unlock_irq(&cinfo->suspend_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
|
||||||
|
{
|
||||||
|
char disk_uuid[64];
|
||||||
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||||
|
char event_name[] = "EVENT=ADD_DEVICE";
|
||||||
|
char raid_slot[16];
|
||||||
|
char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
|
||||||
|
int len;
|
||||||
|
|
||||||
|
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
|
||||||
|
pretty_uuid(disk_uuid + len, cmsg->uuid);
|
||||||
|
snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
|
||||||
|
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
|
||||||
|
init_completion(&cinfo->newdisk_completion);
|
||||||
|
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
|
||||||
|
kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
|
||||||
|
wait_for_completion_timeout(&cinfo->newdisk_completion,
|
||||||
|
NEW_DEV_TIMEOUT);
|
||||||
|
clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
|
||||||
|
{
|
||||||
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||||
|
|
||||||
|
md_reload_sb(mddev);
|
||||||
|
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
|
||||||
|
{
|
||||||
|
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
|
||||||
|
|
||||||
|
if (rdev)
|
||||||
|
md_kick_rdev_from_array(rdev);
|
||||||
|
else
|
||||||
|
pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
|
||||||
|
{
|
||||||
|
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
|
||||||
|
|
||||||
|
if (rdev && test_bit(Faulty, &rdev->flags))
|
||||||
|
clear_bit(Faulty, &rdev->flags);
|
||||||
|
else
|
||||||
|
pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
|
||||||
|
{
|
||||||
|
switch (msg->type) {
|
||||||
|
case METADATA_UPDATED:
|
||||||
|
pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
|
||||||
|
__func__, __LINE__, msg->slot);
|
||||||
|
process_metadata_update(mddev, msg);
|
||||||
|
break;
|
||||||
|
case RESYNCING:
|
||||||
|
pr_info("%s: %d Received message: RESYNCING from %d\n",
|
||||||
|
__func__, __LINE__, msg->slot);
|
||||||
|
process_suspend_info(mddev->cluster_info, msg->slot,
|
||||||
|
msg->low, msg->high);
|
||||||
|
break;
|
||||||
|
case NEWDISK:
|
||||||
|
pr_info("%s: %d Received message: NEWDISK from %d\n",
|
||||||
|
__func__, __LINE__, msg->slot);
|
||||||
|
process_add_new_disk(mddev, msg);
|
||||||
|
break;
|
||||||
|
case REMOVE:
|
||||||
|
pr_info("%s: %d Received REMOVE from %d\n",
|
||||||
|
__func__, __LINE__, msg->slot);
|
||||||
|
process_remove_disk(mddev, msg);
|
||||||
|
break;
|
||||||
|
case RE_ADD:
|
||||||
|
pr_info("%s: %d Received RE_ADD from %d\n",
|
||||||
|
__func__, __LINE__, msg->slot);
|
||||||
|
process_readd_disk(mddev, msg);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
pr_warn("%s:%d Received unknown message from %d\n",
|
||||||
|
__func__, __LINE__, msg->slot);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* thread for receiving message
|
||||||
|
*/
|
||||||
|
static void recv_daemon(struct md_thread *thread)
|
||||||
|
{
|
||||||
|
struct md_cluster_info *cinfo = thread->mddev->cluster_info;
|
||||||
|
struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
|
||||||
|
struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
|
||||||
|
struct cluster_msg msg;
|
||||||
|
|
||||||
|
/*get CR on Message*/
|
||||||
|
if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
|
||||||
|
pr_err("md/raid1:failed to get CR on MESSAGE\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* read lvb and wake up thread to process this message_lockres */
|
||||||
|
memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
|
||||||
|
process_recvd_msg(thread->mddev, &msg);
|
||||||
|
|
||||||
|
/*release CR on ack_lockres*/
|
||||||
|
dlm_unlock_sync(ack_lockres);
|
||||||
|
/*up-convert to EX on message_lockres*/
|
||||||
|
dlm_lock_sync(message_lockres, DLM_LOCK_EX);
|
||||||
|
/*get CR on ack_lockres again*/
|
||||||
|
dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
|
||||||
|
/*release CR on message_lockres*/
|
||||||
|
dlm_unlock_sync(message_lockres);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* lock_comm()
|
||||||
|
* Takes the lock on the TOKEN lock resource so no other
|
||||||
|
* node can communicate while the operation is underway.
|
||||||
|
*/
|
||||||
|
static int lock_comm(struct md_cluster_info *cinfo)
|
||||||
|
{
|
||||||
|
int error;
|
||||||
|
|
||||||
|
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
|
||||||
|
if (error)
|
||||||
|
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
|
||||||
|
__func__, __LINE__, error);
|
||||||
|
return error;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void unlock_comm(struct md_cluster_info *cinfo)
|
||||||
|
{
|
||||||
|
dlm_unlock_sync(cinfo->token_lockres);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* __sendmsg()
|
||||||
|
* This function performs the actual sending of the message. This function is
|
||||||
|
* usually called after performing the encompassing operation
|
||||||
|
* The function:
|
||||||
|
* 1. Grabs the message lockresource in EX mode
|
||||||
|
* 2. Copies the message to the message LVB
|
||||||
|
* 3. Downconverts message lockresource to CR
|
||||||
|
* 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
|
||||||
|
* and the other nodes read the message. The thread will wait here until all other
|
||||||
|
* nodes have released ack lock resource.
|
||||||
|
* 5. Downconvert ack lockresource to CR
|
||||||
|
*/
|
||||||
|
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
|
||||||
|
{
|
||||||
|
int error;
|
||||||
|
int slot = cinfo->slot_number - 1;
|
||||||
|
|
||||||
|
cmsg->slot = cpu_to_le32(slot);
|
||||||
|
/*get EX on Message*/
|
||||||
|
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
|
||||||
|
if (error) {
|
||||||
|
pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
|
||||||
|
goto failed_message;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
|
||||||
|
sizeof(struct cluster_msg));
|
||||||
|
/*down-convert EX to CR on Message*/
|
||||||
|
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
|
||||||
|
if (error) {
|
||||||
|
pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
|
||||||
|
error);
|
||||||
|
goto failed_message;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*up-convert CR to EX on Ack*/
|
||||||
|
error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
|
||||||
|
if (error) {
|
||||||
|
pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
|
||||||
|
error);
|
||||||
|
goto failed_ack;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*down-convert EX to CR on Ack*/
|
||||||
|
error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
|
||||||
|
if (error) {
|
||||||
|
pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
|
||||||
|
error);
|
||||||
|
goto failed_ack;
|
||||||
|
}
|
||||||
|
|
||||||
|
failed_ack:
|
||||||
|
dlm_unlock_sync(cinfo->message_lockres);
|
||||||
|
failed_message:
|
||||||
|
return error;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
lock_comm(cinfo);
|
||||||
|
ret = __sendmsg(cinfo, cmsg);
|
||||||
|
unlock_comm(cinfo);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int gather_all_resync_info(struct mddev *mddev, int total_slots)
|
||||||
|
{
|
||||||
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
||||||
|
int i, ret = 0;
|
||||||
|
struct dlm_lock_resource *bm_lockres;
|
||||||
|
struct suspend_info *s;
|
||||||
|
char str[64];
|
||||||
|
|
||||||
|
|
||||||
|
for (i = 0; i < total_slots; i++) {
|
||||||
|
memset(str, '\0', 64);
|
||||||
|
snprintf(str, 64, "bitmap%04d", i);
|
||||||
|
bm_lockres = lockres_init(mddev, str, NULL, 1);
|
||||||
|
if (!bm_lockres)
|
||||||
|
return -ENOMEM;
|
||||||
|
if (i == (cinfo->slot_number - 1))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
bm_lockres->flags |= DLM_LKF_NOQUEUE;
|
||||||
|
ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
|
||||||
|
if (ret == -EAGAIN) {
|
||||||
|
memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
|
||||||
|
s = read_resync_info(mddev, bm_lockres);
|
||||||
|
if (s) {
|
||||||
|
pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
|
||||||
|
__func__, __LINE__,
|
||||||
|
(unsigned long long) s->lo,
|
||||||
|
(unsigned long long) s->hi, i);
|
||||||
|
spin_lock_irq(&cinfo->suspend_lock);
|
||||||
|
s->slot = i;
|
||||||
|
list_add(&s->list, &cinfo->suspend_list);
|
||||||
|
spin_unlock_irq(&cinfo->suspend_lock);
|
||||||
|
}
|
||||||
|
ret = 0;
|
||||||
|
lockres_free(bm_lockres);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (ret)
|
||||||
|
goto out;
|
||||||
|
/* TODO: Read the disk bitmap sb and check if it needs recovery */
|
||||||
|
dlm_unlock_sync(bm_lockres);
|
||||||
|
lockres_free(bm_lockres);
|
||||||
|
}
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
|
}

static int join(struct mddev *mddev, int nodes)
{
	struct md_cluster_info *cinfo;
	int ret, ops_rv;
	char str[64];

	if (!try_module_get(THIS_MODULE))
		return -ENOENT;

	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
	if (!cinfo)
		return -ENOMEM;

	init_completion(&cinfo->completion);

	mutex_init(&cinfo->sb_mutex);
	mddev->cluster_info = cinfo;

	memset(str, 0, 64);
	pretty_uuid(str, mddev->uuid);
	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
				DLM_LSFL_FS, LVB_SIZE,
				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
	if (ret)
		goto err;
	wait_for_completion(&cinfo->completion);
	if (nodes < cinfo->slot_number) {
		pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).",
			cinfo->slot_number, nodes);
		ret = -ERANGE;
		goto err;
	}
	cinfo->sb_lock = lockres_init(mddev, "cmd-super",
					NULL, 0);
	if (!cinfo->sb_lock) {
		ret = -ENOMEM;
		goto err;
	}
	/* Initiate the communication resources */
	ret = -ENOMEM;
	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
	if (!cinfo->recv_thread) {
		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
		goto err;
	}
	cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
	if (!cinfo->message_lockres)
		goto err;
	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
	if (!cinfo->token_lockres)
		goto err;
	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
	if (!cinfo->ack_lockres)
		goto err;
	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
	if (!cinfo->no_new_dev_lockres)
		goto err;

	/* get sync CR lock on ACK. */
	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
				ret);
	/* get sync CR lock on no-new-dev. */
	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);

	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
	if (!cinfo->bitmap_lockres)
		goto err;
	if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
		pr_err("Failed to get bitmap lock\n");
		ret = -EINVAL;
		goto err;
	}

	INIT_LIST_HEAD(&cinfo->suspend_list);
	spin_lock_init(&cinfo->suspend_lock);

	ret = gather_all_resync_info(mddev, nodes);
	if (ret)
		goto err;

	return 0;
err:
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->bitmap_lockres);
	lockres_free(cinfo->sb_lock);
	if (cinfo->lockspace)
		dlm_release_lockspace(cinfo->lockspace, 2);
	mddev->cluster_info = NULL;
	kfree(cinfo);
	module_put(THIS_MODULE);
	return ret;
}

static int leave(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!cinfo)
		return 0;
	md_unregister_thread(&cinfo->recovery_thread);
	md_unregister_thread(&cinfo->recv_thread);
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->sb_lock);
	lockres_free(cinfo->bitmap_lockres);
	dlm_release_lockspace(cinfo->lockspace, 2);
	return 0;
}

/* slot_number(): Returns the MD slot number to use
 * DLM starts the slot numbers from 1, whereas cluster-md
 * wants the number to be from zero, so we deduct one
 */
static int slot_number(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	return cinfo->slot_number - 1;
}

static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
	/* Re-acquire the lock to refresh LVB */
	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
}

static int metadata_update_start(struct mddev *mddev)
{
	return lock_comm(mddev->cluster_info);
}

static int metadata_update_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int ret;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(METADATA_UPDATED);
	ret = __sendmsg(cinfo, &cmsg);
	unlock_comm(cinfo);
	return ret;
}

static int metadata_update_cancel(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	return dlm_unlock_sync(cinfo->token_lockres);
}

static int resync_send(struct mddev *mddev, enum msg_type type,
		sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int slot = cinfo->slot_number - 1;

	pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
			(unsigned long long)lo,
			(unsigned long long)hi);
	resync_info_update(mddev, lo, hi);
	cmsg.type = cpu_to_le32(type);
	cmsg.slot = cpu_to_le32(slot);
	cmsg.low = cpu_to_le64(lo);
	cmsg.high = cpu_to_le64(hi);
	return sendmsg(cinfo, &cmsg);
}

static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
{
	pr_info("%s:%d\n", __func__, __LINE__);
	return resync_send(mddev, RESYNCING, lo, hi);
}

static void resync_finish(struct mddev *mddev)
{
	pr_info("%s:%d\n", __func__, __LINE__);
	resync_send(mddev, RESYNCING, 0, 0);
}

static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret = 0;
	struct suspend_info *s;

	spin_lock_irq(&cinfo->suspend_lock);
	if (list_empty(&cinfo->suspend_list))
		goto out;
	list_for_each_entry(s, &cinfo->suspend_list, list)
		if (hi > s->lo && lo < s->hi) {
			ret = 1;
			break;
		}
out:
	spin_unlock_irq(&cinfo->suspend_lock);
	return ret;
}
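
/*
 * Illustrative sketch, not part of this commit: area_resyncing() walks
 * the suspend list with the usual half-open interval test -- two ranges
 * [a_lo, a_hi) and [b_lo, b_hi) intersect exactly when each starts
 * before the other ends.  The raid1 changes later in this commit call
 * it to stall writes into, and bias reads away from, such windows.
 */
static inline int example_ranges_overlap(sector_t a_lo, sector_t a_hi,
					 sector_t b_lo, sector_t b_hi)
{
	return a_hi > b_lo && b_hi > a_lo;
}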

static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int ret = 0;
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	char *uuid = sb->device_uuid;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(NEWDISK);
	memcpy(cmsg.uuid, uuid, 16);
	cmsg.raid_slot = rdev->desc_nr;
	lock_comm(cinfo);
	ret = __sendmsg(cinfo, &cmsg);
	if (ret)
		return ret;
	cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
	/* Some node does not "see" the device */
	if (ret == -EAGAIN)
		ret = -ENOENT;
	else
		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
	return ret;
}

static int add_new_disk_finish(struct mddev *mddev)
{
	struct cluster_msg cmsg;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret;
	/* Write sb and inform others */
	md_update_sb(mddev, 1);
	cmsg.type = METADATA_UPDATED;
	ret = __sendmsg(cinfo, &cmsg);
	unlock_comm(cinfo);
	return ret;
}

static int new_disk_ack(struct mddev *mddev, bool ack)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
		pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
		return -EINVAL;
	}

	if (ack)
		dlm_unlock_sync(cinfo->no_new_dev_lockres);
	complete(&cinfo->newdisk_completion);
	return 0;
}

static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct cluster_msg cmsg;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	cmsg.type = REMOVE;
	cmsg.raid_slot = rdev->desc_nr;
	return __sendmsg(cinfo, &cmsg);
}

static int gather_bitmaps(struct md_rdev *rdev)
{
	int sn, err;
	sector_t lo, hi;
	struct cluster_msg cmsg;
	struct mddev *mddev = rdev->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cmsg.type = RE_ADD;
	cmsg.raid_slot = rdev->desc_nr;
	err = sendmsg(cinfo, &cmsg);
	if (err)
		goto out;

	for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
		if (sn == (cinfo->slot_number - 1))
			continue;
		err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
		if (err) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
			goto out;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp))
			mddev->recovery_cp = lo;
	}
out:
	return err;
}

static struct md_cluster_operations cluster_ops = {
	.join   = join,
	.leave  = leave,
	.slot_number = slot_number,
	.resync_info_update = resync_info_update,
	.resync_start = resync_start,
	.resync_finish = resync_finish,
	.metadata_update_start = metadata_update_start,
	.metadata_update_finish = metadata_update_finish,
	.metadata_update_cancel = metadata_update_cancel,
	.area_resyncing = area_resyncing,
	.add_new_disk_start = add_new_disk_start,
	.add_new_disk_finish = add_new_disk_finish,
	.new_disk_ack = new_disk_ack,
	.remove_disk = remove_disk,
	.gather_bitmaps = gather_bitmaps,
};

static int __init cluster_init(void)
{
	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
	pr_info("Registering Cluster MD functions\n");
	register_md_cluster_operations(&cluster_ops, THIS_MODULE);
	return 0;
}

static void cluster_exit(void)
{
	unregister_md_cluster_operations();
}

module_init(cluster_init);
module_exit(cluster_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");
drivers/md/md-cluster.h (new file, 29 lines)

@@ -0,0 +1,29 @@
#ifndef _MD_CLUSTER_H
#define _MD_CLUSTER_H

#include "md.h"

struct mddev;
struct md_rdev;

struct md_cluster_operations {
	int (*join)(struct mddev *mddev, int nodes);
	int (*leave)(struct mddev *mddev);
	int (*slot_number)(struct mddev *mddev);
	void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
	int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
	void (*resync_finish)(struct mddev *mddev);
	int (*metadata_update_start)(struct mddev *mddev);
	int (*metadata_update_finish)(struct mddev *mddev);
	int (*metadata_update_cancel)(struct mddev *mddev);
	int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
	int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
	int (*add_new_disk_finish)(struct mddev *mddev);
	int (*new_disk_ack)(struct mddev *mddev, bool ack);
	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
	int (*gather_bitmaps)(struct md_rdev *rdev);
};

#endif /* _MD_CLUSTER_H */
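
/*
 * Illustrative sketch, not part of this commit: every entry point in
 * struct md_cluster_operations is reached from md.c through the
 * md_cluster_ops pointer and is always guarded by mddev_is_clustered(),
 * so non-clustered arrays never pay for the hooks.  This is the pattern
 * the md.c changes below repeat around superblock writes:
 */
static void __maybe_unused example_clustered_sb_update(struct mddev *mddev)
{
	if (mddev_is_clustered(mddev))
		md_cluster_ops->metadata_update_start(mddev);	/* take the token */
	md_update_sb(mddev, 1);
	if (mddev_is_clustered(mddev))
		md_cluster_ops->metadata_update_finish(mddev);	/* broadcast + release */
}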

drivers/md/md.c (353 lines changed)

@@ -53,6 +53,7 @@
 #include <linux/slab.h>
 #include "md.h"
 #include "bitmap.h"
+#include "md-cluster.h"
 
 #ifndef MODULE
 static void autostart_arrays(int part);
@@ -66,6 +67,11 @@ static void autostart_arrays(int part);
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
+struct md_cluster_operations *md_cluster_ops;
+EXPORT_SYMBOL(md_cluster_ops);
+struct module *md_cluster_mod;
+EXPORT_SYMBOL(md_cluster_mod);
+
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 static struct workqueue_struct *md_wq;
 static struct workqueue_struct *md_misc_wq;
@@ -640,7 +646,7 @@ void mddev_unlock(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(mddev_unlock);
 
-static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
+struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
 {
 	struct md_rdev *rdev;
@@ -650,6 +656,7 @@ static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
 
 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
 {
@@ -2047,11 +2054,11 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 		int choice = 0;
 		if (mddev->pers)
 			choice = mddev->raid_disks;
-		while (find_rdev_nr_rcu(mddev, choice))
+		while (md_find_rdev_nr_rcu(mddev, choice))
 			choice++;
 		rdev->desc_nr = choice;
 	} else {
-		if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
+		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
 			rcu_read_unlock();
 			return -EBUSY;
 		}
@@ -2166,11 +2173,12 @@ static void export_rdev(struct md_rdev *rdev)
 	kobject_put(&rdev->kobj);
 }
 
-static void kick_rdev_from_array(struct md_rdev *rdev)
+void md_kick_rdev_from_array(struct md_rdev *rdev)
 {
 	unbind_rdev_from_array(rdev);
 	export_rdev(rdev);
 }
+EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
 
 static void export_array(struct mddev *mddev)
 {
@@ -2179,7 +2187,7 @@ static void export_array(struct mddev *mddev)
 	while (!list_empty(&mddev->disks)) {
 		rdev = list_first_entry(&mddev->disks, struct md_rdev,
 					same_set);
-		kick_rdev_from_array(rdev);
+		md_kick_rdev_from_array(rdev);
 	}
 	mddev->raid_disks = 0;
 	mddev->major_version = 0;
@@ -2208,7 +2216,7 @@ static void sync_sbs(struct mddev *mddev, int nospares)
 	}
 }
 
-static void md_update_sb(struct mddev *mddev, int force_change)
+void md_update_sb(struct mddev *mddev, int force_change)
 {
 	struct md_rdev *rdev;
 	int sync_req;
@@ -2369,6 +2377,37 @@ repeat:
 			wake_up(&rdev->blocked_wait);
 	}
 }
+EXPORT_SYMBOL(md_update_sb);
+
+static int add_bound_rdev(struct md_rdev *rdev)
+{
+	struct mddev *mddev = rdev->mddev;
+	int err = 0;
+
+	if (!mddev->pers->hot_remove_disk) {
+		/* If there is hot_add_disk but no hot_remove_disk
+		 * then added disks for geometry changes,
+		 * and should be added immediately.
+		 */
+		super_types[mddev->major_version].
+			validate_super(mddev, rdev);
+		err = mddev->pers->hot_add_disk(mddev, rdev);
+		if (err) {
+			unbind_rdev_from_array(rdev);
+			export_rdev(rdev);
+			return err;
+		}
+	}
+	sysfs_notify_dirent_safe(rdev->sysfs_state);
+
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	if (mddev->degraded)
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_new_event(mddev);
+	md_wakeup_thread(mddev->thread);
+	return 0;
+}
 
 /* words written to sysfs files may, or may not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
@@ -2471,10 +2510,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			err = -EBUSY;
 		else {
 			struct mddev *mddev = rdev->mddev;
-			kick_rdev_from_array(rdev);
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->remove_disk(mddev, rdev);
+			md_kick_rdev_from_array(rdev);
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->metadata_update_start(mddev);
 			if (mddev->pers)
 				md_update_sb(mddev, 1);
 			md_new_event(mddev);
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->metadata_update_finish(mddev);
 			err = 0;
 		}
 	} else if (cmd_match(buf, "writemostly")) {
@@ -2553,6 +2598,21 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			clear_bit(Replacement, &rdev->flags);
 			err = 0;
 		}
+	} else if (cmd_match(buf, "re-add")) {
+		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
+			/* clear_bit is performed _after_ all the devices
+			 * have their local Faulty bit cleared. If any writes
+			 * happen in the meantime in the local node, they
+			 * will land in the local bitmap, which will be synced
+			 * by this node eventually
+			 */
+			if (!mddev_is_clustered(rdev->mddev) ||
+			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
+				clear_bit(Faulty, &rdev->flags);
+				err = add_bound_rdev(rdev);
+			}
+		} else
+			err = -EBUSY;
 	}
 	if (!err)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -3127,7 +3187,7 @@ static void analyze_sbs(struct mddev *mddev)
 				"md: fatal superblock inconsistency in %s"
 				" -- removing from array\n",
 				bdevname(rdev->bdev,b));
-			kick_rdev_from_array(rdev);
+			md_kick_rdev_from_array(rdev);
 		}
 
 	super_types[mddev->major_version].
@@ -3142,18 +3202,27 @@ static void analyze_sbs(struct mddev *mddev)
 				"md: %s: %s: only %d devices permitted\n",
 				mdname(mddev), bdevname(rdev->bdev, b),
 				mddev->max_disks);
-			kick_rdev_from_array(rdev);
+			md_kick_rdev_from_array(rdev);
 			continue;
 		}
-		if (rdev != freshest)
+		if (rdev != freshest) {
 			if (super_types[mddev->major_version].
 			    validate_super(mddev, rdev)) {
 				printk(KERN_WARNING "md: kicking non-fresh %s"
 					" from array!\n",
 					bdevname(rdev->bdev,b));
-				kick_rdev_from_array(rdev);
+				md_kick_rdev_from_array(rdev);
 				continue;
 			}
+			/* No device should have a Candidate flag
+			 * when reading devices
+			 */
+			if (test_bit(Candidate, &rdev->flags)) {
+				pr_info("md: kicking Cluster Candidate %s from array!\n",
+					bdevname(rdev->bdev, b));
+				md_kick_rdev_from_array(rdev);
+			}
+		}
 		if (mddev->level == LEVEL_MULTIPATH) {
 			rdev->desc_nr = i++;
 			rdev->raid_disk = rdev->desc_nr;
@@ -4008,8 +4077,12 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err)
 		return err;
 	if (mddev->pers) {
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->metadata_update_start(mddev);
 		err = update_size(mddev, sectors);
 		md_update_sb(mddev, 1);
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->metadata_update_finish(mddev);
 	} else {
 		if (mddev->dev_sectors == 0 ||
 		    mddev->dev_sectors > sectors)
@@ -5077,10 +5150,16 @@ int md_run(struct mddev *mddev)
 	}
 	if (err == 0 && pers->sync_request &&
 	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
-		err = bitmap_create(mddev);
-		if (err)
+		struct bitmap *bitmap;
+
+		bitmap = bitmap_create(mddev, -1);
+		if (IS_ERR(bitmap)) {
+			err = PTR_ERR(bitmap);
 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
 			       mdname(mddev), err);
+		} else
+			mddev->bitmap = bitmap;
+
 	}
 	if (err) {
 		mddev_detach(mddev);
@@ -5232,6 +5311,8 @@ static void md_clean(struct mddev *mddev)
 
 static void __md_stop_writes(struct mddev *mddev)
 {
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	flush_workqueue(md_misc_wq);
 	if (mddev->sync_thread) {
@@ -5250,6 +5331,8 @@ static void __md_stop_writes(struct mddev *mddev)
 		mddev->in_sync = 1;
 		md_update_sb(mddev, 1);
 	}
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
 }
 
 void md_stop_writes(struct mddev *mddev)
@@ -5636,6 +5719,8 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 		info.state = (1<<MD_SB_CLEAN);
 	if (mddev->bitmap && mddev->bitmap_info.offset)
 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
+	if (mddev_is_clustered(mddev))
+		info.state |= (1<<MD_SB_CLUSTERED);
 	info.active_disks  = insync;
 	info.working_disks = working;
 	info.failed_disks  = failed;
@@ -5691,7 +5776,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
 		return -EFAULT;
 
 	rcu_read_lock();
-	rdev = find_rdev_nr_rcu(mddev, info.number);
+	rdev = md_find_rdev_nr_rcu(mddev, info.number);
 	if (rdev) {
 		info.major = MAJOR(rdev->bdev->bd_dev);
 		info.minor = MINOR(rdev->bdev->bd_dev);
@@ -5724,6 +5809,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 	struct md_rdev *rdev;
 	dev_t dev = MKDEV(info->major,info->minor);
 
+	if (mddev_is_clustered(mddev) &&
+	    !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
+		pr_err("%s: Cannot add to clustered mddev.\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
+
 	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
 		return -EOVERFLOW;
 
@@ -5810,31 +5902,38 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		else
 			clear_bit(WriteMostly, &rdev->flags);
 
+		/*
+		 * check whether the device shows up in other nodes
+		 */
+		if (mddev_is_clustered(mddev)) {
+			if (info->state & (1 << MD_DISK_CANDIDATE)) {
+				/* Through --cluster-confirm */
+				set_bit(Candidate, &rdev->flags);
+				err = md_cluster_ops->new_disk_ack(mddev, true);
+				if (err) {
+					export_rdev(rdev);
+					return err;
+				}
+			} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
+				/* --add initiated by this node */
+				err = md_cluster_ops->add_new_disk_start(mddev, rdev);
+				if (err) {
+					md_cluster_ops->add_new_disk_finish(mddev);
+					export_rdev(rdev);
+					return err;
+				}
+			}
+		}
+
 		rdev->raid_disk = -1;
 		err = bind_rdev_to_array(rdev, mddev);
-		if (!err && !mddev->pers->hot_remove_disk) {
-			/* If there is hot_add_disk but no hot_remove_disk
-			 * then added disks for geometry changes,
-			 * and should be added immediately.
-			 */
-			super_types[mddev->major_version].
-				validate_super(mddev, rdev);
-			err = mddev->pers->hot_add_disk(mddev, rdev);
-			if (err)
-				unbind_rdev_from_array(rdev);
-		}
 		if (err)
 			export_rdev(rdev);
 		else
-			sysfs_notify_dirent_safe(rdev->sysfs_state);
+			err = add_bound_rdev(rdev);
+		if (mddev_is_clustered(mddev) &&
+		    (info->state & (1 << MD_DISK_CLUSTER_ADD)))
+			md_cluster_ops->add_new_disk_finish(mddev);
 
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		if (mddev->degraded)
-			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-		if (!err)
-			md_new_event(mddev);
-		md_wakeup_thread(mddev->thread);
 		return err;
 	}
 
@@ -5895,18 +5994,29 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 	if (!rdev)
 		return -ENXIO;
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
+
 	clear_bit(Blocked, &rdev->flags);
 	remove_and_add_spares(mddev, rdev);
 
 	if (rdev->raid_disk >= 0)
 		goto busy;
 
-	kick_rdev_from_array(rdev);
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->remove_disk(mddev, rdev);
+
+	md_kick_rdev_from_array(rdev);
 	md_update_sb(mddev, 1);
 	md_new_event(mddev);
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
+
 	return 0;
 busy:
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_cancel(mddev);
 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
 		bdevname(rdev->bdev,b), mdname(mddev));
 	return -EBUSY;
@@ -5956,12 +6066,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 		err = -EINVAL;
 		goto abort_export;
 	}
+
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
 	clear_bit(In_sync, &rdev->flags);
 	rdev->desc_nr = -1;
 	rdev->saved_raid_disk = -1;
 	err = bind_rdev_to_array(rdev, mddev);
 	if (err)
-		goto abort_export;
+		goto abort_clustered;
 
 	/*
 	 * The rest should better be atomic, we can have disk failures
@@ -5972,6 +6085,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 
 	md_update_sb(mddev, 1);
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
 	 * array immediately.
@@ -5981,6 +6096,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	md_new_event(mddev);
 	return 0;
 
+abort_clustered:
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_cancel(mddev);
 abort_export:
 	export_rdev(rdev);
 	return err;
@@ -6038,9 +6156,14 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 	if (mddev->pers) {
 		mddev->pers->quiesce(mddev, 1);
 		if (fd >= 0) {
-			err = bitmap_create(mddev);
-			if (!err)
+			struct bitmap *bitmap;
+
+			bitmap = bitmap_create(mddev, -1);
+			if (!IS_ERR(bitmap)) {
+				mddev->bitmap = bitmap;
 				err = bitmap_load(mddev);
+			} else
+				err = PTR_ERR(bitmap);
 		}
 		if (fd < 0 || err) {
 			bitmap_destroy(mddev);
@@ -6293,6 +6416,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 			return rv;
 		}
 	}
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
 		rv = update_size(mddev, (sector_t)info->size * 2);
 
@@ -6300,33 +6425,49 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		rv = update_raid_disks(mddev, info->raid_disks);
 
 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
-		if (mddev->pers->quiesce == NULL || mddev->thread == NULL)
-			return -EINVAL;
-		if (mddev->recovery || mddev->sync_thread)
-			return -EBUSY;
+		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
+			rv = -EINVAL;
+			goto err;
+		}
+		if (mddev->recovery || mddev->sync_thread) {
+			rv = -EBUSY;
+			goto err;
+		}
 		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
+			struct bitmap *bitmap;
 			/* add the bitmap */
-			if (mddev->bitmap)
-				return -EEXIST;
-			if (mddev->bitmap_info.default_offset == 0)
-				return -EINVAL;
+			if (mddev->bitmap) {
+				rv = -EEXIST;
+				goto err;
+			}
+			if (mddev->bitmap_info.default_offset == 0) {
+				rv = -EINVAL;
+				goto err;
+			}
 			mddev->bitmap_info.offset =
 				mddev->bitmap_info.default_offset;
 			mddev->bitmap_info.space =
 				mddev->bitmap_info.default_space;
 			mddev->pers->quiesce(mddev, 1);
-			rv = bitmap_create(mddev);
-			if (!rv)
+			bitmap = bitmap_create(mddev, -1);
+			if (!IS_ERR(bitmap)) {
+				mddev->bitmap = bitmap;
 				rv = bitmap_load(mddev);
+			} else
+				rv = PTR_ERR(bitmap);
 			if (rv)
 				bitmap_destroy(mddev);
 			mddev->pers->quiesce(mddev, 0);
 		} else {
 			/* remove the bitmap */
-			if (!mddev->bitmap)
-				return -ENOENT;
-			if (mddev->bitmap->storage.file)
-				return -EINVAL;
+			if (!mddev->bitmap) {
+				rv = -ENOENT;
+				goto err;
+			}
+			if (mddev->bitmap->storage.file) {
+				rv = -EINVAL;
+				goto err;
+			}
 			mddev->pers->quiesce(mddev, 1);
 			bitmap_destroy(mddev);
 			mddev->pers->quiesce(mddev, 0);
@@ -6334,6 +6475,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		}
 	}
 	md_update_sb(mddev, 1);
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
+	return rv;
+err:
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_cancel(mddev);
 	return rv;
 }
 
@@ -6393,6 +6540,7 @@ static inline bool md_ioctl_valid(unsigned int cmd)
 	case SET_DISK_FAULTY:
 	case STOP_ARRAY:
 	case STOP_ARRAY_RO:
+	case CLUSTERED_DISK_NACK:
 		return true;
 	default:
 		return false;
@@ -6665,6 +6813,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		goto unlock;
 	}
 
+	case CLUSTERED_DISK_NACK:
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->new_disk_ack(mddev, false);
+		else
+			err = -EINVAL;
+		goto unlock;
+
 	case HOT_ADD_DISK:
 		err = hot_add_disk(mddev, new_decode_dev(arg));
 		goto unlock;
@@ -7238,6 +7393,55 @@ int unregister_md_personality(struct md_personality *p)
 }
 EXPORT_SYMBOL(unregister_md_personality);
 
+int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+{
+	if (md_cluster_ops != NULL)
+		return -EALREADY;
+	spin_lock(&pers_lock);
+	md_cluster_ops = ops;
+	md_cluster_mod = module;
+	spin_unlock(&pers_lock);
+	return 0;
+}
+EXPORT_SYMBOL(register_md_cluster_operations);
+
+int unregister_md_cluster_operations(void)
+{
+	spin_lock(&pers_lock);
+	md_cluster_ops = NULL;
+	spin_unlock(&pers_lock);
+	return 0;
+}
+EXPORT_SYMBOL(unregister_md_cluster_operations);
+
+int md_setup_cluster(struct mddev *mddev, int nodes)
+{
+	int err;
+
+	err = request_module("md-cluster");
+	if (err) {
+		pr_err("md-cluster module not found.\n");
+		return err;
+	}
+
+	spin_lock(&pers_lock);
+	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+		spin_unlock(&pers_lock);
+		return -ENOENT;
+	}
+	spin_unlock(&pers_lock);
+
+	return md_cluster_ops->join(mddev, nodes);
+}
+
+void md_cluster_stop(struct mddev *mddev)
+{
+	if (!md_cluster_ops)
+		return;
+	md_cluster_ops->leave(mddev);
+	module_put(md_cluster_mod);
+}
+
 static int is_mddev_idle(struct mddev *mddev, int init)
 {
 	struct md_rdev *rdev;
@@ -7375,7 +7579,11 @@ int md_allow_write(struct mddev *mddev)
 		    mddev->safemode == 0)
 			mddev->safemode = 1;
 		spin_unlock(&mddev->lock);
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->metadata_update_start(mddev);
 		md_update_sb(mddev, 0);
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->metadata_update_finish(mddev);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	} else
 		spin_unlock(&mddev->lock);
@@ -7576,6 +7784,9 @@ void md_do_sync(struct md_thread *thread)
 	md_new_event(mddev);
 	update_time = jiffies;
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->resync_start(mddev, j, max_sectors);
+
 	blk_start_plug(&plug);
 	while (j < max_sectors) {
 		sector_t sectors;
@@ -7636,6 +7847,8 @@ void md_do_sync(struct md_thread *thread)
 		j += sectors;
 		if (j > 2)
 			mddev->curr_resync = j;
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->resync_info_update(mddev, j, max_sectors);
 		mddev->curr_mark_cnt = io_sectors;
 		if (last_check == 0)
 			/* this is the earliest that rebuild will be
@@ -7696,6 +7909,9 @@ void md_do_sync(struct md_thread *thread)
 	/* tell personality that we are finished */
 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->resync_finish(mddev);
+
 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
 	    mddev->curr_resync > 2) {
 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -7925,8 +8141,13 @@ void md_check_recovery(struct mddev *mddev)
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
 
-		if (mddev->flags & MD_UPDATE_SB_FLAGS)
+		if (mddev->flags & MD_UPDATE_SB_FLAGS) {
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->metadata_update_start(mddev);
 			md_update_sb(mddev, 0);
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->metadata_update_finish(mddev);
+		}
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -8024,6 +8245,8 @@ void md_reap_sync_thread(struct mddev *mddev)
 			set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		}
 	}
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    mddev->pers->finish_reshape)
 		mddev->pers->finish_reshape(mddev);
@@ -8036,6 +8259,8 @@ void md_reap_sync_thread(struct mddev *mddev)
 			rdev->saved_raid_disk = -1;
 
 	md_update_sb(mddev, 1);
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
 	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -8656,6 +8881,28 @@ err_wq:
 	return ret;
 }
 
+void md_reload_sb(struct mddev *mddev)
+{
+	struct md_rdev *rdev, *tmp;
+
+	rdev_for_each_safe(rdev, tmp, mddev) {
+		rdev->sb_loaded = 0;
+		ClearPageUptodate(rdev->sb_page);
+	}
+	mddev->raid_disks = 0;
+	analyze_sbs(mddev);
+	rdev_for_each_safe(rdev, tmp, mddev) {
+		struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+		/* since we don't write to faulty devices, we figure out if the
+		 * disk is faulty by comparing events
+		 */
+		if (mddev->events > sb->events)
+			set_bit(Faulty, &rdev->flags);
+	}
+
+}
+EXPORT_SYMBOL(md_reload_sb);
+
 #ifndef MODULE
 
 /*

drivers/md/md.h

@@ -23,6 +23,7 @@
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include "md-cluster.h"
 
 #define MaxSector (~(sector_t)0)
 
@@ -170,6 +171,10 @@ enum flag_bits {
 				 * a want_replacement device with same
 				 * raid_disk number.
 				 */
+	Candidate,		/* For clustered environments only:
+				 * This device is seen locally but not
+				 * by the whole cluster
+				 */
 };
 
 #define BB_LEN_MASK	(0x00000000000001FFULL)
@@ -202,6 +207,8 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 			      int is_new);
 extern void md_ack_all_badblocks(struct badblocks *bb);
 
+struct md_cluster_info;
+
 struct mddev {
 	void *private;
 	struct md_personality *pers;
@@ -430,6 +437,8 @@ struct mddev {
 		unsigned long daemon_sleep; /* how many jiffies between updates? */
 		unsigned long max_write_behind; /* write-behind mode */
 		int external;
+		int nodes; /* Maximum number of nodes in the cluster */
+		char cluster_name[64]; /* Name of the cluster */
 	} bitmap_info;
 
 	atomic_t max_corr_read_errors; /* max read retries */
@@ -448,6 +457,7 @@ struct mddev {
 	struct work_struct flush_work;
 	struct work_struct event_work; /* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
+	struct md_cluster_info *cluster_info;
 };
 
 static inline int __must_check mddev_lock(struct mddev *mddev)
@@ -608,6 +618,11 @@ static inline void safe_put_page(struct page *p)
 
 extern int register_md_personality(struct md_personality *p);
 extern int unregister_md_personality(struct md_personality *p);
+extern int register_md_cluster_operations(struct md_cluster_operations *ops,
+		struct module *module);
+extern int unregister_md_cluster_operations(void);
+extern int md_setup_cluster(struct mddev *mddev, int nodes);
+extern void md_cluster_stop(struct mddev *mddev);
 extern struct md_thread *md_register_thread(
 	void (*run)(struct md_thread *thread),
 	struct mddev *mddev,
@@ -654,6 +669,10 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 			    struct mddev *mddev);
 
 extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
+extern void md_reload_sb(struct mddev *mddev);
+extern void md_update_sb(struct mddev *mddev, int force);
+extern void md_kick_rdev_from_array(struct md_rdev * rdev);
+struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
 static inline int mddev_check_plugged(struct mddev *mddev)
 {
 	return !!blk_check_plugged(md_unplug, mddev,
@@ -669,4 +688,9 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
 	}
 }
 
+extern struct md_cluster_operations *md_cluster_ops;
+static inline int mddev_is_clustered(struct mddev *mddev)
+{
+	return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
+}
 #endif /* _MD_MD_H */

drivers/md/raid1.c

@@ -539,7 +539,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	has_nonrot_disk = 0;
 	choose_next_idle = 0;
 
-	choose_first = (conf->mddev->recovery_cp < this_sector + sectors);
+	if ((conf->mddev->recovery_cp < this_sector + sectors) ||
+	    (mddev_is_clustered(conf->mddev) &&
+	    md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+		    this_sector + sectors)))
+		choose_first = 1;
+	else
+		choose_first = 0;
 
 	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
@@ -1102,8 +1108,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
 	if (bio_data_dir(bio) == WRITE &&
-	    bio_end_sector(bio) > mddev->suspend_lo &&
-	    bio->bi_iter.bi_sector < mddev->suspend_hi) {
+	    ((bio_end_sector(bio) > mddev->suspend_lo &&
+	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
+	    (mddev_is_clustered(mddev) &&
+	     md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
 		/* As the suspend_* range is controlled by
 		 * userspace, we want an interruptible
 		 * wait.
@@ -1114,7 +1122,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_INTERRUPTIBLE);
 			if (bio_end_sector(bio) <= mddev->suspend_lo ||
-			    bio->bi_iter.bi_sector >= mddev->suspend_hi)
+			    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
+			    (mddev_is_clustered(mddev) &&
+			     !md_cluster_ops->area_resyncing(mddev,
+				     bio->bi_iter.bi_sector, bio_end_sector(bio))))
 				break;
 			schedule();
 		}
@@ -1561,6 +1572,7 @@ static int raid1_spare_active(struct mddev *mddev)
 		struct md_rdev *rdev = conf->mirrors[i].rdev;
 		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
 		if (repl
+		    && !test_bit(Candidate, &repl->flags)
 		    && repl->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &repl->flags)
 		    && !test_and_set_bit(In_sync, &repl->flags)) {

include/uapi/linux/raid/md_p.h

@@ -78,6 +78,12 @@
 #define MD_DISK_ACTIVE		1 /* disk is running or spare disk */
 #define MD_DISK_SYNC		2 /* disk is in sync with the raid set */
 #define MD_DISK_REMOVED		3 /* disk is in sync with the raid set */
+#define MD_DISK_CLUSTER_ADD	4 /* Initiate a disk add across the cluster
+				   * For clustered environments only.
+				   */
+#define MD_DISK_CANDIDATE	5 /* disk is added as spare (local) until confirmed
+				   * For clustered environments only.
+				   */
 
 #define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" in RAID1 config.
 				   * read requests will only be sent here in
@@ -101,6 +107,7 @@ typedef struct mdp_device_descriptor_s {
 #define MD_SB_CLEAN		0
 #define MD_SB_ERRORS		1
 
+#define MD_SB_CLUSTERED		5 /* MD is clustered */
 #define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */
 
 /*

include/uapi/linux/raid/md_u.h

@@ -62,6 +62,7 @@
 #define STOP_ARRAY		_IO (MD_MAJOR, 0x32)
 #define STOP_ARRAY_RO		_IO (MD_MAJOR, 0x33)
 #define RESTART_ARRAY_RW	_IO (MD_MAJOR, 0x34)
+#define CLUSTERED_DISK_NACK	_IO (MD_MAJOR, 0x35)
 
 /* 63 partitions with the alternate major number (mdp) */
 #define MdpMinorShift		6
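
CLUSTERED_DISK_NACK is a plain _IO command taking no argument, and md_ioctl() (changed above) routes it to new_disk_ack(mddev, false). A minimal userspace sketch of issuing it -- the device path is hypothetical and error reporting is elided:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

int main(void)
{
	/* hypothetical array node; a real tool would discover it */
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0)
		return 1;
	/* reject the disk another node proposed via NEWDISK */
	if (ioctl(fd, CLUSTERED_DISK_NACK) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}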