Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1aee41f6 authored by Goldwyn Rodrigues's avatar Goldwyn Rodrigues
Browse files

Add new disk to clustered array



Algorithm:
1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
   ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
2. Node 1 sends NEWDISK with uuid and slot number
3. Other nodes issue kobject_uevent_env with uuid and slot number
(Steps 4,5 could be a udev rule)
4. In userspace, the node searches for the disk, perhaps
   using blkid -t SUB_UUID=""
5. Other nodes issue either of the following depending on whether the disk
   was found:
   ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
	 disc.number set to slot number)
   ioctl(CLUSTERED_DISK_NACK)
6. Other nodes drop lock on no-new-devs (CR) if device is found
7. Node 1 attempts EX lock on no-new-devs
8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk
   as SpareLocal
9. If node 1 does not get the EX lock on no-new-devs, it fails the operation and sends METADATA_UPDATED
10. Other nodes understand if the device is added or not by reading the superblock again after receiving the METADATA_UPDATED message.

Signed-off-by: Lidong Zhong <lzhong@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
parent 7d49ffcf
Loading
Loading
Loading
Loading
+103 −1
Original line number Diff line number Diff line
@@ -12,11 +12,13 @@
#include <linux/module.h>
#include <linux/dlm.h>
#include <linux/sched.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#define LVB_SIZE	64
#define NEW_DEV_TIMEOUT 5000

struct dlm_lock_resource {
	dlm_lockspace_t *ls;
@@ -56,19 +58,25 @@ struct md_cluster_info {
	struct dlm_lock_resource *ack_lockres;
	struct dlm_lock_resource *message_lockres;
	struct dlm_lock_resource *token_lockres;
	struct dlm_lock_resource *no_new_dev_lockres;
	struct md_thread *recv_thread;
	struct completion newdisk_completion;
};

/*
 * Values stored in cluster_msg.type.  They are placed in messages handed
 * to __sendmsg(), so each value is written out explicitly and must not be
 * renumbered.
 */
enum msg_type {
	METADATA_UPDATED = 0,	/* handled by process_metadata_update() */
	RESYNCING = 1,		/* handled by process_suspend_info() */
	NEWDISK = 2,		/* handled by process_add_new_disk() */
};

/*
 * Message passed to __sendmsg() and dispatched on receipt by
 * process_recvd_msg(); which of the payload fields are meaningful
 * depends on type.
 */
struct cluster_msg {
	int type;	/* one of enum msg_type */
	int slot;	/* logged on receipt as the node the message came "from" */
	/* TODO: Unionize this for smaller footprint */
	sector_t low;	/* RESYNCING: passed to process_suspend_info() */
	sector_t high;	/* RESYNCING: passed to process_suspend_info() */
	char uuid[16];	/* NEWDISK: device_uuid copied from the new disk's superblock */
	int raid_slot;	/* NEWDISK: desc_nr of the disk being added */
};

static void sync_ast(void *arg)
@@ -358,13 +366,41 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
	spin_unlock_irq(&cinfo->suspend_lock);
}

/*
 * Handle a NEWDISK message received from another node.
 *
 * Raises a KOBJ_CHANGE uevent on the array's gendisk carrying
 * EVENT=ADD_DEVICE, DEVICE_UUID=<uuid> and RAID_DISK=<slot>, then waits
 * until new_disk_ack() completes newdisk_completion or the timeout runs
 * out.  The result of the wait is not examined.
 *
 * NOTE(review): wait_for_completion_timeout() takes its timeout in
 * jiffies, so NEW_DEV_TIMEOUT is HZ-dependent as written - confirm the
 * intended unit.
 */
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	char event_name[] = "EVENT=ADD_DEVICE";
	char disk_uuid[64];
	char raid_slot[16];
	char *envp[4];
	int prefix_len;

	/* "DEVICE_UUID=" followed by the printable form of the uuid */
	prefix_len = snprintf(disk_uuid, sizeof(disk_uuid), "DEVICE_UUID=");
	pretty_uuid(disk_uuid + prefix_len, cmsg->uuid);

	snprintf(raid_slot, sizeof(raid_slot), "RAID_DISK=%d", cmsg->raid_slot);

	envp[0] = event_name;
	envp[1] = disk_uuid;
	envp[2] = raid_slot;
	envp[3] = NULL;

	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);

	/* Arm the completion before the event is emitted */
	init_completion(&cinfo->newdisk_completion);
	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
	wait_for_completion_timeout(&cinfo->newdisk_completion,
				    NEW_DEV_TIMEOUT);
}


/*
 * Handle a METADATA_UPDATED message: reload the superblock and take the
 * CR lock on no-new-dev again.
 *
 * new_disk_ack() drops the no-new-dev lock when this node confirms a new
 * disk, so the lock is re-taken here once the sender announces that the
 * metadata has been written.  A failure to get the lock is logged rather
 * than silently ignored.
 *
 * @mddev: array whose superblock is reloaded
 * @msg:   the received message (not used)
 */
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret;

	md_reload_sb(mddev);
	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
	if (ret)
		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n",
				ret);
}

static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
	switch (msg->type) {
	case METADATA_UPDATED:
		pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
			__func__, __LINE__, msg->slot);
		md_reload_sb(mddev);
		process_metadata_update(mddev, msg);
		break;
	case RESYNCING:
		pr_info("%s: %d Received message: RESYNCING from %d\n",
@@ -372,6 +408,10 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
		process_suspend_info(mddev->cluster_info, msg->slot,
				msg->low, msg->high);
		break;
	case NEWDISK:
		pr_info("%s: %d Received message: NEWDISK from %d\n",
			__func__, __LINE__, msg->slot);
		process_add_new_disk(mddev, msg);
	};
}

@@ -593,10 +633,18 @@ static int join(struct mddev *mddev, int nodes)
	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
	if (!cinfo->ack_lockres)
		goto err;
	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
	if (!cinfo->no_new_dev_lockres)
		goto err;

	/* get sync CR lock on ACK. */
	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
				ret);
	/* get sync CR lock on no-new-dev. */
	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);


	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
@@ -621,6 +669,7 @@ static int join(struct mddev *mddev, int nodes)
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->bitmap_lockres);
	lockres_free(cinfo->sb_lock);
	if (cinfo->lockspace)
@@ -642,6 +691,7 @@ static int leave(struct mddev *mddev)
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->sb_lock);
	lockres_free(cinfo->bitmap_lockres);
	dlm_release_lockspace(cinfo->lockspace, 2);
@@ -742,6 +792,55 @@ static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
	return ret;
}

static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int ret = 0;
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	char *uuid = sb->device_uuid;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(NEWDISK);
	memcpy(cmsg.uuid, uuid, 16);
	cmsg.raid_slot = rdev->desc_nr;
	lock_comm(cinfo);
	ret = __sendmsg(cinfo, &cmsg);
	if (ret)
		return ret;
	cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
	/* Some node does not "see" the device */
	if (ret == -EAGAIN)
		ret = -ENOENT;
	else
		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
	return ret;
}

/*
 * Second half of adding a disk from this node: write the superblock,
 * send METADATA_UPDATED to the other nodes and call unlock_comm().
 *
 * The caller in md.c also invokes this when add_new_disk_start() fails,
 * so the unlock_comm() here pairs with the lock_comm() done there.
 *
 * Returns the result of __sendmsg().
 */
static int add_new_disk_finish(struct mddev *mddev)
{
	struct cluster_msg cmsg;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret;

	/*
	 * Clear the whole message first so that the fields not set below
	 * do not carry uninitialised stack bytes into __sendmsg().
	 */
	memset(&cmsg, 0, sizeof(cmsg));
	/* Write sb and inform others */
	md_update_sb(mddev, 1);
	cmsg.type = METADATA_UPDATED;
	ret = __sendmsg(cinfo, &cmsg);
	unlock_comm(cinfo);
	return ret;
}

/*
 * Report the local outcome of a NEWDISK request.
 *
 * When @ack is true the no-new-dev lock is released with
 * dlm_unlock_sync(); when false it is left untouched.  In both cases
 * newdisk_completion is completed, which ends the wait in
 * process_add_new_disk().
 */
static void new_disk_ack(struct mddev *mddev, bool ack)
{
	struct md_cluster_info *info = mddev->cluster_info;

	if (ack)
		dlm_unlock_sync(info->no_new_dev_lockres);

	complete(&info->newdisk_completion);
}

static struct md_cluster_operations cluster_ops = {
	.join   = join,
	.leave  = leave,
@@ -753,6 +852,9 @@ static struct md_cluster_operations cluster_ops = {
	.metadata_update_finish = metadata_update_finish,
	.metadata_update_cancel = metadata_update_cancel,
	.area_resyncing = area_resyncing,
	.add_new_disk_start = add_new_disk_start,
	.add_new_disk_finish = add_new_disk_finish,
	.new_disk_ack = new_disk_ack,
};

static int __init cluster_init(void)
+4 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
#include "md.h"

struct mddev;
struct md_rdev;

struct md_cluster_operations {
	int (*join)(struct mddev *mddev, int nodes);
@@ -18,6 +19,9 @@ struct md_cluster_operations {
	int (*metadata_update_finish)(struct mddev *mddev);
	int (*metadata_update_cancel)(struct mddev *mddev);
	int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
	int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
	int (*add_new_disk_finish)(struct mddev *mddev);
	void (*new_disk_ack)(struct mddev *mddev, bool ack);
};

#endif /* _MD_CLUSTER_H */
+49 −3
Original line number Diff line number Diff line
@@ -2210,7 +2210,7 @@ static void sync_sbs(struct mddev *mddev, int nospares)
	}
}

static void md_update_sb(struct mddev *mddev, int force_change)
void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
@@ -2371,6 +2371,7 @@ static void md_update_sb(struct mddev *mddev, int force_change)
		wake_up(&rdev->blocked_wait);
	}
}
EXPORT_SYMBOL(md_update_sb);

/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
@@ -3151,7 +3152,7 @@ static void analyze_sbs(struct mddev *mddev)
			kick_rdev_from_array(rdev);
			continue;
		}
		if (rdev != freshest)
		if (rdev != freshest) {
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
				printk(KERN_WARNING "md: kicking non-fresh %s"
@@ -3160,6 +3161,15 @@ static void analyze_sbs(struct mddev *mddev)
				kick_rdev_from_array(rdev);
				continue;
			}
			/* No device should have a Candidate flag
			 * when reading devices
			 */
			if (test_bit(Candidate, &rdev->flags)) {
				pr_info("md: kicking Cluster Candidate %s from array!\n",
					bdevname(rdev->bdev, b));
				kick_rdev_from_array(rdev);
			}
		}
		if (mddev->level == LEVEL_MULTIPATH) {
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
@@ -5655,7 +5665,6 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
		info.state |= (1<<MD_SB_BITMAP_PRESENT);
	if (mddev_is_clustered(mddev))
		info.state |= (1<<MD_SB_CLUSTERED);

	info.active_disks  = insync;
	info.working_disks = working;
	info.failed_disks  = failed;
@@ -5744,6 +5753,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
	struct md_rdev *rdev;
	dev_t dev = MKDEV(info->major,info->minor);

	if (mddev_is_clustered(mddev) &&
		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
		pr_err("%s: Cannot add to clustered mddev. Try --cluster-add\n",
			       mdname(mddev));
		return -EINVAL;
	}

	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

@@ -5830,6 +5846,25 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
		else
			clear_bit(WriteMostly, &rdev->flags);

		/*
		 * check whether the device shows up in other nodes
		 */
		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE)) {
				/* Through --cluster-confirm */
				set_bit(Candidate, &rdev->flags);
				md_cluster_ops->new_disk_ack(mddev, true);
			} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
				/* --add initiated by this node */
				err = md_cluster_ops->add_new_disk_start(mddev, rdev);
				if (err) {
					md_cluster_ops->add_new_disk_finish(mddev);
					export_rdev(rdev);
					return err;
				}
			}
		}

		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
		if (!err && !mddev->pers->hot_remove_disk) {
@@ -5855,6 +5890,9 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
		if (!err)
			md_new_event(mddev);
		md_wakeup_thread(mddev->thread);
		if (mddev_is_clustered(mddev) &&
				(info->state & (1 << MD_DISK_CLUSTER_ADD)))
			md_cluster_ops->add_new_disk_finish(mddev);
		return err;
	}

@@ -6456,6 +6494,7 @@ static inline bool md_ioctl_valid(unsigned int cmd)
	case SET_DISK_FAULTY:
	case STOP_ARRAY:
	case STOP_ARRAY_RO:
	case CLUSTERED_DISK_NACK:
		return true;
	default:
		return false;
@@ -6728,6 +6767,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
		goto unlock;
	}

	case CLUSTERED_DISK_NACK:
		if (mddev_is_clustered(mddev))
			md_cluster_ops->new_disk_ack(mddev, false);
		else
			err = -EINVAL;
		goto unlock;

	case HOT_ADD_DISK:
		err = hot_add_disk(mddev, new_decode_dev(arg));
		goto unlock;
+5 −0
Original line number Diff line number Diff line
@@ -171,6 +171,10 @@ enum flag_bits {
				 * a want_replacement device with same
				 * raid_disk number.
				 */
	Candidate,		/* For clustered environments only:
				 * This device is seen locally but not
				 * by the whole cluster
				 */
};

#define BB_LEN_MASK	(0x00000000000001FFULL)
@@ -666,6 +670,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,

extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
extern void md_reload_sb(struct mddev *mddev);
extern void md_update_sb(struct mddev *mddev, int force);
static inline int mddev_check_plugged(struct mddev *mddev)
{
	return !!blk_check_plugged(md_unplug, mddev,
+1 −0
Original line number Diff line number Diff line
@@ -1571,6 +1571,7 @@ static int raid1_spare_active(struct mddev *mddev)
		struct md_rdev *rdev = conf->mirrors[i].rdev;
		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
		if (repl
		    && !test_bit(Candidate, &repl->flags)
		    && repl->recovery_offset == MaxSector
		    && !test_bit(Faulty, &repl->flags)
		    && !test_and_set_bit(In_sync, &repl->flags)) {
Loading