Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 2a013e37 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'md/4.3' of git://neil.brown.name/md

Pull md updates from Neil Brown:

 - an assortment of little fixes, several for minor races only likely to
   be hit during testing

 - further cluster-md-raid1 development, not ready for real use yet.

 - new RAID6 syndrome code for ARM NEON

 - fix a race where a write can return before failure of one device is
   properly recorded in metadata, so an immediate crash might result in
   that write being lost.

* tag 'md/4.3' of git://neil.brown.name/md: (33 commits)
  md/raid5: ensure device failure recorded before write request returns.
  md/raid5: use bio_list for the list of bios to return.
  md/raid10: ensure device failure recorded before write request returns.
  md/raid1: ensure device failure recorded before write request returns.
  md-cluster: remove inappropriate try_module_get from join()
  md: extend spinlock protection in register_md_cluster_operations
  md-cluster: Read the disk bitmap sb and check if it needs recovery
  md-cluster: only call complete(&cinfo->completion) when node join cluster
  md-cluster: add missed lockres_free
  md-cluster: remove the unused sb_lock
  md-cluster: init suspend_list and suspend_lock early in join
  md-cluster: add the error check if failed to get dlm lock
  md-cluster: init completion within lockres_init
  md-cluster: fix deadlock issue on message lock
  md-cluster: transfer the resync ownership to another node
  md-cluster: split recover_slot for future code reuse
  md-cluster: use %pU to print UUIDs
  md: setup safemode_timer before it's being used
  md/raid5: handle possible race as reshape completes.
  md: sync sync_completed has correct value as recovery finishes.
  ...
parents 17447717 e89c6fdf
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -91,7 +91,7 @@ The algorithm is:
    this message inappropriate or redundant.

 3. sender write LVB.
    sender down-convert MESSAGE from EX to CR
    sender down-convert MESSAGE from EX to CW
    sender try to get EX of ACK
    [ wait until all receiver has *processed* the MESSAGE ]

@@ -112,7 +112,7 @@ The algorithm is:
    sender down-convert ACK from EX to CR
    sender release MESSAGE
    sender release TOKEN
                               receiver upconvert to EX of MESSAGE
                               receiver upconvert to PR of MESSAGE
                               receiver get CR of ACK
                               receiver release MESSAGE

+105 −54
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ struct resync_info {
/* md_cluster_info flags */
#define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
#define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
#define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3


struct md_cluster_info {
@@ -52,7 +53,6 @@ struct md_cluster_info {
	dlm_lockspace_t *lockspace;
	int slot_number;
	struct completion completion;
	struct dlm_lock_resource *sb_lock;
	struct mutex sb_mutex;
	struct dlm_lock_resource *bitmap_lockres;
	struct list_head suspend_list;
@@ -75,6 +75,7 @@ enum msg_type {
	NEWDISK,
	REMOVE,
	RE_ADD,
	BITMAP_NEEDS_SYNC,
};

struct cluster_msg {
@@ -99,7 +100,6 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
	int ret = 0;

	init_completion(&res->completion);
	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
@@ -124,6 +124,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
	if (!res)
		return NULL;
	init_completion(&res->completion);
	res->ls = cinfo->lockspace;
	res->mddev = mddev;
	namelen = strlen(name);
@@ -165,11 +166,24 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,

static void lockres_free(struct dlm_lock_resource *res)
{
	int ret;

	if (!res)
		return;

	init_completion(&res->completion);
	dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
	/* cancel a lock request or a conversion request that is blocked */
	res->flags |= DLM_LKF_CANCEL;
retry:
	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
	if (unlikely(ret != 0)) {
		pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);

		/* if a lock conversion is cancelled, then the lock is put
		 * back to grant queue, need to ensure it is unlocked */
		if (ret == -DLM_ECANCEL)
			goto retry;
	}
	res->flags &= ~DLM_LKF_CANCEL;
	wait_for_completion(&res->completion);

	kfree(res->name);
@@ -177,18 +191,6 @@ static void lockres_free(struct dlm_lock_resource *res)
	kfree(res);
}

static char *pretty_uuid(char *dest, char *src)
{
	int i, len = 0;

	for (i = 0; i < 16; i++) {
		if (i == 4 || i == 6 || i == 8 || i == 10)
			len += sprintf(dest + len, "-");
		len += sprintf(dest + len, "%02x", (__u8)src[i]);
	}
	return dest;
}

static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
		sector_t lo, sector_t hi)
{
@@ -281,16 +283,11 @@ static void recover_prep(void *arg)
	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

static void recover_slot(void *arg, struct dlm_slot *slot)
static void __recover_slot(struct mddev *mddev, int slot)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
			mddev->bitmap_info.cluster_name,
			slot->nodeid, slot->slot,
			cinfo->slot_number);
	set_bit(slot->slot - 1, &cinfo->recovery_map);
	set_bit(slot, &cinfo->recovery_map);
	if (!cinfo->recovery_thread) {
		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
				mddev, "recover");
@@ -302,6 +299,20 @@ static void recover_slot(void *arg, struct dlm_slot *slot)
	md_wakeup_thread(cinfo->recovery_thread);
}

static void recover_slot(void *arg, struct dlm_slot *slot)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
			mddev->bitmap_info.cluster_name,
			slot->nodeid, slot->slot,
			cinfo->slot_number);
	/* deduct one since dlm slot starts from one while the num of
	 * cluster-md begins with 0 */
	__recover_slot(mddev, slot->slot - 1);
}

static void recover_done(void *arg, struct dlm_slot *slots,
		int num_slots, int our_slot,
		uint32_t generation)
@@ -310,10 +321,17 @@ static void recover_done(void *arg, struct dlm_slot *slots,
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cinfo->slot_number = our_slot;
	/* completion is only need to be complete when node join cluster,
	 * it doesn't need to run during another node's failure */
	if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
		complete(&cinfo->completion);
		clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
	}
	clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

/* the ops is called when node join the cluster, and do lock recovery
 * if node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
	.recover_prep = recover_prep,
	.recover_slot = recover_slot,
@@ -388,7 +406,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
	int len;

	len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
	pretty_uuid(disk_uuid + len, cmsg->uuid);
	sprintf(disk_uuid + len, "%pU", cmsg->uuid);
	snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
	init_completion(&cinfo->newdisk_completion);
@@ -457,6 +475,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
			__func__, __LINE__, msg->slot);
		process_readd_disk(mddev, msg);
		break;
	case BITMAP_NEEDS_SYNC:
		pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
			__func__, __LINE__, msg->slot);
		__recover_slot(mddev, msg->slot);
		break;
	default:
		pr_warn("%s:%d Received unknown message from %d\n",
			__func__, __LINE__, msg->slot);
@@ -472,6 +495,7 @@ static void recv_daemon(struct md_thread *thread)
	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
	struct cluster_msg msg;
	int ret;

	/*get CR on Message*/
	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
@@ -484,13 +508,21 @@ static void recv_daemon(struct md_thread *thread)
	process_recvd_msg(thread->mddev, &msg);

	/*release CR on ack_lockres*/
	dlm_unlock_sync(ack_lockres);
	/*up-convert to EX on message_lockres*/
	dlm_lock_sync(message_lockres, DLM_LOCK_EX);
	ret = dlm_unlock_sync(ack_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock ack failed return %d\n", ret);
	/*up-convert to PR on message_lockres*/
	ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
	if (unlikely(ret != 0))
		pr_info("lock PR on msg failed return %d\n", ret);
	/*get CR on ack_lockres again*/
	dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
	ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
	if (unlikely(ret != 0))
		pr_info("lock CR on ack failed return %d\n", ret);
	/*release CR on message_lockres*/
	dlm_unlock_sync(message_lockres);
	ret = dlm_unlock_sync(message_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock msg failed return %d\n", ret);
}

/* lock_comm()
@@ -519,7 +551,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
 * The function:
 * 1. Grabs the message lockresource in EX mode
 * 2. Copies the message to the message LVB
 * 3. Downconverts message lockresource to CR
 * 3. Downconverts message lockresource to CW
 * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
 *    and the other nodes read the message. The thread will wait here until all other
 *    nodes have released ack lock resource.
@@ -540,12 +572,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)

	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
			sizeof(struct cluster_msg));
	/*down-convert EX to CR on Message*/
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
	/*down-convert EX to CW on Message*/
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
		pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
				error);
		goto failed_message;
		goto failed_ack;
	}

	/*up-convert CR to EX on Ack*/
@@ -565,7 +597,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
	}

failed_ack:
	dlm_unlock_sync(cinfo->message_lockres);
	error = dlm_unlock_sync(cinfo->message_lockres);
	if (unlikely(error != 0)) {
		pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
			error);
		/* in case the message can't be released due to some reason */
		goto failed_ack;
	}
failed_message:
	return error;
}
@@ -587,6 +625,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
	struct dlm_lock_resource *bm_lockres;
	struct suspend_info *s;
	char str[64];
	sector_t lo, hi;


	for (i = 0; i < total_slots; i++) {
@@ -617,9 +656,24 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
			lockres_free(bm_lockres);
			continue;
		}
		if (ret)
		if (ret) {
			lockres_free(bm_lockres);
			goto out;
		/* TODO: Read the disk bitmap sb and check if it needs recovery */
		}

		/* Read the disk bitmap sb and check if it needs recovery */
		ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
		if (ret) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
			lockres_free(bm_lockres);
			continue;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp)) {
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			mddev->recovery_cp = lo;
			md_check_recovery(mddev);
		}

		dlm_unlock_sync(bm_lockres);
		lockres_free(bm_lockres);
	}
@@ -633,20 +687,20 @@ static int join(struct mddev *mddev, int nodes)
	int ret, ops_rv;
	char str[64];

	if (!try_module_get(THIS_MODULE))
		return -ENOENT;

	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
	if (!cinfo)
		return -ENOMEM;

	INIT_LIST_HEAD(&cinfo->suspend_list);
	spin_lock_init(&cinfo->suspend_lock);
	init_completion(&cinfo->completion);
	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);

	mutex_init(&cinfo->sb_mutex);
	mddev->cluster_info = cinfo;

	memset(str, 0, 64);
	pretty_uuid(str, mddev->uuid);
	sprintf(str, "%pU", mddev->uuid);
	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
				DLM_LSFL_FS, LVB_SIZE,
				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
@@ -659,12 +713,6 @@ static int join(struct mddev *mddev, int nodes)
		ret = -ERANGE;
		goto err;
	}
	cinfo->sb_lock = lockres_init(mddev, "cmd-super",
					NULL, 0);
	if (!cinfo->sb_lock) {
		ret = -ENOMEM;
		goto err;
	}
	/* Initiate the communication resources */
	ret = -ENOMEM;
	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
@@ -705,9 +753,6 @@ static int join(struct mddev *mddev, int nodes)
		goto err;
	}

	INIT_LIST_HEAD(&cinfo->suspend_list);
	spin_lock_init(&cinfo->suspend_lock);

	ret = gather_all_resync_info(mddev, nodes);
	if (ret)
		goto err;
@@ -719,12 +764,10 @@ static int join(struct mddev *mddev, int nodes)
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->bitmap_lockres);
	lockres_free(cinfo->sb_lock);
	if (cinfo->lockspace)
		dlm_release_lockspace(cinfo->lockspace, 2);
	mddev->cluster_info = NULL;
	kfree(cinfo);
	module_put(THIS_MODULE);
	return ret;
}

@@ -740,7 +783,6 @@ static int leave(struct mddev *mddev)
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->sb_lock);
	lockres_free(cinfo->bitmap_lockres);
	dlm_release_lockspace(cinfo->lockspace, 2);
	return 0;
@@ -817,8 +859,17 @@ static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)

static void resync_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int slot = cinfo->slot_number - 1;

	pr_info("%s:%d\n", __func__, __LINE__);
	resync_send(mddev, RESYNCING, 0, 0);
	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
		cmsg.slot = cpu_to_le32(slot);
		sendmsg(cinfo, &cmsg);
	}
}

static int area_resyncing(struct mddev *mddev, int direction,
+71 −39
Original line number Diff line number Diff line
@@ -483,6 +483,8 @@ static void mddev_put(struct mddev *mddev)
		bioset_free(bs);
}

static void md_safemode_timeout(unsigned long data);

void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
@@ -490,7 +492,8 @@ void mddev_init(struct mddev *mddev)
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	init_timer(&mddev->safemode_timer);
	setup_timer(&mddev->safemode_timer, md_safemode_timeout,
		    (unsigned long) mddev);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
@@ -3255,8 +3258,6 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
	return 0;
}

static void md_safemode_timeout(unsigned long data);

static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
@@ -4189,6 +4190,8 @@ action_show(struct mddev *mddev, char *page)
				type = "repair";
		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
			type = "recover";
		else if (mddev->reshape_position != MaxSector)
			type = "reshape";
	}
	return sprintf(page, "%s\n", type);
}
@@ -5180,8 +5183,6 @@ int md_run(struct mddev *mddev)
	atomic_set(&mddev->max_corr_read_errors,
		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
	mddev->safemode = 0;
	mddev->safemode_timer.function = md_safemode_timeout;
	mddev->safemode_timer.data = (unsigned long) mddev;
	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
	mddev->in_sync = 1;
	smp_wmb();
@@ -5194,6 +5195,11 @@ int md_run(struct mddev *mddev)
			if (sysfs_link_rdev(mddev, rdev))
				/* failure here is OK */;

	if (mddev->degraded && !mddev->ro)
		/* This ensures that recovering status is reported immediately
		 * via sysfs - until a lack of spares is confirmed.
		 */
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

	if (mddev->flags & MD_UPDATE_SB_FLAGS)
@@ -5741,16 +5747,16 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)

	err = 0;
	spin_lock(&mddev->lock);
	/* bitmap disabled, zero the first byte and copy out */
	if (!mddev->bitmap_info.file)
		file->pathname[0] = '\0';
	else if ((ptr = file_path(mddev->bitmap_info.file,
			       file->pathname, sizeof(file->pathname))),
		 IS_ERR(ptr))
	/* bitmap enabled */
	if (mddev->bitmap_info.file) {
		ptr = file_path(mddev->bitmap_info.file, file->pathname,
				sizeof(file->pathname));
		if (IS_ERR(ptr))
			err = PTR_ERR(ptr);
		else
			memmove(file->pathname, ptr,
				sizeof(file->pathname)-(ptr-file->pathname));
	}
	spin_unlock(&mddev->lock);

	if (err == 0 &&
@@ -7069,7 +7075,7 @@ static void status_unused(struct seq_file *seq)
	seq_printf(seq, "\n");
}

static void status_resync(struct seq_file *seq, struct mddev *mddev)
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
	sector_t max_sectors, resync, res;
	unsigned long dt, db;
@@ -7077,18 +7083,32 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
	int scale;
	unsigned int per_milli;

	if (mddev->curr_resync <= 3)
		resync = 0;
	else
		resync = mddev->curr_resync
			- atomic_read(&mddev->recovery_active);

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		max_sectors = mddev->resync_max_sectors;
	else
		max_sectors = mddev->dev_sectors;

	resync = mddev->curr_resync;
	if (resync <= 3) {
		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
			/* Still cleaning up */
			resync = max_sectors;
	} else
		resync -= atomic_read(&mddev->recovery_active);

	if (resync == 0) {
		if (mddev->recovery_cp < MaxSector) {
			seq_printf(seq, "\tresync=PENDING");
			return 1;
		}
		return 0;
	}
	if (resync < 3) {
		seq_printf(seq, "\tresync=DELAYED");
		return 1;
	}

	WARN_ON(max_sectors == 0);
	/* Pick 'scale' such that (resync>>scale)*1000 will fit
	 * in a sector_t, and (max_sectors>>scale) will fit in a
@@ -7153,6 +7173,7 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
		   ((unsigned long)rt % 60)/6);

	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
	return 1;
}

static void *md_seq_start(struct seq_file *seq, loff_t *pos)
@@ -7298,13 +7319,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
			mddev->pers->status(seq, mddev);
			seq_printf(seq, "\n      ");
			if (mddev->pers->sync_request) {
				if (mddev->curr_resync > 2) {
					status_resync(seq, mddev);
				if (status_resync(seq, mddev))
					seq_printf(seq, "\n      ");
				} else if (mddev->curr_resync >= 1)
					seq_printf(seq, "\tresync=DELAYED\n      ");
				else if (mddev->recovery_cp < MaxSector)
					seq_printf(seq, "\tresync=PENDING\n      ");
			}
		} else
			seq_printf(seq, "\n       ");
@@ -7387,15 +7403,19 @@ int unregister_md_personality(struct md_personality *p)
}
EXPORT_SYMBOL(unregister_md_personality);

int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
int register_md_cluster_operations(struct md_cluster_operations *ops,
				   struct module *module)
{
	if (md_cluster_ops != NULL)
		return -EALREADY;
	int ret = 0;
	spin_lock(&pers_lock);
	if (md_cluster_ops != NULL)
		ret = -EALREADY;
	else {
		md_cluster_ops = ops;
		md_cluster_mod = module;
	}
	spin_unlock(&pers_lock);
	return 0;
	return ret;
}
EXPORT_SYMBOL(register_md_cluster_operations);

@@ -7793,7 +7813,8 @@ void md_do_sync(struct md_thread *thread)
		      > (max_sectors >> 4)) ||
		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed
		     >= mddev->resync_max - mddev->curr_resync_completed ||
		     mddev->curr_resync_completed > mddev->resync_max
			    )) {
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
@@ -7838,6 +7859,9 @@ void md_do_sync(struct md_thread *thread)
			break;

		j += sectors;
		if (j > max_sectors)
			/* when skipping, extra large numbers can be returned. */
			j = max_sectors;
		if (j > 2)
			mddev->curr_resync = j;
		if (mddev_is_clustered(mddev))
@@ -7906,12 +7930,15 @@ void md_do_sync(struct md_thread *thread)
	blk_finish_plug(&plug);
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    mddev->curr_resync > 2) {
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	}
	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, &skipped);

	if (mddev_is_clustered(mddev))
		md_cluster_ops->resync_finish(mddev);

	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > 2) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -7945,6 +7972,9 @@ void md_do_sync(struct md_thread *thread)
		}
	}
 skip:
	if (mddev_is_clustered(mddev))
		md_cluster_ops->resync_finish(mddev);

	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	spin_lock(&mddev->lock);
@@ -7955,11 +7985,11 @@ void md_do_sync(struct md_thread *thread)
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		mddev->resync_min = mddev->curr_resync_completed;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	mddev->curr_resync = 0;
	spin_unlock(&mddev->lock);

	wake_up(&resync_wait);
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	return;
}
@@ -8128,6 +8158,7 @@ void md_check_recovery(struct mddev *mddev)
			 */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
@@ -8574,6 +8605,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
		/* Make sure they get written out promptly */
		sysfs_notify_dirent_safe(rdev->sysfs_state);
		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
		set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
		md_wakeup_thread(rdev->mddev->thread);
	}
	return rv;
+39 −36
Original line number Diff line number Diff line
@@ -83,7 +83,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
	char b[BDEVNAME_SIZE];
	char b2[BDEVNAME_SIZE];
	struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
	bool discard_supported = false;
	unsigned short blksize = 512;

	if (!conf)
		return -ENOMEM;
@@ -98,6 +98,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
		sector_div(sectors, mddev->chunk_sectors);
		rdev1->sectors = sectors * mddev->chunk_sectors;

		blksize = max(blksize, queue_logical_block_size(
				      rdev1->bdev->bd_disk->queue));

		rdev_for_each(rdev2, mddev) {
			pr_debug("md/raid0:%s:   comparing %s(%llu)"
				 " with %s(%llu)\n",
@@ -134,6 +137,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
	}
	pr_debug("md/raid0:%s: FINAL %d zones\n",
		 mdname(mddev), conf->nr_strip_zones);
	/*
	 * now since we have the hard sector sizes, we can make sure
	 * chunk size is a multiple of that sector size
	 */
	if ((mddev->chunk_sectors << 9) % blksize) {
		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
		       mdname(mddev),
		       mddev->chunk_sectors << 9, blksize);
		err = -EINVAL;
		goto abort;
	}

	err = -ENOMEM;
	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
				conf->nr_strip_zones, GFP_KERNEL);
@@ -188,16 +203,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
		}
		dev[j] = rdev1;

		if (mddev->queue)
			disk_stack_limits(mddev->gendisk, rdev1->bdev,
					  rdev1->data_offset << 9);

		if (!smallest || (rdev1->sectors < smallest->sectors))
			smallest = rdev1;
		cnt++;

		if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
			discard_supported = true;
	}
	if (cnt != mddev->raid_disks) {
		printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -258,28 +266,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
			 (unsigned long long)smallest->sectors);
	}

	/*
	 * now since we have the hard sector sizes, we can make sure
	 * chunk size is a multiple of that sector size
	 */
	if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
		       mdname(mddev),
		       mddev->chunk_sectors << 9);
		goto abort;
	}

	if (mddev->queue) {
		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
		blk_queue_io_opt(mddev->queue,
				 (mddev->chunk_sectors << 9) * mddev->raid_disks);

		if (!discard_supported)
			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
		else
			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
	}

	pr_debug("md/raid0:%s: done.\n", mdname(mddev));
	*private_conf = conf;

@@ -378,12 +364,6 @@ static int raid0_run(struct mddev *mddev)
	if (md_check_no_bitmap(mddev))
		return -EINVAL;

	if (mddev->queue) {
		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
	}

	/* if private is not null, we are here after takeover */
	if (mddev->private == NULL) {
		ret = create_strip_zones(mddev, &conf);
@@ -392,6 +372,29 @@ static int raid0_run(struct mddev *mddev)
		mddev->private = conf;
	}
	conf = mddev->private;
	if (mddev->queue) {
		struct md_rdev *rdev;
		bool discard_supported = false;

		rdev_for_each(rdev, mddev) {
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->data_offset << 9);
			if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
				discard_supported = true;
		}
		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);

		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
		blk_queue_io_opt(mddev->queue,
				 (mddev->chunk_sectors << 9) * mddev->raid_disks);

		if (!discard_supported)
			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
		else
			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
	}

	/* calculate array device size */
	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
+29 −1
Original line number Diff line number Diff line
@@ -1474,6 +1474,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
	 */
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	set_bit(MD_CHANGE_PENDING, &mddev->flags);
	printk(KERN_ALERT
	       "md/raid1:%s: Disk failure on %s, disabling device.\n"
	       "md/raid1:%s: Operation continuing on %d devices.\n",
@@ -2235,6 +2236,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
	int m;
	bool fail = false;
	for (m = 0; m < conf->raid_disks * 2 ; m++)
		if (r1_bio->bios[m] == IO_MADE_GOOD) {
			struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2247,6 +2249,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
			 * narrow down and record precise write
			 * errors.
			 */
			fail = true;
			if (!narrow_write_error(r1_bio, m)) {
				md_error(conf->mddev,
					 conf->mirrors[m].rdev);
@@ -2258,6 +2261,12 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
		}
	if (test_bit(R1BIO_WriteError, &r1_bio->state))
		close_write(r1_bio);
	if (fail) {
		spin_lock_irq(&conf->device_lock);
		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
		spin_unlock_irq(&conf->device_lock);
		md_wakeup_thread(conf->mddev->thread);
	} else
		raid_end_bio_io(r1_bio);
}

@@ -2364,6 +2373,23 @@ static void raid1d(struct md_thread *thread)

	md_check_recovery(mddev);

	if (!list_empty_careful(&conf->bio_end_io_list) &&
	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
		LIST_HEAD(tmp);
		spin_lock_irqsave(&conf->device_lock, flags);
		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
			list_add(&tmp, &conf->bio_end_io_list);
			list_del_init(&conf->bio_end_io_list);
		}
		spin_unlock_irqrestore(&conf->device_lock, flags);
		while (!list_empty(&tmp)) {
			r1_bio = list_first_entry(&conf->bio_end_io_list,
						  struct r1bio, retry_list);
			list_del(&r1_bio->retry_list);
			raid_end_bio_io(r1_bio);
		}
	}

	blk_start_plug(&plug);
	for (;;) {

@@ -2763,6 +2789,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
	conf->raid_disks = mddev->raid_disks;
	conf->mddev = mddev;
	INIT_LIST_HEAD(&conf->retry_list);
	INIT_LIST_HEAD(&conf->bio_end_io_list);

	spin_lock_init(&conf->resync_lock);
	init_waitqueue_head(&conf->wait_barrier);
@@ -3057,6 +3084,7 @@ static int raid1_reshape(struct mddev *mddev)

	unfreeze_array(conf);

	set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

Loading