Commit 4fd1ffaa authored by Jens Axboe

Merge branch 'for-jens' of git://git.drbd.org/linux-drbd into for-3.5/drivers

Philipp writes:

These are the updates we have in the drbd-8.3 tree. They are intended
for your "for-3.5/drivers" branch.

These changes include one new feature:
 * Allow detaching from frozen backing devices with the new --force option,
   and a configurable timeout for backing devices via the new disk-timeout
   option (see the condensed kernel-side sketch just below)
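
   In kernel terms, the forced detach boils down to failing the disk right
   away, which also wakes anyone waiting on a frozen backing device;
   condensed from the drbd_nl_detach() hunk near the end of this diff
   (a sketch, not a verbatim excerpt):

	if (dt.detach_force) {
		/* --force: fail the disk immediately, even if backing IO is frozen */
		drbd_force_state(mdev, NS(disk, D_FAILED));
		reply->ret_code = SS_SUCCESS;
		goto out;
	}
	drbd_suspend_io(mdev);     /* so no-one is stuck in drbd_al_begin_io */
	drbd_md_get_buffer(mdev);  /* make sure there is no in-flight meta-data IO */
	retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
	drbd_md_put_buffer(mdev);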

And a large number of bug fixes:
 * Fixed a write ordering problem on SyncTarget nodes for a write
   to a block that gets resynced at the same time. The bug can
   only be triggered with a device whose firmware actually
   reorders writes to the same block
 * Fixed a race between disconnect and receive_state that could cause
   an IO lockup
 * Fixed resend/resubmit for requests with disk or network timeout
 * Make sure that hard state changes (e.g. a detach due to an IO error)
   do not disturb the connection establishment process. When the bug
   was triggered it caused a retry in the connect process
 * Postpone soft state changes (e.g. becoming primary) so they do not
   disturb the connection establishment process. When the bug was
   triggered it could cause both nodes to go into SyncSource state
 * Fixed a refcount leak that could cause failures when trying to
   unload a protocol family module that was used by DRBD
 * Added a dedicated page pool for meta-data IOs
 * Deny a normal detach (as opposed to --force) if the user tries
   to detach the last UpToDate disk in the resource
 * Fixed a possible protocol error that could be caused by
   "unusual" BIOs.
 * Enforce the disk-timeout option also on meta-data IO operations
   (see the usage sketch after this list)
 * Implemented stable bitmap pages when we do a full write out of
   the bitmap
 * Fixed a rare compatibility issue with DRBD versions older than 8.3.7
   when negotiating the bio_size
 * Fixed a rare race condition where an empty resync could stall
   if pause/unpause events happen in parallel
 * Made re-establishing connections quicker after a broken pipe.
   Previously a bug in the code caused it to waste the first successfully
   established connection after a broken pipe event.
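
For reference, the exclusive meta-data IO buffer that replaces md_io_mutex
in this series is used by callers roughly like this (a condensed sketch of
the pattern visible in the drbd_actlog.c hunks below; the drbd_md_sync_page_io()
call in the middle is assumed, since not every hunk is shown):

	buffer = drbd_md_get_buffer(mdev);	/* NULL if the disk failed while waiting */
	if (!buffer)
		return 0;			/* bail out, nothing to clean up yet */
	/* ... prepare the meta-data / activity-log transaction in 'buffer' ... */
	drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE);	/* bounded by disk-timeout */
	drbd_md_put_buffer(mdev);		/* wakes up the next waiter */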

PS: I am postponing drbd-8.4 for mainline for one or two more kernel
    development cycles (the ~400-patch set).
parents 13828dec 92b4ca29
drivers/block/drbd/drbd_actlog.c  +78 −26
@@ -65,39 +65,80 @@ struct drbd_atodb_wait {

int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);

void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
	int r;

	wait_event(mdev->misc_wait,
		   (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
		   mdev->state.disk <= D_FAILED);

	return r ? NULL : page_address(mdev->md_io_page);
}

void drbd_md_put_buffer(struct drbd_conf *mdev)
{
	if (atomic_dec_and_test(&mdev->md_io_in_use))
		wake_up(&mdev->misc_wait);
}

static bool md_io_allowed(struct drbd_conf *mdev)
{
	enum drbd_disk_state ds = mdev->state.disk;
	return ds >= D_NEGOTIATING || ds == D_ATTACHING;
}

void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
				     unsigned int *done)
{
	long dt = bdev->dc.disk_timeout * HZ / 10;
	if (dt == 0)
		dt = MAX_SCHEDULE_TIMEOUT;

	dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt);
	if (dt == 0)
		dev_err(DEV, "meta-data IO operation timed out\n");
}

static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
				 struct drbd_backing_dev *bdev,
				 struct page *page, sector_t sector,
				 int rw, int size)
{
	struct bio *bio;
	struct drbd_md_io md_io;
	int ok;

	md_io.mdev = mdev;
	init_completion(&md_io.event);
	md_io.error = 0;
	mdev->md_io.done = 0;
	mdev->md_io.error = -ENODEV;

	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
		rw |= REQ_FUA | REQ_FLUSH;
	rw |= REQ_SYNC;

	bio = bio_alloc(GFP_NOIO, 1);
	bio = bio_alloc_drbd(GFP_NOIO);
	bio->bi_bdev = bdev->md_bdev;
	bio->bi_sector = sector;
	ok = (bio_add_page(bio, page, size, 0) == size);
	if (!ok)
		goto out;
	bio->bi_private = &md_io;
	bio->bi_private = &mdev->md_io;
	bio->bi_end_io = drbd_md_io_complete;
	bio->bi_rw = rw;

	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* Corresponding put_ldev in drbd_md_io_complete() */
		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
		ok = 0;
		goto out;
	}

	bio_get(bio); /* one bio_put() is in the completion handler */
	atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
		bio_endio(bio, -EIO);
	else
		submit_bio(rw, bio);
	wait_for_completion(&md_io.event);
	ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
	wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done);
	ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;

 out:
	bio_put(bio);
@@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
	int offset = 0;
	struct page *iop = mdev->md_io_page;

	D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
	D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);

	BUG_ON(!bdev->md_bdev);

@@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
		return 1;
	}

	mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
	buffer = (struct al_transaction *)page_address(mdev->md_io_page);
	buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
	if (!buffer) {
		dev_err(DEV, "disk failed while waiting for md_io buffer\n");
		complete(&((struct update_al_work *)w)->event);
		put_ldev(mdev);
		return 1;
	}

	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
@@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
	D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
	mdev->al_tr_number++;

	mutex_unlock(&mdev->md_io_mutex);
	drbd_md_put_buffer(mdev);

	complete(&((struct update_al_work *)w)->event);
	put_ldev(mdev);
@@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
	/* lock out all other meta data io for now,
	 * and make sure the page is mapped.
	 */
	mutex_lock(&mdev->md_io_mutex);
	buffer = page_address(mdev->md_io_page);
	buffer = drbd_md_get_buffer(mdev);
	if (!buffer)
		return 0;

	/* Find the valid transaction in the log */
	for (i = 0; i <= mx; i++) {
@@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
		if (rv == 0)
			continue;
		if (rv == -1) {
			mutex_unlock(&mdev->md_io_mutex);
			drbd_md_put_buffer(mdev);
			return 0;
		}
		cnr = be32_to_cpu(buffer->tr_number);
@@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)

	if (!found_valid) {
		dev_warn(DEV, "No usable activity log found.\n");
		mutex_unlock(&mdev->md_io_mutex);
		drbd_md_put_buffer(mdev);
		return 1;
	}

@@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
		ERR_IF(rv == 0) goto cancel;
		if (rv == -1) {
			mutex_unlock(&mdev->md_io_mutex);
			drbd_md_put_buffer(mdev);
			return 0;
		}

@@ -534,7 +581,7 @@ cancel:
		mdev->al_tr_pos = 0;

	/* ok, we are done with it */
	mutex_unlock(&mdev->md_io_mutex);
	drbd_md_put_buffer(mdev);

	dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
	     transactions, active_extents);
@@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
			else
				ext->rs_failed += count;
			if (ext->rs_left < ext->rs_failed) {
				dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
				    "rs_failed=%d count=%d\n",
				dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
				    "rs_failed=%d count=%d cstate=%s\n",
				     (unsigned long long)sector,
				     ext->lce.lc_number, ext->rs_left,
				     ext->rs_failed, count);
				dump_stack();
				     ext->rs_failed, count,
				     drbd_conn_str(mdev->state.conn));

				lc_put(mdev->resync, &ext->lce);
				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
				return;
				/* We don't expect to be able to clear more bits
				 * than have been set when we originally counted
				 * the set bits to cache that value in ext->rs_left.
				 * Whatever the reason (disconnect during resync,
				 * delayed local completion of an application write),
				 * try to fix it up by recounting here. */
				ext->rs_left = drbd_bm_e_weight(mdev, enr);
			}
		} else {
			/* Normally this element should be in the cache,
@@ -1192,6 +1243,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev)
		put_ldev(mdev);
	}
	spin_unlock_irq(&mdev->al_lock);
	wake_up(&mdev->al_wait);

	return 0;
}
drivers/block/drbd/drbd_bitmap.c  +107 −39
@@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
static void bm_store_page_idx(struct page *page, unsigned long idx)
{
	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
	page_private(page) |= idx;
	set_page_private(page, idx);
}

static unsigned long bm_page_to_idx(struct page *page)
@@ -886,12 +886,21 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
struct bm_aio_ctx {
	struct drbd_conf *mdev;
	atomic_t in_flight;
	struct completion done;
	unsigned int done;
	unsigned flags;
#define BM_AIO_COPY_PAGES	1
	int error;
	struct kref kref;
};

static void bm_aio_ctx_destroy(struct kref *kref)
{
	struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);

	put_ldev(ctx->mdev);
	kfree(ctx);
}

/* bv_page may be a copy, or may be the original */
static void bm_async_io_complete(struct bio *bio, int error)
{
@@ -930,20 +939,21 @@ static void bm_async_io_complete(struct bio *bio, int error)

	bm_page_unlock_io(mdev, idx);

	/* FIXME give back to page pool */
	if (ctx->flags & BM_AIO_COPY_PAGES)
		put_page(bio->bi_io_vec[0].bv_page);
		mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);

	bio_put(bio);

	if (atomic_dec_and_test(&ctx->in_flight))
		complete(&ctx->done);
	if (atomic_dec_and_test(&ctx->in_flight)) {
		ctx->done = 1;
		wake_up(&mdev->misc_wait);
		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
	}
}

static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
{
	/* we are process context. we always get a bio */
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
	struct bio *bio = bio_alloc_drbd(GFP_NOIO);
	struct drbd_conf *mdev = ctx->mdev;
	struct drbd_bitmap *b = mdev->bitmap;
	struct page *page;
@@ -966,10 +976,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
	bm_set_page_unchanged(b->bm_pages[page_nr]);

	if (ctx->flags & BM_AIO_COPY_PAGES) {
		/* FIXME alloc_page is good enough for now, but actually needs
		 * to use pre-allocated page pool */
		void *src, *dest;
		page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
		page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
		dest = kmap_atomic(page);
		src = kmap_atomic(b->bm_pages[page_nr]);
		memcpy(dest, src, PAGE_SIZE);
@@ -981,6 +989,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must

	bio->bi_bdev = mdev->ldev->md_bdev;
	bio->bi_sector = on_disk_sector;
	/* bio_add_page of a single page to an empty bio will always succeed,
	 * according to api.  Do we want to assert that? */
	bio_add_page(bio, page, len, 0);
	bio->bi_private = ctx;
	bio->bi_end_io = bm_async_io_complete;
@@ -999,14 +1009,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
/*
 * bm_rw: read/write the whole bitmap from/to its on disk location.
 */
static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
{
	struct bm_aio_ctx ctx = {
		.mdev = mdev,
		.in_flight = ATOMIC_INIT(1),
		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
		.flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
	};
	struct bm_aio_ctx *ctx;
	struct drbd_bitmap *b = mdev->bitmap;
	int num_pages, i, count = 0;
	unsigned long now;
@@ -1021,7 +1026,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
	 * as we submit copies of pages anyways.
	 */
	if (!ctx.flags)

	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
	if (!ctx)
		return -ENOMEM;

	*ctx = (struct bm_aio_ctx) {
		.mdev = mdev,
		.in_flight = ATOMIC_INIT(1),
		.done = 0,
		.flags = flags,
		.error = 0,
		.kref = { ATOMIC_INIT(2) },
	};

	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
		kfree(ctx);
		return -ENODEV;
	}

	if (!ctx->flags)
		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));

	num_pages = b->bm_number_of_pages;
@@ -1046,29 +1071,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
				continue;
			}
		}
		atomic_inc(&ctx.in_flight);
		bm_page_io_async(&ctx, i, rw);
		atomic_inc(&ctx->in_flight);
		bm_page_io_async(ctx, i, rw);
		++count;
		cond_resched();
	}

	/*
	 * We initialize ctx.in_flight to one to make sure bm_async_io_complete
	 * will not complete() early, and decrement / test it here.  If there
	 * We initialize ctx->in_flight to one to make sure bm_async_io_complete
	 * will not set ctx->done early, and decrement / test it here.  If there
	 * are still some bios in flight, we need to wait for them here.
	 * If all IO is done already (or nothing had been submitted), there is
	 * no need to wait.  Still, we need to put the kref associated with the
	 * "in_flight reached zero, all done" event.
	 */
	if (!atomic_dec_and_test(&ctx.in_flight))
		wait_for_completion(&ctx.done);
	if (!atomic_dec_and_test(&ctx->in_flight))
		wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
	else
		kref_put(&ctx->kref, &bm_aio_ctx_destroy);

	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
			rw == WRITE ? "WRITE" : "READ",
			count, jiffies - now);

	if (ctx.error) {
	if (ctx->error) {
		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
		drbd_chk_io_error(mdev, 1, true);
		err = -EIO; /* ctx.error ? */
		err = -EIO; /* ctx->error ? */
	}

	if (atomic_read(&ctx->in_flight))
		err = -EIO; /* Disk failed during IO... */

	now = jiffies;
	if (rw == WRITE) {
		drbd_md_flush(mdev);
@@ -1082,6 +1116,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
	     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);

	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
	return err;
}

@@ -1091,7 +1126,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 */
int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
{
	return bm_rw(mdev, READ, 0);
	return bm_rw(mdev, READ, 0, 0);
}

/**
@@ -1102,7 +1137,7 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
 */
int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
{
	return bm_rw(mdev, WRITE, 0);
	return bm_rw(mdev, WRITE, 0, 0);
}

/**
@@ -1112,7 +1147,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 */
int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
{
	return bm_rw(mdev, WRITE, upper_idx);
	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx);
}

/**
 * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
 * @mdev:	DRBD device.
 *
 * Will only write pages that have changed since last IO.
 * In contrast to drbd_bm_write(), this will copy the bitmap pages
 * to temporary writeout pages. It is intended to trigger a full write-out
 * while still allowing the bitmap to change, for example if a resync or online
 * verify is aborted due to a failed peer disk, while local IO continues, or
 * pending resync acks are still being processed.
 */
int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
{
	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
}


@@ -1130,28 +1181,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l
 */
int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
{
	struct bm_aio_ctx ctx = {
	struct bm_aio_ctx *ctx;
	int err;

	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
		return 0;
	}

	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
	if (!ctx)
		return -ENOMEM;

	*ctx = (struct bm_aio_ctx) {
		.mdev = mdev,
		.in_flight = ATOMIC_INIT(1),
		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
		.done = 0,
		.flags = BM_AIO_COPY_PAGES,
		.error = 0,
		.kref = { ATOMIC_INIT(2) },
	};

	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
		return 0;
	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
		kfree(ctx);
		return -ENODEV;
	}

	bm_page_io_async(&ctx, idx, WRITE_SYNC);
	wait_for_completion(&ctx.done);
	bm_page_io_async(ctx, idx, WRITE_SYNC);
	wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);

	if (ctx.error)
	if (ctx->error)
		drbd_chk_io_error(mdev, 1, true);
		/* that should force detach, so the in memory bitmap will be
		 * gone in a moment as well. */

	mdev->bm_writ_cnt++;
	return ctx.error;
	err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
	return err;
}

/* NOTE
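
Note on the bitmap IO context above: it changes from an on-stack struct with a
completion to a kref-counted heap object, so a waiter that gives up on a failed
disk cannot free memory that a still-running completion handler will touch.
Condensed from the bm_rw() hunks above (a sketch, not a verbatim excerpt):

	ctx = kmalloc(sizeof(*ctx), GFP_NOIO);
	*ctx = (struct bm_aio_ctx) {
		.mdev = mdev,
		.in_flight = ATOMIC_INIT(1),
		.kref = { ATOMIC_INIT(2) },	/* one ref for the waiter, one for the
						 * "in_flight reached zero" event */
		/* ... */
	};
	/* each bio completion decrements in_flight; the last one sets ctx->done,
	 * wakes mdev->misc_wait and drops the event reference */
	if (!atomic_dec_and_test(&ctx->in_flight))
		wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
	else
		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
	/* ... */
	kref_put(&ctx->kref, &bm_aio_ctx_destroy);	/* the waiter's own reference;
							 * ctx is freed only once both
							 * references are gone */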
drivers/block/drbd/drbd_int.h  +55 −35
@@ -712,7 +712,6 @@ struct drbd_request {
	struct list_head tl_requests; /* ring list in the transfer log */
	struct bio *master_bio;       /* master bio pointer */
	unsigned long rq_state; /* see comments above _req_mod() */
	int seq_num;
	unsigned long start_time;
};

@@ -851,6 +850,7 @@ enum {
	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
	AL_SUSPENDED,		/* Activity logging is currently suspended. */
	AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
	STATE_SENT,		/* Do not change state/UUIDs while this is set */
};

struct drbd_bitmap; /* opaque for drbd_conf */
@@ -862,31 +862,30 @@ enum bm_flag {
	BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */

	/* currently locked for bulk operation */
	BM_LOCKED_MASK = 0x7,
	BM_LOCKED_MASK = 0xf,

	/* in detail, that is: */
	BM_DONT_CLEAR = 0x1,
	BM_DONT_SET   = 0x2,
	BM_DONT_TEST  = 0x4,

	/* so we can mark it locked for bulk operation,
	 * and still allow all non-bulk operations */
	BM_IS_LOCKED  = 0x8,

	/* (test bit, count bit) allowed (common case) */
	BM_LOCKED_TEST_ALLOWED = 0x3,
	BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,

	/* testing bits, as well as setting new bits allowed, but clearing bits
	 * would be unexpected.  Used during bitmap receive.  Setting new bits
	 * requires sending of "out-of-sync" information, though. */
	BM_LOCKED_SET_ALLOWED = 0x1,
	BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,

	/* clear is not expected while bitmap is locked for bulk operation */
	/* for drbd_bm_write_copy_pages, everything is allowed,
	 * only concurrent bulk operations are locked out. */
	BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
};


/* TODO sort members for performance
 * MAYBE group them further */

/* THINK maybe we actually want to use the default "event/%s" worker threads
 * or similar in linux 2.6, which uses per cpu data and threads.
 */
struct drbd_work_queue {
	struct list_head q;
	struct semaphore s; /* producers up it, worker down()s it */
@@ -938,8 +937,7 @@ struct drbd_backing_dev {
};

struct drbd_md_io {
	struct drbd_conf *mdev;
	struct completion event;
	unsigned int done;
	int error;
};

@@ -1022,6 +1020,7 @@ struct drbd_conf {
	struct drbd_tl_epoch *newest_tle;
	struct drbd_tl_epoch *oldest_tle;
	struct list_head out_of_sequence_requests;
	struct list_head barrier_acked_requests;
	struct hlist_head *tl_hash;
	unsigned int tl_hash_s;

@@ -1056,6 +1055,8 @@ struct drbd_conf {
	struct crypto_hash *csums_tfm;
	struct crypto_hash *verify_tfm;

	unsigned long last_reattach_jif;
	unsigned long last_reconnect_jif;
	struct drbd_thread receiver;
	struct drbd_thread worker;
	struct drbd_thread asender;
@@ -1094,7 +1095,8 @@ struct drbd_conf {
	wait_queue_head_t ee_wait;
	struct page *md_io_page;	/* one page buffer for md_io */
	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
	struct mutex md_io_mutex;	/* protects the md_io_buffer */
	struct drbd_md_io md_io;
	atomic_t md_io_in_use;		/* protects the md_io, md_io_page and md_io_tmpp */
	spinlock_t al_lock;
	wait_queue_head_t al_wait;
	struct lru_cache *act_log;	/* activity log */
@@ -1228,8 +1230,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev);
extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
extern int _drbd_send_state(struct drbd_conf *mdev);
extern int drbd_send_state(struct drbd_conf *mdev);
extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
extern int drbd_send_current_state(struct drbd_conf *mdev);
extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
			enum drbd_packets cmd, struct p_header80 *h,
			size_t size, unsigned msg_flags);
@@ -1461,6 +1463,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
extern int  drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
		unsigned long al_enr);
extern size_t	     drbd_bm_words(struct drbd_conf *mdev);
@@ -1493,11 +1496,38 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t *drbd_request_mempool;
extern mempool_t *drbd_ee_mempool;

extern struct page *drbd_pp_pool; /* drbd's page pool */
/* drbd's page pool, used to buffer data received from the peer,
 * or data requested by the peer.
 *
 * This does not have an emergency reserve.
 *
 * When allocating from this pool, it first takes pages from the pool.
 * Only if the pool is depleted will try to allocate from the system.
 *
 * The assumption is that pages taken from this pool will be processed,
 * and given back, "quickly", and then can be recycled, so we can avoid
 * frequent calls to alloc_page(), and still will be able to make progress even
 * under memory pressure.
 */
extern struct page *drbd_pp_pool;
extern spinlock_t   drbd_pp_lock;
extern int	    drbd_pp_vacant;
extern wait_queue_head_t drbd_pp_wait;

/* We also need a standard (emergency-reserve backed) page pool
 * for meta data IO (activity log, bitmap).
 * We can keep it global, as long as it is used as "N pages at a time".
 * 128 should be plenty, currently we probably can get away with as few as 1.
 */
#define DRBD_MIN_POOL_PAGES	128
extern mempool_t *drbd_md_io_page_pool;

/* We also need to make sure we get a bio
 * when we need it for housekeeping purposes */
extern struct bio_set *drbd_md_io_bio_set;
/* to allocate from that set */
extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);

extern rwlock_t global_state_lock;

extern struct drbd_conf *drbd_new_device(unsigned int minor);
@@ -1536,8 +1566,12 @@ extern void resume_next_sg(struct drbd_conf *mdev);
extern void suspend_other_sg(struct drbd_conf *mdev);
extern int drbd_resync_finished(struct drbd_conf *mdev);
/* maybe rather drbd_main.c ? */
extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
extern void drbd_md_put_buffer(struct drbd_conf *mdev);
extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
				struct drbd_backing_dev *bdev, sector_t sector, int rw);
extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
					    unsigned int *done);
extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
extern void drbd_rs_controller_reset(struct drbd_conf *mdev);

@@ -1754,19 +1788,6 @@ static inline struct page *page_chain_next(struct page *page)
#define page_chain_for_each_safe(page, n) \
	for (; page && ({ n = page_chain_next(page); 1; }); page = n)

static inline int drbd_bio_has_active_page(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	__bio_for_each_segment(bvec, bio, i, 0) {
		if (page_count(bvec->bv_page) > 1)
			return 1;
	}

	return 0;
}

static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
{
	struct page *page = e->pages;
@@ -1777,7 +1798,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
	return 0;
}


static inline void drbd_state_lock(struct drbd_conf *mdev)
{
	wait_event(mdev->misc_wait,
@@ -2230,7 +2250,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
		 * Note: currently we don't support such large bitmaps on 32bit
		 * arch anyways, but no harm done to be prepared for it here.
		 */
		unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10;
		unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10;
		unsigned long left = *bits_left >> shift;
		unsigned long total = 1UL + (mdev->rs_total >> shift);
		unsigned long tmp = 1000UL - left * 1000UL/total;
@@ -2306,12 +2326,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
	case D_OUTDATED:
	case D_CONSISTENT:
	case D_UP_TO_DATE:
	case D_FAILED:
		/* disk state is stable as well. */
		break;

	/* no new io accepted during tansitional states */
	case D_ATTACHING:
	case D_FAILED:
	case D_NEGOTIATING:
	case D_UNKNOWN:
	case D_MASK:
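
The declarations above reference drbd_md_io_page_pool and bio_alloc_drbd();
their setup is not part of the hunks shown in this preview. A minimal sketch
of what such setup typically looks like, assuming the standard mempool/bioset
APIs of this kernel generation (not a verbatim excerpt):

	/* assumed setup sketch; the real code is not shown in this preview */
	drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
	drbd_md_io_bio_set   = bioset_create(DRBD_MIN_POOL_PAGES, 0);

	struct bio *bio_alloc_drbd(gfp_t gfp_mask)
	{
		if (!drbd_md_io_bio_set)
			return bio_alloc(gfp_mask, 1);	/* assumed fallback */
		return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
	}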
+269 −88  (file changed; preview size limit exceeded, so its changes are collapsed and not shown here)

drivers/block/drbd/drbd_nl.c  +38 −10
@@ -289,7 +289,7 @@ static int _try_outdate_peer_async(void *data)
	*/
	spin_lock_irq(&mdev->req_lock);
	ns = mdev->state;
	if (ns.conn < C_WF_REPORT_PARAMS) {
	if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) {
		ns.pdsk = nps;
		_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	}
@@ -432,7 +432,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
		/* if this was forced, we should consider sync */
		if (forced)
			drbd_send_uuids(mdev);
		drbd_send_state(mdev);
		drbd_send_current_state(mdev);
	}

	drbd_md_sync(mdev);
@@ -845,9 +845,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
	   Because new from 8.3.8 onwards the peer can use multiple
	   BIOs for a single peer_request */
	if (mdev->state.conn >= C_CONNECTED) {
		if (mdev->agreed_pro_version < 94)
			peer = mdev->peer_max_bio_size;
		else if (mdev->agreed_pro_version == 94)
		if (mdev->agreed_pro_version < 94) {
			peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
		} else if (mdev->agreed_pro_version == 94)
			peer = DRBD_MAX_SIZE_H80_PACKET;
		else /* drbd 8.3.8 onwards */
			peer = DRBD_MAX_BIO_SIZE;
@@ -1032,7 +1033,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
			(unsigned long long) drbd_get_max_capacity(nbc),
			(unsigned long long) nbc->dc.disk_size);
		retcode = ERR_DISK_TO_SMALL;
		retcode = ERR_DISK_TOO_SMALL;
		goto fail;
	}

@@ -1046,7 +1047,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
	}

	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
		retcode = ERR_MD_DISK_TO_SMALL;
		retcode = ERR_MD_DISK_TOO_SMALL;
		dev_warn(DEV, "refusing attach: md-device too small, "
		     "at least %llu sectors needed for this meta-disk type\n",
		     (unsigned long long) min_md_device_sectors);
@@ -1057,7 +1058,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
	 * (we may currently be R_PRIMARY with no local disk...) */
	if (drbd_get_max_capacity(nbc) <
	    drbd_get_capacity(mdev->this_bdev)) {
		retcode = ERR_DISK_TO_SMALL;
		retcode = ERR_DISK_TOO_SMALL;
		goto fail;
	}

@@ -1138,7 +1139,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
	    drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
		dev_warn(DEV, "refusing to truncate a consistent device\n");
		retcode = ERR_DISK_TO_SMALL;
		retcode = ERR_DISK_TOO_SMALL;
		goto force_diskless_dec;
	}

@@ -1336,17 +1337,34 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
{
	enum drbd_ret_code retcode;
	int ret;
	struct detach dt = {};

	if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
		reply->ret_code = ERR_MANDATORY_TAG;
		goto out;
	}

	if (dt.detach_force) {
		drbd_force_state(mdev, NS(disk, D_FAILED));
		reply->ret_code = SS_SUCCESS;
		goto out;
	}

	drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
	drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */
	retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
	drbd_md_put_buffer(mdev);
	/* D_FAILED will transition to DISKLESS. */
	ret = wait_event_interruptible(mdev->misc_wait,
			mdev->state.disk != D_FAILED);
	drbd_resume_io(mdev);

	if ((int)retcode == (int)SS_IS_DISKLESS)
		retcode = SS_NOTHING_TO_DO;
	if (ret)
		retcode = ERR_INTR;
	reply->ret_code = retcode;
out:
	return 0;
}

@@ -1711,7 +1729,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,

	if (rs.no_resync && mdev->agreed_pro_version < 93) {
		retcode = ERR_NEED_APV_93;
		goto fail;
		goto fail_ldev;
	}

	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
@@ -1738,6 +1756,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 fail:
	reply->ret_code = retcode;
	return 0;

 fail_ldev:
	put_ldev(mdev);
	goto fail;
}

static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -1941,6 +1963,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl

	/* If there is still bitmap IO pending, probably because of a previous
	 * resync just being finished, wait for it before requesting a new resync. */
	drbd_suspend_io(mdev);
	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));

	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
@@ -1959,6 +1982,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl

		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
	}
	drbd_resume_io(mdev);

	reply->ret_code = retcode;
	return 0;
@@ -1980,6 +2004,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re

	/* If there is still bitmap IO pending, probably because of a previous
	 * resync just being finished, wait for it before requesting a new resync. */
	drbd_suspend_io(mdev);
	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));

	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
@@ -1998,6 +2023,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
		} else
			retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
	}
	drbd_resume_io(mdev);

	reply->ret_code = retcode;
	return 0;
@@ -2170,11 +2196,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,

	/* If there is still bitmap IO pending, e.g. previous resync or verify
	 * just being finished, wait for it before requesting a new resync. */
	drbd_suspend_io(mdev);
	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));

	/* w_make_ov_request expects position to be aligned */
	mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
	reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
	drbd_resume_io(mdev);
	return 0;
}
