Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1484d4ff authored by Joe Thornber's avatar Joe Thornber Committed by Greg Kroah-Hartman
Browse files

dm thin metadata: try to avoid ever aborting transactions



[ Upstream commit 3ab91828166895600efd9cdc3a0eb32001f7204a ]

Committing a transaction can consume some metadata of it's own, we now
reserve a small amount of metadata to cover this.  Free metadata
reported by the kernel will not include this reserve.

If any of the reserve has been used after a commit we enter a new
internal state PM_OUT_OF_METADATA_SPACE.  This is reported as
PM_READ_ONLY, so no userland changes are needed.  If the metadata
device is resized the pool will move back to PM_WRITE.

These changes mean we never need to abort and rollback a transaction due
to running out of metadata space.  This is particularly important
because there have been a handful of reports of data corruption against
DM thin-provisioning that can all be attributed to the thin-pool having
ran out of metadata space.

Signed-off-by: default avatarJoe Thornber <ejt@redhat.com>
Signed-off-by: default avatarMike Snitzer <snitzer@redhat.com>
Signed-off-by: default avatarSasha Levin <alexander.levin@microsoft.com>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 1e9054e7
Loading
Loading
Loading
Loading
+35 −1
Original line number Diff line number Diff line
@@ -188,6 +188,12 @@ struct dm_pool_metadata {
	unsigned long flags;
	sector_t data_block_size;

	/*
	 * We reserve a section of the metadata for commit overhead.
	 * All reported space does *not* include this.
	 */
	dm_block_t metadata_reserve;

	/*
	 * Set if a transaction has to be aborted but the attempt to roll back
	 * to the previous (good) transaction failed.  The only pool metadata
@@ -825,6 +831,22 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
	return dm_tm_commit(pmd->tm, sblock);
}

static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
{
	int r;
	dm_block_t total;
	dm_block_t max_blocks = 4096; /* 16M */

	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
	if (r) {
		DMERR("could not get size of metadata device");
		pmd->metadata_reserve = max_blocks;
	} else {
		sector_div(total, 10);
		pmd->metadata_reserve = min(max_blocks, total);
	}
}

struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
					       sector_t data_block_size,
					       bool format_device)
@@ -858,6 +880,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
		return ERR_PTR(r);
	}

	__set_metadata_reserve(pmd);

	return pmd;
}

@@ -1829,6 +1853,13 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->metadata_sm, result);

	if (!r) {
		if (*result < pmd->metadata_reserve)
			*result = 0;
		else
			*result -= pmd->metadata_reserve;
	}
	up_read(&pmd->root_lock);

	return r;
@@ -1941,8 +1972,11 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
	int r = -EINVAL;

	down_write(&pmd->root_lock);
	if (!pmd->fail_io)
	if (!pmd->fail_io) {
		r = __resize_space_map(pmd->metadata_sm, new_count);
		if (!r)
			__set_metadata_reserve(pmd);
	}
	up_write(&pmd->root_lock);

	return r;
+65 −8
Original line number Diff line number Diff line
@@ -200,7 +200,13 @@ struct dm_thin_new_mapping;
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */

	/*
	 * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
	 */
	PM_OUT_OF_METADATA_SPACE,
	PM_READ_ONLY,		/* metadata may not be changed */

	PM_FAIL,		/* all I/O fails */
};

@@ -1382,7 +1388,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);

static void requeue_bios(struct pool *pool);

static void check_for_space(struct pool *pool)
static bool is_read_only_pool_mode(enum pool_mode mode)
{
	return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
}

static bool is_read_only(struct pool *pool)
{
	return is_read_only_pool_mode(get_pool_mode(pool));
}

static void check_for_metadata_space(struct pool *pool)
{
	int r;
	const char *ooms_reason = NULL;
	dm_block_t nr_free;

	r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
	if (r)
		ooms_reason = "Could not get free metadata blocks";
	else if (!nr_free)
		ooms_reason = "No free metadata blocks";

	if (ooms_reason && !is_read_only(pool)) {
		DMERR("%s", ooms_reason);
		set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
	}
}

static void check_for_data_space(struct pool *pool)
{
	int r;
	dm_block_t nr_free;
@@ -1408,14 +1442,16 @@ static int commit(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) >= PM_READ_ONLY)
	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
		return -EINVAL;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
	else
		check_for_space(pool);
	else {
		check_for_metadata_space(pool);
		check_for_data_space(pool);
	}

	return r;
}
@@ -1481,6 +1517,19 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
		return r;
	}

	r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
	if (r) {
		metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
		return r;
	}

	if (!free_blocks) {
		/* Let's commit before we use up the metadata reserve. */
		r = commit(pool);
		if (r)
			return r;
	}

	return 0;
}

@@ -1512,6 +1561,7 @@ static blk_status_t should_error_unserviceable_bio(struct pool *pool)
	case PM_OUT_OF_DATA_SPACE:
		return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;

	case PM_OUT_OF_METADATA_SPACE:
	case PM_READ_ONLY:
	case PM_FAIL:
		return BLK_STS_IOERR;
@@ -2475,8 +2525,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
		error_retry_list(pool);
		break;

	case PM_OUT_OF_METADATA_SPACE:
	case PM_READ_ONLY:
		if (old_mode != new_mode)
		if (!is_read_only_pool_mode(old_mode))
			notify_of_pool_mode_change(pool, "read-only");
		dm_pool_metadata_read_only(pool->pmd);
		pool->process_bio = process_bio_read_only;
@@ -3412,6 +3463,10 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
		       dm_device_name(pool->pool_md),
		       sb_metadata_dev_size, metadata_dev_size);

		if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
			set_pool_mode(pool, PM_WRITE);

		r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
		if (r) {
			metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
@@ -3715,7 +3770,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	if (get_pool_mode(pool) >= PM_READ_ONLY) {
	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
		DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
		      dm_device_name(pool->pool_md));
		return -EOPNOTSUPP;
@@ -3789,6 +3844,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
	dm_block_t nr_blocks_data;
	dm_block_t nr_blocks_metadata;
	dm_block_t held_root;
	enum pool_mode mode;
	char buf[BDEVNAME_SIZE];
	char buf2[BDEVNAME_SIZE];
	struct pool_c *pt = ti->private;
@@ -3859,9 +3915,10 @@ static void pool_status(struct dm_target *ti, status_type_t type,
		else
			DMEMIT("- ");

		if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
		mode = get_pool_mode(pool);
		if (mode == PM_OUT_OF_DATA_SPACE)
			DMEMIT("out_of_data_space ");
		else if (pool->pf.mode == PM_READ_ONLY)
		else if (is_read_only_pool_mode(mode))
			DMEMIT("ro ");
		else
			DMEMIT("rw ");