Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 53365383 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (80 commits)
  dm snapshot: use merge origin if snapshot invalid
  dm snapshot: report merge failure in status
  dm snapshot: merge consecutive chunks together
  dm snapshot: trigger exceptions in remaining snapshots during merge
  dm snapshot: delay merging a chunk until writes to it complete
  dm snapshot: queue writes to chunks being merged
  dm snapshot: add merging
  dm snapshot: permit only one merge at once
  dm snapshot: support barriers in snapshot merge target
  dm snapshot: avoid allocating exceptions in merge
  dm snapshot: rework writing to origin
  dm snapshot: add merge target
  dm exception store: add merge specific methods
  dm snapshot: create function for chunk_is_tracked wait
  dm snapshot: make bio optional in __origin_write
  dm mpath: reject messages when device is suspended
  dm: export suspended state to targets
  dm: rename dm_suspended to dm_suspended_md
  dm: swap target postsuspend call and setting suspended flag
  dm crypt: add plain64 iv
  ...
parents 51b736b8 d2fdb776
Loading
Loading
Loading
Loading
+55 −5
Original line number Diff line number Diff line
@@ -8,13 +8,19 @@ the block device which are also writable without interfering with the
original content;
*) To create device "forks", i.e. multiple different versions of the
same data stream.
*) To merge a snapshot of a block device back into the snapshot's origin
device.

In the first two cases, dm copies only the chunks of data that get
changed and uses a separate copy-on-write (COW) block device for
storage.

In both cases, dm copies only the chunks of data that get changed and
uses a separate copy-on-write (COW) block device for storage.
For snapshot merge the contents of the COW storage are merged back into
the origin device.


There are two dm targets available: snapshot and snapshot-origin.
There are three dm targets available:
snapshot, snapshot-origin, and snapshot-merge.

*) snapshot-origin <origin>

@@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be
saved on disk - they can be kept in memory by the kernel.


How this is used by LVM2
========================
* snapshot-merge <origin> <COW device> <persistent> <chunksize>

takes the same table arguments as the snapshot target except it only
works with persistent snapshots.  This target assumes the role of the
"snapshot-origin" target and must not be loaded if the "snapshot-origin"
is still present for <origin>.

Creates a merging snapshot that takes control of the changed chunks
stored in the <COW device> of an existing snapshot, through a handover
procedure, and merges these chunks back into the <origin>.  Once merging
has started (in the background) the <origin> may be opened and the merge
will continue while I/O is flowing to it.  Changes to the <origin> are
deferred until the merging snapshot's corresponding chunk(s) have been
merged.  Once merging has started the snapshot device, associated with
the "snapshot" target, will return -EIO when accessed.


How snapshot is used by LVM2
============================
When you create the first LVM2 snapshot of a volume, four dm devices are used:

1) a device containing the original mapping table of the source volume;
@@ -72,3 +95,30 @@ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
brw-------  1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
brw-------  1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base


How snapshot-merge is used by LVM2
==================================
A merging snapshot assumes the role of the "snapshot-origin" while
merging.  As such the "snapshot-origin" is replaced with
"snapshot-merge".  The "-real" device is not changed and the "-cow"
device is renamed to <origin name>-cow to aid LVM2's cleanup of the
merging snapshot after it completes.  The "snapshot" that hands over its
COW device to the "snapshot-merge" is deactivated (unless using lvchange
--refresh); but if it is left active it will simply return I/O errors.

A snapshot will merge into its origin with the following command:

lvconvert --merge volumeGroup/snap

we'll now have this situation:

# dmsetup table|grep volumeGroup

volumeGroup-base-real: 0 2097152 linear 8:19 384
volumeGroup-base-cow: 0 204800 linear 8:19 2097536
volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16

# ls -lL /dev/mapper/volumeGroup-*
brw-------  1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
brw-------  1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
brw-------  1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
+139 −68
Original line number Diff line number Diff line
/*
 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */
@@ -71,10 +71,21 @@ struct crypt_iv_operations {
	int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
		   const char *opts);
	void (*dtr)(struct crypt_config *cc);
	const char *(*status)(struct crypt_config *cc);
	int (*init)(struct crypt_config *cc);
	int (*wipe)(struct crypt_config *cc);
	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
};

struct iv_essiv_private {
	struct crypto_cipher *tfm;
	struct crypto_hash *hash_tfm;
	u8 *salt;
};

struct iv_benbi_private {
	int shift;
};

/*
 * Crypt: maps a linear range of a block device
 * and encrypts / decrypts at the same time.
@@ -102,8 +113,8 @@ struct crypt_config {
	struct crypt_iv_operations *iv_gen_ops;
	char *iv_mode;
	union {
		struct crypto_cipher *essiv_tfm;
		int benbi_shift;
		struct iv_essiv_private essiv;
		struct iv_benbi_private benbi;
	} iv_gen_private;
	sector_t iv_offset;
	unsigned int iv_size;
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 * plain: the initial vector is the 32-bit little-endian version of the sector
 *        number, padded with zeros if necessary.
 *
 * plain64: the initial vector is the 64-bit little-endian version of the sector
 *        number, padded with zeros if necessary.
 *
 * essiv: "encrypted sector|salt initial vector", the sector number is
 *        encrypted with the bulk cipher using a salt as key. The salt
 *        should be derived from the bulk cipher's key via hashing.
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
	return 0;
}

static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
			      const char *opts)
static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
				sector_t sector)
{
	struct crypto_cipher *essiv_tfm;
	struct crypto_hash *hash_tfm;
	memset(iv, 0, cc->iv_size);
	*(u64 *)iv = cpu_to_le64(sector);

	return 0;
}

/* Initialise ESSIV - compute salt but no local memory allocations */
static int crypt_iv_essiv_init(struct crypt_config *cc)
{
	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
	struct hash_desc desc;
	struct scatterlist sg;
	unsigned int saltsize;
	u8 *salt;
	int err;

	if (opts == NULL) {
	sg_init_one(&sg, cc->key, cc->key_size);
	desc.tfm = essiv->hash_tfm;
	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;

	err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
	if (err)
		return err;

	return crypto_cipher_setkey(essiv->tfm, essiv->salt,
				    crypto_hash_digestsize(essiv->hash_tfm));
}

/* Wipe salt and reset key derived from volume key */
static int crypt_iv_essiv_wipe(struct crypt_config *cc)
{
	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);

	memset(essiv->salt, 0, salt_size);

	return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
}

static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;

	crypto_free_cipher(essiv->tfm);
	essiv->tfm = NULL;

	crypto_free_hash(essiv->hash_tfm);
	essiv->hash_tfm = NULL;

	kzfree(essiv->salt);
	essiv->salt = NULL;
}

static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
			      const char *opts)
{
	struct crypto_cipher *essiv_tfm = NULL;
	struct crypto_hash *hash_tfm = NULL;
	u8 *salt = NULL;
	int err;

	if (!opts) {
		ti->error = "Digest algorithm missing for ESSIV mode";
		return -EINVAL;
	}

	/* Hash the cipher key with the given hash algorithm */
	/* Allocate hash algorithm */
	hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(hash_tfm)) {
		ti->error = "Error initializing ESSIV hash";
		return PTR_ERR(hash_tfm);
		err = PTR_ERR(hash_tfm);
		goto bad;
	}

	saltsize = crypto_hash_digestsize(hash_tfm);
	salt = kmalloc(saltsize, GFP_KERNEL);
	if (salt == NULL) {
	salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
	if (!salt) {
		ti->error = "Error kmallocing salt storage in ESSIV";
		crypto_free_hash(hash_tfm);
		return -ENOMEM;
	}

	sg_init_one(&sg, cc->key, cc->key_size);
	desc.tfm = hash_tfm;
	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
	err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
	crypto_free_hash(hash_tfm);

	if (err) {
		ti->error = "Error calculating hash in ESSIV";
		kfree(salt);
		return err;
		err = -ENOMEM;
		goto bad;
	}

	/* Setup the essiv_tfm with the given salt */
	/* Allocate essiv_tfm */
	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(essiv_tfm)) {
		ti->error = "Error allocating crypto tfm for ESSIV";
		kfree(salt);
		return PTR_ERR(essiv_tfm);
		err = PTR_ERR(essiv_tfm);
		goto bad;
	}
	if (crypto_cipher_blocksize(essiv_tfm) !=
	    crypto_ablkcipher_ivsize(cc->tfm)) {
		ti->error = "Block size of ESSIV cipher does "
			    "not match IV size of block cipher";
		crypto_free_cipher(essiv_tfm);
		kfree(salt);
		return -EINVAL;
	}
	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
	if (err) {
		ti->error = "Failed to set key for ESSIV cipher";
		crypto_free_cipher(essiv_tfm);
		kfree(salt);
		return err;
		err = -EINVAL;
		goto bad;
	}
	kfree(salt);

	cc->iv_gen_private.essiv_tfm = essiv_tfm;
	cc->iv_gen_private.essiv.salt = salt;
	cc->iv_gen_private.essiv.tfm = essiv_tfm;
	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;

	return 0;
}

static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
	crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
	cc->iv_gen_private.essiv_tfm = NULL;
bad:
	if (essiv_tfm && !IS_ERR(essiv_tfm))
		crypto_free_cipher(essiv_tfm);
	if (hash_tfm && !IS_ERR(hash_tfm))
		crypto_free_hash(hash_tfm);
	kfree(salt);
	return err;
}

static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
{
	memset(iv, 0, cc->iv_size);
	*(u64 *)iv = cpu_to_le64(sector);
	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
	return 0;
}

@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
		return -EINVAL;
	}

	cc->iv_gen_private.benbi_shift = 9 - log;
	cc->iv_gen_private.benbi.shift = 9 - log;

	return 0;
}
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)

	memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */

	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
	put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));

	return 0;
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
	.generator = crypt_iv_plain_gen
};

static struct crypt_iv_operations crypt_iv_plain64_ops = {
	.generator = crypt_iv_plain64_gen
};

static struct crypt_iv_operations crypt_iv_essiv_ops = {
	.ctr       = crypt_iv_essiv_ctr,
	.dtr       = crypt_iv_essiv_dtr,
	.init      = crypt_iv_essiv_init,
	.wipe      = crypt_iv_essiv_wipe,
	.generator = crypt_iv_essiv_gen
};

@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)

	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);

	return 0;
	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}

static int crypt_wipe_key(struct crypt_config *cc)
{
	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
	memset(&cc->key, 0, cc->key_size * sizeof(u8));
	return 0;
	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}

/*
@@ -983,11 +1038,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
		return -ENOMEM;
	}

 	if (crypt_set_key(cc, argv[1])) {
		ti->error = "Error decoding key";
		goto bad_cipher;
	}

	/* Compatibility mode for old dm-crypt cipher strings */
	if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
		chainmode = "cbc";
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	strcpy(cc->chainmode, chainmode);
	cc->tfm = tfm;

	if (crypt_set_key(cc, argv[1]) < 0) {
		ti->error = "Error decoding and setting key";
		goto bad_ivmode;
	}

	/*
	 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
	 * See comments at iv code
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
		cc->iv_gen_ops = NULL;
	else if (strcmp(ivmode, "plain") == 0)
		cc->iv_gen_ops = &crypt_iv_plain_ops;
	else if (strcmp(ivmode, "plain64") == 0)
		cc->iv_gen_ops = &crypt_iv_plain64_ops;
	else if (strcmp(ivmode, "essiv") == 0)
		cc->iv_gen_ops = &crypt_iv_essiv_ops;
	else if (strcmp(ivmode, "benbi") == 0)
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	    cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
		goto bad_ivmode;

	if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
	    cc->iv_gen_ops->init(cc) < 0) {
		ti->error = "Error initialising IV";
		goto bad_slab_pool;
	}

	cc->iv_size = crypto_ablkcipher_ivsize(tfm);
	if (cc->iv_size)
		/* at least a 64 bit sector number should fit in our buffer */
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
		goto bad_bs;
	}

	if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
		ti->error = "Error setting key";
		goto bad_device;
	}

	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
		ti->error = "Invalid iv_offset sector";
		goto bad_device;
@@ -1278,6 +1336,7 @@ static void crypt_resume(struct dm_target *ti)
static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct crypt_config *cc = ti->private;
	int ret = -EINVAL;

	if (argc < 2)
		goto error;
@@ -1287,11 +1346,23 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
			DMWARN("not suspended during key manipulation.");
			return -EINVAL;
		}
		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set")))
			return crypt_set_key(cc, argv[2]);
		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe")))
		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
			ret = crypt_set_key(cc, argv[2]);
			if (ret)
				return ret;
			if (cc->iv_gen_ops && cc->iv_gen_ops->init)
				ret = cc->iv_gen_ops->init(cc);
			return ret;
		}
		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
			if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
				ret = cc->iv_gen_ops->wipe(cc);
				if (ret)
					return ret;
			}
			return crypt_wipe_key(cc);
		}
	}

error:
	DMWARN("unrecognised message received.");
+13 −20
Original line number Diff line number Diff line
@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
	}

	/* Validate the chunk size against the device block size */
	if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
	if (chunk_size %
	    (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
		*error = "Chunk size is not a multiple of device blocksize";
		return -EINVAL;
	}
@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
}

int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
			      struct dm_snapshot *snap,
			      unsigned *args_used,
			      struct dm_exception_store **store)
{
@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
	struct dm_exception_store *tmp_store;
	char persistent;

	if (argc < 3) {
	if (argc < 2) {
		ti->error = "Insufficient exception store arguments";
		return -EINVAL;
	}
@@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
		return -ENOMEM;
	}

	persistent = toupper(*argv[1]);
	persistent = toupper(*argv[0]);
	if (persistent == 'P')
		type = get_type("P");
	else if (persistent == 'N')
		type = get_type("N");
	else {
		ti->error = "Persistent flag is not P or N";
		return -EINVAL;
		r = -EINVAL;
		goto bad_type;
	}

	if (!type) {
@@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
	}

	tmp_store->type = type;
	tmp_store->ti = ti;

	r = dm_get_device(ti, argv[0], 0, 0,
			  FMODE_READ | FMODE_WRITE, &tmp_store->cow);
	if (r) {
		ti->error = "Cannot get COW device";
		goto bad_cow;
	}
	tmp_store->snap = snap;

	r = set_chunk_size(tmp_store, argv[2], &ti->error);
	r = set_chunk_size(tmp_store, argv[1], &ti->error);
	if (r)
		goto bad_ctr;
		goto bad;

	r = type->ctr(tmp_store, 0, NULL);
	if (r) {
		ti->error = "Exception store type constructor failed";
		goto bad_ctr;
		goto bad;
	}

	*args_used = 3;
	*args_used = 2;
	*store = tmp_store;
	return 0;

bad_ctr:
	dm_put_device(ti, tmp_store->cow);
bad_cow:
bad:
	put_type(type);
bad_type:
	kfree(tmp_store);
@@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
void dm_exception_store_destroy(struct dm_exception_store *store)
{
	store->type->dtr(store);
	dm_put_device(store->ti, store->cow);
	put_type(store->type);
	kfree(store);
}
+48 −14
Original line number Diff line number Diff line
@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
 * of chunks that follow contiguously.  Remaining bits hold the number of the
 * chunk within the device.
 */
struct dm_snap_exception {
struct dm_exception {
	struct list_head hash_list;

	chunk_t old_chunk;
@@ -64,16 +64,33 @@ struct dm_exception_store_type {
	 * Find somewhere to store the next exception.
	 */
	int (*prepare_exception) (struct dm_exception_store *store,
				  struct dm_snap_exception *e);
				  struct dm_exception *e);

	/*
	 * Update the metadata with this exception.
	 */
	void (*commit_exception) (struct dm_exception_store *store,
				  struct dm_snap_exception *e,
				  struct dm_exception *e,
				  void (*callback) (void *, int success),
				  void *callback_context);

	/*
	 * Returns 0 if the exception store is empty.
	 *
	 * If there are exceptions still to be merged, sets
	 * *last_old_chunk and *last_new_chunk to the most recent
	 * still-to-be-merged chunk and returns the number of
	 * consecutive previous ones.
	 */
	int (*prepare_merge) (struct dm_exception_store *store,
			      chunk_t *last_old_chunk, chunk_t *last_new_chunk);

	/*
	 * Clear the last n exceptions.
	 * nr_merged must be <= the value returned by prepare_merge.
	 */
	int (*commit_merge) (struct dm_exception_store *store, int nr_merged);

	/*
	 * The snapshot is invalid, note this in the metadata.
	 */
@@ -86,19 +103,19 @@ struct dm_exception_store_type {
	/*
	 * Return how full the snapshot is.
	 */
	void (*fraction_full) (struct dm_exception_store *store,
			       sector_t *numerator,
			       sector_t *denominator);
	void (*usage) (struct dm_exception_store *store,
		       sector_t *total_sectors, sector_t *sectors_allocated,
		       sector_t *metadata_sectors);

	/* For internal device-mapper use only. */
	struct list_head list;
};

struct dm_snapshot;

struct dm_exception_store {
	struct dm_exception_store_type *type;
	struct dm_target *ti;

	struct dm_dev *cow;
	struct dm_snapshot *snap;

	/* Size of data blocks saved - must be a power of 2 */
	unsigned chunk_size;
@@ -108,6 +125,11 @@ struct dm_exception_store {
	void *context;
};

/*
 * Obtain the cow device used by a given snapshot.
 */
struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);

/*
 * Funtions to manipulate consecutive chunks
 */
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
}

static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
}

static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
{
	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);

	BUG_ON(!dm_consecutive_chunk_count(e));
}

static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
{
	BUG_ON(!dm_consecutive_chunk_count(e));

	e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
}

#  else
#    define DM_CHUNK_CONSECUTIVE_BITS 0

@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
	return chunk;
}

static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
	return 0;
}

static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
{
}

static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
{
}

@@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
				      sector_t sector)
{
	return (sector & ~store->chunk_mask) >> store->chunk_shift;
	return sector >> store->chunk_shift;
}

int dm_exception_store_type_register(struct dm_exception_store_type *type);
@@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
				      char **error);

int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
			      struct dm_snapshot *snap,
			      unsigned *args_used,
			      struct dm_exception_store **store);
void dm_exception_store_destroy(struct dm_exception_store *store);
+82 −38
Original line number Diff line number Diff line
@@ -5,6 +5,8 @@
 * This file is released under the GPL.
 */

#include "dm.h"

#include <linux/device-mapper.h>

#include <linux/bio.h>
@@ -14,12 +16,19 @@
#include <linux/slab.h>
#include <linux/dm-io.h>

#define DM_MSG_PREFIX "io"

#define DM_IO_MAX_REGIONS	BITS_PER_LONG

struct dm_io_client {
	mempool_t *pool;
	struct bio_set *bios;
};

/* FIXME: can we shrink this ? */
/*
 * Aligning 'struct io' reduces the number of bits required to store
 * its address.  Refer to store_io_and_region_in_bio() below.
 */
struct io {
	unsigned long error_bits;
	unsigned long eopnotsupp_bits;
@@ -28,7 +37,9 @@ struct io {
	struct dm_io_client *client;
	io_notify_fn callback;
	void *context;
};
} __attribute__((aligned(DM_IO_MAX_REGIONS)));

static struct kmem_cache *_dm_io_cache;

/*
 * io contexts are only dynamically allocated for asynchronous
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
	if (!client)
		return ERR_PTR(-ENOMEM);

	client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
	client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
	if (!client->pool)
		goto bad;

@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);

/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * In order to save a memory allocation we store this the last
 * bvec which we know is unused (blech).
 * XXX This is ugly and can OOPS with some configs... find another way.
 * To avoid a memory allocation to store just 5 or 6 bits, we
 * ensure the 'struct io' pointer is aligned so enough low bits are
 * always zero and then combine it with the region number directly in
 * bi_private.
 *---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
				       unsigned region)
{
	bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
	if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
		DMCRIT("Unaligned struct io pointer %p", io);
		BUG();
	}

	bio->bi_private = (void *)((unsigned long)io | region);
}

static inline unsigned bio_get_region(struct bio *bio)
static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
				       unsigned *region)
{
	return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
	unsigned long val = (unsigned long)bio->bi_private;

	*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
	*region = val & (DM_IO_MAX_REGIONS - 1);
}

/*-----------------------------------------------------------------
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
	/*
	 * The bio destructor in bio_put() may use the io object.
	 */
	io = bio->bi_private;
	region = bio_get_region(bio);
	retrieve_io_and_region_from_bio(bio, &io, &region);

	bio->bi_max_vecs++;
	bio_put(bio);

	dec_count(io, region, error);
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)

static void dm_bio_destructor(struct bio *bio)
{
	struct io *io = bio->bi_private;
	unsigned region;
	struct io *io;

	retrieve_io_and_region_from_bio(bio, &io, &region);

	bio_free(bio, io->client->bios);
}
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
	unsigned num_bvecs;
	sector_t remaining = where->count;

	while (remaining) {
	/*
		 * Allocate a suitably sized-bio: we add an extra
		 * bvec for bio_get/set_region() and decrement bi_max_vecs
		 * to hide it from bio_add_page().
	 * where->count may be zero if rw holds a write barrier and we
	 * need to send a zero-sized barrier.
	 */
	do {
		/*
		 * Allocate a suitably sized-bio.
		 */
		num_bvecs = dm_sector_div_up(remaining,
					     (PAGE_SIZE >> SECTOR_SHIFT));
		num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
				      num_bvecs);
		if (unlikely(num_bvecs > BIO_MAX_PAGES))
			num_bvecs = BIO_MAX_PAGES;
		num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_private = io;
		bio->bi_destructor = dm_bio_destructor;
		bio->bi_max_vecs--;
		bio_set_region(bio, region);
		store_io_and_region_in_bio(bio, io, region);

		/*
		 * Try and add as many pages as possible.
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	}
	} while (remaining);
}

static void dispatch_io(int rw, unsigned int num_regions,
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
	int i;
	struct dpages old_pages = *dp;

	BUG_ON(num_regions > DM_IO_MAX_REGIONS);

	if (sync)
		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);

@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
	 */
	for (i = 0; i < num_regions; i++) {
		*dp = old_pages;
		if (where[i].count)
		if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
			do_region(rw, i, where + i, dp, io);
	}

@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
		   struct dm_io_region *where, int rw, struct dpages *dp,
		   unsigned long *error_bits)
{
	struct io io;
	/*
	 * gcc <= 4.3 can't do the alignment for stack variables, so we must
	 * align it on our own.
	 * volatile prevents the optimizer from removing or reusing
	 * "io_" field from the stack frame (allowed in ANSI C).
	 */
	volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
	struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));

	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
		WARN_ON(1);
@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
	}

retry:
	io.error_bits = 0;
	io.eopnotsupp_bits = 0;
	atomic_set(&io.count, 1); /* see dispatch_io() */
	io.sleeper = current;
	io.client = client;
	io->error_bits = 0;
	io->eopnotsupp_bits = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = current;
	io->client = client;

	dispatch_io(rw, num_regions, where, dp, &io, 1);
	dispatch_io(rw, num_regions, where, dp, io, 1);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&io.count))
		if (!atomic_read(&io->count))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
	if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
		rw &= ~(1 << BIO_RW_BARRIER);
		goto retry;
	}

	if (error_bits)
		*error_bits = io.error_bits;
		*error_bits = io->error_bits;

	return io.error_bits ? -EIO : 0;
	return io->error_bits ? -EIO : 0;
}

static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
			&dp, io_req->notify.fn, io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);

int __init dm_io_init(void)
{
	_dm_io_cache = KMEM_CACHE(io, 0);
	if (!_dm_io_cache)
		return -ENOMEM;

	return 0;
}

void dm_io_exit(void)
{
	kmem_cache_destroy(_dm_io_cache);
	_dm_io_cache = NULL;
}
Loading