Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 53365383 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (80 commits)
  dm snapshot: use merge origin if snapshot invalid
  dm snapshot: report merge failure in status
  dm snapshot: merge consecutive chunks together
  dm snapshot: trigger exceptions in remaining snapshots during merge
  dm snapshot: delay merging a chunk until writes to it complete
  dm snapshot: queue writes to chunks being merged
  dm snapshot: add merging
  dm snapshot: permit only one merge at once
  dm snapshot: support barriers in snapshot merge target
  dm snapshot: avoid allocating exceptions in merge
  dm snapshot: rework writing to origin
  dm snapshot: add merge target
  dm exception store: add merge specific methods
  dm snapshot: create function for chunk_is_tracked wait
  dm snapshot: make bio optional in __origin_write
  dm mpath: reject messages when device is suspended
  dm: export suspended state to targets
  dm: rename dm_suspended to dm_suspended_md
  dm: swap target postsuspend call and setting suspended flag
  dm crypt: add plain64 iv
  ...
parents 51b736b8 d2fdb776
Loading
Loading
Loading
Loading
+55 −5
Original line number Original line Diff line number Diff line
@@ -8,13 +8,19 @@ the block device which are also writable without interfering with the
original content;
original content;
*) To create device "forks", i.e. multiple different versions of the
*) To create device "forks", i.e. multiple different versions of the
same data stream.
same data stream.
*) To merge a snapshot of a block device back into the snapshot's origin
device.


In the first two cases, dm copies only the chunks of data that get
changed and uses a separate copy-on-write (COW) block device for
storage.


In both cases, dm copies only the chunks of data that get changed and
For snapshot merge the contents of the COW storage are merged back into
uses a separate copy-on-write (COW) block device for storage.
the origin device.




There are two dm targets available: snapshot and snapshot-origin.
There are three dm targets available:
snapshot, snapshot-origin, and snapshot-merge.


*) snapshot-origin <origin>
*) snapshot-origin <origin>


@@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be
saved on disk - they can be kept in memory by the kernel.
saved on disk - they can be kept in memory by the kernel.




How this is used by LVM2
*) snapshot-merge <origin> <COW device> <persistent> <chunksize>
========================

takes the same table arguments as the snapshot target except it only
works with persistent snapshots.  This target assumes the role of the
"snapshot-origin" target and must not be loaded if the "snapshot-origin"
is still present for <origin>.

Creates a merging snapshot that takes control of the changed chunks
stored in the <COW device> of an existing snapshot, through a handover
procedure, and merges these chunks back into the <origin>.  Once merging
has started (in the background) the <origin> may be opened and the merge
will continue while I/O is flowing to it.  Changes to the <origin> are
deferred until the merging snapshot's corresponding chunk(s) have been
merged.  Once merging has started the snapshot device, associated with
the "snapshot" target, will return -EIO when accessed.


How snapshot is used by LVM2
============================
When you create the first LVM2 snapshot of a volume, four dm devices are used:
When you create the first LVM2 snapshot of a volume, four dm devices are used:


1) a device containing the original mapping table of the source volume;
1) a device containing the original mapping table of the source volume;
@@ -72,3 +95,30 @@ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
brw-------  1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
brw-------  1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
brw-------  1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
brw-------  1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base



How snapshot-merge is used by LVM2
==================================
A merging snapshot assumes the role of the "snapshot-origin" while
merging.  As such the "snapshot-origin" is replaced with
"snapshot-merge".  The "-real" device is not changed and the "-cow"
device is renamed to <origin name>-cow to aid LVM2's cleanup of the
merging snapshot after it completes.  The "snapshot" that hands over its
COW device to the "snapshot-merge" is deactivated (unless using lvchange
--refresh); but if it is left active it will simply return I/O errors.

A snapshot will merge into its origin with the following command:

lvconvert --merge volumeGroup/snap

we'll now have this situation:

# dmsetup table|grep volumeGroup

volumeGroup-base-real: 0 2097152 linear 8:19 384
volumeGroup-base-cow: 0 204800 linear 8:19 2097536
volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16

# ls -lL /dev/mapper/volumeGroup-*
brw-------  1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
brw-------  1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
brw-------  1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
+139 −68
Original line number Original line Diff line number Diff line
/*
/*
 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
 *
 *
 * This file is released under the GPL.
 * This file is released under the GPL.
 */
 */
@@ -71,10 +71,21 @@ struct crypt_iv_operations {
	int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
	int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
		   const char *opts);
		   const char *opts);
	void (*dtr)(struct crypt_config *cc);
	void (*dtr)(struct crypt_config *cc);
	const char *(*status)(struct crypt_config *cc);
	int (*init)(struct crypt_config *cc);
	int (*wipe)(struct crypt_config *cc);
	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
};
};


/*
 * Per-target ESSIV state, kept in crypt_config.iv_gen_private.
 */
struct iv_essiv_private {
	/* Cipher keyed with the salt; encrypts the sector number into the IV */
	struct crypto_cipher *tfm;
	/* Hash used to derive the salt from the volume key */
	struct crypto_hash *hash_tfm;
	/* Salt buffer, crypto_hash_digestsize(hash_tfm) bytes long */
	u8 *salt;
};

/*
 * Per-target benbi state, kept in crypt_config.iv_gen_private.
 */
struct iv_benbi_private {
	/* Left shift applied to the sector number when generating the IV */
	int shift;
};

/*
/*
 * Crypt: maps a linear range of a block device
 * Crypt: maps a linear range of a block device
 * and encrypts / decrypts at the same time.
 * and encrypts / decrypts at the same time.
@@ -102,8 +113,8 @@ struct crypt_config {
	struct crypt_iv_operations *iv_gen_ops;
	struct crypt_iv_operations *iv_gen_ops;
	char *iv_mode;
	char *iv_mode;
	union {
	union {
		struct crypto_cipher *essiv_tfm;
		struct iv_essiv_private essiv;
		int benbi_shift;
		struct iv_benbi_private benbi;
	} iv_gen_private;
	} iv_gen_private;
	sector_t iv_offset;
	sector_t iv_offset;
	unsigned int iv_size;
	unsigned int iv_size;
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 * plain: the initial vector is the 32-bit little-endian version of the sector
 * plain: the initial vector is the 32-bit little-endian version of the sector
 *        number, padded with zeros if necessary.
 *        number, padded with zeros if necessary.
 *
 *
 * plain64: the initial vector is the 64-bit little-endian version of the sector
 *        number, padded with zeros if necessary.
 *
 * essiv: "encrypted sector|salt initial vector", the sector number is
 * essiv: "encrypted sector|salt initial vector", the sector number is
 *        encrypted with the bulk cipher using a salt as key. The salt
 *        encrypted with the bulk cipher using a salt as key. The salt
 *        should be derived from the bulk cipher's key via hashing.
 *        should be derived from the bulk cipher's key via hashing.
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
	return 0;
	return 0;
}
}


static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
			      const char *opts)
				sector_t sector)
{
{
	struct crypto_cipher *essiv_tfm;
	memset(iv, 0, cc->iv_size);
	struct crypto_hash *hash_tfm;
	*(u64 *)iv = cpu_to_le64(sector);

	return 0;
}

/* Initialise ESSIV - compute salt but no local memory allocations */
static int crypt_iv_essiv_init(struct crypt_config *cc)
{
	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
	struct hash_desc desc;
	struct hash_desc desc;
	struct scatterlist sg;
	struct scatterlist sg;
	unsigned int saltsize;
	u8 *salt;
	int err;
	int err;


	if (opts == NULL) {
	sg_init_one(&sg, cc->key, cc->key_size);
	desc.tfm = essiv->hash_tfm;
	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;

	err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
	if (err)
		return err;

	return crypto_cipher_setkey(essiv->tfm, essiv->salt,
				    crypto_hash_digestsize(essiv->hash_tfm));
}

/*
 * Zero the ESSIV salt and re-key the ESSIV cipher with the zeroed salt,
 * so that neither keeps material derived from the volume key.
 *
 * Returns the result of crypto_cipher_setkey().
 */
static int crypt_iv_essiv_wipe(struct crypt_config *cc)
{
	struct iv_essiv_private *priv = &cc->iv_gen_private.essiv;
	unsigned digest_len;

	/* The salt buffer is exactly one hash digest long. */
	digest_len = crypto_hash_digestsize(priv->hash_tfm);
	memset(priv->salt, 0, digest_len);

	return crypto_cipher_setkey(priv->tfm, priv->salt, digest_len);
}

/*
 * Release everything held in the ESSIV private state: the cipher, the
 * hash and the salt buffer.  Each pointer is reset to NULL once its
 * resource has been released.
 */
static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
	struct iv_essiv_private *priv = &cc->iv_gen_private.essiv;

	/* ESSIV cipher */
	crypto_free_cipher(priv->tfm);
	priv->tfm = NULL;

	/* Salt-deriving hash */
	crypto_free_hash(priv->hash_tfm);
	priv->hash_tfm = NULL;

	/* Salt buffer, released with kzfree() */
	kzfree(priv->salt);
	priv->salt = NULL;
}

static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
			      const char *opts)
{
	struct crypto_cipher *essiv_tfm = NULL;
	struct crypto_hash *hash_tfm = NULL;
	u8 *salt = NULL;
	int err;

	if (!opts) {
		ti->error = "Digest algorithm missing for ESSIV mode";
		ti->error = "Digest algorithm missing for ESSIV mode";
		return -EINVAL;
		return -EINVAL;
	}
	}


	/* Hash the cipher key with the given hash algorithm */
	/* Allocate hash algorithm */
	hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
	hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(hash_tfm)) {
	if (IS_ERR(hash_tfm)) {
		ti->error = "Error initializing ESSIV hash";
		ti->error = "Error initializing ESSIV hash";
		return PTR_ERR(hash_tfm);
		err = PTR_ERR(hash_tfm);
		goto bad;
	}
	}


	saltsize = crypto_hash_digestsize(hash_tfm);
	salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
	salt = kmalloc(saltsize, GFP_KERNEL);
	if (!salt) {
	if (salt == NULL) {
		ti->error = "Error kmallocing salt storage in ESSIV";
		ti->error = "Error kmallocing salt storage in ESSIV";
		crypto_free_hash(hash_tfm);
		err = -ENOMEM;
		return -ENOMEM;
		goto bad;
	}

	sg_init_one(&sg, cc->key, cc->key_size);
	desc.tfm = hash_tfm;
	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
	err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
	crypto_free_hash(hash_tfm);

	if (err) {
		ti->error = "Error calculating hash in ESSIV";
		kfree(salt);
		return err;
	}
	}


	/* Setup the essiv_tfm with the given salt */
	/* Allocate essiv_tfm */
	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(essiv_tfm)) {
	if (IS_ERR(essiv_tfm)) {
		ti->error = "Error allocating crypto tfm for ESSIV";
		ti->error = "Error allocating crypto tfm for ESSIV";
		kfree(salt);
		err = PTR_ERR(essiv_tfm);
		return PTR_ERR(essiv_tfm);
		goto bad;
	}
	}
	if (crypto_cipher_blocksize(essiv_tfm) !=
	if (crypto_cipher_blocksize(essiv_tfm) !=
	    crypto_ablkcipher_ivsize(cc->tfm)) {
	    crypto_ablkcipher_ivsize(cc->tfm)) {
		ti->error = "Block size of ESSIV cipher does "
		ti->error = "Block size of ESSIV cipher does "
			    "not match IV size of block cipher";
			    "not match IV size of block cipher";
		crypto_free_cipher(essiv_tfm);
		err = -EINVAL;
		kfree(salt);
		goto bad;
		return -EINVAL;
	}
	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
	if (err) {
		ti->error = "Failed to set key for ESSIV cipher";
		crypto_free_cipher(essiv_tfm);
		kfree(salt);
		return err;
	}
	}
	kfree(salt);


	cc->iv_gen_private.essiv_tfm = essiv_tfm;
	cc->iv_gen_private.essiv.salt = salt;
	cc->iv_gen_private.essiv.tfm = essiv_tfm;
	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;

	return 0;
	return 0;
}


static void crypt_iv_essiv_dtr(struct crypt_config *cc)
bad:
{
	if (essiv_tfm && !IS_ERR(essiv_tfm))
	crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
		crypto_free_cipher(essiv_tfm);
	cc->iv_gen_private.essiv_tfm = NULL;
	if (hash_tfm && !IS_ERR(hash_tfm))
		crypto_free_hash(hash_tfm);
	kfree(salt);
	return err;
}
}


static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
{
{
	memset(iv, 0, cc->iv_size);
	memset(iv, 0, cc->iv_size);
	*(u64 *)iv = cpu_to_le64(sector);
	*(u64 *)iv = cpu_to_le64(sector);
	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
	return 0;
	return 0;
}
}


@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
		return -EINVAL;
		return -EINVAL;
	}
	}


	cc->iv_gen_private.benbi_shift = 9 - log;
	cc->iv_gen_private.benbi.shift = 9 - log;


	return 0;
	return 0;
}
}
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)


	memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
	memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */


	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
	put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
	put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));


	return 0;
	return 0;
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
	.generator = crypt_iv_plain_gen
	.generator = crypt_iv_plain_gen
};
};


/* plain64 sets only a generator; no ctr/dtr/init/wipe hooks are provided */
static struct crypt_iv_operations crypt_iv_plain64_ops = {
	.generator = crypt_iv_plain64_gen
};

static struct crypt_iv_operations crypt_iv_essiv_ops = {
static struct crypt_iv_operations crypt_iv_essiv_ops = {
	.ctr       = crypt_iv_essiv_ctr,
	.ctr       = crypt_iv_essiv_ctr,
	.dtr       = crypt_iv_essiv_dtr,
	.dtr       = crypt_iv_essiv_dtr,
	.init      = crypt_iv_essiv_init,
	.wipe      = crypt_iv_essiv_wipe,
	.generator = crypt_iv_essiv_gen
	.generator = crypt_iv_essiv_gen
};
};


@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)


	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);


	return 0;
	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}
}


static int crypt_wipe_key(struct crypt_config *cc)
static int crypt_wipe_key(struct crypt_config *cc)
{
{
	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
	memset(&cc->key, 0, cc->key_size * sizeof(u8));
	memset(&cc->key, 0, cc->key_size * sizeof(u8));
	return 0;
	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}
}


/*
/*
@@ -983,11 +1038,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
		return -ENOMEM;
		return -ENOMEM;
	}
	}


 	if (crypt_set_key(cc, argv[1])) {
		ti->error = "Error decoding key";
		goto bad_cipher;
	}

	/* Compatibility mode for old dm-crypt cipher strings */
	/* Compatibility mode for old dm-crypt cipher strings */
	if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
	if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
		chainmode = "cbc";
		chainmode = "cbc";
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	strcpy(cc->chainmode, chainmode);
	strcpy(cc->chainmode, chainmode);
	cc->tfm = tfm;
	cc->tfm = tfm;


	if (crypt_set_key(cc, argv[1]) < 0) {
		ti->error = "Error decoding and setting key";
		goto bad_ivmode;
	}

	/*
	/*
	 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
	 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
	 * See comments at iv code
	 * See comments at iv code
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
		cc->iv_gen_ops = NULL;
		cc->iv_gen_ops = NULL;
	else if (strcmp(ivmode, "plain") == 0)
	else if (strcmp(ivmode, "plain") == 0)
		cc->iv_gen_ops = &crypt_iv_plain_ops;
		cc->iv_gen_ops = &crypt_iv_plain_ops;
	else if (strcmp(ivmode, "plain64") == 0)
		cc->iv_gen_ops = &crypt_iv_plain64_ops;
	else if (strcmp(ivmode, "essiv") == 0)
	else if (strcmp(ivmode, "essiv") == 0)
		cc->iv_gen_ops = &crypt_iv_essiv_ops;
		cc->iv_gen_ops = &crypt_iv_essiv_ops;
	else if (strcmp(ivmode, "benbi") == 0)
	else if (strcmp(ivmode, "benbi") == 0)
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	    cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
	    cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
		goto bad_ivmode;
		goto bad_ivmode;


	if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
	    cc->iv_gen_ops->init(cc) < 0) {
		ti->error = "Error initialising IV";
		goto bad_slab_pool;
	}

	cc->iv_size = crypto_ablkcipher_ivsize(tfm);
	cc->iv_size = crypto_ablkcipher_ivsize(tfm);
	if (cc->iv_size)
	if (cc->iv_size)
		/* at least a 64 bit sector number should fit in our buffer */
		/* at least a 64 bit sector number should fit in our buffer */
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
		goto bad_bs;
		goto bad_bs;
	}
	}


	if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
		ti->error = "Error setting key";
		goto bad_device;
	}

	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
		ti->error = "Invalid iv_offset sector";
		ti->error = "Invalid iv_offset sector";
		goto bad_device;
		goto bad_device;
@@ -1278,6 +1336,7 @@ static void crypt_resume(struct dm_target *ti)
static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
{
{
	struct crypt_config *cc = ti->private;
	struct crypt_config *cc = ti->private;
	int ret = -EINVAL;


	if (argc < 2)
	if (argc < 2)
		goto error;
		goto error;
@@ -1287,11 +1346,23 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
			DMWARN("not suspended during key manipulation.");
			DMWARN("not suspended during key manipulation.");
			return -EINVAL;
			return -EINVAL;
		}
		}
		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set")))
		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
			return crypt_set_key(cc, argv[2]);
			ret = crypt_set_key(cc, argv[2]);
		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe")))
			if (ret)
				return ret;
			if (cc->iv_gen_ops && cc->iv_gen_ops->init)
				ret = cc->iv_gen_ops->init(cc);
			return ret;
		}
		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
			if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
				ret = cc->iv_gen_ops->wipe(cc);
				if (ret)
					return ret;
			}
			return crypt_wipe_key(cc);
			return crypt_wipe_key(cc);
		}
		}
	}


error:
error:
	DMWARN("unrecognised message received.");
	DMWARN("unrecognised message received.");
+13 −20
Original line number Original line Diff line number Diff line
@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
	}
	}


	/* Validate the chunk size against the device block size */
	/* Validate the chunk size against the device block size */
	if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
	if (chunk_size %
	    (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
		*error = "Chunk size is not a multiple of device blocksize";
		*error = "Chunk size is not a multiple of device blocksize";
		return -EINVAL;
		return -EINVAL;
	}
	}
@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
}
}


int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
			      struct dm_snapshot *snap,
			      unsigned *args_used,
			      unsigned *args_used,
			      struct dm_exception_store **store)
			      struct dm_exception_store **store)
{
{
@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
	struct dm_exception_store *tmp_store;
	struct dm_exception_store *tmp_store;
	char persistent;
	char persistent;


	if (argc < 3) {
	if (argc < 2) {
		ti->error = "Insufficient exception store arguments";
		ti->error = "Insufficient exception store arguments";
		return -EINVAL;
		return -EINVAL;
	}
	}
@@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
		return -ENOMEM;
		return -ENOMEM;
	}
	}


	persistent = toupper(*argv[1]);
	persistent = toupper(*argv[0]);
	if (persistent == 'P')
	if (persistent == 'P')
		type = get_type("P");
		type = get_type("P");
	else if (persistent == 'N')
	else if (persistent == 'N')
		type = get_type("N");
		type = get_type("N");
	else {
	else {
		ti->error = "Persistent flag is not P or N";
		ti->error = "Persistent flag is not P or N";
		return -EINVAL;
		r = -EINVAL;
		goto bad_type;
	}
	}


	if (!type) {
	if (!type) {
@@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
	}
	}


	tmp_store->type = type;
	tmp_store->type = type;
	tmp_store->ti = ti;
	tmp_store->snap = snap;

	r = dm_get_device(ti, argv[0], 0, 0,
			  FMODE_READ | FMODE_WRITE, &tmp_store->cow);
	if (r) {
		ti->error = "Cannot get COW device";
		goto bad_cow;
	}


	r = set_chunk_size(tmp_store, argv[2], &ti->error);
	r = set_chunk_size(tmp_store, argv[1], &ti->error);
	if (r)
	if (r)
		goto bad_ctr;
		goto bad;


	r = type->ctr(tmp_store, 0, NULL);
	r = type->ctr(tmp_store, 0, NULL);
	if (r) {
	if (r) {
		ti->error = "Exception store type constructor failed";
		ti->error = "Exception store type constructor failed";
		goto bad_ctr;
		goto bad;
	}
	}


	*args_used = 3;
	*args_used = 2;
	*store = tmp_store;
	*store = tmp_store;
	return 0;
	return 0;


bad_ctr:
bad:
	dm_put_device(ti, tmp_store->cow);
bad_cow:
	put_type(type);
	put_type(type);
bad_type:
bad_type:
	kfree(tmp_store);
	kfree(tmp_store);
@@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
void dm_exception_store_destroy(struct dm_exception_store *store)
void dm_exception_store_destroy(struct dm_exception_store *store)
{
{
	store->type->dtr(store);
	store->type->dtr(store);
	dm_put_device(store->ti, store->cow);
	put_type(store->type);
	put_type(store->type);
	kfree(store);
	kfree(store);
}
}
+48 −14
Original line number Original line Diff line number Diff line
@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
 * of chunks that follow contiguously.  Remaining bits hold the number of the
 * of chunks that follow contiguously.  Remaining bits hold the number of the
 * chunk within the device.
 * chunk within the device.
 */
 */
struct dm_snap_exception {
struct dm_exception {
	struct list_head hash_list;
	struct list_head hash_list;


	chunk_t old_chunk;
	chunk_t old_chunk;
@@ -64,16 +64,33 @@ struct dm_exception_store_type {
	 * Find somewhere to store the next exception.
	 * Find somewhere to store the next exception.
	 */
	 */
	int (*prepare_exception) (struct dm_exception_store *store,
	int (*prepare_exception) (struct dm_exception_store *store,
				  struct dm_snap_exception *e);
				  struct dm_exception *e);


	/*
	/*
	 * Update the metadata with this exception.
	 * Update the metadata with this exception.
	 */
	 */
	void (*commit_exception) (struct dm_exception_store *store,
	void (*commit_exception) (struct dm_exception_store *store,
				  struct dm_snap_exception *e,
				  struct dm_exception *e,
				  void (*callback) (void *, int success),
				  void (*callback) (void *, int success),
				  void *callback_context);
				  void *callback_context);


	/*
	 * Returns 0 if the exception store is empty.
	 *
	 * If there are exceptions still to be merged, sets
	 * *last_old_chunk and *last_new_chunk to the most recent
	 * still-to-be-merged chunk and returns the number of
	 * consecutive previous ones.
	 */
	int (*prepare_merge) (struct dm_exception_store *store,
			      chunk_t *last_old_chunk, chunk_t *last_new_chunk);

	/*
	 * Clear the last n exceptions.
	 * nr_merged must be <= the value returned by prepare_merge.
	 */
	int (*commit_merge) (struct dm_exception_store *store, int nr_merged);

	/*
	/*
	 * The snapshot is invalid, note this in the metadata.
	 * The snapshot is invalid, note this in the metadata.
	 */
	 */
@@ -86,19 +103,19 @@ struct dm_exception_store_type {
	/*
	/*
	 * Return how full the snapshot is.
	 * Return how full the snapshot is.
	 */
	 */
	void (*fraction_full) (struct dm_exception_store *store,
	void (*usage) (struct dm_exception_store *store,
			       sector_t *numerator,
		       sector_t *total_sectors, sector_t *sectors_allocated,
			       sector_t *denominator);
		       sector_t *metadata_sectors);


	/* For internal device-mapper use only. */
	/* For internal device-mapper use only. */
	struct list_head list;
	struct list_head list;
};
};


struct dm_snapshot;

struct dm_exception_store {
struct dm_exception_store {
	struct dm_exception_store_type *type;
	struct dm_exception_store_type *type;
	struct dm_target *ti;
	struct dm_snapshot *snap;

	struct dm_dev *cow;


	/* Size of data blocks saved - must be a power of 2 */
	/* Size of data blocks saved - must be a power of 2 */
	unsigned chunk_size;
	unsigned chunk_size;
@@ -108,6 +125,11 @@ struct dm_exception_store {
	void *context;
	void *context;
};
};


/*
 * Obtain the cow device used by a given snapshot.
 */
struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);

/*
/*
 * Functions to manipulate consecutive chunks
 * Functions to manipulate consecutive chunks
 */
 */
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
}
}


static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
{
	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
}
}


static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
{
{
	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);


	BUG_ON(!dm_consecutive_chunk_count(e));
	BUG_ON(!dm_consecutive_chunk_count(e));
}
}


/*
 * Drop one from the consecutive-chunk count kept in the bits of
 * new_chunk above DM_CHUNK_NUMBER_BITS.  The count must be non-zero
 * on entry.
 */
static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
{
	const unsigned count = dm_consecutive_chunk_count(e);

	BUG_ON(!count);

	e->new_chunk = e->new_chunk - (1ULL << DM_CHUNK_NUMBER_BITS);
}

#  else
#  else
#    define DM_CHUNK_CONSECUTIVE_BITS 0
#    define DM_CHUNK_CONSECUTIVE_BITS 0


@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
	return chunk;
	return chunk;
}
}


static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
{
	return 0;
	return 0;
}
}


static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
{
}

static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
{
{
}
}


@@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
				      sector_t sector)
				      sector_t sector)
{
{
	return (sector & ~store->chunk_mask) >> store->chunk_shift;
	return sector >> store->chunk_shift;
}
}


int dm_exception_store_type_register(struct dm_exception_store_type *type);
int dm_exception_store_type_register(struct dm_exception_store_type *type);
@@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
				      char **error);
				      char **error);


int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
			      struct dm_snapshot *snap,
			      unsigned *args_used,
			      unsigned *args_used,
			      struct dm_exception_store **store);
			      struct dm_exception_store **store);
void dm_exception_store_destroy(struct dm_exception_store *store);
void dm_exception_store_destroy(struct dm_exception_store *store);
+82 −38
Original line number Original line Diff line number Diff line
@@ -5,6 +5,8 @@
 * This file is released under the GPL.
 * This file is released under the GPL.
 */
 */


#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/device-mapper.h>


#include <linux/bio.h>
#include <linux/bio.h>
@@ -14,12 +16,19 @@
#include <linux/slab.h>
#include <linux/slab.h>
#include <linux/dm-io.h>
#include <linux/dm-io.h>


#define DM_MSG_PREFIX "io"

#define DM_IO_MAX_REGIONS	BITS_PER_LONG

struct dm_io_client {
struct dm_io_client {
	mempool_t *pool;
	mempool_t *pool;
	struct bio_set *bios;
	struct bio_set *bios;
};
};


/* FIXME: can we shrink this ? */
/*
 * Aligning 'struct io' reduces the number of bits required to store
 * its address.  Refer to store_io_and_region_in_bio() below.
 */
struct io {
struct io {
	unsigned long error_bits;
	unsigned long error_bits;
	unsigned long eopnotsupp_bits;
	unsigned long eopnotsupp_bits;
@@ -28,7 +37,9 @@ struct io {
	struct dm_io_client *client;
	struct dm_io_client *client;
	io_notify_fn callback;
	io_notify_fn callback;
	void *context;
	void *context;
};
} __attribute__((aligned(DM_IO_MAX_REGIONS)));

static struct kmem_cache *_dm_io_cache;


/*
/*
 * io contexts are only dynamically allocated for asynchronous
 * io contexts are only dynamically allocated for asynchronous
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
	if (!client)
	if (!client)
		return ERR_PTR(-ENOMEM);
		return ERR_PTR(-ENOMEM);


	client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
	client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
	if (!client->pool)
	if (!client->pool)
		goto bad;
		goto bad;


@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);


/*-----------------------------------------------------------------
/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * We need to keep track of which region a bio is doing io for.
 * In order to save a memory allocation we store this the last
 * To avoid a memory allocation to store just 5 or 6 bits, we
 * bvec which we know is unused (blech).
 * ensure the 'struct io' pointer is aligned so enough low bits are
 * XXX This is ugly and can OOPS with some configs... find another way.
 * always zero and then combine it with the region number directly in
 * bi_private.
 *---------------------------------------------------------------*/
 *---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
				       unsigned region)
{
{
	bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
	if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
		DMCRIT("Unaligned struct io pointer %p", io);
		BUG();
	}

	bio->bi_private = (void *)((unsigned long)io | region);
}
}


static inline unsigned bio_get_region(struct bio *bio)
static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
				       unsigned *region)
{
{
	return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
	unsigned long val = (unsigned long)bio->bi_private;

	*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
	*region = val & (DM_IO_MAX_REGIONS - 1);
}
}


/*-----------------------------------------------------------------
/*-----------------------------------------------------------------
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
	/*
	/*
	 * The bio destructor in bio_put() may use the io object.
	 * The bio destructor in bio_put() may use the io object.
	 */
	 */
	io = bio->bi_private;
	retrieve_io_and_region_from_bio(bio, &io, &region);
	region = bio_get_region(bio);


	bio->bi_max_vecs++;
	bio_put(bio);
	bio_put(bio);


	dec_count(io, region, error);
	dec_count(io, region, error);
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)


static void dm_bio_destructor(struct bio *bio)
static void dm_bio_destructor(struct bio *bio)
{
{
	struct io *io = bio->bi_private;
	unsigned region;
	struct io *io;

	retrieve_io_and_region_from_bio(bio, &io, &region);


	bio_free(bio, io->client->bios);
	bio_free(bio, io->client->bios);
}
}
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
	unsigned num_bvecs;
	unsigned num_bvecs;
	sector_t remaining = where->count;
	sector_t remaining = where->count;


	while (remaining) {
	/*
	/*
		 * Allocate a suitably sized-bio: we add an extra
	 * where->count may be zero if rw holds a write barrier and we
		 * bvec for bio_get/set_region() and decrement bi_max_vecs
	 * need to send a zero-sized barrier.
		 * to hide it from bio_add_page().
	 */
	do {
		/*
		 * Allocate a suitably sized-bio.
		 */
		 */
		num_bvecs = dm_sector_div_up(remaining,
		num_bvecs = dm_sector_div_up(remaining,
					     (PAGE_SIZE >> SECTOR_SHIFT));
					     (PAGE_SIZE >> SECTOR_SHIFT));
		num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
		num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
				      num_bvecs);
		if (unlikely(num_bvecs > BIO_MAX_PAGES))
			num_bvecs = BIO_MAX_PAGES;
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_end_io = endio;
		bio->bi_private = io;
		bio->bi_destructor = dm_bio_destructor;
		bio->bi_destructor = dm_bio_destructor;
		bio->bi_max_vecs--;
		store_io_and_region_in_bio(bio, io, region);
		bio_set_region(bio, region);


		/*
		/*
		 * Try and add as many pages as possible.
		 * Try and add as many pages as possible.
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,


		atomic_inc(&io->count);
		atomic_inc(&io->count);
		submit_bio(rw, bio);
		submit_bio(rw, bio);
	}
	} while (remaining);
}
}


static void dispatch_io(int rw, unsigned int num_regions,
static void dispatch_io(int rw, unsigned int num_regions,
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
	int i;
	int i;
	struct dpages old_pages = *dp;
	struct dpages old_pages = *dp;


	BUG_ON(num_regions > DM_IO_MAX_REGIONS);

	if (sync)
	if (sync)
		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);


@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
	 */
	 */
	for (i = 0; i < num_regions; i++) {
	for (i = 0; i < num_regions; i++) {
		*dp = old_pages;
		*dp = old_pages;
		if (where[i].count)
		if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
			do_region(rw, i, where + i, dp, io);
			do_region(rw, i, where + i, dp, io);
	}
	}


@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
		   struct dm_io_region *where, int rw, struct dpages *dp,
		   struct dm_io_region *where, int rw, struct dpages *dp,
		   unsigned long *error_bits)
		   unsigned long *error_bits)
{
{
	struct io io;
	/*
	 * gcc <= 4.3 can't do the alignment for stack variables, so we must
	 * align it on our own.
	 * volatile prevents the optimizer from removing or reusing
	 * "io_" field from the stack frame (allowed in ANSI C).
	 */
	volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
	struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));


	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
		WARN_ON(1);
		WARN_ON(1);
@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
	}
	}


retry:
retry:
	io.error_bits = 0;
	io->error_bits = 0;
	io.eopnotsupp_bits = 0;
	io->eopnotsupp_bits = 0;
	atomic_set(&io.count, 1); /* see dispatch_io() */
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io.sleeper = current;
	io->sleeper = current;
	io.client = client;
	io->client = client;


	dispatch_io(rw, num_regions, where, dp, &io, 1);
	dispatch_io(rw, num_regions, where, dp, io, 1);


	while (1) {
	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		set_current_state(TASK_UNINTERRUPTIBLE);


		if (!atomic_read(&io.count))
		if (!atomic_read(&io->count))
			break;
			break;


		io_schedule();
		io_schedule();
	}
	}
	set_current_state(TASK_RUNNING);
	set_current_state(TASK_RUNNING);


	if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
	if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
		rw &= ~(1 << BIO_RW_BARRIER);
		rw &= ~(1 << BIO_RW_BARRIER);
		goto retry;
		goto retry;
	}
	}


	if (error_bits)
	if (error_bits)
		*error_bits = io.error_bits;
		*error_bits = io->error_bits;


	return io.error_bits ? -EIO : 0;
	return io->error_bits ? -EIO : 0;
}
}


static int async_io(struct dm_io_client *client, unsigned int num_regions,
static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
			&dp, io_req->notify.fn, io_req->notify.context);
			&dp, io_req->notify.fn, io_req->notify.context);
}
}
EXPORT_SYMBOL(dm_io);
EXPORT_SYMBOL(dm_io);

/*
 * Set up the slab cache that backs 'struct io' allocations.
 *
 * Returns 0 when the cache was created, or -ENOMEM when it could not be.
 */
int __init dm_io_init(void)
{
	_dm_io_cache = KMEM_CACHE(io, 0);

	return _dm_io_cache ? 0 : -ENOMEM;
}

/*
 * Release the 'struct io' slab cache and clear the module-level pointer
 * so no stale reference to the destroyed cache is left behind.
 */
void dm_io_exit(void)
{
	struct kmem_cache *cache = _dm_io_cache;

	kmem_cache_destroy(cache);
	_dm_io_cache = NULL;
}
Loading