Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ca4ba96e authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull Ceph updates from Sage Weil:
 "There are several patches from Ilya fixing RBD allocation lifecycle
  issues, a series adding a nocephx_sign_messages option (and associated
  bug fixes/cleanups), several patches from Zheng improving the
  (directory) fsync behavior, a big improvement in IO for direct-io
  requests when striping is enabled from Caifeng, and several other
  small fixes and cleanups"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  libceph: clear msg->con in ceph_msg_release() only
  libceph: add nocephx_sign_messages option
  libceph: stop duplicating client fields in messenger
  libceph: drop authorizer check from cephx msg signing routines
  libceph: msg signing callouts don't need con argument
  libceph: evaluate osd_req_op_data() arguments only once
  ceph: make fsync() wait unsafe requests that created/modified inode
  ceph: add request to i_unsafe_dirops when getting unsafe reply
  libceph: introduce ceph_x_authorizer_cleanup()
  ceph: don't invalidate page cache when inode is no longer used
  rbd: remove duplicate calls to rbd_dev_mapping_clear()
  rbd: set device_type::release instead of device::release
  rbd: don't free rbd_dev outside of the release callback
  rbd: return -ENOMEM instead of pool id if rbd_dev_create() fails
  libceph: use local variable cursor instead of &msg->cursor
  libceph: remove con argument in handle_reply()
  ceph: combine as many iovec as possible into one OSD request
  ceph: fix message length computation
  ceph: fix a comment typo
  rbd: drop null test before destroy functions
parents 4aeabc6b 583d0fef
Loading
Loading
Loading
Loading
+54 −55
Original line number Diff line number Diff line
@@ -418,8 +418,6 @@ MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (d

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static void rbd_dev_device_release(struct device *dev);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
@@ -3991,14 +3989,12 @@ static const struct attribute_group *rbd_attr_groups[] = {
	NULL
};

static void rbd_sysfs_dev_release(struct device *dev)
{
}
static void rbd_dev_release(struct device *dev);

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
@@ -4041,6 +4037,25 @@ static void rbd_spec_free(struct kref *kref)
	kfree(spec);
}

/*
 * release callback for rbd_device_type (wired up as .release above):
 * runs when the last reference to the embedded struct device is put.
 * Drops the client and spec references, frees the options and the
 * rbd_device itself, then drops the module reference that
 * rbd_dev_create() pinned for mapping devices (those with ->opts set).
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	/* capture before kfree(rbd_dev) below — rbd_dev is gone after that */
	bool need_put = !!rbd_dev->opts;

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);

	/*
	 * This is racy, but way better than putting module outside of
	 * the release callback.  The race window is pretty small, so
	 * doing something similar to dm (dm-builtin.c) is overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
@@ -4057,6 +4072,11 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;
	rbd_dev->opts = opts;
@@ -4068,15 +4088,21 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	/*
	 * If this is a mapping rbd_dev (as opposed to a parent one),
	 * pin our module.  We have a ref from do_rbd_add(), so use
	 * __module_get().
	 */
	if (rbd_dev->opts)
		__module_get(THIS_MODULE);

	return rbd_dev;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
@@ -4702,27 +4728,6 @@ static int rbd_dev_header_info(struct rbd_device *rbd_dev)
	return rbd_dev_v2_header_info(rbd_dev);
}

/*
 * Register the rbd device with the driver core: fill in the embedded
 * struct device (bus, type, parent, release callback and the "<dev_id>"
 * name) and add it to sysfs.  Returns 0 or a negative errno from
 * device_register().
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_device_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);

	return device_register(dev);
}

/*
 * Undo rbd_bus_add_dev(): device_unregister() removes the device that
 * device_register() added; its ->release callback runs once the last
 * reference to the embedded struct device is dropped.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.
@@ -5225,7 +5230,8 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);

	ret = rbd_bus_add_dev(rbd_dev);
	dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	ret = device_add(&rbd_dev->dev);
	if (ret)
		goto err_out_mapping;

@@ -5248,8 +5254,6 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);

	return ret;
}

@@ -5397,7 +5401,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	bool read_only;
	int rc = -ENOMEM;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;
@@ -5405,7 +5409,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
@@ -5432,8 +5436,10 @@ static ssize_t do_rbd_add(struct bus_type *bus,
	}

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev)
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */
@@ -5458,10 +5464,13 @@ static ssize_t do_rbd_add(struct bus_type *bus,
		 */
		rbd_dev_header_unwatch_sync(rbd_dev);
		rbd_dev_image_release(rbd_dev);
		goto err_out_module;
		goto out;
	}

	return count;
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
@@ -5470,12 +5479,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t)rc;
	goto out;
}

static ssize_t rbd_add(struct bus_type *bus,
@@ -5495,17 +5499,15 @@ static ssize_t rbd_add_single_major(struct bus_type *bus,
	return do_rbd_add(bus, buf, count);
}

static void rbd_dev_device_release(struct device *dev)
static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	rbd_free_disk(rbd_dev);
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	device_del(&rbd_dev->dev);
	rbd_dev_mapping_clear(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
	rbd_dev_id_put(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
@@ -5590,9 +5592,8 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
	 * in a potential use after free of rbd_dev->disk or rbd_dev.
	 */
	rbd_bus_del_dev(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	module_put(THIS_MODULE);

	return count;
}
@@ -5663,10 +5664,8 @@ static int rbd_slab_init(void)
	if (rbd_segment_name_cache)
		return 0;
out_err:
	if (rbd_obj_request_cache) {
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;
	}

	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
+1 −1
Original line number Diff line number Diff line
@@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
	const struct ceph_inode_info* ci = cookie_netfs_data;
	uint16_t klen;

	/* use ceph virtual inode (id + snaphot) */
	/* use ceph virtual inode (id + snapshot) */
	klen = sizeof(ci->i_vino);
	if (klen > maxbuf)
		return 0;
+36 −40
Original line number Diff line number Diff line
@@ -1655,9 +1655,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */
	    ci->i_wrbuffer_ref == 0 &&		/* no dirty pages... */
	    inode->i_data.nrpages &&		/* have cached pages */
	    (file_wanted == 0 ||		/* no open files */
	    (revoking & (CEPH_CAP_FILE_CACHE|
			  CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
			 CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
	    !tried_invalidate) {
		dout("check_caps trying to invalidate on %p\n", inode);
		if (try_nonblocking_invalidate(inode) < 0) {
@@ -1971,49 +1970,46 @@ static void sync_write_wait(struct inode *inode)
}

/*
 * wait for any uncommitted directory operations to commit.
 * wait for any unsafe requests to complete.
 */
static int unsafe_dirop_wait(struct inode *inode)
static int unsafe_request_wait(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct list_head *head = &ci->i_unsafe_dirops;
	struct ceph_mds_request *req;
	u64 last_tid;
	int ret = 0;

	if (!S_ISDIR(inode->i_mode))
		return 0;
	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
	int ret, err = 0;

	spin_lock(&ci->i_unsafe_lock);
	if (list_empty(head))
		goto out;

	req = list_last_entry(head, struct ceph_mds_request,
	if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
		req1 = list_last_entry(&ci->i_unsafe_dirops,
					struct ceph_mds_request,
					r_unsafe_dir_item);
	last_tid = req->r_tid;

	do {
		ceph_mdsc_get_request(req);
		ceph_mdsc_get_request(req1);
	}
	if (!list_empty(&ci->i_unsafe_iops)) {
		req2 = list_last_entry(&ci->i_unsafe_iops,
					struct ceph_mds_request,
					r_unsafe_target_item);
		ceph_mdsc_get_request(req2);
	}
	spin_unlock(&ci->i_unsafe_lock);

		dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
		     inode, req->r_tid, last_tid);
		ret = !wait_for_completion_timeout(&req->r_safe_completion,
					ceph_timeout_jiffies(req->r_timeout));
	dout("unsafe_requeset_wait %p wait on tid %llu %llu\n",
	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
	if (req1) {
		ret = !wait_for_completion_timeout(&req1->r_safe_completion,
					ceph_timeout_jiffies(req1->r_timeout));
		if (ret)
			ret = -EIO;  /* timed out */

		ceph_mdsc_put_request(req);

		spin_lock(&ci->i_unsafe_lock);
		if (ret || list_empty(head))
			break;
		req = list_first_entry(head, struct ceph_mds_request,
				       r_unsafe_dir_item);
	} while (req->r_tid < last_tid);
out:
	spin_unlock(&ci->i_unsafe_lock);
	return ret;
			err = -EIO;
		ceph_mdsc_put_request(req1);
	}
	if (req2) {
		ret = !wait_for_completion_timeout(&req2->r_safe_completion,
					ceph_timeout_jiffies(req2->r_timeout));
		if (ret)
			err = -EIO;
		ceph_mdsc_put_request(req2);
	}
	return err;
}

int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -2039,7 +2035,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
	dirty = try_flush_caps(inode, &flush_tid);
	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));

	ret = unsafe_dirop_wait(inode);
	ret = unsafe_request_wait(inode);

	/*
	 * only wait on non-file metadata writeback (the mds
+77 −10
Original line number Diff line number Diff line
@@ -34,6 +34,74 @@
 * need to wait for MDS acknowledgement.
 */

/*
 * Calculate the length sum of direct io vectors that can
 * be combined into one page vector.
 *
 * Starting from the current position (@it->iov_offset bytes into the
 * first iovec), accumulate segment lengths for as long as consecutive
 * segments splice together on page boundaries: the current segment
 * must end page-aligned and the next one must start page-aligned.
 */
static size_t dio_get_pagev_size(const struct iov_iter *it)
{
	const struct iovec *iov = it->iov;
	const struct iovec *iovend = iov + it->nr_segs;
	size_t size;

	/* bytes remaining in the segment we are currently inside */
	size = iov->iov_len - it->iov_offset;
	/*
	 * An iov can be page vectored when both the current tail
	 * and the next base are page aligned.
	 */
	while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
	       (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
		size += iov->iov_len;
	}
	/* message previously said "dio_get_pagevlen", which matches no symbol */
	dout("dio_get_pagev_size: len = %zu\n", size);
	return size;
}

/*
 * Allocate a page vector based on (@it, @nbytes).
 * The return value is the tuple describing a page vector,
 * that is (@pages, @page_align, @num_pages).
 *
 * Pins the user pages covering @nbytes of @it into a freshly
 * allocated pointer array.  Returns the array on success (caller owns
 * the array and the page references) or an ERR_PTR() on failure.
 * @it itself is not advanced; iteration happens on a local copy.
 */
static struct page **
dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
		    size_t *page_align, int *num_pages)
{
	struct iov_iter tmp_it = *it;
	size_t align;
	struct page **pages;
	int ret = 0, idx, npages;

	/* offset of the data within its first page */
	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
		(PAGE_SIZE - 1);
	npages = calc_pages_for(align, nbytes);
	/*
	 * Try kmalloc first, fall back to vmalloc for large vectors.
	 * NOTE(review): the failure path below frees via
	 * ceph_put_page_vector() — confirm it releases the array with
	 * kvfree (or equivalent); plain kfree would mishandle the
	 * vmalloc case.  TODO confirm against ceph_put_page_vector().
	 */
	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
	if (!pages) {
		pages = vmalloc(sizeof(*pages) * npages);
		if (!pages)
			return ERR_PTR(-ENOMEM);
	}

	/*
	 * iov_iter_get_pages() may pin fewer pages than asked for, so
	 * loop until all @nbytes are covered, advancing the local copy.
	 */
	for (idx = 0; idx < npages; ) {
		size_t start;
		ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
					 npages - idx, &start);
		if (ret < 0)
			goto fail;

		iov_iter_advance(&tmp_it, ret);
		nbytes -= ret;
		/* ret bytes starting at offset start => pages consumed */
		idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
	}

	/* calc_pages_for() sized the array to cover exactly @nbytes */
	BUG_ON(nbytes != 0);
	*num_pages = npages;
	*page_align = align;
	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
	return pages;
fail:
	/* drop the idx page references pinned so far and free the array */
	ceph_put_page_vector(pages, idx, false);
	return ERR_PTR(ret);
}

/*
 * Prepare an open request.  Preallocate ceph_cap to avoid an
@@ -458,11 +526,10 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
			size_t start;
			ssize_t n;

			n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
			if (n < 0)
				return n;

			num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
			n = dio_get_pagev_size(i);
			pages = dio_get_pages_alloc(i, n, &start, &num_pages);
			if (IS_ERR(pages))
				return PTR_ERR(pages);

			ret = striped_read(inode, off, n,
					   pages, num_pages, checkeof,
@@ -592,7 +659,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
		CEPH_OSD_FLAG_WRITE;

	while (iov_iter_count(from) > 0) {
		u64 len = iov_iter_single_seg_count(from);
		u64 len = dio_get_pagev_size(from);
		size_t start;
		ssize_t n;

@@ -611,14 +678,14 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,

		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);

		n = iov_iter_get_pages_alloc(from, &pages, len, &start);
		if (unlikely(n < 0)) {
			ret = n;
		n = len;
		pages = dio_get_pages_alloc(from, len, &start, &num_pages);
		if (IS_ERR(pages)) {
			ceph_osdc_put_request(req);
			ret = PTR_ERR(pages);
			break;
		}

		num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
		/*
		 * throw out any page cache pages in this range. this
		 * may block.
+1 −0
Original line number Diff line number Diff line
@@ -452,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)

	INIT_LIST_HEAD(&ci->i_unsafe_writes);
	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
	INIT_LIST_HEAD(&ci->i_unsafe_iops);
	spin_lock_init(&ci->i_unsafe_lock);

	ci->i_snap_realm = NULL;
Loading