Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 240cd6a8 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull Ceph updates from Sage Weil:
 "The biggest chunk is a series of patches from Ilya that add support
  for new Ceph osd and crush map features, including some new tunables,
  primary affinity, and the new encoding that is needed for erasure
  coding support.  This brings things into parity with the server side
  and the looming firefly release.  There is also support for allocation
  hints in RBD that help limit fragmentation on the server side.

  There is also a series of patches from Zheng fixing NFS reexport,
  directory fragmentation support, flock vs fcntl behavior, and some
  issues with clustered MDS.

  Finally, there are some miscellaneous fixes from Yunchuan Wen for
  fscache, Fabian Frederick for ACLs, and from me for fsync(dirfd)
  behavior"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (79 commits)
  ceph: skip invalid dentry during dcache readdir
  libceph: dump pool {read,write}_tier to debugfs
  libceph: output primary affinity values on osdmap updates
  ceph: flush cap release queue when trimming session caps
  ceph: don't grabs open file reference for aborted request
  ceph: drop extra open file reference in ceph_atomic_open()
  ceph: preallocate buffer for readdir reply
  libceph: enable PRIMARY_AFFINITY feature bit
  libceph: redo ceph_calc_pg_primary() in terms of ceph_calc_pg_acting()
  libceph: add support for osd primary affinity
  libceph: add support for primary_temp mappings
  libceph: return primary from ceph_calc_pg_acting()
  libceph: switch ceph_calc_pg_acting() to new helpers
  libceph: introduce apply_temps() helper
  libceph: introduce pg_to_raw_osds() and raw_to_up_osds() helpers
  libceph: ceph_can_shift_osds(pool) and pool type defines
  libceph: ceph_osd_{exists,is_up,is_down}(osd) definitions
  libceph: enable OSDMAP_ENC feature bit
  libceph: primary_affinity decode bits
  libceph: primary_affinity infrastructure
  ...
parents 30211125 a30be7cb
Loading
Loading
Loading
Loading
+59 −28
Original line number Diff line number Diff line
@@ -1654,7 +1654,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;

	BUG_ON(osd_req->r_num_ops > 2);
	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);

	/*
	 * We support a 64-bit length, but ultimately it has to be
@@ -1662,11 +1662,15 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64)UINT_MAX);

	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_SETALLOCHINT:
		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
		/* fall through */
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
@@ -1715,9 +1719,16 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
			snapc, CEPH_NOSNAP, &mtime);
}

/*
 * Create an osd request.  A read request has one osd op (read).
 * A write request has either one (watch) or two (hint+write) osd ops.
 * (All rbd data writes are prefixed with an allocation hint op, but
 * technically osd watch is a write request, hence this distinction.)
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					unsigned int num_ops,
					struct rbd_obj_request *obj_request)
{
	struct ceph_snap_context *snapc = NULL;
@@ -1733,10 +1744,13 @@ static struct ceph_osd_request *rbd_osd_req_create(
			snapc = img_request->snapc;
	}

	/* Allocate and initialize the request, for the single op */
	rbd_assert(num_ops == 1 || (write_request && num_ops == 2));

	/* Allocate and initialize the request, for the num_ops ops */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
					  GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

@@ -1756,8 +1770,8 @@ static struct ceph_osd_request *rbd_osd_req_create(

/*
 * Create a copyup osd request based on the information in the
 * object request supplied.  A copyup request has two osd ops,
 * a copyup method call, and a "normal" write request.
 * object request supplied.  A copyup request has three osd ops,
 * a copyup method call, a hint op, and a write op.
 */
static struct ceph_osd_request *
rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
@@ -1773,12 +1787,12 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
	rbd_assert(img_request);
	rbd_assert(img_request_write_test(img_request));

	/* Allocate and initialize the request, for the two ops */
	/* Allocate and initialize the request, for the three ops */

	snapc = img_request->snapc;
	rbd_dev = img_request->rbd_dev;
	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

@@ -2178,6 +2192,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
		const char *object_name;
		u64 offset;
		u64 length;
		unsigned int which = 0;

		object_name = rbd_segment_name(rbd_dev, img_offset);
		if (!object_name)
@@ -2190,6 +2205,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
		rbd_segment_name_free(object_name);
		if (!obj_request)
			goto out_unwind;

		/*
		 * set obj_request->img_request before creating the
		 * osd_request so that it gets the right snapc
@@ -2207,7 +2223,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
								clone_size,
								GFP_ATOMIC);
			if (!obj_request->bio_list)
				goto out_partial;
				goto out_unwind;
		} else {
			unsigned int page_count;

@@ -2220,19 +2236,27 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
		}

		osd_req = rbd_osd_req_create(rbd_dev, write_request,
					     (write_request ? 2 : 1),
					     obj_request);
		if (!osd_req)
			goto out_partial;
			goto out_unwind;
		obj_request->osd_req = osd_req;
		obj_request->callback = rbd_img_obj_callback;

		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
		if (write_request) {
			osd_req_op_alloc_hint_init(osd_req, which,
					     rbd_obj_bytes(&rbd_dev->header),
					     rbd_obj_bytes(&rbd_dev->header));
			which++;
		}

		osd_req_op_extent_init(osd_req, which, opcode, offset, length,
				       0, 0);
		if (type == OBJ_REQUEST_BIO)
			osd_req_op_extent_osd_data_bio(osd_req, 0,
			osd_req_op_extent_osd_data_bio(osd_req, which,
					obj_request->bio_list, length);
		else
			osd_req_op_extent_osd_data_pages(osd_req, 0,
			osd_req_op_extent_osd_data_pages(osd_req, which,
					obj_request->pages, length,
					offset & ~PAGE_MASK, false, false);

@@ -2249,11 +2273,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,

	return 0;

out_partial:
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);
		rbd_img_obj_request_del(img_request, obj_request);

	return -ENOMEM;
}
@@ -2353,7 +2375,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)

	/*
	 * The original osd request is of no use to us any more.
	 * We need a new one that can hold the two ops in a copyup
	 * We need a new one that can hold the three ops in a copyup
	 * request.  Allocate the new copyup osd request for the
	 * original request, and release the old one.
	 */
@@ -2372,17 +2394,22 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
						false, false);

	/* Then the original write request op */
	/* Then the hint op */

	osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
				   rbd_obj_bytes(&rbd_dev->header));

	/* And the original write request op */

	offset = orig_request->offset;
	length = orig_request->length;
	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
	osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
					offset, length, 0, 0);
	if (orig_request->type == OBJ_REQUEST_BIO)
		osd_req_op_extent_osd_data_bio(osd_req, 1,
		osd_req_op_extent_osd_data_bio(osd_req, 2,
					orig_request->bio_list, length);
	else
		osd_req_op_extent_osd_data_pages(osd_req, 1,
		osd_req_op_extent_osd_data_pages(osd_req, 2,
					orig_request->pages, length,
					offset & ~PAGE_MASK, false, false);

@@ -2603,7 +2630,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)

	rbd_assert(obj_request->img_request);
	rbd_dev = obj_request->img_request->rbd_dev;
	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
						   stat_request);
	if (!stat_request->osd_req)
		goto out;
@@ -2807,7 +2834,8 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
		return -ENOMEM;

	ret = -ENOMEM;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
						  obj_request);
	if (!obj_request->osd_req)
		goto out;

@@ -2870,7 +2898,8 @@ static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
	if (!obj_request)
		goto out_cancel;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
						  obj_request);
	if (!obj_request->osd_req)
		goto out_cancel;

@@ -2978,7 +3007,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
						  obj_request);
	if (!obj_request->osd_req)
		goto out;

@@ -3211,7 +3241,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
						  obj_request);
	if (!obj_request->osd_req)
		goto out;

+1 −0
Original line number Diff line number Diff line
@@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
	ci->fscache = fscache_acquire_cookie(fsc->fscache,
					     &ceph_fscache_inode_object_def,
					     ci, true);
	fscache_check_consistency(ci->fscache);
done:
	mutex_unlock(&inode->i_mutex);

+10 −0
Original line number Diff line number Diff line
@@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
void ceph_queue_revalidate(struct inode *inode);

/*
 * Look up the ceph inode info for @inode and hand its fscache cookie
 * (ci->fscache) to fscache_attr_changed().
 *
 * NOTE(review): going by the name, this is meant to be called after the
 * inode's size changes so the cache can pick up the new object size --
 * confirm against the call sites, which are not visible here.
 */
static inline void ceph_fscache_update_objectsize(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	fscache_attr_changed(ci->fscache);
}

static inline void ceph_fscache_invalidate(struct inode *inode)
{
	fscache_invalidate(ceph_inode(inode)->fscache);
@@ -135,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
{
}

/*
 * Empty variant of ceph_fscache_update_objectsize(): takes @inode and
 * does nothing.
 *
 * NOTE(review): presumably the fallback used when fscache support is
 * compiled out -- the guarding preprocessor conditional is outside this
 * view; confirm.
 */
static inline void ceph_fscache_update_objectsize(struct inode *inode)
{
}

/*
 * Empty variant of ceph_fscache_invalidate(): takes @inode and does
 * nothing.
 *
 * NOTE(review): presumably the fallback used when fscache support is
 * compiled out -- the guarding preprocessor conditional is outside this
 * view; confirm.
 */
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
+7 −2
Original line number Diff line number Diff line
@@ -622,8 +622,10 @@ retry:

	if (flags & CEPH_CAP_FLAG_AUTH) {
		if (ci->i_auth_cap == NULL ||
		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
			ci->i_auth_cap = cap;
			cap->mds_wanted = wanted;
		}
		ci->i_cap_exporting_issued = 0;
	} else {
		WARN_ON(ci->i_auth_cap == cap);
@@ -885,7 +887,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		if (cap == ci->i_auth_cap)
			mds_wanted |= cap->mds_wanted;
		else
			mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
	}
	return mds_wanted;
}
+4 −1
Original line number Diff line number Diff line
@@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)
		} else if (req->r_path1) {
			seq_printf(s, " #%llx/%s", req->r_ino1.ino,
				   req->r_path1);
		} else {
			seq_printf(s, " #%llx", req->r_ino1.ino);
		}

		if (req->r_old_dentry) {
@@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)
				path = NULL;
			spin_lock(&req->r_old_dentry->d_lock);
			seq_printf(s, " #%llx/%.*s (%s)",
			   ceph_ino(req->r_old_dentry_dir),
				   req->r_old_dentry_dir ?
				   ceph_ino(req->r_old_dentry_dir) : 0,
				   req->r_old_dentry->d_name.len,
				   req->r_old_dentry->d_name.name,
				   path ? path : "");
Loading