Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ff87e97a authored by Andy Grover
Browse files

RDS: make m_rdma_op a member of rds_message



This eliminates a separate memory alloc, although
it is now necessary to add an "r_active" flag, since
it is no longer possible to use the m_rdma_op pointer as an
indicator of whether an rdma op is present.

rdma SGs allocated from rm sg pool.

rds_rm_size also gets bigger. It's a little inefficient to
run through CMSGs twice, but it makes later steps a lot smoother.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
parent 21f79afa
Loading
Loading
Loading
Loading
+10 −10
Original line number Diff line number Diff line
@@ -85,8 +85,8 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
			rm->data.m_sg, rm->data.m_nents,
			DMA_TO_DEVICE);

	if (rm->rdma.m_rdma_op) {
		rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
	if (rm->rdma.m_rdma_op.r_active) {
		rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);

		/* If the user asked for a completion notification on this
		 * message, we can implement three different semantics:
@@ -110,10 +110,10 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
		 */
		rds_ib_send_rdma_complete(rm, wc_status);

		if (rm->rdma.m_rdma_op->r_write)
			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
		if (rm->rdma.m_rdma_op.r_write)
			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
		else
			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
	}

	/* If anyone waited for this message to get flushed out, wake
@@ -243,8 +243,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)

				rm = rds_send_get_message(conn, send->s_op);
				if (rm) {
					if (rm->rdma.m_rdma_op)
						rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
					if (rm->rdma.m_rdma_op.r_active)
						rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
					rds_ib_send_rdma_complete(rm, wc.status);
					rds_message_put(rm);
				}
@@ -560,10 +560,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,

		/* If it has a RDMA op, tell the peer we did it. This is
		 * used by the peer to release use-once RDMA MRs. */
		if (rm->rdma.m_rdma_op) {
		if (rm->rdma.m_rdma_op.r_active) {
			struct rds_ext_header_rdma ext_hdr;

			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
			rds_message_add_extension(&rm->m_inc.i_hdr,
					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
		}
@@ -601,7 +601,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
	 * or when requested by the user. Right now, we let
	 * the application choose.
	 */
	if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
	if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
		send_flags = IB_SEND_FENCE;

	/*
+8 −8
Original line number Diff line number Diff line
@@ -85,8 +85,8 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
		     rm->data.m_sg, rm->data.m_nents,
		     DMA_TO_DEVICE);

	if (rm->rdma.m_rdma_op) {
		rds_iw_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
	if (rm->rdma.m_rdma_op.r_active) {
		rds_iw_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);

		/* If the user asked for a completion notification on this
		 * message, we can implement three different semantics:
@@ -110,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
		 */
		rds_iw_send_rdma_complete(rm, wc_status);

		if (rm->rdma.m_rdma_op->r_write)
			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
		if (rm->rdma.m_rdma_op.r_write)
			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
		else
			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
	}

	/* If anyone waited for this message to get flushed out, wake
@@ -591,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,

		/* If it has a RDMA op, tell the peer we did it. This is
		 * used by the peer to release use-once RDMA MRs. */
		if (rm->rdma.m_rdma_op) {
		if (rm->rdma.m_rdma_op.r_active) {
			struct rds_ext_header_rdma ext_hdr;

			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
			rds_message_add_extension(&rm->m_inc.i_hdr,
					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
		}
@@ -632,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
	 * or when requested by the user. Right now, we let
	 * the application choose.
	 */
	if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
	if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
		send_flags = IB_SEND_FENCE;

	/*
+6 −3
Original line number Diff line number Diff line
@@ -69,8 +69,8 @@ static void rds_message_purge(struct rds_message *rm)
	}
	rm->data.m_nents = 0;

	if (rm->rdma.m_rdma_op)
		rds_rdma_free_op(rm->rdma.m_rdma_op);
	if (rm->rdma.m_rdma_op.r_active)
		rds_rdma_free_op(&rm->rdma.m_rdma_op);
	if (rm->rdma.m_rdma_mr)
		rds_mr_put(rm->rdma.m_rdma_mr);
}
@@ -259,14 +259,17 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
{
	struct rds_message *rm;
	unsigned int i;
	int num_sgs = ceil(total_len, PAGE_SIZE);
	int extra_bytes = num_sgs * sizeof(struct scatterlist);

	rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
	rm = rds_message_alloc(extra_bytes, GFP_KERNEL);
	if (!rm)
		return ERR_PTR(-ENOMEM);

	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
	rm->data.m_nents = ceil(total_len, PAGE_SIZE);
	rm->data.m_sg = rds_message_alloc_sgs(rm, num_sgs);

	for (i = 0; i < rm->data.m_nents; ++i) {
		sg_set_page(&rm->data.m_sg[i],
+60 −53
Original line number Diff line number Diff line
@@ -458,26 +458,60 @@ void rds_rdma_free_op(struct rds_rdma_op *ro)
	}

	kfree(ro->r_notifier);
	kfree(ro);
	ro->r_notifier = NULL;
	ro->r_active = 0;
}

/*
 * Count the number of pages needed to describe an incoming iovec.
 *
 * Walks the args->nr_local user-space rds_iovec entries starting at
 * args->local_vec_addr, copies each one into the kernel and sums the
 * per-entry page counts reported by rds_pages_in_vec().
 *
 * Returns the total page count on success, -EFAULT if an entry cannot
 * be copied from user space, or -EINVAL if an entry maps to zero pages
 * or the running total would no longer fit in the int return value.
 */
static int rds_rdma_pages(struct rds_rdma_args *args)
{
	/* Largest total representable in the (signed) int return type. */
	const unsigned int max_tot_pages = ~0U >> 1;
	struct rds_iovec vec;
	struct rds_iovec __user *local_vec;
	unsigned int tot_pages = 0;
	unsigned int nr_pages;
	unsigned int i;

	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;

	/* figure out the number of pages in the vector */
	for (i = 0; i < args->nr_local; i++) {
		if (copy_from_user(&vec, &local_vec[i],
				   sizeof(struct rds_iovec)))
			return -EFAULT;

		nr_pages = rds_pages_in_vec(&vec);
		if (nr_pages == 0)
			return -EINVAL;

		/*
		 * The vector comes from user space: refuse a total that
		 * would wrap, or turn negative once returned as an int,
		 * rather than hand back a count that could be mistaken
		 * for an errno or for a much smaller vector.
		 */
		if (nr_pages > max_tot_pages - tot_pages)
			return -EINVAL;

		tot_pages += nr_pages;
	}

	return tot_pages;
}

/*
 * Compute how many bytes of scatterlist storage are needed for the
 * pages described by args.
 *
 * Returns the byte count on success or a negative errno: a failure
 * from rds_rdma_pages() is passed through unchanged, and -EINVAL is
 * returned if the byte count would not fit in the int return value.
 */
int rds_rdma_extra_size(struct rds_rdma_args *args)
{
	/* Largest byte count representable in the int return type. */
	const unsigned long max_bytes = ~0U >> 1;
	int nr_pages = rds_rdma_pages(args);

	/* Never scale an errno by the element size. */
	if (nr_pages < 0)
		return nr_pages;

	if ((unsigned long)nr_pages > max_bytes / sizeof(struct scatterlist))
		return -EINVAL;

	return nr_pages * sizeof(struct scatterlist);
}

/*
 * args is a pointer to an in-kernel copy in the sendmsg cmsg.
 */
static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
static int rds_rdma_prepare(struct rds_message *rm,
			    struct rds_sock *rs,
			    struct rds_rdma_args *args)
{
	struct rds_iovec vec;
	struct rds_rdma_op *op = NULL;
	struct rds_rdma_op *op = &rm->rdma.m_rdma_op;
	unsigned int nr_pages;
	unsigned int max_pages;
	unsigned int nr_bytes;
	struct page **pages = NULL;
	struct rds_iovec __user *local_vec;
	struct scatterlist *sg;
	unsigned int nr;
	unsigned int i, j;
	int ret;
	int ret = 0;


	if (rs->rs_bound_addr == 0) {
@@ -490,44 +524,21 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
		goto out;
	}

	nr_pages = 0;
	max_pages = 0;

	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;

	/* figure out the number of pages in the vector */
	for (i = 0; i < args->nr_local; i++) {
		if (copy_from_user(&vec, &local_vec[i],
				   sizeof(struct rds_iovec))) {
			ret = -EFAULT;
			goto out;
		}

		nr = rds_pages_in_vec(&vec);
		if (nr == 0) {
			ret = -EINVAL;
	nr_pages = rds_rdma_pages(args);
	if (nr_pages < 0)
		goto out;
		}

		max_pages = max(nr, max_pages);
		nr_pages += nr;
	}

	pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto out;
	}

	op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
	if (!op) {
		ret = -ENOMEM;
		goto out;
	}

	op->r_sg = rds_message_alloc_sgs(rm, nr_pages);
	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
	op->r_active = 1;
	op->r_recverr = rs->rs_recverr;
	WARN_ON(!nr_pages);
	sg_init_table(op->r_sg, nr_pages);
@@ -564,6 +575,8 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
	       (unsigned long long)args->remote_vec.addr,
	       op->r_key);

	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;

	for (i = 0; i < args->nr_local; i++) {
		if (copy_from_user(&vec, &local_vec[i],
				   sizeof(struct rds_iovec))) {
@@ -580,11 +593,6 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
		rs->rs_user_addr = vec.addr;
		rs->rs_user_bytes = vec.bytes;

		/* did the user change the vec under us? */
		if (nr > max_pages || op->r_nents + nr > nr_pages) {
			ret = -EINVAL;
			goto out;
		}
		/* If it's a WRITE operation, we want to pin the pages for reading.
		 * If it's a READ operation, we need to pin the pages for writing.
		 */
@@ -599,6 +607,7 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,

		for (j = 0; j < nr; j++) {
			unsigned int offset = vec.addr & ~PAGE_MASK;
			struct scatterlist *sg;

			sg = &op->r_sg[op->r_nents + j];
			sg_set_page(sg, pages[j],
@@ -628,12 +637,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
	ret = 0;
out:
	kfree(pages);
	if (ret) {
		if (op)
	if (ret)
		rds_rdma_free_op(op);
		op = ERR_PTR(ret);
	}
	return op;

	return ret;
}

/*
@@ -643,17 +650,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
			  struct cmsghdr *cmsg)
{
	struct rds_rdma_op *op;
	int ret;

	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
	    rm->rdma.m_rdma_op)
	    rm->rdma.m_rdma_op.r_active)
		return -EINVAL;

	op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
	if (IS_ERR(op))
		return PTR_ERR(op);
	ret = rds_rdma_prepare(rm, rs, CMSG_DATA(cmsg));
	if (ret)
		return ret;

	rds_stats_inc(s_send_rdma);
	rm->rdma.m_rdma_op = op;
	return 0;
}

+1 −1
Original line number Diff line number Diff line
@@ -316,7 +316,7 @@ struct rds_message {
	rds_rdma_cookie_t	m_rdma_cookie;
	struct {
		struct {
			struct rds_rdma_op	*m_rdma_op;
			struct rds_rdma_op	m_rdma_op;
			struct rds_mr		*m_rdma_mr;
		} rdma;
		struct {
Loading