
Commit cc3ea893 authored by Trond Myklebust

Merge tag 'nfs-rdma-for-3.20' of git://git.linux-nfs.org/projects/anna/nfs-rdma



NFS: Client side changes for RDMA

These patches improve the scalability of the NFSoRDMA client and take large
variables off of the stack.  Additionally, the GFP_* flags are updated to
match what TCP uses.
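The GFP flag change referred to here is visible in the xprt_rdma_allocate() hunk further down: ordinary allocations now use GFP_NOIO | __GFP_NOWARN, and a task swapping over the transport gets __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN, the same policy the TCP transport applies. A minimal sketch of that flag selection, using a hypothetical helper name that is not part of this patch set:

#include <linux/gfp.h>
#include <linux/sunrpc/sched.h>

/* Illustrative helper only (not in the patch set): choose allocation
 * flags the way the updated xprt_rdma_allocate() does. GFP_NOIO keeps
 * the allocation from recursing into the I/O path; a task that is
 * swapping over this transport gets memalloc/nowait semantics instead
 * of sleeping for memory.
 */
static gfp_t rpcrdma_alloc_gfp(const struct rpc_task *task)
{
	gfp_t flags = GFP_NOIO | __GFP_NOWARN;

	if (RPC_IS_SWAPPER(task))
		flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
	return flags;
}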

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>

* tag 'nfs-rdma-for-3.20' of git://git.linux-nfs.org/projects/anna/nfs-rdma: (21 commits)
  xprtrdma: Update the GFP flags used in xprt_rdma_allocate()
  xprtrdma: Clean up after adding regbuf management
  xprtrdma: Allocate zero pad separately from rpcrdma_buffer
  xprtrdma: Allocate RPC/RDMA receive buffer separately from struct rpcrdma_rep
  xprtrdma: Allocate RPC/RDMA send buffer separately from struct rpcrdma_req
  xprtrdma: Allocate RPC send buffer separately from struct rpcrdma_req
  xprtrdma: Add struct rpcrdma_regbuf and helpers
  xprtrdma: Refactor rpcrdma_buffer_create() and rpcrdma_buffer_destroy()
  xprtrdma: Simplify synopsis of rpcrdma_buffer_create()
  xprtrdma: Take struct ib_qp_attr and ib_qp_init_attr off the stack
  xprtrdma: Take struct ib_device_attr off the stack
  xprtrdma: Free the pd if ib_query_qp() fails
  xprtrdma: Remove rpcrdma_ep::rep_func and ::rep_xprt
  xprtrdma: Move credit update to RPC reply handler
  xprtrdma: Remove rl_mr field, and the mr_chunk union
  xprtrdma: Remove rpcrdma_ep::rep_ia
  xprtrdma: Rename "xprt" and "rdma_connect" fields in struct rpcrdma_xprt
  xprtrdma: Clean up hdrlen
  xprtrdma: Display XIDs in host byte order
  xprtrdma: Modernize htonl and ntohl
  ...
parents c7c545d4 a0a1d50c
+13 −1
@@ -42,6 +42,9 @@
 
 #include <linux/types.h>
 
+#define RPCRDMA_VERSION		1
+#define rpcrdma_version		cpu_to_be32(RPCRDMA_VERSION)
+
 struct rpcrdma_segment {
 	__be32 rs_handle;	/* Registered memory handle */
 	__be32 rs_length;	/* Length of the chunk in bytes */
@@ -95,7 +98,10 @@ struct rpcrdma_msg {
 	} rm_body;
 };
 
-#define RPCRDMA_HDRLEN_MIN	28
+/*
+ * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks
+ */
+#define RPCRDMA_HDRLEN_MIN	(sizeof(__be32) * 7)
 
 enum rpcrdma_errcode {
 	ERR_VERS = 1,
@@ -115,4 +121,10 @@ enum rpcrdma_proc {
 	RDMA_ERROR = 4		/* An RPC RDMA encoding error */
 };
 
+#define rdma_msg	cpu_to_be32(RDMA_MSG)
+#define rdma_nomsg	cpu_to_be32(RDMA_NOMSG)
+#define rdma_msgp	cpu_to_be32(RDMA_MSGP)
+#define rdma_done	cpu_to_be32(RDMA_DONE)
+#define rdma_error	cpu_to_be32(RDMA_ERROR)
+
 #endif				/* _LINUX_SUNRPC_RPC_RDMA_H */
+0 −2
@@ -63,8 +63,6 @@ extern atomic_t rdma_stat_rq_prod;
 extern atomic_t rdma_stat_sq_poll;
 extern atomic_t rdma_stat_sq_prod;
 
-#define RPCRDMA_VERSION 1
-
 /*
  * Contexts are built when an RDMA request is created and are a
  * record of the resources that can be recovered when the request
+62 −46
@@ -209,9 +209,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 		if (cur_rchunk) {	/* read */
 			cur_rchunk->rc_discrim = xdr_one;
 			/* all read chunks have the same "position" */
-			cur_rchunk->rc_position = htonl(pos);
-			cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey);
-			cur_rchunk->rc_target.rs_length = htonl(seg->mr_len);
+			cur_rchunk->rc_position = cpu_to_be32(pos);
+			cur_rchunk->rc_target.rs_handle =
+						cpu_to_be32(seg->mr_rkey);
+			cur_rchunk->rc_target.rs_length =
+						cpu_to_be32(seg->mr_len);
 			xdr_encode_hyper(
 					(__be32 *)&cur_rchunk->rc_target.rs_offset,
 					seg->mr_base);
@@ -222,8 +224,10 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 			cur_rchunk++;
 			r_xprt->rx_stats.read_chunk_count++;
 		} else {		/* write/reply */
-			cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey);
-			cur_wchunk->wc_target.rs_length = htonl(seg->mr_len);
+			cur_wchunk->wc_target.rs_handle =
+						cpu_to_be32(seg->mr_rkey);
+			cur_wchunk->wc_target.rs_length =
+						cpu_to_be32(seg->mr_len);
 			xdr_encode_hyper(
 					(__be32 *)&cur_wchunk->wc_target.rs_offset,
 					seg->mr_base);
@@ -257,7 +261,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
 	} else {
 		warray->wc_discrim = xdr_one;
-		warray->wc_nchunks = htonl(nchunks);
+		warray->wc_nchunks = cpu_to_be32(nchunks);
 		iptr = (__be32 *) cur_wchunk;
 		if (type == rpcrdma_writech) {
 			*iptr++ = xdr_zero; /* finish the write chunk list */
@@ -290,7 +294,7 @@ ssize_t
 rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
 {
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-	struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;
+	struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf);
 
 	if (req->rl_rtype != rpcrdma_noch)
 		result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
@@ -402,13 +406,12 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	base = rqst->rq_svec[0].iov_base;
 	rpclen = rqst->rq_svec[0].iov_len;
 
-	/* build RDMA header in private area at front */
-	headerp = (struct rpcrdma_msg *) req->rl_base;
-	/* don't htonl XID, it's already done in request */
+	headerp = rdmab_to_msg(req->rl_rdmabuf);
+	/* don't byte-swap XID, it's already done in request */
 	headerp->rm_xid = rqst->rq_xid;
-	headerp->rm_vers = xdr_one;
-	headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests);
-	headerp->rm_type = htonl(RDMA_MSG);
+	headerp->rm_vers = rpcrdma_version;
+	headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
+	headerp->rm_type = rdma_msg;
 
 	/*
 	 * Chunks needed for results?
@@ -468,7 +471,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 		return -EIO;
 	}
 
-	hdrlen = 28; /*sizeof *headerp;*/
+	hdrlen = RPCRDMA_HDRLEN_MIN;
 	padlen = 0;
 
 	/*
@@ -482,11 +485,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 						RPCRDMA_INLINE_PAD_VALUE(rqst));
 
 		if (padlen) {
-			headerp->rm_type = htonl(RDMA_MSGP);
+			headerp->rm_type = rdma_msgp;
 			headerp->rm_body.rm_padded.rm_align =
-				htonl(RPCRDMA_INLINE_PAD_VALUE(rqst));
+				cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
 			headerp->rm_body.rm_padded.rm_thresh =
-				htonl(RPCRDMA_INLINE_PAD_THRESH);
+				cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
 			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
 			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
 			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
@@ -524,7 +527,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
 		__func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
-		headerp, base, req->rl_iov.lkey);
+		headerp, base, rdmab_lkey(req->rl_rdmabuf));
 
 	/*
 	 * initialize send_iov's - normally only two: rdma chunk header and
@@ -533,26 +536,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * header and any write data. In all non-rdma cases, any following
 	 * data has been copied into the RPC header buffer.
 	 */
-	req->rl_send_iov[0].addr = req->rl_iov.addr;
+	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
 	req->rl_send_iov[0].length = hdrlen;
-	req->rl_send_iov[0].lkey = req->rl_iov.lkey;
+	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
 
-	req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base);
+	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
 	req->rl_send_iov[1].length = rpclen;
-	req->rl_send_iov[1].lkey = req->rl_iov.lkey;
+	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
 
 	req->rl_niovs = 2;
 
 	if (padlen) {
 		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
 
-		req->rl_send_iov[2].addr = ep->rep_pad.addr;
+		req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
 		req->rl_send_iov[2].length = padlen;
-		req->rl_send_iov[2].lkey = ep->rep_pad.lkey;
+		req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
 
 		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
 		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
-		req->rl_send_iov[3].lkey = req->rl_iov.lkey;
+		req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
 
 		req->rl_niovs = 4;
 	}
@@ -569,8 +572,9 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
 {
 	unsigned int i, total_len;
 	struct rpcrdma_write_chunk *cur_wchunk;
+	char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
 
-	i = ntohl(**iptrp);	/* get array count */
+	i = be32_to_cpu(**iptrp);
 	if (i > max)
 		return -1;
 	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
@@ -582,11 +586,11 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
 			xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
 			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
 				__func__,
-				ntohl(seg->rs_length),
+				be32_to_cpu(seg->rs_length),
 				(unsigned long long)off,
-				ntohl(seg->rs_handle));
+				be32_to_cpu(seg->rs_handle));
 		}
-		total_len += ntohl(seg->rs_length);
+		total_len += be32_to_cpu(seg->rs_length);
 		++cur_wchunk;
 	}
 	/* check and adjust for properly terminated write chunk */
@@ -596,7 +600,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
 			return -1;
 		cur_wchunk = (struct rpcrdma_write_chunk *) w;
 	}
-	if ((char *) cur_wchunk > rep->rr_base + rep->rr_len)
+	if ((char *)cur_wchunk > base + rep->rr_len)
 		return -1;
 
 	*iptrp = (__be32 *) cur_wchunk;
@@ -691,7 +695,9 @@ rpcrdma_connect_worker(struct work_struct *work)
 {
 	struct rpcrdma_ep *ep =
 		container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
-	struct rpc_xprt *xprt = ep->rep_xprt;
+	struct rpcrdma_xprt *r_xprt =
+		container_of(ep, struct rpcrdma_xprt, rx_ep);
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
 
 	spin_lock_bh(&xprt->transport_lock);
 	if (++xprt->connect_cookie == 0)	/* maintain a reserved value */
@@ -732,7 +738,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	struct rpc_xprt *xprt = rep->rr_xprt;
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	__be32 *iptr;
-	int rdmalen, status;
+	int credits, rdmalen, status;
 	unsigned long cwnd;
 
 	/* Check status. If bad, signal disconnect and return rep to pool */
@@ -744,14 +750,14 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 		}
 		return;
 	}
-	if (rep->rr_len < 28) {
+	if (rep->rr_len < RPCRDMA_HDRLEN_MIN) {
 		dprintk("RPC:       %s: short/invalid reply\n", __func__);
 		goto repost;
 	}
-	headerp = (struct rpcrdma_msg *) rep->rr_base;
-	if (headerp->rm_vers != xdr_one) {
+	headerp = rdmab_to_msg(rep->rr_rdmabuf);
+	if (headerp->rm_vers != rpcrdma_version) {
 		dprintk("RPC:       %s: invalid version %d\n",
-			__func__, ntohl(headerp->rm_vers));
+			__func__, be32_to_cpu(headerp->rm_vers));
 		goto repost;
 	}
 
@@ -762,7 +768,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 		spin_unlock(&xprt->transport_lock);
 		dprintk("RPC:       %s: reply 0x%p failed "
 			"to match any request xid 0x%08x len %d\n",
-			__func__, rep, headerp->rm_xid, rep->rr_len);
+			__func__, rep, be32_to_cpu(headerp->rm_xid),
+			rep->rr_len);
 repost:
 		r_xprt->rx_stats.bad_reply_count++;
 		rep->rr_func = rpcrdma_reply_handler;
@@ -778,13 +785,14 @@ repost:
 		spin_unlock(&xprt->transport_lock);
 		dprintk("RPC:       %s: duplicate reply 0x%p to RPC "
 			"request 0x%p: xid 0x%08x\n", __func__, rep, req,
-			headerp->rm_xid);
+			be32_to_cpu(headerp->rm_xid));
 		goto repost;
 	}
 
 	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
 		"                   RPC request 0x%p xid 0x%08x\n",
-			__func__, rep, req, rqst, headerp->rm_xid);
+			__func__, rep, req, rqst,
+			be32_to_cpu(headerp->rm_xid));
 
 	/* from here on, the reply is no longer an orphan */
 	req->rl_reply = rep;
@@ -793,7 +801,7 @@ repost:
 	/* check for expected message types */
 	/* The order of some of these tests is important. */
 	switch (headerp->rm_type) {
-	case htonl(RDMA_MSG):
+	case rdma_msg:
 		/* never expect read chunks */
 		/* never expect reply chunks (two ways to check) */
 		/* never expect write chunks without having offered RDMA */
@@ -824,22 +832,24 @@ repost:
 		} else {
 			/* else ordinary inline */
 			rdmalen = 0;
-			iptr = (__be32 *)((unsigned char *)headerp + 28);
-			rep->rr_len -= 28; /*sizeof *headerp;*/
+			iptr = (__be32 *)((unsigned char *)headerp +
+							RPCRDMA_HDRLEN_MIN);
+			rep->rr_len -= RPCRDMA_HDRLEN_MIN;
 			status = rep->rr_len;
 		}
 		/* Fix up the rpc results for upper layer */
 		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
 		break;
 
-	case htonl(RDMA_NOMSG):
+	case rdma_nomsg:
 		/* never expect read or write chunks, always reply chunks */
 		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
 		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
 		    headerp->rm_body.rm_chunks[2] != xdr_one ||
 		    req->rl_nchunks == 0)
 			goto badheader;
-		iptr = (__be32 *)((unsigned char *)headerp + 28);
+		iptr = (__be32 *)((unsigned char *)headerp +
+							RPCRDMA_HDRLEN_MIN);
 		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
 		if (rdmalen < 0)
 			goto badheader;
@@ -853,7 +863,7 @@ badheader:
 		dprintk("%s: invalid rpcrdma reply header (type %d):"
 				" chunks[012] == %d %d %d"
 				" expected chunks <= %d\n",
-				__func__, ntohl(headerp->rm_type),
+				__func__, be32_to_cpu(headerp->rm_type),
 				headerp->rm_body.rm_chunks[0],
 				headerp->rm_body.rm_chunks[1],
 				headerp->rm_body.rm_chunks[2],
@@ -863,8 +873,14 @@ badheader:
 		break;
 	}
 
+	credits = be32_to_cpu(headerp->rm_credit);
+	if (credits == 0)
+		credits = 1;	/* don't deadlock */
+	else if (credits > r_xprt->rx_buf.rb_max_requests)
+		credits = r_xprt->rx_buf.rb_max_requests;
+
 	cwnd = xprt->cwnd;
-	xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
+	xprt->cwnd = credits << RPC_CWNDSHIFT;
 	if (xprt->cwnd > cwnd)
 		xprt_release_rqst_cong(rqst->rq_task);
+82 −100
@@ -200,9 +200,9 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt)
 static void
 xprt_rdma_connect_worker(struct work_struct *work)
 {
-	struct rpcrdma_xprt *r_xprt =
-		container_of(work, struct rpcrdma_xprt, rdma_connect.work);
-	struct rpc_xprt *xprt = &r_xprt->xprt;
+	struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
+						   rx_connect_worker.work);
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
 	int rc = 0;
 
 	xprt_clear_connected(xprt);
@@ -235,7 +235,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 
 	dprintk("RPC:       %s: called\n", __func__);
 
-	cancel_delayed_work_sync(&r_xprt->rdma_connect);
+	cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
 
 	xprt_clear_connected(xprt);
 
@@ -364,8 +364,7 @@ xprt_setup_rdma(struct xprt_create *args)
 	 * any inline data. Also specify any padding which will be provided
 	 * from a preregistered zero buffer.
 	 */
-	rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia,
-				&new_xprt->rx_data);
+	rc = rpcrdma_buffer_create(new_xprt);
 	if (rc)
 		goto out3;
 
@@ -374,9 +373,8 @@ xprt_setup_rdma(struct xprt_create *args)
 	 * connection loss notification is async. We also catch connection loss
 	 * when reaping receives.
 	 */
-	INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker);
-	new_ep->rep_func = rpcrdma_conn_func;
-	new_ep->rep_xprt = xprt;
+	INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
+			  xprt_rdma_connect_worker);
 
 	xprt_rdma_format_addresses(xprt);
 	xprt->max_payload = rpcrdma_max_payload(new_xprt);
@@ -434,7 +432,7 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 
 	if (r_xprt->rx_ep.rep_connected != 0) {
 		/* Reconnect */
-		schedule_delayed_work(&r_xprt->rdma_connect,
+		schedule_delayed_work(&r_xprt->rx_connect_worker,
 				      xprt->reestablish_timeout);
 		xprt->reestablish_timeout <<= 1;
 		if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
@@ -442,86 +440,93 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 		else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
 			xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
 	} else {
-		schedule_delayed_work(&r_xprt->rdma_connect, 0);
+		schedule_delayed_work(&r_xprt->rx_connect_worker, 0);
 		if (!RPC_IS_ASYNC(task))
-			flush_delayed_work(&r_xprt->rdma_connect);
+			flush_delayed_work(&r_xprt->rx_connect_worker);
 	}
 }
 
 /*
  * The RDMA allocate/free functions need the task structure as a place
  * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
- * sequence. For this reason, the recv buffers are attached to send
- * buffers for portions of the RPC. Note that the RPC layer allocates
- * both send and receive buffers in the same call. We may register
- * the receive buffer portion when using reply chunks.
+ * sequence.
+ *
+ * The RPC layer allocates both send and receive buffers in the same call
+ * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
+ * We may register rq_rcv_buf when using reply chunks.
  */
 static void *
 xprt_rdma_allocate(struct rpc_task *task, size_t size)
 {
 	struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
-	struct rpcrdma_req *req, *nreq;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_regbuf *rb;
+	struct rpcrdma_req *req;
+	size_t min_size;
+	gfp_t flags;
 
-	req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
+	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
 	if (req == NULL)
 		return NULL;
 
-	if (size > req->rl_size) {
-		dprintk("RPC:       %s: size %zd too large for buffer[%zd]: "
-			"prog %d vers %d proc %d\n",
-			__func__, size, req->rl_size,
-			task->tk_client->cl_prog, task->tk_client->cl_vers,
-			task->tk_msg.rpc_proc->p_proc);
-		/*
-		 * Outgoing length shortage. Our inline write max must have
-		 * been configured to perform direct i/o.
-		 *
-		 * This is therefore a large metadata operation, and the
-		 * allocate call was made on the maximum possible message,
-		 * e.g. containing long filename(s) or symlink data. In
-		 * fact, while these metadata operations *might* carry
-		 * large outgoing payloads, they rarely *do*. However, we
-		 * have to commit to the request here, so reallocate and
-		 * register it now. The data path will never require this
-		 * reallocation.
-		 *
-		 * If the allocation or registration fails, the RPC framework
-		 * will (doggedly) retry.
-		 */
-		if (task->tk_flags & RPC_TASK_SWAPPER)
-			nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
-		else
-			nreq = kmalloc(sizeof *req + size, GFP_NOFS);
-		if (nreq == NULL)
-			goto outfail;
-
-		if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
-				nreq->rl_base, size + sizeof(struct rpcrdma_req)
-				- offsetof(struct rpcrdma_req, rl_base),
-				&nreq->rl_handle, &nreq->rl_iov)) {
-			kfree(nreq);
-			goto outfail;
-		}
-		rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
-		nreq->rl_size = size;
-		nreq->rl_niovs = 0;
-		nreq->rl_nchunks = 0;
-		nreq->rl_buffer = (struct rpcrdma_buffer *)req;
-		nreq->rl_reply = req->rl_reply;
-		memcpy(nreq->rl_segments,
-			req->rl_segments, sizeof nreq->rl_segments);
-		/* flag the swap with an unused field */
-		nreq->rl_iov.length = 0;
-		req->rl_reply = NULL;
-		req = nreq;
-	}
+	flags = GFP_NOIO | __GFP_NOWARN;
+	if (RPC_IS_SWAPPER(task))
+		flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
+
+	if (req->rl_rdmabuf == NULL)
+		goto out_rdmabuf;
+	if (req->rl_sendbuf == NULL)
+		goto out_sendbuf;
+	if (size > req->rl_sendbuf->rg_size)
+		goto out_sendbuf;
+
+out:
 	dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
 	req->rl_connect_cookie = 0;	/* our reserved value */
-	return req->rl_xdr_buf;
+	return req->rl_sendbuf->rg_base;
+
+out_rdmabuf:
+	min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
+	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
+	if (IS_ERR(rb))
+		goto out_fail;
+	req->rl_rdmabuf = rb;
+
+out_sendbuf:
+	/* XDR encoding and RPC/RDMA marshaling of this request has not
+	 * yet occurred. Thus a lower bound is needed to prevent buffer
+	 * overrun during marshaling.
+	 *
+	 * RPC/RDMA marshaling may choose to send payload bearing ops
+	 * inline, if the result is smaller than the inline threshold.
+	 * The value of the "size" argument accounts for header
+	 * requirements but not for the payload in these cases.
+	 *
+	 * Likewise, allocate enough space to receive a reply up to the
+	 * size of the inline threshold.
+	 *
+	 * It's unlikely that both the send header and the received
+	 * reply will be large, but slush is provided here to allow
+	 * flexibility when marshaling.
+	 */
+	min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
+	min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
+	if (size < min_size)
+		size = min_size;
+
+	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
+	if (IS_ERR(rb))
+		goto out_fail;
+	rb->rg_owner = req;
+
+	r_xprt->rx_stats.hardway_register_count += size;
+	rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
+	req->rl_sendbuf = rb;
+	goto out;
 
-outfail:
+out_fail:
 	rpcrdma_buffer_put(req);
-	rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+	r_xprt->rx_stats.failed_marshal_count++;
 	return NULL;
 }
 
@@ -533,47 +538,24 @@ xprt_rdma_free(void *buffer)
 {
 	struct rpcrdma_req *req;
 	struct rpcrdma_xprt *r_xprt;
-	struct rpcrdma_rep *rep;
+	struct rpcrdma_regbuf *rb;
 	int i;
 
 	if (buffer == NULL)
 		return;
 
-	req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
-	if (req->rl_iov.length == 0) {	/* see allocate above */
-		r_xprt = container_of(((struct rpcrdma_req *) req->rl_buffer)->rl_buffer,
-				      struct rpcrdma_xprt, rx_buf);
-	} else
-		r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
-	rep = req->rl_reply;
+	rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
+	req = rb->rg_owner;
+	r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
 
-	dprintk("RPC:       %s: called on 0x%p%s\n",
-		__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
+	dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-	/*
-	 * Finish the deregistration.  The process is considered
-	 * complete when the rr_func vector becomes NULL - this
-	 * was put in place during rpcrdma_reply_handler() - the wait
-	 * call below will not block if the dereg is "done". If
-	 * interrupted, our framework will clean up.
-	 */
 	for (i = 0; req->rl_nchunks;) {
 		--req->rl_nchunks;
 		i += rpcrdma_deregister_external(
 			&req->rl_segments[i], r_xprt);
 	}
 
-	if (req->rl_iov.length == 0) {	/* see allocate above */
-		struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
-		oreq->rl_reply = req->rl_reply;
-		(void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
-						   req->rl_handle,
-						   &req->rl_iov);
-		kfree(req);
-		req = oreq;
-	}
-
-	/* Put back request+reply buffers */
 	rpcrdma_buffer_put(req);
 }