Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 95d04f07 authored by Roland Dreier's avatar Roland Dreier
Browse files

IB/mlx4: Add support for memory management extensions and local DMA L_Key



Add support for the following operations to mlx4 when device firmware
supports them:

 - Send with invalidate and local invalidate send queue work requests;
 - Allocate/free fast register MRs;
 - Allocate/free fast register MR page lists;
 - Fast register MR send queue work requests;
 - Local DMA L_Key.

Signed-off-by: default avatarRoland Dreier <rolandd@cisco.com>
parent e4044cfc
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -637,6 +637,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
		case MLX4_OPCODE_SEND_IMM:
			wc->wc_flags |= IB_WC_WITH_IMM;
		case MLX4_OPCODE_SEND:
		case MLX4_OPCODE_SEND_INVAL:
			wc->opcode    = IB_WC_SEND;
			break;
		case MLX4_OPCODE_RDMA_READ:
@@ -657,6 +658,12 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
		case MLX4_OPCODE_LSO:
			wc->opcode    = IB_WC_LSO;
			break;
		case MLX4_OPCODE_FMR:
			wc->opcode    = IB_WC_FAST_REG_MR;
			break;
		case MLX4_OPCODE_LOCAL_INVAL:
			wc->opcode    = IB_WC_LOCAL_INV;
			break;
		}
	} else {
		wc->byte_len = be32_to_cpu(cqe->byte_cnt);
@@ -667,6 +674,11 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
			wc->wc_flags	= IB_WC_WITH_IMM;
			wc->ex.imm_data = cqe->immed_rss_invalid;
			break;
		case MLX4_RECV_OPCODE_SEND_INVAL:
			wc->opcode	= IB_WC_RECV;
			wc->wc_flags	= IB_WC_WITH_INVALIDATE;
			wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid);
			break;
		case MLX4_RECV_OPCODE_SEND:
			wc->opcode   = IB_WC_RECV;
			wc->wc_flags = 0;
+11 −0
Original line number Diff line number Diff line
@@ -104,6 +104,12 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
	if (dev->dev->caps.max_gso_sz)
		props->device_cap_flags |= IB_DEVICE_UD_TSO;
	if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY)
		props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
	if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) &&
	    (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) &&
	    (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR))
		props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;

	props->vendor_id	   = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
		0xffffff;
@@ -127,6 +133,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
	props->max_srq		   = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs;
	props->max_srq_wr	   = dev->dev->caps.max_srq_wqes - 1;
	props->max_srq_sge	   = dev->dev->caps.max_srq_sge;
	props->max_fast_reg_page_list_len = PAGE_SIZE / sizeof (u64);
	props->local_ca_ack_delay  = dev->dev->caps.local_ca_ack_delay;
	props->atomic_cap	   = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
		IB_ATOMIC_HCA : IB_ATOMIC_NONE;
@@ -565,6 +572,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
	strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
	ibdev->ib_dev.owner		= THIS_MODULE;
	ibdev->ib_dev.node_type		= RDMA_NODE_IB_CA;
	ibdev->ib_dev.local_dma_lkey	= dev->caps.reserved_lkey;
	ibdev->ib_dev.phys_port_cnt	= dev->caps.num_ports;
	ibdev->ib_dev.num_comp_vectors	= 1;
	ibdev->ib_dev.dma_device	= &dev->pdev->dev;
@@ -627,6 +635,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
	ibdev->ib_dev.get_dma_mr	= mlx4_ib_get_dma_mr;
	ibdev->ib_dev.reg_user_mr	= mlx4_ib_reg_user_mr;
	ibdev->ib_dev.dereg_mr		= mlx4_ib_dereg_mr;
	ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
	ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
	ibdev->ib_dev.free_fast_reg_page_list  = mlx4_ib_free_fast_reg_page_list;
	ibdev->ib_dev.attach_mcast	= mlx4_ib_mcg_attach;
	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
+15 −0
Original line number Diff line number Diff line
@@ -83,6 +83,11 @@ struct mlx4_ib_mr {
	struct ib_umem	       *umem;
};

struct mlx4_ib_fast_reg_page_list {
	struct ib_fast_reg_page_list	ibfrpl;
	dma_addr_t			map;
};

struct mlx4_ib_fmr {
	struct ib_fmr           ibfmr;
	struct mlx4_fmr         mfmr;
@@ -199,6 +204,11 @@ static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
	return container_of(ibmr, struct mlx4_ib_mr, ibmr);
}

static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl)
{
	return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl);
}

static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
{
	return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr);
@@ -239,6 +249,11 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata);
int mlx4_ib_dereg_mr(struct ib_mr *mr);
struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
					int max_page_list_len);
struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
							       int page_list_len);
void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);

int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+70 −0
Original line number Diff line number Diff line
@@ -183,6 +183,76 @@ int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
	return 0;
}

struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
					int max_page_list_len)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int err;

	mr = kmalloc(sizeof *mr, GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
			    max_page_list_len, 0, &mr->mmr);
	if (err)
		goto err_free;

	err = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (err)
		goto err_mr;

	return &mr->ibmr;

err_mr:
	mlx4_mr_free(dev->dev, &mr->mmr);

err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
							       int page_list_len)
{
	struct mlx4_ib_dev *dev = to_mdev(ibdev);
	struct mlx4_ib_fast_reg_page_list *mfrpl;
	int size = page_list_len * sizeof (u64);

	if (size > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
	if (!mfrpl)
		return ERR_PTR(-ENOMEM);

	mfrpl->ibfrpl.page_list = dma_alloc_coherent(&dev->dev->pdev->dev,
						     size, &mfrpl->map,
						     GFP_KERNEL);
	if (!mfrpl->ibfrpl.page_list)
		goto err_free;

	WARN_ON(mfrpl->map & 0x3f);

	return &mfrpl->ibfrpl;

err_free:
	kfree(mfrpl);
	return ERR_PTR(-ENOMEM);
}

void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
{
	struct mlx4_ib_dev *dev = to_mdev(page_list->device);
	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
	int size = page_list->max_page_list_len * sizeof (u64);

	dma_free_coherent(&dev->dev->pdev->dev, size, page_list->page_list,
			  mfrpl->map);
	kfree(mfrpl);
}

struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
				 struct ib_fmr_attr *fmr_attr)
{
+67 −5
Original line number Diff line number Diff line
@@ -78,6 +78,9 @@ static const __be32 mlx4_ib_opcode[] = {
	[IB_WR_RDMA_READ]		= __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ),
	[IB_WR_ATOMIC_CMP_AND_SWP]	= __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
	[IB_WR_ATOMIC_FETCH_AND_ADD]	= __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
	[IB_WR_SEND_WITH_INV]		= __constant_cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
	[IB_WR_LOCAL_INV]		= __constant_cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
	[IB_WR_FAST_REG_MR]		= __constant_cpu_to_be32(MLX4_OPCODE_FMR),
};

static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
@@ -976,6 +979,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
	context->pd	    = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
	context->params1    = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);

	/* Set "fast registration enabled" for all kernel QPs */
	if (!qp->ibqp.uobject)
		context->params1 |= cpu_to_be32(1 << 11);

	if (attr_mask & IB_QP_RNR_RETRY) {
		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
		optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
@@ -1322,6 +1329,38 @@ static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq
	return cur + nreq >= wq->max_post;
}

static __be32 convert_access(int acc)
{
	return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC)       : 0) |
	       (acc & IB_ACCESS_REMOTE_WRITE  ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) |
	       (acc & IB_ACCESS_REMOTE_READ   ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ)  : 0) |
	       (acc & IB_ACCESS_LOCAL_WRITE   ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE)  : 0) |
		cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
}

static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
{
	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);

	fseg->flags		= convert_access(wr->wr.fast_reg.access_flags);
	fseg->mem_key		= cpu_to_be32(wr->wr.fast_reg.rkey);
	fseg->buf_list		= cpu_to_be64(mfrpl->map);
	fseg->start_addr	= cpu_to_be64(wr->wr.fast_reg.iova_start);
	fseg->reg_len		= cpu_to_be64(wr->wr.fast_reg.length);
	fseg->offset		= 0; /* XXX -- is this just for ZBVA? */
	fseg->page_size		= cpu_to_be32(wr->wr.fast_reg.page_shift);
	fseg->reserved[0]	= 0;
	fseg->reserved[1]	= 0;
}

static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
{
	iseg->flags	= 0;
	iseg->mem_key	= cpu_to_be32(rkey);
	iseg->guest_id	= 0;
	iseg->pa	= 0;
}

static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
					  u64 remote_addr, u32 rkey)
{
@@ -1423,6 +1462,21 @@ static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
	return 0;
}

static __be32 send_ieth(struct ib_send_wr *wr)
{
	switch (wr->opcode) {
	case IB_WR_SEND_WITH_IMM:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		return wr->ex.imm_data;

	case IB_WR_SEND_WITH_INV:
		return cpu_to_be32(wr->ex.invalidate_rkey);

	default:
		return 0;
	}
}

int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
		      struct ib_send_wr **bad_wr)
{
@@ -1469,11 +1523,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
				     MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
			qp->sq_signal_bits;

		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->ex.imm_data;
		else
			ctrl->imm = 0;
		ctrl->imm = send_ieth(wr);

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;
@@ -1505,6 +1555,18 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
				break;

			case IB_WR_LOCAL_INV:
				set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
				wqe  += sizeof (struct mlx4_wqe_local_inval_seg);
				size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
				break;

			case IB_WR_FAST_REG_MR:
				set_fmr_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_fmr_seg);
				size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
Loading