Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 08ff3235 authored by Or Gerlitz's avatar Or Gerlitz Committed by Roland Dreier
Browse files

mlx4: 64-byte CQE/EQE support



ConnectX-3 devices can use either 64- or 32-byte completion queue
entries (CQEs) and event queue entries (EQEs).  Using 64-byte
EQEs/CQEs performs better because each entry is aligned to a complete
cacheline.  This patch queries the HCA's capabilities, and if it
supports 64-byte CQEs and EQES the driver will configure the HW to
work in 64-byte mode.

The 32-byte vs 64-byte mode is global per HCA and not per CQ or EQ.

Since this mode is global, userspace (libmlx4) must be updated to work
with the configured CQE size, and guests using SR-IOV virtual
functions need to know both EQE and CQE size.

In case one of the 64-byte CQE/EQE capabilities is activated, the
patch makes sure that older guest drivers that use the QUERY_DEV_FUNC
command (e.g as done in mlx4_core of Linux 3.3..3.6) will notice that
they need an update to be able to work with the PPF. This is done by
changing the returned pf_context_behaviour not to be zero any more. In
case none of these capabilities is activated that value remains zero
and older guest drivers can run OK.

The SRIOV related flow is as follows

1. the PPF does the detection of the new capabilities using
   QUERY_DEV_CAP command.

2. the PPF activates the new capabilities using INIT_HCA.

3. the VF detects if the PPF activated the capabilities using
   QUERY_HCA, and if this is the case activates them for itself too.

Note that the VF detects that it must be aware to the new PF behaviour
using QUERY_FUNC_CAP.  Steps 1 and 2 apply also for native mode.

User space notification is done through a new field introduced in
struct mlx4_ib_ucontext which holds device capabilities for which user
space must take action. This changes the binary interface so the ABI
towards libmlx4 exposed through uverbs is bumped from 3 to 4 but only
when **needed** i.e. only when the driver does use 64-byte CQEs or
future device capabilities which must be in sync by user space. This
practice allows to work with unmodified libmlx4 on older devices (e.g
A0, B0) which don't support 64-byte CQEs.

In order to keep existing systems functional when they update to a
newer kernel that contains these changes in VF and userspace ABI, a
module parameter enable_64b_cqe_eqe must be set to enable 64-byte
mode; the default is currently false.

Signed-off-by: default avatarEli Cohen <eli@mellanox.com>
Signed-off-by: default avatarOr Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: default avatarRoland Dreier <roland@purestorage.com>
parent f4a75d2e
Loading
Loading
Loading
Loading
+26 −8
Original line number Diff line number Diff line
@@ -66,7 +66,7 @@ static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)

static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
{
	return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe));
	return mlx4_buf_offset(&buf->buf, n * buf->entry_size);
}

static void *get_cqe(struct mlx4_ib_cq *cq, int n)
@@ -77,8 +77,9 @@ static void *get_cqe(struct mlx4_ib_cq *cq, int n)
static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
{
	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
	struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe);

	return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
	return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
		!!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
}

@@ -99,12 +100,13 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *
{
	int err;

	err = mlx4_buf_alloc(dev->dev, nent * sizeof(struct mlx4_cqe),
	err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size,
			     PAGE_SIZE * 2, &buf->buf);

	if (err)
		goto out;

	buf->entry_size = dev->dev->caps.cqe_size;
	err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift,
				    &buf->mtt);
	if (err)
@@ -120,8 +122,7 @@ err_mtt:
	mlx4_mtt_cleanup(dev->dev, &buf->mtt);

err_buf:
	mlx4_buf_free(dev->dev, nent * sizeof(struct mlx4_cqe),
			      &buf->buf);
	mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf);

out:
	return err;
@@ -129,7 +130,7 @@ out:

static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe)
{
	mlx4_buf_free(dev->dev, (cqe + 1) * sizeof(struct mlx4_cqe), &buf->buf);
	mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
}

static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
@@ -137,8 +138,9 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont
			       u64 buf_addr, int cqe)
{
	int err;
	int cqe_size = dev->dev->caps.cqe_size;

	*umem = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe),
	*umem = ib_umem_get(context, buf_addr, cqe * cqe_size,
			    IB_ACCESS_LOCAL_WRITE, 1);
	if (IS_ERR(*umem))
		return PTR_ERR(*umem);
@@ -331,16 +333,23 @@ static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
{
	struct mlx4_cqe *cqe, *new_cqe;
	int i;
	int cqe_size = cq->buf.entry_size;
	int cqe_inc = cqe_size == 64 ? 1 : 0;

	i = cq->mcq.cons_index;
	cqe = get_cqe(cq, i & cq->ibcq.cqe);
	cqe += cqe_inc;

	while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
		new_cqe = get_cqe_from_buf(&cq->resize_buf->buf,
					   (i + 1) & cq->resize_buf->cqe);
		memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe));
		memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size);
		new_cqe += cqe_inc;

		new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
			(((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
		cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
		cqe += cqe_inc;
	}
	++cq->mcq.cons_index;
}
@@ -438,6 +447,7 @@ err_buf:

out:
	mutex_unlock(&cq->resize_mutex);

	return err;
}

@@ -586,6 +596,9 @@ repoll:
	if (!cqe)
		return -EAGAIN;

	if (cq->buf.entry_size == 64)
		cqe++;

	++cq->mcq.cons_index;

	/*
@@ -807,6 +820,7 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
	int nfreed = 0;
	struct mlx4_cqe *cqe, *dest;
	u8 owner_bit;
	int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0;

	/*
	 * First we need to find the current producer index, so we
@@ -825,12 +839,16 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
	 */
	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
		cqe += cqe_inc;

		if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
			if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
				mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
			++nfreed;
		} else if (nfreed) {
			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
			dest += cqe_inc;

			owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
			memcpy(dest, cqe, sizeof *cqe);
			dest->owner_sr_opcode = owner_bit |
+22 −5
Original line number Diff line number Diff line
@@ -563,15 +563,24 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
{
	struct mlx4_ib_dev *dev = to_mdev(ibdev);
	struct mlx4_ib_ucontext *context;
	struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;
	struct mlx4_ib_alloc_ucontext_resp resp;
	int err;

	if (!dev->ib_active)
		return ERR_PTR(-EAGAIN);

	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
		resp_v3.qp_tab_size      = dev->dev->caps.num_qps;
		resp_v3.bf_reg_size      = dev->dev->caps.bf_reg_size;
		resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
	} else {
		resp.dev_caps	      = dev->dev->caps.userspace_caps;
		resp.qp_tab_size      = dev->dev->caps.num_qps;
		resp.bf_reg_size      = dev->dev->caps.bf_reg_size;
		resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
		resp.cqe_size	      = dev->dev->caps.cqe_size;
	}

	context = kmalloc(sizeof *context, GFP_KERNEL);
	if (!context)
@@ -586,7 +595,11 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
	INIT_LIST_HEAD(&context->db_page_list);
	mutex_init(&context->db_page_mutex);

	err = ib_copy_to_udata(udata, &resp, sizeof resp);
	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
		err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
	else
		err = ib_copy_to_udata(udata, &resp, sizeof(resp));

	if (err) {
		mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
		kfree(context);
@@ -1342,7 +1355,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
	ibdev->ib_dev.num_comp_vectors	= dev->caps.num_comp_vectors;
	ibdev->ib_dev.dma_device	= &dev->pdev->dev;

	if (dev->caps.userspace_caps)
		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
	else
		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;

	ibdev->ib_dev.uverbs_cmd_mask	=
		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+1 −0
Original line number Diff line number Diff line
@@ -90,6 +90,7 @@ struct mlx4_ib_xrcd {
struct mlx4_ib_cq_buf {
	struct mlx4_buf		buf;
	struct mlx4_mtt		mtt;
	int			entry_size;
};

struct mlx4_ib_cq_resize {
+11 −1
Original line number Diff line number Diff line
@@ -40,7 +40,9 @@
 * Increment this value if any changes that break userspace ABI
 * compatibility are made.
 */
#define MLX4_IB_UVERBS_ABI_VERSION	3

#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION	3
#define MLX4_IB_UVERBS_ABI_VERSION		4

/*
 * Make sure that all structs defined in this file remain laid out so
@@ -50,10 +52,18 @@
 * instead.
 */

struct mlx4_ib_alloc_ucontext_resp_v3 {
	__u32	qp_tab_size;
	__u16	bf_reg_size;
	__u16	bf_regs_per_page;
};

struct mlx4_ib_alloc_ucontext_resp {
	__u32	dev_caps;
	__u32	qp_tab_size;
	__u16	bf_reg_size;
	__u16	bf_regs_per_page;
	__u32	cqe_size;
};

struct mlx4_ib_alloc_pd_resp {
+1 −1
Original line number Diff line number Diff line
@@ -1755,7 +1755,7 @@ int mlx4_multi_func_init(struct mlx4_dev *dev)
			spin_lock_init(&s_state->lock);
		}

		memset(&priv->mfunc.master.cmd_eqe, 0, sizeof(struct mlx4_eqe));
		memset(&priv->mfunc.master.cmd_eqe, 0, dev->caps.eqe_size);
		priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD;
		INIT_WORK(&priv->mfunc.master.comm_work,
			  mlx4_master_comm_channel);
Loading