Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 525dfa2c authored by David S. Miller
Browse files

Merge branch 'mlx5-odp'



Saeed Mahameed says:

====================
Mellanox mlx5 core and ODP updates 2017-01-01

The following eleven patches mainly come from Artemy Kovalyov,
who expanded mlx5 on-demand-paging (ODP) support. In addition,
there are three cleanup patches which don't change any functionality,
but are needed to align the codebase prior to accepting other patches.

A memory region (MR) in IB can be huge, and the ODP (on-demand paging)
technique allows the use of unpinned memory, which can be consumed and
released on demand. This allows applications to avoid pinning down
the underlying physical pages of the address space, and saves them from
the need to track the validity of the mappings.

Rather, the HCA requests the latest translations from the OS when pages
are not present, and the OS invalidates translations which are no longer
valid due to either non-present pages or mapping changes.

In the existing ODP implementation, applications still need to register
memory buffers for communication, though registered memory regions
need not have valid mappings at registration time.

This patch set performs the following steps to expand
current ODP implementation:

1. It refactors UMR to support large regions, by introducing generic
   function to perform HCA translation table modifications. This
   function supports both atomic and process contexts and is not limited
   by number of modified entries.

   This function makes it possible to enable reallocated memory regions of
   arbitrary size, so MR cache buckets are added to support MRs of up to 16GB.

2. It changes the page fault event format and refactors the page fault
   logic, together with the addition of atomic support.

3. It prepares mlx5 core code to support implicit registration with
   simplified and relaxed semantics.

   Implicit ODP semantics allow applications to provide a special memory
   key that represents their complete address space. Thus all IO accesses
   referencing this key (with proper access rights associated with the key)
   would not need to register any virtual address range.

Thanks,
        Artemy, Ilya and Leon

v1->v2:
  - Don't use 'inline' in .c files
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 85eb018f aa8e08d2
Loading
Loading
Loading
Loading
+27 −23
Original line number Diff line number Diff line
@@ -672,17 +672,6 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
			1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
	}

	if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
			uhw->outlen)) {
		resp.mlx5_ib_support_multi_pkt_send_wqes =
			MLX5_CAP_ETH(mdev, multi_pkt_send_wqe);
		resp.response_length +=
			sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
	}

	if (field_avail(typeof(resp), reserved, uhw->outlen))
		resp.response_length += sizeof(resp.reserved);

	if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) {
		resp.cqe_comp_caps.max_num =
			MLX5_CAP_GEN(dev->mdev, cqe_compression) ?
@@ -706,6 +695,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
		resp.response_length += sizeof(resp.packet_pacing_caps);
	}

	if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
			uhw->outlen)) {
		resp.mlx5_ib_support_multi_pkt_send_wqes =
			MLX5_CAP_ETH(mdev, multi_pkt_send_wqe);
		resp.response_length +=
			sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
	}

	if (field_avail(typeof(resp), reserved, uhw->outlen))
		resp.response_length += sizeof(resp.reserved);

	if (uhw->outlen) {
		err = ib_copy_to_udata(uhw, &resp, resp.response_length);

@@ -1112,11 +1112,18 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
	context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
#endif

	context->upd_xlt_page = __get_free_page(GFP_KERNEL);
	if (!context->upd_xlt_page) {
		err = -ENOMEM;
		goto out_uars;
	}
	mutex_init(&context->upd_xlt_page_mutex);

	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
		err = mlx5_core_alloc_transport_domain(dev->mdev,
						       &context->tdn);
		if (err)
			goto out_uars;
			goto out_page;
	}

	INIT_LIST_HEAD(&context->vma_private_list);
@@ -1168,6 +1175,9 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);

out_page:
	free_page(context->upd_xlt_page);

out_uars:
	for (i--; i >= 0; i--)
		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
@@ -1195,6 +1205,8 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);

	free_page(context->upd_xlt_page);

	for (i = 0; i < uuari->num_uars; i++) {
		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
@@ -3307,6 +3319,9 @@ static struct mlx5_interface mlx5_ib_interface = {
	.add            = mlx5_ib_add,
	.remove         = mlx5_ib_remove,
	.event          = mlx5_ib_event,
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	.pfault		= mlx5_ib_pfault,
#endif
	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
};

@@ -3317,25 +3332,14 @@ static int __init mlx5_ib_init(void)
	if (deprecated_prof_sel != 2)
		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");

	err = mlx5_ib_odp_init();
	if (err)
		return err;

	err = mlx5_register_interface(&mlx5_ib_interface);
	if (err)
		goto clean_odp;

	return err;

clean_odp:
	mlx5_ib_odp_cleanup();
	return err;
}

static void __exit mlx5_ib_cleanup(void)
{
	mlx5_unregister_interface(&mlx5_ib_interface);
	mlx5_ib_odp_cleanup();
}

module_init(mlx5_ib_init);
+25 −7
Original line number Diff line number Diff line
@@ -159,7 +159,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
	unsigned long umem_page_shift = ilog2(umem->page_size);
	int shift = page_shift - umem_page_shift;
	int mask = (1 << shift) - 1;
	int i, k;
	int i, k, idx;
	u64 cur = 0;
	u64 base;
	int len;
@@ -185,18 +185,36 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
		len = sg_dma_len(sg) >> umem_page_shift;
		base = sg_dma_address(sg);
		for (k = 0; k < len; k++) {

		/* Skip elements below offset */
		if (i + len < offset << shift) {
			i += len;
			continue;
		}

		/* Skip pages below offset */
		if (i < offset << shift) {
			k = (offset << shift) - i;
			i = offset << shift;
		} else {
			k = 0;
		}

		for (; k < len; k++) {
			if (!(i & mask)) {
				cur = base + (k << umem_page_shift);
				cur |= access_flags;
				idx = (i >> shift) - offset;

				pas[i >> shift] = cpu_to_be64(cur);
				pas[idx] = cpu_to_be64(cur);
				mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
					    i >> shift, be64_to_cpu(pas[i >> shift]));
			}  else
				mlx5_ib_dbg(dev, "=====> 0x%llx\n",
					    base + (k << umem_page_shift));
					    i >> shift, be64_to_cpu(pas[idx]));
			}
			i++;

			/* Stop after num_pages reached */
			if (i >> shift >= offset + num_pages)
				return;
		}
	}
}
+28 −61
Original line number Diff line number Diff line
@@ -125,6 +125,10 @@ struct mlx5_ib_ucontext {
	/* Transport Domain number */
	u32			tdn;
	struct list_head	vma_private_list;

	unsigned long		upd_xlt_page;
	/* protect ODP/KSM */
	struct mutex		upd_xlt_page_mutex;
};

static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -174,13 +178,12 @@ struct mlx5_ib_flow_db {
 * enum ib_send_flags and enum ib_qp_type for low-level driver
 */

#define MLX5_IB_SEND_UMR_UNREG	IB_SEND_RESERVED_START
#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1)
#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2)

#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION	(IB_SEND_RESERVED_START << 3)
#define MLX5_IB_SEND_UMR_UPDATE_PD		(IB_SEND_RESERVED_START << 4)
#define MLX5_IB_SEND_UMR_UPDATE_ACCESS		IB_SEND_RESERVED_END
#define MLX5_IB_SEND_UMR_ENABLE_MR	       (IB_SEND_RESERVED_START << 0)
#define MLX5_IB_SEND_UMR_DISABLE_MR	       (IB_SEND_RESERVED_START << 1)
#define MLX5_IB_SEND_UMR_FAIL_IF_FREE	       (IB_SEND_RESERVED_START << 2)
#define MLX5_IB_SEND_UMR_UPDATE_XLT	       (IB_SEND_RESERVED_START << 3)
#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION    (IB_SEND_RESERVED_START << 4)
#define MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS       IB_SEND_RESERVED_END

#define MLX5_IB_QPT_REG_UMR	IB_QPT_RESERVED1
/*
@@ -190,6 +193,16 @@ struct mlx5_ib_flow_db {
#define MLX5_IB_QPT_HW_GSI	IB_QPT_RESERVED2
#define MLX5_IB_WR_UMR		IB_WR_RESERVED1

#define MLX5_IB_UMR_OCTOWORD	       16
#define MLX5_IB_UMR_XLT_ALIGNMENT      64

#define MLX5_IB_UPD_XLT_ZAP	      BIT(0)
#define MLX5_IB_UPD_XLT_ENABLE	      BIT(1)
#define MLX5_IB_UPD_XLT_ATOMIC	      BIT(2)
#define MLX5_IB_UPD_XLT_ADDR	      BIT(3)
#define MLX5_IB_UPD_XLT_PD	      BIT(4)
#define MLX5_IB_UPD_XLT_ACCESS	      BIT(5)

/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
 *
 * These flags are intended for internal use by the mlx5_ib driver, and they
@@ -264,29 +277,6 @@ struct mlx5_ib_rwq_ind_table {
	u32			rqtn;
};

/*
 * Connect-IB can trigger up to four concurrent pagefaults
 * per-QP.
 */
enum mlx5_ib_pagefault_context {
	MLX5_IB_PAGEFAULT_RESPONDER_READ,
	MLX5_IB_PAGEFAULT_REQUESTOR_READ,
	MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
	MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
	MLX5_IB_PAGEFAULT_CONTEXTS
};

static inline enum mlx5_ib_pagefault_context
	mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
{
	return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
}

struct mlx5_ib_pfault {
	struct work_struct	work;
	struct mlx5_pagefault	mpfault;
};

struct mlx5_ib_ubuffer {
	struct ib_umem	       *umem;
	int			buf_size;
@@ -372,20 +362,6 @@ struct mlx5_ib_qp {
	/* Store signature errors */
	bool			signature_en;

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	/*
	 * A flag that is true for QP's that are in a state that doesn't
	 * allow page faults, and shouldn't schedule any more faults.
	 */
	int                     disable_page_faults;
	/*
	 * The disable_page_faults_lock protects a QP's disable_page_faults
	 * field, allowing for a thread to atomically check whether the QP
	 * allows page faults, and if so schedule a page fault.
	 */
	spinlock_t              disable_page_faults_lock;
	struct mlx5_ib_pfault	pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
#endif
	struct list_head	qps_list;
	struct list_head	cq_recv_list;
	struct list_head	cq_send_list;
@@ -414,13 +390,11 @@ enum mlx5_ib_qp_flags {

struct mlx5_umr_wr {
	struct ib_send_wr		wr;
	union {
	u64				virt_addr;
	u64				offset;
	} target;
	struct ib_pd		       *pd;
	unsigned int			page_shift;
	unsigned int			npages;
	unsigned int			xlt_size;
	u64				length;
	int				access_flags;
	u32				mkey;
@@ -634,6 +608,7 @@ struct mlx5_ib_dev {
	int				fill_delay;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	struct ib_odp_caps	odp_caps;
	u64			odp_max_size;
	/*
	 * Sleepable RCU that prevents destruction of MRs while they are still
	 * being used by a page fault handler.
@@ -787,8 +762,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
			       struct ib_udata *udata);
int mlx5_ib_dealloc_mw(struct ib_mw *mw);
int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
		       int npages, int zap);
int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
		       int page_shift, int flags);
int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
			  u64 length, u64 virt_addr, int access_flags,
			  struct ib_pd *pd, struct ib_udata *udata);
@@ -857,18 +832,13 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
extern struct workqueue_struct *mlx5_ib_page_fault_wq;

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
			       struct mlx5_ib_pfault *pfault);
void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
		    struct mlx5_pagefault *pfault);
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
			      unsigned long end);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
@@ -877,13 +847,10 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
	return;
}

static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)		{}
static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)	{}
static inline int mlx5_ib_odp_init(void) { return 0; }
static inline void mlx5_ib_odp_cleanup(void)				{}
static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)  {}

#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */

+200 −318

File changed.

Preview size limit exceeded, changes collapsed.

+187 −237

File changed.

Preview size limit exceeded, changes collapsed.

Loading