
Commit a054374f authored by Mike Marciniszyn, committed by Greg Kroah-Hartman

staging/rdma/hfi1: convert buffers allocated atomic to per cpu

Profiling has shown that the atomic is a performance issue
for the pio hot path.

If multiple CPUs allocate buffers from the same send context (sc),
the cacheline containing the atomic bounces from one CPU's
innermost (L0) cache to another's.

Convert the atomic to a percpu variable.

Reviewed-by: Jubin John <jubin.john@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent a5a9e8cc
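
For context, this is the standard kernel per-cpu counter pattern: the
hot path touches only the local CPU's slot, and slow-path readers sum
the slots of every possible CPU. The sketch below is an editorial
illustration, not part of the patch (the demo_* names are invented),
but it uses the same APIs the diff does: alloc_percpu(),
this_cpu_inc()/this_cpu_dec(), per_cpu_ptr(), for_each_possible_cpu()
and free_percpu().

	#include <linux/errno.h>
	#include <linux/percpu.h>
	#include <linux/preempt.h>
	#include <linux/types.h>

	struct demo_ctx {
		u32 __percpu *count;	/* one slot per CPU, no shared cacheline */
	};

	static int demo_init(struct demo_ctx *ctx)
	{
		ctx->count = alloc_percpu(u32);
		return ctx->count ? 0 : -ENOMEM;
	}

	/* hot path: bump only this CPU's slot */
	static void demo_get(struct demo_ctx *ctx)
	{
		preempt_disable();	/* pin to this CPU until demo_put() */
		this_cpu_inc(*ctx->count);
	}

	static void demo_put(struct demo_ctx *ctx)
	{
		this_cpu_dec(*ctx->count);	/* same CPU as the matching inc */
		preempt_enable();
	}

	/* slow path: only the sum across all CPUs is meaningful */
	static u32 demo_read(struct demo_ctx *ctx)
	{
		u32 sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += *per_cpu_ptr(ctx->count, cpu);
		return sum;
	}

	static void demo_free(struct demo_ctx *ctx)
	{
		free_percpu(ctx->count);
	}

In the patch, the preempt_disable()/preempt_enable() pair spans from
sc_buffer_alloc() to pio_copy()/seg_pio_copy_end(), so the task cannot
migrate and each this_cpu_dec() hits the same slot as its
this_cpu_inc(). Readers such as sc_restart() simply poll the summed
count until it drains to zero.
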
drivers/staging/rdma/hfi1/pio.c  +35 −5
@@ -660,6 +660,24 @@ void set_pio_integrity(struct send_context *sc)
	write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
}

+static u32 get_buffers_allocated(struct send_context *sc)
+{
+	int cpu;
+	u32 ret = 0;
+
+	for_each_possible_cpu(cpu)
+		ret += *per_cpu_ptr(sc->buffers_allocated, cpu);
+	return ret;
+}
+
+static void reset_buffers_allocated(struct send_context *sc)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		(*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0;
+}
+
/*
 * Allocate a NUMA relative send context structure of the given type along
 * with a HW context.
@@ -668,7 +686,7 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
			      uint hdrqentsize, int numa)
{
	struct send_context_info *sci;
-	struct send_context *sc;
+	struct send_context *sc = NULL;
	dma_addr_t pa;
	unsigned long flags;
	u64 reg;
@@ -686,10 +704,20 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
	if (!sc)
		return NULL;

+	sc->buffers_allocated = alloc_percpu(u32);
+	if (!sc->buffers_allocated) {
+		kfree(sc);
+		dd_dev_err(dd,
+			   "Cannot allocate buffers_allocated per cpu counters\n"
+			  );
+		return NULL;
+	}
+
	spin_lock_irqsave(&dd->sc_lock, flags);
	ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
	if (ret) {
		spin_unlock_irqrestore(&dd->sc_lock, flags);
+		free_percpu(sc->buffers_allocated);
		kfree(sc);
		return NULL;
	}
@@ -705,7 +733,6 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
	spin_lock_init(&sc->credit_ctrl_lock);
	INIT_LIST_HEAD(&sc->piowait);
	INIT_WORK(&sc->halt_work, sc_halted);
-	atomic_set(&sc->buffers_allocated, 0);
	init_waitqueue_head(&sc->halt_wait);

	/* grouping is always single context for now */
@@ -866,6 +893,7 @@ void sc_free(struct send_context *sc)
	spin_unlock_irqrestore(&dd->sc_lock, flags);

	kfree(sc->sr);
+	free_percpu(sc->buffers_allocated);
	kfree(sc);
}

@@ -1029,7 +1057,7 @@ int sc_restart(struct send_context *sc)
		/* kernel context */
		loop = 0;
		while (1) {
-			count = atomic_read(&sc->buffers_allocated);
+			count = get_buffers_allocated(sc);
			if (count == 0)
				break;
			if (loop > 100) {
@@ -1197,7 +1225,8 @@ int sc_enable(struct send_context *sc)
	sc->sr_head = 0;
	sc->sr_tail = 0;
	sc->flags = 0;
-	atomic_set(&sc->buffers_allocated, 0);
+	/* the alloc lock insures no fast path allocation */
+	reset_buffers_allocated(sc);

	/*
	 * Clear all per-context errors.  Some of these will be set when
@@ -1373,7 +1402,8 @@ struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,

	/* there is enough room */

-	atomic_inc(&sc->buffers_allocated);
+	preempt_disable();
+	this_cpu_inc(*sc->buffers_allocated);

	/* read this once */
	head = sc->sr_head;
drivers/staging/rdma/hfi1/pio.h  +1 −1
@@ -130,7 +130,7 @@ struct send_context {
	spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
	u64 credit_ctrl;		/* cache for credit control */
	u32 credit_intr_count;		/* count of credit intr users */
-	atomic_t buffers_allocated;	/* count of buffers allocated */
+	u32 __percpu *buffers_allocated;/* count of buffers allocated */
	wait_queue_head_t halt_wait;    /* wait until kernel sees interrupt */
};

drivers/staging/rdma/hfi1/pio_copy.c  +4 −2
@@ -160,7 +160,8 @@ void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
	}

	/* finished with this buffer */
-	atomic_dec(&pbuf->sc->buffers_allocated);
+	this_cpu_dec(*pbuf->sc->buffers_allocated);
+	preempt_enable();
}

/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
@@ -854,5 +855,6 @@ void seg_pio_copy_end(struct pio_buf *pbuf)
	}

	/* finished with this buffer */
-	atomic_dec(&pbuf->sc->buffers_allocated);
+	this_cpu_dec(*pbuf->sc->buffers_allocated);
+	preempt_enable();
}