Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 966a9671 authored by Ying Huang, committed by Ingo Molnar
Browse files

smp: Avoid using two cache lines for struct call_single_data



struct call_single_data is used in IPIs to transfer information between
CPUs.  Its size is bigger than sizeof(unsigned long) and less than
cache line size.  Currently it is not allocated with any explicit alignment
requirements.  This makes it possible for allocated call_single_data to
cross two cache lines, which results in double the number of the cache lines
that need to be transferred among CPUs.

This can be fixed by requiring call_single_data to be aligned with the
size of call_single_data. Currently the size of call_single_data is a
power of 2.  If we add new fields to call_single_data, we may need to
add padding to make sure the size of the new definition is a power of 2
as well.

Fortunately, this is enforced by GCC, which will report bad sizes.

To set alignment requirements of call_single_data to the size of
call_single_data, a struct definition and a typedef is used.

To test the effect of the patch, I used the vm-scalability multiple
thread swap test case (swap-w-seq-mt).  The test will create multiple
threads and each thread will eat memory until all RAM and part of swap
is used, so that huge number of IPIs are triggered when unmapping
memory.  In the test, the throughput of memory writing improves ~5%
compared with misaligned call_single_data, because of faster IPIs.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Huang, Ying <ying.huang@intel.com>
[ Add call_single_data_t and align with size of call_single_data. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/87bmnqd6lz.fsf@yhuang-mobile.sh.intel.com


Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent f52be570
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST

static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd);
static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);

void tick_broadcast(const struct cpumask *mask)
{
	atomic_t *count;
	struct call_single_data *csd;
	call_single_data_t *csd;
	int cpu;

	for_each_cpu(cpu, mask) {
@@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)

static int __init tick_broadcast_init(void)
{
	struct call_single_data *csd;
	call_single_data_t *csd;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+1 −1
Original line number Diff line number Diff line
@@ -60,7 +60,7 @@ static void trigger_softirq(void *data)
static int raise_blk_irq(int cpu, struct request *rq)
{
	if (cpu_online(cpu)) {
		struct call_single_data *data = &rq->csd;
		call_single_data_t *data = &rq->csd;

		data->func = trigger_softirq;
		data->info = rq;
+1 −1
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@
struct nullb_cmd {
	struct list_head list;
	struct llist_node ll_list;
	struct call_single_data csd;
	call_single_data_t csd;
	struct request *rq;
	struct bio *bio;
	unsigned int tag;
+5 −5
Original line number Diff line number Diff line
@@ -119,13 +119,13 @@ struct cpuidle_coupled {

#define CPUIDLE_COUPLED_NOT_IDLE	(-1)

static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb);

/*
 * The cpuidle_coupled_poke_pending mask is used to avoid calling
 * __smp_call_function_single with the per cpu call_single_data struct already
 * __smp_call_function_single with the per cpu call_single_data_t struct already
 * in use.  This prevents a deadlock where two cpus are waiting for each others
 * call_single_data struct to be available
 * call_single_data_t struct to be available
 */
static cpumask_t cpuidle_coupled_poke_pending;

@@ -339,7 +339,7 @@ static void cpuidle_coupled_handle_poke(void *info)
 */
static void cpuidle_coupled_poke(int cpu)
{
	struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
	call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);

	if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending))
		smp_call_function_single_async(cpu, csd);
@@ -651,7 +651,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev)
{
	int cpu;
	struct cpuidle_device *other_dev;
	struct call_single_data *csd;
	call_single_data_t *csd;
	struct cpuidle_coupled *coupled;

	if (cpumask_empty(&dev->coupled_cpus))
+1 −1
Original line number Diff line number Diff line
@@ -2468,7 +2468,7 @@ static void liquidio_napi_drv_callback(void *arg)
	if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) {
		napi_schedule_irqoff(&droq->napi);
	} else {
		struct call_single_data *csd = &droq->csd;
		call_single_data_t *csd = &droq->csd;

		csd->func = napi_schedule_wrapper;
		csd->info = &droq->napi;
Loading