Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6dbde353 authored by Ingo Molnar
Browse files

percpu: add optimized generic percpu accessors



This patch is both an optimization and a cleanup. It adds the following
new generic percpu accessors:

  percpu_read()
  percpu_write()
  percpu_add()
  percpu_sub()
  percpu_and()
  percpu_or()
  percpu_xor()

and implements support for them on x86. (Other architectures will fall
back to a default implementation.)

The advantage is that, for example, to read a local percpu variable,
instead of this sequence:

 return __get_cpu_var(var);

 ffffffff8102ca2b:	48 8b 14 fd 80 09 74 	mov    -0x7e8bf680(,%rdi,8),%rdx
 ffffffff8102ca32:	81
 ffffffff8102ca33:	48 c7 c0 d8 59 00 00 	mov    $0x59d8,%rax
 ffffffff8102ca3a:	48 8b 04 10          	mov    (%rax,%rdx,1),%rax

We can get a single instruction by using the optimized variants:

 return percpu_read(var);

 ffffffff8102ca3f:	65 48 8b 05 91 8f fd 	mov    %gs:0x7efd8f91(%rip),%rax
 ffffffff8102ca46:	7e

I also cleaned up the x86-specific APIs and made the x86 code use
these new generic percpu primitives.

tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out
    * added percpu_and() for completeness's sake
    * made generic percpu ops atomic against preemption

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Tejun Heo <tj@kernel.org>
parent 004aa322
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -10,7 +10,7 @@ struct task_struct;
DECLARE_PER_CPU(struct task_struct *, current_task);
static __always_inline struct task_struct *get_current(void)
{
	return x86_read_percpu(current_task);
	return percpu_read(current_task);
}

#else /* X86_32 */
+2 −2
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs);

static inline struct pt_regs *get_irq_regs(void)
{
	return x86_read_percpu(irq_regs);
	return percpu_read(irq_regs);
}

static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
@@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
	struct pt_regs *old_regs;

	old_regs = get_irq_regs();
	x86_write_percpu(irq_regs, new_regs);
	percpu_write(irq_regs, new_regs);

	return old_regs;
}
+6 −6
Original line number Diff line number Diff line
@@ -4,8 +4,8 @@
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
	if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}

@@ -19,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev,
		/* stop flush ipis for the previous mm */
		cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
		x86_write_percpu(cpu_tlbstate.active_mm, next);
		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		percpu_write(cpu_tlbstate.active_mm, next);
#endif
		cpu_set(cpu, next->cpu_vm_mask);

@@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev,
	}
#ifdef CONFIG_SMP
	else {
		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

		if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
			/* We were in lazy tlb mode and leave_mm disabled
+5 −5
Original line number Diff line number Diff line
@@ -45,11 +45,11 @@ extern void pda_init(int);

#define cpu_pda(cpu)		(&per_cpu(__pda, cpu))

#define read_pda(field)		x86_read_percpu(__pda.field)
#define write_pda(field, val)	x86_write_percpu(__pda.field, val)
#define add_pda(field, val)	x86_add_percpu(__pda.field, val)
#define sub_pda(field, val)	x86_sub_percpu(__pda.field, val)
#define or_pda(field, val)	x86_or_percpu(__pda.field, val)
#define read_pda(field)		percpu_read(__pda.field)
#define write_pda(field, val)	percpu_write(__pda.field, val)
#define add_pda(field, val)	percpu_add(__pda.field, val)
#define sub_pda(field, val)	percpu_sub(__pda.field, val)
#define or_pda(field, val)	percpu_or(__pda.field, val)

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define test_and_clear_bit_pda(bit, field)				\
+13 −11
Original line number Diff line number Diff line
@@ -40,16 +40,11 @@

#ifdef CONFIG_SMP
#define __percpu_seg_str	"%%"__stringify(__percpu_seg)":"
#define __my_cpu_offset		x86_read_percpu(this_cpu_off)
#define __my_cpu_offset		percpu_read(this_cpu_off)
#else
#define __percpu_seg_str
#endif

#include <asm-generic/percpu.h>

/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);

/* For arch-specific code, we can use direct single-insn ops (they
 * don't give an lvalue though). */
extern void __bad_percpu_size(void);
@@ -115,11 +110,13 @@ do { \
	ret__;						\
})

#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
#define percpu_read(var)	percpu_from_op("mov", per_cpu__##var)
#define percpu_write(var, val)	percpu_to_op("mov", per_cpu__##var, val)
#define percpu_add(var, val)	percpu_to_op("add", per_cpu__##var, val)
#define percpu_sub(var, val)	percpu_to_op("sub", per_cpu__##var, val)
#define percpu_and(var, val)	percpu_to_op("and", per_cpu__##var, val)
#define percpu_or(var, val)	percpu_to_op("or", per_cpu__##var, val)
#define percpu_xor(var, val)	percpu_to_op("xor", per_cpu__##var, val)

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define x86_test_and_clear_bit_percpu(bit, var)				\
@@ -131,6 +128,11 @@ do { \
	old__;								\
})

#include <asm-generic/percpu.h>

/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);

#ifdef CONFIG_X86_64
extern void load_pda_offset(int cpu);
#else
Loading