Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8270137a authored by Christoph Lameter, committed by Tejun Heo
Browse files

cpuops: Use cmpxchg for xchg to avoid lock semantics



Use cmpxchg instead of xchg to realize this_cpu_xchg.

xchg will cause LOCK overhead since LOCK is always implied but cmpxchg
will not.

Baselines:

xchg()		= 18 cycles (no segment prefix, LOCK semantics)
__this_cpu_xchg = 1 cycle

(simulated using this_cpu_read/write, two prefixes. Looks like the
cpu can use loop optimization to get rid of most of the overhead)

Cycles before:

this_cpu_xchg	 = 37 cycles (segment prefix and LOCK (implied by xchg))

After:

this_cpu_xchg	= 11 cycles (using cmpxchg without lock semantics)

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
parent 7296e08a
Loading
Loading
Loading
Loading
+15 −6
Original line number Diff line number Diff line
@@ -263,8 +263,9 @@ do { \
})

/*
 * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
 * full lock semantics even though they are not needed.
 * xchg is implemented using cmpxchg without a lock prefix. xchg is
 * expensive due to the implied lock prefix.  The processor cannot prefetch
 * cachelines if xchg is used.
 */
#define percpu_xchg_op(var, nval)					\
({									\
@@ -272,25 +273,33 @@ do { \
	typeof(var) pxo_new__ = (nval);					\
	switch (sizeof(var)) {						\
	case 1:								\
		asm("xchgb %2, "__percpu_arg(1)				\
		asm("\n1:mov "__percpu_arg(1)",%%al"			\
		    "\n\tcmpxchgb %2, "__percpu_arg(1)			\
		    "\n\tjnz 1b"					\
			    : "=a" (pxo_ret__), "+m" (var)		\
			    : "q" (pxo_new__)				\
			    : "memory");				\
		break;							\
	case 2:								\
		asm("xchgw %2, "__percpu_arg(1)				\
		asm("\n1:mov "__percpu_arg(1)",%%ax"			\
		    "\n\tcmpxchgw %2, "__percpu_arg(1)			\
		    "\n\tjnz 1b"					\
			    : "=a" (pxo_ret__), "+m" (var)		\
			    : "r" (pxo_new__)				\
			    : "memory");				\
		break;							\
	case 4:								\
		asm("xchgl %2, "__percpu_arg(1)				\
		asm("\n1:mov "__percpu_arg(1)",%%eax"			\
		    "\n\tcmpxchgl %2, "__percpu_arg(1)			\
		    "\n\tjnz 1b"					\
			    : "=a" (pxo_ret__), "+m" (var)		\
			    : "r" (pxo_new__)				\
			    : "memory");				\
		break;							\
	case 8:								\
		asm("xchgq %2, "__percpu_arg(1)				\
		asm("\n1:mov "__percpu_arg(1)",%%rax"			\
		    "\n\tcmpxchgq %2, "__percpu_arg(1)			\
		    "\n\tjnz 1b"					\
			    : "=a" (pxo_ret__), "+m" (var)		\
			    : "r" (pxo_new__)				\
			    : "memory");				\