Commit 9d8e2277 authored by Jan Beulich, committed by Ingo Molnar

x86-64: Handle byte-wise tail copying in memcpy() without a loop



While hard to measure, reducing the number of possibly/likely
mis-predicted branches can generally be expected to perform
slightly better.

Contrary to what might appear at first glance, this also doesn't
grow the function size (the alignment gap to the next function
just gets smaller).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com


Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 2ab56091
arch/x86/lib/memcpy_64.S  +10 −9
@@ -164,18 +164,19 @@ ENTRY(memcpy)
 	retq
 	.p2align 4
 .Lless_3bytes:
-	cmpl $0, %edx
-	je .Lend
+	subl $1, %edx
+	jb .Lend
 	/*
 	 * Move data from 1 bytes to 3 bytes.
 	 */
-.Lloop_1:
-	movb (%rsi), %r8b
-	movb %r8b, (%rdi)
-	incq %rdi
-	incq %rsi
-	decl %edx
-	jnz .Lloop_1
+	movzbl (%rsi), %ecx
+	jz .Lstore_1byte
+	movzbq 1(%rsi), %r8
+	movzbq (%rsi, %rdx), %r9
+	movb %r8b, 1(%rdi)
+	movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+	movb %cl, (%rdi)
 
 .Lend:
 	retq
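
For readers who don't speak x86-64 assembly fluently, here is a rough C sketch of what the new tail-copy path does; the function name copy_tail_1_to_3 is illustrative only and not part of the kernel. After the up-front subl $1, %edx, %rdx holds len - 1; the jz (which tests the flags still set by subl, since movzbl does not modify flags) handles the single-byte case, and for len == 2 or 3 the second and last bytes are written with possibly overlapping stores, so the old decrement-and-branch loop disappears entirely:

	#include <stddef.h>

	/*
	 * Rough C equivalent of the new .Lless_3bytes path above.
	 * Callers guarantee 1 <= len <= 3. Name and shape are
	 * illustrative; this is not kernel code.
	 */
	static void copy_tail_1_to_3(unsigned char *dst,
				     const unsigned char *src, size_t len)
	{
		unsigned char first = src[0];	/* movzbl (%rsi), %ecx */

		if (len > 1) {			/* jz .Lstore_1byte when len == 1 */
			unsigned char second = src[1];		/* movzbq 1(%rsi), %r8 */
			unsigned char last = src[len - 1];	/* movzbq (%rsi, %rdx), %r9 */

			dst[1] = second;		/* movb %r8b, 1(%rdi) */
			dst[len - 1] = last;		/* movb %r9b, (%rdi, %rdx) */
		}
		dst[0] = first;			/* movb %cl, (%rdi) */
	}

For len == 2 the two inner stores both hit dst[1] with the same value, which is exactly why the overlapping-store trick is safe here: redundant stores cost far less than the mis-predicted branches of a per-byte loop.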