
Commit f3b6eaf0 authored by Ingo Molnar

x86: memcpy, clean up



Impact: cleanup

Make this file more readable by bringing it more in line
with the usual kernel style.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent dd1ef4ec
+81 −55
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/dwarf2.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
@@ -16,10 +17,17 @@
 * rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
@@ -33,42 +41,51 @@ ENDPROC(memcpy_c)
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi,%rax

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movl %edx, %ecx
	shrl   $6, %ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (instructions inbetween do
	 * not change the zero flag):
	 */
	decl %ecx

	movq (%rsi),%r11
	movq 8(%rsi),%r8

	movq %r11,(%rdi)
	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi),		%r11
	movq 1*8(%rsi),		%r8
	movq %r11,		0*8(%rdi)
	movq %r8,		1*8(%rdi)

	movq 2*8(%rsi),		%r9
	movq 3*8(%rsi),		%r10

	movq %r9,		2*8(%rdi)
	movq %r10,		3*8(%rdi)

	movq 4*8(%rsi),		%r11
	movq 5*8(%rsi),		%r8

	movq %r11,		4*8(%rdi)
	movq %r8,		5*8(%rdi)

	movq 6*8(%rsi),		%r9
	movq 7*8(%rsi),		%r10

	movq %r9,		6*8(%rdi)
	movq %r10,		7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz  .Lloop_64

.Lhandle_tail:
@@ -76,6 +93,7 @@ ENTRY(memcpy)
	andl  $63, %ecx
	shrl   $3, %ecx
	jz   .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
@@ -88,7 +106,8 @@ ENTRY(memcpy)
.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lende
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
@@ -98,27 +117,34 @@ ENTRY(memcpy)
	decl %ecx
	jnz .Lloop_1

.Lende:
.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/* Some CPUs run faster using the string copy instructions.
	   It is also a lot simpler. Use this when possible */
	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */

	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous

	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad 1b
	.byte X86_FEATURE_REP_GOOD
	/* Replace only beginning, memcpy is used to apply alternatives, so it
	 * is silly to overwrite itself with nops - reboot is only outcome... */

	/*
	 * Replace only beginning, memcpy is used to apply alternatives,
	 * so it is silly to overwrite itself with nops - reboot is the
	 * only outcome...
	 */
	.byte 2b - 1b
	.byte 2b - 1b
	.previous
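
To make the open-coded path easier to follow, here is a small C sketch of the size decomposition the assembly above performs: shrl $6 extracts the number of full 64-byte blocks, andl $63 followed by shrl $3 extracts the remaining 8-byte words, and andl $7 leaves the trailing bytes. This is purely an illustration of that arithmetic, not kernel code; memcpy_split_sketch is an invented name for the example.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

/*
 * Illustrative sketch only (not kernel code): model how the open-coded
 * memcpy above splits a copy of 'len' bytes into 64-byte blocks, 8-byte
 * words and single bytes (.Lloop_64, .Lloop_8, .Lloop_1).
 */
static void *memcpy_split_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t blocks = len >> 6;        /* shrl $6, %ecx: full 64-byte blocks  */
	size_t words  = (len & 63) >> 3; /* andl $63; shrl $3: remaining qwords */
	size_t bytes  = len & 7;         /* andl $7, %ecx: trailing bytes       */

	while (blocks--) {               /* .Lloop_64: 4x16 bytes per pass      */
		memcpy(d, s, 64);
		d += 64;
		s += 64;
	}
	while (words--) {                /* .Lloop_8                            */
		memcpy(d, s, 8);
		d += 8;
		s += 8;
	}
	while (bytes--)                  /* .Lloop_1                            */
		*d++ = *s++;

	return dst;                      /* like %rax: the original destination */
}

int main(void)
{
	char src[100], dst[100];

	for (int i = 0; i < 100; i++)
		src[i] = (char)i;

	memcpy_split_sketch(dst, src, sizeof(src));
	printf("copies match: %d\n", memcmp(dst, src, sizeof(dst)) == 0);
	return 0;
}

A 100-byte copy, for instance, resolves to one 64-byte block, four 8-byte words and four trailing bytes, which is exactly how the three loops above would consume it.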
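
The .altinstructions entry above is the data the alternative-instructions framework reads at boot time. Below is a hedged C sketch of that record and of the patching step; the kernel's actual type is struct alt_instr in <asm/alternative.h>, whose exact layout may differ, and the names alt_record_sketch, apply_alternative_sketch and always_rep_good are invented for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Hedged sketch of the record described by the .altinstructions entry
 * above (.quad memcpy; .quad 1b; .byte X86_FEATURE_REP_GOOD;
 * .byte 2b - 1b; .byte 2b - 1b).  Field names and layout here are
 * illustrative, not copied from the kernel sources.
 */
struct alt_record_sketch {
	void    *instr;          /* .quad memcpy: code to be patched         */
	void    *replacement;    /* .quad 1b: replacement bytes (short jmp)  */
	uint8_t  cpuid_bit;      /* .byte X86_FEATURE_REP_GOOD               */
	uint8_t  instrlen;       /* .byte 2b - 1b: bytes to patch over       */
	uint8_t  replacementlen; /* .byte 2b - 1b: length of the replacement */
};

/*
 * Conceptual boot-time patching: if the CPU advertises the feature, the
 * replacement (a 2-byte "jmp memcpy_c") is copied over the start of
 * memcpy.  cpu_has() stands in for the real feature test.
 */
static void apply_alternative_sketch(const struct alt_record_sketch *a,
				     int (*cpu_has)(uint8_t bit))
{
	if (cpu_has(a->cpuid_bit))
		memcpy(a->instr, a->replacement, a->replacementlen);
}

static int always_rep_good(uint8_t bit)
{
	(void)bit;
	return 1;                /* pretend X86_FEATURE_REP_GOOD is set */
}

int main(void)
{
	uint8_t original[4]    = { 0x90, 0x90, 0x90, 0x90 }; /* placeholder code */
	uint8_t replacement[2] = { 0xeb, 0x10 };              /* jmp <disp8>      */
	struct alt_record_sketch rec = {
		original, replacement,
		0, /* the real feature bit value comes from <asm/cpufeature.h> */
		sizeof(replacement), sizeof(replacement)
	};

	apply_alternative_sketch(&rec, always_rep_good);
	printf("patched: %02x %02x\n", original[0], original[1]);
	return 0;
}

This also mirrors the design note in the comment above: because memcpy itself runs while alternatives are being applied, only the first 2b - 1b bytes of it are recorded for patching, so the framework overwrites just the entry point with the short jump rather than NOP-filling the whole function.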