[PATCH] x86_64: Remove optimization for B stepping AMD K8 (a5b250a4) · Commits · e / devices / android_kernel_sony_msm8994

arch/x86_64/lib/clear_page.S

+0 −38

Original line number	Diff line number	Diff line
		@@ -5,46 +5,8 @@
		.globl clear_page
		.p2align 4
		/*
		 * clear_page: store zero to the 4096 bytes starting at (%rdi).
		 * In:       rdi = start address
		 * Clobbers: rax (zeroed), rcx (counts down to 0),
		 *           rdi (advanced by 4096), flags
		 */
		clear_page:
		xorl %eax,%eax
		/* 64 iterations, each storing 64 bytes */
		movl $4096/64,%ecx
		.p2align 4
		.Lloop:
		/* decl sets ZF for the jnz below; the movq/leaq in between leave flags untouched */
		decl %ecx
		/* PUT(x): store rax (zero) to quadword number x of the current 64-byte block */
		#define PUT(x) movq %rax,x*8(%rdi)
		movq %rax,(%rdi)
		PUT(1)
		PUT(2)
		PUT(3)
		PUT(4)
		PUT(5)
		PUT(6)
		PUT(7)
		/* advance to the next 64-byte block; leaq is used so ZF from decl survives */
		leaq 64(%rdi),%rdi
		jnz .Lloop
		/* NOTE(review): purpose of this nop is not evident from the code; presumably padding so that ret does not directly follow the branch -- confirm */
		nop
		ret
		/* end marker: clear_page_end-clear_page is the routine's size in bytes */
		clear_page_end:

		/* C-stepping K8 CPUs run faster using the string instructions.
		The code is also a lot simpler. Use this version when possible. */

		#include <asm/cpufeature.h>

		/*
		 * Record in .altinstructions pairing clear_page with clear_page_c:
		 * two addresses, a CPU feature id, then the byte length of each routine.
		 * NOTE(review): presumably consumed by the kernel's alternative-instruction
		 * patching code -- confirm the record layout against struct alt_instr.
		 */
		.section .altinstructions,"a"
		.align 8
		.quad clear_page
		.quad clear_page_c
		.byte X86_FEATURE_K8_C
		.byte clear_page_end-clear_page
		.byte clear_page_c_end-clear_page_c
		.previous

		/* String-instruction version of clear_page: rdi = start address. */
		.section .altinstr_replacement,"ax"
		clear_page_c:
		/* 512 quadwords = 4096 bytes */
		movl $4096/8,%ecx
		xorl %eax,%eax
		/* rep stosq: store rax to (%rdi) rcx times, stepping rdi by 8 (assumes DF is clear) */
		rep
		stosq
		ret
		/* end marker used for the replacement-length byte in the record above */
		clear_page_c_end:
		.previous

arch/x86_64/lib/copy_page.S

+0 −87

Original line number	Diff line number	Diff line
		@@ -8,94 +8,7 @@
		.globl copy_page
		.p2align 4
		/*
		 * copy_page: copy one 4096-byte page from (%rsi) to (%rdi).
		 *
		 * In:       rdi = destination, rsi = source
		 * Clobbers: rax, rcx, rdx, r8-r11, flags;
		 *           rsi and rdi are both advanced by 4096
		 * Saved:    rbx, r12 (callee-saved registers used as copy temporaries)
		 *
		 * The page is moved in 64-byte blocks through eight registers.
		 * The first 59 blocks prefetch five blocks ahead of the read
		 * pointer; the last five blocks are copied without prefetching,
		 * so no prefetch address lies beyond the end of the source page.
		 */
		copy_page:
		/*
		 * Only rbx and r12 need saving: they are the only callee-saved
		 * registers the loops touch.  The stack pointer is moved before
		 * the stores so nothing is written below rsp.
		 */
		subq $2*8,%rsp
		movq %rbx,(%rsp)
		movq %r12,1*8(%rsp)

		/* 59 prefetching iterations; the remaining 5 run in .Loop2 */
		movl $(4096/64)-5,%ecx
		.p2align 4
		.Loop64:
		/* ZF from decl is consumed by jnz; mov/prefetch/lea leave flags alone */
		decl %ecx

		movq (%rsi),%rax
		movq 8(%rsi),%rbx
		movq 16(%rsi),%rdx
		movq 24(%rsi),%r8
		movq 32(%rsi),%r9
		movq 40(%rsi),%r10
		movq 48(%rsi),%r11
		movq 56(%rsi),%r12

		/* fetch the block five iterations ahead */
		prefetcht0 5*64(%rsi)

		movq %rax,(%rdi)
		movq %rbx,8(%rdi)
		movq %rdx,16(%rdi)
		movq %r8,24(%rdi)
		movq %r9,32(%rdi)
		movq %r10,40(%rdi)
		movq %r11,48(%rdi)
		movq %r12,56(%rdi)

		leaq 64(%rsi),%rsi
		leaq 64(%rdi),%rdi

		jnz .Loop64

		/* last five blocks: same copy, no prefetch */
		movl $5,%ecx
		.p2align 4
		.Loop2:
		decl %ecx

		movq (%rsi),%rax
		movq 8(%rsi),%rbx
		movq 16(%rsi),%rdx
		movq 24(%rsi),%r8
		movq 32(%rsi),%r9
		movq 40(%rsi),%r10
		movq 48(%rsi),%r11
		movq 56(%rsi),%r12

		movq %rax,(%rdi)
		movq %rbx,8(%rdi)
		movq %rdx,16(%rdi)
		movq %r8,24(%rdi)
		movq %r9,32(%rdi)
		movq %r10,40(%rdi)
		movq %r11,48(%rdi)
		movq %r12,56(%rdi)

		leaq 64(%rdi),%rdi
		leaq 64(%rsi),%rsi

		jnz .Loop2

		/* restore callee-saved registers and release the frame */
		movq (%rsp),%rbx
		movq 1*8(%rsp),%r12
		addq $2*8,%rsp
		ret

		/* C-stepping K8 CPUs run faster using the string copy instructions.
		The code is also a lot simpler. Use this version when possible. */

		#include <asm/cpufeature.h>

		/*
		 * Record in .altinstructions pairing copy_page with copy_page_c:
		 * two addresses, a CPU feature id, then two length bytes.
		 * NOTE(review): both length bytes are the size of copy_page_c; the size
		 * of copy_page itself is not recorded.  Presumably deliberate (the
		 * replacement ends in ret and each length must fit in one byte) --
		 * confirm against the code that consumes this record.
		 */
		.section .altinstructions,"a"
		.align 8
		.quad copy_page
		.quad copy_page_c
		.byte X86_FEATURE_K8_C
		.byte copy_page_c_end-copy_page_c
		.byte copy_page_c_end-copy_page_c
		.previous

		/* String-instruction version of copy_page: rdi = destination, rsi = source. */
		.section .altinstr_replacement,"ax"
		copy_page_c:
		/* 512 quadwords = 4096 bytes */
		movl $4096/8,%ecx
		/* rep movsq: copy rcx quadwords from (%rsi) to (%rdi) (assumes DF is clear) */
		rep
		movsq
		ret
		/* end marker used for the length bytes in the record above */
		copy_page_c_end:
		.previous

arch/x86_64/lib/memcpy.S

+2 −91

Original line number	Diff line number	Diff line
		@@ -11,6 +11,8 @@
		*
		* Output:
		* rax original destination
		*
		* TODO: check best memcpy for PSC
		*/

		.globl __memcpy
		@@ -18,95 +20,6 @@
		.p2align 4
		/*
		 * memcpy / __memcpy: copy rdx bytes from (%rsi) to (%rdi).
		 *
		 * In:       rdi = destination, rsi = source, rdx = count
		 * Out:      rax = original destination
		 * Clobbers: rcx, r8-r11, rsi, rdi, flags
		 *
		 * Only the low 32 bits of the count are examined (every read of the
		 * count is movl %edx,%ecx).  No callee-saved register is used, so
		 * the routine needs no stack frame.
		 */
		__memcpy:
		memcpy:
		movq %rdi,%rax

		/* ecx = number of whole 64-byte blocks */
		movl %edx,%ecx
		shrl $6,%ecx
		jz .Lhandle_tail

		.p2align 4
		.Lloop_64:
		/* ZF from decl is consumed by jnz; the movq/leaq below leave flags alone */
		decl %ecx

		movq (%rsi),%r11
		movq 8(%rsi),%r8

		movq %r11,(%rdi)
		movq %r8,1*8(%rdi)

		movq 2*8(%rsi),%r9
		movq 3*8(%rsi),%r10

		movq %r9,2*8(%rdi)
		movq %r10,3*8(%rdi)

		movq 4*8(%rsi),%r11
		movq 5*8(%rsi),%r8

		movq %r11,4*8(%rdi)
		movq %r8,5*8(%rdi)

		movq 6*8(%rsi),%r9
		movq 7*8(%rsi),%r10

		movq %r9,6*8(%rdi)
		movq %r10,7*8(%rdi)

		leaq 64(%rsi),%rsi
		leaq 64(%rdi),%rdi
		jnz .Lloop_64

		/* ecx = number of whole quadwords left in the final partial block */
		.Lhandle_tail:
		movl %edx,%ecx
		andl $63,%ecx
		shrl $3,%ecx
		jz .Lhandle_7
		.p2align 4
		.Lloop_8:
		decl %ecx
		movq (%rsi),%r8
		movq %r8,(%rdi)
		leaq 8(%rdi),%rdi
		leaq 8(%rsi),%rsi
		jnz .Lloop_8

		/* ecx = count & 7: trailing bytes copied one at a time */
		.Lhandle_7:
		movl %edx,%ecx
		andl $7,%ecx
		jz .Lende
		.p2align 4
		.Lloop_1:
		movb (%rsi),%r8b
		movb %r8b,(%rdi)
		incq %rdi
		incq %rsi
		/* incq changes flags, so decl must sit directly before jnz here */
		decl %ecx
		jnz .Lloop_1

		.Lende:
		ret
		/* end marker: .Lfinal-memcpy is the routine's size in bytes */
		.Lfinal:

		/* C-stepping K8 CPUs run faster using the string copy instructions.
		The code is also a lot simpler. Use this version when possible. */

		/*
		 * Record in .altinstructions pairing memcpy with memcpy_c:
		 * two addresses, a CPU feature id, then the byte length of each routine
		 * (.Lfinal-memcpy for the original, memcpy_c_end-memcpy_c for memcpy_c).
		 * NOTE(review): presumably consumed by the kernel's alternative-instruction
		 * patching code -- confirm the record layout against struct alt_instr.
		 */
		.section .altinstructions,"a"
		.align 8
		.quad memcpy
		.quad memcpy_c
		.byte X86_FEATURE_K8_C
		.byte .Lfinal-memcpy
		.byte memcpy_c_end-memcpy_c
		.previous

		.section .altinstr_replacement,"ax"
		/* rdi destination
		* rsi source
		* rdx count
		*/
		memcpy_c:
		movq %rdi,%rax
		movl %edx,%ecx
		shrl $3,%ecx
		@@ -117,5 +30,3 @@ memcpy_c:
		rep
		movsb
		ret
		memcpy_c_end:
		.previous

arch/x86_64/lib/memset.S

+0 −94

Original line number	Diff line number	Diff line
		@@ -13,98 +13,6 @@
		.p2align 4
		/*
		 * memset / __memset: fill rdx bytes at (%rdi) with the low byte of rsi.
		 *
		 * In:       rdi = destination, sil = fill byte, rdx = count
		 * Out:      rax = original destination
		 * Clobbers: rcx, rdx, rdi, r8-r11, flags
		 *
		 * The block and tail counts are derived from the low 32 bits of the
		 * count only (movl %r11d,%ecx).
		 */
		memset:
		__memset:
		/* r10 = original destination, kept for the return value */
		movq %rdi,%r10
		/* r11 = count; rdx itself is overwritten by the mul below */
		movq %rdx,%r11

		/* expand byte value: rax = fill byte replicated into all eight bytes */
		movzbl %sil,%ecx
		movabs $0x0101010101010101,%rax
		mul %rcx /* with rax, clobbers rdx */

		/* align dst: r9 = dst & 7; a non-zero value takes the fix-up path */
		movl %edi,%r9d
		andl $7,%r9d
		jnz .Lbad_alignment
		.Lafter_bad_alignment:

		/* ecx = number of whole 64-byte blocks */
		movl %r11d,%ecx
		shrl $6,%ecx
		jz .Lhandle_tail

		.p2align 4
		.Lloop_64:
		/* ZF from decl is consumed by jnz; the movq/leaq below leave flags alone */
		decl %ecx
		movq %rax,(%rdi)
		movq %rax,8(%rdi)
		movq %rax,16(%rdi)
		movq %rax,24(%rdi)
		movq %rax,32(%rdi)
		movq %rax,40(%rdi)
		movq %rax,48(%rdi)
		movq %rax,56(%rdi)
		leaq 64(%rdi),%rdi
		jnz .Lloop_64

		/* Handle the tail in loops. The loops should be faster than
		hard-to-predict jump tables. */
		.p2align 4
		.Lhandle_tail:
		/* ecx = bytes of the tail that form whole quadwords (count & 56) */
		movl %r11d,%ecx
		andl $63&(~7),%ecx
		jz .Lhandle_7
		/* convert that byte count into a quadword count */
		shrl $3,%ecx
		.p2align 4
		.Lloop_8:
		decl %ecx
		movq %rax,(%rdi)
		leaq 8(%rdi),%rdi
		jnz .Lloop_8

		.Lhandle_7:
		/* ecx = count & 7: trailing bytes stored one at a time */
		movl %r11d,%ecx
		andl $7,%ecx
		jz .Lende
		.p2align 4
		.Lloop_1:
		decl %ecx
		movb %al,(%rdi)
		leaq 1(%rdi),%rdi
		jnz .Lloop_1

		.Lende:
		/* return the original destination */
		movq %r10,%rax
		ret

		.Lbad_alignment:
		/* with 7 or fewer bytes, skip alignment and use the byte loop */
		cmpq $7,%r11
		jbe .Lhandle_7
		movq %rax,(%rdi) /* unaligned store */
		/* r8 = 8 - (dst & 7): bytes up to the next 8-byte boundary */
		movq $8,%r8
		subq %r9,%r8
		/* step dst to that boundary and take those bytes off the count */
		addq %r8,%rdi
		subq %r8,%r11
		jmp .Lafter_bad_alignment

		/* C-stepping K8 CPUs run faster using the string instructions.
		The code is also a lot simpler. Use this version when possible. */

		#include <asm/cpufeature.h>

		/*
		 * Record in .altinstructions pairing memset with memset_c:
		 * two addresses, a CPU feature id, then two length bytes.
		 * NOTE(review): both length bytes are the size of memset_c; the size of
		 * memset itself is not recorded.  Presumably deliberate (the replacement
		 * ends in ret and each length must fit in one byte) -- confirm against
		 * the code that consumes this record.
		 */
		.section .altinstructions,"a"
		.align 8
		.quad memset
		.quad memset_c
		.byte X86_FEATURE_K8_C
		.byte memset_c_end-memset_c
		.byte memset_c_end-memset_c
		.previous

		.section .altinstr_replacement,"ax"
		/* rdi destination
		* rsi value
		* rdx count
		*/
		memset_c:
		movq %rdi,%r9
		movl %edx,%r8d
		andl $7,%r8d
		@@ -121,5 +29,3 @@ memset_c:
		stosb
		movq %r9,%rax
		ret
		memset_c_end:
		.previous