Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a5b250a4 authored by Andi Kleen's avatar Andi Kleen Committed by Linus Torvalds
Browse files

[PATCH] x86_64: Remove optimization for B stepping AMD K8



B-stepping parts were the first shipping Opterons. memcpy/memset/copy_page/
clear_page had special optimized versions for them. These CPUs are really
old and in the minority now, and the difference from the generic versions
(using rep microcode) is not that big anyway. So just remove them.

TODO: figure out optimized versions for Intel Netburst based EM64T

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent a6f5deb2
Loading
Loading
Loading
Loading
+0 −38
Original line number Diff line number Diff line
@@ -5,46 +5,8 @@
	/*
	 * clear_page - zero 4096 bytes starting at the address in %rdi.
	 *
	 * Runs 4096/64 = 64 passes of an unrolled loop; each pass issues
	 * eight 8-byte stores of %rax (zero) and then advances %rdi by 64.
	 *
	 * In:       %rdi = first byte to clear
	 * Clobbers: %rax, %rcx, %rdi, flags
	 */
	.globl clear_page
	.p2align 4
clear_page:
	/* A 32-bit write clears the upper half too, so %rax = 0 */
	xorl   %eax,%eax
	/* %ecx = number of 64-byte chunks still to clear */
	movl   $4096/64,%ecx
	.p2align 4
.Lloop:
	/* Sets ZF for the jnz at the bottom; movq and leaq leave flags alone */
	decl	%ecx
	/* PUT(x): store %rax into the x-th quadword relative to %rdi */
#define PUT(x) movq %rax,x*8(%rdi) 
	movq %rax,(%rdi)
	PUT(1)
	PUT(2)
	PUT(3)
	PUT(4)
	PUT(5)
	PUT(6)
	PUT(7)
	/* Advance by one chunk; leaq is used so the decl result is preserved */
	leaq	64(%rdi),%rdi
	jnz	.Lloop
	/* NOTE(review): the reason for this nop is not evident from this file */
	nop
	ret
	/* End marker: the .altinstructions record below uses it to size clear_page */
clear_page_end:	
	
	/* C stepping K8 run faster using the string instructions.
	   It is also a lot simpler. Use this when possible */
	
#include <asm/cpufeature.h>
	    	
	/*
	 * Alternative-instruction record for clear_page. As emitted here it
	 * holds, in order: the address of clear_page, the address of
	 * clear_page_c, the X86_FEATURE_K8_C feature number, the byte length
	 * of clear_page and the byte length of clear_page_c.
	 * NOTE(review): presumably consumed by the kernel's alternatives
	 * patching code -- confirm the layout against struct alt_instr.
	 */
	.section .altinstructions,"a"
	.align 8
	.quad  clear_page
	.quad  clear_page_c
	.byte  X86_FEATURE_K8_C
	.byte  clear_page_end-clear_page	
	.byte  clear_page_c_end-clear_page_c
	.previous

	/*
	 * clear_page_c - string-instruction variant: stores 4096/8 = 512
	 * zero quadwords starting at the address in %rdi.
	 * Clobbers: %rax, %rcx, %rdi
	 */
	.section .altinstr_replacement,"ax"
clear_page_c:
	/* %ecx = quadword count for rep stosq */
	movl $4096/8,%ecx
	/* %rax = 0, the value stored by stosq */
	xorl %eax,%eax
	rep 
	stosq
	ret
	/* End marker used for the replacement length in the record above */
clear_page_c_end:
	.previous
+0 −87
Original line number Diff line number Diff line
@@ -8,94 +8,7 @@
	/*
	 * copy_page - copy 4096 bytes from the address in %rsi to the
	 * address in %rdi.
	 *
	 * Each pass loads 64 bytes into eight registers and then stores them.
	 * The first (4096/64)-5 passes also prefetch 5*64 bytes ahead of the
	 * current source position; the last 5 passes do not, so every
	 * prefetch address stays inside the 4096-byte source region.
	 *
	 * In:       %rsi = source, %rdi = destination
	 * Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags
	 *           (%rbx and %r12 are used but saved/restored on the stack)
	 */
	.globl copy_page
	.p2align 4
copy_page:
	/* Reserve three 8-byte slots and save the callee-saved registers */
	subq	$3*8,%rsp
	movq	%rbx,(%rsp)
	movq	%r12,1*8(%rsp)
	/* NOTE(review): %r13 is saved and restored but not otherwise
	   referenced in this routine */
	movq	%r13,2*8(%rsp)
			
	/* Pass count for the prefetching loop (59) */
	movl	$(4096/64)-5,%ecx
	.p2align 4
.Loop64:	
	/* Sets ZF for the jnz below; movq, prefetcht0 and leaq keep flags */
  	dec     %rcx

	/* Load one 64-byte chunk from the source */
	movq        (%rsi), %rax
	movq      8 (%rsi), %rbx
	movq     16 (%rsi), %rdx
	movq     24 (%rsi), %r8
	movq     32 (%rsi), %r9
	movq     40 (%rsi), %r10
	movq     48 (%rsi), %r11
	movq     56 (%rsi), %r12

	/* Prefetch the source 320 bytes ahead of the chunk just loaded */
	prefetcht0 5*64(%rsi)

	/* Store the chunk to the destination */
	movq     %rax,    (%rdi)
	movq     %rbx,  8 (%rdi)
	movq     %rdx, 16 (%rdi)
	movq     %r8,  24 (%rdi)
	movq     %r9,  32 (%rdi)
	movq     %r10, 40 (%rdi)
	movq     %r11, 48 (%rdi)
	movq     %r12, 56 (%rdi)

	/* Advance both pointers by one chunk without touching flags */
	leaq    64 (%rsi), %rsi
	leaq    64 (%rdi), %rdi

	jnz     .Loop64

	/* Remaining 5 chunks: same copy, but without the prefetch */
	movl	$5,%ecx
	.p2align 4
.Loop2:	
	decl   %ecx

	movq        (%rsi), %rax
	movq      8 (%rsi), %rbx
	movq     16 (%rsi), %rdx
	movq     24 (%rsi), %r8
	movq     32 (%rsi), %r9
	movq     40 (%rsi), %r10
	movq     48 (%rsi), %r11
	movq     56 (%rsi), %r12

	movq     %rax,    (%rdi)
	movq     %rbx,  8 (%rdi)
	movq     %rdx, 16 (%rdi)
	movq     %r8,  24 (%rdi)
	movq     %r9,  32 (%rdi)
	movq     %r10, 40 (%rdi)
	movq     %r11, 48 (%rdi)
	movq     %r12, 56 (%rdi)
	
	leaq	64(%rdi),%rdi			
	leaq	64(%rsi),%rsi			
	
	jnz	.Loop2		
	
	/* Restore the saved registers and release the stack slots */
	movq	(%rsp),%rbx
	movq	1*8(%rsp),%r12
	movq	2*8(%rsp),%r13
	addq	$3*8,%rsp
	ret
	
	/* C stepping K8 run faster using the string copy instructions.
	   It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>		
		
	/*
	 * Alternative-instruction record for copy_page. As emitted here it
	 * holds, in order: the address of copy_page, the address of
	 * copy_page_c, the X86_FEATURE_K8_C feature number, and two length
	 * bytes. Both length bytes are the size of copy_page_c; the size of
	 * copy_page itself is not recorded.
	 * NOTE(review): presumably consumed by the kernel's alternatives
	 * patching code -- confirm the layout against struct alt_instr.
	 */
	.section .altinstructions,"a"
	.align 8
	.quad  copy_page
	.quad  copy_page_c
	.byte  X86_FEATURE_K8_C
	.byte  copy_page_c_end-copy_page_c
	.byte  copy_page_c_end-copy_page_c
	.previous

	/*
	 * copy_page_c - string-instruction variant: copies 4096/8 = 512
	 * quadwords from the address in %rsi to the address in %rdi.
	 * Clobbers: %rcx, %rsi, %rdi
	 */
	.section .altinstr_replacement,"ax"
copy_page_c:
	/* %ecx = quadword count for rep movsq */
	movl $4096/8,%ecx
	rep 
	movsq 
	ret
	/* End marker used for both length bytes in the record above */
copy_page_c_end:
	.previous
+2 −91
Original line number Diff line number Diff line
@@ -11,6 +11,8 @@
 * 
 * Output:
 * rax original destination
 * 
 * TODO: check best memcpy for PSC
 */	

 	.globl __memcpy
@@ -18,95 +20,6 @@
	/*
	 * memcpy / __memcpy - copy %rdx bytes from (%rsi) to (%rdi).
	 *
	 * Three phases: 64-byte chunks, then 8-byte words, then single bytes.
	 * All three counts are derived from %edx, i.e. only the low 32 bits
	 * of the count are consulted.
	 *
	 * In:       %rdi = destination, %rsi = source, %rdx = count
	 * Out:      %rax = original destination
	 * Clobbers: %rcx, %rsi, %rdi, %r8-%r11, flags
	 */
	.p2align 4
__memcpy:
memcpy:		
	/* NOTE(review): %rbx is saved and restored here but not otherwise
	   referenced in this routine */
	pushq %rbx
	/* Return value: the destination pointer as passed in */
	movq %rdi,%rax

	/* %ecx = number of whole 64-byte chunks; skip the loop if none */
	movl %edx,%ecx
	shrl $6,%ecx
	jz .Lhandle_tail
	
	.p2align 4
.Lloop_64:
	/* Sets ZF for the jnz below; movq and leaq leave flags alone */
	decl %ecx
	
	/* Copy 64 bytes as four load-pair/store-pair groups of 16 bytes */
	movq (%rsi),%r11
	movq 8(%rsi),%r8

	movq %r11,(%rdi)
	movq %r8,1*8(%rdi)

	movq 2*8(%rsi),%r9
	movq 3*8(%rsi),%r10

	movq %r9,2*8(%rdi)
	movq %r10,3*8(%rdi)
		
	movq 4*8(%rsi),%r11
	movq 5*8(%rsi),%r8

	movq %r11,4*8(%rdi)
	movq %r8,5*8(%rdi)

	movq 6*8(%rsi),%r9
	movq 7*8(%rsi),%r10

	movq %r9,6*8(%rdi)
	movq %r10,7*8(%rdi)

	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	jnz  .Lloop_64

.Lhandle_tail:
	/* %ecx = number of whole 8-byte words in the remaining count & 63 */
	movl %edx,%ecx
	andl $63,%ecx
	shrl $3,%ecx
	jz   .Lhandle_7
	.p2align 4
.Lloop_8: 
	decl %ecx
	movq (%rsi),%r8
	movq %r8,(%rdi) 
	leaq 8(%rdi),%rdi
	leaq 8(%rsi),%rsi
	jnz  .Lloop_8

.Lhandle_7:
	/* %ecx = trailing byte count (count & 7) */
	movl %edx,%ecx
	andl $7,%ecx
	jz .Lende
	.p2align 4
.Lloop_1:
	movb (%rsi),%r8b
	movb %r8b,(%rdi) 
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1
	
.Lende: 	
	popq %rbx
	ret
	/* End marker: .Lfinal-memcpy is the routine's length in the
	   .altinstructions record below */
.Lfinal:
	
	/* C stepping K8 run faster using the string copy instructions.
	   It is also a lot simpler. Use this when possible */
	
	/*
	 * Alternative-instruction record for memcpy. As emitted here it
	 * holds, in order: the address of memcpy, the address of memcpy_c,
	 * the X86_FEATURE_K8_C feature number, the byte length of memcpy
	 * (.Lfinal-memcpy) and the byte length of memcpy_c.
	 * NOTE(review): presumably consumed by the kernel's alternatives
	 * patching code -- confirm the layout against struct alt_instr.
	 */
	.section .altinstructions,"a"
	.align 8
	.quad  memcpy
	.quad  memcpy_c
	.byte  X86_FEATURE_K8_C
	.byte  .Lfinal-memcpy
	.byte  memcpy_c_end-memcpy_c	
	.previous

	.section .altinstr_replacement,"ax"
 /* rdi	destination
  * rsi source
  * rdx count
  */			
memcpy_c:
	movq %rdi,%rax
	movl %edx,%ecx
	shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
	rep
	movsb
	ret
memcpy_c_end:
	.previous
+0 −94
Original line number Diff line number Diff line
@@ -13,98 +13,6 @@
	/*
	 * memset / __memset - fill %rdx bytes at (%rdi) with the byte in %sil.
	 *
	 * The fill byte is replicated into all eight bytes of %rax, the
	 * destination is brought to 8-byte alignment if needed, and the
	 * region is then written in 64-byte chunks, 8-byte words and single
	 * bytes. Loop counts are taken from %r11d, i.e. the low 32 bits of
	 * the (possibly adjusted) count.
	 *
	 * In:       %rdi = destination, %sil = fill byte, %rdx = count
	 * Out:      %rax = original destination
	 * Clobbers: %rcx, %rdx, %rdi, %r8-%r11, flags
	 */
	.p2align 4
memset:	
__memset:
	/* Keep the original destination (for the return value) and the count */
	movq %rdi,%r10
	movq %rdx,%r11

	/* expand byte value  */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul    %rcx		/* with rax, clobbers rdx */

	/* align dst */
	/* %r9d = destination & 7; non-zero means not 8-byte aligned */
	movl  %edi,%r9d		
	andl  $7,%r9d	
	jnz  .Lbad_alignment
.Lafter_bad_alignment:
	
	/* %ecx = number of whole 64-byte chunks; skip the loop if none */
	movl %r11d,%ecx
	shrl $6,%ecx
	jz	 .Lhandle_tail

	.p2align 4
.Lloop_64:	
	/* Sets ZF for the jnz below; movq and leaq leave flags alone */
	decl   %ecx
	movq  %rax,(%rdi) 
	movq  %rax,8(%rdi) 
	movq  %rax,16(%rdi) 
	movq  %rax,24(%rdi) 
	movq  %rax,32(%rdi) 
	movq  %rax,40(%rdi) 
	movq  %rax,48(%rdi) 
	movq  %rax,56(%rdi) 
	leaq  64(%rdi),%rdi
	jnz    .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */ 
	.p2align 4	   
.Lhandle_tail:
	/* Mask 63&(~7) = 56 keeps the whole-word part of count & 63;
	   the jz tests the masked value, the shift then gives the word count */
	movl	%r11d,%ecx
	andl    $63&(~7),%ecx
	jz 		.Lhandle_7
	shrl	$3,%ecx
	.p2align 4
.Lloop_8:
	decl   %ecx
	movq  %rax,(%rdi)
	leaq  8(%rdi),%rdi
	jnz    .Lloop_8

.Lhandle_7:
	/* %ecx = trailing byte count (count & 7) */
	movl	%r11d,%ecx
	andl	$7,%ecx
	jz      .Lende
	.p2align 4
.Lloop_1:
	decl    %ecx
	movb 	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz     .Lloop_1
	
.Lende:	
	/* Return the destination pointer as passed in */
	movq	%r10,%rax
	ret

.Lbad_alignment:
	/* With 7 or fewer bytes, go straight to the byte loop */
	cmpq $7,%r11
	jbe	.Lhandle_7
	/* Otherwise write 8 bytes at the unaligned start, then step %rdi
	   forward by 8 - (dst & 7) to the next 8-byte boundary and reduce
	   the count by the same amount */
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8			
	subq %r9,%r8 
	addq %r8,%rdi
	subq %r8,%r11
	jmp .Lafter_bad_alignment

	/* C stepping K8 run faster using the string instructions.
	   It is also a lot simpler. Use this when possible */

#include <asm/cpufeature.h>	
		
	/*
	 * Alternative-instruction record for memset. As emitted here it
	 * holds, in order: the address of memset, the address of memset_c,
	 * the X86_FEATURE_K8_C feature number, and two length bytes. Both
	 * length bytes are the size of memset_c; the size of memset itself
	 * is not recorded.
	 * NOTE(review): presumably consumed by the kernel's alternatives
	 * patching code -- confirm the layout against struct alt_instr.
	 */
	.section .altinstructions,"a"
	.align 8
	.quad  memset
	.quad  memset_c
	.byte  X86_FEATURE_K8_C
	.byte  memset_c_end-memset_c
	.byte  memset_c_end-memset_c
	.previous

	.section .altinstr_replacement,"ax"
 /* rdi	destination
  * rsi value
  * rdx count
  */			
memset_c:	
	movq %rdi,%r9
	movl %edx,%r8d
	andl $7,%r8d		
@@ -121,5 +29,3 @@ memset_c:
	stosb
	movq %r9,%rax
	ret
memset_c_end:
	.previous