Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 57dda6ef authored by Mark Nelson, committed by Paul Mackerras
Browse files

powerpc: New copy_4K_page()



This new copy_4K_page() function was originally tuned for the best
performance on the Cell processor, but after testing on more 64bit
powerpc chips it was found that with a small modification it either
matched the performance offered by the current mainline version or
bettered it by a small amount.

It was found that on a Cell-based QS22 blade the amount of system
time measured when compiling a 2.6.26 pseries_defconfig decreased
by 4%. Using the same test, a 4-way 970MP machine saw a decrease of
2% in system time. No noticeable change was seen on Power4, Power5
or Power6.

The 4096 byte page is copied in thirty-two 128 byte strides. An
initial setup loop executes dcbt instructions for the whole source
page and dcbz instructions for the whole destination page. To do
this, the cache line size is retrieved from ppc64_caches.

A new CPU feature bit, CPU_FTR_CP_USE_DCBTZ (introduced in the
previous patch), is used to select the variant of this new copy
routine: on Power4, 970 and Cell the feature bit is set so the
setup loop is executed, but on all other 64-bit chips the setup
loop is nop'ed out.

Signed-off-by: Mark Nelson <markn@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
parent 2a929436
Loading
Loading
Loading
Loading
+93 −105
Original line number Original line Diff line number Diff line
/*
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 * Copyright (C) 2008 Mark Nelson, IBM Corp.
 *
 *
 * This program is free software; you can redistribute it and/or
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * modify it under the terms of the GNU General Public License
@@ -8,112 +8,100 @@
 */
 */
#include <asm/processor.h>
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

        /*
         * TOC entry for the ppc64_caches symbol, labelled PPC64_CACHES so
         * that code can load it TOC-relative as PPC64_CACHES@toc(r2).
         */
        .section        ".toc","aw"
PPC64_CACHES:
        .tc             ppc64_caches[TC],ppc64_caches
        .section        ".text"		/* switch back to the code section */



_GLOBAL(copy_4K_page)
_GLOBAL(copy_4K_page)
	std	r31,-8(1)
	li	r5,4096		/* 4K page size */
	std	r30,-16(1)
BEGIN_FTR_SECTION
	std	r29,-24(1)
	ld      r10,PPC64_CACHES@toc(r2)
	std	r28,-32(1)
	lwz	r11,DCACHEL1LOGLINESIZE(r10)	/* log2 of cache line size */
	std	r27,-40(1)
	lwz     r12,DCACHEL1LINESIZE(r10)	/* get cache line size */
	std	r26,-48(1)
	li	r9,0
	std	r25,-56(1)
	srd	r8,r5,r11
	std	r24,-64(1)

	std	r23,-72(1)
	mtctr	r8
	std	r22,-80(1)
setup:
	std	r21,-88(1)
	dcbt	r9,r4
	std	r20,-96(1)
	dcbz	r9,r3
	li	r5,4096/32 - 1
	add	r9,r9,r12
	bdnz	setup
END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
	addi	r3,r3,-8
	addi	r3,r3,-8
	li	r12,5
	srdi    r8,r5,7		/* page is copied in 128 byte strides */
0:	addi	r5,r5,-24
	addi	r8,r8,-1	/* one stride copied outside loop */
	mtctr	r12

	ld	r22,640(4)
	mtctr	r8
	ld	r21,512(4)

	ld	r20,384(4)
	ld	r5,0(r4)
	ld	r11,256(4)
	ld	r6,8(r4)
	ld	r9,128(4)
	ld	r7,16(r4)
	ld	r7,0(4)
	ldu	r8,24(r4)
	ld	r25,648(4)
1:	std	r5,8(r3)
	ld	r24,520(4)
	ld	r9,8(r4)
	ld	r23,392(4)
	std	r6,16(r3)
	ld	r10,264(4)
	ld	r10,16(r4)
	ld	r8,136(4)
	std	r7,24(r3)
	ldu	r6,8(4)
	ld	r11,24(r4)
	cmpwi	r5,24
	std	r8,32(r3)
1:	std	r22,648(3)
	ld	r12,32(r4)
	std	r21,520(3)
	std	r9,40(r3)
	std	r20,392(3)
	ld	r5,40(r4)
	std	r11,264(3)
	std	r10,48(r3)
	std	r9,136(3)
	ld	r6,48(r4)
	std	r7,8(3)
	std	r11,56(r3)
	ld	r28,648(4)
	ld	r7,56(r4)
	ld	r27,520(4)
	std	r12,64(r3)
	ld	r26,392(4)
	ld	r8,64(r4)
	ld	r31,264(4)
	std	r5,72(r3)
	ld	r30,136(4)
	ld	r9,72(r4)
	ld	r29,8(4)
	std	r6,80(r3)
	std	r25,656(3)
	ld	r10,80(r4)
	std	r24,528(3)
	std	r7,88(r3)
	std	r23,400(3)
	ld	r11,88(r4)
	std	r10,272(3)
	std	r8,96(r3)
	std	r8,144(3)
	ld	r12,96(r4)
	std	r6,16(3)
	std	r9,104(r3)
	ld	r22,656(4)
	ld	r5,104(r4)
	ld	r21,528(4)
	std	r10,112(r3)
	ld	r20,400(4)
	ld	r6,112(r4)
	ld	r11,272(4)
	std	r11,120(r3)
	ld	r9,144(4)
	ld	r7,120(r4)
	ld	r7,16(4)
	stdu	r12,128(r3)
	std	r28,664(3)
	ldu	r8,128(r4)
	std	r27,536(3)
	std	r26,408(3)
	std	r31,280(3)
	std	r30,152(3)
	stdu	r29,24(3)
	ld	r25,664(4)
	ld	r24,536(4)
	ld	r23,408(4)
	ld	r10,280(4)
	ld	r8,152(4)
	ldu	r6,24(4)
	bdnz	1b
	bdnz	1b
	std	r22,648(3)

	std	r21,520(3)
	std	r5,8(r3)
	std	r20,392(3)
	ld	r9,8(r4)
	std	r11,264(3)
	std	r6,16(r3)
	std	r9,136(3)
	ld	r10,16(r4)
	std	r7,8(3)
	std	r7,24(r3)
	addi	r4,r4,640
	ld	r11,24(r4)
	addi	r3,r3,648
	std	r8,32(r3)
	bge	0b
	ld	r12,32(r4)
	mtctr	r5
	std	r9,40(r3)
	ld	r7,0(4)
	ld	r5,40(r4)
	ld	r8,8(4)
	std	r10,48(r3)
	ldu	r9,16(4)
	ld	r6,48(r4)
3:	ld	r10,8(4)
	std	r11,56(r3)
	std	r7,8(3)
	ld	r7,56(r4)
	ld	r7,16(4)
	std	r12,64(r3)
	std	r8,16(3)
	ld	r8,64(r4)
	ld	r8,24(4)
	std	r5,72(r3)
	std	r9,24(3)
	ld	r9,72(r4)
	ldu	r9,32(4)
	std	r6,80(r3)
	stdu	r10,32(3)
	ld	r10,80(r4)
	bdnz	3b
	std	r7,88(r3)
4:	ld	r10,8(4)
	ld	r11,88(r4)
	std	r7,8(3)
	std	r8,96(r3)
	std	r8,16(3)
	ld	r12,96(r4)
	std	r9,24(3)
	std	r9,104(r3)
	std	r10,32(3)
	std	r10,112(r3)
9:	ld	r20,-96(1)
	std	r11,120(r3)
	ld	r21,-88(1)
	std	r12,128(r3)
	ld	r22,-80(1)
	ld	r23,-72(1)
	ld	r24,-64(1)
	ld	r25,-56(1)
	ld	r26,-48(1)
	ld	r27,-40(1)
	ld	r28,-32(1)
	ld	r29,-24(1)
	ld	r30,-16(1)
	ld	r31,-8(1)
	blr
	blr