Commit 9b83ecb0 authored by Anton Blanchard, committed by Benjamin Herrenschmidt

powerpc: Optimise 64bit csum_partial



The main loop of csum_partial runs very slowly on recent POWER CPUs. After some
analysis on both POWER6 and POWER7 I came up with the routine below. First we get
the source aligned to a double word, ignoring any odd alignment to keep things
simple. Then we do 64 bytes at a time, with an entry and exit limb of a further
64 bytes. On both POWER6 and POWER7 this should be as fast as we can go since
we are limited by the latency of the adde instructions.
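
For readers who want the shape of the algorithm without the PowerPC assembly, a rough
C sketch of the same structure follows: an alignment prologue, a doubleword-wide main
loop that keeps the end-around carry, the word/halfword/byte tail, and a final fold of
the 64-bit accumulator back to 32 bits. The names csum_partial_sketch and add_c are
invented for this illustration, and the sketch sums one doubleword per iteration rather
than the unrolled, explicitly scheduled adde chain that gives the assembly its speed.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Add with end-around carry: models the carry the assembly keeps in XER
 * and folds back in with adde/addze. */
static inline uint64_t add_c(uint64_t a, uint64_t b)
{
	uint64_t r = a + b;
	return r + (r < b);	/* fold any carry out straight back in */
}

uint32_t csum_partial_sketch(const void *buff, size_t len, uint32_t sum)
{
	const unsigned char *p = buff;
	uint64_t acc = sum;
	uint64_t d;
	uint32_t w;
	uint16_t h;

	/*
	 * Prologue: if the buffer is at least halfword aligned, step by
	 * halfwords until it is doubleword aligned.  Odd alignment is left
	 * alone, as described above, and simply uses unaligned loads.
	 */
	if (!((uintptr_t)p & 1)) {
		while (len >= 2 && ((uintptr_t)p & 7)) {
			memcpy(&h, p, 2);
			acc = add_c(acc, h);
			p += 2;
			len -= 2;
		}
	}

	/* Main loop: one doubleword per iteration (the assembly does eight). */
	while (len >= 8) {
		memcpy(&d, p, 8);
		acc = add_c(acc, d);
		p += 8;
		len -= 8;
	}

	/* Tail: a word, then a halfword, then one odd byte padded to 16 bits. */
	if (len >= 4) {
		memcpy(&w, p, 4);
		acc = add_c(acc, w);
		p += 4;
		len -= 4;
	}
	if (len >= 2) {
		memcpy(&h, p, 2);
		acc = add_c(acc, h);
		p += 2;
		len -= 2;
	}
	if (len & 1)
		acc = add_c(acc, (uint64_t)*p << 8);

	/* Fold the 64-bit accumulator back to 32 bits, keeping the carries. */
	acc = (acc & 0xffffffffULL) + (acc >> 32);
	acc = (acc & 0xffffffffULL) + (acc >> 32);
	return (uint32_t)acc;
}

The patch below gets its throughput from doing eight of these doubleword adds per
iteration and issuing the loads well ahead of the dependent adde chain.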

To test this I forced checksumming on over loopback and ran socklib (a
simple TCP benchmark). On a POWER6 575, throughput improved by 11% with
this patch.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
parent 93f68f1e
+153 −40
@@ -65,53 +65,166 @@ _GLOBAL(csum_tcpudp_magic)
 	srwi	r3,r3,16
 	blr
 
+#define STACKFRAMESIZE 256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
 /*
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
  *
- * This code assumes at least halfword alignment, though the length
- * can be any number of bytes.  The sum is accumulated in r5.
- *
  * csum_partial(r3=buff, r4=len, r5=sum)
  */
 _GLOBAL(csum_partial)
-        subi	r3,r3,8		/* we'll offset by 8 for the loads */
-        srdi.	r6,r4,3         /* divide by 8 for doubleword count */
-        addic   r5,r5,0         /* clear carry */
-        beq	3f              /* if we're doing < 8 bytes */
-        andi.	r0,r3,2         /* aligned on a word boundary already? */
-        beq+	1f
-        lhz     r6,8(r3)        /* do 2 bytes to get aligned */
-        addi    r3,r3,2
-        subi    r4,r4,2
-        addc    r5,r5,r6
-        srdi.   r6,r4,3         /* recompute number of doublewords */
-        beq     3f              /* any left? */
-1:      mtctr   r6
-2:      ldu     r6,8(r3)        /* main sum loop */
-        adde    r5,r5,r6
-        bdnz    2b
-        andi.	r4,r4,7         /* compute bytes left to sum after doublewords */
-3:	cmpwi	0,r4,4		/* is at least a full word left? */
-	blt	4f
-	lwz	r6,8(r3)	/* sum this word */
-	addi	r3,r3,4
-	subi	r4,r4,4
-	adde	r5,r5,r6
-4:	cmpwi	0,r4,2		/* is at least a halfword left? */
-        blt+	5f
-        lhz     r6,8(r3)        /* sum this halfword */
-	addi	r3,r3,2
-	subi	r4,r4,2
-        adde    r5,r5,r6
-5:	cmpwi	0,r4,1		/* is at least a byte left? */
-        bne+    6f
-        lbz     r6,8(r3)        /* sum this byte */
-        slwi    r6,r6,8         /* this byte is assumed to be the upper byte of a halfword */
-        adde    r5,r5,r6
-6:      addze	r5,r5		/* add in final carry */
-	rldicl  r4,r5,32,0      /* fold two 32-bit halves together */
-        add     r3,r4,r5
+	addic	r0,r5,0			/* clear carry */
+
+	srdi.	r6,r4,3			/* less than 8 bytes? */
+	beq	.Lcsum_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcsum_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+	lhz	r6,0(r3)		/* align to doubleword */
+	subi	r4,r4,2
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	bdnz	1b
+
+.Lcsum_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r4,7
+	beq	.Lcsum_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r4,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+
+	adde	r0,r0,r11
+
+	adde	r0,r0,r12
+
+	adde	r0,r0,r14
+
+	adde	r0,r0,r15
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+	bdnz	2b
+
+
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+	adde	r0,r0,r11
+	adde	r0,r0,r12
+	adde	r0,r0,r14
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r4,r4,63
+
+.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r4,3
+	beq	.Lcsum_tail_word
+
+	mtctr	r6
+3:
+	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+	bdnz	3b
+
+	andi.	r4,r4,7
+
+.Lcsum_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r4,2
+	beq	.Lcsum_tail_halfword
+
+	lwz	r6,0(r3)
+	addi	r3,r3,4
+	adde	r0,r0,r6
+	subi	r4,r4,4
+
+.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r4,1
+	beq	.Lcsum_tail_byte
+
+	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	subi	r4,r4,2
+
+.Lcsum_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r4,1
+	beq	.Lcsum_finish
+
+	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+
+.Lcsum_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
 	srdi	r3,r3,32
 	blr