arm64/lib: improve CRC32 performance for deep pipelines (efdb25ef) · Commits · e / devices / android_kernel_fairphone_FP5

Improve the performance of the crc32() asm routines by getting rid of most of the branches and small sized loads on the common path.

Instead, use a branchless code path involving overlapping 16 byte loads to process the first (length % 32) bytes, and process the remainder using a loop that processes 32 bytes at a time.

Tested using the following test program:

    #include <stdlib.h>

    extern void crc32_le(unsigned short, char const*, int);

    int main(void)
    {
        static const char buf[4096];

        srand(20181126);

        for (int i = 0; i < 100 * 1000 * 1000; i++)
            crc32_le(0, buf, rand() % 1024);

        return 0;
    }

On Cortex-A53 and Cortex-A57, the performance regresses but only very slightly. On Cortex-A72 however, the performance improves from

    $ time ./crc32

    real 0m10.149s
    user 0m10.149s
    sys 0m0.000s

to

    $ time ./crc32

    real 0m7.915s
    user 0m7.915s
    sys 0m0.000s

Cc: Rui Sun <sunrui26@huawei.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>

arch/arm64/lib/crc32.S

+49 −5

Original line number	Diff line number	Diff line
		@@ -15,15 +15,59 @@
		.cpu generic+crc

		.macro __crc32, c
		0: subs x2, x2, #16
		b.mi 8f
		ldp x3, x4, [x1], #16
		cmp x2, #16
		b.lt 8f // less than 16 bytes

		and x7, x2, #0x1f
		and x2, x2, #~0x1f
		cbz x7, 32f // multiple of 32 bytes

		and x8, x7, #0xf
		ldp x3, x4, [x1]
		add x8, x8, x1
		add x1, x1, x7
		ldp x5, x6, [x8]
		CPU_BE( rev x3, x3 )
		CPU_BE( rev x4, x4 )
		CPU_BE( rev x5, x5 )
		CPU_BE( rev x6, x6 )

		tst x7, #8
		crc32\c\()x w8, w0, x3
		csel x3, x3, x4, eq
		csel w0, w0, w8, eq
		tst x7, #4
		lsr x4, x3, #32
		crc32\c\()w w8, w0, w3
		csel x3, x3, x4, eq
		csel w0, w0, w8, eq
		tst x7, #2
		lsr w4, w3, #16
		crc32\c\()h w8, w0, w3
		csel w3, w3, w4, eq
		csel w0, w0, w8, eq
		tst x7, #1
		crc32\c\()b w8, w0, w3
		csel w0, w0, w8, eq
		tst x7, #16
		crc32\c\()x w8, w0, x5
		crc32\c\()x w8, w8, x6
		csel w0, w0, w8, eq
		cbz x2, 0f

		32: ldp x3, x4, [x1], #32
		sub x2, x2, #32
		ldp x5, x6, [x1, #-16]
		CPU_BE( rev x3, x3 )
		CPU_BE( rev x4, x4 )
		CPU_BE( rev x5, x5 )
		CPU_BE( rev x6, x6 )
		crc32\c\()x w0, w0, x3
		crc32\c\()x w0, w0, x4
		b.ne 0b
		ret
		crc32\c\()x w0, w0, x5
		crc32\c\()x w0, w0, x6
		cbnz x2, 32b
		0: ret

		8: tbz x2, #3, 4f
		ldr x3, [x1], #8