Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 223e23e8 authored by Will Deacon, committed by Catalin Marinas
Browse files

arm64: lib: improve copy_page to deal with 128 bytes at a time



We want to avoid lots of different copy_page implementations, settling
for something that is "good enough" everywhere and hopefully easy to
understand and maintain whilst we're at it.

This patch reworks our copy_page implementation based on discussions
with Cavium on the list and benchmarking on Cortex-A processors so that:

  - The loop is unrolled to copy 128 bytes per iteration

  - The reads are offset so that we read from the next 128-byte block
    in the same iteration that we store the previous block

  - Explicit prefetch instructions are removed for now, since they hurt
    performance on CPUs with hardware prefetching

  - The loop exit condition is calculated at the start of the loop

Signed-off-by: Will Deacon <will.deacon@arm.com>
Tested-by: Andrew Pinski <apinski@cavium.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent d5370f75
Loading
Loading
Loading
Loading
+38 −8
Original line number Diff line number Diff line
@@ -27,20 +27,50 @@
 *	x1 - src
 */
ENTRY(copy_page)
	/*
	 * Copy one page (PAGE_SIZE bytes) from src to dest; both pointers
	 * are page aligned, so every 128-byte block below stays in bounds.
	 *
	 *	x0 - dest
	 *	x1 - src
	 *
	 * The loop copies 128 bytes per iteration.  Loads run one 128-byte
	 * block ahead of the stores: each pass stores the block fetched on
	 * the previous pass while reading the next one, overlapping load
	 * and store latency.  No explicit prefetch instructions — per the
	 * commit message they hurt on cores with hardware prefetchers.
	 */

	/* Prime the software pipeline: fetch the first 128-byte block. */
	ldp	x2, x3, [x1]
	ldp	x4, x5, [x1, #16]
	ldp	x6, x7, [x1, #32]
	ldp	x8, x9, [x1, #48]
	ldp	x10, x11, [x1, #64]
	ldp	x12, x13, [x1, #80]
	ldp	x14, x15, [x1, #96]
	ldp	x16, x17, [x1, #112]

	/* x18 = bytes still to copy once the block in flight is stored. */
	mov	x18, #(PAGE_SIZE - 128)
	add	x1, x1, #128
1:
	/* Loop-exit condition computed up front; flags tested by b.gt. */
	subs	x18, x18, #128

	/*
	 * Store the previously loaded block (stnp: non-temporal store
	 * pair, hinting not to keep the destination in cache) while
	 * loading the next block from src.
	 */
	stnp	x2, x3, [x0]
	ldp	x2, x3, [x1]
	stnp	x4, x5, [x0, #16]
	ldp	x4, x5, [x1, #16]
	stnp	x6, x7, [x0, #32]
	ldp	x6, x7, [x1, #32]
	stnp	x8, x9, [x0, #48]
	ldp	x8, x9, [x1, #48]
	stnp	x10, x11, [x0, #64]
	ldp	x10, x11, [x1, #64]
	stnp	x12, x13, [x0, #80]
	ldp	x12, x13, [x1, #80]
	stnp	x14, x15, [x0, #96]
	ldp	x14, x15, [x1, #96]
	stnp	x16, x17, [x0, #112]
	ldp	x16, x17, [x1, #112]

	add	x0, x0, #128
	add	x1, x1, #128

	b.gt	1b

	/* Drain: store the final block loaded by the last iteration. */
	stnp	x2, x3, [x0]
	stnp	x4, x5, [x0, #16]
	stnp	x6, x7, [x0, #32]
	stnp	x8, x9, [x0, #48]
	stnp	x10, x11, [x0, #64]
	stnp	x12, x13, [x0, #80]
	stnp	x14, x15, [x0, #96]
	stnp	x16, x17, [x0, #112]

	ret
ENDPROC(copy_page)