arm64: lib: Implement optimized memmove routine (280adc19) · Commits · e / devices / android_kernel_teracube_emerald

arch/arm64/lib/memmove.S

+165 −25

Original line number	Diff line number	Diff line
		/*
		* Copyright (C) 2013 ARM Ltd.
		* Copyright (C) 2013 Linaro.
		*
		* This code is based on glibc cortex strings work originally authored by Linaro
		* and re-licensed under GPLv2 for the Linux kernel. The original code can
		* be found @
		*
		* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
		* files/head:/src/aarch64/
		*
		* This program is free software; you can redistribute it and/or modify
		* it under the terms of the GNU General Public License version 2 as
		@@ -16,6 +24,7 @@

		#include <linux/linkage.h>
		#include <asm/assembler.h>
		#include <asm/cache.h>

		/*
		* Move a buffer from src to dest (alignment handled by the hardware).
		@@ -28,30 +37,161 @@
		* Returns:
		* x0 - dest
		*/
		/*
		 * Symbolic register names used by the memmove body below.
		 * dstin/src/count are the three incoming arguments; the rest are scratch.
		 */
		dstin .req x0	/* destination as passed in; the routine body never writes it */
		src .req x1	/* source pointer; advanced to the end of the buffer for the backward copy */
		count .req x2	/* number of bytes still to copy */
		tmp1 .req x3	/* scratch (64-bit view) */
		tmp1w .req w3	/* scratch (32-bit view of the same register as tmp1) */
		tmp2 .req x4	/* scratch (64-bit view) */
		tmp2w .req w4	/* scratch (32-bit view of the same register as tmp2) */
		tmp3 .req x5	/* scratch (64-bit view); NOTE(review): not referenced by the visible routine body */
		tmp3w .req w5	/* scratch (32-bit view of the same register as tmp3) */
		dst .req x6	/* working destination pointer, so that dstin (x0) stays intact */

		/*
		 * Four low/high register pairs; each pair carries 16 bytes through an
		 * ldp/stp, so A..D together hold one 64-byte block.
		 */
		A_l .req x7
		A_h .req x8
		B_l .req x9
		B_h .req x10
		C_l .req x11
		C_h .req x12
		D_l .req x13
		D_h .req x14

		/*
		 * memmove - copy count bytes from src to dstin; the regions may overlap.
		 *
		 * In:      dstin (x0) = destination, src (x1) = source, count (x2) = bytes
		 * Out:     x0 is never written on the backward-copy path, so it still
		 *          holds the destination on return from that path.
		 * Scratch: src, count, tmp1, tmp2, dst, A_l..D_h and the condition flags.
		 *
		 * When dstin is below src, or at/after src + count, the work is handed to
		 * memcpy by a tail branch (memcpy is defined elsewhere; it is assumed to
		 * be safe for those two cases and to return the destination itself).
		 * Otherwise the copy runs backward, from the last byte towards the
		 * first, so every source byte is read before it can be overwritten.
		 */
		ENTRY(memmove)
			cmp	dstin, src
			b.lo	memcpy
			add	tmp1, src, count
			cmp	dstin, tmp1
			b.hs	memcpy		/* No overlap. */

			/* Point both cursors one past the end; all accesses pre-decrement. */
			add	dst, dstin, count
			add	src, src, count
			cmp	count, #16
			b.lo	.Ltail15	/* probably non-alignment accesses. */

			ands	tmp2, src, #15	/* Bytes to reach alignment. */
			b.eq	.LSrcAligned
			sub	count, count, tmp2
			/*
			 * Copy the 1..15 leading bytes (one piece per set bit of tmp2) so
			 * that src becomes 16-byte aligned. The cost of these few extra
			 * instructions is acceptable, and every later load then starts
			 * from an aligned source address.
			 */
			tbz	tmp2, #0, 1f
			ldrb	tmp1w, [src, #-1]!
			strb	tmp1w, [dst, #-1]!
		1:
			tbz	tmp2, #1, 2f
			ldrh	tmp1w, [src, #-2]!
			strh	tmp1w, [dst, #-2]!
		2:
			tbz	tmp2, #2, 3f
			ldr	tmp1w, [src, #-4]!
			str	tmp1w, [dst, #-4]!
		3:
			tbz	tmp2, #3, .LSrcAligned
			ldr	tmp1, [src, #-8]!
			str	tmp1, [dst, #-8]!

		.LSrcAligned:
			cmp	count, #64
			b.ge	.Lcpy_over64

			/*
			 * Deal with small copies quickly by dropping straight into the
			 * exit block.
			 */
		.Ltail63:
			/*
			 * Copy up to 48 bytes of data. At this point we only need the
			 * bottom 6 bits of count to be accurate.
			 */
			ands	tmp1, count, #0x30
			b.eq	.Ltail15
			cmp	tmp1w, #0x20
			b.eq	1f		/* exactly 32: two 16-byte copies */
			b.lt	2f		/* 16: one 16-byte copy */
			ldp	A_l, A_h, [src, #-16]!	/* 48: fall through all three */
			stp	A_l, A_h, [dst, #-16]!
		1:
			ldp	A_l, A_h, [src, #-16]!
			stp	A_l, A_h, [dst, #-16]!
		2:
			ldp	A_l, A_h, [src, #-16]!
			stp	A_l, A_h, [dst, #-16]!

		.Ltail15:
			/* Copy the last 0..15 bytes: one piece per set bit in count[3:0]. */
			tbz	count, #3, 1f
			ldr	tmp1, [src, #-8]!
			str	tmp1, [dst, #-8]!
		1:
			tbz	count, #2, 2f
			ldr	tmp1w, [src, #-4]!
			str	tmp1w, [dst, #-4]!
		2:
			tbz	count, #1, 3f
			ldrh	tmp1w, [src, #-2]!
			strh	tmp1w, [dst, #-2]!
		3:
			tbz	count, #0, .Lexitfunc
			ldrb	tmp1w, [src, #-1]
			strb	tmp1w, [dst, #-1]

		.Lexitfunc:
			ret

		.Lcpy_over64:
			/*
			 * count is biased by -128 from here on; only its sign and its
			 * bottom 6 bits are used afterwards.
			 */
			subs	count, count, #128
			b.ge	.Lcpy_body_large
			/*
			 * Less than 128 bytes to copy, so handle 64 bytes here and then
			 * jump to the tail.
			 */
			ldp	A_l, A_h, [src, #-16]
			stp	A_l, A_h, [dst, #-16]
			ldp	B_l, B_h, [src, #-32]
			ldp	C_l, C_h, [src, #-48]
			stp	B_l, B_h, [dst, #-32]
			stp	C_l, C_h, [dst, #-48]
			ldp	D_l, D_h, [src, #-64]!
			stp	D_l, D_h, [dst, #-64]!

			tst	count, #0x3f
			b.ne	.Ltail63
			ret

			/*
			 * Critical loop. Start at a new cache line boundary. Assuming
			 * 64 bytes per line this ensures the entire loop is in one line.
			 */
			.p2align	L1_CACHE_SHIFT
		.Lcpy_body_large:
			/* Pre-load 64 bytes of data. */
			ldp	A_l, A_h, [src, #-16]
			ldp	B_l, B_h, [src, #-32]
			ldp	C_l, C_h, [src, #-48]
			ldp	D_l, D_h, [src, #-64]!
		1:
			/*
			 * Interleave the loads of the next 64-byte block with the stores
			 * of the block loaded on the previous pass.
			 */
			stp	A_l, A_h, [dst, #-16]
			ldp	A_l, A_h, [src, #-16]
			stp	B_l, B_h, [dst, #-32]
			ldp	B_l, B_h, [src, #-32]
			stp	C_l, C_h, [dst, #-48]
			ldp	C_l, C_h, [src, #-48]
			stp	D_l, D_h, [dst, #-64]!
			ldp	D_l, D_h, [src, #-64]!
			subs	count, count, #64
			b.ge	1b
			/* Store the final block that is still held in registers. */
			stp	A_l, A_h, [dst, #-16]
			stp	B_l, B_h, [dst, #-32]
			stp	C_l, C_h, [dst, #-48]
			stp	D_l, D_h, [dst, #-64]!

			tst	count, #0x3f
			b.ne	.Ltail63
			ret
		ENDPROC(memmove)