arm64: Change memcpy in kernel to use the copy template file (e5c88e3f) · Commits · e / devices / android_kernel_teracube_emerald

arch/arm64/lib/copy_template.S

0 → 100644

+193 −0

Original line number	Original line	Diff line number	Diff line
			/*
			* Copyright (C) 2013 ARM Ltd.
			* Copyright (C) 2013 Linaro.
			*
			* This code is based on glibc cortex strings work originally authored by Linaro
			* and re-licensed under GPLv2 for the Linux kernel. The original code can
			* be found @
			*
			* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
			* files/head:/src/aarch64/
			*
			* This program is free software; you can redistribute it and/or modify
			* it under the terms of the GNU General Public License version 2 as
			* published by the Free Software Foundation.
			*
			* This program is distributed in the hope that it will be useful,
			* but WITHOUT ANY WARRANTY; without even the implied warranty of
			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
			* GNU General Public License for more details.
			*
			* You should have received a copy of the GNU General Public License
			* along with this program. If not, see <http://www.gnu.org/licenses/>.
			*/


			/*
			* Copy a buffer from src to dest (alignment handled by the hardware)
			*
			* This is a template, not a complete function: it has no entry label
			* and ends at the .Lexitfunc label without returning. memcpy.S includes
			* it between ENTRY(memcpy) and ret. The including file must define the
			* post-incrementing access macros used throughout:
			*   ldrb1/strb1, ldrh1/strh1, ldr1/str1   reg, ptr, increment
			*   ldp1/stp1                             reg, reg, ptr, increment
			*
			* Parameters:
			* x0 - dest
			* x1 - src
			* x2 - n
			* Returns:
			* x0 - dest
			*/
			dstin .req x0	/* dest as passed in; only read, so x0 still holds dest at exit */
			src .req x1	/* source cursor, advanced by every load macro */
			count .req x2	/* bytes left to copy; may go negative in the large-copy paths */
			tmp1 .req x3	/* scratch / data register for 8-byte moves */
			tmp1w .req w3	/* 32-bit view of tmp1 for 1-, 2- and 4-byte moves */
			tmp2 .req x4	/* number of bytes needed to 16-byte-align src */
			tmp2w .req w4	/* 32-bit view of tmp2; not referenced in this template */
			dst .req x6	/* destination cursor, advanced by every store macro */

			/* Four 16-byte register pairs used by ldp1/stp1 to move 64 bytes at a time. */
			A_l .req x7
			A_h .req x8
			B_l .req x9
			B_h .req x10
			C_l .req x11
			C_h .req x12
			D_l .req x13
			D_h .req x14

			/*
			* Entry. Work on a copy of the destination pointer so that dstin (x0)
			* is left untouched as the return value.
			*/
			mov dst, dstin
			cmp count, #16
			/* When the length is less than 16, the accesses are not aligned. */
			b.lo .Ltiny15

			neg tmp2, src
			ands tmp2, tmp2, #15/* Bytes to reach alignment. */
			b.eq .LSrcAligned
			sub count, count, tmp2
			/*
			* Copy the leading bytes from src to dst in increasing address
			* order. This removes the risk of overwriting source data that has
			* not been read yet when the distance between src and dst is less
			* than 16. Each set bit of tmp2 selects one move of that size
			* (1, 2, 4, then 8 bytes), so every load here is naturally aligned
			* with respect to src and src ends up 16-byte aligned.
			*/
			tbz tmp2, #0, 1f
			ldrb1 tmp1w, src, #1
			strb1 tmp1w, dst, #1
			1:
			tbz tmp2, #1, 2f
			ldrh1 tmp1w, src, #2
			strh1 tmp1w, dst, #2
			2:
			tbz tmp2, #2, 3f
			ldr1 tmp1w, src, #4
			str1 tmp1w, dst, #4
			3:
			tbz tmp2, #3, .LSrcAligned
			ldr1 tmp1, src, #8
			str1 tmp1, dst, #8

			/*
			* src is now 16-byte aligned (or was already). Copies of 64 bytes or
			* more go to the bulk paths; b.ge is a signed test, so count is
			* treated as a signed quantity here.
			*/
			.LSrcAligned:
			cmp count, #64
			b.ge .Lcpy_over64
			/*
			* Deal with small copies quickly by dropping straight into the
			* exit block.
			*/
			.Ltail63:
			/*
			* Copy up to 48 bytes of data. At this point we only need the
			* bottom 6 bits of count to be accurate.
			*
			* tmp1 = count & 0x30 is the size of the 16-byte-chunk part of the
			* tail (0, 16, 32 or 48). The three ldp1/stp1 pairs below form a
			* fall-through ladder: 48 enters at the top, 32 at label 1, 16 at
			* label 2, and 0 skips the ladder entirely.
			*/
			ands tmp1, count, #0x30
			b.eq .Ltiny15
			cmp tmp1w, #0x20
			b.eq 1f
			b.lt 2f
			ldp1 A_l, A_h, src, #16
			stp1 A_l, A_h, dst, #16
			1:
			ldp1 A_l, A_h, src, #16
			stp1 A_l, A_h, dst, #16
			2:
			ldp1 A_l, A_h, src, #16
			stp1 A_l, A_h, dst, #16
			.Ltiny15:
			/*
			* Copy the final 0-15 bytes, selected by bits 3..0 of count, as
			* separate 8-, 4-, 2- and 1-byte moves in increasing address order.
			*
			* The original cortex-strings memcpy instead moved src back to an
			* aligned address and copied one 16-byte block from (src-16) to
			* (dst-16). Keeping that approach would force memmove to guarantee
			* that src is at least 16 bytes above dst before calling memcpy
			* directly; otherwise some source data would be overwritten before
			* it is read. Splitting the ldp/stp into smaller moves keeps memmove
			* simple and removes memcpy's dependency on memmove.
			*/
			tbz count, #3, 1f
			ldr1 tmp1, src, #8
			str1 tmp1, dst, #8
			1:
			tbz count, #2, 2f
			ldr1 tmp1w, src, #4
			str1 tmp1w, dst, #4
			2:
			tbz count, #1, 3f
			ldrh1 tmp1w, src, #2
			strh1 tmp1w, dst, #2
			3:
			tbz count, #0, .Lexitfunc
			ldrb1 tmp1w, src, #1
			strb1 tmp1w, dst, #1

			/* Skip over the bulk-copy code to the end of the template. */
			b .Lexitfunc

			/*
			* At least 64 bytes remain. Bias count by -128 up front: a
			* non-negative result means 128 bytes or more and selects the
			* pipelined loop.
			*/
			.Lcpy_over64:
			subs count, count, #128
			b.ge .Lcpy_body_large
			/*
			* Less than 128 bytes to copy, so handle 64 here and then jump
			* to the tail.
			*
			* count is now negative, but subtracting 128 leaves its bottom
			* 6 bits unchanged, and those bits are all that .Ltail63 looks at.
			*/
			ldp1 A_l, A_h, src, #16
			stp1 A_l, A_h, dst, #16
			ldp1 B_l, B_h, src, #16
			ldp1 C_l, C_h, src, #16
			stp1 B_l, B_h, dst, #16
			stp1 C_l, C_h, dst, #16
			ldp1 D_l, D_h, src, #16
			stp1 D_l, D_h, dst, #16

			/* Any remainder below 64 bytes goes to the tail; otherwise we are done. */
			tst count, #0x3f
			b.ne .Ltail63
			b .Lexitfunc

			/*
			* Critical loop. Start at a new cache line boundary. Assuming
			* 64 bytes per line this ensures the entire loop is in one line.
			* L1_CACHE_SHIFT is not defined in this file; it must be provided
			* by the including file or its headers.
			*/
			.p2align L1_CACHE_SHIFT
			.Lcpy_body_large:
			/*
			* pre-get 64 bytes data. On entry count holds the original
			* length minus 128, which is non-negative.
			*/
			ldp1 A_l, A_h, src, #16
			ldp1 B_l, B_h, src, #16
			ldp1 C_l, C_h, src, #16
			ldp1 D_l, D_h, src, #16
			1:
			/*
			* interlace the load of next 64 bytes data block with store of the last
			* loaded 64 bytes data. Each register pair is stored before it is
			* reloaded, so the loop always holds exactly one 64-byte block.
			*/
			stp1 A_l, A_h, dst, #16
			ldp1 A_l, A_h, src, #16
			stp1 B_l, B_h, dst, #16
			ldp1 B_l, B_h, src, #16
			stp1 C_l, C_h, dst, #16
			ldp1 C_l, C_h, src, #16
			stp1 D_l, D_h, dst, #16
			ldp1 D_l, D_h, src, #16
			subs count, count, #64
			b.ge 1b
			/* Drain: store the last 64-byte block that the loop loaded. */
			stp1 A_l, A_h, dst, #16
			stp1 B_l, B_h, dst, #16
			stp1 C_l, C_h, dst, #16
			stp1 D_l, D_h, dst, #16

			/* The bottom 6 bits of count still give the bytes left for the tail. */
			tst count, #0x3f
			b.ne .Ltail63
			/*
			* End of the template. There is no return here: execution falls
			* through to whatever the including file places after this point.
			*/
			.Lexitfunc:

arch/arm64/lib/memcpy.S

+26 −153

Original line number	Original line	Diff line number	Diff line
	@@ -36,166 +36,39 @@
	* Returns:		* Returns:
	* x0 - dest		* x0 - dest
	*/		*/
	dstin .req x0		.macro ldrb1 ptr, regB, val
	src .req x1		ldrb \ptr, [\regB], \val
	count .req x2		.endm
	tmp1 .req x3
	tmp1w .req w3
	tmp2 .req x4
	tmp2w .req w4
	tmp3 .req x5
	tmp3w .req w5
	dst .req x6

	A_l .req x7		.macro strb1 ptr, regB, val
	A_h .req x8		strb \ptr, [\regB], \val
	B_l .req x9		.endm
	B_h .req x10
	C_l .req x11
	C_h .req x12
	D_l .req x13
	D_h .req x14

	ENTRY(memcpy)		.macro ldrh1 ptr, regB, val
	mov dst, dstin		ldrh \ptr, [\regB], \val
	cmp count, #16		.endm
	/* When the memory length is less than 16, the accesses are not aligned. */
	b.lo .Ltiny15

	neg tmp2, src		.macro strh1 ptr, regB, val
	ands tmp2, tmp2, #15/* Bytes to reach alignment. */		strh \ptr, [\regB], \val
	b.eq .LSrcAligned		.endm
	sub count, count, tmp2
	/*
	* Copy the leading memory data from src to dst in an increasing
	* address order.By this way,the risk of overwritting the source
	* memory data is eliminated when the distance between src and
	* dst is less than 16. The memory accesses here are alignment.
	*/
	tbz tmp2, #0, 1f
	ldrb tmp1w, [src], #1
	strb tmp1w, [dst], #1
	1:
	tbz tmp2, #1, 2f
	ldrh tmp1w, [src], #2
	strh tmp1w, [dst], #2
	2:
	tbz tmp2, #2, 3f
	ldr tmp1w, [src], #4
	str tmp1w, [dst], #4
	3:
	tbz tmp2, #3, .LSrcAligned
	ldr tmp1, [src],#8
	str tmp1, [dst],#8

	.LSrcAligned:		.macro ldr1 ptr, regB, val
	cmp count, #64		ldr \ptr, [\regB], \val
	b.ge .Lcpy_over64		.endm
	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
	.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
	ands tmp1, count, #0x30
	b.eq .Ltiny15
	cmp tmp1w, #0x20
	b.eq 1f
	b.lt 2f
	ldp A_l, A_h, [src], #16
	stp A_l, A_h, [dst], #16
	1:
	ldp A_l, A_h, [src], #16
	stp A_l, A_h, [dst], #16
	2:
	ldp A_l, A_h, [src], #16
	stp A_l, A_h, [dst], #16
	.Ltiny15:
	/*
	* Prefer to break one ldp/stp into several load/store to access
	* memory in an increasing address order,rather than to load/store 16
	* bytes from (src-16) to (dst-16) and to backward the src to aligned
	* address,which way is used in original cortex memcpy. If keeping
	* the original memcpy process here, memmove need to satisfy the
	* precondition that src address is at least 16 bytes bigger than dst
	* address,otherwise some source data will be overwritten when memove
	* call memcpy directly. To make memmove simpler and decouple the
	* memcpy's dependency on memmove, withdrew the original process.
	*/
	tbz count, #3, 1f
	ldr tmp1, [src], #8
	str tmp1, [dst], #8
	1:
	tbz count, #2, 2f
	ldr tmp1w, [src], #4
	str tmp1w, [dst], #4
	2:
	tbz count, #1, 3f
	ldrh tmp1w, [src], #2
	strh tmp1w, [dst], #2
	3:
	tbz count, #0, .Lexitfunc
	ldrb tmp1w, [src]
	strb tmp1w, [dst]

	.Lexitfunc:		.macro str1 ptr, regB, val
	ret		str \ptr, [\regB], \val
			.endm

	.Lcpy_over64:		.macro ldp1 ptr, regB, regC, val
	subs count, count, #128		ldp \ptr, \regB, [\regC], \val
	b.ge .Lcpy_body_large		.endm
	/*
	* Less than 128 bytes to copy, so handle 64 here and then jump
	* to the tail.
	*/
	ldp A_l, A_h, [src],#16
	stp A_l, A_h, [dst],#16
	ldp B_l, B_h, [src],#16
	ldp C_l, C_h, [src],#16
	stp B_l, B_h, [dst],#16
	stp C_l, C_h, [dst],#16
	ldp D_l, D_h, [src],#16
	stp D_l, D_h, [dst],#16

	tst count, #0x3f		.macro stp1 ptr, regB, regC, val
	b.ne .Ltail63		stp \ptr, \regB, [\regC], \val
	ret		.endm

	/*		ENTRY(memcpy)
	* Critical loop. Start at a new cache line boundary. Assuming		#include "copy_template.S"
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align L1_CACHE_SHIFT
	.Lcpy_body_large:
	/* pre-get 64 bytes data. */
	ldp A_l, A_h, [src],#16
	ldp B_l, B_h, [src],#16
	ldp C_l, C_h, [src],#16
	ldp D_l, D_h, [src],#16
	1:
	/*
	* interlace the load of next 64 bytes data block with store of the last
	* loaded 64 bytes data.
	*/
	stp A_l, A_h, [dst],#16
	ldp A_l, A_h, [src],#16
	stp B_l, B_h, [dst],#16
	ldp B_l, B_h, [src],#16
	stp C_l, C_h, [dst],#16
	ldp C_l, C_h, [src],#16
	stp D_l, D_h, [dst],#16
	ldp D_l, D_h, [src],#16
	subs count, count, #64
	b.ge 1b
	stp A_l, A_h, [dst],#16
	stp B_l, B_h, [dst],#16
	stp C_l, C_h, [dst],#16
	stp D_l, D_h, [dst],#16

	tst count, #0x3f
	b.ne .Ltail63
	ret		ret
	ENDPROC(memcpy)		ENDPROC(memcpy)