Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 886cbdc0 authored by Wilco Dijkstra's avatar Wilco Dijkstra Committed by Andrea
Browse files

arm64: Use optimized memcmp

Patch written by Wilco Dijkstra submitted for review to newlib:
https://sourceware.org/ml/newlib/2017/msg00524.html



This is an optimized memcmp for AArch64.  This is a complete rewrite
using a different algorithm.  The previous version split into cases
where both inputs were aligned, the inputs were mutually aligned and
unaligned using a byte loop.  The new version combines all these cases,
while small inputs of less than 8 bytes are handled separately.

This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared.  After the first 8 bytes,
align the first input.  This ensures each iteration does at most one
unaligned access and mutually aligned inputs behave as aligned.
After the main loop, process the last 8 bytes using unaligned accesses.

This improves performance of (mutually) aligned cases by 25% and
unaligned by >500% (yes >6 times faster) on large inputs.

2017-06-28  Wilco Dijkstra  <wdijkstr@arm.com>

        * bionic/libc/arch-arm64/generic/bionic/memcmp.S (memcmp):
                Rewrite of optimized memcmp.

GLIBC benchtests/bench-memcmp.c performance comparison for Cortex-A53:

Length    1, alignment  1/ 1:        153%
Length    1, alignment  1/ 1:        119%
Length    1, alignment  1/ 1:        154%
Length    2, alignment  2/ 2:        121%
Length    2, alignment  2/ 2:        140%
Length    2, alignment  2/ 2:        121%
Length    3, alignment  3/ 3:        105%
Length    3, alignment  3/ 3:        105%
Length    3, alignment  3/ 3:        105%
Length    4, alignment  4/ 4:        155%
Length    4, alignment  4/ 4:        154%
Length    4, alignment  4/ 4:        161%
Length    5, alignment  5/ 5:        173%
Length    5, alignment  5/ 5:        173%
Length    5, alignment  5/ 5:        173%
Length    6, alignment  6/ 6:        145%
Length    6, alignment  6/ 6:        145%
Length    6, alignment  6/ 6:        145%
Length    7, alignment  7/ 7:        125%
Length    7, alignment  7/ 7:        125%
Length    7, alignment  7/ 7:        125%
Length    8, alignment  8/ 8:        111%
Length    8, alignment  8/ 8:        130%
Length    8, alignment  8/ 8:        124%
Length    9, alignment  9/ 9:        160%
Length    9, alignment  9/ 9:        160%
Length    9, alignment  9/ 9:        150%
Length   10, alignment 10/10:        170%
Length   10, alignment 10/10:        137%
Length   10, alignment 10/10:        150%
Length   11, alignment 11/11:        160%
Length   11, alignment 11/11:        160%
Length   11, alignment 11/11:        160%
Length   12, alignment 12/12:        146%
Length   12, alignment 12/12:        168%
Length   12, alignment 12/12:        156%
Length   13, alignment 13/13:        167%
Length   13, alignment 13/13:        167%
Length   13, alignment 13/13:        173%
Length   14, alignment 14/14:        167%
Length   14, alignment 14/14:        168%
Length   14, alignment 14/14:        168%
Length   15, alignment 15/15:        168%
Length   15, alignment 15/15:        173%
Length   15, alignment 15/15:        173%
Length    1, alignment  0/ 0:        134%
Length    1, alignment  0/ 0:        127%
Length    1, alignment  0/ 0:        119%
Length    2, alignment  0/ 0:        94%
Length    2, alignment  0/ 0:        94%
Length    2, alignment  0/ 0:        106%
Length    3, alignment  0/ 0:        82%
Length    3, alignment  0/ 0:        87%
Length    3, alignment  0/ 0:        82%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        122%
Length    5, alignment  0/ 0:        127%
Length    5, alignment  0/ 0:        119%
Length    5, alignment  0/ 0:        127%
Length    6, alignment  0/ 0:        103%
Length    6, alignment  0/ 0:        100%
Length    6, alignment  0/ 0:        100%
Length    7, alignment  0/ 0:        82%
Length    7, alignment  0/ 0:        91%
Length    7, alignment  0/ 0:        87%
Length    8, alignment  0/ 0:        111%
Length    8, alignment  0/ 0:        124%
Length    8, alignment  0/ 0:        124%
Length    9, alignment  0/ 0:        136%
Length    9, alignment  0/ 0:        136%
Length    9, alignment  0/ 0:        136%
Length   10, alignment  0/ 0:        136%
Length   10, alignment  0/ 0:        135%
Length   10, alignment  0/ 0:        136%
Length   11, alignment  0/ 0:        136%
Length   11, alignment  0/ 0:        136%
Length   11, alignment  0/ 0:        135%
Length   12, alignment  0/ 0:        136%
Length   12, alignment  0/ 0:        136%
Length   12, alignment  0/ 0:        136%
Length   13, alignment  0/ 0:        135%
Length   13, alignment  0/ 0:        136%
Length   13, alignment  0/ 0:        136%
Length   14, alignment  0/ 0:        136%
Length   14, alignment  0/ 0:        136%
Length   14, alignment  0/ 0:        136%
Length   15, alignment  0/ 0:        136%
Length   15, alignment  0/ 0:        136%
Length   15, alignment  0/ 0:        136%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        115%
Length   32, alignment  0/ 0:        127%
Length   32, alignment  7/ 2:        395%
Length   32, alignment  0/ 0:        127%
Length   32, alignment  0/ 0:        127%
Length    8, alignment  0/ 0:        111%
Length    8, alignment  0/ 0:        124%
Length    8, alignment  0/ 0:        124%
Length   64, alignment  0/ 0:        128%
Length   64, alignment  6/ 4:        475%
Length   64, alignment  0/ 0:        131%
Length   64, alignment  0/ 0:        134%
Length   16, alignment  0/ 0:        128%
Length   16, alignment  0/ 0:        119%
Length   16, alignment  0/ 0:        128%
Length  128, alignment  0/ 0:        129%
Length  128, alignment  5/ 6:        475%
Length  128, alignment  0/ 0:        130%
Length  128, alignment  0/ 0:        129%
Length   32, alignment  0/ 0:        126%
Length   32, alignment  0/ 0:        126%
Length   32, alignment  0/ 0:        126%
Length  256, alignment  0/ 0:        127%
Length  256, alignment  4/ 8:        545%
Length  256, alignment  0/ 0:        126%
Length  256, alignment  0/ 0:        128%
Length   64, alignment  0/ 0:        171%
Length   64, alignment  0/ 0:        171%
Length   64, alignment  0/ 0:        174%
Length  512, alignment  0/ 0:        126%
Length  512, alignment  3/10:        585%
Length  512, alignment  0/ 0:        126%
Length  512, alignment  0/ 0:        127%
Length  128, alignment  0/ 0:        129%
Length  128, alignment  0/ 0:        128%
Length  128, alignment  0/ 0:        129%
Length 1024, alignment  0/ 0:        125%
Length 1024, alignment  2/12:        611%
Length 1024, alignment  0/ 0:        126%
Length 1024, alignment  0/ 0:        126%
Length  256, alignment  0/ 0:        128%
Length  256, alignment  0/ 0:        127%
Length  256, alignment  0/ 0:        128%
Length 2048, alignment  0/ 0:        125%
Length 2048, alignment  1/14:        625%
Length 2048, alignment  0/ 0:        125%
Length 2048, alignment  0/ 0:        125%
Length  512, alignment  0/ 0:        126%
Length  512, alignment  0/ 0:        127%
Length  512, alignment  0/ 0:        127%
Length 4096, alignment  0/ 0:        125%
Length 4096, alignment  0/16:        125%
Length 4096, alignment  0/ 0:        125%
Length 4096, alignment  0/ 0:        125%
Length 1024, alignment  0/ 0:        126%
Length 1024, alignment  0/ 0:        126%
Length 1024, alignment  0/ 0:        126%
Length 8192, alignment  0/ 0:        125%
Length 8192, alignment 63/18:        636%
Length 8192, alignment  0/ 0:        125%
Length 8192, alignment  0/ 0:        125%
Length   16, alignment  1/ 2:        317%
Length   16, alignment  1/ 2:        317%
Length   16, alignment  1/ 2:        317%
Length   32, alignment  2/ 4:        395%
Length   32, alignment  2/ 4:        395%
Length   32, alignment  2/ 4:        398%
Length   64, alignment  3/ 6:        475%
Length   64, alignment  3/ 6:        475%
Length   64, alignment  3/ 6:        477%
Length  128, alignment  4/ 8:        479%
Length  128, alignment  4/ 8:        479%
Length  128, alignment  4/ 8:        479%
Length  256, alignment  5/10:        543%
Length  256, alignment  5/10:        539%
Length  256, alignment  5/10:        543%
Length  512, alignment  6/12:        585%
Length  512, alignment  6/12:        585%
Length  512, alignment  6/12:        585%
Length 1024, alignment  7/14:        611%
Length 1024, alignment  7/14:        611%
Length 1024, alignment  7/14:        611%

Signed-off-by: default avatarPranav Vashi <neobuddy89@gmail.com>
parent 8fe41ba5
Loading
Loading
Loading
Loading
+115 −242
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 * Copyright (c) 2017 ARM Ltd
 * All rights reserved.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * ARMv8-a, AArch64, unaligned accesses.
 */

/* includes here */
#include <linux/linkage.h>
#include <asm/assembler.h>

/*
* compare memory areas(when two memory areas' offset are different,
* alignment handled by the hardware)
*
* Parameters:
*  x0 - const memory area 1 pointer
*  x1 - const memory area 2 pointer
*  x2 - the maximal compare byte length
* Returns:
*  x0 - a compare result, maybe less than, equal to, or greater than ZERO
*/

/* Parameters and result.  */
src1		.req	x0
src2		.req	x1
limit		.req	x2
result		.req	x0
#define src1		x0
#define src2		x1
#define limit		x2
#define result		w0

/* Internal variables.  */
data1		.req	x3
data1w		.req	w3
data2		.req	x4
data2w		.req	w4
has_nul		.req	x5
diff		.req	x6
endloop		.req	x7
tmp1		.req	x8
tmp2		.req	x9
tmp3		.req	x10
pos		.req	x11
limit_wd	.req	x12
mask		.req	x13

#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define tmp1		x5

/* Small inputs of less than 8 bytes are handled separately.  This allows the
   main code to be sped up using unaligned loads since there are now at least
   8 bytes to be compared.  If the first 8 bytes are equal, align src1.
   This ensures each iteration does at most one unaligned access even if both
   src1 and src2 are unaligned, and mutually aligned inputs behave as if
   aligned.  After the main loop, process the last 8 bytes using unaligned
   accesses.  */

.p2align 6
ENTRY(memcmp)
	cbz	limit, .Lret0
	eor	tmp1, src1, src2
	tst	tmp1, #7
	b.ne	.Lmisaligned8
	ands	tmp1, src1, #7
	b.ne	.Lmutual_align
	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #3 /* Convert to Dwords.  */
	/*
	* The input source addresses are at alignment boundary.
	* Directly compare eight bytes each time.
	*/
.Lloop_aligned:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned:
	subs	limit_wd, limit_wd, #1
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, cs	/* Last Dword or differences.  */
	cbz	endloop, .Lloop_aligned

	/* Not reached the limit, must have found a diff.  */
	tbz	limit_wd, #63, .Lnot_limit

	/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
	ands	limit, limit, #7
	b.eq	.Lnot_limit
	/*
	* The remained bytes less than 8. It is needed to extract valid data
	* from last eight bytes of the intended memory range.
	*/
	lsl	limit, limit, #3	/* bytes-> bits.  */
	mov	mask, #~0
CPU_BE( lsr	mask, mask, limit )
CPU_LE( lsl	mask, mask, limit )
	bic	data1, data1, mask
	bic	data2, data2, mask

	orr	diff, diff, mask
	b	.Lnot_limit

.Lmutual_align:
	/*
	* Sources are mutually aligned, but are not currently at an
	* alignment boundary. Round down the addresses and then mask off
	* the bytes that precede the start point.
	*/
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	/*
	* We can not add limit with alignment offset(tmp1) here. Since the
	* addition probably make the limit overflown.
	*/
	sub	limit_wd, limit, #1/*limit != 0, so no underflow.*/
	and	tmp3, limit_wd, #7
	lsr	limit_wd, limit_wd, #3
	add	tmp3, tmp3, tmp1
	add	limit_wd, limit_wd, tmp3, lsr #3
	add	limit, limit, tmp1/* Adjust the limit for the extra.  */

	lsl	tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
	neg	tmp1, tmp1/* Bits to alignment -64.  */
	mov	tmp2, #~0
	/*mask off the non-intended bytes before the start address.*/
CPU_BE( lsl	tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
	/* Little-endian.  Early bytes are at LSB.  */
CPU_LE( lsr	tmp2, tmp2, tmp1 )

	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	.Lstart_realigned

	/*src1 and src2 have different alignment offset.*/
.Lmisaligned8:
	cmp	limit, #8
	b.lo	.Ltiny8proc /*limit < 8: compare byte by byte*/

	and	tmp1, src1, #7
	neg	tmp1, tmp1
	add	tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
	and	tmp2, src2, #7
	neg	tmp2, tmp2
	add	tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
	subs	tmp3, tmp1, tmp2
	csel	pos, tmp1, tmp2, hi /*Choose the maximum.*/

	sub	limit, limit, pos
	/*compare the proceeding bytes in the first 8 byte segment.*/
.Ltinycmp:
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	pos, pos, #1
	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
	b.eq	.Ltinycmp
	cbnz	pos, 1f /*diff occurred before the last byte.*/
	cmp	data1w, data2w
	b.eq	.Lstart_align
1:
	sub	result, data1, data2
	subs	limit, limit, 8
	b.lo	.Lless8

	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	and	tmp1, src1, 7
	add	limit, limit, tmp1
	cmp	data1, data2
	bne	.Lreturn

	/* Align src1 and adjust src2 with bytes not yet done.  */
	sub	src1, src1, tmp1
	sub	src2, src2, tmp1

	subs	limit, limit, 8
	b.ls	.Llast_bytes

	/* Loop performing 8 bytes per iteration using aligned src1.
	   Limit is pre-decremented by 8 and must be larger than zero.
	   Exit if <= 8 bytes left to do or if the data is not equal.  */
	.p2align 4
.Lloop8:
	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	subs	limit, limit, 8
	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
	b.eq	.Lloop8

	cmp	data1, data2
	bne	.Lreturn

	/* Compare last 1-8 bytes using unaligned access.  */
.Llast_bytes:
	ldr	data1, [src1, limit]
	ldr	data2, [src2, limit]

	/* Compare data bytes and set return value to 0, -1 or 1.  */
.Lreturn:
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	cmp     data1, data2
.Lret_eq:
	cset	result, ne
	cneg	result, result, lo
        ret

.Lstart_align:
	lsr	limit_wd, limit, #3
	cbz	limit_wd, .Lremain8

	ands	xzr, src1, #7
	b.eq	.Lrecal_offset
	/*process more leading bytes to make src1 aligned...*/
	add	src1, src1, tmp3 /*backwards src1 to alignment boundary*/
	add	src2, src2, tmp3
	sub	limit, limit, tmp3
	lsr	limit_wd, limit, #3
	cbz	limit_wd, .Lremain8
	/*load 8 bytes from aligned SRC1..*/
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8

	subs	limit_wd, limit_wd, #1
	eor	diff, data1, data2  /*Non-zero if differences found.*/
	csinv	endloop, diff, xzr, ne
	cbnz	endloop, .Lunequal_proc
	/*How far is the current SRC2 from the alignment boundary...*/
	and	tmp3, tmp3, #7

.Lrecal_offset:/*src1 is aligned now..*/
	neg	pos, tmp3
.Lloopcmp_proc:
	/*
	* Divide the eight bytes into two parts. First,backwards the src2
	* to an alignment boundary,load eight bytes and compare from
	* the SRC2 alignment boundary. If all 8 bytes are equal,then start
	* the second part's comparison. Otherwise finish the comparison.
	* This special handle can garantee all the accesses are in the
	* thread/task space in avoid to overrange access.
	*/
	ldr	data1, [src1,pos]
	ldr	data2, [src2,pos]
	eor	diff, data1, data2  /* Non-zero if differences found.  */
	cbnz	diff, .Lnot_limit

	/*The second part process*/
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	eor	diff, data1, data2  /* Non-zero if differences found.  */
	subs	limit_wd, limit_wd, #1
	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
	cbz	endloop, .Lloopcmp_proc
.Lunequal_proc:
	cbz	diff, .Lremain8

/* There is difference occurred in the latest comparison. */
.Lnot_limit:
/*
* For little endian,reverse the low significant equal bits into MSB,then
* following CLZ can find how many equal bits exist.
*/
CPU_LE( rev	diff, diff )
CPU_LE( rev	data1, data1 )
CPU_LE( rev	data2, data2 )

	/*
	* The MS-non-zero bit of DIFF marks either the first bit
	* that is different, or the end of the significant data.
	* Shifting left now will bring the critical information into the
	* top bits.
	*/
	clz	pos, diff
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/*
	* We need to zero-extend (char is unsigned) the value and then
	* perform a signed subtraction.
	*/
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret

.Lremain8:
	/* Limit % 8 == 0 =>. all data are equal.*/
	ands	limit, limit, #7
	b.eq	.Lret0

.Ltiny8proc:
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1

	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000. */
	b.eq	.Ltiny8proc
	sub	result, data1, data2
	ret
.Lret0:
	mov	result, #0
	.p2align 4
	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
.Lless8:
	adds	limit, limit, 4
	b.lo	.Lless4
	ldr	data1w, [src1], 4
	ldr	data2w, [src2], 4
	cmp	data1w, data2w
	b.ne	.Lreturn
	sub	limit, limit, 4
.Lless4:
	adds	limit, limit, 4
	beq	.Lret_eq
.Lbyte_loop:
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	subs	limit, limit, 1
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
	b.eq	.Lbyte_loop
	sub	result, data1w, data2w
	ret
ENDPIPROC(memcmp)