Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 249ac17e authored by Chris Zankel's avatar Chris Zankel Committed by Linus Torvalds
Browse files

[PATCH] xtensa: Architecture support for Tensilica Xtensa Part 4



The attached patches provide part 4 of an architecture implementation for the
Tensilica Xtensa CPU series.

Signed-off-by: default avatarChris Zankel <chris@zankel.net>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 5a0015d6
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
#
# Makefile for Xtensa-specific library files.
#

# Objects that are always added to lib-y.
lib-y	+= memcopy.o memset.o checksum.o strcasecmp.o \
	   usercopy.o strncpy_user.o strnlen_user.o
# pci-auto.o is appended to lib-$(CONFIG_PCI), so it joins lib-y only when
# CONFIG_PCI expands to y.
lib-$(CONFIG_PCI) += pci-auto.o
+410 −0
Original line number Diff line number Diff line
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>
#include <linux/linkage.h>
#define _ASMLANGUAGE
#include <xtensa/config/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */

/*
 * ONES_ADD converts twos-complement math to ones-complement.
 *
 * Expands to: sum += val; then, if the unsigned result is below val (the
 * 32-bit add wrapped around), sum += 1 so the carry is folded back in.
 * The skip target is the numeric local label 99, referenced as 99f, so the
 * macro can be expanded any number of times in a row.
 */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;

.text
/*
 * csum_partial: add `len` bytes starting at `buf` into the 32-bit
 * ones-complement accumulator `sum`.
 *
 *   a2 = buf (advanced as data is consumed; receives the sum before retw)
 *   a3 = len
 *   a4 = sum
 *   a5 = chunk count, or loop end address when XCHAL_HAVE_LOOPS is 0
 *   a6, a7, a8 = scratch for loaded data
 *
 * When XCHAL_HAVE_LOOPS is 0 each loop ends by comparing the advancing
 * pointer with an end address.  Addresses are unsigned, so the comparison
 * is bltu: a signed blt would leave the loop after a single pass for a
 * buffer that straddles 0x80000000.
 */
ENTRY(csum_partial)
	  /*
	   * Experiments with Ethernet and SLIP connections show that buf
	   * is aligned on either a 2-byte or 4-byte boundary.
	   */
	entry	sp, 32
	extui	a5, a2, 0, 2	/* a5 = low two bits of buf */
	bnez	a5, 8f		/* branch if buf is not 4-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop1	/* unsigned: a2 and a5 are addresses */
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop2	/* unsigned: a2 and a5 are addresses */
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4		/* hand the accumulated sum back in a2 */
	retw

	/* uncommon case, buf is not 4-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 *
	 * Each 32-bit word is assembled from a byte load, a halfword load
	 * at the (even) address buf+1, and another byte load.  The bit
	 * positions in the load comments are the big-endian placement; the
	 * little-endian build shifts the last byte to the top instead.
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop3	/* unsigned: a2 and a5 are addresses */
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */



/*
 * Copy from src to dst while checksumming, otherwise like csum_partial.
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so that a separate exception handler can be used for each access type.
 *
 * Each macro emits instruction `y` under the numeric label 9999 and adds
 * an entry to the __ex_table section that pairs the address of that
 * instruction with a fixup label: 6001 for SRC (source access) and 6002
 * for DST (destination access).  Both fixup labels are referenced as
 * forward references (6001f / 6002f).
 */

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
					int sum, int *src_err_ptr, int *dst_err_ptr)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a8  = temp
	a9  = temp
	a10 = temp
	a11 = original len for exception handling
	a12 = original dst for exception handling

    Copies len bytes from src to dst and adds them into the ones-complement
    sum, which is moved to a2 before returning.  Every load goes through
    SRC() and every store through DST(), so a faulting access is routed to
    fixup label 6001 or 6002 respectively.

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.

    When XCHAL_HAVE_LOOPS is 0 the loops end by comparing the advancing
    source pointer with an end address.  Addresses are unsigned, so the
    comparison is bltu; a signed blt would leave the loop after a single
    pass for a buffer that straddles 0x80000000.
 */

ENTRY(csum_partial_copy_generic)
	entry	sp, 32
	mov	a12, a3		/* keep original dst for the fixup code */
	mov	a11, a4		/* keep original len for the fixup code */
	or	a10, a2, a3	/* combined low bits of src and dst */

	/* We optimize the following alignment tests for the 4-byte
	aligned case.  Two bbsi.l instructions might seem more optimal
	(commented out below).  However, both labels 5: and 3: are out
	of the imm8 range, so the assembler relaxes them into
	equivalent bbci.l, j combinations, which is actually
	slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop5	/* unsigned: a2 and a10 are addresses */
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop6	/* unsigned: a2 and a10 are addresses */
#endif
3:
	/*
	Control comes to here in two cases: (1) It may fall through
	to here from the 4-byte alignment case to process, at most,
	one 2-byte chunk.  (2) It branches to here from above if
	either src or dst is 2-byte aligned, and we process all bytes
	here, except for perhaps a trailing odd byte.  It's
	inefficient, so align your addresses to 4-byte boundaries.

	a2 = src
	a3 = dst
	a4 = len
	a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop7	/* unsigned: a2 and a10 are addresses */
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5		/* hand the accumulated sum back in a2 */
	retw

5:
	/* Control branch to here when either src or dst is odd.  We
	process all bytes using 8-bit accesses.  Grossly inefficient,
	so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop8	/* unsigned: a2 and a10 are addresses */
#endif
6:
	j	4b		/* process the possible trailing odd byte */


# Exception handler:
.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling

	6001 is the fixup target of every SRC() access, 6002 of every DST()
	access.  Both run in the register window of
	csum_partial_copy_generic and leave it with retw, with a2 = 0.
*/

	/*
	 * Source fault: store -EFAULT through src_err_ptr, then zero the
	 * whole destination buffer (a11 bytes starting at a12).
	 * NOTE(review): the zeroing stores below are not wrapped in DST(),
	 * so they have no __ex_table entry of their own.
	 */
6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	bltu	a12, a11, .Leloop	/* unsigned: a12 and a11 are addresses */
#endif
2:
	retw

	/* Destination fault: store -EFAULT through dst_err_ptr, return 0. */
6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0
	retw

.previous
+315 −0
Original line number Diff line number Diff line
/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2005 Tensilica Inc.
 */

#include <xtensa/coreasm.h>

	/*
	 * src_b r, w0, w1: endian-neutral wrapper around the SRC (funnel
	 * shift) instruction.  Big-endian builds pass the two words in the
	 * order given; little-endian builds swap them.
	 */
	.macro	src_b	r, w0, w1
#ifndef __XTENSA_EB__
	src	\r, \w1, \w0
#else
	src	\r, \w0, \w1
#endif
	.endm

	/*
	 * ssa8 r: endian-neutral wrapper that expands to ssa8l on
	 * little-endian builds and to ssa8b on big-endian builds.
	 */
	.macro	ssa8	r
#ifndef __XTENSA_EB__
	ssa8l	\r
#else
	ssa8b	\r
#endif
	.endm


/*
 * void *memcpy(void *dst, const void *src, size_t len);
 * void *memmove(void *dst, const void *src, size_t len);
 * void *bcopy(const void *src, void *dst, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() (or bcopy()) for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The bcopy version is provided here to avoid the overhead
 * of an extra call, for callers that require this convention.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

	.text
	.align	4
	.global	bcopy
	.type   bcopy,@function
/*
 * bcopy(src, dst, len): entry point with the first two arguments in the
 * opposite order from memcpy.  The registers are rearranged into the
 * layout the shared code expects (a2 = dst, a3 = src, a5 = working dst)
 * and control joins the memcpy code at .Lcommon.
 */
bcopy:
	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	mov	a5, a3		# copy dst so that a2 is return value
	mov	a3, a2		# a3 = src
	mov	a2, a5		# a2 = dst
	j	.Lcommon	# go to common code for memcpy+bcopy


/*
 * Byte by byte copy
 *
 * Copies a4 bytes from a3 (src) to a5 (dst), one byte per iteration, then
 * returns.  Without zero-overhead loops the loop runs until the source
 * pointer reaches a7 = src + len; that address comparison is unsigned
 * (bltu), because a signed blt would stop after one byte for a source
 * buffer that straddles 0x80000000.
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bltu	a3, a7, .Lnextbyte	# unsigned: a3 and a7 are addresses
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw

/*
 * Destination is unaligned
 *
 * Reached from .Lcommon when dst has bit 0 set (.Ldst1mod2) or bit 1 set
 * (.Ldst2mod4).  Short copies are handed to .Lbytecopy; otherwise one
 * and/or two bytes are copied so that a5 (dst) becomes word aligned, a3
 * (src) and a4 (len) are adjusted to match, and control returns to
 * .Ldstaligned.
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0	# byte loads: src alignment is unknown here
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

	.align	4
/*
 * Backward byte copy for overlapping buffers.
 *
 * Entered from .Lcommon when (dst - src) modulo 2^32 is below len, that
 * is, when dst lies inside [src, src + len).  A forward copy would
 * overwrite source bytes before they are read, so the bytes are copied
 * from the last one down to the first.
 *
 * On entry: a2 = a5 = dst, a3 = src, a4 = len (non-zero), a6 = dst - src.
 */
.Lbackward:
	beqz	a6, .Lbackwarddone	# dst == src: data is already in place
	add	a3, a3, a4	# a3 = one past the last source byte
	add	a5, a5, a4	# a5 = one past the last destination byte
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackwarddone
#endif /* XCHAL_HAVE_LOOPS */
.Lbacknext:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a5, a2, .Lbacknext	# until a5 is back at the start of dst
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackwarddone:
	retw

/*
 * memcpy and memmove share one entry point; bcopy joins at .Lcommon.
 *
 *   a2 = dst (kept untouched as the return value)
 *   a3 = src
 *   a4 = len
 *   a5 = working copy of dst
 *
 * .Lcommon first diverts overlapping copies with dst inside
 * [src, src + len) to .Lbackward, so that memmove (and bcopy) give the
 * right result for them.  All other copies use the forward code below.
 */
	.align	4
	.global	memcpy
	.type   memcpy,@function
memcpy:
	.global	memmove
	.type   memmove,@function
memmove:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	sub	a6, a5, a3	# a6 = dst - src, modulo 2^32
	bltu	a6, a4, .Lbackward	# dst inside [src, src+len): copy backward
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bltu	a3, a8, .Loop1	# unsigned: a3 and a8 are addresses
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	# the low four bits of len select the 8, 4, 2 and 1 byte tail copies
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 *
 * Entered from .Ldstaligned with a7 = len / 16 and a8 = 3.  The source is
 * read with aligned 32-bit loads; ssa8 sets the shift amount from the low
 * bits of the source address and src_b combines each pair of neighbouring
 * words into one destination word.  a6 always carries the most recently
 * loaded source word that has not been fully consumed yet.
 *
 * Without zero-overhead loops the main loop ends by comparing the source
 * pointer with an end address; that comparison is unsigned (bltu),
 * because a signed blt would leave the loop after one pass for a source
 * buffer that straddles 0x80000000.
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS (simulator) with the
					   lint or ferret client, or 0 to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bltu	a3, a10, .Loop2	# unsigned: a3 and a10 are addresses
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8		# carry the last loaded word forward
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7		# carry the last loaded word forward
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */
+160 −0
Original line number Diff line number Diff line
/*
 *  arch/xtensa/lib/memset.S
 *
 *  ANSI C standard library function memset
 *  (Well, almost.  .fixup code might return zero.)
 *
 *  This file is subject to the terms and conditions of the GNU General
 *  Public License.  See the file "COPYING" in the main directory of
 *  this archive for more details.
 *
 *  Copyright (C) 2002 Tensilica Inc.
 */

#include <xtensa/coreasm.h>

/*
 * void *memset(void *dst, int c, size_t length)
 *
 * The algorithm is as follows:
 *   Create a word with c in all byte positions
 *   If the destination is aligned,
 *     do 16B chunks with a loop, and then finish up with
 *     8B, 4B, 2B, and 1B stores conditional on the length.
 *   If destination is unaligned, align it by conditionally
 *     setting 1B and 2B and then go to aligned case.
 *   This code tries to use fall-through branches for the common
 *     case of an aligned destination (except for the branches to
 *     the alignment labels).
 */

/*
 * Load or store instructions that may cause exceptions use the EX macro.
 *
 * EX(insn, reg1, reg2, offset, handler) emits "insn reg1, reg2, offset"
 * under the numeric label 9 and adds an __ex_table entry that pairs the
 * address of that instruction with `handler`.
 */

#define EX(insn,reg1,reg2,offset,handler)	\
9:	insn	reg1, reg2, offset;		\
	.section __ex_table, "a";		\
	.word	9b, handler;			\
	.previous


.text
.align	4
.global	memset
.type	memset,@function
/*
 * memset entry: replicate the low byte of c into all four bytes of a3,
 * keep dst in a2 as the return value and use a5 as the working pointer.
 * An unaligned dst is first handled at .Ldstunaligned, which comes back
 * to .L0 once a5 is word aligned.
 */
memset:
	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ c, a4/ length
	extui	a3, a3, 0, 8	# mask to just 8 bits
	slli	a7, a3, 8	# duplicate character in all bytes of word
	or	a3, a3, a7	# ...
	slli	a7, a3, 16	# ...
	or	a3, a3, a7	# ...
	mov	a5, a2		# copy dst so that a2 is return value
	movi	a6, 3		# for alignment tests
	bany	a2, a6, .Ldstunaligned # if dst is unaligned
.L0:	# return here from .Ldstunaligned when dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	bnez	a4, .Laligned	# nothing to store when length is zero
	retw

/*
 * Destination is word-aligned.
 *
 * On entry: a3 = fill word, a4 = remaining length, a5 = aligned dst,
 * a7 = a4 / 16.  Every store goes through EX() with memset_fixup as its
 * handler.  Without zero-overhead loops the 16-byte loop ends by
 * comparing a5 with an end address; that comparison is unsigned (bltu),
 * because a signed blt would leave the loop after one pass for a buffer
 * that straddles 0x80000000.
 */
	# set 16 bytes per iteration for word-aligned dst
	.align	4		# 1 mod 4 alignment for LOOPNEZ
	.byte	0		# (0 mod 4 alignment for LBEG)
.Laligned:
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a6, a7, 4
	add	a6, a6, a5	# a6 = end of last 16B chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	EX(s32i, a3, a5,  0, memset_fixup)
	EX(s32i, a3, a5,  4, memset_fixup)
	EX(s32i, a3, a5,  8, memset_fixup)
	EX(s32i, a3, a5, 12, memset_fixup)
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bltu	a5, a6, .Loop1	# unsigned: a5 and a6 are addresses
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	# the low four bits of length select the 8, 4, 2 and 1 byte tail stores
	bbci.l	a4, 3, .L2
	# set 8 bytes
	EX(s32i, a3, a5,  0, memset_fixup)
	EX(s32i, a3, a5,  4, memset_fixup)
	addi	a5, a5,  8
.L2:
	bbci.l	a4, 2, .L3
	# set 4 bytes
	EX(s32i, a3, a5,  0, memset_fixup)
	addi	a5, a5,  4
.L3:
	bbci.l	a4, 1, .L4
	# set 2 bytes
	EX(s16i, a3, a5,  0, memset_fixup)
	addi	a5, a5,  2
.L4:
	bbci.l	a4, 0, .L5
	# set 1 byte
	EX(s8i, a3, a5,  0, memset_fixup)
.L5:
.Lret1:
	retw

/*
 * Destination is unaligned
 *
 * Lengths below 8 are handed to .Lbyteset.  Otherwise one byte and/or one
 * halfword is stored so that a5 becomes word aligned, a4 is reduced by the
 * number of bytes stored, and control returns to .L0.
 */

.Ldstunaligned:
	bltui	a4, 8, .Lbyteset	# do short copies byte by byte
	bbci.l	a5, 0, .L20		# branch if dst alignment half-aligned
	# dst is only byte aligned
	# set 1 byte
	EX(s8i, a3, a5,  0, memset_fixup)
	addi	a5, a5,  1
	addi	a4, a4, -1
	# now retest if dst aligned
	bbci.l	a5, 1, .L0	# if now aligned, return to main algorithm
.L20:
	# dst half-aligned
	# set 2 bytes
	EX(s16i, a3, a5,  0, memset_fixup)
	addi	a5, a5,  2
	addi	a4, a4, -2
	j	.L0		# dst is now aligned, return to main algorithm

/*
 * Byte by byte set
 *
 * Stores the low byte of a3 into a4 consecutive bytes starting at a5, then
 * returns.  Without zero-overhead loops the loop runs until a5 reaches
 * a6 = a5 + a4; that address comparison is unsigned (bltu), because a
 * signed blt would stop after one byte for a buffer that straddles
 * 0x80000000.
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbyteset:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytesetdone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytesetdone
	add	a6, a5, a4	# a6 = ending address
#endif /* !XCHAL_HAVE_LOOPS */
.Lbyteloop:
	EX(s8i, a3, a5, 0, memset_fixup)
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bltu	a5, a6, .Lbyteloop	# unsigned: a5 and a6 are addresses
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytesetdone:
	retw


	.section .fixup, "ax"
	.align	4

/*
 * We return zero if a failure occurred.
 *
 * memset_fixup is the handler named by every EX() store in memset: it
 * replaces the return value (dst in a2) with 0 and returns with retw.
 */

memset_fixup:
	movi	a2, 0
	retw
+352 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading