Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 322ae8eb authored by Michal Simek's avatar Michal Simek
Browse files

microblaze_v8: supported function for memory - kernel/lib

parent 16bfeaf2
Loading
Loading
Loading
Loading
+662 −0
Original line number Original line Diff line number Diff line
/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */

#include <linux/linkage.h>

	.globl	memcpy
	.ent	memcpy

/*
 * memcpy - copy c bytes from s to d, walking upwards through memory.
 *
 * Register use, as read from the code below:
 *	r5  = d, destination pointer (argument, advanced as data is stored)
 *	r6  = s, source pointer (argument, advanced as data is consumed)
 *	r7  = c, number of bytes still to copy (argument)
 *	r3  = return value: the destination address as passed in
 *	r4  = n, byte count of the phase currently running
 *	r8  = as, the source address rounded down to a word boundary
 *	r9, r10, r11, r12 = temporaries (t1..t4, offset, h, v)
 *
 * Phases:
 *	1. c < 4: skip everything and use the byte loop at a_xfer_end
 *	2. copy single bytes until d is word aligned
 *	3. copy as many 32-byte blocks as fit
 *	4. copy as many whole words as fit
 *	5. copy the remaining bytes one at a time
 *
 * In phases 3 and 4 a source that is not word aligned is still read
 * with aligned word loads.  Every stored word is built from the part
 * kept from the previous load (h) and the freshly loaded word (v),
 * combined with barrel shifts.  The shift directions used here match a
 * big-endian byte layout inside the word.
 *
 * Instructions marked "IN DELAY SLOT" follow a delayed branch and are
 * executed whether or not the branch is taken.
 *
 * fast_memcpy_ascending is also the label memmove branches to when an
 * ascending copy is safe.
 */
memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

	/* d is word aligned from here on; s may or may not be */
a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

	/* s and d both word aligned: 8 loads/stores per 32-byte block */
a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/*
	 * s is not word aligned.  s itself is advanced past the whole block
	 * region up front; the loops below walk the aligned pointer "as"
	 * instead and dispatch on the byte offset (1, 2 or 3) of s.
	 */
a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

	/* source offset 3: one byte kept from h, three taken from v */
a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/* source offset 1: three bytes kept from h, one taken from v */
a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/* source offset 2: two bytes kept from h, two taken from v */
a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */

	/* fewer than 32 bytes remain in c */
a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/*
	 * Word phase: s and d stay fixed while r10 counts up as a byte
	 * offset; the pointers and c are adjusted once at a_word_done.
	 */
a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s+offset) */
	sw	r9, r5, r10		/* *(d+offset) = t1 */
	addi	r4, r4,-4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	/* fold the accumulated byte offset back into d, s and c */
a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

	/* byte-at-a-time tail; also the whole copy when c < 4 on entry */
a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8		/* return through r15; r3 holds the original d */
	nop			/* delay slot of rtsd */

.end memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.ent	memmove

/*
 * memmove - copy c bytes from s to d; the two regions may overlap.
 *
 * Arguments and result follow memcpy:
 *	r5  = d, destination pointer
 *	r6  = s, source pointer
 *	r7  = c, number of bytes to copy
 *	r3  = return value: the destination address as passed in
 * Temporaries: r4 (n), r8 (as, word-aligned source), r9, r11, r12.
 *
 * When d is not above s (unsigned compare) an ascending copy cannot
 * overwrite source bytes that are still to be read, so control is
 * handed to fast_memcpy_ascending.  Otherwise d and s are moved to one
 * past the end of their buffers and the copy runs downwards:
 *	1. c < 4: use the byte loop at d_xfer_end only
 *	2. copy single bytes until d is word aligned
 *	3. copy as many 32-byte blocks as fit
 *	4. copy as many whole words as fit
 *	5. copy the remaining bytes one at a time
 *
 * In phases 3 and 4 a source that is not word aligned is read with
 * aligned word loads; each stored word is merged from the part kept
 * from the previous load (h) and the newly loaded word (v).  The shift
 * directions used here match a big-endian byte layout inside the word.
 *
 * Instructions marked "IN DELAY SLOT" follow a delayed branch and are
 * executed whether or not the branch is taken.
 */
memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4,fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4,d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4,d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

	/* d is word aligned from here on; s may or may not be */
d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

	/* s and d both word aligned: 8 loads/stores per 32-byte block */
d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

	/*
	 * s is not word aligned.  s itself is moved below the whole block
	 * region up front; the loops below walk the aligned pointer "as"
	 * instead and dispatch on the byte offset (1, 2 or 3) of s.
	 */
d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9,d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_block_u2		/* t1 was 2 => 2 byte offset */

	/* source offset 3: three bytes kept from h, one taken from v */
d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

	/* source offset 1: one byte kept from h, three taken from v */
d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

	/* source offset 2: two bytes kept from h, two taken from v */
d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

	/* fewer than 32 bytes remain in c */
d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

	/*
	 * Word phase: d, s and c are moved below the word region first, then
	 * n counts down from the region size to 0 and serves as the index.
	 */
d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4,-4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9,d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

	/* byte-at-a-time tail; also the whole copy when c < 4 on entry */
d_xfer_end:
d_xfer_end_loop:
	/* leave through this routine's own epilogue */
	beqi	r7, d_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8		/* return through r15; r3 holds the original d */
	nop			/* delay slot of rtsd */

.end memmove
+161 −0
Original line number Original line Diff line number Diff line
/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2007 John Williams
 *
 * Reasonably optimised generic C-code for memcpy on Microblaze
 * This is generic C code to do efficient, alignment-aware memcpy.
 *
 * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
 * http://www.embedded.com/showArticle.jhtml?articleID=19205567
 *
 * Attempts were made, unsuccessfully, to contact the original
 * author of this code (Michael Morrow, Intel).  Below is the original
 * copyright notice.
 *
 * This software has been developed by Intel Corporation.
 * Intel specifically disclaims all warranties, express or
 * implied, and all liability, including consequential and
 * other indirect damages, for the use of this program, including
 * liability for infringement of any proprietary rights,
 * and including the warranties of merchantability and fitness
 * for a particular purpose. Intel does not assume any
 * responsibility for and errors which may appear in this program
 * not any responsibility to update it.
 */

#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/compiler.h>
#include <linux/module.h>

#include <linux/string.h>
#include <asm/system.h>

#ifdef __HAVE_ARCH_MEMCPY

#ifdef CONFIG_OPT_LIB_FUNCTION
/*
 * Shift helpers for copying from a source that is not word aligned.
 *
 * The source is read with aligned word loads.  MEMCPY_HOLD() positions
 * the not yet consumed bytes of a loaded word so that they start the
 * next destination word; MEMCPY_FILL() positions the leading bytes of
 * the following word so that they complete it.  Which direction moves
 * bytes towards lower addresses depends on the byte order, so both
 * directions are selected here.  Without __MICROBLAZEEL__ the
 * expansion is the big-endian form.
 */
#ifdef __MICROBLAZEEL__
#define MEMCPY_HOLD(w, bits)	((w) >> (bits))
#define MEMCPY_FILL(w, bits)	((w) << (bits))
#else
#define MEMCPY_HOLD(w, bits)	((w) << (bits))
#define MEMCPY_FILL(w, bits)	((w) >> (bits))
#endif
#endif /* CONFIG_OPT_LIB_FUNCTION */

/**
 * memcpy - copy a block of memory
 * @v_dst: destination buffer
 * @v_src: source buffer
 * @c: number of bytes to copy
 *
 * Without CONFIG_OPT_LIB_FUNCTION this is a plain byte loop.  With it,
 * the destination is first brought to a word boundary, whole words are
 * then stored (merging two aligned source words per store when the
 * source is not word aligned), and the last 0-3 bytes are copied
 * individually.  The copy runs from low to high addresses.
 *
 * Return: @v_dst.
 */
void *memcpy(void *v_dst, const void *v_src, __kernel_size_t c)
{
	const char *src = v_src;
	char *dst = v_dst;
#ifndef CONFIG_OPT_LIB_FUNCTION
	/* Simple, byte oriented memcpy. */
	while (c--)
		*dst++ = *src++;

	return v_dst;
#else
	/* The following code tries to optimize the copy by using unsigned
	 * alignment. This will work fine if both source and destination are
	 * aligned on the same boundary. However, if they are aligned on
	 * different boundaries shifts will be necessary. This might result in
	 * bad performance on MicroBlaze systems without a barrel shifter.
	 */
	const uint32_t *i_src;
	uint32_t *i_dst;

	if (c >= 4) {
		unsigned  value, buf_hold;

		/* Align the destination to a word boundary. */
		/* This is done in an endian independent manner. */
		switch ((unsigned long)dst & 3) {
		case 1:
			*dst++ = *src++;
			--c;
			/* fall through */
		case 2:
			*dst++ = *src++;
			--c;
			/* fall through */
		case 3:
			*dst++ = *src++;
			--c;
		}

		i_dst = (void *)dst;

		/* Choose a copy scheme based on the source */
		/* alignment relative to destination. */
		switch ((unsigned long)src & 3) {
		case 0x0:	/* Both byte offsets are aligned */
			i_src  = (const void *)src;

			for (; c >= 4; c -= 4)
				*i_dst++ = *i_src++;

			src  = (const void *)i_src;
			break;
		case 0x1:	/* Unaligned - Off by 1 */
			/* Word align the source */
			i_src = (const void *) ((unsigned long)src & ~3UL);

			/* Load the holding buffer: 3 source bytes pending */
			buf_hold = MEMCPY_HOLD(*i_src++, 8);

			for (; c >= 4; c -= 4) {
				value = *i_src++;
				*i_dst++ = buf_hold | MEMCPY_FILL(value, 24);
				buf_hold = MEMCPY_HOLD(value, 8);
			}

			/* Realign the source: give back the 3 held bytes */
			src = (const void *)i_src;
			src -= 3;
			break;
		case 0x2:	/* Unaligned - Off by 2 */
			/* Word align the source */
			i_src = (const void *) ((unsigned long)src & ~3UL);

			/* Load the holding buffer: 2 source bytes pending */
			buf_hold = MEMCPY_HOLD(*i_src++, 16);

			for (; c >= 4; c -= 4) {
				value = *i_src++;
				*i_dst++ = buf_hold | MEMCPY_FILL(value, 16);
				buf_hold = MEMCPY_HOLD(value, 16);
			}

			/* Realign the source: give back the 2 held bytes */
			src = (const void *)i_src;
			src -= 2;
			break;
		case 0x3:	/* Unaligned - Off by 3 */
			/* Word align the source */
			i_src = (const void *) ((unsigned long)src & ~3UL);

			/* Load the holding buffer: 1 source byte pending */
			buf_hold = MEMCPY_HOLD(*i_src++, 24);

			for (; c >= 4; c -= 4) {
				value = *i_src++;
				*i_dst++ = buf_hold | MEMCPY_FILL(value, 8);
				buf_hold = MEMCPY_HOLD(value, 24);
			}

			/* Realign the source: give back the 1 held byte */
			src = (const void *)i_src;
			src -= 1;
			break;
		}
		dst = (void *)i_dst;
	}

	/* Finish off any remaining bytes */
	/* simple fast copy, ... unless a cache boundary is crossed */
	switch (c) {
	case 3:
		*dst++ = *src++;
		/* fall through */
	case 2:
		*dst++ = *src++;
		/* fall through */
	case 1:
		*dst++ = *src++;
	}

	return v_dst;
#endif
}
EXPORT_SYMBOL(memcpy);

#ifdef CONFIG_OPT_LIB_FUNCTION
#undef MEMCPY_HOLD
#undef MEMCPY_FILL
#endif
#endif /* __HAVE_ARCH_MEMCPY */

/*
 * cacheable_memcpy - forward a copy request to memcpy().
 *
 * All three arguments are passed through unchanged and memcpy()'s
 * return value is handed back to the caller.
 */
void *cacheable_memcpy(void *d, const void *s, __kernel_size_t c)
{
	void *copied = memcpy(d, s, c);

	return copied;
}
+175 −0

File added.

Preview size limit exceeded, changes collapsed.

+82 −0

File added.

Preview size limit exceeded, changes collapsed.