Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 32ee1e18 authored by Anton Blanchard, committed by Benjamin Herrenschmidt
Browse files

powerpc: Fix endian issues in VMX copy loops



Fix the permute loops for little endian.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
parent 8b5ede69
Loading
Loading
Loading
Loading
+31 −23
Original line number Diff line number Diff line
@@ -19,6 +19,14 @@
 */
#include <asm/ppc_asm.h>

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

	.macro err1
100:
	.section __ex_table,"a"
@@ -552,13 +560,13 @@ err3; stw r7,4(r3)
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
	LVS(vr16,0,r4)		/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
@@ -566,9 +574,9 @@ err3; stvx vr8,r0,r3

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	VPERM(vr8,vr0,vr1,vr16)
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
@@ -576,13 +584,13 @@ err3; stvx vr9,r3,r9

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	VPERM(vr8,vr0,vr3,vr16)
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	VPERM(vr9,vr3,vr2,vr16)
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	VPERM(vr10,vr2,vr1,vr16)
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
@@ -611,21 +619,21 @@ err3; stvx vr11,r3,r11
	.align	5
8:
err4;	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
	VPERM(vr8,vr0,vr7,vr16)
err4;	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
	VPERM(vr9,vr7,vr6,vr16)
err4;	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
	VPERM(vr10,vr6,vr5,vr16)
err4;	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
	VPERM(vr11,vr5,vr4,vr16)
err4;	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
	VPERM(vr12,vr4,vr3,vr16)
err4;	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
	VPERM(vr13,vr3,vr2,vr16)
err4;	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
	VPERM(vr14,vr2,vr1,vr16)
err4;	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	VPERM(vr15,vr1,vr0,vr16)
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
@@ -649,13 +657,13 @@ err4; stvx vr15,r3,r16

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	VPERM(vr8,vr0,vr3,vr16)
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	VPERM(vr9,vr3,vr2,vr16)
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	VPERM(vr10,vr2,vr1,vr16)
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
@@ -665,9 +673,9 @@ err3; stvx vr11,r3,r11

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	VPERM(vr8,vr0,vr1,vr16)
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
@@ -675,7 +683,7 @@ err3; stvx vr9,r3,r9

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
+32 −23
Original line number Diff line number Diff line
@@ -20,6 +20,15 @@
#include <asm/ppc_asm.h>

_GLOBAL(memcpy_power7)

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096
@@ -485,13 +494,13 @@ _GLOBAL(memcpy_power7)
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
	LVS(vr16,0,r4)		/* Setup permute control vector */
	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16
@@ -499,9 +508,9 @@ _GLOBAL(memcpy_power7)

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	VPERM(vr8,vr0,vr1,vr16)
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
@@ -509,13 +518,13 @@ _GLOBAL(memcpy_power7)

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	VPERM(vr8,vr0,vr3,vr16)
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	VPERM(vr9,vr3,vr2,vr16)
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	VPERM(vr10,vr2,vr1,vr16)
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
@@ -544,21 +553,21 @@ _GLOBAL(memcpy_power7)
	.align	5
8:
	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
	VPERM(vr8,vr0,vr7,vr16)
	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
	VPERM(vr9,vr7,vr6,vr16)
	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
	VPERM(vr10,vr6,vr5,vr16)
	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
	VPERM(vr11,vr5,vr4,vr16)
	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
	VPERM(vr12,vr4,vr3,vr16)
	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
	VPERM(vr13,vr3,vr2,vr16)
	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
	VPERM(vr14,vr2,vr1,vr16)
	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	VPERM(vr15,vr1,vr0,vr16)
	addi	r4,r4,128
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
@@ -582,13 +591,13 @@ _GLOBAL(memcpy_power7)

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	VPERM(vr8,vr0,vr3,vr16)
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	VPERM(vr9,vr3,vr2,vr16)
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	VPERM(vr10,vr2,vr1,vr16)
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
@@ -598,9 +607,9 @@ _GLOBAL(memcpy_power7)

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	VPERM(vr8,vr0,vr1,vr16)
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
@@ -608,7 +617,7 @@ _GLOBAL(memcpy_power7)

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16