Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 55868b45 authored by Ard Biesheuvel, committed by Herbert Xu
Browse files

crypto: arm64/aes-blk - remove configurable interleave



The AES block mode implementation using Crypto Extensions or plain NEON
was written before real hardware existed, and so its interleave factor
was made build-time configurable (along with an option to instantiate
all interleaved sequences inline rather than as subroutines).

We ended up using INTERLEAVE=4 with inlining disabled for both flavors
of the core AES routines, so let's stick with that, and remove the option
to configure this at build time. This makes the code easier to modify,
which is nice now that we're adding yield support.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 4bf7e7a1
Loading
Loading
Loading
Loading
+0 −3
Original line number Original line Diff line number Diff line
@@ -62,9 +62,6 @@ aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o


AFLAGS_aes-ce.o		:= -DINTERLEAVE=4
AFLAGS_aes-neon.o	:= -DINTERLEAVE=4

CFLAGS_aes-glue-ce.o	:= -DUSE_V8_CRYPTO_EXTENSIONS
CFLAGS_aes-glue-ce.o	:= -DUSE_V8_CRYPTO_EXTENSIONS


$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
+40 −197
Original line number Original line Diff line number Diff line
@@ -13,44 +13,6 @@
	.text
	.text
	.align		4
	.align		4


/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare	- setup NEON registers for encryption
 * - dec_prepare	- setup NEON registers for decryption
 * - enc_switch_key	- change to new key after having prepared for encryption
 * - encrypt_block	- encrypt a single block
 * - decrypt_block	- decrypt a single block
 * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */

#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP	ldp x29, x30, [sp],#16

#if INTERLEAVE == 2

aes_encrypt_block2x:
	encrypt_block2x	v0, v1, w3, x2, x8, w7
	ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
	decrypt_block2x	v0, v1, w3, x2, x8, w7
	ret
ENDPROC(aes_decrypt_block2x)

#elif INTERLEAVE == 4

aes_encrypt_block4x:
aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
	ret
@@ -61,48 +23,6 @@ aes_decrypt_block4x:
	ret
	ret
ENDPROC(aes_decrypt_block4x)
ENDPROC(aes_decrypt_block4x)


#else
#error INTERLEAVE should equal 2 or 4
#endif

	.macro		do_encrypt_block2x
	bl		aes_encrypt_block2x
	.endm

	.macro		do_decrypt_block2x
	bl		aes_decrypt_block2x
	.endm

	.macro		do_encrypt_block4x
	bl		aes_encrypt_block4x
	.endm

	.macro		do_decrypt_block4x
	bl		aes_decrypt_block4x
	.endm

#else
#define FRAME_PUSH
#define FRAME_POP

	.macro		do_encrypt_block2x
	encrypt_block2x	v0, v1, w3, x2, x8, w7
	.endm

	.macro		do_decrypt_block2x
	decrypt_block2x	v0, v1, w3, x2, x8, w7
	.endm

	.macro		do_encrypt_block4x
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	.endm

	.macro		do_decrypt_block4x
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	.endm

#endif

	/*
	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 *		   int blocks)
@@ -111,28 +31,21 @@ ENDPROC(aes_decrypt_block4x)
	 */
	 */


AES_ENTRY(aes_ecb_encrypt)
AES_ENTRY(aes_ecb_encrypt)
	FRAME_PUSH
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp


	enc_prepare	w3, x2, x5
	enc_prepare	w3, x2, x5


.LecbencloopNx:
.LecbencloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #4
	subs		w4, w4, #INTERLEAVE
	bmi		.Lecbenc1x
	bmi		.Lecbenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	do_encrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	do_encrypt_block4x
	bl		aes_encrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbencloopNx
	b		.LecbencloopNx
.Lecbenc1x:
.Lecbenc1x:
	adds		w4, w4, #INTERLEAVE
	adds		w4, w4, #4
	beq		.Lecbencout
	beq		.Lecbencout
#endif
.Lecbencloop:
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	encrypt_block	v0, w3, x2, x5, w6
@@ -140,34 +53,27 @@ AES_ENTRY(aes_ecb_encrypt)
	subs		w4, w4, #1
	subs		w4, w4, #1
	bne		.Lecbencloop
	bne		.Lecbencloop
.Lecbencout:
.Lecbencout:
	FRAME_POP
	ldp		x29, x30, [sp], #16
	ret
	ret
AES_ENDPROC(aes_ecb_encrypt)
AES_ENDPROC(aes_ecb_encrypt)




AES_ENTRY(aes_ecb_decrypt)
AES_ENTRY(aes_ecb_decrypt)
	FRAME_PUSH
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp


	dec_prepare	w3, x2, x5
	dec_prepare	w3, x2, x5


.LecbdecloopNx:
.LecbdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #4
	subs		w4, w4, #INTERLEAVE
	bmi		.Lecbdec1x
	bmi		.Lecbdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	do_decrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	do_decrypt_block4x
	bl		aes_decrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbdecloopNx
	b		.LecbdecloopNx
.Lecbdec1x:
.Lecbdec1x:
	adds		w4, w4, #INTERLEAVE
	adds		w4, w4, #4
	beq		.Lecbdecout
	beq		.Lecbdecout
#endif
.Lecbdecloop:
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	decrypt_block	v0, w3, x2, x5, w6
@@ -175,7 +81,7 @@ AES_ENTRY(aes_ecb_decrypt)
	subs		w4, w4, #1
	subs		w4, w4, #1
	bne		.Lecbdecloop
	bne		.Lecbdecloop
.Lecbdecout:
.Lecbdecout:
	FRAME_POP
	ldp		x29, x30, [sp], #16
	ret
	ret
AES_ENDPROC(aes_ecb_decrypt)
AES_ENDPROC(aes_ecb_decrypt)


@@ -204,30 +110,20 @@ AES_ENDPROC(aes_cbc_encrypt)




AES_ENTRY(aes_cbc_decrypt)
AES_ENTRY(aes_cbc_decrypt)
	FRAME_PUSH
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp


	ld1		{v7.16b}, [x5]			/* get iv */
	ld1		{v7.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6
	dec_prepare	w3, x2, x6


.LcbcdecloopNx:
.LcbcdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #4
	subs		w4, w4, #INTERLEAVE
	bmi		.Lcbcdec1x
	bmi		.Lcbcdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	mov		v2.16b, v0.16b
	mov		v3.16b, v1.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v2.16b
	mov		v7.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	mov		v4.16b, v0.16b
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	mov		v6.16b, v2.16b
	do_decrypt_block4x
	bl		aes_decrypt_block4x
	sub		x1, x1, #16
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, v7.16b
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v4.16b
	eor		v1.16b, v1.16b, v4.16b
@@ -235,12 +131,10 @@ AES_ENTRY(aes_cbc_decrypt)
	eor		v2.16b, v2.16b, v5.16b
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
	eor		v3.16b, v3.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LcbcdecloopNx
	b		.LcbcdecloopNx
.Lcbcdec1x:
.Lcbcdec1x:
	adds		w4, w4, #INTERLEAVE
	adds		w4, w4, #4
	beq		.Lcbcdecout
	beq		.Lcbcdecout
#endif
.Lcbcdecloop:
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
@@ -251,8 +145,8 @@ AES_ENTRY(aes_cbc_decrypt)
	subs		w4, w4, #1
	subs		w4, w4, #1
	bne		.Lcbcdecloop
	bne		.Lcbcdecloop
.Lcbcdecout:
.Lcbcdecout:
	FRAME_POP
	st1		{v7.16b}, [x5]			/* return iv */
	st1		{v7.16b}, [x5]			/* return iv */
	ldp		x29, x30, [sp], #16
	ret
	ret
AES_ENDPROC(aes_cbc_decrypt)
AES_ENDPROC(aes_cbc_decrypt)


@@ -263,34 +157,19 @@ AES_ENDPROC(aes_cbc_decrypt)
	 */
	 */


AES_ENTRY(aes_ctr_encrypt)
AES_ENTRY(aes_ctr_encrypt)
	FRAME_PUSH
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp


	enc_prepare	w3, x2, x6
	enc_prepare	w3, x2, x6
	ld1		{v4.16b}, [x5]
	ld1		{v4.16b}, [x5]


	umov		x6, v4.d[1]		/* keep swabbed ctr in reg */
	umov		x6, v4.d[1]		/* keep swabbed ctr in reg */
	rev		x6, x6
	rev		x6, x6
#if INTERLEAVE >= 2
	cmn		w6, w4			/* 32 bit overflow? */
	cmn		w6, w4			/* 32 bit overflow? */
	bcs		.Lctrloop
	bcs		.Lctrloop
.LctrloopNx:
.LctrloopNx:
	subs		w4, w4, #INTERLEAVE
	subs		w4, w4, #4
	bmi		.Lctr1x
	bmi		.Lctr1x
#if INTERLEAVE == 2
	mov		v0.8b, v4.8b
	mov		v1.8b, v4.8b
	rev		x7, x6
	add		x6, x6, #1
	ins		v0.d[1], x7
	rev		x7, x6
	add		x6, x6, #1
	ins		v1.d[1], x7
	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v2.16b
	eor		v1.16b, v1.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
	dup		v7.4s, w6
	dup		v7.4s, w6
	mov		v0.16b, v4.16b
	mov		v0.16b, v4.16b
@@ -303,23 +182,21 @@ AES_ENTRY(aes_ctr_encrypt)
	mov		v2.s[3], v8.s[1]
	mov		v2.s[3], v8.s[1]
	mov		v3.s[3], v8.s[2]
	mov		v3.s[3], v8.s[2]
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
	do_encrypt_block4x
	bl		aes_encrypt_block4x
	eor		v0.16b, v5.16b, v0.16b
	eor		v0.16b, v5.16b, v0.16b
	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
	eor		v1.16b, v6.16b, v1.16b
	eor		v1.16b, v6.16b, v1.16b
	eor		v2.16b, v7.16b, v2.16b
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
	eor		v3.16b, v5.16b, v3.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	st1		{v0.16b-v3.16b}, [x0], #64
	add		x6, x6, #INTERLEAVE
	add		x6, x6, #4
#endif
	rev		x7, x6
	rev		x7, x6
	ins		v4.d[1], x7
	ins		v4.d[1], x7
	cbz		w4, .Lctrout
	cbz		w4, .Lctrout
	b		.LctrloopNx
	b		.LctrloopNx
.Lctr1x:
.Lctr1x:
	adds		w4, w4, #INTERLEAVE
	adds		w4, w4, #4
	beq		.Lctrout
	beq		.Lctrout
#endif
.Lctrloop:
.Lctrloop:
	mov		v0.16b, v4.16b
	mov		v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	encrypt_block	v0, w3, x2, x8, w7
@@ -339,12 +216,12 @@ AES_ENTRY(aes_ctr_encrypt)


.Lctrout:
.Lctrout:
	st1		{v4.16b}, [x5]		/* return next CTR value */
	st1		{v4.16b}, [x5]		/* return next CTR value */
	FRAME_POP
	ldp		x29, x30, [sp], #16
	ret
	ret


.Lctrtailblock:
.Lctrtailblock:
	st1		{v0.16b}, [x0]
	st1		{v0.16b}, [x0]
	FRAME_POP
	ldp		x29, x30, [sp], #16
	ret
	ret


.Lctrcarry:
.Lctrcarry:
@@ -378,7 +255,9 @@ CPU_LE( .quad 1, 0x87 )
CPU_BE(	.quad		0x87, 1		)
CPU_BE(	.quad		0x87, 1		)


AES_ENTRY(aes_xts_encrypt)
AES_ENTRY(aes_xts_encrypt)
	FRAME_PUSH
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	ld1		{v4.16b}, [x6]
	cbz		w7, .Lxtsencnotfirst
	cbz		w7, .Lxtsencnotfirst


@@ -394,25 +273,8 @@ AES_ENTRY(aes_xts_encrypt)
	ldr		q7, .Lxts_mul_x
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
	next_tweak	v4, v4, v7, v8
.LxtsencNx:
.LxtsencNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #4
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsenc1x
	bmi		.Lxtsenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsencoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsencNx
.LxtsencoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsencout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v7, v8
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v0.16b, v0.16b, v4.16b
@@ -421,7 +283,7 @@ AES_ENTRY(aes_xts_encrypt)
	eor		v2.16b, v2.16b, v6.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8
	next_tweak	v7, v6, v7, v8
	eor		v3.16b, v3.16b, v7.16b
	eor		v3.16b, v3.16b, v7.16b
	do_encrypt_block4x
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v5.16b
@@ -430,11 +292,9 @@ AES_ENTRY(aes_xts_encrypt)
	mov		v4.16b, v7.16b
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencout
	cbz		w4, .Lxtsencout
	b		.LxtsencloopNx
	b		.LxtsencloopNx
#endif
.Lxtsenc1x:
.Lxtsenc1x:
	adds		w4, w4, #INTERLEAVE
	adds		w4, w4, #4
	beq		.Lxtsencout
	beq		.Lxtsencout
#endif
.Lxtsencloop:
.Lxtsencloop:
	ld1		{v1.16b}, [x1], #16
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	eor		v0.16b, v1.16b, v4.16b
@@ -447,13 +307,15 @@ AES_ENTRY(aes_xts_encrypt)
	b		.Lxtsencloop
	b		.Lxtsencloop
.Lxtsencout:
.Lxtsencout:
	st1		{v4.16b}, [x6]
	st1		{v4.16b}, [x6]
	FRAME_POP
	ldp		x29, x30, [sp], #16
	ret
	ret
AES_ENDPROC(aes_xts_encrypt)
AES_ENDPROC(aes_xts_encrypt)




AES_ENTRY(aes_xts_decrypt)
AES_ENTRY(aes_xts_decrypt)
	FRAME_PUSH
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	ld1		{v4.16b}, [x6]
	cbz		w7, .Lxtsdecnotfirst
	cbz		w7, .Lxtsdecnotfirst


@@ -469,25 +331,8 @@ AES_ENTRY(aes_xts_decrypt)
	ldr		q7, .Lxts_mul_x
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
	next_tweak	v4, v4, v7, v8
.LxtsdecNx:
.LxtsdecNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #4
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsdec1x
	bmi		.Lxtsdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsdecoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsdecNx
.LxtsdecoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsdecout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v7, v8
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v0.16b, v0.16b, v4.16b
@@ -496,7 +341,7 @@ AES_ENTRY(aes_xts_decrypt)
	eor		v2.16b, v2.16b, v6.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8
	next_tweak	v7, v6, v7, v8
	eor		v3.16b, v3.16b, v7.16b
	eor		v3.16b, v3.16b, v7.16b
	do_decrypt_block4x
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v5.16b
@@ -505,11 +350,9 @@ AES_ENTRY(aes_xts_decrypt)
	mov		v4.16b, v7.16b
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	cbz		w4, .Lxtsdecout
	b		.LxtsdecloopNx
	b		.LxtsdecloopNx
#endif
.Lxtsdec1x:
.Lxtsdec1x:
	adds		w4, w4, #INTERLEAVE
	adds		w4, w4, #4
	beq		.Lxtsdecout
	beq		.Lxtsdecout
#endif
.Lxtsdecloop:
.Lxtsdecloop:
	ld1		{v1.16b}, [x1], #16
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	eor		v0.16b, v1.16b, v4.16b
@@ -522,7 +365,7 @@ AES_ENTRY(aes_xts_decrypt)
	b		.Lxtsdecloop
	b		.Lxtsdecloop
.Lxtsdecout:
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	st1		{v4.16b}, [x6]
	FRAME_POP
	ldp		x29, x30, [sp], #16
	ret
	ret
AES_ENDPROC(aes_xts_decrypt)
AES_ENDPROC(aes_xts_decrypt)