
Commit 4edd7d01 authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/aes-neon-blk - tweak performance for low end cores



The non-bitsliced AES implementation using NEON is highly sensitive
to micro-architectural details, and, as it turns out, the Cortex-A53 on
the Raspberry Pi 3 is a core that can benefit from this code, given that
its scalar AES performance is abysmal (32.9 cycles per byte).
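
For a sense of scale, those cycles-per-byte figures can be converted to
throughput at the Pi 3's nominal 1.2 GHz A53 clock (the clock rate is an
assumption of this sketch, not part of the commit message):

	/* Rough cycles/byte -> MB/s conversion; 1.2 GHz clock assumed. */
	#include <stdio.h>

	int main(void)
	{
		const double hz = 1.2e9;
		const struct { const char *name; double cpb; } impl[] = {
			{ "scalar",               32.9 },
			{ "bitsliced NEON",       19.8 },
			{ "plain NEON (tweaked)", 22.0 },
		};

		for (int i = 0; i < 3; i++)
			printf("%-22s %6.1f MB/s\n", impl[i].name,
			       hz / impl[i].cpb / 1e6);
		return 0;
	}

At that clock, 32.9 c/B is roughly 36 MB/s, 19.8 c/B roughly 61 MB/s, and
22.0 c/B roughly 55 MB/s.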

The new bitsliced AES code manages 19.8 cycles per byte on this core,
but can only operate on 8 blocks at a time, which is not supported by
all chaining modes. With a bit of tweaking, we can get the plain NEON
code to run at 22.0 cycles per byte, making it useful for sequential
modes like CBC encryption. (Like bitsliced NEON, the plain NEON
implementation does not use any lookup tables, which makes it easy on
the D-cache and invulnerable to cache timing attacks.)
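
The reason CBC encryption cannot feed the 8-way bitsliced code is the
chaining dependency: block i's cipher input is P_i ^ C_{i-1}, so C_{i-1}
must exist before block i can start. A minimal C model of that dependency
(struct aes_key and aes_encrypt_block are illustrative placeholders, not
the kernel's actual interface):

	#include <stdint.h>
	#include <string.h>

	struct aes_key;				/* placeholder */
	void aes_encrypt_block(const struct aes_key *key,
			       const uint8_t in[16],
			       uint8_t out[16]);	/* placeholder */

	void cbc_encrypt(const struct aes_key *key, const uint8_t *in,
			 uint8_t *out, size_t nblocks, const uint8_t iv[16])
	{
		uint8_t prev[16], buf[16];

		memcpy(prev, iv, 16);
		for (size_t i = 0; i < nblocks; i++) {
			for (int j = 0; j < 16; j++)
				buf[j] = in[16 * i + j] ^ prev[j];  /* P_i ^ C_{i-1} */
			aes_encrypt_block(key, buf, &out[16 * i]);  /* C_i */
			memcpy(prev, &out[16 * i], 16);	/* serial dependency */
		}
	}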

So tweak the plain NEON AES code to use tbl instructions rather than
shl/sri pairs, and to avoid the need to reload permutation vectors or
other constants from memory in every round. Also, improve the decryption
performance by switching to 16x8 pmul instructions for performing the
multiplications in GF(2^8).
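
For reference, here is the byte-level arithmetic those instructions
implement, sketched in C (the helper names are illustrative; the kernel
code does this on 16 bytes at a time in NEON registers). Multiplying by x
in GF(2^8) is a left shift plus a conditional reduction by 0x1b;
multiplying by x^2 in one step shifts by two and reduces the two
shifted-out bits with a carry-less multiply by 0x1b, which is what the
new ushr/shl/pmul sequence computes:

	#include <stdint.h>
	#include <stdio.h>

	/* x * b in GF(2^8), AES polynomial x^8+x^4+x^3+x+1 (0x11b) */
	static uint8_t mul_by_x(uint8_t b)
	{
		return (uint8_t)(b << 1) ^ ((b & 0x80) ? 0x1b : 0);
	}

	/* x^2 * b in one step: the two bits shifted out are reduced by a
	 * carry-less multiply by 0x1b, mirroring what pmul does per byte */
	static uint8_t mul_by_x2(uint8_t b)
	{
		uint8_t hi = b >> 6;

		return (uint8_t)(b << 2) ^ ((hi & 1) ? 0x1b : 0)
				         ^ ((hi & 2) ? 0x36 : 0);
	}

	int main(void)
	{
		for (int b = 0; b < 256; b++)	/* check: x^2*b == x*(x*b) */
			if (mul_by_x2(b) != mul_by_x(mul_by_x(b)))
				printf("mismatch at %02x\n", b);
		return 0;
	}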

To allow the ECB and CBC encrypt routines to be reused by the bitsliced
NEON code in a subsequent patch, export them from the module.
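
For readers unfamiliar with the mechanism: EXPORT_SYMBOL adds a function
to the kernel's global symbol table so other modules can resolve it at
load time. A generic sketch, with an illustrative name not taken from
this patch:

	#include <linux/module.h>

	int demo_helper(int x)
	{
		return x + 1;
	}
	EXPORT_SYMBOL(demo_helper);	/* now resolvable from other modules */

	MODULE_LICENSE("GPL");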

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent c458c4ad
arch/arm64/crypto/aes-glue.c  +2 −0
@@ -409,5 +409,7 @@ static int __init aes_init(void)
 module_cpu_feature_match(AES, aes_init);
 #else
 module_init(aes_init);
+EXPORT_SYMBOL(neon_aes_ecb_encrypt);
+EXPORT_SYMBOL(neon_aes_cbc_encrypt);
 #endif
 module_exit(aes_exit);
arch/arm64/crypto/aes-neon.S  +100 −135
@@ -1,7 +1,7 @@
 /*
  * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -17,17 +17,25 @@
 	/* multiply by polynomial 'x' in GF(2^8) */
 	.macro		mul_by_x, out, in, temp, const
 	sshr		\temp, \in, #7
-	add		\out, \in, \in
+	shl		\out, \in, #1
 	and		\temp, \temp, \const
 	eor		\out, \out, \temp
 	.endm

+	/* multiply by polynomial 'x^2' in GF(2^8) */
+	.macro		mul_by_x2, out, in, temp, const
+	ushr		\temp, \in, #6
+	shl		\out, \in, #2
+	pmul		\temp, \temp, \const
+	eor		\out, \out, \temp
+	.endm
+
 	/* preload the entire Sbox */
 	.macro		prepare, sbox, shiftrows, temp
 	adr		\temp, \sbox
-	movi		v12.16b, #0x40
+	movi		v12.16b, #0x1b
 	ldr		q13, \shiftrows
-	movi		v14.16b, #0x1b
+	ldr		q14, .Lror32by8
 	ld1		{v16.16b-v19.16b}, [\temp], #64
 	ld1		{v20.16b-v23.16b}, [\temp], #64
 	ld1		{v24.16b-v27.16b}, [\temp], #64
@@ -50,37 +58,31 @@

 	/* apply SubBytes transformation using the the preloaded Sbox */
 	.macro		sub_bytes, in
-	sub		v9.16b, \in\().16b, v12.16b
+	sub		v9.16b, \in\().16b, v15.16b
 	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
-	sub		v10.16b, v9.16b, v12.16b
+	sub		v10.16b, v9.16b, v15.16b
 	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v11.16b, v10.16b, v12.16b
+	sub		v11.16b, v10.16b, v15.16b
 	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
 	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
 	.endm

 	/* apply MixColumns transformation */
-	.macro		mix_columns, in
-	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
-	rev32		v8.8h, \in\().8h
-	eor		\in\().16b, v10.16b, \in\().16b
-	shl		v9.4s, v8.4s, #24
-	shl		v11.4s, \in\().4s, #24
-	sri		v9.4s, v8.4s, #8
-	sri		v11.4s, \in\().4s, #8
-	eor		v9.16b, v9.16b, v8.16b
-	eor		v10.16b, v10.16b, v9.16b
-	eor		\in\().16b, v10.16b, v11.16b
-	.endm
-
-	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
-	.macro		inv_mix_columns, in
-	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
-	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
-	eor		\in\().16b, \in\().16b, v11.16b
-	rev32		v11.8h, v11.8h
-	eor		\in\().16b, \in\().16b, v11.16b
-	mix_columns	\in
+	.macro		mix_columns, in, enc
+	.if		\enc == 0
+	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	rev32		v8.8h, v8.8h
+	eor		\in\().16b, \in\().16b, v8.16b
+	.endif
+
+	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
+	rev32		v8.8h, \in\().8h
+	eor		v8.16b, v8.16b, v9.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	tbl		\in\().16b, {\in\().16b}, v14.16b
+	eor		\in\().16b, \in\().16b, v8.16b
 	.endm

 	.macro		do_block, enc, in, rounds, rk, rkp, i
@@ -88,16 +90,13 @@
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	movi		v15.16b, #0x40
 	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
 	sub_bytes	\in
-	ld1		{v15.4s}, [\rkp], #16
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns	\in
-	.else
-	inv_mix_columns	\in
-	.endif
+	mix_columns	\in, \enc
 	b		1111b
 2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
 	.endm
@@ -116,48 +115,48 @@
 	 */

 	.macro		sub_bytes_2x, in0, in1
-	sub		v8.16b, \in0\().16b, v12.16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, v8.16b, v12.16b
-	sub		v11.16b, v9.16b, v12.16b
+	sub		v10.16b, v8.16b, v15.16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
+	sub		v11.16b, v9.16b, v15.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v10.16b, v12.16b
-	sub		v9.16b, v11.16b, v12.16b
+	sub		v8.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
+	sub		v9.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	.endm

 	.macro		sub_bytes_4x, in0, in1, in2, in3
-	sub		v8.16b, \in0\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, \in2\().16b, v12.16b
+	sub		v10.16b, \in2\().16b, v15.16b
 	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
-	sub		v11.16b, \in3\().16b, v12.16b
+	sub		v11.16b, \in3\().16b, v15.16b
 	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
 	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
@@ -165,90 +164,65 @@

 	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
 	sshr		\tmp0\().16b, \in0\().16b, #7
-	add		\out0\().16b, \in0\().16b,  \in0\().16b
+	shl		\out0\().16b, \in0\().16b, #1
 	sshr		\tmp1\().16b, \in1\().16b, #7
 	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
-	add		\out1\().16b, \in1\().16b,  \in1\().16b
+	shl		\out1\().16b, \in1\().16b, #1
 	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
 	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
 	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
 	.endm

-	.macro		mix_columns_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	rev32		v10.8h, \in0\().8h
-	rev32		v11.8h, \in1\().8h
-	eor		\in0\().16b, v8.16b, \in0\().16b
-	eor		\in1\().16b, v9.16b, \in1\().16b
-	shl		v12.4s, v10.4s, #24
-	shl		v13.4s, v11.4s, #24
-	eor		v8.16b, v8.16b, v10.16b
-	sri		v12.4s, v10.4s, #8
-	shl		v10.4s, \in0\().4s, #24
-	eor		v9.16b, v9.16b, v11.16b
-	sri		v13.4s, v11.4s, #8
-	shl		v11.4s, \in1\().4s, #24
-	sri		v10.4s, \in0\().4s, #8
-	eor		\in0\().16b, v8.16b, v12.16b
-	sri		v11.4s, \in1\().4s, #8
-	eor		\in1\().16b, v9.16b, v13.16b
-	eor		\in0\().16b, v10.16b, \in0\().16b
-	eor		\in1\().16b, v11.16b, \in1\().16b
+	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
+	ushr		\tmp0\().16b, \in0\().16b, #6
+	shl		\out0\().16b, \in0\().16b, #2
+	ushr		\tmp1\().16b, \in1\().16b, #6
+	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
+	shl		\out1\().16b, \in1\().16b, #2
+	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
+	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
+	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
 	.endm

-	.macro		inv_mix_cols_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
+	.macro		mix_columns_2x, in0, in1, enc
+	.if		\enc == 0
+	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
 	eor		\in0\().16b, \in0\().16b, v8.16b
 	eor		\in1\().16b, \in1\().16b, v9.16b
 	rev32		v8.8h, v8.8h
 	rev32		v9.8h, v9.8h
 	eor		\in0\().16b, \in0\().16b, v8.16b
 	eor		\in1\().16b, \in1\().16b, v9.16b
-	mix_columns_2x	\in0, \in1
-	.endm
-
-	.macro		inv_mix_cols_4x, in0, in1, in2, in3
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
-	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
-	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
-	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	rev32		v8.8h, v8.8h
-	rev32		v9.8h, v9.8h
-	rev32		v10.8h, v10.8h
-	rev32		v11.8h, v11.8h
-	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
+	.endif
+
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
+	rev32		v10.8h, \in0\().8h
+	rev32		v11.8h, \in1\().8h
+	eor		v10.16b, v10.16b, v8.16b
+	eor		v11.16b, v11.16b, v9.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
+	tbl		\in0\().16b, {\in0\().16b}, v14.16b
+	tbl		\in1\().16b, {\in1\().16b}, v14.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
 	.endm

-	.macro		do_block_2x, enc, in0, in1 rounds, rk, rkp, i
+	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
 	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
-	sub_bytes_2x	\in0, \in1
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_2x	\in0, \in1
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_2x	\in0, \in1
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -262,23 +236,17 @@
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
 	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
 	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
-	sub_bytes_4x	\in0, \in1, \in2, \in3
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_4x	\in0, \in1, \in2, \in3
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_4x	\in0, \in1, \in2, \in3
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
+	mix_columns_2x	\in2, \in3, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -305,19 +273,7 @@
 #include "aes-modes.S"

 	.text
-	.align		4
-.LForward_ShiftRows:
-CPU_LE(	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3	)
-CPU_LE(	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb	)
-CPU_BE(	.byte		0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8	)
-CPU_BE(	.byte		0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0	)
-
-.LReverse_ShiftRows:
-CPU_LE(	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb	)
-CPU_LE(	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3	)
-CPU_BE(	.byte		0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8	)
-CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
-
+	.align		6
 .LForward_Sbox:
 	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
 	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
@@ -385,3 +341,12 @@ CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 )
 	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
 	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
 	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+.LForward_ShiftRows:
+	.octa		0x0b06010c07020d08030e09040f0a0500
+
+.LReverse_ShiftRows:
+	.octa		0x0306090c0f0205080b0e0104070a0d00
+
+.Lror32by8:
+	.octa		0x0c0f0e0d080b0a090407060500030201
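
A quick way to see what the new .Lror32by8 constant does: used as a tbl
index vector, it rotates each 32-bit lane right by 8 bits, the same
rotation the removed shl/sri pairs computed. A standalone C check of the
byte permutation (assuming a little-endian byte layout, as in the CPU_LE
arrangement):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* byte layout of .octa 0x0c0f0e0d080b0a090407060500030201 */
		const uint8_t idx[16] = {
			0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04,
			0x09, 0x0a, 0x0b, 0x08, 0x0d, 0x0e, 0x0f, 0x0c,
		};
		uint8_t in[16], out[16];
		uint32_t w[4], r[4];

		for (int i = 0; i < 16; i++)
			in[i] = 0x10 + i;
		for (int i = 0; i < 16; i++)
			out[i] = in[idx[i]];	/* what tbl does per byte */

		memcpy(w, in, 16);
		memcpy(r, out, 16);
		for (int i = 0; i < 4; i++)	/* compare against ror #8 */
			printf("%08x -> %08x (%s)\n", w[i], r[i],
			       r[i] == (w[i] >> 8 | w[i] << 24) ? "ror 8 ok"
								: "MISMATCH");
		return 0;
	}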