Commit 7367bfeb authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR



This implements 5-way interleaving for ECB, CBC decryption and CTR,
resulting in a speedup of ~11% on Marvell ThunderX2, which has a
very deep pipeline and therefore a high issue latency for NEON
instructions operating on the same registers.
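
To illustrate the idea, here is a simplified sketch of one AES round processed 5 ways (not the kernel's actual round macros; v0-v4 are assumed to hold the five blocks and v17 the current round key). Each aese/aesmc pair depends only on its own block, so the core always has independent work to issue while earlier results are still in flight:

	aese	v0.16b, v17.16b		/* AddRoundKey+SubBytes+ShiftRows, block 0 */
	aesmc	v0.16b, v0.16b		/* MixColumns, block 0 */
	aese	v1.16b, v17.16b		/* block 1 */
	aesmc	v1.16b, v1.16b
	aese	v2.16b, v17.16b		/* block 2 */
	aesmc	v2.16b, v2.16b
	aese	v3.16b, v17.16b		/* block 3 */
	aesmc	v3.16b, v3.16b
	aese	v4.16b, v17.16b		/* block 4 */
	aesmc	v4.16b, v4.16b
	/* block 0 is not needed again until the next round's aese,
	 * eight instructions later, which hides the issue latency */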

Note that XTS is left alone: implementing 5-way interleave there
would either involve spilling of the calculated tweaks to the
stack, or recalculating them after the encryption operation, and
doing either of those would most likely penalize low end cores.

For ECB, this is not a concern at all, given that we have plenty
of spare registers. For CTR and CBC decryption, we take advantage
of the fact that v16 is not used by the CE version of the code
(which is the only one targeted by the optimization), and so we
can reshuffle the code a bit and avoid having to spill to memory
(with the exception of one extra reload in the CBC routine)
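
In practice the patch does this with .req register aliases: the shared mode code refers to the IV and counter as cbciv and vctr, and each implementation maps those names onto a register it has free, as the two small hunks below show:

	/* Crypto Extensions code: v16 is otherwise unused, so it can
	 * hold the CBC IV and the CTR block without spilling */
	cbciv		.req	v16
	vctr		.req	v16

	/* pure NEON code: keep the registers it already used */
	cbciv		.req	v7
	vctr		.req	v4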

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent e2174139

arch/arm64/crypto/aes-ce.S: +2 −0
@@ -18,6 +18,8 @@
 	.arch		armv8-a+crypto
 
 	xtsmask		.req	v16
+	cbciv		.req	v16
+	vctr		.req	v16
 
 	.macro		xts_reload_mask, tmp
 	.endm

arch/arm64/crypto/aes-modes.S: +71 −31
@@ -17,6 +17,14 @@
 #define MAX_STRIDE	4
 #endif
 
+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
+
 aes_encrypt_block4x:
 	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
 	ret
@@ -53,14 +61,17 @@ AES_ENTRY(aes_ecb_encrypt)
 	enc_prepare	w3, x2, x5
 
 .LecbencloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lecbenc1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
-	bl		aes_encrypt_block4x
+ST4(	bl		aes_encrypt_block4x		)
+ST5(	ld1		{v4.16b}, [x1], #16		)
+ST5(	bl		aes_encrypt_block5x		)
 	st1		{v0.16b-v3.16b}, [x0], #64
+ST5(	st1		{v4.16b}, [x0], #16		)
 	b		.LecbencloopNx
 .Lecbenc1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lecbencout
 .Lecbencloop:
 	ld1		{v0.16b}, [x1], #16		/* get next pt block */
@@ -81,14 +92,17 @@ AES_ENTRY(aes_ecb_decrypt)
 	dec_prepare	w3, x2, x5
 
 .LecbdecloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lecbdec1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
-	bl		aes_decrypt_block4x
+ST4(	bl		aes_decrypt_block4x		)
+ST5(	ld1		{v4.16b}, [x1], #16		)
+ST5(	bl		aes_decrypt_block5x		)
 	st1		{v0.16b-v3.16b}, [x0], #64
+ST5(	st1		{v4.16b}, [x0], #16		)
 	b		.LecbdecloopNx
 .Lecbdec1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lecbdecout
 .Lecbdecloop:
 	ld1		{v0.16b}, [x1], #16		/* get next ct block */
@@ -148,39 +162,56 @@ AES_ENTRY(aes_cbc_decrypt)
 	stp		x29, x30, [sp, #-16]!
 	mov		x29, sp
 
-	ld1		{v7.16b}, [x5]			/* get iv */
+	ld1		{cbciv.16b}, [x5]		/* get iv */
 	dec_prepare	w3, x2, x6
 
 .LcbcdecloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lcbcdec1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+#if MAX_STRIDE == 5
+	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
+	mov		v5.16b, v0.16b
+	mov		v6.16b, v1.16b
+	mov		v7.16b, v2.16b
+	bl		aes_decrypt_block5x
+	sub		x1, x1, #32
+	eor		v0.16b, v0.16b, cbciv.16b
+	eor		v1.16b, v1.16b, v5.16b
+	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
+	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
+	eor		v2.16b, v2.16b, v6.16b
+	eor		v3.16b, v3.16b, v7.16b
+	eor		v4.16b, v4.16b, v5.16b
+#else
 	mov		v4.16b, v0.16b
 	mov		v5.16b, v1.16b
 	mov		v6.16b, v2.16b
 	bl		aes_decrypt_block4x
 	sub		x1, x1, #16
-	eor		v0.16b, v0.16b, v7.16b
+	eor		v0.16b, v0.16b, cbciv.16b
 	eor		v1.16b, v1.16b, v4.16b
-	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
+	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
 	eor		v2.16b, v2.16b, v5.16b
 	eor		v3.16b, v3.16b, v6.16b
+#endif
 	st1		{v0.16b-v3.16b}, [x0], #64
+ST5(	st1		{v4.16b}, [x0], #16		)
 	b		.LcbcdecloopNx
 .Lcbcdec1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lcbcdecout
 .Lcbcdecloop:
 	ld1		{v1.16b}, [x1], #16		/* get next ct block */
 	mov		v0.16b, v1.16b			/* ...and copy to v0 */
 	decrypt_block	v0, w3, x2, x6, w7
-	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
-	mov		v7.16b, v1.16b			/* ct is next iv */
+	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
+	mov		cbciv.16b, v1.16b		/* ct is next iv */
 	st1		{v0.16b}, [x0], #16
 	subs		w4, w4, #1
 	bne		.Lcbcdecloop
 .Lcbcdecout:
-	st1		{v7.16b}, [x5]			/* return iv */
+	st1		{cbciv.16b}, [x5]		/* return iv */
 	ldp		x29, x30, [sp], #16
 	ret
 AES_ENDPROC(aes_cbc_decrypt)
@@ -274,51 +305,60 @@ AES_ENTRY(aes_ctr_encrypt)
 	mov		x29, sp
 
 	enc_prepare	w3, x2, x6
-	ld1		{v4.16b}, [x5]
+	ld1		{vctr.16b}, [x5]
 
-	umov		x6, v4.d[1]		/* keep swabbed ctr in reg */
+	umov		x6, vctr.d[1]		/* keep swabbed ctr in reg */
 	rev		x6, x6
 	cmn		w6, w4			/* 32 bit overflow? */
 	bcs		.Lctrloop
 .LctrloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lctr1x
 	add		w7, w6, #1
-	mov		v0.16b, v4.16b
+	mov		v0.16b, vctr.16b
 	add		w8, w6, #2
-	mov		v1.16b, v4.16b
+	mov		v1.16b, vctr.16b
+	add		w9, w6, #3
+	mov		v2.16b, vctr.16b
 	add		w9, w6, #3
-	mov		v2.16b, v4.16b
 	rev		w7, w7
-	mov		v3.16b, v4.16b
+	mov		v3.16b, vctr.16b
 	rev		w8, w8
+ST5(	mov		v4.16b, vctr.16b		)
 	mov		v1.s[3], w7
 	rev		w9, w9
+ST5(	add		w10, w6, #4			)
 	mov		v2.s[3], w8
+ST5(	rev		w10, w10			)
 	mov		v3.s[3], w9
+ST5(	mov		v4.s[3], w10			)
 	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
-	bl		aes_encrypt_block4x
+ST4(	bl		aes_encrypt_block4x		)
+ST5(	bl		aes_encrypt_block5x		)
 	eor		v0.16b, v5.16b, v0.16b
-	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
+ST4(	ld1		{v5.16b}, [x1], #16		)
 	eor		v1.16b, v6.16b, v1.16b
+ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
 	eor		v2.16b, v7.16b, v2.16b
 	eor		v3.16b, v5.16b, v3.16b
+ST5(	eor		v4.16b, v6.16b, v4.16b		)
 	st1		{v0.16b-v3.16b}, [x0], #64
-	add		x6, x6, #4
+ST5(	st1		{v4.16b}, [x0], #16		)
+	add		x6, x6, #MAX_STRIDE
 	rev		x7, x6
-	ins		v4.d[1], x7
+	ins		vctr.d[1], x7
 	cbz		w4, .Lctrout
 	b		.LctrloopNx
 .Lctr1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lctrout
 .Lctrloop:
-	mov		v0.16b, v4.16b
+	mov		v0.16b, vctr.16b
 	encrypt_block	v0, w3, x2, x8, w7
 
 	adds		x6, x6, #1		/* increment BE ctr */
 	rev		x7, x6
-	ins		v4.d[1], x7
+	ins		vctr.d[1], x7
 	bcs		.Lctrcarry		/* overflow? */
 
 .Lctrcarrydone:
@@ -330,7 +370,7 @@ AES_ENTRY(aes_ctr_encrypt)
 	bne		.Lctrloop
 
 .Lctrout:
-	st1		{v4.16b}, [x5]		/* return next CTR value */
+	st1		{vctr.16b}, [x5]	/* return next CTR value */
 	ldp		x29, x30, [sp], #16
 	ret
 
@@ -339,11 +379,11 @@ AES_ENTRY(aes_ctr_encrypt)
 	b		.Lctrout
 
 .Lctrcarry:
-	umov		x7, v4.d[0]		/* load upper word of ctr  */
+	umov		x7, vctr.d[0]		/* load upper word of ctr  */
 	rev		x7, x7			/* ... to handle the carry */
 	add		x7, x7, #1
 	rev		x7, x7
-	ins		v4.d[0], x7
+	ins		vctr.d[0], x7
 	b		.Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)
 

arch/arm64/crypto/aes-neon.S: +2 −0
@@ -15,6 +15,8 @@
 #define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
 
 	xtsmask		.req	v7
+	cbciv		.req	v7
+	vctr		.req	v4
 
 	.macro		xts_reload_mask, tmp
 	xts_load_mask	\tmp