
Commit f10dc56c authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64 - revert NEON yield for fast AEAD implementations



As it turns out, checking the TIF_NEED_RESCHED flag after each
iteration results in a significant performance regression (~10%)
when running fast algorithms (i.e., ones that use special instructions
and operate in the < 4 cycles per byte range) on in-order cores with
comparatively slow memory accesses such as the Cortex-A53.
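
To illustrate, here is a rough C analogue of the per-block yield hook
being reverted (a sketch only: the real logic is the assembler macros
if_will_cond_yield_neon / do_cond_yield_neon / endif_yield_neon, and
process_block()/save_state()/reload_state() are hypothetical stand-ins):

	do {
		process_block();		/* hypothetical one-block NEON transform */

		if (test_thread_flag(TIF_NEED_RESCHED)) {
			save_state();		/* hypothetical: spill live NEON regs */
			kernel_neon_end();	/* re-enables preemption, may reschedule */
			kernel_neon_begin();
			reload_state();		/* hypothetical: restore live NEON regs */
		}
	} while (--blocks);

Even when the branch is never taken, the flag load and test sit in the
hot loop, which is presumably where the ~10% shows up on an in-order
core with slow memory.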

Given the speed of these ciphers, and the fact that the page-based
nature of the AEAD scatterwalk API guarantees that the core NEON
transform is never invoked with more than a single page's worth of
input, we can estimate the worst case duration of any resulting
scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages,
processing a page's worth of input at 4 cycles per byte results in
a delay of ~250 us, which is a reasonable upper bound.
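
As a quick sanity check on that figure, a minimal user-space sketch of
the arithmetic (assuming exactly 64 KiB pages, a flat 4 cycles per
byte, and a 1 GHz clock, as in the text):

	#include <stdio.h>

	int main(void)
	{
		const unsigned long page_bytes = 64 * 1024;	/* 64k page size */
		const unsigned long cycles_per_byte = 4;	/* "fast" cipher bound */
		const double hz = 1e9;				/* 1 GHz Cortex-A53 */

		double us = page_bytes * cycles_per_byte / hz * 1e6;
		printf("worst-case blackout: ~%.0f us\n", us);	/* prints ~262 us */
		return 0;
	}

By the same arithmetic, a kernel built with 4 KiB pages would see a
bound of roughly 16 us.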

So let's remove the yield checks from the fused AES-CCM and AES-GCM
routines entirely.

This reverts commit 7b67ae4d and
partially reverts commit 7c50136a.

Fixes: 7c50136a ("crypto: arm64/aes-ghash - yield NEON after every ...")
Fixes: 7b67ae4d ("crypto: arm64/aes-ccm - yield NEON after every ...")
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 46d8c4b2
arch/arm64/crypto/aes-ce-ccm-core.S  +55 −95
@@ -19,33 +19,24 @@
 	 *			     u32 *macp, u8 const rk[], u32 rounds);
 	 */
 ENTRY(ce_aes_ccm_auth_data)
-	frame_push	7
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-
-	ldr	w25, [x22]			/* leftover from prev round? */
+	ldr	w8, [x3]			/* leftover from prev round? */
 	ld1	{v0.16b}, [x0]			/* load mac */
-	cbz	w25, 1f
-	sub	w25, w25, #16
+	cbz	w8, 1f
+	sub	w8, w8, #16
 	eor	v1.16b, v1.16b, v1.16b
-0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
-	subs	w21, w21, #1
-	add	w25, w25, #1
+0:	ldrb	w7, [x1], #1			/* get 1 byte of input */
+	subs	w2, w2, #1
+	add	w8, w8, #1
 	ins	v1.b[0], w7
 	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
 	beq	8f				/* out of input? */
-	cbnz	w25, 0b
+	cbnz	w8, 0b
 	eor	v0.16b, v0.16b, v1.16b
-1:	ld1	{v3.4s}, [x23]			/* load first round key */
-	prfm	pldl1strm, [x20]
-	cmp	w24, #12			/* which key size? */
-	add	x6, x23, #16
-	sub	w7, w24, #2			/* modified # of rounds */
+1:	ld1	{v3.4s}, [x4]			/* load first round key */
+	prfm	pldl1strm, [x1]
+	cmp	w5, #12				/* which key size? */
+	add	x6, x4, #16
+	sub	w7, w5, #2			/* modified # of rounds */
 	bmi	2f
 	bne	5f
 	mov	v5.16b, v3.16b
@@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data)
 	ld1	{v5.4s}, [x6], #16		/* load next round key */
 	bpl	3b
 	aese	v0.16b, v4.16b
-	subs	w21, w21, #16			/* last data? */
+	subs	w2, w2, #16			/* last data? */
 	eor	v0.16b, v0.16b, v5.16b		/* final round */
 	bmi	6f
-	ld1	{v1.16b}, [x20], #16		/* load next input block */
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
 	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
-	beq	6f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x19]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x19]			/* reload mac */
-	endif_yield_neon
-
-	b	1b
-6:	st1	{v0.16b}, [x19]			/* store mac */
+	bne	1b
+6:	st1	{v0.16b}, [x0]			/* store mac */
 	beq	10f
-	adds	w21, w21, #16
+	adds	w2, w2, #16
 	beq	10f
-	mov	w25, w21
-7:	ldrb	w7, [x20], #1
+	mov	w8, w2
+7:	ldrb	w7, [x1], #1
 	umov	w6, v0.b[0]
 	eor	w6, w6, w7
-	strb	w6, [x19], #1
-	subs	w21, w21, #1
+	strb	w6, [x0], #1
+	subs	w2, w2, #1
 	beq	10f
 	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
 	b	7b
-8:	mov	w7, w25
-	add	w25, w25, #16
+8:	mov	w7, w8
+	add	w8, w8, #16
 9:	ext	v1.16b, v1.16b, v1.16b, #1
 	adds	w7, w7, #1
 	bne	9b
 	eor	v0.16b, v0.16b, v1.16b
-	st1	{v0.16b}, [x19]
-10:	str	w25, [x22]
-
-	frame_pop
+	st1	{v0.16b}, [x0]
+10:	str	w8, [x3]
 	ret
 ENDPROC(ce_aes_ccm_auth_data)
 
@@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final)
 ENDPROC(ce_aes_ccm_final)
 
 	.macro	aes_ccm_do_crypt,enc
-	frame_push	8
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	mov	x25, x6
-
-	ldr	x26, [x25, #8]			/* load lower ctr */
-	ld1	{v0.16b}, [x24]			/* load mac */
-CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
+	ldr	x8, [x6, #8]			/* load lower ctr */
+	ld1	{v0.16b}, [x5]			/* load mac */
+CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
 0:	/* outer loop */
-	ld1	{v1.8b}, [x25]			/* load upper ctr */
-	prfm	pldl1strm, [x20]
-	add	x26, x26, #1
-	rev	x9, x26
-	cmp	w23, #12			/* which key size? */
-	sub	w7, w23, #2			/* get modified # of rounds */
+	ld1	{v1.8b}, [x6]			/* load upper ctr */
+	prfm	pldl1strm, [x1]
+	add	x8, x8, #1
+	rev	x9, x8
+	cmp	w4, #12				/* which key size? */
+	sub	w7, w4, #2			/* get modified # of rounds */
 	ins	v1.d[1], x9			/* no carry in lower ctr */
-	ld1	{v3.4s}, [x22]			/* load first round key */
-	add	x10, x22, #16
+	ld1	{v3.4s}, [x3]			/* load first round key */
+	add	x10, x3, #16
 	bmi	1f
 	bne	4f
 	mov	v5.16b, v3.16b
@@ -194,9 +165,9 @@ CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 	bpl	2b
 	aese	v0.16b, v4.16b
 	aese	v1.16b, v4.16b
-	subs	w21, w21, #16
-	bmi	7f				/* partial block? */
-	ld1	{v2.16b}, [x20], #16		/* load next input block */
+	subs	w2, w2, #16
+	bmi	6f				/* partial block? */
+	ld1	{v2.16b}, [x1], #16		/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
 	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
@@ -205,29 +176,18 @@ CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
 	.endif
 	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
-	st1	{v1.16b}, [x19], #16		/* write output block */
-	beq	5f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x24]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x24]			/* reload mac */
-	endif_yield_neon
-
-	b	0b
-5:
-CPU_LE(	rev	x26, x26			)
-	st1	{v0.16b}, [x24]			/* store mac */
-	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */
-
-6:	frame_pop
-	ret
-
-7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+	st1	{v1.16b}, [x0], #16		/* write output block */
+	bne	0b
+CPU_LE(	rev	x8, x8			)
+	st1	{v0.16b}, [x5]			/* store mac */
+	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
+5:	ret
+
+6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-	st1	{v0.16b}, [x24]			/* store mac */
-	add	w21, w21, #16			/* process partial tail block */
-8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
+	st1	{v0.16b}, [x5]			/* store mac */
+	add	w2, w2, #16			/* process partial tail block */
+7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
 	umov	w6, v1.b[0]			/* get top crypted ctr byte */
 	umov	w7, v0.b[0]			/* get top mac byte */
 	.if	\enc == 1
@@ -237,13 +197,13 @@ CPU_LE(	rev	x26, x26		)
 	eor	w9, w9, w6
 	eor	w7, w7, w9
 	.endif
-	strb	w9, [x19], #1			/* store out byte */
-	strb	w7, [x24], #1			/* store mac byte */
-	subs	w21, w21, #1
-	beq	6b
+	strb	w9, [x0], #1			/* store out byte */
+	strb	w7, [x5], #1			/* store mac byte */
+	subs	w2, w2, #1
+	beq	5b
 	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
 	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
-	b	8b
+	b	7b
 	.endm
 
 	/*
arch/arm64/crypto/ghash-ce-core.S  +25 −51
@@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8)
 	.endm
 
 	.macro		pmull_gcm_do_crypt, enc
-	frame_push	10
+	ld1		{SHASH.2d}, [x4]
+	ld1		{XL.2d}, [x1]
+	ldr		x8, [x5, #8]			// load lower counter
 
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x5
-	mov		x25, x6
-	mov		x26, x7
-	.if		\enc == 1
-	ldr		x27, [sp, #96]			// first stacked arg
-	.endif
-
-	ldr		x28, [x24, #8]			// load lower counter
-CPU_LE(	rev		x28, x28	)
-
-0:	mov		x0, x25
-	load_round_keys	w26, x0
-	ld1		{SHASH.2d}, [x23]
-	ld1		{XL.2d}, [x20]
+	load_round_keys	w7, x6
 
 	movi		MASK.16b, #0xe1
 	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE(	rev		x8, x8		)
 	shl		MASK.2d, MASK.2d, #57
 	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	.if		\enc == 1
-	ld1		{KS.16b}, [x27]
+	ldr		x10, [sp]
+	ld1		{KS.16b}, [x10]
 	.endif
 
-1:	ld1		{CTR.8b}, [x24]			// load upper counter
-	ld1		{INP.16b}, [x22], #16
-	rev		x9, x28
-	add		x28, x28, #1
-	sub		w19, w19, #1
+0:	ld1		{CTR.8b}, [x5]			// load upper counter
+	ld1		{INP.16b}, [x3], #16
+	rev		x9, x8
+	add		x8, x8, #1
+	sub		w0, w0, #1
 	ins		CTR.d[1], x9			// set lower counter
 
 	.if		\enc == 1
 	eor		INP.16b, INP.16b, KS.16b	// encrypt input
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
 
 	rev64		T1.16b, INP.16b
 
-	cmp		w26, #12
-	b.ge		4f				// AES-192/256?
+	cmp		w7, #12
+	b.ge		2f				// AES-192/256?
 
-2:	enc_round	CTR, v21
+1:	enc_round	CTR, v21
 
 	ext		T2.16b, XL.16b, XL.16b, #8
 	ext		IN1.16b, T1.16b, T1.16b, #8
@@ -425,39 +411,27 @@ CPU_LE(	rev		x28, x28	)
 
 	.if		\enc == 0
 	eor		INP.16b, INP.16b, KS.16b
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
 
-	cbz		w19, 3f
+	cbnz		w0, 0b
 
-	if_will_cond_yield_neon
-	st1		{XL.2d}, [x20]
-	.if		\enc == 1
-	st1		{KS.16b}, [x27]
-	.endif
-	do_cond_yield_neon
-	b		0b
-	endif_yield_neon
+CPU_LE(	rev		x8, x8		)
+	st1		{XL.2d}, [x1]
+	str		x8, [x5, #8]			// store lower counter
 
-	b		1b
-
-3:	st1		{XL.2d}, [x20]
 	.if		\enc == 1
-	st1		{KS.16b}, [x27]
+	st1		{KS.16b}, [x10]
 	.endif
 
-CPU_LE(	rev		x28, x28	)
-	str		x28, [x24, #8]			// store lower counter
-
-	frame_pop
 	ret
 
-4:	b.eq		5f				// AES-192?
+2:	b.eq		3f				// AES-192?
 	enc_round	CTR, v17
 	enc_round	CTR, v18
-5:	enc_round	CTR, v19
+3:	enc_round	CTR, v19
 	enc_round	CTR, v20
-	b		2b
+	b		1b
 	.endm
 
 	/*