Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b913a640, authored by Ard Biesheuvel and committed by Catalin Marinas
Browse files

arm64/crypto: improve performance of GHASH algorithm



This patch modifies the GHASH secure hash implementation to switch to a
faster, polynomial multiplication based reduction instead of one that uses
shifts and rotates.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent 6aa8b209
Loading
Loading
Loading
Loading
+38 −54
Original line number Diff line number Diff line
@@ -3,14 +3,6 @@
 *
 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *           Vinodh Gopal
 *           Erdinc Ozturk
 *           Deniz Karakoyunlu
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
@@ -19,13 +11,15 @@
#include <linux/linkage.h>
#include <asm/assembler.h>

	/*
	 * Symbolic names for the NEON registers used by pmull_ghash_update.
	 *
	 * NOTE(review): this is a rendered diff without its +/- markers, so
	 * the alias set being removed and the one being added are listed
	 * together. That is why SHASH and IN1 each appear twice with different
	 * registers, and why v4 and v5 each carry two unrelated names
	 * (T3/MASK and VZR/XL). Names such as DATA, T3 and VZR appear to
	 * belong to the pre-patch code only - confirm against the real file.
	 *
	 * Some names deliberately share a register: IN1 and T1 are both v2
	 * in the first group, and XH and IN1 are both v7 in the second.
	 */
	DATA	.req	v0
	SHASH	.req	v1
	IN1	.req	v2
	SHASH	.req	v0
	SHASH2	.req	v1
	T1	.req	v2
	T2	.req	v3
	T3	.req	v4
	VZR	.req	v5
	MASK	.req	v4
	XL	.req	v5
	XM	.req	v6
	XH	.req	v7
	IN1	.req	v7	// same register as XH

	.text
	.arch		armv8-a+crypto
@@ -35,61 +29,51 @@
	 *			   struct ghash_key const *k, const char *head)
	 */
/*
 * Fold zero or more 16-byte blocks into a GHASH digest using PMULL.
 *
 * NOTE(review): this listing is a rendered diff whose +/- markers were lost.
 * Lines removed and lines added by the patch sit back to back (for example
 * the two loads from [x1], the duplicated numeric labels 0: and 1:, and the
 * two stores before ret), so it must not be read as one linear instruction
 * sequence. Lines that use DATA, T3 or VZR appear to be the pre-patch code;
 * lines that use XL, XM, XH, MASK or SHASH2 appear to be the post-patch code.
 *
 * Register use, as shown by the code below and by the C call
 * pmull_ghash_update(blocks, ctx->digest, src, key, head):
 *   w0 - number of blocks still to be read from x2 (decremented per block)
 *   x1 - 16-byte digest, loaded on entry and stored back before ret
 *   x2 - source data, post-incremented by 16 for every block read
 *   x3 - hash key, loaded into SHASH
 *   x4 - optional head block; ignored when zero
 */
ENTRY(pmull_ghash_update)
	ld1		{DATA.16b}, [x1]
	ld1		{SHASH.16b}, [x3]
	eor		VZR.16b, VZR.16b, VZR.16b	// VZR = 0
	ld1		{XL.16b}, [x1]

	/*
	 * MASK: every byte is set to 0xe1, then each 64-bit lane is shifted
	 * left by 57, which leaves 0xc200000000000000 in both lanes. It is an
	 * operand of the two pmull instructions in the reduction further down.
	 * SHASH2: ext #8 swaps the two halves of SHASH, and the eor then puts
	 * (high half ^ low half) of SHASH into both lanes; it is the
	 * (b1 + b0) operand of the middle product.
	 */
	movi		MASK.16b, #0xe1
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	/* do the head block first, if supplied */
	cbz		x4, 0f				// no head: read from x2
	ld1		{IN1.2d}, [x4]
	ld1		{T1.2d}, [x4]
	b		1f				// head does not touch w0

0:	ld1		{IN1.2d}, [x2], #16
0:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1			// one block taken from x2
1:	ext		IN1.16b, IN1.16b, IN1.16b, #8
CPU_LE(	rev64		IN1.16b, IN1.16b	)
	eor		DATA.16b, DATA.16b, IN1.16b

	/* multiply DATA by SHASH in GF(2^128) */
	ext		T2.16b, DATA.16b, DATA.16b, #8
	ext		T3.16b, SHASH.16b, SHASH.16b, #8
	eor		T2.16b, T2.16b, DATA.16b
	eor		T3.16b, T3.16b, SHASH.16b
1:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	pmull2		T1.1q, SHASH.2d, DATA.2d	// a1 * b1
	pmull		DATA.1q, SHASH.1d, DATA.1d	// a0 * b0
	pmull		T2.1q, T2.1d, T3.1d		// (a1 + a0)(b1 + b0)
	eor		T2.16b, T2.16b, T1.16b		// (a0 * b1) + (a1 * b0)
	eor		T2.16b, T2.16b, DATA.16b
	/*
	 * The next four instructions swap the halves of XL and of the input
	 * block (ext #8), then xor the swapped input into XL and the swapped
	 * XL into T1. IN1 shares v7 with XH, which is only written by the
	 * pmull2 below, after IN1 has been consumed.
	 */
	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	ext		T3.16b, VZR.16b, T2.16b, #8
	ext		T2.16b, T2.16b, VZR.16b, #8
	eor		DATA.16b, DATA.16b, T3.16b
	eor		T1.16b, T1.16b, T2.16b	// <T1:DATA> is result of
						// carry-less multiplication
	/*
	 * NOTE(review): in the post-patch sequence the eor below appears to
	 * leave (high half ^ low half) of the updated XL in both lanes of T1,
	 * i.e. the (a1 + a0) operand for the XM product - confirm against the
	 * real file, since pre-patch lines are interleaved here.
	 */
	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	/* first phase of the reduction */
	shl		T3.2d, DATA.2d, #1
	eor		T3.16b, T3.16b, DATA.16b
	shl		T3.2d, T3.2d, #5
	eor		T3.16b, T3.16b, DATA.16b
	shl		T3.2d, T3.2d, #57
	ext		T2.16b, VZR.16b, T3.16b, #8
	ext		T3.16b, T3.16b, VZR.16b, #8
	eor		DATA.16b, DATA.16b, T2.16b
	eor		T1.16b, T1.16b, T3.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b
	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d		// low half of XL times MASK

	/* second phase of the reduction */
	ushr		T2.2d, DATA.2d, #5
	eor		T2.16b, T2.16b, DATA.16b
	ushr		T2.2d, T2.2d, #1
	eor		T2.16b, T2.16b, DATA.16b
	ushr		T2.2d, T2.2d, #1
	eor		T1.16b, T1.16b, T2.16b
	eor		DATA.16b, DATA.16b, T1.16b
	mov		XH.d[0], XM.d[1]		// XH low lane = XM high lane
	mov		XM.d[1], XL.d[0]		// XM high lane = XL low lane

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d		// second multiply by MASK
	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b				// more blocks to read from x2

	/* write the updated digest back to [x1] */
	st1		{DATA.16b}, [x1]
	st1		{XL.16b}, [x1]
	ret
ENDPROC(pmull_ghash_update)
+2 −2
Original line number Diff line number Diff line
@@ -67,7 +67,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
		blocks = len / GHASH_BLOCK_SIZE;
		len %= GHASH_BLOCK_SIZE;

		kernel_neon_begin_partial(6);
		kernel_neon_begin_partial(8);
		pmull_ghash_update(blocks, ctx->digest, src, key,
				   partial ? ctx->buf : NULL);
		kernel_neon_end();
@@ -89,7 +89,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)

		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);

		kernel_neon_begin_partial(6);
		kernel_neon_begin_partial(8);
		pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
		kernel_neon_end();
	}