
Commit b913a640 authored by Ard Biesheuvel, committed by Catalin Marinas

arm64/crypto: improve performance of GHASH algorithm



This patch modifies the GHASH secure hash implementation to switch to a
faster, polynomial multiplication based reduction instead of one that uses
shifts and rotates.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent 6aa8b209
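
For reference: GHASH multiplies 128-bit blocks in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, which in GCM's reflected bit order appears as the byte 0xe1, the same constant the new assembly below loads into MASK. The following bitwise C multiply is a minimal standalone sketch following NIST SP 800-38D (not code from this commit) that shows where that constant enters:

#include <stdint.h>
#include <string.h>

/* Bitwise GHASH multiplication in GF(2^128), per NIST SP 800-38D.
 * Blocks are 16 bytes; bit 0 is the most significant bit of byte 0
 * (GCM's reflected bit order). Slow, but useful as a reference. */
static void ghash_mul(uint8_t r[16], const uint8_t x[16], const uint8_t y[16])
{
	uint8_t z[16] = { 0 };
	uint8_t v[16];
	int i, j;

	memcpy(v, y, 16);
	for (i = 0; i < 128; i++) {
		/* if bit i of x is set, z ^= v */
		if (x[i / 8] & (0x80 >> (i % 8)))
			for (j = 0; j < 16; j++)
				z[j] ^= v[j];

		/* v = v * x mod (x^128 + x^7 + x^2 + x + 1): shift right
		 * one bit, then fold the carried-out bit back in as 0xe1 */
		int lsb = v[15] & 1;
		for (j = 15; j > 0; j--)
			v[j] = (v[j] >> 1) | (v[j - 1] << 7);
		v[0] >>= 1;
		if (lsb)
			v[0] ^= 0xe1;	/* the constant the patch loads into MASK */
	}
	memcpy(r, z, 16);
}

The old assembly performed this reduction on the 256-bit carry-less product with chains of shifts and XORs; the patch instead folds it with two extra PMULL instructions against MASK, which is cheaper on cores with a fast polynomial multiplier.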
arch/arm64/crypto/ghash-ce-core.S  +38 −54

@@ -3,14 +3,6 @@
  *
  * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
- * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
- *
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- *           Vinodh Gopal
- *           Erdinc Ozturk
- *           Deniz Karakoyunlu
- *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
  * by the Free Software Foundation.
@@ -19,13 +11,15 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-	DATA	.req	v0
-	SHASH	.req	v1
-	IN1	.req	v2
+	SHASH	.req	v0
+	SHASH2	.req	v1
 	T1	.req	v2
 	T2	.req	v3
-	T3	.req	v4
-	VZR	.req	v5
+	MASK	.req	v4
+	XL	.req	v5
+	XM	.req	v6
+	XH	.req	v7
+	IN1	.req	v7
 
 	.text
 	.arch		armv8-a+crypto
@@ -35,61 +29,51 @@
 	 *			   struct ghash_key const *k, const char *head)
 	 */
 ENTRY(pmull_ghash_update)
-	ld1		{DATA.16b}, [x1]
 	ld1		{SHASH.16b}, [x3]
-	eor		VZR.16b, VZR.16b, VZR.16b
+	ld1		{XL.16b}, [x1]
+	movi		MASK.16b, #0xe1
+	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+	shl		MASK.2d, MASK.2d, #57
+	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	/* do the head block first, if supplied */
 	cbz		x4, 0f
-	ld1		{IN1.2d}, [x4]
+	ld1		{T1.2d}, [x4]
 	b		1f
 
-0:	ld1		{IN1.2d}, [x2], #16
+0:	ld1		{T1.2d}, [x2], #16
 	sub		w0, w0, #1
-1:	ext		IN1.16b, IN1.16b, IN1.16b, #8
-CPU_LE(	rev64		IN1.16b, IN1.16b	)
-	eor		DATA.16b, DATA.16b, IN1.16b
 
-	/* multiply DATA by SHASH in GF(2^128) */
-	ext		T2.16b, DATA.16b, DATA.16b, #8
-	ext		T3.16b, SHASH.16b, SHASH.16b, #8
-	eor		T2.16b, T2.16b, DATA.16b
-	eor		T3.16b, T3.16b, SHASH.16b
+1:	/* multiply XL by SHASH in GF(2^128) */
+CPU_LE(	rev64		T1.16b, T1.16b	)
 
-	pmull2		T1.1q, SHASH.2d, DATA.2d	// a1 * b1
-	pmull		DATA.1q, SHASH.1d, DATA.1d	// a0 * b0
-	pmull		T2.1q, T2.1d, T3.1d		// (a1 + a0)(b1 + b0)
-	eor		T2.16b, T2.16b, T1.16b		// (a0 * b1) + (a1 * b0)
-	eor		T2.16b, T2.16b, DATA.16b
+	ext		T2.16b, XL.16b, XL.16b, #8
+	ext		IN1.16b, T1.16b, T1.16b, #8
+	eor		T1.16b, T1.16b, T2.16b
+	eor		XL.16b, XL.16b, IN1.16b
 
-	ext		T3.16b, VZR.16b, T2.16b, #8
-	ext		T2.16b, T2.16b, VZR.16b, #8
-	eor		DATA.16b, DATA.16b, T3.16b
-	eor		T1.16b, T1.16b, T2.16b	// <T1:DATA> is result of
-						// carry-less multiplication
+	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
+	eor		T1.16b, T1.16b, XL.16b
+	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
+	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)
 
-	/* first phase of the reduction */
-	shl		T3.2d, DATA.2d, #1
-	eor		T3.16b, T3.16b, DATA.16b
-	shl		T3.2d, T3.2d, #5
-	eor		T3.16b, T3.16b, DATA.16b
-	shl		T3.2d, T3.2d, #57
-	ext		T2.16b, VZR.16b, T3.16b, #8
-	ext		T3.16b, T3.16b, VZR.16b, #8
-	eor		DATA.16b, DATA.16b, T2.16b
-	eor		T1.16b, T1.16b, T3.16b
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		T2.16b, XL.16b, XH.16b
+	eor		XM.16b, XM.16b, T1.16b
+	eor		XM.16b, XM.16b, T2.16b
+	pmull		T2.1q, XL.1d, MASK.1d
 
-	/* second phase of the reduction */
-	ushr		T2.2d, DATA.2d, #5
-	eor		T2.16b, T2.16b, DATA.16b
-	ushr		T2.2d, T2.2d, #1
-	eor		T2.16b, T2.16b, DATA.16b
-	ushr		T2.2d, T2.2d, #1
-	eor		T1.16b, T1.16b, T2.16b
-	eor		DATA.16b, DATA.16b, T1.16b
+	mov		XH.d[0], XM.d[1]
+	mov		XM.d[1], XL.d[0]
+
+	eor		XL.16b, XM.16b, T2.16b
+	ext		T2.16b, XL.16b, XL.16b, #8
+	pmull		XL.1q, XL.1d, MASK.1d
+	eor		T2.16b, T2.16b, XH.16b
+	eor		XL.16b, XL.16b, T2.16b
 
 	cbnz		w0, 0b
 
-	st1		{DATA.16b}, [x1]
+	st1		{XL.16b}, [x1]
 	ret
 ENDPROC(pmull_ghash_update)
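
Both the old and the new version rely on the Karatsuba trick visible in the PMULL comments above: a 128x128-bit carry-less multiply takes only three 64x64-bit PMULLs, because with XOR as addition the middle term a1*b0 + a0*b1 equals (a1 + a0)(b1 + b0) + a1*b1 + a0*b0 (the rewrite additionally hoists the operand XOR out of the loop as SHASH2). A standalone C check of that identity, using a slow bitwise stand-in for PMULL (helper names are made up for illustration):

#include <stdint.h>
#include <stdio.h>

/* Bitwise 64x64 -> 128-bit carry-less multiply (stand-in for PMULL). */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;
	int i;

	for (i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			l ^= a << i;
			if (i)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}

int main(void)
{
	uint64_t a1 = 0x0123456789abcdefULL, a0 = 0xfedcba9876543210ULL;
	uint64_t b1 = 0x0f1e2d3c4b5a6978ULL, b0 = 0x8796a5b4c3d2e1f0ULL;
	uint64_t hh_h, hh_l, ll_h, ll_l, hl_h, hl_l, lh_h, lh_l, m_h, m_l;

	/* Schoolbook middle term: a1*b0 + a0*b1 (four multiplies total). */
	clmul64(a1, b1, &hh_h, &hh_l);		/* a1 * b1 */
	clmul64(a0, b0, &ll_h, &ll_l);		/* a0 * b0 */
	clmul64(a1, b0, &hl_h, &hl_l);
	clmul64(a0, b1, &lh_h, &lh_l);

	/* Karatsuba middle term: (a1 + a0)(b1 + b0) + a1*b1 + a0*b0,
	 * using only the three multiplies the assembly issues. */
	clmul64(a1 ^ a0, b1 ^ b0, &m_h, &m_l);
	m_h ^= hh_h ^ ll_h;
	m_l ^= hh_l ^ ll_l;

	printf("middle terms match: %s\n",
	       (m_h == (hl_h ^ lh_h) && m_l == (hl_l ^ lh_l)) ? "yes" : "no");
	return 0;
}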
arch/arm64/crypto/ghash-ce-glue.c  +2 −2

@@ -67,7 +67,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
 		blocks = len / GHASH_BLOCK_SIZE;
 		len %= GHASH_BLOCK_SIZE;
 
-		kernel_neon_begin_partial(6);
+		kernel_neon_begin_partial(8);
 		pmull_ghash_update(blocks, ctx->digest, src, key,
 				   partial ? ctx->buf : NULL);
 		kernel_neon_end();
@@ -89,7 +89,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
 
 		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
 
-		kernel_neon_begin_partial(6);
+		kernel_neon_begin_partial(8);
 		pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
 		kernel_neon_end();
 	}
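
The bump from kernel_neon_begin_partial(6) to kernel_neon_begin_partial(8) mirrors the new register allocation: the old assembly only clobbered v0–v5, whereas the rewritten routine touches v0–v7 (SHASH through XH/IN1), and the partial-save API preserves exactly the number of NEON registers it is asked for.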