Commit 3759ee05 authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm/ghash - add NEON accelerated fallback for vmull.p64

Implement a NEON fallback for systems that do support NEON but have
no support for the optional 64x64->128 polynomial multiplication
instruction that is part of the ARMv8 Crypto Extensions. It is based
on the paper "Fast Software Polynomial Multiplication on ARM Processors
Using the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
Ricardo Dahab (https://hal.inria.fr/hal-01506572).

On a 32-bit guest executing under KVM on a Cortex-A57, the new code is
not only 4x faster than the generic table-based GHASH driver but also
time invariant. (Note that the existing vmull.p64 code is 16x faster on
this core.)

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 537c1445
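
For illustration (an editor's sketch, not part of the patch): vmull.p8
performs eight 8x8->16 bit carry-less multiplies in parallel, and the
scheme from the paper assembles the full 64x64->128 bit product out of
such byte-wide partial products, XORed together because polynomial
addition over GF(2) has no carries. A minimal C model of that
decomposition (all names below are ours):

#include <stdint.h>
#include <stdio.h>

/* What a single lane of vmull.p8 computes: an 8x8 -> 16 bit
 * carry-less (polynomial) multiply. */
static uint16_t clmul8(uint8_t a, uint8_t b)
{
	uint16_t r = 0;

	for (int i = 0; i < 8; i++)
		if ((b >> i) & 1)
			r ^= (uint16_t)a << i;
	return r;
}

/* 64x64 -> 128 bit carry-less multiply built from byte-wide partial
 * products; the partial products are combined with XOR since GF(2)[x]
 * addition has no carries. */
static void clmul64(uint64_t a, uint64_t b, uint64_t r[2])
{
	r[0] = r[1] = 0;

	for (int i = 0; i < 8; i++) {
		for (int j = 0; j < 8; j++) {
			uint64_t p = clmul8(a >> (8 * i), b >> (8 * j));
			int s = 8 * (i + j);	/* bit offset of this product */

			if (s < 64)
				r[0] ^= p << s;
			if (s >= 64)
				r[1] ^= p << (s - 64);
			else if (s > 48)	/* product straddles both halves */
				r[1] ^= p >> (64 - s);
		}
	}
}

int main(void)
{
	uint64_t r[2];

	clmul64(1ULL << 63, 1ULL << 63, r);	/* x^63 * x^63 = x^126 */
	printf("%016llx %016llx\n", (unsigned long long)r[1],
	       (unsigned long long)r[0]);	/* 4000000000000000 0000000000000000 */
	return 0;
}

The __pmull_p8 macro in the patch computes the same 64 byte products
with only eight vmull.p8 issues (eight lanes each), using vext rotations
of the operands and the k16/k32/k48 masks to align and combine the
partial products.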

arch/arm/crypto/Kconfig  +3 −2

@@ -94,14 +94,15 @@ config CRYPTO_AES_ARM_CE
 	  ARMv8 Crypto Extensions
 
 config CRYPTO_GHASH_ARM_CE
-	tristate "PMULL-accelerated GHASH using ARMv8 Crypto Extensions"
+	tristate "PMULL-accelerated GHASH using NEON/ARMv8 Crypto Extensions"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
 	select CRYPTO_CRYPTD
 	help
 	  Use an implementation of GHASH (used by the GCM AEAD chaining mode)
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
-	  that is part of the ARMv8 Crypto Extensions
+	  that is part of the ARMv8 Crypto Extensions, or a slower variant that
+	  uses the vmull.p8 instruction that is part of the basic NEON ISA.
 
 config CRYPTO_CRCT10DIF_ARM_CE
 	tristate "CRCT10DIF digest algorithm using PMULL instructions"

arch/arm/crypto/ghash-ce-core.S  +191 −43

@@ -1,7 +1,7 @@
 /*
- * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
+ * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
  *
- * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
@@ -12,40 +12,162 @@
 #include <asm/assembler.h>
 
 	SHASH		.req	q0
-	SHASH2		.req	q1
-	T1		.req	q2
-	T2		.req	q3
-	MASK		.req	q4
-	XL		.req	q5
-	XM		.req	q6
-	XH		.req	q7
-	IN1		.req	q7
+	T1		.req	q1
+	XL		.req	q2
+	XM		.req	q3
+	XH		.req	q4
+	IN1		.req	q4
 
 	SHASH_L		.req	d0
 	SHASH_H		.req	d1
-	SHASH2_L	.req	d2
-	T1_L		.req	d4
-	MASK_L		.req	d8
-	XL_L		.req	d10
-	XL_H		.req	d11
-	XM_L		.req	d12
-	XM_H		.req	d13
-	XH_L		.req	d14
+	T1_L		.req	d2
+	T1_H		.req	d3
+	XL_L		.req	d4
+	XL_H		.req	d5
+	XM_L		.req	d6
+	XM_H		.req	d7
+	XH_L		.req	d8
+
+	t0l		.req	d10
+	t0h		.req	d11
+	t1l		.req	d12
+	t1h		.req	d13
+	t2l		.req	d14
+	t2h		.req	d15
+	t3l		.req	d16
+	t3h		.req	d17
+	t4l		.req	d18
+	t4h		.req	d19
+
+	t0q		.req	q5
+	t1q		.req	q6
+	t2q		.req	q7
+	t3q		.req	q8
+	t4q		.req	q9
+	T2		.req	q9
+
+	s1l		.req	d20
+	s1h		.req	d21
+	s2l		.req	d22
+	s2h		.req	d23
+	s3l		.req	d24
+	s3h		.req	d25
+	s4l		.req	d26
+	s4h		.req	d27
+
+	MASK		.req	d28
+	SHASH2_p8	.req	d28
+
+	k16		.req	d29
+	k32		.req	d30
+	k48		.req	d31
+	SHASH2_p64	.req	d31
 
 	.text
 	.fpu		crypto-neon-fp-armv8
 
+	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
+	vmull.p64	\rd, \rn, \rm
+	.endm
+
 	/*
-	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-	 *			   struct ghash_key const *k, const char *head)
+	 * This implementation of 64x64 -> 128 bit polynomial multiplication
+	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
+	 * "Fast Software Polynomial Multiplication on ARM Processors Using
+	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
+	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
+	 *
+	 * It has been slightly tweaked for in-order performance, and to allow
+	 * 'rq' to overlap with 'ad' or 'bd'.
 	 */
-ENTRY(pmull_ghash_update)
-	vld1.64		{SHASH}, [r3]
+	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
+	vext.8		t0l, \ad, \ad, #1	@ A1
+	.ifc		\b1, t4l
+	vext.8		t4l, \bd, \bd, #1	@ B1
+	.endif
+	vmull.p8	t0q, t0l, \bd		@ F = A1*B
+	vext.8		t1l, \ad, \ad, #2	@ A2
+	vmull.p8	t4q, \ad, \b1		@ E = A*B1
+	.ifc		\b2, t3l
+	vext.8		t3l, \bd, \bd, #2	@ B2
+	.endif
+	vmull.p8	t1q, t1l, \bd		@ H = A2*B
+	vext.8		t2l, \ad, \ad, #3	@ A3
+	vmull.p8	t3q, \ad, \b2		@ G = A*B2
+	veor		t0q, t0q, t4q		@ L = E + F
+	.ifc		\b3, t4l
+	vext.8		t4l, \bd, \bd, #3	@ B3
+	.endif
+	vmull.p8	t2q, t2l, \bd		@ J = A3*B
+	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
+	veor		t1q, t1q, t3q		@ M = G + H
+	.ifc		\b4, t3l
+	vext.8		t3l, \bd, \bd, #4	@ B4
+	.endif
+	vmull.p8	t4q, \ad, \b3		@ I = A*B3
+	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
+	vmull.p8	t3q, \ad, \b4		@ K = A*B4
+	vand		t0h, t0h, k48
+	vand		t1h, t1h, k32
+	veor		t2q, t2q, t4q		@ N = I + J
+	veor		t0l, t0l, t0h
+	veor		t1l, t1l, t1h
+	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
+	vand		t2h, t2h, k16
+	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	t3h, #0
+	vext.8		t0q, t0q, t0q, #15
+	veor		t2l, t2l, t2h
+	vext.8		t1q, t1q, t1q, #14
+	vmull.p8	\rq, \ad, \bd		@ D = A*B
+	vext.8		t2q, t2q, t2q, #13
+	vext.8		t3q, t3q, t3q, #12
+	veor		t0q, t0q, t1q
+	veor		t2q, t2q, t3q
+	veor		\rq, \rq, t0q
+	veor		\rq, \rq, t2q
+	.endm
+
+	//
+	// PMULL (64x64->128) based reduction for CPUs that can do
+	// it in a single instruction.
+	//
+	.macro		__pmull_reduce_p64
+	vmull.p64	T1, XL_L, MASK
+
+	veor		XH_L, XH_L, XM_H
+	vext.8		T1, T1, T1, #8
+	veor		XL_H, XL_H, XM_L
+	veor		T1, T1, XL
+
+	vmull.p64	XL, T1_H, MASK
+	.endm
+
+	//
+	// Alternative reduction for CPUs that lack support for the
+	// 64x64->128 PMULL instruction
+	//
+	.macro		__pmull_reduce_p8
+	veor		XL_H, XL_H, XM_L
+	veor		XH_L, XH_L, XM_H
+
+	vshl.i64	T1, XL, #57
+	vshl.i64	T2, XL, #62
+	veor		T1, T1, T2
+	vshl.i64	T2, XL, #63
+	veor		T1, T1, T2
+	veor		XL_H, XL_H, T1_L
+	veor		XH_L, XH_L, T1_H
+
+	vshr.u64	T1, XL, #1
+	veor		XH, XH, XL
+	veor		XL, XL, T1
+	vshr.u64	T1, T1, #6
+	vshr.u64	XL, XL, #1
+	.endm
+
+	.macro		ghash_update, pn
 	vld1.64		{XL}, [r1]
-	vmov.i8		MASK, #0xe1
-	vext.8		SHASH2, SHASH, SHASH, #8
-	vshl.u64	MASK, MASK, #57
-	veor		SHASH2, SHASH2, SHASH
 
 	/* do the head block first, if supplied */
 	ldr		ip, [sp]
@@ -62,33 +184,59 @@ ENTRY(pmull_ghash_update)
 #ifndef CONFIG_CPU_BIG_ENDIAN
 	vrev64.8	T1, T1
 #endif
-	vext.8		T2, XL, XL, #8
 	vext.8		IN1, T1, T1, #8
-	veor		T1, T1, T2
+	veor		T1_L, T1_L, XL_H
 	veor		XL, XL, IN1
 
-	vmull.p64	XH, SHASH_H, XL_H		@ a1 * b1
+	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
 	veor		T1, T1, XL
-	vmull.p64	XL, SHASH_L, XL_L		@ a0 * b0
-	vmull.p64	XM, SHASH2_L, T1_L		@ (a1 + a0)(b1 + b0)
+	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
+	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
 
-	vext.8		T1, XL, XH, #8
-	veor		T2, XL, XH
+	veor		T1, XL, XH
 	veor		XM, XM, T1
-	veor		XM, XM, T2
-	vmull.p64	T2, XL_L, MASK_L
 
-	vmov		XH_L, XM_H
-	vmov		XM_H, XL_L
+	__pmull_reduce_\pn
 
-	veor		XL, XM, T2
-	vext.8		T2, XL, XL, #8
-	vmull.p64	XL, XL_L, MASK_L
-	veor		T2, T2, XH
-	veor		XL, XL, T2
+	veor		T1, T1, XH
+	veor		XL, XL, T1
 
 	bne		0b
 
 	vst1.64		{XL}, [r1]
 	bx		lr
-ENDPROC(pmull_ghash_update)
+	.endm
+
+	/*
+	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+	 *			   struct ghash_key const *k, const char *head)
+	 */
+ENTRY(pmull_ghash_update_p64)
+	vld1.64		{SHASH}, [r3]
+	veor		SHASH2_p64, SHASH_L, SHASH_H
+
+	vmov.i8		MASK, #0xe1
+	vshl.u64	MASK, MASK, #57
+
+	ghash_update	p64
+ENDPROC(pmull_ghash_update_p64)
+
+ENTRY(pmull_ghash_update_p8)
+	vld1.64		{SHASH}, [r3]
+	veor		SHASH2_p8, SHASH_L, SHASH_H
+
+	vext.8		s1l, SHASH_L, SHASH_L, #1
+	vext.8		s2l, SHASH_L, SHASH_L, #2
+	vext.8		s3l, SHASH_L, SHASH_L, #3
+	vext.8		s4l, SHASH_L, SHASH_L, #4
+	vext.8		s1h, SHASH_H, SHASH_H, #1
+	vext.8		s2h, SHASH_H, SHASH_H, #2
+	vext.8		s3h, SHASH_H, SHASH_H, #3
+	vext.8		s4h, SHASH_H, SHASH_H, #4
+
+	vmov.i64	k16, #0xffff
+	vmov.i64	k32, #0xffffffff
+	vmov.i64	k48, #0xffffffffffff
+
+	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)
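
Note how the ghash_update macro needs only three __pmull invocations per
block: XL = a0*b0, XH = a1*b1 and XM = (a1+a0)*(b1+b0), i.e. Karatsuba
multiplication, with XOR serving as both addition and subtraction in
GF(2)[x]. A C sketch of that combination step (an editor's illustration;
the bitwise clmul64 reference below is not the kernel's code):

#include <stdint.h>

/* Reference 64x64 -> 128 bit carry-less multiply (bitwise, slow). */
static void clmul64(uint64_t a, uint64_t b, uint64_t r[2])
{
	r[0] = r[1] = 0;

	for (int i = 0; i < 64; i++)
		if ((b >> i) & 1) {
			r[0] ^= a << i;
			if (i)	/* avoid the undefined a >> 64 */
				r[1] ^= a >> (64 - i);
		}
}

/* One 128x128 -> 256 bit carry-less multiply from three 64x64 ones,
 * mirroring the XL/XH/XM computation above: the middle term is
 * (a1+a0)(b1+b0) + a0*b0 + a1*b1, every addition being XOR. */
static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
			       uint64_t r[4])
{
	uint64_t lo[2], hi[2], mid[2];

	clmul64(a[0], b[0], lo);			/* XL: a0 * b0 */
	clmul64(a[1], b[1], hi);			/* XH: a1 * b1 */
	clmul64(a[0] ^ a[1], b[0] ^ b[1], mid);		/* XM; b0^b1 is SHASH2 */

	mid[0] ^= lo[0] ^ hi[0];			/* veor XM, XM, (XL^XH) */
	mid[1] ^= lo[1] ^ hi[1];

	r[0] = lo[0];
	r[1] = lo[1] ^ mid[0];
	r[2] = hi[0] ^ mid[1];
	r[3] = hi[1];
}

In the assembly the 256-bit product never materializes as such: the
__pmull_reduce_\pn step folds it straight back to 128 bits modulo the
GHASH polynomial.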

arch/arm/crypto/ghash-ce-glue.c  +21 −3

@@ -22,6 +22,7 @@
 MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("ghash");
 
 #define GHASH_BLOCK_SIZE	16
 #define GHASH_DIGEST_SIZE	16
@@ -41,8 +42,17 @@ struct ghash_async_ctx {
 	struct cryptd_ahash *cryptd_tfm;
 };
 
-asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-				   struct ghash_key const *k, const char *head);
+asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+				       struct ghash_key const *k,
+				       const char *head);
+
+asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+				      struct ghash_key const *k,
+				      const char *head);
+
+static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
+				  struct ghash_key const *k,
+				  const char *head);
 
 static int ghash_init(struct shash_desc *desc)
 {
@@ -312,6 +322,14 @@ static int __init ghash_ce_mod_init(void)
 {
 	int err;
 
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	if (elf_hwcap2 & HWCAP2_PMULL)
+		pmull_ghash_update = pmull_ghash_update_p64;
+	else
+		pmull_ghash_update = pmull_ghash_update_p8;
+
 	err = crypto_register_shash(&ghash_alg);
 	if (err)
 		return err;
@@ -332,5 +350,5 @@ static void __exit ghash_ce_mod_exit(void)
 	crypto_unregister_shash(&ghash_alg);
 }
 
-module_cpu_feature_match(PMULL, ghash_ce_mod_init);
+module_init(ghash_ce_mod_init);
 module_exit(ghash_ce_mod_exit);
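
With module_cpu_feature_match(PMULL, ...) gone, the module now loads on
any NEON-capable CPU and picks its entry point at init time. A
userspace sketch of the equivalent probe (an editor's illustration; the
hwcap constants are taken from the ARM uapi headers, with #ifndef
fallbacks in case the libc headers predate them):

#include <stdio.h>
#include <sys/auxv.h>

#ifndef AT_HWCAP2
#define AT_HWCAP2	26		/* from linux/auxvec.h */
#endif
#ifndef HWCAP_NEON
#define HWCAP_NEON	(1 << 12)	/* from arch/arm/include/uapi/asm/hwcap.h */
#endif
#ifndef HWCAP2_PMULL
#define HWCAP2_PMULL	(1 << 1)
#endif

int main(void)
{
	/* Same decision ghash_ce_mod_init() makes in the patch. */
	if (!(getauxval(AT_HWCAP) & HWCAP_NEON)) {
		puts("no NEON: the module would return -ENODEV");
		return 1;
	}
	if (getauxval(AT_HWCAP2) & HWCAP2_PMULL)
		puts("would bind pmull_ghash_update to pmull_ghash_update_p64");
	else
		puts("would bind pmull_ghash_update to pmull_ghash_update_p8");
	return 0;
}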