Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e31ac32d authored by Timothy McCaffrey's avatar Timothy McCaffrey Committed by Herbert Xu
Browse files

crypto: aesni - Add support for 192 & 256 bit keys to AESNI RFC4106

These patches fix the RFC4106 implementation in the aesni-intel
module so it supports 192 & 256 bit keys.

Since the AVX support that was added to this module also only
supports 128 bit keys, and this patch only affects the SSE
implementation, changes were also made to use the SSE version
if key sizes other than 128 are specified.

RFC4106 specifies that 192 & 256 bit keys must be supported (section
8.4).

Also, this should fix Strongswan issue 341 where the aesni module
needs to be unloaded if 256 bit keys are used:

http://wiki.strongswan.org/issues/341



This patch has been tested with Sandy Bridge and Haswell processors.
With 128 bit keys and input buffers > 512 bytes a slight performance
degradation was noticed (~1%).  For input buffers of less than 512
bytes there was no performance impact.  Compared to 128 bit keys,
256 bit key size performance is approx. .5 cycles per byte slower
on Sandy Bridge, and .37 cycles per byte slower on Haswell (vs.
SSE code).

This patch has also been tested with StrongSwan IPSec connections
where it worked correctly.

I created this diff from a git clone of crypto-2.6.git.

Any questions, please feel free to contact me.

Signed-off-by: default avatarTimothy McCaffrey <timothy.mccaffrey@unisys.com>
Signed-off-by: default avatarJarod Wilson <jarod@redhat.com>
Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent d8219f52
Loading
Loading
Loading
Loading
+177 −166
Original line number Diff line number Diff line
@@ -32,12 +32,23 @@
#include <linux/linkage.h>
#include <asm/inst.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can done for either FP or integer values, for FP use
 * movaps (move aligned packed single) or integer use movdqa (move double quad
 * aligned).  It doesn't make a performance difference which instruction is used
 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
 * shorter, so that is the one we'll use for now. (same for unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

.data
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087

POLY:   .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

@@ -89,6 +100,7 @@ enc: .octa 0x2
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)
#endif


@@ -213,10 +225,12 @@ enc: .octa 0x2

.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        MOVADQ     SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r12           # %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i

_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
@@ -225,16 +239,18 @@ _get_AAD_loop\num_initial_blocks\operation:
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation

	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation

	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation

_get_AAD_loop2_done\num_initial_blocks\operation:
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data

	xor	   %r11, %r11 # initialise the data pointer offset as zero
@@ -243,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation:

	mov	   %arg5, %rax                      # %rax = *Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM   %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		(%arg1),\TMP2
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0                 # INCR Y0
	paddd	   \TMP1, \XMM0                 # INCR Y0
	movdqa	   \XMM0, %xmm\index
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap

.endr
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
	movaps 0x10(%rdi), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 1
.endr
.irpc index, \i_seq
	movaps 0x20(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
	pxor	   \TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax			      # 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_dec\num_initial_blocks

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	movaps 0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index         # Round 10
	AESENCLAST \TMP1, %xmm\index         # Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
@@ -305,9 +296,7 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
	add	   $16, %r11

	movdqa     \TMP1, %xmm\index
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM	   %xmm14, %xmm\index

                # prepare plaintext/ciphertext for GHASH computation
.endr
.endif
@@ -338,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM1
        movdqa     SHUF_MASK(%rip), %xmm14
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM2
        movdqa     SHUF_MASK(%rip), %xmm14
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM3
        movdqa     SHUF_MASK(%rip), %xmm14
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM4
        movdqa     SHUF_MASK(%rip), %xmm14
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	pxor	   16*0(%arg1), \XMM1
	pxor	   16*0(%arg1), \XMM2
	pxor	   16*0(%arg1), \XMM3
	pxor	   16*0(%arg1), \XMM4
	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
@@ -399,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	movaps 0xa0(%arg1), \TMP2
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
@@ -421,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM4
	add	   $64, %r11
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:
@@ -451,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation:

.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        MOVADQ     SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r12           # %r12 = aadLen
	mov	   %r12, %r11
@@ -472,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation:
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data

	xor	   %r11, %r11 # initialise the data pointer offset as zero
@@ -481,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation:

	mov	   %arg5, %rax                      # %rax = *Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM   %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0                 # INCR Y0
	movdqa	   \XMM0, %xmm\index
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap

.endr
	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		0(%arg1),\TMP2
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
	movaps 0x10(%rdi), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 1
.endr
.irpc index, \i_seq
	movaps 0x20(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
	paddd		\TMP1, \XMM0                 # INCR Y0
	MOVADQ		\XMM0, %xmm\index
	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
	pxor		\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax			      # 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_enc\num_initial_blocks

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	movaps 0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index         # Round 10
	AESENCLAST \TMP1, %xmm\index         # Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
@@ -541,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM	   %xmm14, %xmm\index

		# prepare plaintext/ciphertext for GHASH computation
@@ -575,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM1
        movdqa     SHUF_MASK(%rip), %xmm14
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM2
        movdqa     SHUF_MASK(%rip), %xmm14
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM3
        movdqa     SHUF_MASK(%rip), %xmm14
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM4
        movdqa     SHUF_MASK(%rip), %xmm14
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	pxor	   16*0(%arg1), \XMM1
	pxor	   16*0(%arg1), \XMM2
	pxor	   16*0(%arg1), \XMM3
	pxor	   16*0(%arg1), \XMM4
	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
@@ -636,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	movaps 0xa0(%arg1), \TMP2
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
@@ -655,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
        movdqa     SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:
@@ -794,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	movaps 0xa0(%arg1), \TMP3
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
@@ -986,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	movaps 0xa0(%arg1), \TMP3
	AESENCLAST \TMP3, \XMM1           # Round 10
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax		        # 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

aes_loop_par_dec:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
@@ -1155,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
.endm

/* Encryption of a single block done*/

/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0
        movaps 16(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 32(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 48(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 64(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 80(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 96(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 112(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 128(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 144(%arg1), \TMP1
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10	  # get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
        movaps 160(%arg1), \TMP1
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm


/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
+28 −6
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@
#include <asm/crypto/glue_helper.h>
#endif


/* This data is stored at the end of the crypto_tfm struct.
 * It's a type of per "session" data storage location.
 * This needs to be 16 byte aligned.
@@ -182,7 +183,8 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out,
			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
			u8 *auth_tag, unsigned long auth_tag_len)
{
	if (plaintext_len < AVX_GEN2_OPTSIZE) {
        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
	if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
		aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
				aad_len, auth_tag, auth_tag_len);
	} else {
@@ -197,7 +199,8 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *out,
			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
			u8 *auth_tag, unsigned long auth_tag_len)
{
	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
	if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
		aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
				aad_len, auth_tag, auth_tag_len);
	} else {
@@ -231,7 +234,8 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
			u8 *auth_tag, unsigned long auth_tag_len)
{
	if (plaintext_len < AVX_GEN2_OPTSIZE) {
       struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
	if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
		aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
				aad_len, auth_tag, auth_tag_len);
	} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
@@ -250,7 +254,8 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
			u8 *auth_tag, unsigned long auth_tag_len)
{
	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
       struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
	if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
		aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
				aad, aad_len, auth_tag, auth_tag_len);
	} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
@@ -902,7 +907,8 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
	}
	/*Account for 4 byte nonce at the end.*/
	key_len -= 4;
	if (key_len != AES_KEYSIZE_128) {
	if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
	    key_len != AES_KEYSIZE_256) {
		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
		return -EINVAL;
	}
@@ -1013,6 +1019,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
	__be32 counter = cpu_to_be32(1);
	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
	u32 key_len = ctx->aes_key_expanded.key_length;
	void *aes_ctx = &(ctx->aes_key_expanded);
	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
	u8 iv_tab[16+AESNI_ALIGN];
@@ -1027,6 +1034,13 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
	/* to 8 or 12 bytes */
	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
		return -EINVAL;
	if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
	        return -EINVAL;
	if (unlikely(key_len != AES_KEYSIZE_128 &&
	             key_len != AES_KEYSIZE_192 &&
	             key_len != AES_KEYSIZE_256))
	        return -EINVAL;

	/* IV below built */
	for (i = 0; i < 4; i++)
		*(iv+i) = ctx->nonce[i];
@@ -1091,6 +1105,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
	int retval = 0;
	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
	u32 key_len = ctx->aes_key_expanded.key_length;
	void *aes_ctx = &(ctx->aes_key_expanded);
	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
	u8 iv_and_authTag[32+AESNI_ALIGN];
@@ -1104,6 +1119,13 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
	if (unlikely((req->cryptlen < auth_tag_len) ||
		(req->assoclen != 8 && req->assoclen != 12)))
		return -EINVAL;
	if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
	        return -EINVAL;
	if (unlikely(key_len != AES_KEYSIZE_128 &&
	             key_len != AES_KEYSIZE_192 &&
	             key_len != AES_KEYSIZE_256))
	        return -EINVAL;

	/* Assuming we are supporting rfc4106 64-bit extended */
	/* sequence numbers We need to have the AAD length */
	/* equal to 8 or 12 bytes */