
Commit 0487ccac authored by Sabrina Dubroca, committed by Herbert Xu

crypto: aesni - make non-AVX AES-GCM work with any aadlen



This is the first step toward making the aesni AES-GCM implementation
generic. The current code was written for rfc4106, so it handles only
the few specific sizes of associated data that rfc4106 supplies.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent f4857f4c
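
The old _get_AAD_loop (removed below) pulls the AAD into a single XMM register
four bytes at a time and then realigns it, which only copes with the handful of
4-byte-multiple AAD lengths that rfc4106 supplies. The new code folds every
whole 16-byte AAD block straight into the GHASH state with one GHASH_MUL per
block and handles the remainder separately, so any aadLen works. A minimal C
model of the whole-block part (illustrative only; gf128_mul, load_be128 and
ghash_aad_blocks are names invented for this sketch, not kernel APIs):

	#include <stdint.h>
	#include <stdio.h>

	struct be128 { uint64_t hi, lo; };  /* 128-bit value, big-endian halves */

	/* Load 16 bytes most-significant-byte first; this plays the role of
	 * the PSHUFB %xmm14 byte-reflection in the assembly. */
	static struct be128 load_be128(const uint8_t b[16])
	{
		struct be128 x = { 0, 0 };
		int i;

		for (i = 0; i < 8; i++) {
			x.hi = (x.hi << 8) | b[i];
			x.lo = (x.lo << 8) | b[8 + i];
		}
		return x;
	}

	/* GF(2^128) multiply as in NIST SP 800-38D, bit by bit (slow but clear). */
	static struct be128 gf128_mul(struct be128 x, struct be128 y)
	{
		struct be128 z = { 0, 0 }, v = x;
		int i;

		for (i = 0; i < 128; i++) {
			uint64_t bit = (i < 64) ? (y.hi >> (63 - i)) & 1
						: (y.lo >> (127 - i)) & 1;
			uint64_t lsb = v.lo & 1;

			if (bit) {
				z.hi ^= v.hi;
				z.lo ^= v.lo;
			}
			v.lo = (v.lo >> 1) | (v.hi << 63);
			v.hi >>= 1;
			if (lsb)
				v.hi ^= 0xe100000000000000ULL;  /* reduction poly */
		}
		return z;
	}

	/* Fold whole 16-byte AAD blocks into the hash; the remainder
	 * (*len < 16 afterwards) is what the rest8/rest4/rest0 code handles. */
	static struct be128 ghash_aad_blocks(struct be128 h, const uint8_t *aad,
					     size_t *len)
	{
		struct be128 x = { 0, 0 };

		while (*len >= 16) {
			struct be128 blk = load_be128(aad);

			x.hi ^= blk.hi;
			x.lo ^= blk.lo;
			x = gf128_mul(x, h);    /* the per-block GHASH_MUL */
			aad += 16;
			*len -= 16;
		}
		return x;
	}

	int main(void)
	{
		uint8_t h[16] = { 0x42 }, aad[40];
		size_t len = sizeof(aad);
		struct be128 x;
		int i;

		for (i = 0; i < (int)sizeof(aad); i++)
			aad[i] = (uint8_t)i;
		x = ghash_aad_blocks(load_be128(h), aad, &len);
		printf("hash %016llx%016llx, tail left for rest8/rest4: %zu bytes\n",
		       (unsigned long long)x.hi, (unsigned long long)x.lo, len);
		return 0;
	}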
+132 −37
@@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
             .octa 0x00000000000000000000000000000000
 
+.section .rodata
+.align 16
+.type aad_shift_arr, @object
+.size aad_shift_arr, 272
+aad_shift_arr:
+        .octa     0xffffffffffffffffffffffffffffffff
+        .octa     0xffffffffffffffffffffffffffffff0C
+        .octa     0xffffffffffffffffffffffffffff0D0C
+        .octa     0xffffffffffffffffffffffffff0E0D0C
+        .octa     0xffffffffffffffffffffffff0F0E0D0C
+        .octa     0xffffffffffffffffffffff0C0B0A0908
+        .octa     0xffffffffffffffffffff0D0C0B0A0908
+        .octa     0xffffffffffffffff0E0D0C0B0A0908
+        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
+        .octa     0xffffffffffffff0C0B0A090807060504
+        .octa     0xffffffffffff0D0C0B0A090807060504
+        .octa     0xffffffffff0E0D0C0B0A090807060504
+        .octa     0xffffffff0F0E0D0C0B0A090807060504
+        .octa     0xffffff0C0B0A09080706050403020100
+        .octa     0xffff0D0C0B0A09080706050403020100
+        .octa     0xff0E0D0C0B0A09080706050403020100
+        .octa     0x0F0E0D0C0B0A09080706050403020100
+
 
 .text
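
Each of the 17 entries above is a PSHUFB control mask. The table is indexed by
16 * (aadLen mod 16) (the salq $4 before the movdqu below): lanes whose mask
byte has the top bit set (0xff) produce zero, the remaining lanes select a
source byte. Entry k therefore left-aligns the k valid tail bytes that the
rest8/rest4 reads leave near the top of the register and clears everything
else. A small C illustration of entry 5 (pshufb emulated; the register
contents are invented for the demo):

	#include <stdint.h>
	#include <stdio.h>

	/* pshufb semantics: dst[i] = 0 if mask[i] has its top bit set,
	 * otherwise src[mask[i] & 0x0f]. */
	static void pshufb(uint8_t dst[16], const uint8_t src[16],
			   const uint8_t mask[16])
	{
		int i;

		for (i = 0; i < 16; i++)
			dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
	}

	int main(void)
	{
		/* entry 5, .octa 0xffffffffffffffffffffff0C0B0A0908, stored
		 * little-endian, so the low mask bytes are 08 09 0a 0b 0c */
		uint8_t mask5[16] = {
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0xff, 0xff, 0xff,
			0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		};
		/* for a 5-byte tail the rest8 path reads 8 bytes: the valid
		 * bytes land at positions 8..12, 13..15 hold over-read junk */
		uint8_t reg[16] = {
			0, 0, 0, 0, 0, 0, 0, 0,
			'A', 'B', 'C', 'D', 'E', '?', '?', '?',
		};
		uint8_t out[16];
		int i;

		pshufb(out, reg, mask5);
		for (i = 0; i < 16; i++)
			putchar(out[i] ? out[i] : '.');
		putchar('\n');          /* prints ABCDE........... */
		return 0;
	}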

@@ -252,31 +275,65 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 	mov	   arg8, %r12           # %r12 = aadLen
 	mov	   %r12, %r11
 	pxor	   %xmm\i, %xmm\i
+	pxor       \XMM2, \XMM2
+
-_get_AAD_loop\num_initial_blocks\operation:
-	movd	   (%r10), \TMP1
-	pslldq	   $12, \TMP1
-	psrldq	   $4, %xmm\i
-	pxor	   \TMP1, %xmm\i
-	add	   $4, %r10
-	sub	   $4, %r12
-	jne	   _get_AAD_loop\num_initial_blocks\operation
-
 	cmp	   $16, %r11
-	je	   _get_AAD_loop2_done\num_initial_blocks\operation
+	jl	   _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+	movdqu	   (%r10), %xmm\i
+	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+	pxor	   %xmm\i, \XMM2
+	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+	add	   $16, %r10
+	sub	   $16, %r12
+	sub	   $16, %r11
+	cmp	   $16, %r11
+	jge	   _get_AAD_blocks\num_initial_blocks\operation
 
-	mov	   $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
-	psrldq	   $4, %xmm\i
-	sub	   $4, %r12
-	cmp	   %r11, %r12
-	jne	   _get_AAD_loop2\num_initial_blocks\operation
+	movdqu	   \XMM2, %xmm\i
+	cmp	   $0, %r11
+	je	   _get_AAD_done\num_initial_blocks\operation
 
+	pxor	   %xmm\i,%xmm\i
+
-_get_AAD_loop2_done\num_initial_blocks\operation:
+	/* read the last <16B of AAD. since we have at least 4B of
+	data right after the AAD (the ICV, and maybe some CT), we can
+	read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+	cmp	   $4, %r11
+	jle	   _get_AAD_rest4\num_initial_blocks\operation
+	movq	   (%r10), \TMP1
+	add	   $8, %r10
+	sub	   $8, %r11
+	pslldq	   $8, \TMP1
+	psrldq	   $8, %xmm\i
+	pxor	   \TMP1, %xmm\i
+	jmp	   _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+	cmp	   $0, %r11
+	jle	   _get_AAD_rest0\num_initial_blocks\operation
+	mov	   (%r10), %eax
+	movq	   %rax, \TMP1
+	add	   $4, %r10
+	sub	   $4, %r10
+	pslldq	   $12, \TMP1
+	psrldq	   $4, %xmm\i
+	pxor	   \TMP1, %xmm\i
+_get_AAD_rest0\num_initial_blocks\operation:
+	/* finalize: shift out the extra bytes we read, and align
+	left. since pslldq can only shift by an immediate, we use
+	vpshufb and an array of shuffle masks */
+	movq	   %r12, %r11
+	salq	   $4, %r11
+	movdqu	   aad_shift_arr(%r11), \TMP1
+	PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
 	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
+	pxor	   \XMM2, %xmm\i
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 
+_get_AAD_done\num_initial_blocks\operation:
 	xor	   %r11, %r11 # initialise the data pointer offset as zero
 
 	# start AES for num_initial_blocks blocks
 
 	mov	   %arg5, %rax                      # %rax = *Y0
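
The tail logic in this hunk can be modelled in C to check what it computes (a
sketch; get_aad_tail and the test harness are invented for illustration, and
it relies on the same guarantee the comment above states: at least 4 readable
bytes follow the AAD). It verifies that the register juggling plus the final
left-align produces exactly the zero-padded AAD tail for every remainder:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static void get_aad_tail(const uint8_t *p, int rem, uint8_t out[16])
	{
		uint8_t reg[16] = { 0 };
		int r = rem, total = 0;

		while (r > 4) {                 /* _get_AAD_rest8: 8-byte reads */
			memmove(reg, reg + 8, 8);   /* psrldq $8, %xmm\i        */
			memcpy(reg + 8, p, 8);      /* pslldq $8, TMP1; pxor    */
			p += 8;
			r -= 8;
			total += 8;
		}
		if (r > 0) {                    /* _get_AAD_rest4: one 4B read  */
			memmove(reg, reg + 4, 12);  /* psrldq $4, %xmm\i        */
			memcpy(reg + 12, p, 4);     /* pslldq $12, TMP1; pxor   */
			total += 4;
		}
		/* _get_AAD_rest0: aad_shift_arr[rem] left-aligns the rem valid
		 * bytes, which sit at offset 16 - total, and zeroes the rest. */
		memset(out, 0, 16);
		memcpy(out, reg + (16 - total), rem);
	}

	int main(void)
	{
		uint8_t buf[32], want[16], got[16];
		int i, rem;

		for (i = 0; i < 32; i++)
			buf[i] = (uint8_t)(i + 1);  /* AAD tail + trailing data */
		for (rem = 1; rem <= 15; rem++) {
			get_aad_tail(buf, rem, got);
			memset(want, 0, 16);
			memcpy(want, buf, rem);     /* expected zero-padded tail */
			assert(memcmp(got, want, 16) == 0);
		}
		puts("tail model matches the zero-padded AAD for rem = 1..15");
		return 0;
	}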
@@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks:
                 # prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
-	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
         # apply GHASH on num_initial_blocks blocks
 
 .if \i == 5
@@ -477,27 +534,65 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 	mov	   arg8, %r12           # %r12 = aadLen
 	mov	   %r12, %r11
 	pxor	   %xmm\i, %xmm\i
-_get_AAD_loop\num_initial_blocks\operation:
-	movd	   (%r10), \TMP1
-	pslldq	   $12, \TMP1
-	psrldq	   $4, %xmm\i
+	pxor	   \XMM2, \XMM2
+
+	cmp	   $16, %r11
+	jl	   _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+	movdqu	   (%r10), %xmm\i
+	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
+	pxor	   %xmm\i, \XMM2
+	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+	add	   $16, %r10
+	sub	   $16, %r12
+	sub	   $16, %r11
+	cmp	   $16, %r11
+	jge	   _get_AAD_blocks\num_initial_blocks\operation
+
+	movdqu	   \XMM2, %xmm\i
+	cmp	   $0, %r11
+	je	   _get_AAD_done\num_initial_blocks\operation
+
+	pxor	   %xmm\i,%xmm\i
+
+	/* read the last <16B of AAD. since we have at least 4B of
+	data right after the AAD (the ICV, and maybe some PT), we can
+	read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+	cmp	   $4, %r11
+	jle	   _get_AAD_rest4\num_initial_blocks\operation
+	movq	   (%r10), \TMP1
+	add	   $8, %r10
+	sub	   $8, %r11
+	pslldq	   $8, \TMP1
+	psrldq	   $8, %xmm\i
 	pxor	   \TMP1, %xmm\i
+	jmp	   _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+	cmp	   $0, %r11
+	jle	   _get_AAD_rest0\num_initial_blocks\operation
+	mov	   (%r10), %eax
+	movq	   %rax, \TMP1
 	add	   $4, %r10
-	sub	   $4, %r12
-	jne	   _get_AAD_loop\num_initial_blocks\operation
-	cmp	   $16, %r11
-	je	   _get_AAD_loop2_done\num_initial_blocks\operation
-	mov	   $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
+	sub	   $4, %r10
+	pslldq	   $12, \TMP1
 	psrldq	   $4, %xmm\i
-	sub	   $4, %r12
-	cmp	   %r11, %r12
-	jne	   _get_AAD_loop2\num_initial_blocks\operation
-_get_AAD_loop2_done\num_initial_blocks\operation:
+	pxor	   \TMP1, %xmm\i
+_get_AAD_rest0\num_initial_blocks\operation:
+	/* finalize: shift out the extra bytes we read, and align
+	left. since pslldq can only shift by an immediate, we use
+	vpshufb and an array of shuffle masks */
+	movq	   %r12, %r11
+	salq	   $4, %r11
+	movdqu	   aad_shift_arr(%r11), \TMP1
+	PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
 	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
+	pxor	   \XMM2, %xmm\i
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 
+_get_AAD_done\num_initial_blocks\operation:
 	xor	   %r11, %r11 # initialise the data pointer offset as zero
 
 	# start AES for num_initial_blocks blocks
 
 	mov	   %arg5, %rax                      # %rax = *Y0
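
Both copies of the comment justify the over-read by the ICV (and possibly some
plaintext or ciphertext) that sits right after the AAD in memory. A quick
illustrative check that the rest8/rest4 read pattern never runs more than 3
bytes past the AAD, so the guaranteed 4 readable bytes are enough (bytes_read
is a model of the loop, not kernel code):

	#include <assert.h>
	#include <stdio.h>

	static int bytes_read(int rem)
	{
		int r = rem, total = 0;

		while (r > 4) {         /* 8-byte reads in _get_AAD_rest8 */
			total += 8;
			r -= 8;
		}
		if (r > 0)              /* final 4-byte read in _get_AAD_rest4 */
			total += 4;
		return total;
	}

	int main(void)
	{
		int rem, worst = 0;

		for (rem = 1; rem <= 15; rem++) {
			int over = bytes_read(rem) - rem;

			if (over > worst)
				worst = over;
		}
		printf("worst-case over-read: %d byte(s)\n", worst);    /* 3 */
		assert(worst < 4);
		return 0;
	}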
@@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks:
 		# prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
-	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
         # apply GHASH on num_initial_blocks blocks
 
 .if \i == 5