Loading arch/x86/crypto/sha1_avx2_x86_64_asm.S +36 −31 Original line number Diff line number Diff line Loading @@ -117,11 +117,10 @@ .set T1, REG_T1 .endm #define K_BASE %r8 #define HASH_PTR %r9 #define BLOCKS_CTR %r8 #define BUFFER_PTR %r10 #define BUFFER_PTR2 %r13 #define BUFFER_END %r11 #define PRECALC_BUF %r14 #define WK_BUF %r15 Loading Loading @@ -205,14 +204,14 @@ * blended AVX2 and ALU instruction scheduling * 1 vector iteration per 8 rounds */ vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP vmovdqu (i * 2)(BUFFER_PTR), W_TMP .elseif ((i & 7) == 1) vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ WY_TMP, WY_TMP .elseif ((i & 7) == 2) vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY .elseif ((i & 7) == 4) vpaddd K_XMM(K_BASE), WY, WY_TMP vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP .elseif ((i & 7) == 7) vmovdqu WY_TMP, PRECALC_WK(i&~7) Loading Loading @@ -255,7 +254,7 @@ vpxor WY, WY_TMP, WY_TMP .elseif ((i & 7) == 7) vpxor WY_TMP2, WY_TMP, WY vpaddd K_XMM(K_BASE), WY, WY_TMP vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY Loading Loading @@ -291,7 +290,7 @@ vpsrld $30, WY, WY vpor WY, WY_TMP, WY .elseif ((i & 7) == 7) vpaddd K_XMM(K_BASE), WY, WY_TMP vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY Loading Loading @@ -446,6 +445,16 @@ .endm /* Add constant only if (%2 > %3) condition met (uses RTA as temp) * %1 + %2 >= %3 ? %4 : 0 */ .macro ADD_IF_GE a, b, c, d mov \a, RTA add $\d, RTA cmp $\c, \b cmovge RTA, \a .endm /* * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining */ Loading @@ -463,13 +472,16 @@ lea (2*4*80+32)(%rsp), WK_BUF # Precalc WK for first 2 blocks PRECALC_OFFSET = 0 ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 .set i, 0 .rept 160 PRECALC i .set i, i + 1 .endr PRECALC_OFFSET = 128 /* Go to next block if needed */ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 xchg WK_BUF, PRECALC_BUF .align 32 Loading @@ -479,8 +491,8 @@ _loop: * we use K_BASE value as a signal of a last block, * it is set below by: cmovae BUFFER_PTR, K_BASE */ cmp K_BASE, BUFFER_PTR jne _begin test BLOCKS_CTR, BLOCKS_CTR jnz _begin .align 32 jmp _end .align 32 Loading Loading @@ -512,10 +524,10 @@ _loop0: .set j, j+2 .endr add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ /* Update Counter */ sub $1, BLOCKS_CTR /* Move to the next block only if needed*/ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 /* * rounds * 60,62,64,66,68 Loading @@ -532,8 +544,8 @@ _loop0: UPDATE_HASH 12(HASH_PTR), D UPDATE_HASH 16(HASH_PTR), E cmp K_BASE, BUFFER_PTR /* is current block the last one? */ je _loop test BLOCKS_CTR, BLOCKS_CTR jz _loop mov TB, B Loading Loading @@ -575,10 +587,10 @@ _loop2: .set j, j+2 .endr add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ /* update counter */ sub $1, BLOCKS_CTR /* Move to the next block only if needed*/ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 jmp _loop3 _loop3: Loading Loading @@ -641,19 +653,12 @@ _loop3: avx2_zeroupper lea K_XMM_AR(%rip), K_BASE /* Setup initial values */ mov CTX, HASH_PTR mov BUF, BUFFER_PTR lea 64(BUF), BUFFER_PTR2 shl $6, CNT /* mul by 64 */ add BUF, CNT add $64, CNT mov CNT, BUFFER_END cmp BUFFER_END, BUFFER_PTR2 cmovae K_BASE, BUFFER_PTR2 mov BUF, BUFFER_PTR2 mov CNT, BLOCKS_CTR xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP Loading arch/x86/crypto/sha1_ssse3_glue.c +1 −1 Original line number Diff line number Diff line Loading @@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, static bool avx2_usable(void) { if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI1) && boot_cpu_has(X86_FEATURE_BMI2)) return true; Loading crypto/chacha20_generic.c +7 −2 Original line number Diff line number Diff line Loading @@ -91,9 +91,14 @@ int crypto_chacha20_crypt(struct skcipher_request *req) crypto_chacha20_init(state, ctx, walk.iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, walk.stride); chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes); err = skcipher_walk_done(&walk, 0); nbytes); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; Loading crypto/testmgr.h +7 −0 Original line number Diff line number Diff line Loading @@ -32675,6 +32675,10 @@ static const struct cipher_testvec chacha20_enc_tv_template[] = { "\x5b\x86\x2f\x37\x30\xe3\x7c\xfd" "\xc4\xfd\x80\x6c\x22\xf2\x21", .rlen = 375, .also_non_np = 1, .np = 3, .tap = { 375 - 20, 4, 16 }, }, { /* RFC7539 A.2. Test Vector #3 */ .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a" "\xf3\x33\x88\x86\x04\xf6\xb5\xf0" Loading Loading @@ -33049,6 +33053,9 @@ static const struct cipher_testvec chacha20_enc_tv_template[] = { "\xa1\xed\xad\xd5\x76\xfa\x24\x8f" "\x98", .rlen = 1281, .also_non_np = 1, .np = 3, .tap = { 1200, 1, 80 }, }, }; drivers/crypto/inside-secure/safexcel_hash.c +4 −4 Original line number Diff line number Diff line Loading @@ -883,10 +883,7 @@ static int safexcel_hmac_sha1_setkey(struct crypto_ahash *tfm, const u8 *key, if (ret) return ret; memcpy(ctx->ipad, &istate.state, SHA1_DIGEST_SIZE); memcpy(ctx->opad, &ostate.state, SHA1_DIGEST_SIZE); for (i = 0; i < ARRAY_SIZE(istate.state); i++) { for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++) { if (ctx->ipad[i] != le32_to_cpu(istate.state[i]) || ctx->opad[i] != le32_to_cpu(ostate.state[i])) { ctx->base.needs_inv = true; Loading @@ -894,6 +891,9 @@ static int safexcel_hmac_sha1_setkey(struct crypto_ahash *tfm, const u8 *key, } } memcpy(ctx->ipad, &istate.state, SHA1_DIGEST_SIZE); memcpy(ctx->opad, &ostate.state, SHA1_DIGEST_SIZE); return 0; } Loading Loading
arch/x86/crypto/sha1_avx2_x86_64_asm.S +36 −31 Original line number Diff line number Diff line Loading @@ -117,11 +117,10 @@ .set T1, REG_T1 .endm #define K_BASE %r8 #define HASH_PTR %r9 #define BLOCKS_CTR %r8 #define BUFFER_PTR %r10 #define BUFFER_PTR2 %r13 #define BUFFER_END %r11 #define PRECALC_BUF %r14 #define WK_BUF %r15 Loading Loading @@ -205,14 +204,14 @@ * blended AVX2 and ALU instruction scheduling * 1 vector iteration per 8 rounds */ vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP vmovdqu (i * 2)(BUFFER_PTR), W_TMP .elseif ((i & 7) == 1) vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ WY_TMP, WY_TMP .elseif ((i & 7) == 2) vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY .elseif ((i & 7) == 4) vpaddd K_XMM(K_BASE), WY, WY_TMP vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP .elseif ((i & 7) == 7) vmovdqu WY_TMP, PRECALC_WK(i&~7) Loading Loading @@ -255,7 +254,7 @@ vpxor WY, WY_TMP, WY_TMP .elseif ((i & 7) == 7) vpxor WY_TMP2, WY_TMP, WY vpaddd K_XMM(K_BASE), WY, WY_TMP vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY Loading Loading @@ -291,7 +290,7 @@ vpsrld $30, WY, WY vpor WY, WY_TMP, WY .elseif ((i & 7) == 7) vpaddd K_XMM(K_BASE), WY, WY_TMP vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY Loading Loading @@ -446,6 +445,16 @@ .endm /* Add constant only if (%2 > %3) condition met (uses RTA as temp) * %1 + %2 >= %3 ? %4 : 0 */ .macro ADD_IF_GE a, b, c, d mov \a, RTA add $\d, RTA cmp $\c, \b cmovge RTA, \a .endm /* * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining */ Loading @@ -463,13 +472,16 @@ lea (2*4*80+32)(%rsp), WK_BUF # Precalc WK for first 2 blocks PRECALC_OFFSET = 0 ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 .set i, 0 .rept 160 PRECALC i .set i, i + 1 .endr PRECALC_OFFSET = 128 /* Go to next block if needed */ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 xchg WK_BUF, PRECALC_BUF .align 32 Loading @@ -479,8 +491,8 @@ _loop: * we use K_BASE value as a signal of a last block, * it is set below by: cmovae BUFFER_PTR, K_BASE */ cmp K_BASE, BUFFER_PTR jne _begin test BLOCKS_CTR, BLOCKS_CTR jnz _begin .align 32 jmp _end .align 32 Loading Loading @@ -512,10 +524,10 @@ _loop0: .set j, j+2 .endr add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ /* Update Counter */ sub $1, BLOCKS_CTR /* Move to the next block only if needed*/ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 /* * rounds * 60,62,64,66,68 Loading @@ -532,8 +544,8 @@ _loop0: UPDATE_HASH 12(HASH_PTR), D UPDATE_HASH 16(HASH_PTR), E cmp K_BASE, BUFFER_PTR /* is current block the last one? */ je _loop test BLOCKS_CTR, BLOCKS_CTR jz _loop mov TB, B Loading Loading @@ -575,10 +587,10 @@ _loop2: .set j, j+2 .endr add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ /* update counter */ sub $1, BLOCKS_CTR /* Move to the next block only if needed*/ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 jmp _loop3 _loop3: Loading Loading @@ -641,19 +653,12 @@ _loop3: avx2_zeroupper lea K_XMM_AR(%rip), K_BASE /* Setup initial values */ mov CTX, HASH_PTR mov BUF, BUFFER_PTR lea 64(BUF), BUFFER_PTR2 shl $6, CNT /* mul by 64 */ add BUF, CNT add $64, CNT mov CNT, BUFFER_END cmp BUFFER_END, BUFFER_PTR2 cmovae K_BASE, BUFFER_PTR2 mov BUF, BUFFER_PTR2 mov CNT, BLOCKS_CTR xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP Loading
arch/x86/crypto/sha1_ssse3_glue.c +1 −1 Original line number Diff line number Diff line Loading @@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, static bool avx2_usable(void) { if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI1) && boot_cpu_has(X86_FEATURE_BMI2)) return true; Loading
crypto/chacha20_generic.c +7 −2 Original line number Diff line number Diff line Loading @@ -91,9 +91,14 @@ int crypto_chacha20_crypt(struct skcipher_request *req) crypto_chacha20_init(state, ctx, walk.iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, walk.stride); chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes); err = skcipher_walk_done(&walk, 0); nbytes); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; Loading
crypto/testmgr.h +7 −0 Original line number Diff line number Diff line Loading @@ -32675,6 +32675,10 @@ static const struct cipher_testvec chacha20_enc_tv_template[] = { "\x5b\x86\x2f\x37\x30\xe3\x7c\xfd" "\xc4\xfd\x80\x6c\x22\xf2\x21", .rlen = 375, .also_non_np = 1, .np = 3, .tap = { 375 - 20, 4, 16 }, }, { /* RFC7539 A.2. Test Vector #3 */ .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a" "\xf3\x33\x88\x86\x04\xf6\xb5\xf0" Loading Loading @@ -33049,6 +33053,9 @@ static const struct cipher_testvec chacha20_enc_tv_template[] = { "\xa1\xed\xad\xd5\x76\xfa\x24\x8f" "\x98", .rlen = 1281, .also_non_np = 1, .np = 3, .tap = { 1200, 1, 80 }, }, };
drivers/crypto/inside-secure/safexcel_hash.c +4 −4 Original line number Diff line number Diff line Loading @@ -883,10 +883,7 @@ static int safexcel_hmac_sha1_setkey(struct crypto_ahash *tfm, const u8 *key, if (ret) return ret; memcpy(ctx->ipad, &istate.state, SHA1_DIGEST_SIZE); memcpy(ctx->opad, &ostate.state, SHA1_DIGEST_SIZE); for (i = 0; i < ARRAY_SIZE(istate.state); i++) { for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++) { if (ctx->ipad[i] != le32_to_cpu(istate.state[i]) || ctx->opad[i] != le32_to_cpu(ostate.state[i])) { ctx->base.needs_inv = true; Loading @@ -894,6 +891,9 @@ static int safexcel_hmac_sha1_setkey(struct crypto_ahash *tfm, const u8 *key, } } memcpy(ctx->ipad, &istate.state, SHA1_DIGEST_SIZE); memcpy(ctx->opad, &ostate.state, SHA1_DIGEST_SIZE); return 0; } Loading