Skip to content

Commit

Permalink
Signed-off-by: Nicola Torracca <shark@bitchx.it>
Browse files Browse the repository at this point in the history
Use embedded broadcast for constants & VPTERNLOG for 3-way bitwise
logical operations.
  • Loading branch information
Shark64 authored and Shark committed Apr 11, 2024
1 parent f80afdf commit aeb299b
Show file tree
Hide file tree
Showing 10 changed files with 552 additions and 1,252 deletions.
16 changes: 8 additions & 8 deletions mh_sha1/mh_sha1_block_avx2.asm
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ func(mh_sha1_block_avx2)
;;
;; perform 0-79 steps
;;
vpbroadcastq K, [K00_19]
vpbroadcastd K, [K00_19]
;; do rounds 0...15
%assign I 0
%rep 16
Expand All @@ -426,14 +426,14 @@ func(mh_sha1_block_avx2)
PREFETCH_X [mh_in_p + pref+128*0]
PREFETCH_X [mh_in_p + pref+128*1]
;; do rounds 20...39
vpbroadcastq K, [K20_39]
vpbroadcastd K, [K20_39]
%rep 20
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
ROTATE_ARGS
%assign I (I+1)
%endrep
;; do rounds 40...59
vpbroadcastq K, [K40_59]
vpbroadcastd K, [K40_59]
%rep 20
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
ROTATE_ARGS
Expand All @@ -442,7 +442,7 @@ func(mh_sha1_block_avx2)
PREFETCH_X [mh_in_p + pref+128*2]
PREFETCH_X [mh_in_p + pref+128*3]
;; do rounds 60...79
vpbroadcastq K, [K60_79]
vpbroadcastd K, [K60_79]
%rep 20
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
ROTATE_ARGS
Expand Down Expand Up @@ -502,7 +502,7 @@ section .rodata align=32

align 32
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19: dq 0x5A8279995A827999
K20_39: dq 0x6ED9EBA16ED9EBA1
K40_59: dq 0x8F1BBCDC8F1BBCDC
K60_79: dq 0xCA62C1D6CA62C1D6
K00_19: dq 0x5A827999
K20_39: dq 0x6ED9EBA1
K40_59: dq 0x8F1BBCDC
K60_79: dq 0xCA62C1D6
57 changes: 10 additions & 47 deletions mh_sha1/mh_sha1_block_avx512.asm
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ func(mh_sha1_block_avx512)
; save rsp
mov RSP_SAVE, rsp

cmp loops, 0
test loops, loops
jle .return

; align rsp to 64 Bytes needed by avx512
Expand All @@ -271,7 +271,7 @@ func(mh_sha1_block_avx512)
VMOVPS HH3, [mh_digests_p + 64*3]
VMOVPS HH4, [mh_digests_p + 64*4]
;a mask used to transform to big-endian data
vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
vbroadcasti32x4 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]

.block_loop:
;transform to big-endian data and store on aligned_frame
Expand All @@ -293,7 +293,7 @@ func(mh_sha1_block_avx512)
vmovdqa64 D, HH3
vmovdqa64 E, HH4

vmovdqa32 KT, [K00_19]
vpbroadcastd KT, [K00_19]
%assign I 0xCA
%assign J 0
%assign K 2
Expand All @@ -306,13 +306,13 @@ func(mh_sha1_block_avx512)
MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
%endif
%if N = 19
vmovdqa32 KT, [K20_39]
vpbroadcastd KT, [K20_39]
%assign I 0x96
%elif N = 39
vmovdqa32 KT, [K40_59]
vpbroadcastd KT, [K40_59]
%assign I 0xE8
%elif N = 59
vmovdqa32 KT, [K60_79]
vpbroadcastd KT, [K60_79]
%assign I 0x96
%endif
%if N % 10 = 9
Expand Down Expand Up @@ -355,48 +355,11 @@ section .data align=64
align 64
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b

K00_19: dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999

K20_39: dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1

K40_59: dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC

K60_79: dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
K00_19: dq 0x5A827999
K20_39: dq 0x6ED9EBA1
K40_59: dq 0x8F1BBCDC
K60_79: dq 0xCA62C1D6

%else
%ifidn __OUTPUT_FORMAT__, win64
Expand Down
27 changes: 11 additions & 16 deletions mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
; save rsp
mov RSP_SAVE, rsp

cmp loops, 0
test loops, loops
jle .return

; leave enough space to store segs_digests
Expand Down Expand Up @@ -507,7 +507,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)

.block_loop:
;transform to big-endian data and store on aligned_frame
vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
vbroadcasti128 F, [PSHUFFLE_BYTE_FLIP_MASK]
;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
%assign I 0
%rep 16
Expand All @@ -521,7 +521,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
%assign I (I+1)
%endrep

mov mh_segs, 0 ;start from the first 8 segments
xor mh_segs, mh_segs ;start from the first 8 segments
mov pref, 1024 ;avoid prefetch repeatedly
.segs_loop:
;; Initialize digests
Expand All @@ -539,7 +539,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
;;
;; perform 0-79 steps
;;
vmovdqa K, [K00_19]
vpbroadcastd K, [K00_19]
;; do rounds 0...15
%assign I 0
%rep 16
Expand All @@ -560,15 +560,15 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
PREFETCH_X [mh_in_p + pref+128*0]
PREFETCH_X [mh_in_p + pref+128*1]
;; do rounds 20...39
vmovdqa K, [K20_39]
vpbroadcastd K, [K20_39]
%rep 20
%assign J (I % 2)
SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
ROTATE_ARGS
%assign I (I+1)
%endrep
;; do rounds 40...59
vmovdqa K, [K40_59]
vpbroadcastd K, [K40_59]
%rep 20
%assign J (I % 2)
SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
Expand All @@ -578,7 +578,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
PREFETCH_X [mh_in_p + pref+128*2]
PREFETCH_X [mh_in_p + pref+128*3]
;; do rounds 60...79
vmovdqa K, [K60_79]
vpbroadcastd K, [K60_79]
%rep 20
%assign J (I % 2)
SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
Expand Down Expand Up @@ -642,12 +642,7 @@ section .data align=32

align 32
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
K00_19: dq 0x5A827999
K20_39: dq 0x6ED9EBA1
K40_59: dq 0x8F1BBCDC
K60_79: dq 0xCA62C1D6
57 changes: 10 additions & 47 deletions mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ func(mh_sha1_murmur3_x64_128_block_avx512)
; save rsp
mov RSP_SAVE, rsp

cmp loops, 0
test loops, loops
jle .return

; align rsp to 64 Bytes needed by avx512
Expand All @@ -354,7 +354,7 @@ func(mh_sha1_murmur3_x64_128_block_avx512)
VMOVPS HH3, [mh_digests_p + 64*3]
VMOVPS HH4, [mh_digests_p + 64*4]
;a mask used to transform to big-endian data
vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
vbroadcasti32x4 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]

;init murmur variables
mov mur_in_p, mh_in_p ;different steps between murmur and mh_sha1
Expand Down Expand Up @@ -384,7 +384,7 @@ func(mh_sha1_murmur3_x64_128_block_avx512)
vmovdqa64 D, HH3
vmovdqa64 E, HH4

vmovdqa32 KT, [K00_19]
vpbroadcastd KT, [K00_19]
%assign I 0xCA
%assign J 0
%assign K 2
Expand All @@ -399,13 +399,13 @@ func(mh_sha1_murmur3_x64_128_block_avx512)
PROCESS_LOOP APPEND(W,J), I
%endif
%if N = 19
vmovdqa32 KT, [K20_39]
vpbroadcastd KT, [K20_39]
%assign I 0x96
%elif N = 39
vmovdqa32 KT, [K40_59]
vpbroadcastd KT, [K40_59]
%assign I 0xE8
%elif N = 59
vmovdqa32 KT, [K60_79]
vpbroadcastd KT, [K60_79]
%assign I 0x96
%endif
%if N % 20 = 19
Expand Down Expand Up @@ -453,48 +453,11 @@ section .data align=64
align 64
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b

K00_19: dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999

K20_39: dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1

K40_59: dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC

K60_79: dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
K00_19: dq 0x5A827999
K20_39: dq 0x6ED9EBA1
K40_59: dq 0x8F1BBCDC
K60_79: dq 0xCA62C1D6

%else
%ifidn __OUTPUT_FORMAT__, win64
Expand Down
Loading

0 comments on commit aeb299b

Please sign in to comment.