all: AVX-512 (#217)

Extends avo to support most AVX-512 instruction sets.

The instruction type is extended to support suffixes. The K family of opmask
registers is added to the register package, and the operand package is updated
to support the new operand types. Move instruction deduction in `Load` and
`Store` is extended to support KMOV* and VMOV* forms.

Internal code generation packages were overhauled. Instruction database loading
required various messy changes to account for the additional complexities of the
AVX-512 instruction sets. The internal/api package was added to introduce a
separation between instruction forms in the database, and the functions avo
provides to create them. This was required since with instruction suffixes there
is no longer a one-to-one mapping between instruction constructors and opcodes.

AVX-512 bloated generated source code size substantially, initially increasing
compilation and CI test times to an unacceptable level. Two changes were made to
address this:

1.  Instruction constructors in the `x86` package moved to an optab-based
    approach. This compiles substantially faster than the verbose code
    generation we had before.

2.  The most verbose code-generated tests are moved under build tags and
    limited to a stress test mode. Stress test builds are run on
    schedule but not in regular CI.

An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and
test the new functionality.

Updates #20 #163 #229

Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com>
This commit is contained in:
Michael McLoughlin
2021-11-12 18:35:36 -08:00
parent 2867bd7e01
commit b76e849b5c
71 changed files with 257395 additions and 61474 deletions

View File

@@ -19,5 +19,6 @@ Features:
* **[fnv1a](fnv1a):** [FNV-1a](https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function#FNV-1a_hash) hash function.
* **[dot](dot):** Vector dot product.
* **[geohash](geohash):** Integer [geohash](https://en.wikipedia.org/wiki/Geohash) encoding.
* **[md5x16](md5x16):** AVX-512 accelerated [MD5](https://en.wikipedia.org/wiki/MD5).
* **[sha1](sha1):** [SHA-1](https://en.wikipedia.org/wiki/SHA-1) cryptographic hash.
* **[stadtx](stadtx):** [`StadtX` hash](https://github.com/demerphq/BeagleHash) port from [dgryski/go-stadtx](https://github.com/dgryski/go-stadtx).

135
examples/md5x16/README.md Normal file
View File

@@ -0,0 +1,135 @@
# md5x16
AVX-512 accelerated 16-lane [MD5](https://en.wikipedia.org/wiki/MD5) in `avo`.
Inspired by [`minio/md5-simd`](https://github.com/minio/md5-simd) and
[`igneous-systems/md5vec`](https://github.com/igneous-systems/md5vec).
Note that the focus of this example is the core assembly `block` function. The
`Sum` function can only handle parallel hashes of exactly the same length. In
practice you'd likely need hash server functionality provided by
[`md5-simd`](https://github.com/minio/md5-simd) to multiplex independent hashes
of different lengths into the 16 SIMD lanes.
[embedmd]:# (asm.go /func main/ /^}/)
```go
func main() {
// Define round constants data section.
//
// These may be computed as the integer part of abs(sin(i+1))*2^32.
T := GLOBL("consts", RODATA|NOPTR)
for i := 0; i < 64; i++ {
k := uint32(math.Floor(math.Ldexp(math.Abs(math.Sin(float64(i+1))), 32)))
DATA(4*i, U32(k))
}
// MD5 16-lane block function.
TEXT("block", 0, "func(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)")
Doc(
"block MD5 hashes 16 messages into the running hash states h. Messages are",
"at the given offsets from the base pointer. The 16-bit mask specifies",
"which lanes are active: when bit i is not set loads will be disabled and",
"the value of the resulting hash is undefined.",
)
h := Mem{Base: Load(Param("h"), GP64())}
base := Mem{Base: Load(Param("base"), GP64())}
offsetsptr := Mem{Base: Load(Param("offsets"), GP64())}
mask := Load(Param("mask"), K())
Comment("Load offsets.")
offsets := ZMM()
VMOVUPD(offsetsptr, offsets)
Comment("Load initial hash.")
hash := [4]Register{ZMM(), ZMM(), ZMM(), ZMM()}
for i, r := range hash {
VMOVUPD(h.Offset(64*i), r)
}
Comment("Initialize registers.")
a, b, c, d := ZMM(), ZMM(), ZMM(), ZMM()
for i, r := range []Register{a, b, c, d} {
VMOVUPD(hash[i], r)
}
// Allocate message registers.
m := make([]Register, 16)
for i := range m {
m[i] = ZMM()
}
// Generate round updates.
//
// Each 16-round block is parameterized based on the bitwise function,
// message indexes and shift amounts. Constants B, C, D are helpers in
// computing the logic table required by VPTERNLOGD.
const (
B = uint8(0b10101010)
C = uint8(0b11001100)
D = uint8(0b11110000)
)
quarter := []struct {
F uint8 // ternary logic table
i func(int) int // message index at round r
s []int // shift amounts
}{
{
F: (B & C) | (^B & D),
i: func(r int) int { return r % 16 },
s: []int{7, 12, 17, 22},
},
{
F: (D & B) | (^D & C),
i: func(r int) int { return (5*r + 1) % 16 },
s: []int{5, 9, 14, 20},
},
{
F: B ^ C ^ D,
i: func(r int) int { return (3*r + 5) % 16 },
s: []int{4, 11, 16, 23},
},
{
F: C ^ (B | ^D),
i: func(r int) int { return (7 * r) % 16 },
s: []int{6, 10, 15, 21},
},
}
for r := 0; r < 64; r++ {
Commentf("Round %d.", r)
q := quarter[r/16]
// Load message words.
if r < 16 {
k := K()
KMOVW(mask, k)
VPGATHERDD(base.Offset(4*r).Idx(offsets, 1), k, m[r])
}
VPADDD(m[q.i(r)], a, a)
VPADDD_BCST(T.Offset(4*r), a, a)
f := ZMM()
VMOVUPD(d, f)
VPTERNLOGD(U8(q.F), b, c, f)
VPADDD(f, a, a)
VPROLD(U8(q.s[r%4]), a, a)
VPADDD(b, a, a)
a, b, c, d = d, a, b, c
}
Comment("Final add.")
for i, r := range []Register{a, b, c, d} {
VPADDD(r, hash[i], hash[i])
}
Comment("Store results back.")
for i, r := range hash {
VMOVUPD(r, h.Offset(64*i))
}
VZEROUPPER()
RET()
Generate()
}
```

132
examples/md5x16/asm.go Normal file
View File

@@ -0,0 +1,132 @@
//go:build ignore
// +build ignore
package main
import (
"math"
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
)
// main generates the 16-lane MD5 block function and its round constant table.
func main() {
	// Define round constants data section.
	//
	// These may be computed as the integer part of abs(sin(i+1))*2^32.
	T := GLOBL("consts", RODATA|NOPTR)
	for i := 0; i < 64; i++ {
		k := uint32(math.Floor(math.Ldexp(math.Abs(math.Sin(float64(i+1))), 32)))
		DATA(4*i, U32(k))
	}

	// MD5 16-lane block function.
	TEXT("block", 0, "func(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)")
	Doc(
		"block MD5 hashes 16 messages into the running hash states h. Messages are",
		"at the given offsets from the base pointer. The 16-bit mask specifies",
		"which lanes are active: when bit i is not set loads will be disabled and",
		"the value of the resulting hash is undefined.",
	)

	// Dereference arguments: h, base and offsets are pointers; the lane mask
	// is loaded directly into a K opmask register.
	h := Mem{Base: Load(Param("h"), GP64())}
	base := Mem{Base: Load(Param("base"), GP64())}
	offsetsptr := Mem{Base: Load(Param("offsets"), GP64())}
	mask := Load(Param("mask"), K())

	Comment("Load offsets.")
	// One 32-bit offset per lane, used as the gather index vector below.
	offsets := ZMM()
	VMOVUPD(offsetsptr, offsets)

	Comment("Load initial hash.")
	// hash[i] holds state word i for all 16 lanes.
	hash := [4]Register{ZMM(), ZMM(), ZMM(), ZMM()}
	for i, r := range hash {
		VMOVUPD(h.Offset(64*i), r)
	}

	Comment("Initialize registers.")
	// Working copies of the state; the originals are needed for the final add.
	a, b, c, d := ZMM(), ZMM(), ZMM(), ZMM()
	for i, r := range []Register{a, b, c, d} {
		VMOVUPD(hash[i], r)
	}

	// Allocate message registers.
	//
	// m[j] holds message word j for all 16 lanes: words are gathered once in
	// the first 16 rounds and reused in the remaining 48.
	m := make([]Register, 16)
	for i := range m {
		m[i] = ZMM()
	}

	// Generate round updates.
	//
	// Each 16-round block is parameterized based on the bitwise function,
	// message indexes and shift amounts. Constants B, C, D are helpers in
	// computing the logic table required by VPTERNLOGD.
	const (
		B = uint8(0b10101010)
		C = uint8(0b11001100)
		D = uint8(0b11110000)
	)
	quarter := []struct {
		F uint8         // ternary logic table
		i func(int) int // message index at round r
		s []int         // shift amounts
	}{
		{
			F: (B & C) | (^B & D),
			i: func(r int) int { return r % 16 },
			s: []int{7, 12, 17, 22},
		},
		{
			F: (D & B) | (^D & C),
			i: func(r int) int { return (5*r + 1) % 16 },
			s: []int{5, 9, 14, 20},
		},
		{
			F: B ^ C ^ D,
			i: func(r int) int { return (3*r + 5) % 16 },
			s: []int{4, 11, 16, 23},
		},
		{
			F: C ^ (B | ^D),
			i: func(r int) int { return (7 * r) % 16 },
			s: []int{6, 10, 15, 21},
		},
	}
	for r := 0; r < 64; r++ {
		Commentf("Round %d.", r)
		q := quarter[r/16]

		// Load message words. VPGATHERDD clears opmask bits as elements
		// complete, so each gather is given a fresh copy of the lane mask.
		if r < 16 {
			k := K()
			KMOVW(mask, k)
			VPGATHERDD(base.Offset(4*r).Idx(offsets, 1), k, m[r])
		}

		// a += m[g(r)] + T[r]; the round constant is broadcast to all lanes.
		VPADDD(m[q.i(r)], a, a)
		VPADDD_BCST(T.Offset(4*r), a, a)
		// f = F(b, c, d), computed in a single VPTERNLOGD via the table q.F.
		f := ZMM()
		VMOVUPD(d, f)
		VPTERNLOGD(U8(q.F), b, c, f)
		VPADDD(f, a, a)
		// a = (a <<< s) + b.
		VPROLD(U8(q.s[r%4]), a, a)
		VPADDD(b, a, a)
		// Rotate the roles of the state registers for the next round.
		a, b, c, d = d, a, b, c
	}

	Comment("Final add.")
	for i, r := range []Register{a, b, c, d} {
		VPADDD(r, hash[i], hash[i])
	}

	Comment("Store results back.")
	for i, r := range hash {
		VMOVUPD(r, h.Offset(64*i))
	}

	VZEROUPPER()
	RET()

	Generate()
}

151
examples/md5x16/md5x16.go Normal file
View File

@@ -0,0 +1,151 @@
// Package md5x16 implements 16-lane parallel MD5 with AVX-512 instructions.
package md5x16
import (
"encoding/binary"
"errors"
"math"
"reflect"
"unsafe"
)
//go:generate go run asm.go -out md5x16.s -stubs stub.go

// Size is the size of a MD5 checksum in bytes.
const Size = 16

// BlockSize is the block size of MD5 in bytes.
const BlockSize = 64

// Lanes is the maximum number of parallel MD5 computations.
const Lanes = 16
// Validate checks whether the preconditions required by Sum() are met. It
// returns a nil error exactly when data may be passed to Sum without panic.
func Validate(data [Lanes][]byte) error {
	if _, err := config(data); err != nil {
		return err
	}
	return nil
}
// Sum returns the MD5 checksum of up to Lanes data of the same length.
//
// Non-nil inputs must all have the same length, and occupy a memory span
// whose offsets from the lowest data pointer fit in 32 bits. Sum panics if
// the preconditions checked by Validate are not met.
func Sum(data [Lanes][]byte) [Lanes][Size]byte {
	// Determine lane configuration.
	cfg, err := config(data)
	if err != nil {
		panic(err)
	}

	// Initialize hash: the standard MD5 initialization vector in every
	// active lane.
	var h [4][Lanes]uint32
	for _, l := range cfg.active {
		h[0][l] = 0x67452301
		h[1][l] = 0xefcdab89
		h[2][l] = 0x98badcfe
		h[3][l] = 0x10325476
	}

	// Consume full blocks directly from the input data; n tracks the bytes
	// remaining in each lane.
	base, n := cfg.base, cfg.n
	for ; n >= BlockSize; n -= BlockSize {
		block(&h, base, &cfg.offsets, cfg.mask)
		base += BlockSize
	}

	// Final block: copy the n remaining bytes of each active lane into a
	// zeroed local buffer laid out as Lanes contiguous 64-byte regions, and
	// append the 0x80 padding terminator.
	var last [Lanes][]byte
	var buffer [Lanes * BlockSize]byte
	base = dataptr(buffer[:])
	var offsets [Lanes]uint32
	for _, l := range cfg.active {
		last[l] = buffer[l*BlockSize : (l+1)*BlockSize]
		offsets[l] = uint32(l * BlockSize)
		copy(last[l], data[l][cfg.n-n:])
		last[l][n] = 0x80
	}

	// If the 8-byte length field does not fit after the data and terminator,
	// hash this block now and continue padding into a re-zeroed buffer.
	if n >= 56 {
		block(&h, base, &offsets, cfg.mask)
		for i := range buffer {
			buffer[i] = 0
		}
	}

	// Write the total message length in bits and hash the last block.
	for _, l := range cfg.active {
		binary.LittleEndian.PutUint64(last[l][56:], uint64(8*cfg.n))
	}
	block(&h, base, &offsets, cfg.mask)

	// Write the running state into the output digests, little-endian.
	// Inactive lanes are left as the zero value.
	var digest [Lanes][Size]byte
	for _, l := range cfg.active {
		for i := 0; i < 4; i++ {
			binary.LittleEndian.PutUint32(digest[l][4*i:], h[i][l])
		}
	}
	return digest
}
// lanes represents the configuration of the 16 data lanes of an MD5
// computation, in the form consumed by the assembly block function.
type lanes struct {
	n       int           // length in bytes of all active (non-nil) lanes
	active  []int         // indexes of active lanes
	mask    uint16        // mask of active lanes: bit l set when lane l is active
	base    uintptr       // base pointer: minimum data pointer over active lanes
	offsets [Lanes]uint32 // offset of data lanes relative to base
}
// config determines the lane configuration for the provided data. It returns
// an error if there are no active lanes, if the active (non-nil) lanes do not
// all have the same length, or if the data spans a memory region larger than
// 32 bits.
func config(data [Lanes][]byte) (*lanes, error) {
	cfg := new(lanes)

	// Record which lanes are active (non-nil).
	for l := range data {
		if data[l] != nil {
			cfg.active = append(cfg.active, l)
		}
	}
	if len(cfg.active) == 0 {
		return nil, errors.New("no active lanes")
	}

	// Build the lane mask and confirm all active lanes share one length.
	cfg.n = len(data[cfg.active[0]])
	for _, l := range cfg.active {
		cfg.mask |= 1 << l
		if len(data[l]) != cfg.n {
			return nil, errors.New("length mismatch")
		}
	}

	// The base pointer is the minimum data pointer over active lanes.
	cfg.base = ^uintptr(0)
	for _, l := range cfg.active {
		if ptr := dataptr(data[l]); ptr < cfg.base {
			cfg.base = ptr
		}
	}

	// Lane offsets are relative to base and must fit in 32 bits.
	for _, l := range cfg.active {
		offset := dataptr(data[l]) - cfg.base
		if offset > math.MaxUint32 {
			return nil, errors.New("input data exceed 32-bit memory region")
		}
		cfg.offsets[l] = uint32(offset)
	}

	return cfg, nil
}
// dataptr extracts the data pointer from the given slice.
//
// NOTE(review): reflect.SliceHeader is deprecated in later Go releases;
// consider unsafe.SliceData once the module's minimum Go version permits.
func dataptr(data []byte) uintptr {
	// Reinterpret the local slice header to read its Data field. The slice
	// value is passed by value, so the caller's header is not touched.
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&data))
	return hdr.Data
}

714
examples/md5x16/md5x16.s Normal file
View File

@@ -0,0 +1,714 @@
// Code generated by command: go run asm.go -out md5x16.s -stubs stub.go. DO NOT EDIT.
#include "textflag.h"
DATA consts<>+0(SB)/4, $0xd76aa478
DATA consts<>+4(SB)/4, $0xe8c7b756
DATA consts<>+8(SB)/4, $0x242070db
DATA consts<>+12(SB)/4, $0xc1bdceee
DATA consts<>+16(SB)/4, $0xf57c0faf
DATA consts<>+20(SB)/4, $0x4787c62a
DATA consts<>+24(SB)/4, $0xa8304613
DATA consts<>+28(SB)/4, $0xfd469501
DATA consts<>+32(SB)/4, $0x698098d8
DATA consts<>+36(SB)/4, $0x8b44f7af
DATA consts<>+40(SB)/4, $0xffff5bb1
DATA consts<>+44(SB)/4, $0x895cd7be
DATA consts<>+48(SB)/4, $0x6b901122
DATA consts<>+52(SB)/4, $0xfd987193
DATA consts<>+56(SB)/4, $0xa679438e
DATA consts<>+60(SB)/4, $0x49b40821
DATA consts<>+64(SB)/4, $0xf61e2562
DATA consts<>+68(SB)/4, $0xc040b340
DATA consts<>+72(SB)/4, $0x265e5a51
DATA consts<>+76(SB)/4, $0xe9b6c7aa
DATA consts<>+80(SB)/4, $0xd62f105d
DATA consts<>+84(SB)/4, $0x02441453
DATA consts<>+88(SB)/4, $0xd8a1e681
DATA consts<>+92(SB)/4, $0xe7d3fbc8
DATA consts<>+96(SB)/4, $0x21e1cde6
DATA consts<>+100(SB)/4, $0xc33707d6
DATA consts<>+104(SB)/4, $0xf4d50d87
DATA consts<>+108(SB)/4, $0x455a14ed
DATA consts<>+112(SB)/4, $0xa9e3e905
DATA consts<>+116(SB)/4, $0xfcefa3f8
DATA consts<>+120(SB)/4, $0x676f02d9
DATA consts<>+124(SB)/4, $0x8d2a4c8a
DATA consts<>+128(SB)/4, $0xfffa3942
DATA consts<>+132(SB)/4, $0x8771f681
DATA consts<>+136(SB)/4, $0x6d9d6122
DATA consts<>+140(SB)/4, $0xfde5380c
DATA consts<>+144(SB)/4, $0xa4beea44
DATA consts<>+148(SB)/4, $0x4bdecfa9
DATA consts<>+152(SB)/4, $0xf6bb4b60
DATA consts<>+156(SB)/4, $0xbebfbc70
DATA consts<>+160(SB)/4, $0x289b7ec6
DATA consts<>+164(SB)/4, $0xeaa127fa
DATA consts<>+168(SB)/4, $0xd4ef3085
DATA consts<>+172(SB)/4, $0x04881d05
DATA consts<>+176(SB)/4, $0xd9d4d039
DATA consts<>+180(SB)/4, $0xe6db99e5
DATA consts<>+184(SB)/4, $0x1fa27cf8
DATA consts<>+188(SB)/4, $0xc4ac5665
DATA consts<>+192(SB)/4, $0xf4292244
DATA consts<>+196(SB)/4, $0x432aff97
DATA consts<>+200(SB)/4, $0xab9423a7
DATA consts<>+204(SB)/4, $0xfc93a039
DATA consts<>+208(SB)/4, $0x655b59c3
DATA consts<>+212(SB)/4, $0x8f0ccc92
DATA consts<>+216(SB)/4, $0xffeff47d
DATA consts<>+220(SB)/4, $0x85845dd1
DATA consts<>+224(SB)/4, $0x6fa87e4f
DATA consts<>+228(SB)/4, $0xfe2ce6e0
DATA consts<>+232(SB)/4, $0xa3014314
DATA consts<>+236(SB)/4, $0x4e0811a1
DATA consts<>+240(SB)/4, $0xf7537e82
DATA consts<>+244(SB)/4, $0xbd3af235
DATA consts<>+248(SB)/4, $0x2ad7d2bb
DATA consts<>+252(SB)/4, $0xeb86d391
GLOBL consts<>(SB), RODATA|NOPTR, $256
// func block(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)
// Requires: AVX, AVX512F
TEXT ·block(SB), $0-26
MOVQ h+0(FP), AX
MOVQ base+8(FP), CX
MOVQ offsets+16(FP), DX
KMOVW mask+24(FP), K1
// Load offsets.
VMOVUPD (DX), Z0
// Load initial hash.
VMOVUPD (AX), Z1
VMOVUPD 64(AX), Z2
VMOVUPD 128(AX), Z3
VMOVUPD 192(AX), Z4
// Initialize registers.
VMOVUPD Z1, Z5
VMOVUPD Z2, Z6
VMOVUPD Z3, Z7
VMOVUPD Z4, Z8
// Round 0.
KMOVW K1, K2
VPGATHERDD (CX)(Z0*1), K2, Z9
VPADDD Z9, Z5, Z5
VPADDD.BCST consts<>+0(SB), Z5, Z5
VMOVUPD Z8, Z25
VPTERNLOGD $0xd8, Z6, Z7, Z25
VPADDD Z25, Z5, Z5
VPROLD $0x07, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 1.
KMOVW K1, K2
VPGATHERDD 4(CX)(Z0*1), K2, Z10
VPADDD Z10, Z8, Z8
VPADDD.BCST consts<>+4(SB), Z8, Z8
VMOVUPD Z7, Z25
VPTERNLOGD $0xd8, Z5, Z6, Z25
VPADDD Z25, Z8, Z8
VPROLD $0x0c, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 2.
KMOVW K1, K2
VPGATHERDD 8(CX)(Z0*1), K2, Z11
VPADDD Z11, Z7, Z7
VPADDD.BCST consts<>+8(SB), Z7, Z7
VMOVUPD Z6, Z25
VPTERNLOGD $0xd8, Z8, Z5, Z25
VPADDD Z25, Z7, Z7
VPROLD $0x11, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 3.
KMOVW K1, K2
VPGATHERDD 12(CX)(Z0*1), K2, Z12
VPADDD Z12, Z6, Z6
VPADDD.BCST consts<>+12(SB), Z6, Z6
VMOVUPD Z5, Z25
VPTERNLOGD $0xd8, Z7, Z8, Z25
VPADDD Z25, Z6, Z6
VPROLD $0x16, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 4.
KMOVW K1, K2
VPGATHERDD 16(CX)(Z0*1), K2, Z13
VPADDD Z13, Z5, Z5
VPADDD.BCST consts<>+16(SB), Z5, Z5
VMOVUPD Z8, Z25
VPTERNLOGD $0xd8, Z6, Z7, Z25
VPADDD Z25, Z5, Z5
VPROLD $0x07, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 5.
KMOVW K1, K2
VPGATHERDD 20(CX)(Z0*1), K2, Z14
VPADDD Z14, Z8, Z8
VPADDD.BCST consts<>+20(SB), Z8, Z8
VMOVUPD Z7, Z25
VPTERNLOGD $0xd8, Z5, Z6, Z25
VPADDD Z25, Z8, Z8
VPROLD $0x0c, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 6.
KMOVW K1, K2
VPGATHERDD 24(CX)(Z0*1), K2, Z15
VPADDD Z15, Z7, Z7
VPADDD.BCST consts<>+24(SB), Z7, Z7
VMOVUPD Z6, Z25
VPTERNLOGD $0xd8, Z8, Z5, Z25
VPADDD Z25, Z7, Z7
VPROLD $0x11, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 7.
KMOVW K1, K2
VPGATHERDD 28(CX)(Z0*1), K2, Z16
VPADDD Z16, Z6, Z6
VPADDD.BCST consts<>+28(SB), Z6, Z6
VMOVUPD Z5, Z25
VPTERNLOGD $0xd8, Z7, Z8, Z25
VPADDD Z25, Z6, Z6
VPROLD $0x16, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 8.
KMOVW K1, K2
VPGATHERDD 32(CX)(Z0*1), K2, Z17
VPADDD Z17, Z5, Z5
VPADDD.BCST consts<>+32(SB), Z5, Z5
VMOVUPD Z8, Z25
VPTERNLOGD $0xd8, Z6, Z7, Z25
VPADDD Z25, Z5, Z5
VPROLD $0x07, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 9.
KMOVW K1, K2
VPGATHERDD 36(CX)(Z0*1), K2, Z18
VPADDD Z18, Z8, Z8
VPADDD.BCST consts<>+36(SB), Z8, Z8
VMOVUPD Z7, Z25
VPTERNLOGD $0xd8, Z5, Z6, Z25
VPADDD Z25, Z8, Z8
VPROLD $0x0c, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 10.
KMOVW K1, K2
VPGATHERDD 40(CX)(Z0*1), K2, Z19
VPADDD Z19, Z7, Z7
VPADDD.BCST consts<>+40(SB), Z7, Z7
VMOVUPD Z6, Z25
VPTERNLOGD $0xd8, Z8, Z5, Z25
VPADDD Z25, Z7, Z7
VPROLD $0x11, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 11.
KMOVW K1, K2
VPGATHERDD 44(CX)(Z0*1), K2, Z20
VPADDD Z20, Z6, Z6
VPADDD.BCST consts<>+44(SB), Z6, Z6
VMOVUPD Z5, Z25
VPTERNLOGD $0xd8, Z7, Z8, Z25
VPADDD Z25, Z6, Z6
VPROLD $0x16, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 12.
KMOVW K1, K2
VPGATHERDD 48(CX)(Z0*1), K2, Z21
VPADDD Z21, Z5, Z5
VPADDD.BCST consts<>+48(SB), Z5, Z5
VMOVUPD Z8, Z25
VPTERNLOGD $0xd8, Z6, Z7, Z25
VPADDD Z25, Z5, Z5
VPROLD $0x07, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 13.
KMOVW K1, K2
VPGATHERDD 52(CX)(Z0*1), K2, Z22
VPADDD Z22, Z8, Z8
VPADDD.BCST consts<>+52(SB), Z8, Z8
VMOVUPD Z7, Z25
VPTERNLOGD $0xd8, Z5, Z6, Z25
VPADDD Z25, Z8, Z8
VPROLD $0x0c, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 14.
KMOVW K1, K2
VPGATHERDD 56(CX)(Z0*1), K2, Z23
VPADDD Z23, Z7, Z7
VPADDD.BCST consts<>+56(SB), Z7, Z7
VMOVUPD Z6, Z25
VPTERNLOGD $0xd8, Z8, Z5, Z25
VPADDD Z25, Z7, Z7
VPROLD $0x11, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 15.
KMOVW K1, K1
VPGATHERDD 60(CX)(Z0*1), K1, Z24
VPADDD Z24, Z6, Z6
VPADDD.BCST consts<>+60(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xd8, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x16, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 16.
VPADDD Z10, Z5, Z5
VPADDD.BCST consts<>+64(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0xac, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x05, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 17.
VPADDD Z15, Z8, Z8
VPADDD.BCST consts<>+68(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0xac, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x09, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 18.
VPADDD Z20, Z7, Z7
VPADDD.BCST consts<>+72(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0xac, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0e, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 19.
VPADDD Z9, Z6, Z6
VPADDD.BCST consts<>+76(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xac, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x14, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 20.
VPADDD Z14, Z5, Z5
VPADDD.BCST consts<>+80(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0xac, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x05, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 21.
VPADDD Z19, Z8, Z8
VPADDD.BCST consts<>+84(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0xac, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x09, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 22.
VPADDD Z24, Z7, Z7
VPADDD.BCST consts<>+88(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0xac, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0e, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 23.
VPADDD Z13, Z6, Z6
VPADDD.BCST consts<>+92(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xac, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x14, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 24.
VPADDD Z18, Z5, Z5
VPADDD.BCST consts<>+96(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0xac, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x05, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 25.
VPADDD Z23, Z8, Z8
VPADDD.BCST consts<>+100(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0xac, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x09, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 26.
VPADDD Z12, Z7, Z7
VPADDD.BCST consts<>+104(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0xac, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0e, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 27.
VPADDD Z17, Z6, Z6
VPADDD.BCST consts<>+108(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xac, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x14, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 28.
VPADDD Z22, Z5, Z5
VPADDD.BCST consts<>+112(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0xac, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x05, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 29.
VPADDD Z11, Z8, Z8
VPADDD.BCST consts<>+116(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0xac, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x09, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 30.
VPADDD Z16, Z7, Z7
VPADDD.BCST consts<>+120(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0xac, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0e, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 31.
VPADDD Z21, Z6, Z6
VPADDD.BCST consts<>+124(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xac, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x14, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 32.
VPADDD Z14, Z5, Z5
VPADDD.BCST consts<>+128(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x96, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x04, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 33.
VPADDD Z17, Z8, Z8
VPADDD.BCST consts<>+132(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x96, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0b, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 34.
VPADDD Z20, Z7, Z7
VPADDD.BCST consts<>+136(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x96, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x10, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 35.
VPADDD Z23, Z6, Z6
VPADDD.BCST consts<>+140(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x96, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x17, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 36.
VPADDD Z10, Z5, Z5
VPADDD.BCST consts<>+144(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x96, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x04, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 37.
VPADDD Z13, Z8, Z8
VPADDD.BCST consts<>+148(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x96, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0b, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 38.
VPADDD Z16, Z7, Z7
VPADDD.BCST consts<>+152(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x96, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x10, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 39.
VPADDD Z19, Z6, Z6
VPADDD.BCST consts<>+156(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x96, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x17, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 40.
VPADDD Z22, Z5, Z5
VPADDD.BCST consts<>+160(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x96, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x04, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 41.
VPADDD Z9, Z8, Z8
VPADDD.BCST consts<>+164(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x96, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0b, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 42.
VPADDD Z12, Z7, Z7
VPADDD.BCST consts<>+168(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x96, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x10, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 43.
VPADDD Z15, Z6, Z6
VPADDD.BCST consts<>+172(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x96, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x17, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 44.
VPADDD Z18, Z5, Z5
VPADDD.BCST consts<>+176(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x96, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x04, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 45.
VPADDD Z21, Z8, Z8
VPADDD.BCST consts<>+180(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x96, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0b, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 46.
VPADDD Z24, Z7, Z7
VPADDD.BCST consts<>+184(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x96, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x10, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 47.
VPADDD Z11, Z6, Z6
VPADDD.BCST consts<>+188(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x96, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x17, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 48.
VPADDD Z9, Z5, Z5
VPADDD.BCST consts<>+192(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x63, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x06, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 49.
VPADDD Z16, Z8, Z8
VPADDD.BCST consts<>+196(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x63, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0a, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 50.
VPADDD Z23, Z7, Z7
VPADDD.BCST consts<>+200(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x63, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0f, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 51.
VPADDD Z14, Z6, Z6
VPADDD.BCST consts<>+204(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x63, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x15, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 52.
VPADDD Z21, Z5, Z5
VPADDD.BCST consts<>+208(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x63, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x06, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 53.
VPADDD Z12, Z8, Z8
VPADDD.BCST consts<>+212(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x63, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0a, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 54.
VPADDD Z19, Z7, Z7
VPADDD.BCST consts<>+216(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x63, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0f, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 55.
VPADDD Z10, Z6, Z6
VPADDD.BCST consts<>+220(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x63, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x15, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 56.
VPADDD Z17, Z5, Z5
VPADDD.BCST consts<>+224(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x63, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x06, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 57.
VPADDD Z24, Z8, Z8
VPADDD.BCST consts<>+228(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x63, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0a, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 58.
VPADDD Z15, Z7, Z7
VPADDD.BCST consts<>+232(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x63, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0f, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 59.
VPADDD Z22, Z6, Z6
VPADDD.BCST consts<>+236(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x63, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x15, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 60.
VPADDD Z13, Z5, Z5
VPADDD.BCST consts<>+240(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x63, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x06, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 61.
VPADDD Z20, Z8, Z8
VPADDD.BCST consts<>+244(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x63, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0a, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 62.
VPADDD Z11, Z7, Z7
VPADDD.BCST consts<>+248(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x63, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0f, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 63.
VPADDD Z18, Z6, Z6
VPADDD.BCST consts<>+252(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x63, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x15, Z6, Z6
VPADDD Z7, Z6, Z6
// Final add.
VPADDD Z5, Z1, Z1
VPADDD Z6, Z2, Z2
VPADDD Z7, Z3, Z3
VPADDD Z8, Z4, Z4
// Store results back.
VMOVUPD Z1, (AX)
VMOVUPD Z2, 64(AX)
VMOVUPD Z3, 128(AX)
VMOVUPD Z4, 192(AX)
VZEROUPPER
RET

View File

@@ -0,0 +1,136 @@
package md5x16
import (
"crypto/md5"
"encoding/hex"
"math/rand"
"testing"
"testing/quick"
"golang.org/x/sys/cpu"
)
// RequireISA skips the calling test when the host CPU cannot execute the
// AVX-512F instructions used by this package.
func RequireISA(t *testing.T) {
	t.Helper()
	if cpu.X86.HasAVX512F {
		return
	}
	t.Skip("requires AVX512F instruction set")
}
// TestVectors checks well-known MD5 test vectors against the 16-lane
// implementation.
func TestVectors(t *testing.T) {
	RequireISA(t)
	for _, c := range []struct {
		Data      string
		HexDigest string
	}{
		{"", "d41d8cd98f00b204e9800998ecf8427e"},
		{"The quick brown fox jumps over the lazy dog", "9e107d9d372bb6826bd81d3542a419d6"},
		{"The quick brown fox jumps over the lazy dog.", "e4d909c290d0fb1ca068ffaddf22cbd0"},
	} {
		digest := Single(t, []byte(c.Data))
		if got := hex.EncodeToString(digest[:]); got != c.HexDigest {
			t.Errorf("Sum(%#v) = %s; expect %s", c.Data, got, c.HexDigest)
		}
	}
}
// TestCmp cross-checks the 16-lane implementation against the standard
// library crypto/md5 on randomized inputs.
func TestCmp(t *testing.T) {
	RequireISA(t)
	ours := func(data []byte) [Size]byte {
		return Single(t, data)
	}
	if err := quick.CheckEqual(ours, md5.Sum, nil); err != nil {
		t.Fatal(err)
	}
}
// TestLengths verifies every input length from zero up to 64 blocks,
// exercising all padding cases.
func TestLengths(t *testing.T) {
	RequireISA(t)
	const max = BlockSize << 6
	data := make([]byte, max)
	rand.Read(data)
	for n := 0; n <= max; n++ {
		if got, expect := Single(t, data[:n]), md5.Sum(data[:n]); got != expect {
			t.Fatalf("failed on length %d", n)
		}
	}
}
// Single hashes a single data buffer in all 16 lanes and returns the result,
// after asserting that all lanes are the same.
func Single(t *testing.T, d []byte) [Size]byte {
	// Replicate the same buffer into every lane.
	var data [Lanes][]byte
	for l := range data {
		data[l] = d
	}
	if err := Validate(data); err != nil {
		t.Fatal(err)
	}

	// Hash, then confirm every lane agrees with lane zero.
	digest := Sum(data)
	for l := range data {
		if digest[l] == digest[0] {
			continue
		}
		t.Logf("lane %02d: %x", 0, digest[0])
		t.Logf("lane %02d: %x", l, digest[l])
		t.Fatal("lane mismatch")
	}
	return digest[0]
}
// TestActiveLanes hashes random subsets of active lanes and verifies active
// lanes match crypto/md5 while inactive lanes produce the zero digest.
func TestActiveLanes(t *testing.T) {
	RequireISA(t)
	const trials = 1 << 10
	const maxlen = BlockSize << 6
	for trial := 0; trial < trials; trial++ {
		// Pick a random non-empty, non-full set of active lanes.
		count := 1 + rand.Intn(Lanes-1)
		active := rand.Perm(Lanes)[:count]

		// Fill the chosen lanes with slices of a shared random buffer.
		n := rand.Intn(maxlen)
		buffer := make([]byte, count*n)
		rand.Read(buffer)
		var data [Lanes][]byte
		for i, l := range active {
			data[l] = buffer[i*n : (i+1)*n]
		}

		digest := Sum(data)

		// Active lanes must agree with the reference implementation.
		isactive := make(map[int]bool, count)
		for _, l := range active {
			isactive[l] = true
			if expect := md5.Sum(data[l]); digest[l] != expect {
				t.Fatalf("lane %02d: mismatch", l)
			}
		}

		// All other lanes must be untouched (zero).
		var zero [Size]byte
		for l := 0; l < Lanes; l++ {
			if !isactive[l] && digest[l] != zero {
				t.Fatalf("inactive lane %d is non-zero", l)
			}
		}
	}
}

9
examples/md5x16/stub.go Normal file
View File

@@ -0,0 +1,9 @@
// Code generated by command: go run asm.go -out md5x16.s -stubs stub.go. DO NOT EDIT.
package md5x16
// block MD5 hashes 16 messages into the running hash states h. Messages are
// at the given offsets from the base pointer. The 16-bit mask specifies
// which lanes are active: when bit i is not set loads will be disabled and
// the value of the resulting hash is undefined.
func block(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)