Files
avo/examples/md5x16/asm.go
Michael McLoughlin b76e849b5c all: AVX-512 (#217)
Extends avo to support most AVX-512 instruction sets.

The instruction type is extended to support suffixes. The K family of opmask
registers is added to the register package, and the operand package is updated
to support the new operand types. Move instruction deduction in `Load` and
`Store` is extended to support KMOV* and VMOV* forms.

Internal code generation packages were overhauled. Instruction database loading
required various messy changes to account for the additional complexities of the
AVX-512 instruction sets. The internal/api package was added to introduce a
separation between instruction forms in the database, and the functions avo
provides to create them. This was required since with instruction suffixes there
is no longer a one-to-one mapping between instruction constructors and opcodes.

AVX-512 bloated generated source code size substantially, initially increasing
compilation and CI test times to an unacceptable level. Two changes were made to
address this:

1.  Instruction constructors in the `x86` package moved to an optab-based
    approach. This compiles substantially faster than the verbose code
    generation we had before.

2.  The most verbose code-generated tests are moved under build tags and
    limited to a stress test mode. Stress test builds are run on
    schedule but not in regular CI.

An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and
test the new functionality.

Updates #20 #163 #229

Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com>
2021-11-12 19:02:39 -08:00

133 lines
3.0 KiB
Go

//go:build ignore
// +build ignore
package main
import (
"math"
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
)
func main() {
// Define round constants data section.
//
// These may be computed as the integer part of abs(sin(i+1))*2^32.
T := GLOBL("consts", RODATA|NOPTR)
for i := 0; i < 64; i++ {
k := uint32(math.Floor(math.Ldexp(math.Abs(math.Sin(float64(i+1))), 32)))
DATA(4*i, U32(k))
}
// MD5 16-lane block function.
TEXT("block", 0, "func(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)")
Doc(
"block MD5 hashes 16 messages into the running hash states h. Messages are",
"at the given offsets from the base pointer. The 16-bit mask specifies",
"which lanes are active: when bit i is not set loads will be disabled and",
"the value of the resulting hash is undefined.",
)
h := Mem{Base: Load(Param("h"), GP64())}
base := Mem{Base: Load(Param("base"), GP64())}
offsetsptr := Mem{Base: Load(Param("offsets"), GP64())}
mask := Load(Param("mask"), K())
Comment("Load offsets.")
offsets := ZMM()
VMOVUPD(offsetsptr, offsets)
Comment("Load initial hash.")
hash := [4]Register{ZMM(), ZMM(), ZMM(), ZMM()}
for i, r := range hash {
VMOVUPD(h.Offset(64*i), r)
}
Comment("Initialize registers.")
a, b, c, d := ZMM(), ZMM(), ZMM(), ZMM()
for i, r := range []Register{a, b, c, d} {
VMOVUPD(hash[i], r)
}
// Allocate message registers.
m := make([]Register, 16)
for i := range m {
m[i] = ZMM()
}
// Generate round updates.
//
// Each 16-round block is parameterized based on the btiwise function,
// message indexes and shift amounts. Constants B, C, D are helpers in
// computing the logic table required by VPTERNLOGD.
const (
B = uint8(0b10101010)
C = uint8(0b11001100)
D = uint8(0b11110000)
)
quarter := []struct {
F uint8 // ternary logic table
i func(int) int // message index at round r
s []int // shift amounts
}{
{
F: (B & C) | (^B & D),
i: func(r int) int { return r % 16 },
s: []int{7, 12, 17, 22},
},
{
F: (D & B) | (^D & C),
i: func(r int) int { return (5*r + 1) % 16 },
s: []int{5, 9, 14, 20},
},
{
F: B ^ C ^ D,
i: func(r int) int { return (3*r + 5) % 16 },
s: []int{4, 11, 16, 23},
},
{
F: C ^ (B | ^D),
i: func(r int) int { return (7 * r) % 16 },
s: []int{6, 10, 15, 21},
},
}
for r := 0; r < 64; r++ {
Commentf("Round %d.", r)
q := quarter[r/16]
// Load message words.
if r < 16 {
k := K()
KMOVW(mask, k)
VPGATHERDD(base.Offset(4*r).Idx(offsets, 1), k, m[r])
}
VPADDD(m[q.i(r)], a, a)
VPADDD_BCST(T.Offset(4*r), a, a)
f := ZMM()
VMOVUPD(d, f)
VPTERNLOGD(U8(q.F), b, c, f)
VPADDD(f, a, a)
VPROLD(U8(q.s[r%4]), a, a)
VPADDD(b, a, a)
a, b, c, d = d, a, b, c
}
Comment("Final add.")
for i, r := range []Register{a, b, c, d} {
VPADDD(r, hash[i], hash[i])
}
Comment("Store results back.")
for i, r := range hash {
VMOVUPD(r, h.Offset(64*i))
}
VZEROUPPER()
RET()
Generate()
}