Files
avo/examples/md5x16
Michael McLoughlin b76e849b5c all: AVX-512 (#217)
Extends avo to support most AVX-512 instruction sets.

The instruction type is extended to support suffixes. The K family of opmask
registers is added to the register package, and the operand package is updated
to support the new operand types. Move instruction deduction in `Load` and
`Store` is extended to support KMOV* and VMOV* forms.

Internal code generation packages were overhauled. Instruction database loading
required various messy changes to account for the additional complexities of the
AVX-512 instruction sets. The internal/api package was added to introduce a
separation between instruction forms in the database, and the functions avo
provides to create them. This was required since with instruction suffixes there
is no longer a one-to-one mapping between instruction constructors and opcodes.

AVX-512 bloated generated source code size substantially, initially increasing
compilation and CI test times to an unacceptable level. Two changes were made to
address this:

1.  Instruction constructors in the `x86` package moved to an optab-based
    approach. This compiles substantially faster than the verbose code
    generation we had before.

2.  The most verbose code-generated tests are moved under build tags and
    limited to a stress test mode. Stress test builds are run on a
    schedule but not in regular CI.

An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and
test the new functionality.

Updates #20 #163 #229

Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com>
2021-11-12 19:02:39 -08:00
..
2021-11-12 19:02:39 -08:00
2021-11-12 19:02:39 -08:00
2021-11-12 19:02:39 -08:00
2021-11-12 19:02:39 -08:00
2021-11-12 19:02:39 -08:00
2021-11-12 19:02:39 -08:00

md5x16

AVX-512 accelerated 16-lane MD5 in avo.

Inspired by minio/md5-simd and igneous-systems/md5vec.

Note that the focus of this example is the core assembly block function. The Sum function can only handle parallel hashes of exactly the same length. In practice you'd likely need the hash server functionality provided by md5-simd to multiplex independent hashes of different lengths into the 16 SIMD lanes.

// main describes the md5x16 assembly to avo: a read-only table holding the 64
// MD5 round constants, followed by the 16-lane "block" function, and then asks
// avo to generate the output.
func main() {
	// Round constant table. Entry n (counting from 1) is the integer part of
	// abs(sin(n))*2^32, written as consecutive 32-bit words.
	table := GLOBL("consts", RODATA|NOPTR)
	for n := 1; n <= 64; n++ {
		scaled := math.Ldexp(math.Abs(math.Sin(float64(n))), 32)
		DATA(4*(n-1), U32(uint32(math.Floor(scaled))))
	}

	// Declare the 16-lane block function and its documentation.
	TEXT("block", 0, "func(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)")
	Doc(
		"block MD5 hashes 16 messages into the running hash states h. Messages are",
		"at the given offsets from the base pointer. The 16-bit mask specifies",
		"which lanes are active: when bit i is not set loads will be disabled and",
		"the value of the resulting hash is undefined.",
	)

	// Bring the arguments into registers. The three address-like arguments are
	// wrapped as memory operands; the lane mask goes into an opmask register.
	state := Mem{Base: Load(Param("h"), GP64())}
	msgBase := Mem{Base: Load(Param("base"), GP64())}
	laneOffsets := Mem{Base: Load(Param("offsets"), GP64())}
	active := Load(Param("mask"), K())

	Comment("Load offsets.")
	idx := ZMM()
	VMOVUPD(laneOffsets, idx)

	// One 64-byte vector per hash word, stored back-to-back behind h.
	Comment("Load initial hash.")
	var digest [4]Register
	for w := range digest {
		digest[w] = ZMM()
	}
	for w := range digest {
		VMOVUPD(state.Offset(64*w), digest[w])
	}

	// Working copies of the hash words; v[0..3] play the roles of a, b, c, d
	// and are rotated after every round.
	Comment("Initialize registers.")
	var v [4]Register
	for w := range v {
		v[w] = ZMM()
	}
	for w := range v {
		VMOVUPD(digest[w], v[w])
	}

	// Registers for the 16 message words. They are filled by the gathers
	// issued during the first 16 rounds and reused afterwards.
	var words [16]Register
	for w := range words {
		words[w] = ZMM()
	}

	// The 64 rounds fall into four stages of 16. A stage fixes the bitwise
	// function, the message word schedule and the rotate amounts. B, C and D
	// are helpers for writing each stage's bitwise function as the logic table
	// required by VPTERNLOGD.
	const (
		B = uint8(0b10101010)
		C = uint8(0b11001100)
		D = uint8(0b11110000)
	)
	type stage struct {
		logic    uint8  // ternary logic table passed to VPTERNLOGD
		mul, add int    // message word used in round r is (mul*r + add) % 16
		rot      [4]int // rotate amounts, selected by r % 4
	}
	stages := [4]stage{
		{logic: (B & C) | (^B & D), mul: 1, add: 0, rot: [4]int{7, 12, 17, 22}},
		{logic: (D & B) | (^D & C), mul: 5, add: 1, rot: [4]int{5, 9, 14, 20}},
		{logic: B ^ C ^ D, mul: 3, add: 5, rot: [4]int{4, 11, 16, 23}},
		{logic: C ^ (B | ^D), mul: 7, add: 0, rot: [4]int{6, 10, 15, 21}},
	}

	for r := 0; r < 64; r++ {
		Commentf("Round %d.", r)
		st := stages[r/16]
		a, b, c, d := v[0], v[1], v[2], v[3]

		// Message word r is gathered just before its first use. The lane mask
		// is copied into a fresh opmask register for every gather rather than
		// passed directly.
		// NOTE(review): presumably because the gather overwrites its mask
		// operand - confirm against the instruction reference.
		if r < 16 {
			k := K()
			KMOVW(active, k)
			VPGATHERDD(msgBase.Offset(4*r).Idx(idx, 1), k, words[r])
		}

		// a += message word + round constant + logic(b, c, d), then rotate
		// left and add b.
		word := words[(st.mul*r+st.add)%16]
		VPADDD(word, a, a)
		VPADDD_BCST(table.Offset(4*r), a, a)
		tmp := ZMM()
		VMOVUPD(d, tmp)
		VPTERNLOGD(U8(st.logic), b, c, tmp)
		VPADDD(tmp, a, a)
		VPROLD(U8(st.rot[r%4]), a, a)
		VPADDD(b, a, a)

		// Rotate roles for the next round: (a, b, c, d) <- (d, a, b, c).
		v = [4]Register{d, a, b, c}
	}

	// After 64 rotations the roles are back in their starting positions, so
	// v[w] lines up with digest[w].
	Comment("Final add.")
	for w := range v {
		VPADDD(v[w], digest[w], digest[w])
	}

	Comment("Store results back.")
	for w := range digest {
		VMOVUPD(digest[w], state.Offset(64*w))
	}

	VZEROUPPER()
	RET()

	Generate()
}