all: AVX-512 (#217)

Extends avo to support most AVX-512 instruction sets.

The instruction type is extended to support suffixes. The K family of opmask
registers is added to the register package, and the operand package is updated
to support the new operand types. Move instruction deduction in `Load` and
`Store` is extended to support KMOV* and VMOV* forms.

Internal code generation packages were overhauled. Instruction database loading
required various messy changes to account for the additional complexities of the
AVX-512 instruction sets. The internal/api package was added to introduce a
separation between instruction forms in the database, and the functions avo
provides to create them. This was required since with instruction suffixes there
is no longer a one-to-one mapping between instruction constructors and opcodes.

AVX-512 bloated generated source code size substantially, initially increasing
compilation and CI test times to an unacceptable level. Two changes were made to
address this:

1.  Instruction constructors in the `x86` package moved to an optab-based
    approach. This compiles substantially faster than the verbose code
    generation we had before.

2.  The most verbose code-generated tests are moved under build tags and
    limited to a stress test mode. Stress test builds are run on
    schedule but not in regular CI.

An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and
test the new functionality.

Updates #20 #163 #229

Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com>
This commit is contained in:
Michael McLoughlin
2021-11-12 18:35:36 -08:00
parent 2867bd7e01
commit b76e849b5c
71 changed files with 257395 additions and 61474 deletions

View File

@@ -19,5 +19,6 @@ Features:
* **[fnv1a](fnv1a):** [FNV-1a](https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function#FNV-1a_hash) hash function.
* **[dot](dot):** Vector dot product.
* **[geohash](geohash):** Integer [geohash](https://en.wikipedia.org/wiki/Geohash) encoding.
* **[md5x16](md5x16):** AVX-512 accelerated [MD5](https://en.wikipedia.org/wiki/MD5).
* **[sha1](sha1):** [SHA-1](https://en.wikipedia.org/wiki/SHA-1) cryptographic hash.
* **[stadtx](stadtx):** [`StadtX` hash](https://github.com/demerphq/BeagleHash) port from [dgryski/go-stadtx](https://github.com/dgryski/go-stadtx).

135
examples/md5x16/README.md Normal file
View File

@@ -0,0 +1,135 @@
# md5x16
AVX-512 accelerated 16-lane [MD5](https://en.wikipedia.org/wiki/MD5) in `avo`.
Inspired by [`minio/md5-simd`](https://github.com/minio/md5-simd) and
[`igneous-systems/md5vec`](https://github.com/igneous-systems/md5vec).
Note that the focus of this example is the core assembly `block` function. The
`Sum` function can only handle parallel hashes of exactly the same length. In
practice you'd likely need hash server functionality provided by
[`md5-simd`](https://github.com/minio/md5-simd) to multiplex independent hashes
of different lengths into the 16 SIMD lanes.
[embedmd]:# (asm.go /func main/ /^}/)
```go
func main() {
// Define round constants data section.
//
// These may be computed as the integer part of abs(sin(i+1))*2^32.
T := GLOBL("consts", RODATA|NOPTR)
for i := 0; i < 64; i++ {
k := uint32(math.Floor(math.Ldexp(math.Abs(math.Sin(float64(i+1))), 32)))
DATA(4*i, U32(k))
}
// MD5 16-lane block function.
TEXT("block", 0, "func(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)")
Doc(
"block MD5 hashes 16 messages into the running hash states h. Messages are",
"at the given offsets from the base pointer. The 16-bit mask specifies",
"which lanes are active: when bit i is not set loads will be disabled and",
"the value of the resulting hash is undefined.",
)
h := Mem{Base: Load(Param("h"), GP64())}
base := Mem{Base: Load(Param("base"), GP64())}
offsetsptr := Mem{Base: Load(Param("offsets"), GP64())}
mask := Load(Param("mask"), K())
Comment("Load offsets.")
offsets := ZMM()
VMOVUPD(offsetsptr, offsets)
Comment("Load initial hash.")
hash := [4]Register{ZMM(), ZMM(), ZMM(), ZMM()}
for i, r := range hash {
VMOVUPD(h.Offset(64*i), r)
}
Comment("Initialize registers.")
a, b, c, d := ZMM(), ZMM(), ZMM(), ZMM()
for i, r := range []Register{a, b, c, d} {
VMOVUPD(hash[i], r)
}
// Allocate message registers.
m := make([]Register, 16)
for i := range m {
m[i] = ZMM()
}
// Generate round updates.
//
// Each 16-round block is parameterized based on the bitwise function,
// message indexes and shift amounts. Constants B, C, D are helpers in
// computing the logic table required by VPTERNLOGD.
const (
B = uint8(0b10101010)
C = uint8(0b11001100)
D = uint8(0b11110000)
)
quarter := []struct {
F uint8 // ternary logic table
i func(int) int // message index at round r
s []int // shift amounts
}{
{
F: (B & C) | (^B & D),
i: func(r int) int { return r % 16 },
s: []int{7, 12, 17, 22},
},
{
F: (D & B) | (^D & C),
i: func(r int) int { return (5*r + 1) % 16 },
s: []int{5, 9, 14, 20},
},
{
F: B ^ C ^ D,
i: func(r int) int { return (3*r + 5) % 16 },
s: []int{4, 11, 16, 23},
},
{
F: C ^ (B | ^D),
i: func(r int) int { return (7 * r) % 16 },
s: []int{6, 10, 15, 21},
},
}
for r := 0; r < 64; r++ {
Commentf("Round %d.", r)
q := quarter[r/16]
// Load message words.
if r < 16 {
k := K()
KMOVW(mask, k)
VPGATHERDD(base.Offset(4*r).Idx(offsets, 1), k, m[r])
}
VPADDD(m[q.i(r)], a, a)
VPADDD_BCST(T.Offset(4*r), a, a)
f := ZMM()
VMOVUPD(d, f)
VPTERNLOGD(U8(q.F), b, c, f)
VPADDD(f, a, a)
VPROLD(U8(q.s[r%4]), a, a)
VPADDD(b, a, a)
a, b, c, d = d, a, b, c
}
Comment("Final add.")
for i, r := range []Register{a, b, c, d} {
VPADDD(r, hash[i], hash[i])
}
Comment("Store results back.")
for i, r := range hash {
VMOVUPD(r, h.Offset(64*i))
}
VZEROUPPER()
RET()
Generate()
}
```

132
examples/md5x16/asm.go Normal file
View File

@@ -0,0 +1,132 @@
//go:build ignore
// +build ignore
package main
import (
"math"
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
)
// main generates the 16-lane MD5 block function and its round constant table.
func main() {
	// Define round constants data section.
	//
	// These may be computed as the integer part of abs(sin(i+1))*2^32.
	T := GLOBL("consts", RODATA|NOPTR)
	for i := 0; i < 64; i++ {
		k := uint32(math.Floor(math.Ldexp(math.Abs(math.Sin(float64(i+1))), 32)))
		DATA(4*i, U32(k))
	}

	// MD5 16-lane block function.
	TEXT("block", 0, "func(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)")
	Doc(
		"block MD5 hashes 16 messages into the running hash states h. Messages are",
		"at the given offsets from the base pointer. The 16-bit mask specifies",
		"which lanes are active: when bit i is not set loads will be disabled and",
		"the value of the resulting hash is undefined.",
	)

	// Dereference arguments: h, base and offsets are pointers; the lane mask
	// is loaded directly into a K opmask register.
	h := Mem{Base: Load(Param("h"), GP64())}
	base := Mem{Base: Load(Param("base"), GP64())}
	offsetsptr := Mem{Base: Load(Param("offsets"), GP64())}
	mask := Load(Param("mask"), K())

	Comment("Load offsets.")
	// One 32-bit offset per lane, used as the gather index vector below.
	offsets := ZMM()
	VMOVUPD(offsetsptr, offsets)

	Comment("Load initial hash.")
	// hash[i] holds state word i for all 16 lanes.
	hash := [4]Register{ZMM(), ZMM(), ZMM(), ZMM()}
	for i, r := range hash {
		VMOVUPD(h.Offset(64*i), r)
	}

	Comment("Initialize registers.")
	// Working copies of the state; the originals are needed for the final add.
	a, b, c, d := ZMM(), ZMM(), ZMM(), ZMM()
	for i, r := range []Register{a, b, c, d} {
		VMOVUPD(hash[i], r)
	}

	// Allocate message registers.
	//
	// m[j] holds message word j for all 16 lanes: words are gathered once in
	// the first 16 rounds and reused in the remaining 48.
	m := make([]Register, 16)
	for i := range m {
		m[i] = ZMM()
	}

	// Generate round updates.
	//
	// Each 16-round block is parameterized based on the bitwise function,
	// message indexes and shift amounts. Constants B, C, D are helpers in
	// computing the logic table required by VPTERNLOGD.
	const (
		B = uint8(0b10101010)
		C = uint8(0b11001100)
		D = uint8(0b11110000)
	)
	quarter := []struct {
		F uint8         // ternary logic table
		i func(int) int // message index at round r
		s []int         // shift amounts
	}{
		{
			F: (B & C) | (^B & D),
			i: func(r int) int { return r % 16 },
			s: []int{7, 12, 17, 22},
		},
		{
			F: (D & B) | (^D & C),
			i: func(r int) int { return (5*r + 1) % 16 },
			s: []int{5, 9, 14, 20},
		},
		{
			F: B ^ C ^ D,
			i: func(r int) int { return (3*r + 5) % 16 },
			s: []int{4, 11, 16, 23},
		},
		{
			F: C ^ (B | ^D),
			i: func(r int) int { return (7 * r) % 16 },
			s: []int{6, 10, 15, 21},
		},
	}
	for r := 0; r < 64; r++ {
		Commentf("Round %d.", r)
		q := quarter[r/16]

		// Load message words. VPGATHERDD clears opmask bits as elements
		// complete, so each gather is given a fresh copy of the lane mask.
		if r < 16 {
			k := K()
			KMOVW(mask, k)
			VPGATHERDD(base.Offset(4*r).Idx(offsets, 1), k, m[r])
		}

		// a += m[g(r)] + T[r]; the round constant is broadcast to all lanes.
		VPADDD(m[q.i(r)], a, a)
		VPADDD_BCST(T.Offset(4*r), a, a)
		// f = F(b, c, d), computed in a single VPTERNLOGD via the table q.F.
		f := ZMM()
		VMOVUPD(d, f)
		VPTERNLOGD(U8(q.F), b, c, f)
		VPADDD(f, a, a)
		// a = (a <<< s) + b.
		VPROLD(U8(q.s[r%4]), a, a)
		VPADDD(b, a, a)
		// Rotate the roles of the state registers for the next round.
		a, b, c, d = d, a, b, c
	}

	Comment("Final add.")
	for i, r := range []Register{a, b, c, d} {
		VPADDD(r, hash[i], hash[i])
	}

	Comment("Store results back.")
	for i, r := range hash {
		VMOVUPD(r, h.Offset(64*i))
	}

	VZEROUPPER()
	RET()

	Generate()
}

151
examples/md5x16/md5x16.go Normal file
View File

@@ -0,0 +1,151 @@
// Package md5x16 implements 16-lane parallel MD5 with AVX-512 instructions.
package md5x16
import (
"encoding/binary"
"errors"
"math"
"reflect"
"unsafe"
)
//go:generate go run asm.go -out md5x16.s -stubs stub.go

// Size is the size of a MD5 checksum in bytes.
const Size = 16

// BlockSize is the block size of MD5 in bytes.
const BlockSize = 64

// Lanes is the maximum number of parallel MD5 computations.
const Lanes = 16
// Validate checks whether the preconditions required by Sum() are met. It
// returns a nil error exactly when data may be passed to Sum without panic.
func Validate(data [Lanes][]byte) error {
	if _, err := config(data); err != nil {
		return err
	}
	return nil
}
// Sum returns the MD5 checksum of up to Lanes data of the same length.
//
// Non-nil inputs must all have the same length, and occupy a memory span
// whose offsets from the lowest data pointer fit in 32 bits. Sum panics if
// the preconditions checked by Validate are not met.
func Sum(data [Lanes][]byte) [Lanes][Size]byte {
	// Determine lane configuration.
	cfg, err := config(data)
	if err != nil {
		panic(err)
	}

	// Initialize hash: the standard MD5 initialization vector in every
	// active lane.
	var h [4][Lanes]uint32
	for _, l := range cfg.active {
		h[0][l] = 0x67452301
		h[1][l] = 0xefcdab89
		h[2][l] = 0x98badcfe
		h[3][l] = 0x10325476
	}

	// Consume full blocks directly from the input data; n tracks the bytes
	// remaining in each lane.
	base, n := cfg.base, cfg.n
	for ; n >= BlockSize; n -= BlockSize {
		block(&h, base, &cfg.offsets, cfg.mask)
		base += BlockSize
	}

	// Final block: copy the n remaining bytes of each active lane into a
	// zeroed local buffer laid out as Lanes contiguous 64-byte regions, and
	// append the 0x80 padding terminator.
	var last [Lanes][]byte
	var buffer [Lanes * BlockSize]byte
	base = dataptr(buffer[:])
	var offsets [Lanes]uint32
	for _, l := range cfg.active {
		last[l] = buffer[l*BlockSize : (l+1)*BlockSize]
		offsets[l] = uint32(l * BlockSize)
		copy(last[l], data[l][cfg.n-n:])
		last[l][n] = 0x80
	}

	// If the 8-byte length field does not fit after the data and terminator,
	// hash this block now and continue padding into a re-zeroed buffer.
	if n >= 56 {
		block(&h, base, &offsets, cfg.mask)
		for i := range buffer {
			buffer[i] = 0
		}
	}

	// Write the total message length in bits and hash the last block.
	for _, l := range cfg.active {
		binary.LittleEndian.PutUint64(last[l][56:], uint64(8*cfg.n))
	}
	block(&h, base, &offsets, cfg.mask)

	// Write the running state into the output digests, little-endian.
	// Inactive lanes are left as the zero value.
	var digest [Lanes][Size]byte
	for _, l := range cfg.active {
		for i := 0; i < 4; i++ {
			binary.LittleEndian.PutUint32(digest[l][4*i:], h[i][l])
		}
	}
	return digest
}
// lanes represents the configuration of the 16 data lanes of an MD5
// computation, in the form consumed by the assembly block function.
type lanes struct {
	n       int           // length in bytes of all active (non-nil) lanes
	active  []int         // indexes of active lanes
	mask    uint16        // mask of active lanes: bit l set when lane l is active
	base    uintptr       // base pointer: minimum data pointer over active lanes
	offsets [Lanes]uint32 // offset of data lanes relative to base
}
// config determines the lane configuration for the provided data. It returns
// an error if there are no active lanes, if the active (non-nil) lanes do not
// all have the same length, or if the data spans a memory region larger than
// 32 bits.
func config(data [Lanes][]byte) (*lanes, error) {
	cfg := new(lanes)

	// Record which lanes are active (non-nil).
	for l := range data {
		if data[l] != nil {
			cfg.active = append(cfg.active, l)
		}
	}
	if len(cfg.active) == 0 {
		return nil, errors.New("no active lanes")
	}

	// Build the lane mask and confirm all active lanes share one length.
	cfg.n = len(data[cfg.active[0]])
	for _, l := range cfg.active {
		cfg.mask |= 1 << l
		if len(data[l]) != cfg.n {
			return nil, errors.New("length mismatch")
		}
	}

	// The base pointer is the minimum data pointer over active lanes.
	cfg.base = ^uintptr(0)
	for _, l := range cfg.active {
		if ptr := dataptr(data[l]); ptr < cfg.base {
			cfg.base = ptr
		}
	}

	// Lane offsets are relative to base and must fit in 32 bits.
	for _, l := range cfg.active {
		offset := dataptr(data[l]) - cfg.base
		if offset > math.MaxUint32 {
			return nil, errors.New("input data exceed 32-bit memory region")
		}
		cfg.offsets[l] = uint32(offset)
	}

	return cfg, nil
}
// dataptr extracts the data pointer from the given slice.
//
// NOTE(review): reflect.SliceHeader is deprecated in later Go releases;
// consider unsafe.SliceData once the module's minimum Go version permits.
func dataptr(data []byte) uintptr {
	// Reinterpret the local slice header to read its Data field. The slice
	// value is passed by value, so the caller's header is not touched.
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&data))
	return hdr.Data
}

714
examples/md5x16/md5x16.s Normal file
View File

@@ -0,0 +1,714 @@
// Code generated by command: go run asm.go -out md5x16.s -stubs stub.go. DO NOT EDIT.
#include "textflag.h"
DATA consts<>+0(SB)/4, $0xd76aa478
DATA consts<>+4(SB)/4, $0xe8c7b756
DATA consts<>+8(SB)/4, $0x242070db
DATA consts<>+12(SB)/4, $0xc1bdceee
DATA consts<>+16(SB)/4, $0xf57c0faf
DATA consts<>+20(SB)/4, $0x4787c62a
DATA consts<>+24(SB)/4, $0xa8304613
DATA consts<>+28(SB)/4, $0xfd469501
DATA consts<>+32(SB)/4, $0x698098d8
DATA consts<>+36(SB)/4, $0x8b44f7af
DATA consts<>+40(SB)/4, $0xffff5bb1
DATA consts<>+44(SB)/4, $0x895cd7be
DATA consts<>+48(SB)/4, $0x6b901122
DATA consts<>+52(SB)/4, $0xfd987193
DATA consts<>+56(SB)/4, $0xa679438e
DATA consts<>+60(SB)/4, $0x49b40821
DATA consts<>+64(SB)/4, $0xf61e2562
DATA consts<>+68(SB)/4, $0xc040b340
DATA consts<>+72(SB)/4, $0x265e5a51
DATA consts<>+76(SB)/4, $0xe9b6c7aa
DATA consts<>+80(SB)/4, $0xd62f105d
DATA consts<>+84(SB)/4, $0x02441453
DATA consts<>+88(SB)/4, $0xd8a1e681
DATA consts<>+92(SB)/4, $0xe7d3fbc8
DATA consts<>+96(SB)/4, $0x21e1cde6
DATA consts<>+100(SB)/4, $0xc33707d6
DATA consts<>+104(SB)/4, $0xf4d50d87
DATA consts<>+108(SB)/4, $0x455a14ed
DATA consts<>+112(SB)/4, $0xa9e3e905
DATA consts<>+116(SB)/4, $0xfcefa3f8
DATA consts<>+120(SB)/4, $0x676f02d9
DATA consts<>+124(SB)/4, $0x8d2a4c8a
DATA consts<>+128(SB)/4, $0xfffa3942
DATA consts<>+132(SB)/4, $0x8771f681
DATA consts<>+136(SB)/4, $0x6d9d6122
DATA consts<>+140(SB)/4, $0xfde5380c
DATA consts<>+144(SB)/4, $0xa4beea44
DATA consts<>+148(SB)/4, $0x4bdecfa9
DATA consts<>+152(SB)/4, $0xf6bb4b60
DATA consts<>+156(SB)/4, $0xbebfbc70
DATA consts<>+160(SB)/4, $0x289b7ec6
DATA consts<>+164(SB)/4, $0xeaa127fa
DATA consts<>+168(SB)/4, $0xd4ef3085
DATA consts<>+172(SB)/4, $0x04881d05
DATA consts<>+176(SB)/4, $0xd9d4d039
DATA consts<>+180(SB)/4, $0xe6db99e5
DATA consts<>+184(SB)/4, $0x1fa27cf8
DATA consts<>+188(SB)/4, $0xc4ac5665
DATA consts<>+192(SB)/4, $0xf4292244
DATA consts<>+196(SB)/4, $0x432aff97
DATA consts<>+200(SB)/4, $0xab9423a7
DATA consts<>+204(SB)/4, $0xfc93a039
DATA consts<>+208(SB)/4, $0x655b59c3
DATA consts<>+212(SB)/4, $0x8f0ccc92
DATA consts<>+216(SB)/4, $0xffeff47d
DATA consts<>+220(SB)/4, $0x85845dd1
DATA consts<>+224(SB)/4, $0x6fa87e4f
DATA consts<>+228(SB)/4, $0xfe2ce6e0
DATA consts<>+232(SB)/4, $0xa3014314
DATA consts<>+236(SB)/4, $0x4e0811a1
DATA consts<>+240(SB)/4, $0xf7537e82
DATA consts<>+244(SB)/4, $0xbd3af235
DATA consts<>+248(SB)/4, $0x2ad7d2bb
DATA consts<>+252(SB)/4, $0xeb86d391
GLOBL consts<>(SB), RODATA|NOPTR, $256
// func block(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)
// Requires: AVX, AVX512F
TEXT ·block(SB), $0-26
MOVQ h+0(FP), AX
MOVQ base+8(FP), CX
MOVQ offsets+16(FP), DX
KMOVW mask+24(FP), K1
// Load offsets.
VMOVUPD (DX), Z0
// Load initial hash.
VMOVUPD (AX), Z1
VMOVUPD 64(AX), Z2
VMOVUPD 128(AX), Z3
VMOVUPD 192(AX), Z4
// Initialize registers.
VMOVUPD Z1, Z5
VMOVUPD Z2, Z6
VMOVUPD Z3, Z7
VMOVUPD Z4, Z8
// Round 0.
KMOVW K1, K2
VPGATHERDD (CX)(Z0*1), K2, Z9
VPADDD Z9, Z5, Z5
VPADDD.BCST consts<>+0(SB), Z5, Z5
VMOVUPD Z8, Z25
VPTERNLOGD $0xd8, Z6, Z7, Z25
VPADDD Z25, Z5, Z5
VPROLD $0x07, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 1.
KMOVW K1, K2
VPGATHERDD 4(CX)(Z0*1), K2, Z10
VPADDD Z10, Z8, Z8
VPADDD.BCST consts<>+4(SB), Z8, Z8
VMOVUPD Z7, Z25
VPTERNLOGD $0xd8, Z5, Z6, Z25
VPADDD Z25, Z8, Z8
VPROLD $0x0c, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 2.
KMOVW K1, K2
VPGATHERDD 8(CX)(Z0*1), K2, Z11
VPADDD Z11, Z7, Z7
VPADDD.BCST consts<>+8(SB), Z7, Z7
VMOVUPD Z6, Z25
VPTERNLOGD $0xd8, Z8, Z5, Z25
VPADDD Z25, Z7, Z7
VPROLD $0x11, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 3.
KMOVW K1, K2
VPGATHERDD 12(CX)(Z0*1), K2, Z12
VPADDD Z12, Z6, Z6
VPADDD.BCST consts<>+12(SB), Z6, Z6
VMOVUPD Z5, Z25
VPTERNLOGD $0xd8, Z7, Z8, Z25
VPADDD Z25, Z6, Z6
VPROLD $0x16, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 4.
KMOVW K1, K2
VPGATHERDD 16(CX)(Z0*1), K2, Z13
VPADDD Z13, Z5, Z5
VPADDD.BCST consts<>+16(SB), Z5, Z5
VMOVUPD Z8, Z25
VPTERNLOGD $0xd8, Z6, Z7, Z25
VPADDD Z25, Z5, Z5
VPROLD $0x07, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 5.
KMOVW K1, K2
VPGATHERDD 20(CX)(Z0*1), K2, Z14
VPADDD Z14, Z8, Z8
VPADDD.BCST consts<>+20(SB), Z8, Z8
VMOVUPD Z7, Z25
VPTERNLOGD $0xd8, Z5, Z6, Z25
VPADDD Z25, Z8, Z8
VPROLD $0x0c, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 6.
KMOVW K1, K2
VPGATHERDD 24(CX)(Z0*1), K2, Z15
VPADDD Z15, Z7, Z7
VPADDD.BCST consts<>+24(SB), Z7, Z7
VMOVUPD Z6, Z25
VPTERNLOGD $0xd8, Z8, Z5, Z25
VPADDD Z25, Z7, Z7
VPROLD $0x11, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 7.
KMOVW K1, K2
VPGATHERDD 28(CX)(Z0*1), K2, Z16
VPADDD Z16, Z6, Z6
VPADDD.BCST consts<>+28(SB), Z6, Z6
VMOVUPD Z5, Z25
VPTERNLOGD $0xd8, Z7, Z8, Z25
VPADDD Z25, Z6, Z6
VPROLD $0x16, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 8.
KMOVW K1, K2
VPGATHERDD 32(CX)(Z0*1), K2, Z17
VPADDD Z17, Z5, Z5
VPADDD.BCST consts<>+32(SB), Z5, Z5
VMOVUPD Z8, Z25
VPTERNLOGD $0xd8, Z6, Z7, Z25
VPADDD Z25, Z5, Z5
VPROLD $0x07, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 9.
KMOVW K1, K2
VPGATHERDD 36(CX)(Z0*1), K2, Z18
VPADDD Z18, Z8, Z8
VPADDD.BCST consts<>+36(SB), Z8, Z8
VMOVUPD Z7, Z25
VPTERNLOGD $0xd8, Z5, Z6, Z25
VPADDD Z25, Z8, Z8
VPROLD $0x0c, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 10.
KMOVW K1, K2
VPGATHERDD 40(CX)(Z0*1), K2, Z19
VPADDD Z19, Z7, Z7
VPADDD.BCST consts<>+40(SB), Z7, Z7
VMOVUPD Z6, Z25
VPTERNLOGD $0xd8, Z8, Z5, Z25
VPADDD Z25, Z7, Z7
VPROLD $0x11, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 11.
KMOVW K1, K2
VPGATHERDD 44(CX)(Z0*1), K2, Z20
VPADDD Z20, Z6, Z6
VPADDD.BCST consts<>+44(SB), Z6, Z6
VMOVUPD Z5, Z25
VPTERNLOGD $0xd8, Z7, Z8, Z25
VPADDD Z25, Z6, Z6
VPROLD $0x16, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 12.
KMOVW K1, K2
VPGATHERDD 48(CX)(Z0*1), K2, Z21
VPADDD Z21, Z5, Z5
VPADDD.BCST consts<>+48(SB), Z5, Z5
VMOVUPD Z8, Z25
VPTERNLOGD $0xd8, Z6, Z7, Z25
VPADDD Z25, Z5, Z5
VPROLD $0x07, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 13.
KMOVW K1, K2
VPGATHERDD 52(CX)(Z0*1), K2, Z22
VPADDD Z22, Z8, Z8
VPADDD.BCST consts<>+52(SB), Z8, Z8
VMOVUPD Z7, Z25
VPTERNLOGD $0xd8, Z5, Z6, Z25
VPADDD Z25, Z8, Z8
VPROLD $0x0c, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 14.
KMOVW K1, K2
VPGATHERDD 56(CX)(Z0*1), K2, Z23
VPADDD Z23, Z7, Z7
VPADDD.BCST consts<>+56(SB), Z7, Z7
VMOVUPD Z6, Z25
VPTERNLOGD $0xd8, Z8, Z5, Z25
VPADDD Z25, Z7, Z7
VPROLD $0x11, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 15.
KMOVW K1, K1
VPGATHERDD 60(CX)(Z0*1), K1, Z24
VPADDD Z24, Z6, Z6
VPADDD.BCST consts<>+60(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xd8, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x16, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 16.
VPADDD Z10, Z5, Z5
VPADDD.BCST consts<>+64(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0xac, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x05, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 17.
VPADDD Z15, Z8, Z8
VPADDD.BCST consts<>+68(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0xac, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x09, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 18.
VPADDD Z20, Z7, Z7
VPADDD.BCST consts<>+72(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0xac, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0e, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 19.
VPADDD Z9, Z6, Z6
VPADDD.BCST consts<>+76(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xac, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x14, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 20.
VPADDD Z14, Z5, Z5
VPADDD.BCST consts<>+80(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0xac, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x05, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 21.
VPADDD Z19, Z8, Z8
VPADDD.BCST consts<>+84(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0xac, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x09, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 22.
VPADDD Z24, Z7, Z7
VPADDD.BCST consts<>+88(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0xac, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0e, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 23.
VPADDD Z13, Z6, Z6
VPADDD.BCST consts<>+92(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xac, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x14, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 24.
VPADDD Z18, Z5, Z5
VPADDD.BCST consts<>+96(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0xac, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x05, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 25.
VPADDD Z23, Z8, Z8
VPADDD.BCST consts<>+100(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0xac, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x09, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 26.
VPADDD Z12, Z7, Z7
VPADDD.BCST consts<>+104(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0xac, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0e, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 27.
VPADDD Z17, Z6, Z6
VPADDD.BCST consts<>+108(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xac, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x14, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 28.
VPADDD Z22, Z5, Z5
VPADDD.BCST consts<>+112(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0xac, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x05, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 29.
VPADDD Z11, Z8, Z8
VPADDD.BCST consts<>+116(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0xac, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x09, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 30.
VPADDD Z16, Z7, Z7
VPADDD.BCST consts<>+120(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0xac, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0e, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 31.
VPADDD Z21, Z6, Z6
VPADDD.BCST consts<>+124(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0xac, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x14, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 32.
VPADDD Z14, Z5, Z5
VPADDD.BCST consts<>+128(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x96, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x04, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 33.
VPADDD Z17, Z8, Z8
VPADDD.BCST consts<>+132(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x96, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0b, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 34.
VPADDD Z20, Z7, Z7
VPADDD.BCST consts<>+136(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x96, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x10, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 35.
VPADDD Z23, Z6, Z6
VPADDD.BCST consts<>+140(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x96, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x17, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 36.
VPADDD Z10, Z5, Z5
VPADDD.BCST consts<>+144(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x96, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x04, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 37.
VPADDD Z13, Z8, Z8
VPADDD.BCST consts<>+148(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x96, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0b, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 38.
VPADDD Z16, Z7, Z7
VPADDD.BCST consts<>+152(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x96, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x10, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 39.
VPADDD Z19, Z6, Z6
VPADDD.BCST consts<>+156(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x96, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x17, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 40.
VPADDD Z22, Z5, Z5
VPADDD.BCST consts<>+160(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x96, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x04, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 41.
VPADDD Z9, Z8, Z8
VPADDD.BCST consts<>+164(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x96, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0b, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 42.
VPADDD Z12, Z7, Z7
VPADDD.BCST consts<>+168(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x96, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x10, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 43.
VPADDD Z15, Z6, Z6
VPADDD.BCST consts<>+172(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x96, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x17, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 44.
VPADDD Z18, Z5, Z5
VPADDD.BCST consts<>+176(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x96, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x04, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 45.
VPADDD Z21, Z8, Z8
VPADDD.BCST consts<>+180(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x96, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0b, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 46.
VPADDD Z24, Z7, Z7
VPADDD.BCST consts<>+184(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x96, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x10, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 47.
VPADDD Z11, Z6, Z6
VPADDD.BCST consts<>+188(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x96, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x17, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 48.
VPADDD Z9, Z5, Z5
VPADDD.BCST consts<>+192(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x63, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x06, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 49.
VPADDD Z16, Z8, Z8
VPADDD.BCST consts<>+196(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x63, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0a, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 50.
VPADDD Z23, Z7, Z7
VPADDD.BCST consts<>+200(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x63, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0f, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 51.
VPADDD Z14, Z6, Z6
VPADDD.BCST consts<>+204(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x63, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x15, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 52.
VPADDD Z21, Z5, Z5
VPADDD.BCST consts<>+208(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x63, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x06, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 53.
VPADDD Z12, Z8, Z8
VPADDD.BCST consts<>+212(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x63, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0a, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 54.
VPADDD Z19, Z7, Z7
VPADDD.BCST consts<>+216(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x63, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0f, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 55.
VPADDD Z10, Z6, Z6
VPADDD.BCST consts<>+220(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x63, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x15, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 56.
VPADDD Z17, Z5, Z5
VPADDD.BCST consts<>+224(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x63, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x06, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 57.
VPADDD Z24, Z8, Z8
VPADDD.BCST consts<>+228(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x63, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0a, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 58.
VPADDD Z15, Z7, Z7
VPADDD.BCST consts<>+232(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x63, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0f, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 59.
VPADDD Z22, Z6, Z6
VPADDD.BCST consts<>+236(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x63, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x15, Z6, Z6
VPADDD Z7, Z6, Z6
// Round 60.
VPADDD Z13, Z5, Z5
VPADDD.BCST consts<>+240(SB), Z5, Z5
VMOVUPD Z8, Z0
VPTERNLOGD $0x63, Z6, Z7, Z0
VPADDD Z0, Z5, Z5
VPROLD $0x06, Z5, Z5
VPADDD Z6, Z5, Z5
// Round 61.
VPADDD Z20, Z8, Z8
VPADDD.BCST consts<>+244(SB), Z8, Z8
VMOVUPD Z7, Z0
VPTERNLOGD $0x63, Z5, Z6, Z0
VPADDD Z0, Z8, Z8
VPROLD $0x0a, Z8, Z8
VPADDD Z5, Z8, Z8
// Round 62.
VPADDD Z11, Z7, Z7
VPADDD.BCST consts<>+248(SB), Z7, Z7
VMOVUPD Z6, Z0
VPTERNLOGD $0x63, Z8, Z5, Z0
VPADDD Z0, Z7, Z7
VPROLD $0x0f, Z7, Z7
VPADDD Z8, Z7, Z7
// Round 63.
VPADDD Z18, Z6, Z6
VPADDD.BCST consts<>+252(SB), Z6, Z6
VMOVUPD Z5, Z0
VPTERNLOGD $0x63, Z7, Z8, Z0
VPADDD Z0, Z6, Z6
VPROLD $0x15, Z6, Z6
VPADDD Z7, Z6, Z6
// Final add.
VPADDD Z5, Z1, Z1
VPADDD Z6, Z2, Z2
VPADDD Z7, Z3, Z3
VPADDD Z8, Z4, Z4
// Store results back.
VMOVUPD Z1, (AX)
VMOVUPD Z2, 64(AX)
VMOVUPD Z3, 128(AX)
VMOVUPD Z4, 192(AX)
VZEROUPPER
RET

View File

@@ -0,0 +1,136 @@
package md5x16
import (
"crypto/md5"
"encoding/hex"
"math/rand"
"testing"
"testing/quick"
"golang.org/x/sys/cpu"
)
// RequireISA skips the calling test when the host CPU cannot execute the
// AVX-512F instructions used by this package.
func RequireISA(t *testing.T) {
	t.Helper()
	if cpu.X86.HasAVX512F {
		return
	}
	t.Skip("requires AVX512F instruction set")
}
// TestVectors checks well-known MD5 test vectors against the 16-lane
// implementation.
func TestVectors(t *testing.T) {
	RequireISA(t)
	for _, c := range []struct {
		Data      string
		HexDigest string
	}{
		{"", "d41d8cd98f00b204e9800998ecf8427e"},
		{"The quick brown fox jumps over the lazy dog", "9e107d9d372bb6826bd81d3542a419d6"},
		{"The quick brown fox jumps over the lazy dog.", "e4d909c290d0fb1ca068ffaddf22cbd0"},
	} {
		digest := Single(t, []byte(c.Data))
		if got := hex.EncodeToString(digest[:]); got != c.HexDigest {
			t.Errorf("Sum(%#v) = %s; expect %s", c.Data, got, c.HexDigest)
		}
	}
}
// TestCmp cross-checks the 16-lane implementation against the standard
// library crypto/md5 on randomized inputs.
func TestCmp(t *testing.T) {
	RequireISA(t)
	ours := func(data []byte) [Size]byte {
		return Single(t, data)
	}
	if err := quick.CheckEqual(ours, md5.Sum, nil); err != nil {
		t.Fatal(err)
	}
}
// TestLengths verifies every input length from zero up to 64 blocks,
// exercising all padding cases.
func TestLengths(t *testing.T) {
	RequireISA(t)
	const max = BlockSize << 6
	data := make([]byte, max)
	rand.Read(data)
	for n := 0; n <= max; n++ {
		if got, expect := Single(t, data[:n]), md5.Sum(data[:n]); got != expect {
			t.Fatalf("failed on length %d", n)
		}
	}
}
// Single hashes a single data buffer in all 16 lanes and returns the result,
// after asserting that all lanes are the same.
func Single(t *testing.T, d []byte) [Size]byte {
	// Replicate the same buffer into every lane.
	var data [Lanes][]byte
	for l := range data {
		data[l] = d
	}
	if err := Validate(data); err != nil {
		t.Fatal(err)
	}

	// Hash, then confirm every lane agrees with lane zero.
	digest := Sum(data)
	for l := range data {
		if digest[l] == digest[0] {
			continue
		}
		t.Logf("lane %02d: %x", 0, digest[0])
		t.Logf("lane %02d: %x", l, digest[l])
		t.Fatal("lane mismatch")
	}
	return digest[0]
}
// TestActiveLanes hashes random subsets of active lanes and verifies active
// lanes match crypto/md5 while inactive lanes produce the zero digest.
func TestActiveLanes(t *testing.T) {
	RequireISA(t)
	const trials = 1 << 10
	const maxlen = BlockSize << 6
	for trial := 0; trial < trials; trial++ {
		// Pick a random non-empty, non-full set of active lanes.
		count := 1 + rand.Intn(Lanes-1)
		active := rand.Perm(Lanes)[:count]

		// Fill the chosen lanes with slices of a shared random buffer.
		n := rand.Intn(maxlen)
		buffer := make([]byte, count*n)
		rand.Read(buffer)
		var data [Lanes][]byte
		for i, l := range active {
			data[l] = buffer[i*n : (i+1)*n]
		}

		digest := Sum(data)

		// Active lanes must agree with the reference implementation.
		isactive := make(map[int]bool, count)
		for _, l := range active {
			isactive[l] = true
			if expect := md5.Sum(data[l]); digest[l] != expect {
				t.Fatalf("lane %02d: mismatch", l)
			}
		}

		// All other lanes must be untouched (zero).
		var zero [Size]byte
		for l := 0; l < Lanes; l++ {
			if !isactive[l] && digest[l] != zero {
				t.Fatalf("inactive lane %d is non-zero", l)
			}
		}
	}
}

9
examples/md5x16/stub.go Normal file
View File

@@ -0,0 +1,9 @@
// Code generated by command: go run asm.go -out md5x16.s -stubs stub.go. DO NOT EDIT.
package md5x16
// block MD5 hashes 16 messages into the running hash states h. Messages are
// at the given offsets from the base pointer. The 16-bit mask specifies
// which lanes are active: when bit i is not set loads will be disabled and
// the value of the resulting hash is undefined.
func block(h *[4][16]uint32, base uintptr, offsets *[16]uint32, mask uint16)