all: AVX-512 (#217)
Extends avo to support most AVX-512 instruction sets.
The instruction type is extended to support suffixes. The K family of opmask
registers is added to the register package, and the operand package is updated
to support the new operand types. Move instruction deduction in `Load` and
`Store` is extended to support KMOV* and VMOV* forms.
Internal code generation packages were overhauled. Instruction database loading
required various messy changes to account for the additional complexities of the
AVX-512 instruction sets. The internal/api package was added to introduce a
separation between instruction forms in the database, and the functions avo
provides to create them. This was required since with instruction suffixes there
is no longer a one-to-one mapping between instruction constructors and opcodes.
AVX-512 bloated generated source code size substantially, initially increasing
compilation and CI test times to an unacceptable level. Two changes were made to
address this:
1. Instruction constructors in the `x86` package moved to an optab-based
approach. This compiles substantially faster than the verbose code
generation we had before.
2. The most verbose code-generated tests are moved under build tags and
limited to a stress test mode. Stress test builds are run on a
schedule but not in regular CI.
An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and
test the new functionality.
Updates #20 #163 #229
Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com>
This commit is contained in:
151
examples/md5x16/md5x16.go
Normal file
151
examples/md5x16/md5x16.go
Normal file
@@ -0,0 +1,151 @@
|
||||
// Package md5x16 implements 16-lane parallel MD5 with AVX-512 instructions.
|
||||
package md5x16
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"math"
|
||||
"reflect"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
//go:generate go run asm.go -out md5x16.s -stubs stub.go
|
||||
|
||||
// Size is the size of an MD5 checksum in bytes.
const Size = 16

// BlockSize is the block size of MD5 in bytes.
const BlockSize = 64

// Lanes is the maximum number of parallel MD5 computations.
const Lanes = 16
|
||||
|
||||
// Validate checks whether the preconditions required by Sum() are met.
|
||||
func Validate(data [Lanes][]byte) error {
|
||||
_, err := config(data)
|
||||
return err
|
||||
}
|
||||
|
||||
// Sum returns the MD5 checksum of up to Lanes data of the same length.
|
||||
//
|
||||
// Non-nil inputs must all have the same length, and occupy a memory span not
|
||||
// exceeding 32 bits.
|
||||
func Sum(data [Lanes][]byte) [Lanes][Size]byte {
|
||||
// Determine lane configuration.
|
||||
cfg, err := config(data)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// Initialize hash.
|
||||
var h [4][Lanes]uint32
|
||||
for _, l := range cfg.active {
|
||||
h[0][l] = 0x67452301
|
||||
h[1][l] = 0xefcdab89
|
||||
h[2][l] = 0x98badcfe
|
||||
h[3][l] = 0x10325476
|
||||
}
|
||||
|
||||
// Consume full blocks.
|
||||
base, n := cfg.base, cfg.n
|
||||
for ; n >= BlockSize; n -= BlockSize {
|
||||
block(&h, base, &cfg.offsets, cfg.mask)
|
||||
base += BlockSize
|
||||
}
|
||||
|
||||
// Final block.
|
||||
var last [Lanes][]byte
|
||||
var buffer [Lanes * BlockSize]byte
|
||||
base = dataptr(buffer[:])
|
||||
var offsets [Lanes]uint32
|
||||
for _, l := range cfg.active {
|
||||
last[l] = buffer[l*BlockSize : (l+1)*BlockSize]
|
||||
offsets[l] = uint32(l * BlockSize)
|
||||
copy(last[l], data[l][cfg.n-n:])
|
||||
last[l][n] = 0x80
|
||||
}
|
||||
|
||||
if n >= 56 {
|
||||
block(&h, base, &offsets, cfg.mask)
|
||||
for i := range buffer {
|
||||
buffer[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range cfg.active {
|
||||
binary.LittleEndian.PutUint64(last[l][56:], uint64(8*cfg.n))
|
||||
}
|
||||
block(&h, base, &offsets, cfg.mask)
|
||||
|
||||
// Write into byte array.
|
||||
var digest [Lanes][Size]byte
|
||||
for _, l := range cfg.active {
|
||||
for i := 0; i < 4; i++ {
|
||||
binary.LittleEndian.PutUint32(digest[l][4*i:], h[i][l])
|
||||
}
|
||||
}
|
||||
|
||||
return digest
|
||||
}
|
||||
|
||||
// lanes represents the configuration of the 16 data lanes of an MD5
// computation, as derived from the input by config.
type lanes struct {
	n       int           // length in bytes shared by all active (non-nil) lanes
	active  []int         // indexes of active lanes, in increasing order
	mask    uint16        // mask of active lanes: bit l is set when lane l is active
	base    uintptr       // base pointer: lowest data address among the active lanes
	offsets [Lanes]uint32 // offset of each active lane's data relative to base
}
|
||||
|
||||
// config determines the lane configuration for the provided data. Returns an
|
||||
// error if there are no active lanes, there's a length mismatch among active
|
||||
// lanes, or the data spans a memory region larger than 32-bits.
|
||||
func config(data [Lanes][]byte) (*lanes, error) {
|
||||
cfg := &lanes{}
|
||||
|
||||
// Populate active lanes, and ensure they're all the same length.
|
||||
for l, d := range data {
|
||||
if d != nil {
|
||||
cfg.active = append(cfg.active, l)
|
||||
}
|
||||
}
|
||||
|
||||
if len(cfg.active) == 0 {
|
||||
return nil, errors.New("no active lanes")
|
||||
}
|
||||
|
||||
cfg.n = len(data[cfg.active[0]])
|
||||
for _, l := range cfg.active {
|
||||
cfg.mask |= 1 << l
|
||||
if len(data[l]) != cfg.n {
|
||||
return nil, errors.New("length mismatch")
|
||||
}
|
||||
}
|
||||
|
||||
// Compute base pointer and lane offsets.
|
||||
cfg.base = ^uintptr(0)
|
||||
for _, l := range cfg.active {
|
||||
ptr := dataptr(data[l])
|
||||
if ptr < cfg.base {
|
||||
cfg.base = ptr
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range cfg.active {
|
||||
ptr := dataptr(data[l])
|
||||
offset := ptr - cfg.base
|
||||
if offset > math.MaxUint32 {
|
||||
return nil, errors.New("input data exceed 32-bit memory region")
|
||||
}
|
||||
cfg.offsets[l] = uint32(offset)
|
||||
}
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// dataptr returns the address stored in the Data field of the slice header of
// data, i.e. where its first element would live. The result is a plain
// integer: per the unsafe package rules it does not by itself keep the
// backing array reachable.
func dataptr(data []byte) uintptr {
	return (*reflect.SliceHeader)(unsafe.Pointer(&data)).Data
}
|
||||
Reference in New Issue
Block a user