Files
avo/examples/md5x16/md5x16.go

152 lines
3.5 KiB
Go
Raw Normal View History

all: AVX-512 (#217) Extends avo to support most AVX-512 instruction sets. The instruction type is extended to support suffixes. The K family of opmask registers is added to the register package, and the operand package is updated to support the new operand types. Move instruction deduction in `Load` and `Store` is extended to support KMOV* and VMOV* forms. Internal code generation packages were overhauled. Instruction database loading required various messy changes to account for the additional complexities of the AVX-512 instruction sets. The internal/api package was added to introduce a separation between instruction forms in the database, and the functions avo provides to create them. This was required since with instruction suffixes there is no longer a one-to-one mapping between instruction constructors and opcodes. AVX-512 bloated generated source code size substantially, initially increasing compilation and CI test times to an unacceptable level. Two changes were made to address this: 1. Instruction constructors in the `x86` package moved to an optab-based approach. This compiles substantially faster than the verbose code generation we had before. 2. The most verbose code-generated tests are moved under build tags and limited to a stress test mode. Stress test builds are run on schedule but not in regular CI. An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and test the new functionality. Updates #20 #163 #229 Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com>
2021-11-12 18:35:36 -08:00
// Package md5x16 implements 16-lane parallel MD5 with AVX-512 instructions.
package md5x16
import (
"encoding/binary"
"errors"
"math"
"reflect"
"unsafe"
)
//go:generate go run asm.go -out md5x16.s -stubs stub.go
// Size is the size of an MD5 checksum in bytes.
const Size = 16
// BlockSize is the block size of MD5 in bytes.
const BlockSize = 64
// Lanes is the maximum number of parallel MD5 computations.
const Lanes = 16
// Validate reports whether data satisfies the preconditions of Sum. It builds
// the same lane configuration Sum would use and returns the resulting error,
// or nil when the configuration succeeds.
func Validate(data [Lanes][]byte) error {
	if _, err := config(data); err != nil {
		return err
	}
	return nil
}
// Sum returns the MD5 checksum of up to Lanes data of the same length.
//
// A lane takes part in the computation when its slice is non-nil. Non-nil
// inputs must all have the same length, and occupy a memory span not
// exceeding 32 bits. Sum panics with the error from config when these
// preconditions do not hold; use Validate to check them beforehand. Digests of
// inactive (nil) lanes are left as all zero bytes.
func Sum(data [Lanes][]byte) [Lanes][Size]byte {
	cfg, err := config(data)
	if err != nil {
		panic(err)
	}

	// Seed the hash state of every active lane with the MD5 initial values.
	// The state is laid out word-major: state[word][lane].
	iv := [4]uint32{0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476}
	var state [4][Lanes]uint32
	for w := range iv {
		for _, lane := range cfg.active {
			state[w][lane] = iv[w]
		}
	}

	// Hash all complete blocks directly from the input memory, advancing the
	// shared base address by one block per step.
	ptr, remaining := cfg.base, cfg.n
	for remaining >= BlockSize {
		block(&state, ptr, &cfg.offsets, cfg.mask)
		ptr += BlockSize
		remaining -= BlockSize
	}

	// Stage the trailing bytes of each active lane in a local scratch area,
	// one block per lane, followed by the 0x80 padding marker.
	var (
		scratch [Lanes * BlockSize]byte
		tails   [Lanes][]byte
		tailOff [Lanes]uint32
	)
	ptr = dataptr(scratch[:])
	consumed := cfg.n - remaining
	for _, lane := range cfg.active {
		lo := lane * BlockSize
		tails[lane] = scratch[lo : lo+BlockSize]
		tailOff[lane] = uint32(lo)
		copy(tails[lane], data[lane][consumed:])
		tails[lane][remaining] = 0x80
	}

	// With 56 or more trailing bytes there is no room left for the 8-byte
	// length field, so hash the staged block first and continue with an
	// all-zero one.
	if remaining >= 56 {
		block(&state, ptr, &tailOff, cfg.mask)
		scratch = [Lanes * BlockSize]byte{}
	}

	// The message length in bits goes into the last 8 bytes, little-endian.
	bits := uint64(8 * cfg.n)
	for _, lane := range cfg.active {
		binary.LittleEndian.PutUint64(tails[lane][56:], bits)
	}
	block(&state, ptr, &tailOff, cfg.mask)

	// Serialize the four state words of each active lane, little-endian.
	var out [Lanes][Size]byte
	for _, lane := range cfg.active {
		for w := range state {
			binary.LittleEndian.PutUint32(out[lane][4*w:], state[w][lane])
		}
	}
	return out
}
// lanes describes how the 16 data lanes of one MD5 computation are laid out:
// which lanes take part, how long their data is, and where it lives in memory.
type lanes struct {
	// n is the common length of all active (non-nil) lanes.
	n int
	// active holds the indexes of the active lanes.
	active []int
	// mask has bit l set for every active lane l.
	mask uint16
	// base is the base pointer that the lane offsets are relative to.
	base uintptr
	// offsets holds, per lane, the offset of its data relative to base.
	offsets [Lanes]uint32
}
// config determines the lane configuration for the provided data.
//
// A lane is active when its slice is non-nil. The returned configuration
// records the active lane indexes, their bit mask, their common length, the
// lowest data address among them (base) and each lane's offset from it.
//
// It returns an error if there are no active lanes, if active lanes differ in
// length, or if the data of non-empty lanes spans a memory region larger than
// 32 bits. When all active lanes are empty, no input byte is ever read, so the
// span check is skipped and base and offsets are left zero.
func config(data [Lanes][]byte) (*lanes, error) {
	cfg := &lanes{}

	// Populate active lanes, and ensure they're all the same length.
	for l, d := range data {
		if d != nil {
			cfg.active = append(cfg.active, l)
		}
	}
	if len(cfg.active) == 0 {
		return nil, errors.New("no active lanes")
	}

	cfg.n = len(data[cfg.active[0]])
	for _, l := range cfg.active {
		cfg.mask |= 1 << l
		if len(data[l]) != cfg.n {
			return nil, errors.New("length mismatch")
		}
	}

	// The data pointer of an empty slice is arbitrary and unrelated to any
	// other lane, so comparing such pointers could reject valid empty inputs.
	if cfg.n == 0 {
		return cfg, nil
	}

	// Compute base pointer (lowest address) and lane offsets.
	var ptrs [Lanes]uintptr
	cfg.base = ^uintptr(0)
	for _, l := range cfg.active {
		ptrs[l] = dataptr(data[l])
		if ptrs[l] < cfg.base {
			cfg.base = ptrs[l]
		}
	}
	for _, l := range cfg.active {
		offset := ptrs[l] - cfg.base
		if offset > math.MaxUint32 {
			return nil, errors.New("input data exceed 32-bit memory region")
		}
		cfg.offsets[l] = uint32(offset)
	}

	return cfg, nil
}
// dataptr returns the address stored in the slice header of data as a plain
// integer. The result is not tracked as a pointer, so the caller is
// responsible for keeping the underlying memory reachable while using it.
func dataptr(data []byte) uintptr {
	return (*reflect.SliceHeader)(unsafe.Pointer(&data)).Data
}