all: AVX-512 (#217)

Extends avo to support most AVX-512 instruction sets.

The instruction type is extended to support suffixes. The K family of opmask
registers is added to the register package, and the operand package is updated
to support the new operand types. Move instruction deduction in `Load` and
`Store` is extended to support KMOV* and VMOV* forms.

Internal code generation packages were overhauled. Instruction database loading
required various messy changes to account for the additional complexities of the
AVX-512 instruction sets. The internal/api package was added to introduce a
separation between instruction forms in the database, and the functions avo
provides to create them. This was required since with instruction suffixes there
is no longer a one-to-one mapping between instruction constructors and opcodes.

AVX-512 bloated generated source code size substantially, initially increasing
compilation and CI test times to an unacceptable level. Two changes were made to
address this:

1.  Instruction constructors in the `x86` package moved to an optab-based
    approach. This compiles substantially faster than the verbose code
    generation we had before.

2.  The most verbose code-generated tests are moved under build tags and
    limited to a stress test mode. Stress test builds are run on a
    schedule but not in regular CI.

An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and
test the new functionality.

Updates #20 #163 #229

Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com>
This commit is contained in:
Michael McLoughlin
2021-11-12 18:35:36 -08:00
parent 2867bd7e01
commit b76e849b5c
71 changed files with 257395 additions and 61474 deletions

View File

@@ -1,8 +1,11 @@
package load
import (
"errors"
"fmt"
"path/filepath"
"reflect"
"regexp"
"sort"
"strconv"
"strings"
@@ -12,6 +15,18 @@ import (
"github.com/mmcloughlin/avo/internal/opcodesxml"
)
// This file is a mess. Some of this complexity is unavoidable, since the state
// of x86 instruction databases is also a mess, especially when it comes to
// idiosyncrasies of the Go assembler implementation. Some of the complexity is
// probably avoidable by migrating to using Intel XED
// (https://github.com/mmcloughlin/avo/issues/23), but for now this is an unholy
// mix of PeachPy's Opcodes database and Go's x86 CSV file.
//
// The goal is simply to keep as much of the ugliness in this file as possible,
// producing a clean instruction database for the rest of avo to use. Any nasty
// logic here should be backed up with a test somewhere to ensure the result is
// correct, even if the code that produced it is awful.
// Expected data source filenames.
const (
DefaultCSVName = "x86.v0.2.csv"
@@ -64,7 +79,8 @@ func (l *Loader) Load() ([]inst.Instruction, error) {
Summary: i.Summary,
}
}
im[opcode].Forms = append(im[opcode].Forms, l.form(opcode, f))
forms := l.forms(opcode, f)
im[opcode].Forms = append(im[opcode].Forms, forms...)
}
}
}
@@ -94,7 +110,15 @@ func (l *Loader) Load() ([]inst.Instruction, error) {
i.Forms = dedupe(i.Forms)
}
// Convert to a slice, sorted by opcode.
// Resolve forms that have VEX and EVEX encoded forms.
for _, i := range im {
i.Forms, err = vexevex(i.Forms)
if err != nil {
return nil, err
}
}
// Convert to a slice. Sort instructions and forms for reproducibility.
is := make([]inst.Instruction, 0, len(im))
for _, i := range im {
is = append(is, *i)
@@ -104,6 +128,10 @@ func (l *Loader) Load() ([]inst.Instruction, error) {
return is[i].Opcode < is[j].Opcode
})
for _, i := range im {
sortforms(i.Forms)
}
return is, nil
}
@@ -132,6 +160,9 @@ func (l Loader) include(f opcodesxml.Form) bool {
// AMD-only.
case "TBM", "CLZERO", "FMA4", "XOP", "SSE4A", "3dnow!", "3dnow!+":
return false
// AVX512PF doesn't work without some special case handling, and is only on Knights Landing/Knights Mill.
case "AVX512PF":
return false
// Incomplete support for some prefetching instructions.
case "PREFETCH", "PREFETCHW", "PREFETCHWT1", "CLWB":
return false
@@ -139,11 +170,6 @@ func (l Loader) include(f opcodesxml.Form) bool {
case "MONITORX", "FEMMS":
return false
}
// TODO(mbm): support AVX512
if strings.HasPrefix(isa.ID, "AVX512") {
return false
}
}
// Go appears to have skeleton support for MMX instructions. See the many TODO lines in the testcases:
@@ -277,32 +303,12 @@ func (l Loader) gonames(f opcodesxml.Form) []string {
n := strings.ToUpper(f.GASName)
// Some need data sizes added to them.
// TODO(mbm): is there a better way of determining which ones these are?
suffix := map[int]string{16: "W", 32: "L", 64: "Q", 128: "X", 256: "Y"}
switch n {
case "VCVTUSI2SS", "VCVTSD2USI", "VCVTSS2USI", "VCVTUSI2SD", "VCVTTSS2USI", "VCVTTSD2USI":
fallthrough
case "MOVBEW", "MOVBEL", "MOVBEQ":
// MOVEBE* instructions seem to be inconsistent with x86 CSV.
//
// Reference: https://github.com/golang/arch/blob/b19384d3c130858bb31a343ea8fce26be71b5998/x86/x86spec/format.go#L282-L287
//
// "MOVBE r16, m16": "movbeww",
// "MOVBE m16, r16": "movbeww",
// "MOVBE m32, r32": "movbell",
// "MOVBE r32, m32": "movbell",
// "MOVBE m64, r64": "movbeqq",
// "MOVBE r64, m64": "movbeqq",
//
fallthrough
case "RDRAND", "RDSEED":
n += suffix[s]
}
n += sizesuffix(n, f)
return []string{n}
}
func (l Loader) form(opcode string, f opcodesxml.Form) inst.Form {
func (l Loader) forms(opcode string, f opcodesxml.Form) []inst.Form {
// Map operands to avo format and ensure correct order.
ops := operands(f.Operands)
@@ -357,13 +363,36 @@ func (l Loader) form(opcode string, f opcodesxml.Form) inst.Form {
for _, isa := range f.ISA {
isas = append(isas, isa.ID)
}
sort.Strings(isas)
return inst.Form{
// Initialize form.
form := inst.Form{
ISA: isas,
Operands: ops,
ImplicitOperands: implicits,
EncodingType: enctype(f),
CancellingInputs: f.CancellingInputs,
}
// Apply modification stages to produce final list of forms.
stages := []func(string, inst.Form) []inst.Form{
avx512rounding,
avx512sae,
avx512bcst,
avx512masking,
avx512zeroing,
}
forms := []inst.Form{form}
for _, stage := range stages {
var next []inst.Form
for _, f := range forms {
next = append(next, stage(opcode, f)...)
}
forms = next
}
return forms
}
// operands maps Opcodes XML operands to avo format. Returned in Intel order.
@@ -384,6 +413,187 @@ func operand(op opcodesxml.Operand) inst.Operand {
}
}
// avx512rounding handles AVX-512 embedded rounding. Opcodes database represents
// these as {er} operands, whereas Go uses instruction suffixes. When the form
// has an {er} operand it is removed and two forms are returned: the plain form
// and a copy with the EmbeddedRounding flag set. Forms without an {er} operand
// are returned unchanged as a single-element slice.
//
// The opcode argument is not consulted.
func avx512rounding(opcode string, f inst.Form) []inst.Form {
	i, found := findoperand(f.Operands, "{er}")
	if !found {
		return []inst.Form{f}
	}

	// Delete the {er} operand. Build a fresh slice instead of appending over
	// f.Operands[:i]: f was passed by value, but its operand slice still shares
	// a backing array with the caller's copy, and an in-place delete would
	// silently shift the caller's operands.
	ops := make([]inst.Operand, 0, len(f.Operands)-1)
	ops = append(ops, f.Operands[:i]...)
	ops = append(ops, f.Operands[i+1:]...)
	f.Operands = ops

	// Create a second form with the rounding flag.
	er := f.Clone()
	er.EmbeddedRounding = true

	return []inst.Form{f, er}
}
// avx512sae handles AVX-512 "suppress all exceptions". Opcodes database
// represents these as {sae} operands, whereas Go uses instruction suffixes.
// When the form has an {sae} operand it is removed and two forms are returned:
// the plain form and a copy with the SuppressAllExceptions flag set. Forms
// without an {sae} operand are returned unchanged as a single-element slice.
//
// The opcode argument is not consulted.
func avx512sae(opcode string, f inst.Form) []inst.Form {
	i, found := findoperand(f.Operands, "{sae}")
	if !found {
		return []inst.Form{f}
	}

	// Delete the {sae} operand. Build a fresh slice instead of appending over
	// f.Operands[:i]: f was passed by value, but its operand slice still shares
	// a backing array with the caller's copy, and an in-place delete would
	// silently shift the caller's operands.
	ops := make([]inst.Operand, 0, len(f.Operands)-1)
	ops = append(ops, f.Operands[:i]...)
	ops = append(ops, f.Operands[i+1:]...)
	f.Operands = ops

	// Create a second form with the suppress-all-exceptions flag.
	sae := f.Clone()
	sae.SuppressAllExceptions = true

	return []inst.Form{f, sae}
}
// avx512bcst splits a form with a broadcast-capable memory operand into its
// regular and broadcast variants. The Opcodes database writes such operands as
// e.g. "m512/m64bcst"; Go selects broadcast with the BCST suffix instead. Only
// the first matching operand is considered. Forms with no such operand are
// returned unchanged.
func avx512bcst(opcode string, f inst.Form) []inst.Form {
	// Locate the first operand whose type matches the broadcast pattern.
	pos := -1
	for i := 0; i < len(f.Operands); i++ {
		if bcstrx.MatchString(f.Operands[i].Type) {
			pos = i
			break
		}
	}
	if pos == -1 {
		return []inst.Form{f}
	}

	// parts[1] is the type before the slash, parts[2] the type between the
	// slash and the "bcst" marker.
	parts := bcstrx.FindStringSubmatch(f.Operands[pos].Type)

	plain, broadcast := f.Clone(), f.Clone()
	plain.Operands[pos].Type = parts[1]
	broadcast.Operands[pos].Type = parts[2]
	broadcast.Broadcast = true

	return []inst.Form{plain, broadcast}
}
// bcstrx matches operand types of the shape "m<digits>/m<digits>bcst". Capture
// group 1 is the memory type before the slash and group 2 is the memory type
// between the slash and the trailing "bcst".
var bcstrx = regexp.MustCompile(`^(m\d+)/(m\d+)bcst$`)
// avx512masking handles AVX-512 masking forms.
//
// The Opcodes database (similar to Intel manuals) marks masking with a {k}
// suffix on an operand type, possibly followed by {z} for zeroing. Go instead
// has two instruction forms to support implicit masking (with K0): one with an
// explicit mask register and one without, where the mask register precedes the
// output register. For a form with a {k} operand this returns the unmasked and
// masked variants, or only the masked variant when the opcode is listed in
// maskrequired. Forms without a {k} operand yield a single-element slice.
func avx512masking(opcode string, f inst.Form) []inst.Form {
	// Scan up to and including the first {k} operand, stripping the suffixes as
	// we go. Zeroing is dealt with by a later stage, so here it is only noted
	// in order to put the {z} suffix back on the masked form below.
	hasZeroing := false
	maskAt := -1
	for i := 0; i < len(f.Operands) && maskAt < 0; i++ {
		t := f.Operands[i].Type
		if strings.HasSuffix(t, "{z}") {
			hasZeroing = true
			t = strings.TrimSuffix(t, "{z}")
		}
		if strings.HasSuffix(t, "{k}") {
			maskAt = i
			t = strings.TrimSuffix(t, "{k}")
		}
		f.Operands[i].Type = t
	}

	// Nothing to expand when no operand is masked.
	if maskAt < 0 {
		return []inst.Form{f}
	}

	// Variant without the mask operand.
	plain := f.Clone()

	// Variant with a "k" operand inserted just before the masked operand.
	withMask := f.Clone()
	expanded := make([]inst.Operand, 0, len(withMask.Operands)+1)
	expanded = append(expanded, withMask.Operands[:maskAt]...)
	expanded = append(expanded, inst.Operand{Type: "k", Action: inst.R})
	expanded = append(expanded, withMask.Operands[maskAt:]...)
	withMask.Operands = expanded

	// Put the zeroing suffix back so that it can be handled later. The masked
	// operand has moved one position to the right.
	if hasZeroing {
		withMask.Operands[maskAt+1].Type += "{z}"
	}

	// Almost every instruction takes an optional mask; the few exceptions only
	// get the masked variant.
	if maskrequired[opcode] {
		return []inst.Form{withMask}
	}
	return []inst.Form{plain, withMask}
}
// avx512zeroing handles AVX-512 zeroing forms.
//
// Go expresses zeroing with the Z opcode suffix. The distinction matters for
// the instruction form: with merge masking the output register is also an
// input, whereas with zeroing it is not. A form carrying a {z} operand suffix
// is therefore split into a merging variant (operand additionally marked as
// read) and a zeroing variant (Zeroing flag set). Forms without {z} are
// returned as a single-element slice.
func avx512zeroing(opcode string, f inst.Form) []inst.Form {
	// Strip {z} from every operand that has it, remembering the last one.
	last := -1
	for i := 0; i < len(f.Operands); i++ {
		if !strings.HasSuffix(f.Operands[i].Type, "{z}") {
			continue
		}
		f.Operands[i].Type = strings.TrimSuffix(f.Operands[i].Type, "{z}")
		last = i
	}
	if last == -1 {
		return []inst.Form{f}
	}

	// One copy each for merge masking and zero masking.
	merge, zero := f.Clone(), f.Clone()
	merge.Operands[last].Action |= inst.R
	zero.Zeroing = true

	return []inst.Form{merge, zero}
}
// findoperand reports the position of the first operand in ops whose Type
// equals t. The boolean is false, and the index 0, when there is no match.
func findoperand(ops []inst.Operand, t string) (int, bool) {
	for i := 0; i < len(ops); i++ {
		if ops[i].Type != t {
			continue
		}
		return i, true
	}
	return 0, false
}
// enctype picks the encoding type of an instruction form from whichever
// encoding description is present, checked in the order EVEX, VEX, REX. A form
// with none of these is classed as legacy.
func enctype(f opcodesxml.Form) inst.EncodingType {
	if f.Encoding.EVEX != nil {
		return inst.EncodingTypeEVEX
	}
	if f.Encoding.VEX != nil {
		return inst.EncodingTypeVEX
	}
	if f.Encoding.REX != nil {
		return inst.EncodingTypeREX
	}
	return inst.EncodingTypeLegacy
}
// datasize (intelligently) guesses the datasize of an instruction form.
func datasize(f opcodesxml.Form) int {
// Determine from encoding bits.
@@ -413,6 +623,220 @@ func operandsize(op opcodesxml.Operand) int {
return 0
}
// sizesuffix returns the size suffix to append to opcode name n for form f, or
// the empty string when n takes no suffix or the computed size has no suffix
// assigned.
//
// The table follows addGoSuffixes in Go's x86avxgen:
// https://github.com/golang/arch/blob/5de9028c2478e6cb4e1c1b1f4386f3f0a93e383a/x86/x86avxgen/main.go#L275-L322
//
// There, opcodes are matched to suffixes as follows: "VL=0"/"VL=1"/"VL=2"
// select X/Y/Z, and "REXW=0"/"REXW=1" select L/Q. Here the same choice is made
// by computing a size in bits from the form and looking up the suffix for that
// size. RDRAND, RDSEED and the MOVBE variants are additions to that list.
func sizesuffix(n string, f opcodesxml.Form) string {
	// group ties a set of opcodes to the way their size is computed and the
	// suffix assigned to each size.
	type group struct {
		size     func(opcodesxml.Form) int
		suffixes map[int]string
		opcodes  []string
	}

	table := []group{
		// X/Y chosen by EVEX vector length.
		{
			size:     evexLLsize,
			suffixes: map[int]string{128: "X", 256: "Y"},
			opcodes: []string{
				"VCVTPD2DQ",
				"VCVTPD2PS",
				"VCVTTPD2DQ",
				"VCVTQQ2PS",
				"VCVTUQQ2PS",
				"VCVTPD2UDQ",
				"VCVTTPD2UDQ",
			},
		},
		// X/Y/Z chosen by EVEX vector length.
		{
			size:     evexLLsize,
			suffixes: map[int]string{128: "X", 256: "Y", 512: "Z"},
			opcodes: []string{
				"VFPCLASSPD",
				"VFPCLASSPS",
			},
		},
		// Q only for the 64-bit variant; no suffix otherwise.
		{
			size:     rexWsize,
			suffixes: map[int]string{64: "Q"},
			opcodes: []string{
				"VCVTSD2SI",
				"VCVTTSD2SI",
				"VCVTTSS2SI",
				"VCVTSS2SI",
			},
		},
		// L/Q chosen by the W bit.
		{
			size:     rexWsize,
			suffixes: map[int]string{32: "L", 64: "Q"},
			opcodes: []string{
				"VCVTSD2USI",
				"VCVTSS2USI",
				"VCVTTSD2USI",
				"VCVTTSS2USI",
				"VCVTUSI2SD",
				"VCVTUSI2SS",
				"VCVTSI2SD",
				"VCVTSI2SS",
				"ANDN",
				"BEXTR",
				"BLSI",
				"BLSMSK",
				"BLSR",
				"BZHI",
				"MULX",
				"PDEP",
				"PEXT",
				"RORX",
				"SARX",
				"SHLX",
				"SHRX",

				"RDRAND",
				"RDSEED",
			},
		},
		// MOVBE* instructions seem to be inconsistent with x86 CSV: the size
		// letter is doubled.
		//
		// Reference: https://github.com/golang/arch/blob/b19384d3c130858bb31a343ea8fce26be71b5998/x86/x86spec/format.go#L282-L287
		//
		//	"MOVBE r16, m16": "movbeww",
		//	"MOVBE m16, r16": "movbeww",
		//	"MOVBE m32, r32": "movbell",
		//	"MOVBE r32, m32": "movbell",
		//	"MOVBE m64, r64": "movbeqq",
		//	"MOVBE r64, m64": "movbeqq",
		{
			size:     datasize,
			suffixes: map[int]string{16: "W", 32: "L", 64: "Q"},
			opcodes: []string{
				"MOVBEW",
				"MOVBEL",
				"MOVBEQ",
			},
		},
	}

	// Each opcode is listed in exactly one group, so the first hit decides.
	for _, g := range table {
		for _, name := range g.opcodes {
			if name == n {
				return g.suffixes[g.size(f)]
			}
		}
	}
	return ""
}
// rexWsize derives an operand size in bits from the W bit of the form's
// encoding. Only the EVEX encoding is consulted: when it is present and
// specifies W, the result is 32 << W. In every other case the result is 32.
func rexWsize(f opcodesxml.Form) int {
	if evex := f.Encoding.EVEX; evex != nil && evex.W != nil {
		return 32 << *evex.W
	}
	return 32
}
// evexLLsize translates the LL field of the form's EVEX encoding into a size
// in bits: "00" is 128, "01" is 256 and "10" is 512. It returns 0 when the
// form has no EVEX encoding or LL holds any other value.
func evexLLsize(f opcodesxml.Form) int {
	evex := f.Encoding.EVEX
	if evex == nil {
		return 0
	}
	switch evex.LL {
	case "00":
		return 128
	case "01":
		return 256
	case "10":
		return 512
	}
	return 0
}
// vexevex fixes instructions that have both VEX and EVEX encoded forms with the
// same operand types. Go uses the VEX encoded form unless EVEX-only features
// are used. This function will only keep the VEX encoded version in the case
// where both exist.
//
// Forms are grouped by signature together with the zeroing, embedded rounding,
// suppress-all-exceptions and broadcast flags. A group of one is kept as is. A
// group of two must be a VEX/EVEX pair, of which the VEX form is kept. Anything
// else is an error. Results are returned in order of each group's first
// appearance in fs.
//
// Note this is somewhat of a hack. There are real reasons to use the EVEX
// encoded version even when both exist. The main reason to use the EVEX version
// rather than VEX is to use the registers Z16, Z17, ... and up. However, avo
// does not implement the logic to distinguish between the two halves of the
// vector registers. So in its current state the only reason to need the EVEX
// version is to encode suffixes, and these are represented by other instruction
// forms.
//
// TODO(mbm): restrict use of vector registers https://github.com/mmcloughlin/avo/issues/146
func vexevex(fs []inst.Form) ([]inst.Form, error) {
	// Group forms by deduping ID. Record IDs in first-seen order so that the
	// output, and which error is reported, do not depend on map iteration
	// order.
	byid := map[string][]inst.Form{}
	var ids []string
	for _, f := range fs {
		id := fmt.Sprintf(
			"%s {%t,%t,%t,%t}",
			strings.Join(f.Signature(), "_"),
			f.Zeroing,
			f.EmbeddedRounding,
			f.SuppressAllExceptions,
			f.Broadcast,
		)
		if _, seen := byid[id]; !seen {
			ids = append(ids, id)
		}
		byid[id] = append(byid[id], f)
	}

	// Resolve overlaps.
	var results []inst.Form
	for _, id := range ids {
		group := byid[id]
		if len(group) < 2 {
			results = append(results, group...)
			continue
		}

		// We expect these conflicts are caused by VEX/EVEX pairs. Bail if it's
		// something else.
		if len(group) > 2 {
			return nil, fmt.Errorf("more than two forms of type %q", id)
		}

		// Normalize the pair so that the VEX form, if any, comes first.
		if group[0].EncodingType == inst.EncodingTypeEVEX {
			group[0], group[1] = group[1], group[0]
		}

		if group[0].EncodingType != inst.EncodingTypeVEX || group[1].EncodingType != inst.EncodingTypeEVEX {
			// Report the conflicting ID through the error rather than printing
			// the forms to stdout.
			err := errors.New("expected pair of VEX/EVEX encoded forms")
			return nil, fmt.Errorf("forms of type %q: %w", id, err)
		}

		// In this case we only keep the VEX encoded form.
		results = append(results, group[0])
	}

	return results, nil
}
// dedupe a list of forms.
func dedupe(fs []inst.Form) []inst.Form {
uniq := make([]inst.Form, 0, len(fs))
@@ -430,3 +854,14 @@ func dedupe(fs []inst.Form) []inst.Form {
}
return uniq
}
// sortforms reorders fs in place so that the forms are in ascending order of
// their sortkey strings.
func sortforms(fs []inst.Form) {
	less := func(a, b int) bool {
		return strings.Compare(sortkey(fs[a]), sortkey(fs[b])) < 0
	}
	sort.Slice(fs, less)
}
// sortkey builds the string that forms are ordered by: the encoding type
// formatted as a number, then the ISA list, then the default formatting of the
// whole form, separated by single spaces.
func sortkey(f inst.Form) string {
	const layout = "%d %v %v"
	return fmt.Sprintf(layout, f.EncodingType, f.ISA, f)
}