all: AVX-512 (#217)
Extends avo to support most AVX-512 instruction sets.
The instruction type is extended to support suffixes. The K family of opmask
registers is added to the register package, and the operand package is updated
to support the new operand types. Move instruction deduction in `Load` and
`Store` is extended to support KMOV* and VMOV* forms.
Internal code generation packages were overhauled. Instruction database loading
required various messy changes to account for the additional complexities of the
AVX-512 instruction sets. The internal/api package was added to introduce a
separation between instruction forms in the database, and the functions avo
provides to create them. This was required since with instruction suffixes there
is no longer a one-to-one mapping between instruction constructors and opcodes.
AVX-512 bloated generated source code size substantially, initially increasing
compilation and CI test times to an unacceptable level. Two changes were made to
address this:
1. Instruction constructors in the `x86` package moved to an optab-based
approach. This compiles substantially faster than the verbose code
generation we had before.
2. The most verbose code-generated tests are moved under build tags and
limited to a stress test mode. Stress test builds are run on a
schedule but not in regular CI.
An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and
test the new functionality.
Updates #20 #163 #229
Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com>
This commit is contained in:
79
tests/alloc/zeroing/asm.go
Normal file
79
tests/alloc/zeroing/asm.go
Normal file
@@ -0,0 +1,79 @@
|
||||
//go:build ignore
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
|
||||
. "github.com/mmcloughlin/avo/build"
|
||||
. "github.com/mmcloughlin/avo/operand"
|
||||
. "github.com/mmcloughlin/avo/reg"
|
||||
)
|
||||
|
||||
// The goal of this test is to confirm correct liveness analysis of zeroing mode
|
||||
// when masking in AVX-512. In merge masking, some of the bits of the output
|
||||
// register will be preserved, so the register is live coming into the
|
||||
// instruction. Zeroing mode removes any input dependency.
|
||||
//
|
||||
// This synthetic test sets up a situation where we allocate multiple temporary
|
||||
// registers. Allocation is only feasible if the liveness pass correctly
|
||||
// identifies that they are not all live at once.
|
||||
|
||||
func main() {
|
||||
const n = 32
|
||||
|
||||
TEXT("Zeroing", NOSPLIT, "func(out *[8]uint64)")
|
||||
Doc("Zeroing computes the sum 1+2+...+" + strconv.Itoa(n) + " in 8 lanes of 512-bit register.")
|
||||
|
||||
out := Load(Param("out"), GP64())
|
||||
|
||||
Comment("Initialize sum.")
|
||||
s := ZMM()
|
||||
VPXORD(s, s, s)
|
||||
|
||||
// Allocate registers for the terms of the sum. Write garbage to them.
|
||||
//
|
||||
// The point here is that under merge-masking, or an incorrect handling of
|
||||
// zeroing-masking, these registers would be live from this point. And there
|
||||
// would be too many of them so register allocation would fail.
|
||||
Comment("Initialize summand registers.")
|
||||
filler := GP64()
|
||||
MOVQ(U64(0x9e77d78aacb8cbcc), filler)
|
||||
|
||||
z := make([]VecVirtual, n)
|
||||
for i := 0; i < n; i++ {
|
||||
z[i] = ZMM()
|
||||
VPBROADCASTQ(filler, z[i])
|
||||
}
|
||||
|
||||
// Prepare a mask register set to all ones.
|
||||
Comment("Prepare mask register.")
|
||||
k := K()
|
||||
KXNORW(k, k, k)
|
||||
|
||||
// Prepare an increment register set to 1 in each lane.
|
||||
Comment("Prepare constant registers.")
|
||||
one := GP64()
|
||||
MOVQ(U64(1), one)
|
||||
ones := ZMM()
|
||||
VPBROADCASTQ(one, ones)
|
||||
|
||||
zero := ZMM()
|
||||
VPXORD(zero, zero, zero)
|
||||
|
||||
last := zero
|
||||
for i := 0; i < n; i++ {
|
||||
Commentf("Summand %d.", i+1)
|
||||
VPADDD_Z(last, ones, k, z[i])
|
||||
VPADDD(s, z[i], s)
|
||||
last = z[i]
|
||||
}
|
||||
|
||||
Comment("Write result to output pointer.")
|
||||
VMOVDQU64(s, Mem{Base: out})
|
||||
|
||||
RET()
|
||||
|
||||
Generate()
|
||||
}
|
||||
2
tests/alloc/zeroing/doc.go
Normal file
2
tests/alloc/zeroing/doc.go
Normal file
@@ -0,0 +1,2 @@
|
||||
// Package zeroing tests liveness analysis of AVX-512 operations with zeroing masking.
|
||||
package zeroing
|
||||
6
tests/alloc/zeroing/stub.go
Normal file
6
tests/alloc/zeroing/stub.go
Normal file
@@ -0,0 +1,6 @@
|
||||
// Code generated by command: go run asm.go -out zeroing.s -stubs stub.go. DO NOT EDIT.
|
||||
|
||||
package zeroing
|
||||
|
||||
// Zeroing computes the sum 1+2+...+32 in 8 lanes of 512-bit register.
|
||||
func Zeroing(out *[8]uint64)
|
||||
186
tests/alloc/zeroing/zeroing.s
Normal file
186
tests/alloc/zeroing/zeroing.s
Normal file
@@ -0,0 +1,186 @@
|
||||
// Code generated by command: go run asm.go -out zeroing.s -stubs stub.go. DO NOT EDIT.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// func Zeroing(out *[8]uint64)
|
||||
// Requires: AVX512F
|
||||
TEXT ·Zeroing(SB), NOSPLIT, $0-8
|
||||
MOVQ out+0(FP), AX
|
||||
|
||||
// Initialize sum.
|
||||
VPXORD Z0, Z0, Z0
|
||||
|
||||
// Initialize summand registers.
|
||||
MOVQ $0x9e77d78aacb8cbcc, CX
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
VPBROADCASTQ CX, Z1
|
||||
|
||||
// Prepare mask register.
|
||||
KXNORW K1, K1, K1
|
||||
|
||||
// Prepare constant registers.
|
||||
MOVQ $0x0000000000000001, CX
|
||||
VPBROADCASTQ CX, Z2
|
||||
VPXORD Z3, Z3, Z3
|
||||
|
||||
// Summand 1.
|
||||
VPADDD.Z Z3, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 2.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 3.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 4.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 5.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 6.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 7.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 8.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 9.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 10.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 11.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 12.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 13.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 14.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 15.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 16.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 17.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 18.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 19.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 20.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 21.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 22.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 23.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 24.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 25.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 26.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 27.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 28.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 29.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 30.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 31.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Summand 32.
|
||||
VPADDD.Z Z1, Z2, K1, Z1
|
||||
VPADDD Z0, Z1, Z0
|
||||
|
||||
// Write result to output pointer.
|
||||
VMOVDQU64 Z0, (AX)
|
||||
RET
|
||||
29
tests/alloc/zeroing/zeroing_test.go
Normal file
29
tests/alloc/zeroing/zeroing_test.go
Normal file
@@ -0,0 +1,29 @@
|
||||
package zeroing
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
//go:generate go run asm.go -out zeroing.s -stubs stub.go
|
||||
|
||||
func TestZeroing(t *testing.T) {
|
||||
const (
|
||||
n = 32
|
||||
expect = n * (n + 1) / 2
|
||||
)
|
||||
|
||||
if !cpu.X86.HasAVX512F {
|
||||
t.Skip("require AVX512F")
|
||||
}
|
||||
|
||||
var got [8]uint64
|
||||
Zeroing(&got)
|
||||
|
||||
for i := 0; i < 8; i++ {
|
||||
if got[i] != expect {
|
||||
t.Errorf("got[%d] = %d; expect %d", i, got[i], expect)
|
||||
}
|
||||
}
|
||||
}
|
||||
3
tests/thirdparty/packages.json
vendored
3
tests/thirdparty/packages.json
vendored
@@ -458,7 +458,8 @@
|
||||
"make --always-make build"
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
"known_issue": 229
|
||||
},
|
||||
{
|
||||
"repository": {
|
||||
|
||||
Reference in New Issue
Block a user