Extends avo to support most AVX-512 instruction sets.
The instruction type is extended to support suffixes. The K family of opmask
registers is added to the register package, and the operand package is updated
to support the new operand types. Move instruction deduction in `Load` and
`Store` is extended to support KMOV* and VMOV* forms.
Internal code generation packages were overhauled. Instruction database loading
required various messy changes to account for the additional complexities of the
AVX-512 instruction sets. The internal/api package was added to introduce a
separation between instruction forms in the database, and the functions avo
provides to create them. This was required since with instruction suffixes there
is no longer a one-to-one mapping between instruction constructors and opcodes.
AVX-512 bloated generated source code size substantially, initially increasing
compilation and CI test times to an unacceptable level. Two changes were made to
address this:
1. Instruction constructors in the `x86` package moved to an optab-based
approach. This compiles substantially faster than the verbose code
generation we had before.
2. The most verbose code-generated tests are moved under build tags and
limited to a stress test mode. Stress test builds are run on
schedule but not in regular CI.
An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and
test the new functionality.
Updates #20, #163, #229.
Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com>
187 lines · 3.2 KiB · Go assembly (Plan9 syntax, x86-64; auto-detected as "ArmAsm", which is incorrect)
// Code generated by command: go run asm.go -out zeroing.s -stubs stub.go. DO NOT EDIT.

#include "textflag.h"

// func Zeroing(out *[8]uint64)
// Requires: AVX512F
//
// Demonstrates AVX-512 zeroing-masked arithmetic (the ".Z" suffix) as
// generated by avo. Accumulates 32 masked VPADDD "summands" into Z0 and
// stores the 64-byte result through the out pointer.
//
// Register roles:
//   AX - out pointer (from stack argument out+0(FP))
//   Z0 - running dword sum
//   Z1 - current summand
//   Z2 - broadcast constant 0x1 per qword
//   Z3 - all-zero vector (first summand input)
//   K1 - opmask, set to all-ones by KXNORW
//
// NOTE(review): this is generated output; any behavioral fix belongs in
// the generator (asm.go), not in this file.
TEXT ·Zeroing(SB), NOSPLIT, $0-8
	MOVQ out+0(FP), AX

	// Initialize sum.
	VPXORD Z0, Z0, Z0

	// Initialize summand registers.
	// NOTE(review): the generator emits the identical broadcast into Z1
	// thirty-two times; all but the first are redundant (VPBROADCASTQ is
	// idempotent and CX is unchanged between them). Harmless, but worth
	// fixing in asm.go — presumably the source intended Z1..Z32 or a
	// per-summand constant. TODO confirm against the generator.
	MOVQ $0x9e77d78aacb8cbcc, CX
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1
	VPBROADCASTQ CX, Z1

	// Prepare mask register.
	// KXNORW k,k,k sets all 16 mask bits, so the .Z (zeroing) forms
	// below never actually zero a lane here.
	KXNORW K1, K1, K1

	// Prepare constant registers.
	MOVQ $0x0000000000000001, CX
	VPBROADCASTQ CX, Z2
	VPXORD Z3, Z3, Z3

	// Summand 1. First iteration adds Z2 to the zero vector Z3;
	// subsequent iterations feed Z1 back in.
	VPADDD.Z Z3, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 2.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 3.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 4.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 5.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 6.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 7.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 8.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 9.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 10.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 11.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 12.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 13.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 14.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 15.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 16.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 17.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 18.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 19.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 20.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 21.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 22.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 23.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 24.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 25.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 26.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 27.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 28.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 29.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 30.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 31.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Summand 32.
	VPADDD.Z Z1, Z2, K1, Z1
	VPADDD Z0, Z1, Z0

	// Write result to output pointer.
	VMOVDQU64 Z0, (AX)
	RET