tests/alloc/zeroing/asm.go

//go:build ignore

package main

import (
	"strconv"

	. "sources.truenas.cloud/code/avo/build"
	. "sources.truenas.cloud/code/avo/operand"
	. "sources.truenas.cloud/code/avo/reg"
)

// The goal of this test is to confirm correct liveness analysis of zeroing mode
// when masking in AVX-512. In merge masking, some of the bits of the output
// register will be preserved, so the register is live coming into the
// instruction. Zeroing mode removes any input dependency.
//
// This synthetic test sets up a situation where we allocate multiple temporary
// registers. Allocation is only feasible if the liveness pass correctly
// identifies that they are not all live at once.

// main builds the Zeroing function with avo and generates its assembly.
//
// The emitted code first fills n vector registers with junk, then overwrites
// each of them with a zeroing-masked add and folds the result into a running
// sum that is finally stored through the out pointer.
func main() {
	const n = 32

	TEXT("Zeroing", NOSPLIT, "func(out *[8]uint64)")
	summary := "Zeroing computes the sum 1+2+...+" + strconv.Itoa(n) + " in 8 lanes of 512-bit register."
	Doc(summary)

	dst := Load(Param("out"), GP64())

	Comment("Initialize sum.")
	acc := ZMM()
	VPXORD(acc, acc, acc)

	// Reserve one register per term and fill each with junk.
	//
	// If the later masked writes were treated as merge-masking (or
	// zeroing-masking were analyzed incorrectly), every one of these registers
	// would stay live from here on, and there are too many of them for
	// register allocation to succeed.
	Comment("Initialize summand registers.")
	junk := GP64()
	MOVQ(U64(0x9e77d78aacb8cbcc), junk)

	terms := make([]VecVirtual, 0, n)
	for len(terms) < n {
		t := ZMM()
		VPBROADCASTQ(junk, t)
		terms = append(terms, t)
	}

	// Mask register with all bits set.
	Comment("Prepare mask register.")
	mask := K()
	KXNORW(mask, mask, mask)

	// Increment register holding 1 in each lane.
	Comment("Prepare constant registers.")
	unit := GP64()
	MOVQ(U64(1), unit)
	incr := ZMM()
	VPBROADCASTQ(unit, incr)

	base := ZMM()
	VPXORD(base, base, base)

	// Each term is the previous term plus one, written with zeroing-masking so
	// the junk previously stored in the destination is not an input.
	prev := base
	for i, term := range terms {
		Commentf("Summand %d.", i+1)
		VPADDD_Z(prev, incr, mask, term)
		VPADDD(acc, term, acc)
		prev = term
	}

	Comment("Write result to output pointer.")
	VMOVDQU64(acc, Mem{Base: dst})

	RET()

	Generate()
}
all: AVX-512 (#217) Extends avo to support most AVX-512 instruction sets. The instruction type is extended to support suffixes. The K family of opmask registers is added to the register package, and the operand package is updated to support the new operand types. Move instruction deduction in `Load` and `Store` is extended to support KMOV* and VMOV* forms. Internal code generation packages were overhauled. Instruction database loading required various messy changes to account for the additional complexities of the AVX-512 instruction sets. The internal/api package was added to introduce a separation between instruction forms in the database, and the functions avo provides to create them. This was required since with instruction suffixes there is no longer a one-to-one mapping between instruction constructors and opcodes. AVX-512 bloated generated source code size substantially, initially increasing compilation and CI test times to an unacceptable level. Two changes were made to address this: 1. Instruction constructors in the `x86` package moved to an optab-based approach. This compiles substantially faster than the verbose code generation we had before. 2. The most verbose code-generated tests are moved under build tags and limited to a stress test mode. Stress test builds are run on schedule but not in regular CI. An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and test the new functionality. Updates #20 #163 #229 Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com> 2021-11-12 18:35:36 -08:00			`//go:build ignore`

			`package main`

			`import (`
			`"strconv"`

update 2026-03-06 20:14:02 +00:00			`. "sources.truenas.cloud/code/avo/build"`
			`. "sources.truenas.cloud/code/avo/operand"`
			`. "sources.truenas.cloud/code/avo/reg"`
all: AVX-512 (#217) Extends avo to support most AVX-512 instruction sets. The instruction type is extended to support suffixes. The K family of opmask registers is added to the register package, and the operand package is updated to support the new operand types. Move instruction deduction in `Load` and `Store` is extended to support KMOV* and VMOV* forms. Internal code generation packages were overhauled. Instruction database loading required various messy changes to account for the additional complexities of the AVX-512 instruction sets. The internal/api package was added to introduce a separation between instruction forms in the database, and the functions avo provides to create them. This was required since with instruction suffixes there is no longer a one-to-one mapping between instruction constructors and opcodes. AVX-512 bloated generated source code size substantially, initially increasing compilation and CI test times to an unacceptable level. Two changes were made to address this: 1. Instruction constructors in the `x86` package moved to an optab-based approach. This compiles substantially faster than the verbose code generation we had before. 2. The most verbose code-generated tests are moved under build tags and limited to a stress test mode. Stress test builds are run on schedule but not in regular CI. An example of AVX-512 accelerated 16-lane MD5 is provided to demonstrate and test the new functionality. Updates #20 #163 #229 Co-authored-by: Vaughn Iverson <vsivsi@yahoo.com> 2021-11-12 18:35:36 -08:00			`)`

			`// The goal of this test is to confirm correct liveness analysis of zeroing mode`
			`// when masking in AVX-512. In merge masking, some of the bits of the output`
			`// register will be preserved, so the register is live coming into the`
			`// instruction. Zeroing mode removes any input dependency.`
			`//`
			`// This synthetic test sets up a situation where we allocate multiple temporary`
			`// registers. Allocation is only feasible if the liveness pass correctly`
			`// identifies that they are not all live at once.`

			`func main() {`
			`const n = 32`

			`TEXT("Zeroing", NOSPLIT, "func(out *[8]uint64)")`
			`Doc("Zeroing computes the sum 1+2+...+" + strconv.Itoa(n) + " in 8 lanes of 512-bit register.")`

			`out := Load(Param("out"), GP64())`

			`Comment("Initialize sum.")`
			`s := ZMM()`
			`VPXORD(s, s, s)`

			`// Allocate registers for the terms of the sum. Write garbage to them.`
			`//`
			`// The point here is that under merge-masking, or an incorrect handling of`
			`// zeroing-masking, these registers would be live from this point. And there`
			`// would be too many of them so register allocation would fail.`
			`Comment("Initialize summand registers.")`
			`filler := GP64()`
			`MOVQ(U64(0x9e77d78aacb8cbcc), filler)`

			`z := make([]VecVirtual, n)`
			`for i := 0; i < n; i++ {`
			`z[i] = ZMM()`
			`VPBROADCASTQ(filler, z[i])`
			`}`

			`// Prepare a mask register set to all ones.`
			`Comment("Prepare mask register.")`
			`k := K()`
			`KXNORW(k, k, k)`

			`// Prepare an increment register set to 1 in each lane.`
			`Comment("Prepare constant registers.")`
			`one := GP64()`
			`MOVQ(U64(1), one)`
			`ones := ZMM()`
			`VPBROADCASTQ(one, ones)`

			`zero := ZMM()`
			`VPXORD(zero, zero, zero)`

			`last := zero`
			`for i := 0; i < n; i++ {`
			`Commentf("Summand %d.", i+1)`
			`VPADDD_Z(last, ones, k, z[i])`
			`VPADDD(s, z[i], s)`
			`last = z[i]`
			`}`

			`Comment("Write result to output pointer.")`
			`VMOVDQU64(s, Mem{Base: out})`

			`RET()`

			`Generate()`
			`}`