From edc4d3f00c077d1d8b5252af3085a054c76bbdae Mon Sep 17 00:00:00 2001
From: Michael McLoughlin
Date: Sun, 30 Dec 2018 18:46:36 -0800
Subject: [PATCH] examples/dot: initial version

Implements dot product in avo.

Closes #4
---
Reviewer notes. This text sits between the three-dash separator and the
diffstat, so it is commentary on the patch and not part of the commit
message or of any file the patch creates.

* examples/dot/asm.go is the generator. It declares an assembly function
  Dot with Go signature func(x, y []float32) float32, loads the base
  pointers of x and y and the length of x into general-purpose registers,
  and emits three phases:
    1. blockloop: while n >= blockitems (8 * unroll = 48 float32 values,
       blocksize = 4 * blockitems = 192 bytes), load `unroll` (6) 32-byte
       Y-register chunks of x and fused-multiply-add them with the matching
       chunks of y into 6 independent Y accumulators, then advance both
       pointers by blocksize and subtract blockitems from n.
    2. tailloop: until n == 0, multiply-accumulate one float32 at a time
       into a separate scalar X accumulator (`tail`), advancing both
       pointers by 4 bytes and decrementing n.
    3. reduce: add accumulators 1..5 into accumulator 0, add its upper
       128-bit half to its lower half, add the tail accumulator, apply
       VHADDPS twice, and store the result as return value 0.
  The length of y is never loaded; the loop bound comes from x only.
  NOTE(review): presumably callers must guarantee len(y) >= len(x) --
  confirm, since nothing in the generated code checks it.

* examples/dot/dot.s and examples/dot/stub.go carry a "Code generated by
  command: go run asm.go -out dot.s -stubs stub.go. DO NOT EDIT." header;
  the same command appears in the go:generate directive in dot_test.go.

* examples/dot/dot_test.go checks Dot against the scalar reference Expect
  for vector lengths 0..999 using a relative error bound of 0.00001.
  NOTE(review): for n == 0, Expect returns 0, so `got/expect - 1.0` is
  0/0 - 1 = NaN when Dot also returns 0; `Abs(relerr) > epsilon` is false
  for NaN, so that iteration cannot fail (a NaN returned by Dot would also
  pass for any n). The empty case is asserted separately by TestEmpty.

 examples/dot/README.md   |  3 ++
 examples/dot/asm.go      | 93 ++++++++++++++++++++++++++++++++++++++++
 examples/dot/dot.s       | 58 +++++++++++++++++++++++++
 examples/dot/dot_test.go | 50 +++++++++++++++++++++
 examples/dot/stub.go     |  5 +++
 5 files changed, 209 insertions(+)
 create mode 100644 examples/dot/README.md
 create mode 100644 examples/dot/asm.go
 create mode 100644 examples/dot/dot.s
 create mode 100644 examples/dot/dot_test.go
 create mode 100644 examples/dot/stub.go

diff --git a/examples/dot/README.md b/examples/dot/README.md
new file mode 100644
index 0000000..a6e1166
--- /dev/null
+++ b/examples/dot/README.md
@@ -0,0 +1,3 @@
+# dot
+
+[Dot product](https://en.wikipedia.org/wiki/Dot_product) in `avo`. Ported from the [`dot_product.py` PeachPy example](https://github.com/Maratyszcza/PeachPy/blob/01d15157a973a4ae16b8046313ddab371ea582db/examples/go-generate/dot_product.py).
diff --git a/examples/dot/asm.go b/examples/dot/asm.go
new file mode 100644
index 0000000..88cc2fd
--- /dev/null
+++ b/examples/dot/asm.go
@@ -0,0 +1,93 @@
+// +build ignore
+
+package main
+
+import (
+	. "github.com/mmcloughlin/avo/build"
+	. "github.com/mmcloughlin/avo/operand"
+	. "github.com/mmcloughlin/avo/reg"
+)
+
+var unroll = 6
+
+func main() {
+	TEXT("Dot", "func(x, y []float32) float32")
+	x := Mem{Base: Load(Param("x").Base(), GP64v())}
+	y := Mem{Base: Load(Param("y").Base(), GP64v())}
+	n := Load(Param("x").Len(), GP64v())
+
+	// Allocate accumulation registers.
+	acc := make([]VecVirtual, unroll)
+	for i := 0; i < unroll; i++ {
+		acc[i] = Yv()
+	}
+
+	// Zero initialization.
+	for i := 0; i < unroll; i++ {
+		VXORPS(acc[i], acc[i], acc[i])
+	}
+
+	// Loop over blocks and process them with vector instructions.
+	blockitems := 8 * unroll
+	blocksize := 4 * blockitems
+	LABEL("blockloop")
+	CMPQ(n, U32(blockitems))
+	JL(LabelRef("tail"))
+
+	// Load x.
+	xs := make([]VecVirtual, unroll)
+	for i := 0; i < unroll; i++ {
+		xs[i] = Yv()
+	}
+
+	for i := 0; i < unroll; i++ {
+		VMOVUPS(x.Offset(32*i), xs[i])
+	}
+
+	// The actual FMA.
+	for i := 0; i < unroll; i++ {
+		VFMADD231PS(y.Offset(32*i), xs[i], acc[i])
+	}
+
+	ADDQ(U32(blocksize), x.Base)
+	ADDQ(U32(blocksize), y.Base)
+	SUBQ(U32(blockitems), n)
+	JMP(LabelRef("blockloop"))
+
+	// Process any trailing entries.
+	LABEL("tail")
+	tail := Xv()
+	VXORPS(tail, tail, tail)
+
+	LABEL("tailloop")
+	CMPQ(n, U32(0))
+	JE(LabelRef("reduce"))
+
+	xt := Xv()
+	VMOVSS(x, xt)
+	VFMADD231SS(y, xt, tail)
+
+	ADDQ(U32(4), x.Base)
+	ADDQ(U32(4), y.Base)
+	DECQ(n)
+	JMP(LabelRef("tailloop"))
+
+	// Reduce the lanes to one.
+	LABEL("reduce")
+	for i := 1; i < unroll; i++ {
+		VADDPS(acc[0], acc[i], acc[0])
+	}
+
+	result := acc[0].AsX()
+	top := Xv()
+	VEXTRACTF128(U8(1), acc[0], top)
+	VADDPS(result, top, result)
+	VADDPS(result, tail, result)
+	VHADDPS(result, result, result)
+	VHADDPS(result, result, result)
+	Store(result, ReturnIndex(0))
+
+	RET()
+
+	Generate()
+}
diff --git a/examples/dot/dot.s b/examples/dot/dot.s
new file mode 100644
index 0000000..d8d8a6a
--- /dev/null
+++ b/examples/dot/dot.s
@@ -0,0 +1,58 @@
+// Code generated by command: go run asm.go -out dot.s -stubs stub.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+// func Dot(x []float32, y []float32) float32
+TEXT ·Dot(SB), 0, $0-52
+	MOVQ x_base(FP), AX
+	MOVQ y_base+24(FP), CX
+	MOVQ x_len+8(FP), DX
+	VXORPS Y0, Y0, Y0
+	VXORPS Y1, Y1, Y1
+	VXORPS Y2, Y2, Y2
+	VXORPS Y3, Y3, Y3
+	VXORPS Y4, Y4, Y4
+	VXORPS Y5, Y5, Y5
+blockloop:
+	CMPQ DX, $0x00000030
+	JL tail
+	VMOVUPS (AX), Y6
+	VMOVUPS 32(AX), Y7
+	VMOVUPS 64(AX), Y8
+	VMOVUPS 96(AX), Y9
+	VMOVUPS 128(AX), Y10
+	VMOVUPS 160(AX), Y11
+	VFMADD231PS (CX), Y6, Y0
+	VFMADD231PS 32(CX), Y7, Y1
+	VFMADD231PS 64(CX), Y8, Y2
+	VFMADD231PS 96(CX), Y9, Y3
+	VFMADD231PS 128(CX), Y10, Y4
+	VFMADD231PS 160(CX), Y11, Y5
+	ADDQ $0x000000c0, AX
+	ADDQ $0x000000c0, CX
+	SUBQ $0x00000030, DX
+	JMP blockloop
+tail:
+	VXORPS X12, X12, X12
+tailloop:
+	CMPQ DX, $0x00000000
+	JE reduce
+	VMOVSS (AX), X6
+	VFMADD231SS (CX), X6, X12
+	ADDQ $0x00000004, AX
+	ADDQ $0x00000004, CX
+	DECQ DX
+	JMP tailloop
+reduce:
+	VADDPS Y0, Y1, Y0
+	VADDPS Y0, Y2, Y0
+	VADDPS Y0, Y3, Y0
+	VADDPS Y0, Y4, Y0
+	VADDPS Y0, Y5, Y0
+	VEXTRACTF128 $0x01, Y0, X1
+	VADDPS X0, X1, X0
+	VADDPS X0, X12, X0
+	VHADDPS X0, X0, X0
+	VHADDPS X0, X0, X0
+	MOVSS X0, ret+48(FP)
+	RET
diff --git a/examples/dot/dot_test.go b/examples/dot/dot_test.go
new file mode 100644
index 0000000..4c373ff
--- /dev/null
+++ b/examples/dot/dot_test.go
@@ -0,0 +1,50 @@
+package dot
+
+import (
+	"math/rand"
+	"testing"
+)
+
+//go:generate go run asm.go -out dot.s -stubs stub.go
+
+func TestEmpty(t *testing.T) {
+	if Dot(nil, nil) != 0.0 {
+		t.Fatal("expect dot product of empty vectors to be zero")
+	}
+}
+
+func TestLengths(t *testing.T) {
+	const epsilon = 0.00001
+	for n := 0; n < 1000; n++ {
+		x, y := RandomVector(n), RandomVector(n)
+		got := Dot(x, y)
+		expect := Expect(x, y)
+		relerr := got/expect - 1.0
+		if Abs(relerr) > epsilon {
+			t.Fatalf("bad result on vector length %d: got %v expect %v relative error %f", n, got, expect, relerr)
+		}
+	}
+}
+
+func Expect(x, y []float32) float32 {
+	var p float32
+	for i := range x {
+		p += x[i] * y[i]
+	}
+	return p
+}
+
+func RandomVector(n int) []float32 {
+	x := make([]float32, n)
+	for i := 0; i < n; i++ {
+		x[i] = rand.Float32() * 100
+	}
+	return x
+}
+
+func Abs(x float32) float32 {
+	if x < 0.0 {
+		return -x
+	}
+	return x
+}
diff --git a/examples/dot/stub.go b/examples/dot/stub.go
new file mode 100644
index 0000000..d5deab7
--- /dev/null
+++ b/examples/dot/stub.go
@@ -0,0 +1,5 @@
+// Code generated by command: go run asm.go -out dot.s -stubs stub.go. DO NOT EDIT.
+
+package dot
+
+func Dot(x []float32, y []float32) float32