printer: use tabwriter to align instructions (#8)
This commit is contained in:
@@ -4,55 +4,59 @@
|
||||
|
||||
// func Dot(x []float32, y []float32) float32
//
// Dot accumulates x[i]*y[i] over len(x) elements and stores the float32 sum
// in the result slot. Only x's length is read; y is indexed with the same
// count, so the caller must guarantee len(y) >= len(x).
//
// Frame: $0-52 (no locals, 52 bytes of arguments + result)
//   x_base+0(FP)   pointer to x's data
//   x_len+8(FP)    number of elements in x
//   y_base+24(FP)  pointer to y's data
//   ret+48(FP)     float32 result
//
// Registers:
//   AX      cursor into x (advances in bytes)
//   CX      cursor into y (advances in bytes)
//   DX      elements still to process
//   Y0-Y5   six independent 8-lane partial sums (block loop)
//   Y6-Y11  x values loaded for the current block
//   X12     scalar partial sum for the tail (only lane 0 is ever non-zero)
TEXT ·Dot(SB), NOSPLIT, $0-52
	MOVQ   x_base+0(FP), AX
	MOVQ   y_base+24(FP), CX
	MOVQ   x_len+8(FP), DX

	// Clear the six block accumulators.
	VXORPS Y0, Y0, Y0
	VXORPS Y1, Y1, Y1
	VXORPS Y2, Y2, Y2
	VXORPS Y3, Y3, Y3
	VXORPS Y4, Y4, Y4
	VXORPS Y5, Y5, Y5

blockloop:
	// Each pass consumes 0x30 = 48 elements (6 vectors x 8 float32 lanes).
	// Fewer than 48 left: finish one element at a time in the tail.
	CMPQ        DX, $0x00000030
	JL          tail

	// Load 48 consecutive values of x ...
	VMOVUPS     (AX), Y6
	VMOVUPS     32(AX), Y7
	VMOVUPS     64(AX), Y8
	VMOVUPS     96(AX), Y9
	VMOVUPS     128(AX), Y10
	VMOVUPS     160(AX), Y11

	// ... and fold x*y into the accumulators: Yk += y_vec * x_vec.
	VFMADD231PS (CX), Y6, Y0
	VFMADD231PS 32(CX), Y7, Y1
	VFMADD231PS 64(CX), Y8, Y2
	VFMADD231PS 96(CX), Y9, Y3
	VFMADD231PS 128(CX), Y10, Y4
	VFMADD231PS 160(CX), Y11, Y5

	// Advance both cursors by 0xc0 = 192 bytes (48 * 4) and drop the count.
	ADDQ        $0x000000c0, AX
	ADDQ        $0x000000c0, CX
	SUBQ        $0x00000030, DX
	JMP         blockloop

tail:
	// Scalar accumulator for the remaining (< 48) elements.
	VXORPS X12, X12, X12

tailloop:
	// DX counts the elements left; stop when it reaches zero.
	CMPQ        DX, $0x00000000
	JE          reduce

	// X12[0] += x[i] * y[i]; the upper lanes of X12 stay zero.
	VMOVSS      (AX), X6
	VFMADD231SS (CX), X6, X12

	// Step both cursors by one float32 (4 bytes).
	ADDQ        $0x00000004, AX
	ADDQ        $0x00000004, CX
	DECQ        DX
	JMP         tailloop

reduce:
	// Collapse the six vector accumulators into Y0.
	VADDPS       Y0, Y1, Y0
	VADDPS       Y0, Y2, Y0
	VADDPS       Y0, Y3, Y0
	VADDPS       Y0, Y4, Y0
	VADDPS       Y0, Y5, Y0

	// Fold the upper 128 bits of Y0 onto the lower 128, then add the tail sum.
	VEXTRACTF128 $0x01, Y0, X1
	VADDPS       X0, X1, X0
	VADDPS       X0, X12, X0

	// Two horizontal adds leave the sum of all four lanes in lane 0.
	VHADDPS      X0, X0, X0
	VHADDPS      X0, X0, X0

	// The upper YMM halves are no longer needed. Clear them so the legacy-SSE
	// store below and the caller's SSE code do not pay an AVX/SSE transition
	// penalty. VZEROUPPER leaves the low 128 bits (X0) untouched.
	VZEROUPPER
	MOVSS        X0, ret+48(FP)
	RET
|
||||
|
||||
Reference in New Issue
Block a user