printer: use tabwriter to align instructions (#8)

This commit is contained in:
Michael McLoughlin
2019-01-10 21:21:41 -08:00
parent 0e253b3753
commit f77a2e3b9e
20 changed files with 1876 additions and 1821 deletions

View File

@@ -43,9 +43,9 @@ This produces [`add.s`](add.s) as follows:
// func Add(x uint64, y uint64) uint64
TEXT ·Add(SB), NOSPLIT, $0-24
MOVQ x(FP), AX
MOVQ y+8(FP), CX
ADDQ AX, CX
MOVQ CX, ret+16(FP)
MOVQ x(FP), AX
MOVQ y+8(FP), CX
ADDQ AX, CX
MOVQ CX, ret+16(FP)
RET
```

View File

@@ -4,8 +4,8 @@
// func Add(x uint64, y uint64) uint64
TEXT ·Add(SB), NOSPLIT, $0-24
MOVQ x(FP), AX
MOVQ y+8(FP), CX
ADDQ AX, CX
MOVQ CX, ret+16(FP)
MOVQ x(FP), AX
MOVQ y+8(FP), CX
ADDQ AX, CX
MOVQ CX, ret+16(FP)
RET

View File

@@ -20,8 +20,8 @@ This `avo` code will generate the following assembly. Note that parameter refere
```s
// func Second(x int32, y int32) int32
TEXT ·Second(SB), NOSPLIT, $0-12
MOVL y+4(FP), AX
MOVL AX, ret+8(FP)
MOVL y+4(FP), AX
MOVL AX, ret+8(FP)
RET
```

View File

@@ -4,108 +4,108 @@
// func Second(x int32, y int32) int32
TEXT ·Second(SB), NOSPLIT, $0-12
MOVL y+4(FP), AX
MOVL AX, ret+8(FP)
MOVL y+4(FP), AX
MOVL AX, ret+8(FP)
RET
// func StringLen(s string) int
TEXT ·StringLen(SB), NOSPLIT, $0-24
MOVQ s_len+8(FP), AX
MOVQ AX, ret+16(FP)
MOVQ s_len+8(FP), AX
MOVQ AX, ret+16(FP)
RET
// func SliceLen(s []int) int
TEXT ·SliceLen(SB), NOSPLIT, $0-32
MOVQ s_len+8(FP), AX
MOVQ AX, ret+24(FP)
MOVQ s_len+8(FP), AX
MOVQ AX, ret+24(FP)
RET
// func SliceCap(s []int) int
TEXT ·SliceCap(SB), NOSPLIT, $0-32
MOVQ s_cap+16(FP), AX
MOVQ AX, ret+24(FP)
MOVQ s_cap+16(FP), AX
MOVQ AX, ret+24(FP)
RET
// func ArrayThree(a [7]uint64) uint64
TEXT ·ArrayThree(SB), NOSPLIT, $0-64
MOVQ a_3+24(FP), AX
MOVQ AX, ret+56(FP)
MOVQ a_3+24(FP), AX
MOVQ AX, ret+56(FP)
RET
// func FieldByte(s Struct) byte
TEXT ·FieldByte(SB), NOSPLIT, $0-177
MOVB s_Byte(FP), AL
MOVB AL, ret+176(FP)
MOVB s_Byte(FP), AL
MOVB AL, ret+176(FP)
RET
// func FieldInt8(s Struct) int8
TEXT ·FieldInt8(SB), NOSPLIT, $0-177
MOVB s_Int8+1(FP), AL
MOVB AL, ret+176(FP)
MOVB s_Int8+1(FP), AL
MOVB AL, ret+176(FP)
RET
// func FieldUint16(s Struct) uint16
TEXT ·FieldUint16(SB), NOSPLIT, $0-178
MOVW s_Uint16+2(FP), AX
MOVW AX, ret+176(FP)
MOVW s_Uint16+2(FP), AX
MOVW AX, ret+176(FP)
RET
// func FieldInt32(s Struct) int32
TEXT ·FieldInt32(SB), NOSPLIT, $0-180
MOVL s_Int32+4(FP), AX
MOVL AX, ret+176(FP)
MOVL s_Int32+4(FP), AX
MOVL AX, ret+176(FP)
RET
// func FieldUint64(s Struct) uint64
TEXT ·FieldUint64(SB), NOSPLIT, $0-184
MOVQ s_Uint64+8(FP), AX
MOVQ AX, ret+176(FP)
MOVQ s_Uint64+8(FP), AX
MOVQ AX, ret+176(FP)
RET
// func FieldFloat32(s Struct) float32
TEXT ·FieldFloat32(SB), NOSPLIT, $0-180
MOVSS s_Float32+16(FP), X0
MOVSS X0, ret+176(FP)
MOVSS s_Float32+16(FP), X0
MOVSS X0, ret+176(FP)
RET
// func FieldFloat64(s Struct) float64
TEXT ·FieldFloat64(SB), NOSPLIT, $0-184
MOVSD s_Float64+24(FP), X0
MOVSD X0, ret+176(FP)
MOVSD s_Float64+24(FP), X0
MOVSD X0, ret+176(FP)
RET
// func FieldStringLen(s Struct) int
TEXT ·FieldStringLen(SB), NOSPLIT, $0-184
MOVQ s_String_len+40(FP), AX
MOVQ AX, ret+176(FP)
MOVQ s_String_len+40(FP), AX
MOVQ AX, ret+176(FP)
RET
// func FieldSliceCap(s Struct) int
TEXT ·FieldSliceCap(SB), NOSPLIT, $0-184
MOVQ s_Slice_cap+64(FP), AX
MOVQ AX, ret+176(FP)
MOVQ s_Slice_cap+64(FP), AX
MOVQ AX, ret+176(FP)
RET
// func FieldArrayTwoBTwo(s Struct) byte
TEXT ·FieldArrayTwoBTwo(SB), NOSPLIT, $0-177
MOVB s_Array_2_B_2+114(FP), AL
MOVB AL, ret+176(FP)
MOVB s_Array_2_B_2+114(FP), AL
MOVB AL, ret+176(FP)
RET
// func FieldArrayOneC(s Struct) uint16
TEXT ·FieldArrayOneC(SB), NOSPLIT, $0-178
MOVW s_Array_1_C+100(FP), AX
MOVW AX, ret+176(FP)
MOVW s_Array_1_C+100(FP), AX
MOVW AX, ret+176(FP)
RET
// func FieldComplex64Imag(s Struct) float32
TEXT ·FieldComplex64Imag(SB), NOSPLIT, $0-180
MOVSS s_Complex64_imag+156(FP), X0
MOVSS X0, ret+176(FP)
MOVSS s_Complex64_imag+156(FP), X0
MOVSS X0, ret+176(FP)
RET
// func FieldComplex128Real(s Struct) float64
TEXT ·FieldComplex128Real(SB), NOSPLIT, $0-184
MOVSD s_Complex128_real+160(FP), X0
MOVSD X0, ret+176(FP)
MOVSD s_Complex128_real+160(FP), X0
MOVSD X0, ret+176(FP)
RET

BIN
examples/backup.tar.gz Normal file

Binary file not shown.

View File

@@ -25,12 +25,12 @@ Generated assembly:
```s
// func Norm(z complex128) float64
TEXT ·Norm(SB), NOSPLIT, $0-24
MOVSD z_real(FP), X0
MOVSD z_imag+8(FP), X1
MULSD X0, X0
MULSD X1, X1
ADDSD X1, X0
SQRTSD X0, X2
MOVSD X2, ret+16(FP)
MOVSD z_real(FP), X0
MOVSD z_imag+8(FP), X1
MULSD X0, X0
MULSD X1, X1
ADDSD X1, X0
SQRTSD X0, X2
MOVSD X2, ret+16(FP)
RET
```

View File

@@ -4,23 +4,23 @@
// func Real(z complex128) float64
TEXT ·Real(SB), NOSPLIT, $0-24
MOVSD z_real(FP), X0
MOVSD X0, ret+16(FP)
MOVSD z_real(FP), X0
MOVSD X0, ret+16(FP)
RET
// func Imag(z complex128) float64
TEXT ·Imag(SB), NOSPLIT, $0-24
MOVSD z_imag+8(FP), X0
MOVSD X0, ret+16(FP)
MOVSD z_imag+8(FP), X0
MOVSD X0, ret+16(FP)
RET
// func Norm(z complex128) float64
TEXT ·Norm(SB), NOSPLIT, $0-24
MOVSD z_real(FP), X0
MOVSD z_imag+8(FP), X1
MULSD X0, X0
MULSD X1, X1
ADDSD X1, X0
SQRTSD X0, X2
MOVSD X2, ret+16(FP)
MOVSD z_real(FP), X0
MOVSD z_imag+8(FP), X1
MULSD X0, X0
MULSD X1, X1
ADDSD X1, X0
SQRTSD X0, X2
MOVSD X2, ret+16(FP)
RET

View File

@@ -14,8 +14,8 @@ GLOBL bytes<>(SB), RODATA|NOPTR, $40
// func DataAt(i int) byte
TEXT ·DataAt(SB), NOSPLIT, $0-9
MOVQ i(FP), AX
LEAQ bytes<>(SB), CX
MOVB (CX)(AX*1), AL
MOVB AL, ret+8(FP)
MOVQ i(FP), AX
LEAQ bytes<>(SB), CX
MOVB (CX)(AX*1), AL
MOVB AL, ret+8(FP)
RET

View File

@@ -4,55 +4,59 @@
// func Dot(x []float32, y []float32) float32
TEXT ·Dot(SB), NOSPLIT, $0-52
MOVQ x_base(FP), AX
MOVQ y_base+24(FP), CX
MOVQ x_len+8(FP), DX
VXORPS Y0, Y0, Y0
VXORPS Y1, Y1, Y1
VXORPS Y2, Y2, Y2
VXORPS Y3, Y3, Y3
VXORPS Y4, Y4, Y4
VXORPS Y5, Y5, Y5
MOVQ x_base(FP), AX
MOVQ y_base+24(FP), CX
MOVQ x_len+8(FP), DX
VXORPS Y0, Y0, Y0
VXORPS Y1, Y1, Y1
VXORPS Y2, Y2, Y2
VXORPS Y3, Y3, Y3
VXORPS Y4, Y4, Y4
VXORPS Y5, Y5, Y5
blockloop:
CMPQ DX, $0x00000030
JL tail
VMOVUPS (AX), Y6
VMOVUPS 32(AX), Y7
VMOVUPS 64(AX), Y8
VMOVUPS 96(AX), Y9
VMOVUPS 128(AX), Y10
VMOVUPS 160(AX), Y11
VFMADD231PS (CX), Y6, Y0
VFMADD231PS 32(CX), Y7, Y1
VFMADD231PS 64(CX), Y8, Y2
VFMADD231PS 96(CX), Y9, Y3
VFMADD231PS 128(CX), Y10, Y4
VFMADD231PS 160(CX), Y11, Y5
ADDQ $0x000000c0, AX
ADDQ $0x000000c0, CX
SUBQ $0x00000030, DX
JMP blockloop
CMPQ DX, $0x00000030
JL tail
VMOVUPS (AX), Y6
VMOVUPS 32(AX), Y7
VMOVUPS 64(AX), Y8
VMOVUPS 96(AX), Y9
VMOVUPS 128(AX), Y10
VMOVUPS 160(AX), Y11
VFMADD231PS (CX), Y6, Y0
VFMADD231PS 32(CX), Y7, Y1
VFMADD231PS 64(CX), Y8, Y2
VFMADD231PS 96(CX), Y9, Y3
VFMADD231PS 128(CX), Y10, Y4
VFMADD231PS 160(CX), Y11, Y5
ADDQ $0x000000c0, AX
ADDQ $0x000000c0, CX
SUBQ $0x00000030, DX
JMP blockloop
tail:
VXORPS X12, X12, X12
VXORPS X12, X12, X12
tailloop:
CMPQ DX, $0x00000000
JE reduce
VMOVSS (AX), X6
VFMADD231SS (CX), X6, X12
ADDQ $0x00000004, AX
ADDQ $0x00000004, CX
DECQ DX
JMP tailloop
CMPQ DX, $0x00000000
JE reduce
VMOVSS (AX), X6
VFMADD231SS (CX), X6, X12
ADDQ $0x00000004, AX
ADDQ $0x00000004, CX
DECQ DX
JMP tailloop
reduce:
VADDPS Y0, Y1, Y0
VADDPS Y0, Y2, Y0
VADDPS Y0, Y3, Y0
VADDPS Y0, Y4, Y0
VADDPS Y0, Y5, Y0
VEXTRACTF128 $0x01, Y0, X1
VADDPS X0, X1, X0
VADDPS X0, X12, X0
VHADDPS X0, X0, X0
VHADDPS X0, X0, X0
MOVSS X0, ret+48(FP)
VADDPS Y0, Y1, Y0
VADDPS Y0, Y2, Y0
VADDPS Y0, Y3, Y0
VADDPS Y0, Y4, Y0
VADDPS Y0, Y5, Y0
VEXTRACTF128 $0x01, Y0, X1
VADDPS X0, X1, X0
VADDPS X0, X12, X0
VHADDPS X0, X0, X0
VHADDPS X0, X0, X0
MOVSS X0, ret+48(FP)
RET

View File

@@ -4,19 +4,21 @@
// func Hash64(data []byte) uint64
TEXT ·Hash64(SB), NOSPLIT, $0-32
MOVQ data_base(FP), CX
MOVQ data_len+8(FP), BX
MOVQ $0xcbf29ce484222325, AX
MOVQ $0x00000100000001b3, BP
MOVQ data_base(FP), CX
MOVQ data_len+8(FP), BX
MOVQ $0xcbf29ce484222325, AX
MOVQ $0x00000100000001b3, BP
loop:
CMPQ BX, $0x00
JE done
MOVBQZX (CX), DX
XORQ DX, AX
MULQ BP
INCQ CX
DECQ BX
JMP loop
CMPQ BX, $0x00
JE done
MOVBQZX (CX), DX
XORQ DX, AX
MULQ BP
INCQ CX
DECQ BX
JMP loop
done:
MOVQ AX, ret+24(FP)
MOVQ AX, ret+24(FP)
RET

View File

@@ -4,21 +4,21 @@
// func EncodeInt(lat float64, lng float64) uint64
TEXT ·EncodeInt(SB), NOSPLIT, $0-24
MOVSD lat(FP), X0
MOVSD lng+8(FP), X1
MULSD reciprocal180<>(SB), X0
ADDSD onepointfive<>(SB), X0
MULSD reciprocal360<>(SB), X1
ADDSD onepointfive<>(SB), X1
MOVQ X0, CX
SHRQ $0x14, CX
MOVQ X1, AX
SHRQ $0x14, AX
PDEPQ mask<>(SB), CX, CX
PDEPQ mask<>(SB), AX, AX
SHLQ $0x01, AX
XORQ AX, CX
MOVQ CX, ret+16(FP)
MOVSD lat(FP), X0
MOVSD lng+8(FP), X1
MULSD reciprocal180<>(SB), X0
ADDSD onepointfive<>(SB), X0
MULSD reciprocal360<>(SB), X1
ADDSD onepointfive<>(SB), X1
MOVQ X0, CX
SHRQ $0x14, CX
MOVQ X1, AX
SHRQ $0x14, AX
PDEPQ mask<>(SB), CX, CX
PDEPQ mask<>(SB), AX, AX
SHLQ $0x01, AX
XORQ AX, CX
MOVQ CX, ret+16(FP)
RET
DATA reciprocal180<>(SB)/8, $(0.005555555555555556)

View File

@@ -4,43 +4,43 @@
// func Interval(start uint64, size uint64) (uint64, uint64)
TEXT ·Interval(SB), NOSPLIT, $0-32
MOVQ start(FP), AX
MOVQ size+8(FP), CX
ADDQ AX, CX
MOVQ AX, ret+16(FP)
MOVQ CX, ret1+24(FP)
MOVQ start(FP), AX
MOVQ size+8(FP), CX
ADDQ AX, CX
MOVQ AX, ret+16(FP)
MOVQ CX, ret1+24(FP)
RET
// func Butterfly(x0 float64, x1 float64) (y0 float64, y1 float64)
TEXT ·Butterfly(SB), NOSPLIT, $0-32
MOVSD x0(FP), X0
MOVSD x1+8(FP), X1
MOVSD X0, X2
ADDSD X1, X2
MOVSD X0, X3
SUBSD X1, X3
MOVSD X2, y0+16(FP)
MOVSD X3, y1+24(FP)
MOVSD x0(FP), X0
MOVSD x1+8(FP), X1
MOVSD X0, X2
ADDSD X1, X2
MOVSD X0, X3
SUBSD X1, X3
MOVSD X2, y0+16(FP)
MOVSD X3, y1+24(FP)
RET
// func Septuple(byte) [7]byte
TEXT ·Septuple(SB), NOSPLIT, $0-15
MOVB arg(FP), AL
MOVB AL, ret_0+8(FP)
MOVB AL, ret_1+9(FP)
MOVB AL, ret_2+10(FP)
MOVB AL, ret_3+11(FP)
MOVB AL, ret_4+12(FP)
MOVB AL, ret_5+13(FP)
MOVB AL, ret_6+14(FP)
MOVB arg(FP), AL
MOVB AL, ret_0+8(FP)
MOVB AL, ret_1+9(FP)
MOVB AL, ret_2+10(FP)
MOVB AL, ret_3+11(FP)
MOVB AL, ret_4+12(FP)
MOVB AL, ret_5+13(FP)
MOVB AL, ret_6+14(FP)
RET
// func CriticalLine(t float64) complex128
TEXT ·CriticalLine(SB), NOSPLIT, $0-24
MOVSD t(FP), X0
MOVSD half<>(SB), X1
MOVSD X1, ret_real+8(FP)
MOVSD X0, ret_imag+16(FP)
MOVSD t(FP), X0
MOVSD half<>(SB), X1
MOVSD X1, ret_real+8(FP)
MOVSD X0, ret_imag+16(FP)
RET
DATA half<>(SB)/8, $(0.5)
@@ -48,12 +48,12 @@ GLOBL half<>(SB), RODATA|NOPTR, $8
// func NewStruct(w uint16, p [2]float64, q uint64) Struct
TEXT ·NewStruct(SB), NOSPLIT, $0-64
MOVW w(FP), AX
MOVSD p_0+8(FP), X0
MOVSD p_1+16(FP), X1
MOVQ q+24(FP), CX
MOVW AX, ret_Word+32(FP)
MOVSD X0, ret_Point_0+40(FP)
MOVSD X1, ret_Point_1+48(FP)
MOVQ CX, ret_Quad+56(FP)
MOVW w(FP), AX
MOVSD p_0+8(FP), X0
MOVSD p_1+16(FP), X1
MOVQ q+24(FP), CX
MOVW AX, ret_Word+32(FP)
MOVSD X0, ret_Point_0+40(FP)
MOVSD X1, ret_Point_1+48(FP)
MOVQ CX, ret_Quad+56(FP)
RET

File diff suppressed because it is too large Load Diff

View File

@@ -4,307 +4,335 @@
// func Hash(state *State, key []byte) uint64
TEXT ·Hash(SB), NOSPLIT, $0-40
MOVQ state(FP), AX
MOVQ key_base+8(FP), CX
MOVQ key_len+16(FP), DX
MOVQ (AX), BX
MOVQ 8(AX), BP
MOVQ DX, SI
ADDQ $0x00000001, SI
MOVQ $0xb89b0f8e1655514f, DI
IMULQ DI, SI
XORQ SI, BX
MOVQ DX, SI
ADDQ $0x00000002, SI
MOVQ $0x8c6f736011bd5127, DI
IMULQ DI, SI
XORQ SI, BP
CMPQ DX, $0x00000020
JGE coreLong
MOVQ DX, SI
SHRQ $0x03, SI
CMPQ SI, $0x00000000
JE shortCore0
CMPQ SI, $0x00000001
JE shortCore1
CMPQ SI, $0x00000002
JE shortCore2
CMPQ SI, $0x00000003
JE shortCore3
MOVQ state(FP), AX
MOVQ key_base+8(FP), CX
MOVQ key_len+16(FP), DX
MOVQ (AX), BX
MOVQ 8(AX), BP
MOVQ DX, SI
ADDQ $0x00000001, SI
MOVQ $0xb89b0f8e1655514f, DI
IMULQ DI, SI
XORQ SI, BX
MOVQ DX, SI
ADDQ $0x00000002, SI
MOVQ $0x8c6f736011bd5127, DI
IMULQ DI, SI
XORQ SI, BP
CMPQ DX, $0x00000020
JGE coreLong
MOVQ DX, SI
SHRQ $0x03, SI
CMPQ SI, $0x00000000
JE shortCore0
CMPQ SI, $0x00000001
JE shortCore1
CMPQ SI, $0x00000002
JE shortCore2
CMPQ SI, $0x00000003
JE shortCore3
shortCore3:
MOVQ (CX), SI
MOVQ $0x9c1b8e1e9628323f, DI
IMULQ DI, SI
ADDQ SI, BX
RORQ $0x11, BX
XORQ BP, BX
RORQ $0x35, BP
ADDQ BX, BP
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
MOVQ (CX), SI
MOVQ $0x9c1b8e1e9628323f, DI
IMULQ DI, SI
ADDQ SI, BX
RORQ $0x11, BX
XORQ BP, BX
RORQ $0x35, BP
ADDQ BX, BP
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
shortCore2:
MOVQ (CX), SI
MOVQ $0x9c1b8e1e9628323f, DI
IMULQ DI, SI
ADDQ SI, BX
RORQ $0x11, BX
XORQ BP, BX
RORQ $0x35, BP
ADDQ BX, BP
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
MOVQ (CX), SI
MOVQ $0x9c1b8e1e9628323f, DI
IMULQ DI, SI
ADDQ SI, BX
RORQ $0x11, BX
XORQ BP, BX
RORQ $0x35, BP
ADDQ BX, BP
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
shortCore1:
MOVQ (CX), SI
MOVQ $0x9c1b8e1e9628323f, DI
IMULQ DI, SI
ADDQ SI, BX
RORQ $0x11, BX
XORQ BP, BX
RORQ $0x35, BP
ADDQ BX, BP
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
MOVQ (CX), SI
MOVQ $0x9c1b8e1e9628323f, DI
IMULQ DI, SI
ADDQ SI, BX
RORQ $0x11, BX
XORQ BP, BX
RORQ $0x35, BP
ADDQ BX, BP
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
shortCore0:
CMPQ DX, $0x00000000
JE shortTail0
CMPQ DX, $0x00000001
JE shortTail1
CMPQ DX, $0x00000002
JE shortTail2
CMPQ DX, $0x00000003
JE shortTail3
CMPQ DX, $0x00000004
JE shortTail4
CMPQ DX, $0x00000005
JE shortTail5
CMPQ DX, $0x00000006
JE shortTail6
CMPQ DX, $0x00000007
JE shortTail7
CMPQ DX, $0x00000000
JE shortTail0
CMPQ DX, $0x00000001
JE shortTail1
CMPQ DX, $0x00000002
JE shortTail2
CMPQ DX, $0x00000003
JE shortTail3
CMPQ DX, $0x00000004
JE shortTail4
CMPQ DX, $0x00000005
JE shortTail5
CMPQ DX, $0x00000006
JE shortTail6
CMPQ DX, $0x00000007
JE shortTail7
shortTail7:
MOVBQZX 6(CX), SI
SHLQ $0x20, SI
ADDQ SI, BX
MOVBQZX 6(CX), SI
SHLQ $0x20, SI
ADDQ SI, BX
shortTail6:
MOVBQZX 5(CX), SI
SHLQ $0x30, SI
ADDQ SI, BP
MOVBQZX 5(CX), SI
SHLQ $0x30, SI
ADDQ SI, BP
shortTail5:
MOVBQZX 4(CX), SI
SHLQ $0x10, SI
ADDQ SI, BX
MOVBQZX 4(CX), SI
SHLQ $0x10, SI
ADDQ SI, BX
shortTail4:
MOVLQZX (CX), SI
ADDQ SI, BP
JMP shortAfter
MOVLQZX (CX), SI
ADDQ SI, BP
JMP shortAfter
shortTail3:
MOVBQZX 2(CX), SI
SHLQ $0x30, SI
ADDQ SI, BX
MOVBQZX 2(CX), SI
SHLQ $0x30, SI
ADDQ SI, BX
shortTail2:
MOVWQZX (CX), SI
ADDQ SI, BP
JMP shortAfter
MOVWQZX (CX), SI
ADDQ SI, BP
JMP shortAfter
shortTail1:
MOVBQZX (CX), SI
ADDQ SI, BX
MOVBQZX (CX), SI
ADDQ SI, BX
shortTail0:
RORQ $0x20, BP
XORQ $0x000000ff, BP
RORQ $0x20, BP
XORQ $0x000000ff, BP
shortAfter:
XORQ BX, BP
RORQ $0x21, BX
ADDQ BP, BX
ROLQ $0x11, BP
XORQ BX, BP
ROLQ $0x2b, BX
ADDQ BP, BX
ROLQ $0x1f, BP
SUBQ BX, BP
ROLQ $0x0d, BX
XORQ BP, BX
SUBQ BX, BP
ROLQ $0x29, BX
ADDQ BP, BX
ROLQ $0x25, BP
XORQ BX, BP
RORQ $0x27, BX
ADDQ BP, BX
RORQ $0x0f, BP
ADDQ BX, BP
ROLQ $0x0f, BX
XORQ BP, BX
RORQ $0x05, BP
XORQ BP, BX
MOVQ BX, ret+32(FP)
XORQ BX, BP
RORQ $0x21, BX
ADDQ BP, BX
ROLQ $0x11, BP
XORQ BX, BP
ROLQ $0x2b, BX
ADDQ BP, BX
ROLQ $0x1f, BP
SUBQ BX, BP
ROLQ $0x0d, BX
XORQ BP, BX
SUBQ BX, BP
ROLQ $0x29, BX
ADDQ BP, BX
ROLQ $0x25, BP
XORQ BX, BP
RORQ $0x27, BX
ADDQ BP, BX
RORQ $0x0f, BP
ADDQ BX, BP
ROLQ $0x0f, BX
XORQ BP, BX
RORQ $0x05, BP
XORQ BP, BX
MOVQ BX, ret+32(FP)
RET
coreLong:
MOVQ 16(AX), DI
MOVQ 24(AX), AX
MOVQ DX, SI
ADDQ $0x00000003, SI
MOVQ $0x8f29bd94edce7b39, R8
IMULQ R8, SI
XORQ SI, DI
MOVQ DX, SI
ADDQ $0x00000004, SI
MOVQ $0x9c1b8e1e9628323f, R8
IMULQ R8, SI
XORQ SI, AX
MOVQ 16(AX), DI
MOVQ 24(AX), AX
MOVQ DX, SI
ADDQ $0x00000003, SI
MOVQ $0x8f29bd94edce7b39, R8
IMULQ R8, SI
XORQ SI, DI
MOVQ DX, SI
ADDQ $0x00000004, SI
MOVQ $0x9c1b8e1e9628323f, R8
IMULQ R8, SI
XORQ SI, AX
block:
MOVQ (CX), SI
MOVQ $0x00000000802910e3, R8
IMULQ R8, SI
ADDQ SI, BX
ROLQ $0x39, BX
XORQ AX, BX
MOVQ 8(CX), SI
MOVQ $0x00000000819b13af, R8
IMULQ R8, SI
ADDQ SI, BP
ROLQ $0x3f, BP
XORQ DI, BP
MOVQ 16(CX), SI
MOVQ $0x0000000091cb27e5, R8
IMULQ R8, SI
ADDQ SI, DI
RORQ $0x2f, DI
ADDQ BX, DI
MOVQ 24(CX), SI
MOVQ $0x00000000c1a269c1, R8
IMULQ R8, SI
ADDQ SI, AX
RORQ $0x0b, AX
SUBQ BP, AX
ADDQ $0x00000020, CX
SUBQ $0x00000020, DX
CMPQ DX, $0x00000020
JGE block
MOVQ DX, R8
MOVQ DX, SI
SHRQ $0x03, SI
CMPQ SI, $0x00000000
JE longCore0
CMPQ SI, $0x00000001
JE longCore1
CMPQ SI, $0x00000002
JE longCore2
CMPQ SI, $0x00000003
JE longCore3
MOVQ (CX), SI
MOVQ $0x00000000802910e3, R8
IMULQ R8, SI
ADDQ SI, BX
ROLQ $0x39, BX
XORQ AX, BX
MOVQ 8(CX), SI
MOVQ $0x00000000819b13af, R8
IMULQ R8, SI
ADDQ SI, BP
ROLQ $0x3f, BP
XORQ DI, BP
MOVQ 16(CX), SI
MOVQ $0x0000000091cb27e5, R8
IMULQ R8, SI
ADDQ SI, DI
RORQ $0x2f, DI
ADDQ BX, DI
MOVQ 24(CX), SI
MOVQ $0x00000000c1a269c1, R8
IMULQ R8, SI
ADDQ SI, AX
RORQ $0x0b, AX
SUBQ BP, AX
ADDQ $0x00000020, CX
SUBQ $0x00000020, DX
CMPQ DX, $0x00000020
JGE block
MOVQ DX, R8
MOVQ DX, SI
SHRQ $0x03, SI
CMPQ SI, $0x00000000
JE longCore0
CMPQ SI, $0x00000001
JE longCore1
CMPQ SI, $0x00000002
JE longCore2
CMPQ SI, $0x00000003
JE longCore3
longCore3:
MOVQ (CX), SI
MOVQ $0x00000000802910e3, R9
IMULQ R9, SI
ADDQ SI, BX
ROLQ $0x39, BX
XORQ AX, BX
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
MOVQ (CX), SI
MOVQ $0x00000000802910e3, R9
IMULQ R9, SI
ADDQ SI, BX
ROLQ $0x39, BX
XORQ AX, BX
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
longCore2:
MOVQ (CX), SI
MOVQ $0x00000000819b13af, R9
IMULQ R9, SI
ADDQ SI, BP
ROLQ $0x3f, BP
XORQ DI, BP
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
MOVQ (CX), SI
MOVQ $0x00000000819b13af, R9
IMULQ R9, SI
ADDQ SI, BP
ROLQ $0x3f, BP
XORQ DI, BP
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
longCore1:
MOVQ (CX), SI
MOVQ $0x0000000091cb27e5, R9
IMULQ R9, SI
ADDQ SI, DI
RORQ $0x2f, DI
ADDQ BX, DI
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
MOVQ (CX), SI
MOVQ $0x0000000091cb27e5, R9
IMULQ R9, SI
ADDQ SI, DI
RORQ $0x2f, DI
ADDQ BX, DI
ADDQ $0x00000008, CX
SUBQ $0x00000008, DX
longCore0:
RORQ $0x0b, AX
SUBQ BP, AX
ADDQ $0x00000001, R8
MOVQ $0x9c1b8e1e9628323f, SI
IMULQ SI, R8
XORQ R8, BX
CMPQ DX, $0x00000000
JE longTail0
CMPQ DX, $0x00000001
JE longTail1
CMPQ DX, $0x00000002
JE longTail2
CMPQ DX, $0x00000003
JE longTail3
CMPQ DX, $0x00000004
JE longTail4
CMPQ DX, $0x00000005
JE longTail5
CMPQ DX, $0x00000006
JE longTail6
CMPQ DX, $0x00000007
JE longTail7
RORQ $0x0b, AX
SUBQ BP, AX
ADDQ $0x00000001, R8
MOVQ $0x9c1b8e1e9628323f, SI
IMULQ SI, R8
XORQ R8, BX
CMPQ DX, $0x00000000
JE longTail0
CMPQ DX, $0x00000001
JE longTail1
CMPQ DX, $0x00000002
JE longTail2
CMPQ DX, $0x00000003
JE longTail3
CMPQ DX, $0x00000004
JE longTail4
CMPQ DX, $0x00000005
JE longTail5
CMPQ DX, $0x00000006
JE longTail6
CMPQ DX, $0x00000007
JE longTail7
longTail7:
MOVBQZX 6(CX), SI
ADDQ SI, BP
MOVBQZX 6(CX), SI
ADDQ SI, BP
longTail6:
MOVWQZX 4(CX), SI
ADDQ SI, DI
MOVLQZX (CX), SI
ADDQ SI, AX
JMP longAfter
MOVWQZX 4(CX), SI
ADDQ SI, DI
MOVLQZX (CX), SI
ADDQ SI, AX
JMP longAfter
longTail5:
MOVBQZX 4(CX), SI
ADDQ SI, BP
MOVBQZX 4(CX), SI
ADDQ SI, BP
longTail4:
MOVLQZX (CX), SI
ADDQ SI, DI
JMP longAfter
MOVLQZX (CX), SI
ADDQ SI, DI
JMP longAfter
longTail3:
MOVBQZX 2(CX), SI
ADDQ SI, AX
MOVBQZX 2(CX), SI
ADDQ SI, AX
longTail2:
MOVWQZX (CX), SI
ADDQ SI, BP
JMP longAfter
MOVWQZX (CX), SI
ADDQ SI, BP
JMP longAfter
longTail1:
MOVBQZX (CX), SI
ADDQ SI, DI
MOVBQZX (CX), SI
ADDQ SI, DI
longTail0:
ROLQ $0x20, AX
XORQ $0x000000ff, AX
ROLQ $0x20, AX
XORQ $0x000000ff, AX
longAfter:
SUBQ DI, BP
RORQ $0x13, BX
SUBQ BX, BP
RORQ $0x35, BP
XORQ BP, AX
SUBQ AX, BX
ROLQ $0x2b, AX
ADDQ AX, BX
RORQ $0x03, BX
SUBQ BX, AX
RORQ $0x2b, DI
SUBQ AX, DI
ROLQ $0x37, DI
XORQ BX, DI
SUBQ DI, BP
RORQ $0x07, AX
SUBQ DI, AX
RORQ $0x1f, DI
ADDQ DI, AX
SUBQ BP, DI
RORQ $0x27, AX
XORQ AX, DI
RORQ $0x11, AX
XORQ DI, AX
ADDQ AX, BP
RORQ $0x09, BP
XORQ BP, DI
ROLQ $0x18, DI
XORQ DI, AX
RORQ $0x3b, AX
RORQ $0x01, BX
SUBQ BP, BX
XORQ BP, BX
XORQ AX, DI
XORQ DI, BX
MOVQ BX, ret+32(FP)
SUBQ DI, BP
RORQ $0x13, BX
SUBQ BX, BP
RORQ $0x35, BP
XORQ BP, AX
SUBQ AX, BX
ROLQ $0x2b, AX
ADDQ AX, BX
RORQ $0x03, BX
SUBQ BX, AX
RORQ $0x2b, DI
SUBQ AX, DI
ROLQ $0x37, DI
XORQ BX, DI
SUBQ DI, BP
RORQ $0x07, AX
SUBQ DI, AX
RORQ $0x1f, DI
ADDQ DI, AX
SUBQ BP, DI
RORQ $0x27, AX
XORQ AX, DI
RORQ $0x11, AX
XORQ DI, AX
ADDQ AX, BP
RORQ $0x09, BP
XORQ BP, DI
ROLQ $0x18, DI
XORQ DI, AX
RORQ $0x3b, AX
RORQ $0x01, BX
SUBQ BP, BX
XORQ BP, BX
XORQ AX, DI
XORQ DI, BX
MOVQ BX, ret+32(FP)
RET

View File

@@ -4,16 +4,18 @@
// func Sum(xs []uint64) uint64
TEXT ·Sum(SB), NOSPLIT, $0-32
MOVQ xs_base(FP), AX
MOVQ xs_len+8(FP), CX
XORQ DX, DX
MOVQ xs_base(FP), AX
MOVQ xs_len+8(FP), CX
XORQ DX, DX
loop:
CMPQ CX, $0x00
JE done
ADDQ (AX), DX
ADDQ $0x08, AX
DECQ CX
JMP loop
CMPQ CX, $0x00
JE done
ADDQ (AX), DX
ADDQ $0x08, AX
DECQ CX
JMP loop
done:
MOVQ DX, ret+24(FP)
MOVQ DX, ret+24(FP)
RET