pass: de-prioritize base pointer in register allocation (#184)

Updates #156
This commit is contained in:
Michael McLoughlin
2021-04-18 19:22:09 -07:00
committed by GitHub
parent f295bde84c
commit c32f24fb1e
9 changed files with 5273 additions and 5207 deletions

View File

@@ -3,18 +3,18 @@
#include "textflag.h" #include "textflag.h"
// func Hash64(data []byte) uint64 // func Hash64(data []byte) uint64
TEXT ·Hash64(SB), NOSPLIT, $8-32 TEXT ·Hash64(SB), NOSPLIT, $0-32
MOVQ data_base+0(FP), CX MOVQ data_base+0(FP), CX
MOVQ data_len+8(FP), BX MOVQ data_len+8(FP), BX
MOVQ $0xcbf29ce484222325, AX MOVQ $0xcbf29ce484222325, AX
MOVQ $0x00000100000001b3, BP MOVQ $0x00000100000001b3, SI
loop: loop:
CMPQ BX, $0x00 CMPQ BX, $0x00
JE done JE done
MOVBQZX (CX), DX MOVBQZX (CX), DX
XORQ DX, AX XORQ DX, AX
MULQ BP MULQ SI
INCQ CX INCQ CX
DECQ BX DECQ BX
JMP loop JMP loop

File diff suppressed because it is too large Load Diff

View File

@@ -3,68 +3,68 @@
#include "textflag.h" #include "textflag.h"
// func Hash(state *State, key []byte) uint64 // func Hash(state *State, key []byte) uint64
TEXT ·Hash(SB), NOSPLIT, $8-40 TEXT ·Hash(SB), NOSPLIT, $0-40
MOVQ state+0(FP), AX MOVQ state+0(FP), AX
MOVQ key_base+8(FP), CX MOVQ key_base+8(FP), CX
MOVQ key_len+16(FP), DX MOVQ key_len+16(FP), DX
MOVQ (AX), BX MOVQ (AX), BX
MOVQ 8(AX), BP MOVQ 8(AX), SI
MOVQ DX, SI MOVQ DX, DI
ADDQ $0x00000001, SI ADDQ $0x00000001, DI
MOVQ $0xb89b0f8e1655514f, DI MOVQ $0xb89b0f8e1655514f, R8
IMULQ DI, SI IMULQ R8, DI
XORQ SI, BX XORQ DI, BX
MOVQ DX, SI MOVQ DX, DI
ADDQ $0x00000002, SI ADDQ $0x00000002, DI
MOVQ $0x8c6f736011bd5127, DI MOVQ $0x8c6f736011bd5127, R8
IMULQ DI, SI IMULQ R8, DI
XORQ SI, BP XORQ DI, SI
CMPQ DX, $0x00000020 CMPQ DX, $0x00000020
JGE coreLong JGE coreLong
MOVQ DX, SI MOVQ DX, DI
SHRQ $0x03, SI SHRQ $0x03, DI
CMPQ SI, $0x00000000 CMPQ DI, $0x00000000
JE shortCore0 JE shortCore0
CMPQ SI, $0x00000001 CMPQ DI, $0x00000001
JE shortCore1 JE shortCore1
CMPQ SI, $0x00000002 CMPQ DI, $0x00000002
JE shortCore2 JE shortCore2
CMPQ SI, $0x00000003 CMPQ DI, $0x00000003
JE shortCore3 JE shortCore3
shortCore3: shortCore3:
MOVQ (CX), AX MOVQ (CX), AX
MOVQ $0x9c1b8e1e9628323f, SI MOVQ $0x9c1b8e1e9628323f, DI
IMULQ SI, AX IMULQ DI, AX
ADDQ AX, BX ADDQ AX, BX
RORQ $0x11, BX RORQ $0x11, BX
XORQ BP, BX XORQ SI, BX
RORQ $0x35, BP RORQ $0x35, SI
ADDQ BX, BP ADDQ BX, SI
ADDQ $0x00000008, CX ADDQ $0x00000008, CX
SUBQ $0x00000008, DX SUBQ $0x00000008, DX
shortCore2: shortCore2:
MOVQ (CX), AX MOVQ (CX), AX
MOVQ $0x9c1b8e1e9628323f, SI MOVQ $0x9c1b8e1e9628323f, DI
IMULQ SI, AX IMULQ DI, AX
ADDQ AX, BX ADDQ AX, BX
RORQ $0x11, BX RORQ $0x11, BX
XORQ BP, BX XORQ SI, BX
RORQ $0x35, BP RORQ $0x35, SI
ADDQ BX, BP ADDQ BX, SI
ADDQ $0x00000008, CX ADDQ $0x00000008, CX
SUBQ $0x00000008, DX SUBQ $0x00000008, DX
shortCore1: shortCore1:
MOVQ (CX), AX MOVQ (CX), AX
MOVQ $0x9c1b8e1e9628323f, SI MOVQ $0x9c1b8e1e9628323f, DI
IMULQ SI, AX IMULQ DI, AX
ADDQ AX, BX ADDQ AX, BX
RORQ $0x11, BX RORQ $0x11, BX
XORQ BP, BX XORQ SI, BX
RORQ $0x35, BP RORQ $0x35, SI
ADDQ BX, BP ADDQ BX, SI
ADDQ $0x00000008, CX ADDQ $0x00000008, CX
SUBQ $0x00000008, DX SUBQ $0x00000008, DX
@@ -94,7 +94,7 @@ shortTail7:
shortTail6: shortTail6:
MOVBQZX 5(CX), DX MOVBQZX 5(CX), DX
SHLQ $0x30, DX SHLQ $0x30, DX
ADDQ DX, BP ADDQ DX, SI
shortTail5: shortTail5:
MOVBQZX 4(CX), DX MOVBQZX 4(CX), DX
@@ -103,7 +103,7 @@ shortTail5:
shortTail4: shortTail4:
MOVLQZX (CX), DX MOVLQZX (CX), DX
ADDQ DX, BP ADDQ DX, SI
JMP shortAfter JMP shortAfter
shortTail3: shortTail3:
@@ -113,7 +113,7 @@ shortTail3:
shortTail2: shortTail2:
MOVWQZX (CX), DX MOVWQZX (CX), DX
ADDQ DX, BP ADDQ DX, SI
JMP shortAfter JMP shortAfter
shortTail1: shortTail1:
@@ -121,129 +121,129 @@ shortTail1:
ADDQ DX, BX ADDQ DX, BX
shortTail0: shortTail0:
RORQ $0x20, BP RORQ $0x20, SI
XORQ $0x000000ff, BP XORQ $0x000000ff, SI
shortAfter: shortAfter:
XORQ BX, BP XORQ BX, SI
RORQ $0x21, BX RORQ $0x21, BX
ADDQ BP, BX ADDQ SI, BX
ROLQ $0x11, BP ROLQ $0x11, SI
XORQ BX, BP XORQ BX, SI
ROLQ $0x2b, BX ROLQ $0x2b, BX
ADDQ BP, BX ADDQ SI, BX
ROLQ $0x1f, BP ROLQ $0x1f, SI
SUBQ BX, BP SUBQ BX, SI
ROLQ $0x0d, BX ROLQ $0x0d, BX
XORQ BP, BX XORQ SI, BX
SUBQ BX, BP SUBQ BX, SI
ROLQ $0x29, BX ROLQ $0x29, BX
ADDQ BP, BX ADDQ SI, BX
ROLQ $0x25, BP ROLQ $0x25, SI
XORQ BX, BP XORQ BX, SI
RORQ $0x27, BX RORQ $0x27, BX
ADDQ BP, BX ADDQ SI, BX
RORQ $0x0f, BP RORQ $0x0f, SI
ADDQ BX, BP ADDQ BX, SI
ROLQ $0x0f, BX ROLQ $0x0f, BX
XORQ BP, BX XORQ SI, BX
RORQ $0x05, BP RORQ $0x05, SI
XORQ BP, BX XORQ SI, BX
MOVQ BX, ret+32(FP) MOVQ BX, ret+32(FP)
RET RET
coreLong: coreLong:
MOVQ 16(AX), DI MOVQ 16(AX), R8
MOVQ 24(AX), AX MOVQ 24(AX), AX
MOVQ DX, SI MOVQ DX, DI
ADDQ $0x00000003, SI ADDQ $0x00000003, DI
MOVQ $0x8f29bd94edce7b39, R8 MOVQ $0x8f29bd94edce7b39, R9
IMULQ R8, SI IMULQ R9, DI
XORQ SI, DI XORQ DI, R8
MOVQ DX, SI MOVQ DX, DI
ADDQ $0x00000004, SI ADDQ $0x00000004, DI
MOVQ $0x9c1b8e1e9628323f, R8 MOVQ $0x9c1b8e1e9628323f, R9
IMULQ R8, SI IMULQ R9, DI
XORQ SI, AX XORQ DI, AX
block: block:
MOVQ (CX), SI MOVQ (CX), DI
MOVQ $0x00000000802910e3, R8 MOVQ $0x00000000802910e3, R9
IMULQ R8, SI IMULQ R9, DI
ADDQ SI, BX ADDQ DI, BX
ROLQ $0x39, BX ROLQ $0x39, BX
XORQ AX, BX XORQ AX, BX
MOVQ 8(CX), SI MOVQ 8(CX), DI
MOVQ $0x00000000819b13af, R8 MOVQ $0x00000000819b13af, R9
IMULQ R8, SI IMULQ R9, DI
ADDQ SI, BP ADDQ DI, SI
ROLQ $0x3f, BP ROLQ $0x3f, SI
XORQ DI, BP XORQ R8, SI
MOVQ 16(CX), SI MOVQ 16(CX), DI
MOVQ $0x0000000091cb27e5, R8 MOVQ $0x0000000091cb27e5, R9
IMULQ R8, SI IMULQ R9, DI
ADDQ SI, DI ADDQ DI, R8
RORQ $0x2f, DI RORQ $0x2f, R8
ADDQ BX, DI ADDQ BX, R8
MOVQ 24(CX), SI MOVQ 24(CX), DI
MOVQ $0x00000000c1a269c1, R8 MOVQ $0x00000000c1a269c1, R9
IMULQ R8, SI IMULQ R9, DI
ADDQ SI, AX ADDQ DI, AX
RORQ $0x0b, AX RORQ $0x0b, AX
SUBQ BP, AX SUBQ SI, AX
ADDQ $0x00000020, CX ADDQ $0x00000020, CX
SUBQ $0x00000020, DX SUBQ $0x00000020, DX
CMPQ DX, $0x00000020 CMPQ DX, $0x00000020
JGE block JGE block
MOVQ DX, R8 MOVQ DX, R9
MOVQ DX, SI MOVQ DX, DI
SHRQ $0x03, SI SHRQ $0x03, DI
CMPQ SI, $0x00000000 CMPQ DI, $0x00000000
JE longCore0 JE longCore0
CMPQ SI, $0x00000001 CMPQ DI, $0x00000001
JE longCore1 JE longCore1
CMPQ SI, $0x00000002 CMPQ DI, $0x00000002
JE longCore2 JE longCore2
CMPQ SI, $0x00000003 CMPQ DI, $0x00000003
JE longCore3 JE longCore3
longCore3: longCore3:
MOVQ (CX), SI MOVQ (CX), DI
MOVQ $0x00000000802910e3, R9 MOVQ $0x00000000802910e3, R10
IMULQ R9, SI IMULQ R10, DI
ADDQ SI, BX ADDQ DI, BX
ROLQ $0x39, BX ROLQ $0x39, BX
XORQ AX, BX XORQ AX, BX
ADDQ $0x00000008, CX ADDQ $0x00000008, CX
SUBQ $0x00000008, DX SUBQ $0x00000008, DX
longCore2: longCore2:
MOVQ (CX), SI MOVQ (CX), DI
MOVQ $0x00000000819b13af, R9 MOVQ $0x00000000819b13af, R10
IMULQ R9, SI IMULQ R10, DI
ADDQ SI, BP ADDQ DI, SI
ROLQ $0x3f, BP ROLQ $0x3f, SI
XORQ DI, BP XORQ R8, SI
ADDQ $0x00000008, CX ADDQ $0x00000008, CX
SUBQ $0x00000008, DX SUBQ $0x00000008, DX
longCore1: longCore1:
MOVQ (CX), SI MOVQ (CX), DI
MOVQ $0x0000000091cb27e5, R9 MOVQ $0x0000000091cb27e5, R10
IMULQ R9, SI IMULQ R10, DI
ADDQ SI, DI ADDQ DI, R8
RORQ $0x2f, DI RORQ $0x2f, R8
ADDQ BX, DI ADDQ BX, R8
ADDQ $0x00000008, CX ADDQ $0x00000008, CX
SUBQ $0x00000008, DX SUBQ $0x00000008, DX
longCore0: longCore0:
RORQ $0x0b, AX RORQ $0x0b, AX
SUBQ BP, AX SUBQ SI, AX
ADDQ $0x00000001, R8 ADDQ $0x00000001, R9
MOVQ $0x9c1b8e1e9628323f, SI MOVQ $0x9c1b8e1e9628323f, DI
IMULQ SI, R8 IMULQ DI, R9
XORQ R8, BX XORQ R9, BX
CMPQ DX, $0x00000000 CMPQ DX, $0x00000000
JE longTail0 JE longTail0
CMPQ DX, $0x00000001 CMPQ DX, $0x00000001
@@ -263,22 +263,22 @@ longCore0:
longTail7: longTail7:
MOVBQZX 6(CX), DX MOVBQZX 6(CX), DX
ADDQ DX, BP ADDQ DX, SI
longTail6: longTail6:
MOVWQZX 4(CX), DX MOVWQZX 4(CX), DX
ADDQ DX, DI ADDQ DX, R8
MOVLQZX (CX), DX MOVLQZX (CX), DX
ADDQ DX, AX ADDQ DX, AX
JMP longAfter JMP longAfter
longTail5: longTail5:
MOVBQZX 4(CX), DX MOVBQZX 4(CX), DX
ADDQ DX, BP ADDQ DX, SI
longTail4: longTail4:
MOVLQZX (CX), DX MOVLQZX (CX), DX
ADDQ DX, DI ADDQ DX, R8
JMP longAfter JMP longAfter
longTail3: longTail3:
@@ -287,52 +287,52 @@ longTail3:
longTail2: longTail2:
MOVWQZX (CX), DX MOVWQZX (CX), DX
ADDQ DX, BP ADDQ DX, SI
JMP longAfter JMP longAfter
longTail1: longTail1:
MOVBQZX (CX), DX MOVBQZX (CX), DX
ADDQ DX, DI ADDQ DX, R8
longTail0: longTail0:
ROLQ $0x20, AX ROLQ $0x20, AX
XORQ $0x000000ff, AX XORQ $0x000000ff, AX
longAfter: longAfter:
SUBQ DI, BP SUBQ R8, SI
RORQ $0x13, BX RORQ $0x13, BX
SUBQ BX, BP SUBQ BX, SI
RORQ $0x35, BP RORQ $0x35, SI
XORQ BP, AX XORQ SI, AX
SUBQ AX, BX SUBQ AX, BX
ROLQ $0x2b, AX ROLQ $0x2b, AX
ADDQ AX, BX ADDQ AX, BX
RORQ $0x03, BX RORQ $0x03, BX
SUBQ BX, AX SUBQ BX, AX
RORQ $0x2b, DI RORQ $0x2b, R8
SUBQ AX, DI SUBQ AX, R8
ROLQ $0x37, DI ROLQ $0x37, R8
XORQ BX, DI XORQ BX, R8
SUBQ DI, BP SUBQ R8, SI
RORQ $0x07, AX RORQ $0x07, AX
SUBQ DI, AX SUBQ R8, AX
RORQ $0x1f, DI RORQ $0x1f, R8
ADDQ DI, AX ADDQ R8, AX
SUBQ BP, DI SUBQ SI, R8
RORQ $0x27, AX RORQ $0x27, AX
XORQ AX, DI XORQ AX, R8
RORQ $0x11, AX RORQ $0x11, AX
XORQ DI, AX XORQ R8, AX
ADDQ AX, BP ADDQ AX, SI
RORQ $0x09, BP RORQ $0x09, SI
XORQ BP, DI XORQ SI, R8
ROLQ $0x18, DI ROLQ $0x18, R8
XORQ DI, AX XORQ R8, AX
RORQ $0x3b, AX RORQ $0x3b, AX
RORQ $0x01, BX RORQ $0x01, BX
SUBQ BP, BX SUBQ SI, BX
XORQ BP, BX XORQ SI, BX
XORQ AX, DI XORQ AX, R8
XORQ DI, BX XORQ R8, BX
MOVQ BX, ret+32(FP) MOVQ BX, ret+32(FP)
RET RET

View File

@@ -74,7 +74,7 @@ func Liveness(fn *ir.Function) error {
// AllocateRegisters performs register allocation. // AllocateRegisters performs register allocation.
func AllocateRegisters(fn *ir.Function) error { func AllocateRegisters(fn *ir.Function) error {
// Populate allocators (one per kind). // Initialize one allocator per kind.
as := map[reg.Kind]*Allocator{} as := map[reg.Kind]*Allocator{}
for _, i := range fn.Instructions() { for _, i := range fn.Instructions() {
for _, r := range i.Registers() { for _, r := range i.Registers() {
@@ -86,7 +86,28 @@ func AllocateRegisters(fn *ir.Function) error {
} }
as[k] = a as[k] = a
} }
as[k].Add(r.ID()) }
}
// De-prioritize the base pointer register. This can be used as a general
// purpose register, but it's callee-save so needs to be saved/restored if
// it is clobbered. For this reason we prefer to avoid using it unless
// forced to by register pressure.
for k, a := range as {
f := reg.FamilyOfKind(k)
for _, r := range f.Registers() {
if (r.Info() & reg.BasePointer) != 0 {
// Negative priority penalizes this register relative to all
// others (having default zero priority).
a.SetPriority(r.ID(), -1)
}
}
}
// Populate registers to be allocated.
for _, i := range fn.Instructions() {
for _, r := range i.Registers() {
as[r.Kind()].Add(r.ID())
} }
} }

View File

@@ -106,6 +106,51 @@ func ConstructLiveness(t *testing.T, ctx *build.Context) *ir.Function {
return BuildFunction(t, ctx, pass.LabelTarget, pass.CFG, pass.Liveness) return BuildFunction(t, ctx, pass.LabelTarget, pass.CFG, pass.Liveness)
} }
// TestAllocateRegistersBasePointerDeprioritized checks that register
// allocation leaves the base pointer unused when a function's register demand
// can be met without it.
func TestAllocateRegistersBasePointerDeprioritized(t *testing.T) {
	// Construct a function that requires n general-purpose registers all live
	// at once. Choose n to be the maximal possible number of registers without
	// touching the base pointer.
	n := 14

	ctx := build.NewContext()
	ctx.Function("sum")
	ctx.SignatureExpr("func() uint64")

	// Write every virtual register before any of them is read, so that all n
	// values are live at the same time.
	x := make([]reg.GPVirtual, n)
	for i := 0; i < n; i++ {
		x[i] = ctx.GP64()
		ctx.MOVQ(operand.U64(i), x[i])
	}

	// Accumulate everything into x[0], which becomes the return value.
	for i := 1; i < n; i++ {
		ctx.ADDQ(x[i], x[0])
	}

	ctx.Store(x[0], ctx.ReturnIndex(0))
	ctx.RET()

	// Build and compile the function up to register allocation.
	fn := BuildFunction(t, ctx, pass.LabelTarget, pass.CFG, pass.Liveness, pass.AllocateRegisters, pass.BindRegisters)

	// Collect the distinct physical registers written by the function.
	ps := map[reg.Physical]bool{}
	for _, i := range fn.Instructions() {
		for _, r := range i.OutputRegisters() {
			ps[reg.ToPhysical(r)] = true
		}
	}

	// Verify this function uses n registers, but not the base pointer. Report
	// the observed count as well so a failure is diagnosable from the log.
	if len(ps) != n {
		t.Fatalf("expected function to require %d registers; got %d", n, len(ps))
	}

	for p := range ps {
		if (p.Info() & reg.BasePointer) != 0 {
			t.Fatal("base pointer used")
		}
	}
}
func TestEnsureBasePointerCalleeSavedFrameless(t *testing.T) { func TestEnsureBasePointerCalleeSavedFrameless(t *testing.T) {
// Construct a function that writes to the base pointer. // Construct a function that writes to the base pointer.
ctx := build.NewContext() ctx := build.NewContext()

View File

@@ -8,17 +8,17 @@ TEXT ·GP8(SB), NOSPLIT, $8-1
MOVB $0x02, CL MOVB $0x02, CL
MOVB $0x03, DL MOVB $0x03, DL
MOVB $0x04, BL MOVB $0x04, BL
MOVB $0x05, BP MOVB $0x05, SI
MOVB $0x06, SI MOVB $0x06, DI
MOVB $0x07, DI MOVB $0x07, R8
MOVB $0x08, R8 MOVB $0x08, R9
MOVB $0x09, R9 MOVB $0x09, R10
MOVB $0x0a, R10 MOVB $0x0a, R11
MOVB $0x0b, R11 MOVB $0x0b, R12
MOVB $0x0c, R12 MOVB $0x0c, R13
MOVB $0x0d, R13 MOVB $0x0d, R14
MOVB $0x0e, R14 MOVB $0x0e, R15
MOVB $0x0f, R15 MOVB $0x0f, BP
MOVB $0x10, AH MOVB $0x10, AH
MOVB $0x11, CH MOVB $0x11, CH
MOVB $0x12, DH MOVB $0x12, DH
@@ -26,7 +26,6 @@ TEXT ·GP8(SB), NOSPLIT, $8-1
ADDB CL, AL ADDB CL, AL
ADDB DL, AL ADDB DL, AL
ADDB BL, AL ADDB BL, AL
ADDB BP, AL
ADDB SI, AL ADDB SI, AL
ADDB DI, AL ADDB DI, AL
ADDB R8, AL ADDB R8, AL
@@ -37,6 +36,7 @@ TEXT ·GP8(SB), NOSPLIT, $8-1
ADDB R13, AL ADDB R13, AL
ADDB R14, AL ADDB R14, AL
ADDB R15, AL ADDB R15, AL
ADDB BP, AL
ADDB AH, AL ADDB AH, AL
ADDB CH, AL ADDB CH, AL
ADDB DH, AL ADDB DH, AL

View File

@@ -8,36 +8,35 @@ TEXT ·Masks(SB), NOSPLIT, $8-16
MOVQ $0x0002002a, CX MOVQ $0x0002002a, CX
MOVQ $0x0003002a, DX MOVQ $0x0003002a, DX
MOVQ $0x0004002a, BX MOVQ $0x0004002a, BX
MOVQ $0x0005002a, BP MOVQ $0x0005002a, SI
MOVQ $0x0006002a, SI MOVQ $0x0006002a, DI
MOVQ $0x0007002a, DI MOVQ $0x0007002a, R8
MOVQ $0x0008002a, R8 MOVQ $0x0008002a, R9
MOVQ $0x0009002a, R9 MOVQ $0x0009002a, R10
MOVQ $0x000a002a, R10 MOVQ $0x000a002a, R11
MOVQ $0x000b002a, R11 MOVQ $0x000b002a, R12
MOVQ $0x000c002a, R12 MOVQ $0x000c002a, R13
MOVQ $0x000d002a, R13 MOVQ $0x000d002a, R14
MOVQ $0x000e002a, R14 MOVQ $0x000e002a, R15
MOVQ $0x000f002a, R15 MOVQ $0x000f002a, BP
MOVW $0x0001, AX MOVW $0x0001, AX
MOVW $0x0002, CX MOVW $0x0002, CX
MOVW $0x0003, DX MOVW $0x0003, DX
MOVW $0x0004, BX MOVW $0x0004, BX
MOVW $0x0005, BP MOVW $0x0005, SI
MOVW $0x0006, SI MOVW $0x0006, DI
MOVW $0x0007, DI MOVW $0x0007, R8
MOVW $0x0008, R8 MOVW $0x0008, R9
MOVW $0x0009, R9 MOVW $0x0009, R10
MOVW $0x000a, R10 MOVW $0x000a, R11
MOVW $0x000b, R11 MOVW $0x000b, R12
MOVW $0x000c, R12 MOVW $0x000c, R13
MOVW $0x000d, R13 MOVW $0x000d, R14
MOVW $0x000e, R14 MOVW $0x000e, R15
MOVW $0x000f, R15 MOVW $0x000f, BP
ADDW CX, AX ADDW CX, AX
ADDW DX, AX ADDW DX, AX
ADDW BX, AX ADDW BX, AX
ADDW BP, AX
ADDW SI, AX ADDW SI, AX
ADDW DI, AX ADDW DI, AX
ADDW R8, AX ADDW R8, AX
@@ -48,12 +47,12 @@ TEXT ·Masks(SB), NOSPLIT, $8-16
ADDW R13, AX ADDW R13, AX
ADDW R14, AX ADDW R14, AX
ADDW R15, AX ADDW R15, AX
ADDW BP, AX
MOVW AX, ret+0(FP) MOVW AX, ret+0(FP)
MOVW $0x0000, AX MOVW $0x0000, AX
MOVW $0x0000, CX MOVW $0x0000, CX
MOVW $0x0000, DX MOVW $0x0000, DX
MOVW $0x0000, BX MOVW $0x0000, BX
MOVW $0x0000, BP
MOVW $0x0000, SI MOVW $0x0000, SI
MOVW $0x0000, DI MOVW $0x0000, DI
MOVW $0x0000, R8 MOVW $0x0000, R8
@@ -64,10 +63,10 @@ TEXT ·Masks(SB), NOSPLIT, $8-16
MOVW $0x0000, R13 MOVW $0x0000, R13
MOVW $0x0000, R14 MOVW $0x0000, R14
MOVW $0x0000, R15 MOVW $0x0000, R15
MOVW $0x0000, BP
ADDQ CX, AX ADDQ CX, AX
ADDQ DX, AX ADDQ DX, AX
ADDQ BX, AX ADDQ BX, AX
ADDQ BP, AX
ADDQ SI, AX ADDQ SI, AX
ADDQ DI, AX ADDQ DI, AX
ADDQ R8, AX ADDQ R8, AX
@@ -78,6 +77,7 @@ TEXT ·Masks(SB), NOSPLIT, $8-16
ADDQ R13, AX ADDQ R13, AX
ADDQ R14, AX ADDQ R14, AX
ADDQ R15, AX ADDQ R15, AX
ADDQ BP, AX
SHRQ $0x10, AX SHRQ $0x10, AX
MOVQ AX, ret1+8(FP) MOVQ AX, ret1+8(FP)
RET RET

View File

@@ -11,7 +11,6 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8
MOVQ $0x9e77d78aacb8cbcc, CX MOVQ $0x9e77d78aacb8cbcc, CX
MOVQ $0x9e77d78aacb8cbcc, DX MOVQ $0x9e77d78aacb8cbcc, DX
MOVQ $0x9e77d78aacb8cbcc, BX MOVQ $0x9e77d78aacb8cbcc, BX
MOVQ $0x9e77d78aacb8cbcc, BP
MOVQ $0x9e77d78aacb8cbcc, SI MOVQ $0x9e77d78aacb8cbcc, SI
MOVQ $0x9e77d78aacb8cbcc, DI MOVQ $0x9e77d78aacb8cbcc, DI
MOVQ $0x9e77d78aacb8cbcc, R8 MOVQ $0x9e77d78aacb8cbcc, R8
@@ -22,10 +21,10 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8
MOVQ $0x9e77d78aacb8cbcc, R13 MOVQ $0x9e77d78aacb8cbcc, R13
MOVQ $0x9e77d78aacb8cbcc, R14 MOVQ $0x9e77d78aacb8cbcc, R14
MOVQ $0x9e77d78aacb8cbcc, R15 MOVQ $0x9e77d78aacb8cbcc, R15
MOVQ $0x9e77d78aacb8cbcc, BP
MOVQ $0x9e77d78aacb8cbcc, CX MOVQ $0x9e77d78aacb8cbcc, CX
MOVQ $0x9e77d78aacb8cbcc, DX MOVQ $0x9e77d78aacb8cbcc, DX
MOVQ $0x9e77d78aacb8cbcc, BX MOVQ $0x9e77d78aacb8cbcc, BX
MOVQ $0x9e77d78aacb8cbcc, BP
MOVQ $0x9e77d78aacb8cbcc, SI MOVQ $0x9e77d78aacb8cbcc, SI
MOVQ $0x9e77d78aacb8cbcc, DI MOVQ $0x9e77d78aacb8cbcc, DI
MOVQ $0x9e77d78aacb8cbcc, R8 MOVQ $0x9e77d78aacb8cbcc, R8
@@ -36,10 +35,10 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8
MOVQ $0x9e77d78aacb8cbcc, R13 MOVQ $0x9e77d78aacb8cbcc, R13
MOVQ $0x9e77d78aacb8cbcc, R14 MOVQ $0x9e77d78aacb8cbcc, R14
MOVQ $0x9e77d78aacb8cbcc, R15 MOVQ $0x9e77d78aacb8cbcc, R15
MOVQ $0x9e77d78aacb8cbcc, BP
MOVQ $0x9e77d78aacb8cbcc, CX MOVQ $0x9e77d78aacb8cbcc, CX
MOVQ $0x9e77d78aacb8cbcc, DX MOVQ $0x9e77d78aacb8cbcc, DX
MOVQ $0x9e77d78aacb8cbcc, BX MOVQ $0x9e77d78aacb8cbcc, BX
MOVQ $0x9e77d78aacb8cbcc, BP
MOVQ $0x9e77d78aacb8cbcc, SI MOVQ $0x9e77d78aacb8cbcc, SI
MOVQ $0x9e77d78aacb8cbcc, DI MOVQ $0x9e77d78aacb8cbcc, DI
MOVQ $0x9e77d78aacb8cbcc, R8 MOVQ $0x9e77d78aacb8cbcc, R8
@@ -50,26 +49,26 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8
MOVQ $0x9e77d78aacb8cbcc, R13 MOVQ $0x9e77d78aacb8cbcc, R13
MOVQ $0x9e77d78aacb8cbcc, R14 MOVQ $0x9e77d78aacb8cbcc, R14
MOVQ $0x9e77d78aacb8cbcc, R15 MOVQ $0x9e77d78aacb8cbcc, R15
MOVQ $0x9e77d78aacb8cbcc, BP
// Iteration 1. // Iteration 1.
MOVL $0x00000001, CX MOVL $0x00000001, CX
MOVL $0x00000002, DX MOVL $0x00000002, DX
MOVL $0x00000003, BX MOVL $0x00000003, BX
MOVL $0x00000004, BP MOVL $0x00000004, SI
MOVL $0x00000005, SI MOVL $0x00000005, DI
MOVL $0x00000006, DI MOVL $0x00000006, R8
MOVL $0x00000007, R8 MOVL $0x00000007, R9
MOVL $0x00000008, R9 MOVL $0x00000008, R10
MOVL $0x00000009, R10 MOVL $0x00000009, R11
MOVL $0x0000000a, R11 MOVL $0x0000000a, R12
MOVL $0x0000000b, R12 MOVL $0x0000000b, R13
MOVL $0x0000000c, R13 MOVL $0x0000000c, R14
MOVL $0x0000000d, R14 MOVL $0x0000000d, R15
MOVL $0x0000000e, R15 MOVL $0x0000000e, BP
ADDQ CX, AX ADDQ CX, AX
ADDQ DX, AX ADDQ DX, AX
ADDQ BX, AX ADDQ BX, AX
ADDQ BP, AX
ADDQ SI, AX ADDQ SI, AX
ADDQ DI, AX ADDQ DI, AX
ADDQ R8, AX ADDQ R8, AX
@@ -80,26 +79,26 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8
ADDQ R13, AX ADDQ R13, AX
ADDQ R14, AX ADDQ R14, AX
ADDQ R15, AX ADDQ R15, AX
ADDQ BP, AX
// Iteration 2. // Iteration 2.
MOVL $0x0000000f, CX MOVL $0x0000000f, CX
MOVL $0x00000010, DX MOVL $0x00000010, DX
MOVL $0x00000011, BX MOVL $0x00000011, BX
MOVL $0x00000012, BP MOVL $0x00000012, SI
MOVL $0x00000013, SI MOVL $0x00000013, DI
MOVL $0x00000014, DI MOVL $0x00000014, R8
MOVL $0x00000015, R8 MOVL $0x00000015, R9
MOVL $0x00000016, R9 MOVL $0x00000016, R10
MOVL $0x00000017, R10 MOVL $0x00000017, R11
MOVL $0x00000018, R11 MOVL $0x00000018, R12
MOVL $0x00000019, R12 MOVL $0x00000019, R13
MOVL $0x0000001a, R13 MOVL $0x0000001a, R14
MOVL $0x0000001b, R14 MOVL $0x0000001b, R15
MOVL $0x0000001c, R15 MOVL $0x0000001c, BP
ADDQ CX, AX ADDQ CX, AX
ADDQ DX, AX ADDQ DX, AX
ADDQ BX, AX ADDQ BX, AX
ADDQ BP, AX
ADDQ SI, AX ADDQ SI, AX
ADDQ DI, AX ADDQ DI, AX
ADDQ R8, AX ADDQ R8, AX
@@ -110,26 +109,26 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8
ADDQ R13, AX ADDQ R13, AX
ADDQ R14, AX ADDQ R14, AX
ADDQ R15, AX ADDQ R15, AX
ADDQ BP, AX
// Iteration 3. // Iteration 3.
MOVL $0x0000001d, CX MOVL $0x0000001d, CX
MOVL $0x0000001e, DX MOVL $0x0000001e, DX
MOVL $0x0000001f, BX MOVL $0x0000001f, BX
MOVL $0x00000020, BP MOVL $0x00000020, SI
MOVL $0x00000021, SI MOVL $0x00000021, DI
MOVL $0x00000022, DI MOVL $0x00000022, R8
MOVL $0x00000023, R8 MOVL $0x00000023, R9
MOVL $0x00000024, R9 MOVL $0x00000024, R10
MOVL $0x00000025, R10 MOVL $0x00000025, R11
MOVL $0x00000026, R11 MOVL $0x00000026, R12
MOVL $0x00000027, R12 MOVL $0x00000027, R13
MOVL $0x00000028, R13 MOVL $0x00000028, R14
MOVL $0x00000029, R14 MOVL $0x00000029, R15
MOVL $0x0000002a, R15 MOVL $0x0000002a, BP
ADDQ CX, AX ADDQ CX, AX
ADDQ DX, AX ADDQ DX, AX
ADDQ BX, AX ADDQ BX, AX
ADDQ BP, AX
ADDQ SI, AX ADDQ SI, AX
ADDQ DI, AX ADDQ DI, AX
ADDQ R8, AX ADDQ R8, AX
@@ -140,6 +139,7 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8
ADDQ R13, AX ADDQ R13, AX
ADDQ R14, AX ADDQ R14, AX
ADDQ R15, AX ADDQ R15, AX
ADDQ BP, AX
// Store result and return. // Store result and return.
MOVQ AX, ret+0(FP) MOVQ AX, ret+0(FP)

File diff suppressed because it is too large Load Diff