From e089a6c93c127d2251230ea6c4a2a6fdcfb046fc Mon Sep 17 00:00:00 2001 From: Michael McLoughlin Date: Mon, 27 Jan 2020 21:05:33 -0800 Subject: [PATCH] tests/fixedbugs: regression test for issue 100 (#129) Adds a regression test based on klauspost/compress#186. This necessitated some related changes: * Mark "RET" as a terminal instruction * printer refactor to maintain compatibility with asmfmt * Tweaks to other regression tests to ensure they are run correctly in CI Updates #100 #65 #8 --- examples/stadtx/stadtx.s | 92 +- internal/gen/ctors.go | 4 + ir/ir.go | 5 + pass/cfg.go | 2 +- printer/goasm.go | 71 +- printer/goasm_test.go | 24 + tests/fixedbugs/issue100/allocfail/LICENSE | 28 + tests/fixedbugs/issue100/allocfail/README.md | 11 + .../fixedbugs/issue100/allocfail/allocfail.s | 9825 +++++++++++++++++ tests/fixedbugs/issue100/allocfail/asm.go | 1586 +++ tests/fixedbugs/issue100/allocfail/doc.go | 9 + tests/fixedbugs/issue100/allocfail/stubs.go | 85 + tests/fixedbugs/issue100/doc.go | 2 - .../fixedbugs/issue100/{ => minrepro}/asm.go | 0 tests/fixedbugs/issue100/minrepro/doc.go | 2 + .../{issue100.s => minrepro/minrepro.s} | 2 +- .../minrepro_test.go} | 4 +- tests/fixedbugs/issue100/minrepro/stub.go | 5 + tests/fixedbugs/issue100/stub.go | 5 - .../issue65/{castphysical.go => asm.go} | 9 +- tests/fixedbugs/issue65/doc.go | 9 + tests/fixedbugs/issue65/issue65.s | 9 + tests/fixedbugs/issue65/stub.go | 5 + x86/zctors.go | 9 +- 24 files changed, 11711 insertions(+), 92 deletions(-) create mode 100644 tests/fixedbugs/issue100/allocfail/LICENSE create mode 100644 tests/fixedbugs/issue100/allocfail/README.md create mode 100644 tests/fixedbugs/issue100/allocfail/allocfail.s create mode 100644 tests/fixedbugs/issue100/allocfail/asm.go create mode 100644 tests/fixedbugs/issue100/allocfail/doc.go create mode 100644 tests/fixedbugs/issue100/allocfail/stubs.go delete mode 100644 tests/fixedbugs/issue100/doc.go rename tests/fixedbugs/issue100/{ => minrepro}/asm.go (100%) create 
mode 100644 tests/fixedbugs/issue100/minrepro/doc.go rename tests/fixedbugs/issue100/{issue100.s => minrepro/minrepro.s} (98%) rename tests/fixedbugs/issue100/{issue100_test.go => minrepro/minrepro_test.go} (72%) create mode 100644 tests/fixedbugs/issue100/minrepro/stub.go delete mode 100644 tests/fixedbugs/issue100/stub.go rename tests/fixedbugs/issue65/{castphysical.go => asm.go} (50%) create mode 100644 tests/fixedbugs/issue65/doc.go create mode 100644 tests/fixedbugs/issue65/issue65.s create mode 100644 tests/fixedbugs/issue65/stub.go diff --git a/examples/stadtx/stadtx.s b/examples/stadtx/stadtx.s index e940d19..58fede0 100644 --- a/examples/stadtx/stadtx.s +++ b/examples/stadtx/stadtx.s @@ -33,10 +33,10 @@ TEXT ·Hash(SB), NOSPLIT, $0-40 JE shortCore3 shortCore3: - MOVQ (CX), SI - MOVQ $0x9c1b8e1e9628323f, DI - IMULQ DI, SI - ADDQ SI, BX + MOVQ (CX), AX + MOVQ $0x9c1b8e1e9628323f, SI + IMULQ SI, AX + ADDQ AX, BX RORQ $0x11, BX XORQ BP, BX RORQ $0x35, BP @@ -45,10 +45,10 @@ shortCore3: SUBQ $0x00000008, DX shortCore2: - MOVQ (CX), SI - MOVQ $0x9c1b8e1e9628323f, DI - IMULQ DI, SI - ADDQ SI, BX + MOVQ (CX), AX + MOVQ $0x9c1b8e1e9628323f, SI + IMULQ SI, AX + ADDQ AX, BX RORQ $0x11, BX XORQ BP, BX RORQ $0x35, BP @@ -57,10 +57,10 @@ shortCore2: SUBQ $0x00000008, DX shortCore1: - MOVQ (CX), SI - MOVQ $0x9c1b8e1e9628323f, DI - IMULQ DI, SI - ADDQ SI, BX + MOVQ (CX), AX + MOVQ $0x9c1b8e1e9628323f, SI + IMULQ SI, AX + ADDQ AX, BX RORQ $0x11, BX XORQ BP, BX RORQ $0x35, BP @@ -87,38 +87,38 @@ shortCore0: JE shortTail7 shortTail7: - MOVBQZX 6(CX), SI - SHLQ $0x20, SI - ADDQ SI, BX + MOVBQZX 6(CX), DX + SHLQ $0x20, DX + ADDQ DX, BX shortTail6: - MOVBQZX 5(CX), SI - SHLQ $0x30, SI - ADDQ SI, BP + MOVBQZX 5(CX), DX + SHLQ $0x30, DX + ADDQ DX, BP shortTail5: - MOVBQZX 4(CX), SI - SHLQ $0x10, SI - ADDQ SI, BX + MOVBQZX 4(CX), DX + SHLQ $0x10, DX + ADDQ DX, BX shortTail4: - MOVLQZX (CX), SI - ADDQ SI, BP + MOVLQZX (CX), DX + ADDQ DX, BP JMP shortAfter shortTail3: - MOVBQZX 
2(CX), SI - SHLQ $0x30, SI - ADDQ SI, BX + MOVBQZX 2(CX), DX + SHLQ $0x30, DX + ADDQ DX, BX shortTail2: - MOVWQZX (CX), SI - ADDQ SI, BP + MOVWQZX (CX), DX + ADDQ DX, BP JMP shortAfter shortTail1: - MOVBQZX (CX), SI - ADDQ SI, BX + MOVBQZX (CX), DX + ADDQ DX, BX shortTail0: RORQ $0x20, BP @@ -262,37 +262,37 @@ longCore0: JE longTail7 longTail7: - MOVBQZX 6(CX), SI - ADDQ SI, BP + MOVBQZX 6(CX), DX + ADDQ DX, BP longTail6: - MOVWQZX 4(CX), SI - ADDQ SI, DI - MOVLQZX (CX), SI - ADDQ SI, AX + MOVWQZX 4(CX), DX + ADDQ DX, DI + MOVLQZX (CX), DX + ADDQ DX, AX JMP longAfter longTail5: - MOVBQZX 4(CX), SI - ADDQ SI, BP + MOVBQZX 4(CX), DX + ADDQ DX, BP longTail4: - MOVLQZX (CX), SI - ADDQ SI, DI + MOVLQZX (CX), DX + ADDQ DX, DI JMP longAfter longTail3: - MOVBQZX 2(CX), SI - ADDQ SI, AX + MOVBQZX 2(CX), DX + ADDQ DX, AX longTail2: - MOVWQZX (CX), SI - ADDQ SI, BP + MOVWQZX (CX), DX + ADDQ DX, BP JMP longAfter longTail1: - MOVBQZX (CX), SI - ADDQ SI, DI + MOVBQZX (CX), DX + ADDQ DX, DI longTail0: ROLQ $0x20, AX diff --git a/internal/gen/ctors.go b/internal/gen/ctors.go index a9592d0..e48a5dc 100644 --- a/internal/gen/ctors.go +++ b/internal/gen/ctors.go @@ -98,6 +98,10 @@ func construct(i inst.Instruction, f inst.Form, s signature) string { } // Branch variables. + if i.IsTerminal() { + fmt.Fprintf(buf, "\tIsTerminal: true,\n") + } + if i.IsBranch() { fmt.Fprintf(buf, "\tIsBranch: true,\n") fmt.Fprintf(buf, "\tIsConditional: %#v,\n", i.IsConditionalBranch()) diff --git a/ir/ir.go b/ir/ir.go index c5b52a1..6fb9216 100644 --- a/ir/ir.go +++ b/ir/ir.go @@ -61,6 +61,11 @@ type Instruction struct { func (i *Instruction) node() {} +// IsUnconditionalBranch reports whether i is an unconditional branch. +func (i Instruction) IsUnconditionalBranch() bool { + return i.IsBranch && !i.IsConditional +} + // TargetLabel returns the label referenced by this instruction. Returns nil if // no label is referenced. 
func (i Instruction) TargetLabel() *Label { diff --git a/pass/cfg.go b/pass/cfg.go index f308e2e..d5f6ea4 100644 --- a/pass/cfg.go +++ b/pass/cfg.go @@ -62,7 +62,7 @@ func CFG(fn *ir.Function) error { // Otherwise, could continue to the following instruction. switch { case cur.IsTerminal: - case cur.IsBranch && !cur.IsConditional: + case cur.IsUnconditionalBranch(): default: cur.Succ = append(cur.Succ, nxt) } diff --git a/printer/goasm.go b/printer/goasm.go index 8441916..0d8a12c 100644 --- a/printer/goasm.go +++ b/printer/goasm.go @@ -1,10 +1,8 @@ package printer import ( - "fmt" "strconv" "strings" - "text/tabwriter" "github.com/mmcloughlin/avo/internal/prnt" "github.com/mmcloughlin/avo/ir" @@ -17,6 +15,9 @@ const dot = "\u00b7" type goasm struct { cfg Config prnt.Generator + + instructions []*ir.Instruction + clear bool } // NewGoAsm constructs a printer for writing Go assembly files. @@ -87,31 +88,21 @@ func (p *goasm) function(f *ir.Function) { } p.Printf(", %s\n", textsize(f)) - w := p.tabwriter() - clear := true - flush := func() { - w.Flush() - w = p.tabwriter() - if !clear { - p.NL() - clear = true - } - } + p.clear = true for _, node := range f.Nodes { switch n := node.(type) { case *ir.Instruction: - leader := []byte{tabwriter.Escape, '\t', tabwriter.Escape} - fmt.Fprint(w, string(leader)+n.Opcode) - if len(n.Operands) > 0 { - fmt.Fprintf(w, "\t%s", joinOperands(n.Operands)) + p.instruction(n) + if n.IsTerminal || n.IsUnconditionalBranch() { + p.flush() } - fmt.Fprint(w, "\n") - clear = false case ir.Label: - flush() + p.flush() + p.ensureclear() p.Printf("%s:\n", n) case *ir.Comment: - flush() + p.flush() + p.ensureclear() for _, line := range n.Lines { p.Printf("\t// %s\n", line) } @@ -119,11 +110,45 @@ func (p *goasm) function(f *ir.Function) { panic("unexpected node type") } } - w.Flush() + p.flush() } -func (p *goasm) tabwriter() *tabwriter.Writer { - return tabwriter.NewWriter(p.Raw(), 4, 4, 1, ' ', tabwriter.StripEscape) +func (p *goasm) 
instruction(i *ir.Instruction) { + p.instructions = append(p.instructions, i) + p.clear = false +} + +func (p *goasm) flush() { + if len(p.instructions) == 0 { + return + } + + // Determine instruction width. Instructions with no operands are not + // considered in this calculation. + width := 0 + for _, i := range p.instructions { + if len(i.Operands) > 0 && len(i.Opcode) > width { + width = len(i.Opcode) + } + } + + // Output instruction block. + for _, i := range p.instructions { + if len(i.Operands) > 0 { + p.Printf("\t%-*s%s\n", width+1, i.Opcode, joinOperands(i.Operands)) + } else { + p.Printf("\t%s\n", i.Opcode) + } + } + + p.instructions = nil +} + +func (p *goasm) ensureclear() { + if !p.clear { + p.NL() + p.clear = true + } } func (p *goasm) global(g *ir.Global) { diff --git a/printer/goasm_test.go b/printer/goasm_test.go index 207a8b2..cfae0f0 100644 --- a/printer/goasm_test.go +++ b/printer/goasm_test.go @@ -81,3 +81,27 @@ func TestConstraints(t *testing.T) { "", }) } + +func TestAlignmentNoOperands(t *testing.T) { + ctx := build.NewContext() + ctx.Function("alignment") + ctx.SignatureExpr("func()") + ctx.ADDQ(reg.RAX, reg.RBX) + ctx.VMOVDQU(reg.Y4, reg.Y11) + ctx.VZEROUPPER() + ctx.ADDQ(reg.R9, reg.R13) + ctx.RET() + + AssertPrintsLines(t, ctx, printer.NewGoAsm, []string{ + "// Code generated by avo. DO NOT EDIT.", + "", + "// func alignment()", + "TEXT ·alignment(SB), $0", + "\tADDQ AX, BX", + "\tVMOVDQU Y4, Y11", + "\tVZEROUPPER", // instruction with no alignment doesn't affect width + "\tADDQ R9, R13", // retains alignment from above + "\tRET", + "", + }) +} diff --git a/tests/fixedbugs/issue100/allocfail/LICENSE b/tests/fixedbugs/issue100/allocfail/LICENSE new file mode 100644 index 0000000..1d2d645 --- /dev/null +++ b/tests/fixedbugs/issue100/allocfail/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. +Copyright (c) 2019 Klaus Post. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/tests/fixedbugs/issue100/allocfail/README.md b/tests/fixedbugs/issue100/allocfail/README.md new file mode 100644 index 0000000..94044d8 --- /dev/null +++ b/tests/fixedbugs/issue100/allocfail/README.md @@ -0,0 +1,11 @@ +Regression test for [issue +#100](https://github.com/mmcloughlin/avo/issues/100) based on the original +reported allocation failure. 
+ +Based on the pull request +[`klauspost/compress#186`](https://github.com/klauspost/compress/pull/186) at +`c1f3cf132cd8e214b38cc16e418bf2e501ccda93` with the lines after `FIXME` +comments re-activated and other minimal edits to make it work in this +environment. + +Original code covered by [BSD 3-Clause License](LICENSE). diff --git a/tests/fixedbugs/issue100/allocfail/allocfail.s b/tests/fixedbugs/issue100/allocfail/allocfail.s new file mode 100644 index 0000000..03fbc78 --- /dev/null +++ b/tests/fixedbugs/issue100/allocfail/allocfail.s @@ -0,0 +1,9825 @@ +// Code generated by command: go run asm.go -out allocfail.s -stubs stubs.go. DO NOT EDIT. + +// +build !appengine +// +build !noasm +// +build gc + +#include "textflag.h" + +// func encodeBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm(SB), $65568-56 + MOVQ $0x00000200, AX + LEAQ 32(SP), CX + PXOR X0, X0 + +zero_loop_encodeBlockAsm: + MOVOU X0, (CX) + MOVOU X0, 16(CX) + MOVOU X0, 32(CX) + MOVOU X0, 48(CX) + MOVOU X0, 64(CX) + MOVOU X0, 80(CX) + MOVOU X0, 96(CX) + MOVOU X0, 112(CX) + ADDQ $0x80, CX + DECQ AX + JNZ zero_loop_encodeBlockAsm + MOVL AX, 20(SP) + MOVQ src_len+32(FP), AX + LEAQ -5(AX), CX + LEAQ -8(AX), BX + SHRQ $0x05, AX + SUBL AX, CX + MOVL BX, 16(SP) + MOVQ dst_base+0(FP), AX + MOVQ AX, 8(SP) + LEAQ (AX)(CX*1), CX + MOVQ CX, (SP) + MOVL $0x00000001, AX + MOVL AX, 24(SP) + MOVQ src_base+24(FP), CX + +search_loop_encodeBlockAsm: + MOVQ (CX)(AX*1), BP + MOVL AX, BX + SUBL 20(SP), BX + SHRL $0x06, BX + LEAQ 4(AX)(BX*1), BX + MOVL 16(SP), SI + CMPL BX, SI + JGT emit_remainder_encodeBlockAsm + MOVL BX, 28(SP) + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ BP, DI + MOVQ BP, R8 + SHRQ $0x08, R8 + SHLQ $0x10, DI + IMULQ BX, DI + SHRQ $0x30, DI + SHLQ $0x10, R8 + IMULQ BX, R8 + SHRQ $0x30, R8 + MOVL 32(SP)(DI*1), BX + MOVL 32(SP)(R8*1), SI + MOVL AX, 32(SP)(DI*1) + LEAL 1(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, DI + SUBL 24(SP), DI + MOVL 1(CX)(DI*1), R9 + MOVQ BP, R8 + SHLQ 
$0x08, R8 + CMPL R8, R9 + JNE no_repeat_found_encodeBlockAsm + LEAQ 1(AX), BP + MOVL 20(SP), BX + TESTL DI, DI + JZ repeat_extend_back_end_encodeBlockAsm + +repeat_extend_back_loop_encodeBlockAsm: + CMPL BP, BX + JG repeat_extend_back_end_encodeBlockAsm + MOVB -1(CX)(DI*1), DL + MOVB -1(CX)(BP*1), SI + CMPB DL, SI + JNE repeat_extend_back_end_encodeBlockAsm + LEAQ -1(BP), BP + DECL DI + JZ repeat_extend_back_end_encodeBlockAsm + JMP repeat_extend_back_loop_encodeBlockAsm + +repeat_extend_back_end_encodeBlockAsm: + MOVL 20(SP), BX + CMPL BX, BP + JEQ emit_literal_skip_repeat_emit_encodeBlockAsm + MOVL BP, SI + MOVL BP, 20(SP) + LEAQ (CX)(BX*1), DI + SUBL BX, SI + MOVQ dst_base+0(FP), BX + MOVQ SI, R8 + SUBL $0x01, R8 + JC emit_literal_done_repeat_emit_encodeBlockAsm + CMPL R8, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm + CMPL R8, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm + CMPL R8, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm + CMPL R8, $0x01000000 + JLT four_bytes_repeat_emit_encodeBlockAsm + MOVB $0xfc, (BX) + MOVL R8, 1(BX) + ADDQ $0x05, BX + JMP memmove_repeat_emit_encodeBlockAsm + +four_bytes_repeat_emit_encodeBlockAsm: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (BX) + MOVW R8, 1(BX) + MOVB R9, 3(BX) + ADDQ $0x04, BX + JMP memmove_repeat_emit_encodeBlockAsm + +three_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf4, (BX) + MOVW R8, 1(BX) + ADDQ $0x03, BX + JMP memmove_repeat_emit_encodeBlockAsm + +two_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf0, (BX) + MOVB R8, 1(BX) + ADDQ $0x02, BX + JMP memmove_repeat_emit_encodeBlockAsm + +one_byte_repeat_emit_encodeBlockAsm: + SHLB $0x02, R8 + MOVB R8, (BX) + ADDQ $0x01, BX + +memmove_repeat_emit_encodeBlockAsm: + LEAQ (BX)(SI*1), R8 + NOP + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_tail: + TESTQ SI, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm + CMPQ SI, $0x02 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2 + CMPQ SI, $0x04 + JB 
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4 + CMPQ SI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_5through7 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_9through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 + CMPQ SI, $0x40 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 + CMPQ SI, $0x80 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_65through128 + CMPQ SI, $0x00000100 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_129through256 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048 + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2: + MOVB (DI), R8 + MOVB -1(DI)(SI*1), DI + MOVB R8, (BX) + MOVB DI, -1(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4: + MOVL (DI), R8 + MOVL R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3: + MOVW (DI), R8 + MOVB 2(DI), DI + MOVW R8, (BX) + MOVB DI, 2(BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_5through7: + MOVL (DI), R8 + MOVL -4(DI)(SI*1), DI + MOVL R8, (BX) + MOVL DI, -4(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: + MOVQ (DI), R8 + MOVQ R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_9through16: + MOVQ (DI), R8 + MOVQ -8(DI)(SI*1), DI + MOVQ R8, (BX) + MOVQ DI, -8(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU 
-16(DI)(SI*1), X1 + MOVOU X0, (BX) + MOVOU X1, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(SI*1), X2 + MOVOU -16(DI)(SI*1), X3 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, -32(BX)(SI*1) + MOVOU X3, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_65through128: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_129through256: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU -128(DI)(SI*1), X8 + MOVOU -112(DI)(SI*1), X9 + MOVOU -96(DI)(SI*1), X10 + MOVOU -80(DI)(SI*1), X11 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, -128(BX)(SI*1) + MOVOU X9, -112(BX)(SI*1) + MOVOU X10, -96(BX)(SI*1) + MOVOU X11, -80(BX)(SI*1) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048: + LEAQ -256(SI), SI + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 
112(DI), X7 + MOVOU 128(DI), X8 + MOVOU 144(DI), X9 + MOVOU 160(DI), X10 + MOVOU 176(DI), X11 + MOVOU 192(DI), X12 + MOVOU 208(DI), X13 + MOVOU 224(DI), X14 + MOVOU 240(DI), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, 128(BX) + MOVOU X9, 144(BX) + MOVOU X10, 160(BX) + MOVOU X11, 176(BX) + MOVOU X12, 192(BX) + MOVOU X13, 208(BX) + MOVOU X14, 224(BX) + MOVOU X15, 240(BX) + CMPQ SI, $0x00000100 + LEAQ 256(DI), DI + LEAQ 256(BX), BX + JGE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_tail + MOVQ R8, BX + +emit_literal_done_repeat_emit_encodeBlockAsm: + MOVQ BX, dst_base+0(FP) + +emit_literal_skip_repeat_emit_encodeBlockAsm: + ADDL $0x05, AX + MOVL AX, BX + SUBL 24(SP), BX + MOVL 16(SP), BX + SUBL AX, BX + XORQ DI, DI + CMPQ BX, $0x08 + JL matchlen_single_repeat_extend + +matchlen_loopback_repeat_extend: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_repeat_extend + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_loop_repeat_extend: + LEAQ -8(BX), BX + LEAQ 8(DI), DI + CMPQ BX, $0x08 + JGE matchlen_loopback_repeat_extend + +matchlen_single_repeat_extend: + TESTQ BX, BX + JZ repeat_extend_forward_end_encodeBlockAsm + +matchlen_single_loopback_repeat_extend: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE repeat_extend_forward_end_encodeBlockAsm + LEAQ 1(DI), DI + DECQ BX + JNZ matchlen_single_loopback_repeat_extend + +repeat_extend_forward_end_encodeBlockAsm: + ADDL DI, AX + MOVL AX, BX + SUBL BP, BX + MOVL 24(SP), BP + MOVQ dst_base+0(FP), SI + MOVL 20(SP), DI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm + +emit_repeat_again_match_repeat_: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_match_repeat_ + CMPL DI, $0x0c + JGE 
cant_repeat_two_offset_match_repeat_ + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_repeat_ + +cant_repeat_two_offset_match_repeat_: + CMPL BX, $0x00000104 + JLT repeat_three_match_repeat_ + CMPL BX, $0x00010100 + JLT repeat_four_match_repeat_ + CMPL BX, $0x0100ffff + JLT repeat_five_match_repeat_ + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_repeat_ + +repeat_five_match_repeat_: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_match_repeat_: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_match_repeat_: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_match_repeat_: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_match_repeat_: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_as_copy_encodeBlockAsm: + CMPL BP, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm + CMPL BX, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(BX), BX + ADDQ $0x05, SI + CMPL BX, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm + +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + CMPL 
BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm + +four_bytes_remain_repeat_as_copy_encodeBlockAsm: + TESTL BX, BX + JZ repeat_end_emit_encodeBlockAsm + MOVB $0x03, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm + +two_byte_offset_repeat_as_copy_encodeBlockAsm: + CMPL BX, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(BX), BX + ADDQ $0x03, SI + +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL DI, $0x0c + JGE 
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm: + CMPL BX, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm + CMPL BP, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm + MOVB $0x01, DL + LEAQ -16(DX)(BX*4), BX + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP 
repeat_end_emit_encodeBlockAsm + +emit_copy_three_repeat_as_copy_encodeBlockAsm: + MOVB $0x02, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +repeat_end_emit_encodeBlockAsm: + MOVQ SI, dst_base+0(FP) + MOVL 16(SP), BX + CMPL AX, BX + JGT emit_remainder_encodeBlockAsm + JMP search_loop_encodeBlockAsm + +no_repeat_found_encodeBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ BP, DI + SHRQ $0x10, DI + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x30, DI + CMPL (CX)(BX*1), BP + SHRQ $0x08, BP + JEQ candidate_match_encodeBlockAsm + MOVL 32(SP)(DI*1), BX + CMPL (CX)(SI*1), BP + JEQ candidate2_match_encodeBlockAsm + LEAQ 2(AX), SI + MOVL SI, 32(SP)(DI*1) + SHRQ $0x08, BP + CMPL (CX)(BX*1), BP + JEQ candidate3_match_encodeBlockAsm + MOVL 28(SP), AX + JMP search_loop_encodeBlockAsm + +candidate3_match_encodeBlockAsm: + ADDL $0x02, AX + JMP candidate_match_encodeBlockAsm + +candidate2_match_encodeBlockAsm: + LEAQ -2(AX), BX + MOVL BX, 32(SP)(DI*1) + INCL AX + MOVL SI, BX + +candidate_match_encodeBlockAsm: + MOVL 20(SP), BP + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm + +match_extend_back_loop_encodeBlockAsm: + CMPL AX, BP + JG match_extend_back_end_encodeBlockAsm + MOVB -1(CX)(BX*1), DL + MOVB -1(CX)(AX*1), SI + CMPB DL, SI + JNE match_extend_back_end_encodeBlockAsm + LEAL -1(AX), AX + DECL BX + JZ match_extend_back_end_encodeBlockAsm + JMP match_extend_back_loop_encodeBlockAsm + +match_extend_back_end_encodeBlockAsm: + MOVL AX, BP + SUBL 20(SP), BP + LEAQ dst_base+0(FP)(BP*1), BP + CMPQ BP, (SP) + JL match_dst_size_check_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm: + MOVL BX, BP + MOVL 20(SP), SI + CMPL SI, BP + JEQ emit_literal_skip_match_emit_encodeBlockAsm + MOVL BP, DI + MOVL BP, 20(SP) + LEAQ (CX)(SI*1), BP + SUBL SI, DI + MOVQ dst_base+0(FP), SI + MOVQ DI, R8 + SUBL $0x01, R8 + JC emit_literal_done_match_emit_encodeBlockAsm + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm + 
CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeBlockAsm + MOVB $0xfc, (SI) + MOVL R8, 1(SI) + ADDQ $0x05, SI + JMP memmove_match_emit_encodeBlockAsm + +four_bytes_match_emit_encodeBlockAsm: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (SI) + MOVW R8, 1(SI) + MOVB R9, 3(SI) + ADDQ $0x04, SI + JMP memmove_match_emit_encodeBlockAsm + +three_bytes_match_emit_encodeBlockAsm: + MOVB $0xf4, (SI) + MOVW R8, 1(SI) + ADDQ $0x03, SI + JMP memmove_match_emit_encodeBlockAsm + +two_bytes_match_emit_encodeBlockAsm: + MOVB $0xf0, (SI) + MOVB R8, 1(SI) + ADDQ $0x02, SI + JMP memmove_match_emit_encodeBlockAsm + +one_byte_match_emit_encodeBlockAsm: + SHLB $0x02, R8 + MOVB R8, (SI) + ADDQ $0x01, SI + +memmove_match_emit_encodeBlockAsm: + LEAQ (SI)(DI*1), R8 + NOP + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_tail: + TESTQ DI, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm + CMPQ DI, $0x02 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2 + CMPQ DI, $0x04 + JB emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_4 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_5through7 + JE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_9through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 + CMPQ DI, $0x40 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 + CMPQ DI, $0x80 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_65through128 + CMPQ DI, $0x00000100 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_129through256 + JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048 + 
+emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2: + MOVB (BP), R8 + MOVB -1(BP)(DI*1), BP + MOVB R8, (SI) + MOVB BP, -1(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_4: + MOVL (BP), R8 + MOVL R8, (SI) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3: + MOVW (BP), R8 + MOVB 2(BP), BP + MOVW R8, (SI) + MOVB BP, 2(SI) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_5through7: + MOVL (BP), R8 + MOVL -4(BP)(DI*1), BP + MOVL R8, (SI) + MOVL BP, -4(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: + MOVQ (BP), R8 + MOVQ R8, (SI) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_9through16: + MOVQ (BP), R8 + MOVQ -8(BP)(DI*1), BP + MOVQ R8, (SI) + MOVQ BP, -8(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (BP), X0 + MOVOU -16(BP)(DI*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU -32(BP)(DI*1), X2 + MOVOU -16(BP)(DI*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DI*1) + MOVOU X3, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_65through128: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, 
-16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_129through256: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU -128(BP)(DI*1), X8 + MOVOU -112(BP)(DI*1), X9 + MOVOU -96(BP)(DI*1), X10 + MOVOU -80(BP)(DI*1), X11 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, -128(SI)(DI*1) + MOVOU X9, -112(SI)(DI*1) + MOVOU X10, -96(SI)(DI*1) + MOVOU X11, -80(SI)(DI*1) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048: + LEAQ -256(DI), DI + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU 128(BP), X8 + MOVOU 144(BP), X9 + MOVOU 160(BP), X10 + MOVOU 176(BP), X11 + MOVOU 192(BP), X12 + MOVOU 208(BP), X13 + MOVOU 224(BP), X14 + MOVOU 240(BP), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, 128(SI) + MOVOU X9, 144(SI) + MOVOU X10, 160(SI) + MOVOU X11, 176(SI) + MOVOU X12, 192(SI) + MOVOU X13, 208(SI) + MOVOU X14, 224(SI) + MOVOU X15, 240(SI) + CMPQ DI, $0x00000100 + LEAQ 256(BP), BP + LEAQ 256(SI), SI + JGE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048 + JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_tail + MOVQ R8, SI + +emit_literal_done_match_emit_encodeBlockAsm: + MOVQ SI, dst_base+0(FP) + +emit_literal_skip_match_emit_encodeBlockAsm: + NOP + 
+match_nolit_loop_encodeBlockAsm: + MOVL AX, BP + MOVL AX, BP + SUBL BX, BP + MOVL BP, 24(SP) + ADDL $0x04, AX + ADDL $0x04, BX + MOVL 16(SP), BP + SUBL AX, BP + XORQ DI, DI + CMPQ BP, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm + +matchlen_loopback_match_nolit_encodeBlockAsm: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_match_nolit_encodeBlockAsm + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP match_nolit_end_encodeBlockAsm + +matchlen_loop_match_nolit_encodeBlockAsm: + LEAQ -8(BP), BP + LEAQ 8(DI), DI + CMPQ BP, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm + +matchlen_single_match_nolit_encodeBlockAsm: + TESTQ BP, BP + JZ match_nolit_end_encodeBlockAsm + +matchlen_single_loopback_match_nolit_encodeBlockAsm: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE match_nolit_end_encodeBlockAsm + LEAQ 1(DI), DI + DECQ BP + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm + +match_nolit_end_encodeBlockAsm: + MOVL 24(SP), BP + ADDQ $0x04, DI + MOVQ dst_base+0(FP), SI + ADDL DI, AX + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm + CMPL DI, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsm + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(DI), DI + ADDQ $0x05, SI + CMPL DI, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm + +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy + LEAQ -16842747(DI), DI + MOVW 
$0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBlockAsm_emit_copy: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +four_bytes_remain_match_nolit_encodeBlockAsm: + TESTL DI, DI + JZ match_nolit_emitcopy_end_encodeBlockAsm + MOVB $0x03, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_offset_match_nolit_encodeBlockAsm: + CMPL DI, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(DI), DI + ADDQ $0x03, SI + +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + CMPL DI, $0x00000104 + JLT 
repeat_three_match_nolit_encodeBlockAsm_emit_copy_short + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_offset_short_match_nolit_encodeBlockAsm: + CMPL DI, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm + MOVB $0x01, DL + LEAQ -16(DX)(DI*4), DI + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy_three_match_nolit_encodeBlockAsm: + MOVB $0x02, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +match_nolit_emitcopy_end_encodeBlockAsm: + MOVQ SI, dst_base+0(FP) + MOVL AX, 
20(SP) + CMPL AX, 16(SP) + JGE emit_remainder_encodeBlockAsm + CMPQ SI, (SP) + JL match_nolit_dst_ok_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm: + MOVQ -2(CX)(AX*1), BP + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ BP, DI + SHRQ $0x10, BP + MOVQ BP, R8 + SHLQ $0x10, DI + IMULQ SI, DI + SHRQ $0x30, DI + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x30, R8 + MOVL 32(SP)(DI*1), SI + MOVL 32(SP)(R8*1), SI + LEAQ -2(AX), SI + MOVL SI, 32(SP)(DI*1) + MOVL AX, 32(SP)(R8*1) + CMPL (CX)(R8*1), BP + JEQ match_nolit_loop_encodeBlockAsm + INCL AX + JMP search_loop_encodeBlockAsm + +emit_remainder_encodeBlockAsm: + MOVQ src_len+32(FP), AX + SUBL 20(SP), AX + MOVQ dst_base+0(FP), DX + LEAQ (DX)(AX*1), DX + CMPQ DX, (SP) + JL emit_remainder_ok_encodeBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm: + MOVQ src_len+32(FP), AX + MOVL 20(SP), DX + CMPL DX, AX + JEQ emit_literal_skip_emit_remainder_encodeBlockAsm + MOVL AX, BX + MOVL AX, 20(SP) + LEAQ (CX)(DX*1), AX + SUBL DX, BX + MOVQ dst_base+0(FP), CX + MOVQ BX, DX + SUBL $0x01, DX + JC emit_literal_done_emit_remainder_encodeBlockAsm + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBlockAsm + MOVB $0xfc, (CX) + MOVL DX, 1(CX) + ADDQ $0x05, CX + JMP memmove_emit_remainder_encodeBlockAsm + +four_bytes_emit_remainder_encodeBlockAsm: + MOVQ DX, BP + SHRL $0x10, BP + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BP, 3(CX) + ADDQ $0x04, CX + JMP memmove_emit_remainder_encodeBlockAsm + +three_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_emit_remainder_encodeBlockAsm + +two_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + JMP memmove_emit_remainder_encodeBlockAsm + 
+one_byte_emit_remainder_encodeBlockAsm: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsm: + LEAQ (CX)(BX*1), DX + NOP + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_tail: + TESTQ BX, BX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm + CMPQ BX, $0x02 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 + CMPQ BX, $0x04 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_5through7 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_9through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 + CMPQ BX, $0x40 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 + CMPQ BX, $0x80 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_65through128 + CMPQ BX, $0x00000100 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_129through256 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_256through2048 + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: + MOVB (AX), DL + MOVB -1(AX)(BX*1), AL + MOVB DL, (CX) + MOVB AL, -1(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4: + MOVL (AX), DX + MOVL DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: + MOVW (AX), DX + MOVB 2(AX), AL + MOVW DX, (CX) + MOVB AL, 2(CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_5through7: + MOVL (AX), DX + MOVL -4(AX)(BX*1), AX + MOVL DX, (CX) + MOVL AX, -4(CX)(BX*1) + JMP 
emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: + MOVQ (AX), DX + MOVQ DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_9through16: + MOVQ (AX), DX + MOVQ -8(AX)(BX*1), AX + MOVQ DX, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_65through128: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_129through256: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU -128(AX)(BX*1), X8 + MOVOU -112(AX)(BX*1), X9 + MOVOU -96(AX)(BX*1), X10 + MOVOU -80(AX)(BX*1), X11 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 
96(CX) + MOVOU X7, 112(CX) + MOVOU X8, -128(CX)(BX*1) + MOVOU X9, -112(CX)(BX*1) + MOVOU X10, -96(CX)(BX*1) + MOVOU X11, -80(CX)(BX*1) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_256through2048: + LEAQ -256(BX), BX + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU 128(AX), X8 + MOVOU 144(AX), X9 + MOVOU 160(AX), X10 + MOVOU 176(AX), X11 + MOVOU 192(AX), X12 + MOVOU 208(AX), X13 + MOVOU 224(AX), X14 + MOVOU 240(AX), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, 128(CX) + MOVOU X9, 144(CX) + MOVOU X10, 160(CX) + MOVOU X11, 176(CX) + MOVOU X12, 192(CX) + MOVOU X13, 208(CX) + MOVOU X14, 224(CX) + MOVOU X15, 240(CX) + CMPQ BX, $0x00000100 + LEAQ 256(AX), AX + LEAQ 256(CX), CX + JGE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_256through2048 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_tail + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsm: + MOVQ CX, dst_base+0(FP) + +emit_literal_skip_emit_remainder_encodeBlockAsm: + MOVQ 8(SP), AX + SUBQ dst_base+0(FP), AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm14B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm14B(SB), $16416-56 + MOVQ $0x00000080, AX + LEAQ 32(SP), CX + PXOR X0, X0 + +zero_loop_encodeBlockAsm14B: + MOVOU X0, (CX) + MOVOU X0, 16(CX) + MOVOU X0, 32(CX) + MOVOU X0, 48(CX) + MOVOU X0, 64(CX) + MOVOU X0, 80(CX) + MOVOU X0, 96(CX) + MOVOU X0, 112(CX) + ADDQ $0x80, CX + DECQ AX + JNZ zero_loop_encodeBlockAsm14B + MOVL AX, 20(SP) + MOVQ src_len+32(FP), AX + LEAQ -5(AX), CX + LEAQ -8(AX), BX + SHRQ $0x05, AX + SUBL AX, CX + MOVL BX, 16(SP) + 
MOVQ dst_base+0(FP), AX + MOVQ AX, 8(SP) + LEAQ (AX)(CX*1), CX + MOVQ CX, (SP) + MOVL $0x00000001, AX + MOVL AX, 24(SP) + MOVQ src_base+24(FP), CX + +search_loop_encodeBlockAsm14B: + MOVQ (CX)(AX*1), BP + MOVL AX, BX + SUBL 20(SP), BX + SHRL $0x05, BX + LEAQ 4(AX)(BX*1), BX + MOVL 16(SP), SI + CMPL BX, SI + JGT emit_remainder_encodeBlockAsm14B + MOVL BX, 28(SP) + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ BP, DI + MOVQ BP, R8 + SHRQ $0x08, R8 + SHLQ $0x10, DI + IMULQ BX, DI + SHRQ $0x32, DI + SHLQ $0x10, R8 + IMULQ BX, R8 + SHRQ $0x32, R8 + MOVL 32(SP)(DI*1), BX + MOVL 32(SP)(R8*1), SI + MOVL AX, 32(SP)(DI*1) + LEAL 1(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, DI + SUBL 24(SP), DI + MOVL 1(CX)(DI*1), R9 + MOVQ BP, R8 + SHLQ $0x08, R8 + CMPL R8, R9 + JNE no_repeat_found_encodeBlockAsm14B + LEAQ 1(AX), BP + MOVL 20(SP), BX + TESTL DI, DI + JZ repeat_extend_back_end_encodeBlockAsm14B + +repeat_extend_back_loop_encodeBlockAsm14B: + CMPL BP, BX + JG repeat_extend_back_end_encodeBlockAsm14B + MOVB -1(CX)(DI*1), DL + MOVB -1(CX)(BP*1), SI + CMPB DL, SI + JNE repeat_extend_back_end_encodeBlockAsm14B + LEAQ -1(BP), BP + DECL DI + JZ repeat_extend_back_end_encodeBlockAsm14B + JMP repeat_extend_back_loop_encodeBlockAsm14B + +repeat_extend_back_end_encodeBlockAsm14B: + MOVL 20(SP), BX + CMPL BX, BP + JEQ emit_literal_skip_repeat_emit_encodeBlockAsm14B + MOVL BP, SI + MOVL BP, 20(SP) + LEAQ (CX)(BX*1), DI + SUBL BX, SI + MOVQ dst_base+0(FP), BX + MOVQ SI, R8 + SUBL $0x01, R8 + JC emit_literal_done_repeat_emit_encodeBlockAsm14B + CMPL R8, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm14B + CMPL R8, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm14B + CMPL R8, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm14B + CMPL R8, $0x01000000 + JLT four_bytes_repeat_emit_encodeBlockAsm14B + MOVB $0xfc, (BX) + MOVL R8, 1(BX) + ADDQ $0x05, BX + JMP memmove_repeat_emit_encodeBlockAsm14B + +four_bytes_repeat_emit_encodeBlockAsm14B: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (BX) + 
MOVW R8, 1(BX) + MOVB R9, 3(BX) + ADDQ $0x04, BX + JMP memmove_repeat_emit_encodeBlockAsm14B + +three_bytes_repeat_emit_encodeBlockAsm14B: + MOVB $0xf4, (BX) + MOVW R8, 1(BX) + ADDQ $0x03, BX + JMP memmove_repeat_emit_encodeBlockAsm14B + +two_bytes_repeat_emit_encodeBlockAsm14B: + MOVB $0xf0, (BX) + MOVB R8, 1(BX) + ADDQ $0x02, BX + JMP memmove_repeat_emit_encodeBlockAsm14B + +one_byte_repeat_emit_encodeBlockAsm14B: + SHLB $0x02, R8 + MOVB R8, (BX) + ADDQ $0x01, BX + +memmove_repeat_emit_encodeBlockAsm14B: + LEAQ (BX)(SI*1), R8 + NOP + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_tail: + TESTQ SI, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm14B + CMPQ SI, $0x02 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_1or2 + CMPQ SI, $0x04 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_3 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_4 + CMPQ SI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_5through7 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_9through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_17through32 + CMPQ SI, $0x40 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_33through64 + CMPQ SI, $0x80 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_65through128 + CMPQ SI, $0x00000100 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_129through256 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_256through2048 + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_1or2: + MOVB (DI), R8 + MOVB -1(DI)(SI*1), DI + MOVB R8, (BX) + MOVB DI, -1(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_4: + MOVL (DI), R8 + MOVL R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + 
+emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_3: + MOVW (DI), R8 + MOVB 2(DI), DI + MOVW R8, (BX) + MOVB DI, 2(BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_5through7: + MOVL (DI), R8 + MOVL -4(DI)(SI*1), DI + MOVL R8, (BX) + MOVL DI, -4(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_8: + MOVQ (DI), R8 + MOVQ R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_9through16: + MOVQ (DI), R8 + MOVQ -8(DI)(SI*1), DI + MOVQ R8, (BX) + MOVQ DI, -8(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(SI*1), X1 + MOVOU X0, (BX) + MOVOU X1, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(SI*1), X2 + MOVOU -16(DI)(SI*1), X3 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, -32(BX)(SI*1) + MOVOU X3, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_65through128: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_129through256: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 
+ MOVOU -128(DI)(SI*1), X8 + MOVOU -112(DI)(SI*1), X9 + MOVOU -96(DI)(SI*1), X10 + MOVOU -80(DI)(SI*1), X11 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, -128(BX)(SI*1) + MOVOU X9, -112(BX)(SI*1) + MOVOU X10, -96(BX)(SI*1) + MOVOU X11, -80(BX)(SI*1) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14B + +emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_256through2048: + LEAQ -256(SI), SI + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU 128(DI), X8 + MOVOU 144(DI), X9 + MOVOU 160(DI), X10 + MOVOU 176(DI), X11 + MOVOU 192(DI), X12 + MOVOU 208(DI), X13 + MOVOU 224(DI), X14 + MOVOU 240(DI), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, 128(BX) + MOVOU X9, 144(BX) + MOVOU X10, 160(BX) + MOVOU X11, 176(BX) + MOVOU X12, 192(BX) + MOVOU X13, 208(BX) + MOVOU X14, 224(BX) + MOVOU X15, 240(BX) + CMPQ SI, $0x00000100 + LEAQ 256(DI), DI + LEAQ 256(BX), BX + JGE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_256through2048 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_tail + MOVQ R8, BX + +emit_literal_done_repeat_emit_encodeBlockAsm14B: + MOVQ BX, dst_base+0(FP) + +emit_literal_skip_repeat_emit_encodeBlockAsm14B: + ADDL $0x05, AX + MOVL AX, BX + SUBL 24(SP), BX + MOVL 16(SP), BX + SUBL AX, BX + XORQ DI, DI + CMPQ BX, $0x08 + JL matchlen_single_repeat_extend + +matchlen_loopback_repeat_extend: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_repeat_extend + 
BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP repeat_extend_forward_end_encodeBlockAsm14B + +matchlen_loop_repeat_extend: + LEAQ -8(BX), BX + LEAQ 8(DI), DI + CMPQ BX, $0x08 + JGE matchlen_loopback_repeat_extend + +matchlen_single_repeat_extend: + TESTQ BX, BX + JZ repeat_extend_forward_end_encodeBlockAsm14B + +matchlen_single_loopback_repeat_extend: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE repeat_extend_forward_end_encodeBlockAsm14B + LEAQ 1(DI), DI + DECQ BX + JNZ matchlen_single_loopback_repeat_extend + +repeat_extend_forward_end_encodeBlockAsm14B: + ADDL DI, AX + MOVL AX, BX + SUBL BP, BX + MOVL 24(SP), BP + MOVQ dst_base+0(FP), SI + MOVL 20(SP), DI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm14B + +emit_repeat_again_match_repeat_: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_match_repeat_ + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_ + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_repeat_ + +cant_repeat_two_offset_match_repeat_: + CMPL BX, $0x00000104 + JLT repeat_three_match_repeat_ + CMPL BX, $0x00010100 + JLT repeat_four_match_repeat_ + CMPL BX, $0x0100ffff + JLT repeat_five_match_repeat_ + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_repeat_ + +repeat_five_match_repeat_: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_four_match_repeat_: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_three_match_repeat_: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_two_match_repeat_: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_two_offset_match_repeat_: + XORQ DI, DI + LEAQ 
1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_as_copy_encodeBlockAsm14B: + CMPL BP, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm14B + CMPL BX, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm14B + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(BX), BX + ADDQ $0x05, SI + CMPL BX, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm14B + +emit_repeat_again_repeat_as_copy_encodeBlockAsm14B_emit_copy: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm14B_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm14B_emit_copy + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm14B_emit_copy + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm14B_emit_copy + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm14B_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsm14B_emit_copy: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_four_repeat_as_copy_encodeBlockAsm14B_emit_copy: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_three_repeat_as_copy_encodeBlockAsm14B_emit_copy: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_two_repeat_as_copy_encodeBlockAsm14B_emit_copy: + SHLL $0x02, BX + ORL $0x01, BX + 
MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14B + +four_bytes_remain_repeat_as_copy_encodeBlockAsm14B: + TESTL BX, BX + JZ repeat_end_emit_encodeBlockAsm14B + MOVB $0x03, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm14B + +two_byte_offset_repeat_as_copy_encodeBlockAsm14B: + CMPL BX, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm14B + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(BX), BX + ADDQ $0x03, SI + +emit_repeat_again_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm14B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm14B_emit_copy_short + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm14B_emit_copy_short + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm14B_emit_copy_short + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm14B_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_four_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW 
BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_three_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_two_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm14B: + CMPL BX, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm14B + CMPL BP, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm14B + MOVB $0x01, DL + LEAQ -16(DX)(BX*4), BX + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14B + +emit_copy_three_repeat_as_copy_encodeBlockAsm14B: + MOVB $0x02, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +repeat_end_emit_encodeBlockAsm14B: + MOVQ SI, dst_base+0(FP) + MOVL 16(SP), BX + CMPL AX, BX + JGT emit_remainder_encodeBlockAsm14B + JMP search_loop_encodeBlockAsm14B + +no_repeat_found_encodeBlockAsm14B: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ BP, DI + SHRQ $0x10, DI + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x32, DI + CMPL (CX)(BX*1), BP + SHRQ $0x08, BP + JEQ candidate_match_encodeBlockAsm14B + MOVL 32(SP)(DI*1), BX + CMPL (CX)(SI*1), BP + JEQ candidate2_match_encodeBlockAsm14B + LEAQ 2(AX), SI + MOVL SI, 32(SP)(DI*1) + SHRQ $0x08, BP + CMPL (CX)(BX*1), BP + JEQ candidate3_match_encodeBlockAsm14B + MOVL 28(SP), AX + JMP search_loop_encodeBlockAsm14B + +candidate3_match_encodeBlockAsm14B: + ADDL $0x02, AX + JMP candidate_match_encodeBlockAsm14B + +candidate2_match_encodeBlockAsm14B: + LEAQ 
-2(AX), BX + MOVL BX, 32(SP)(DI*1) + INCL AX + MOVL SI, BX + +candidate_match_encodeBlockAsm14B: + MOVL 20(SP), BP + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm14B + +match_extend_back_loop_encodeBlockAsm14B: + CMPL AX, BP + JG match_extend_back_end_encodeBlockAsm14B + MOVB -1(CX)(BX*1), DL + MOVB -1(CX)(AX*1), SI + CMPB DL, SI + JNE match_extend_back_end_encodeBlockAsm14B + LEAL -1(AX), AX + DECL BX + JZ match_extend_back_end_encodeBlockAsm14B + JMP match_extend_back_loop_encodeBlockAsm14B + +match_extend_back_end_encodeBlockAsm14B: + MOVL AX, BP + SUBL 20(SP), BP + LEAQ dst_base+0(FP)(BP*1), BP + CMPQ BP, (SP) + JL match_dst_size_check_encodeBlockAsm14B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm14B: + MOVL BX, BP + MOVL 20(SP), SI + CMPL SI, BP + JEQ emit_literal_skip_match_emit_encodeBlockAsm14B + MOVL BP, DI + MOVL BP, 20(SP) + LEAQ (CX)(SI*1), BP + SUBL SI, DI + MOVQ dst_base+0(FP), SI + MOVQ DI, R8 + SUBL $0x01, R8 + JC emit_literal_done_match_emit_encodeBlockAsm14B + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm14B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm14B + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm14B + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeBlockAsm14B + MOVB $0xfc, (SI) + MOVL R8, 1(SI) + ADDQ $0x05, SI + JMP memmove_match_emit_encodeBlockAsm14B + +four_bytes_match_emit_encodeBlockAsm14B: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (SI) + MOVW R8, 1(SI) + MOVB R9, 3(SI) + ADDQ $0x04, SI + JMP memmove_match_emit_encodeBlockAsm14B + +three_bytes_match_emit_encodeBlockAsm14B: + MOVB $0xf4, (SI) + MOVW R8, 1(SI) + ADDQ $0x03, SI + JMP memmove_match_emit_encodeBlockAsm14B + +two_bytes_match_emit_encodeBlockAsm14B: + MOVB $0xf0, (SI) + MOVB R8, 1(SI) + ADDQ $0x02, SI + JMP memmove_match_emit_encodeBlockAsm14B + +one_byte_match_emit_encodeBlockAsm14B: + SHLB $0x02, R8 + MOVB R8, (SI) + ADDQ $0x01, SI + +memmove_match_emit_encodeBlockAsm14B: + 
LEAQ (SI)(DI*1), R8 + NOP + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_tail: + TESTQ DI, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm14B + CMPQ DI, $0x02 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_1or2 + CMPQ DI, $0x04 + JB emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_3 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_4 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_5through7 + JE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_9through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_17through32 + CMPQ DI, $0x40 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_33through64 + CMPQ DI, $0x80 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_65through128 + CMPQ DI, $0x00000100 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_129through256 + JMP emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_256through2048 + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_1or2: + MOVB (BP), R8 + MOVB -1(BP)(DI*1), BP + MOVB R8, (SI) + MOVB BP, -1(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14B + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_4: + MOVL (BP), R8 + MOVL R8, (SI) + JMP emit_literal_done_match_emit_encodeBlockAsm14B + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_3: + MOVW (BP), R8 + MOVB 2(BP), BP + MOVW R8, (SI) + MOVB BP, 2(SI) + JMP emit_literal_done_match_emit_encodeBlockAsm14B + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_5through7: + MOVL (BP), R8 + MOVL -4(BP)(DI*1), BP + MOVL R8, (SI) + MOVL BP, -4(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14B + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_8: + MOVQ (BP), R8 + MOVQ R8, (SI) + JMP emit_literal_done_match_emit_encodeBlockAsm14B 
+ +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_9through16: + MOVQ (BP), R8 + MOVQ -8(BP)(DI*1), BP + MOVQ R8, (SI) + MOVQ BP, -8(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14B + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_17through32: + MOVOU (BP), X0 + MOVOU -16(BP)(DI*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14B + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_33through64: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU -32(BP)(DI*1), X2 + MOVOU -16(BP)(DI*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DI*1) + MOVOU X3, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14B + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_65through128: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14B + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_129through256: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU -128(BP)(DI*1), X8 + MOVOU -112(BP)(DI*1), X9 + MOVOU -96(BP)(DI*1), X10 + MOVOU -80(BP)(DI*1), X11 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, -128(SI)(DI*1) + MOVOU X9, -112(SI)(DI*1) + MOVOU X10, -96(SI)(DI*1) + MOVOU X11, -80(SI)(DI*1) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, 
-32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14B + +emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_256through2048: + LEAQ -256(DI), DI + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU 128(BP), X8 + MOVOU 144(BP), X9 + MOVOU 160(BP), X10 + MOVOU 176(BP), X11 + MOVOU 192(BP), X12 + MOVOU 208(BP), X13 + MOVOU 224(BP), X14 + MOVOU 240(BP), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, 128(SI) + MOVOU X9, 144(SI) + MOVOU X10, 160(SI) + MOVOU X11, 176(SI) + MOVOU X12, 192(SI) + MOVOU X13, 208(SI) + MOVOU X14, 224(SI) + MOVOU X15, 240(SI) + CMPQ DI, $0x00000100 + LEAQ 256(BP), BP + LEAQ 256(SI), SI + JGE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_256through2048 + JMP emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_tail + MOVQ R8, SI + +emit_literal_done_match_emit_encodeBlockAsm14B: + MOVQ SI, dst_base+0(FP) + +emit_literal_skip_match_emit_encodeBlockAsm14B: + NOP + +match_nolit_loop_encodeBlockAsm14B: + MOVL AX, BP + MOVL AX, BP + SUBL BX, BP + MOVL BP, 24(SP) + ADDL $0x04, AX + ADDL $0x04, BX + MOVL 16(SP), BP + SUBL AX, BP + XORQ DI, DI + CMPQ BP, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm14B + +matchlen_loopback_match_nolit_encodeBlockAsm14B: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_match_nolit_encodeBlockAsm14B + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP match_nolit_end_encodeBlockAsm14B + +matchlen_loop_match_nolit_encodeBlockAsm14B: + LEAQ -8(BP), BP + LEAQ 8(DI), DI + CMPQ BP, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm14B + +matchlen_single_match_nolit_encodeBlockAsm14B: + TESTQ BP, BP + JZ match_nolit_end_encodeBlockAsm14B + +matchlen_single_loopback_match_nolit_encodeBlockAsm14B: + 
MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE match_nolit_end_encodeBlockAsm14B + LEAQ 1(DI), DI + DECQ BP + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm14B + +match_nolit_end_encodeBlockAsm14B: + MOVL 24(SP), BP + ADDQ $0x04, DI + MOVQ dst_base+0(FP), SI + ADDL DI, AX + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm14B + CMPL DI, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsm14B + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(DI), DI + ADDQ $0x05, SI + CMPL DI, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm14B + +emit_repeat_again_match_nolit_encodeBlockAsm14B_emit_copy: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm14B_emit_copy + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm14B_emit_copy + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm14B_emit_copy + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm14B_emit_copy + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm14B_emit_copy + +repeat_five_match_nolit_encodeBlockAsm14B_emit_copy: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +repeat_four_match_nolit_encodeBlockAsm14B_emit_copy: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +repeat_three_match_nolit_encodeBlockAsm14B_emit_copy: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + 
+repeat_two_match_nolit_encodeBlockAsm14B_emit_copy: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +four_bytes_remain_match_nolit_encodeBlockAsm14B: + TESTL DI, DI + JZ match_nolit_emitcopy_end_encodeBlockAsm14B + MOVB $0x03, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +two_byte_offset_match_nolit_encodeBlockAsm14B: + CMPL DI, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm14B + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(DI), DI + ADDQ $0x03, SI + +emit_repeat_again_match_nolit_encodeBlockAsm14B_emit_copy_short: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm14B_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy_short: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm14B_emit_copy_short + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm14B_emit_copy_short + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm14B_emit_copy_short + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm14B_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsm14B_emit_copy_short: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + 
+repeat_four_match_nolit_encodeBlockAsm14B_emit_copy_short: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +repeat_three_match_nolit_encodeBlockAsm14B_emit_copy_short: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +repeat_two_match_nolit_encodeBlockAsm14B_emit_copy_short: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy_short: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +two_byte_offset_short_match_nolit_encodeBlockAsm14B: + CMPL DI, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm14B + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm14B + MOVB $0x01, DL + LEAQ -16(DX)(DI*4), DI + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14B + +emit_copy_three_match_nolit_encodeBlockAsm14B: + MOVB $0x02, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +match_nolit_emitcopy_end_encodeBlockAsm14B: + MOVQ SI, dst_base+0(FP) + MOVL AX, 20(SP) + CMPL AX, 16(SP) + JGE emit_remainder_encodeBlockAsm14B + CMPQ SI, (SP) + JL match_nolit_dst_ok_encodeBlockAsm14B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm14B: + MOVQ -2(CX)(AX*1), BP + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ BP, DI + SHRQ $0x10, BP + MOVQ BP, R8 + SHLQ $0x10, DI + IMULQ SI, DI + SHRQ $0x32, DI + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x32, R8 + MOVL 32(SP)(DI*1), SI + MOVL 32(SP)(R8*1), SI + LEAQ -2(AX), SI + MOVL SI, 32(SP)(DI*1) + MOVL AX, 32(SP)(R8*1) + CMPL (CX)(R8*1), BP + JEQ match_nolit_loop_encodeBlockAsm14B + INCL AX + JMP 
search_loop_encodeBlockAsm14B + +emit_remainder_encodeBlockAsm14B: + MOVQ src_len+32(FP), AX + SUBL 20(SP), AX + MOVQ dst_base+0(FP), DX + LEAQ (DX)(AX*1), DX + CMPQ DX, (SP) + JL emit_remainder_ok_encodeBlockAsm14B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm14B: + MOVQ src_len+32(FP), AX + MOVL 20(SP), DX + CMPL DX, AX + JEQ emit_literal_skip_emit_remainder_encodeBlockAsm14B + MOVL AX, BX + MOVL AX, 20(SP) + LEAQ (CX)(DX*1), AX + SUBL DX, BX + MOVQ dst_base+0(FP), CX + MOVQ BX, DX + SUBL $0x01, DX + JC emit_literal_done_emit_remainder_encodeBlockAsm14B + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm14B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm14B + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm14B + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBlockAsm14B + MOVB $0xfc, (CX) + MOVL DX, 1(CX) + ADDQ $0x05, CX + JMP memmove_emit_remainder_encodeBlockAsm14B + +four_bytes_emit_remainder_encodeBlockAsm14B: + MOVQ DX, BP + SHRL $0x10, BP + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BP, 3(CX) + ADDQ $0x04, CX + JMP memmove_emit_remainder_encodeBlockAsm14B + +three_bytes_emit_remainder_encodeBlockAsm14B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_emit_remainder_encodeBlockAsm14B + +two_bytes_emit_remainder_encodeBlockAsm14B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + JMP memmove_emit_remainder_encodeBlockAsm14B + +one_byte_emit_remainder_encodeBlockAsm14B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsm14B: + LEAQ (CX)(BX*1), DX + NOP + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_tail: + TESTQ BX, BX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm14B + CMPQ BX, $0x02 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_1or2 + CMPQ BX, $0x04 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_3 + JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_5through7 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_9through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_17through32 + CMPQ BX, $0x40 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_33through64 + CMPQ BX, $0x80 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_65through128 + CMPQ BX, $0x00000100 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_129through256 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_256through2048 + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_1or2: + MOVB (AX), DL + MOVB -1(AX)(BX*1), AL + MOVB DL, (CX) + MOVB AL, -1(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_4: + MOVL (AX), DX + MOVL DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_3: + MOVW (AX), DX + MOVB 2(AX), AL + MOVW DX, (CX) + MOVB AL, 2(CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_5through7: + MOVL (AX), DX + MOVL -4(AX)(BX*1), AX + MOVL DX, (CX) + MOVL AX, -4(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_8: + MOVQ (AX), DX + MOVQ DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_9through16: + MOVQ (AX), DX + MOVQ -8(AX)(BX*1), AX + MOVQ DX, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + 
+emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_65through128: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_129through256: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU -128(AX)(BX*1), X8 + MOVOU -112(AX)(BX*1), X9 + MOVOU -96(AX)(BX*1), X10 + MOVOU -80(AX)(BX*1), X11 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, -128(CX)(BX*1) + MOVOU X9, -112(CX)(BX*1) + MOVOU X10, -96(CX)(BX*1) + MOVOU X11, -80(CX)(BX*1) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14B + +emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_256through2048: + LEAQ 
-256(BX), BX + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU 128(AX), X8 + MOVOU 144(AX), X9 + MOVOU 160(AX), X10 + MOVOU 176(AX), X11 + MOVOU 192(AX), X12 + MOVOU 208(AX), X13 + MOVOU 224(AX), X14 + MOVOU 240(AX), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, 128(CX) + MOVOU X9, 144(CX) + MOVOU X10, 160(CX) + MOVOU X11, 176(CX) + MOVOU X12, 192(CX) + MOVOU X13, 208(CX) + MOVOU X14, 224(CX) + MOVOU X15, 240(CX) + CMPQ BX, $0x00000100 + LEAQ 256(AX), AX + LEAQ 256(CX), CX + JGE emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_move_256through2048 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm14B_memmove_tail + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsm14B: + MOVQ CX, dst_base+0(FP) + +emit_literal_skip_emit_remainder_encodeBlockAsm14B: + MOVQ 8(SP), AX + SUBQ dst_base+0(FP), AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm12B(SB), $4128-56 + MOVQ $0x00000020, AX + LEAQ 32(SP), CX + PXOR X0, X0 + +zero_loop_encodeBlockAsm12B: + MOVOU X0, (CX) + MOVOU X0, 16(CX) + MOVOU X0, 32(CX) + MOVOU X0, 48(CX) + MOVOU X0, 64(CX) + MOVOU X0, 80(CX) + MOVOU X0, 96(CX) + MOVOU X0, 112(CX) + ADDQ $0x80, CX + DECQ AX + JNZ zero_loop_encodeBlockAsm12B + MOVL AX, 20(SP) + MOVQ src_len+32(FP), AX + LEAQ -5(AX), CX + LEAQ -8(AX), BX + SHRQ $0x05, AX + SUBL AX, CX + MOVL BX, 16(SP) + MOVQ dst_base+0(FP), AX + MOVQ AX, 8(SP) + LEAQ (AX)(CX*1), CX + MOVQ CX, (SP) + MOVL $0x00000001, AX + MOVL AX, 24(SP) + MOVQ src_base+24(FP), CX + +search_loop_encodeBlockAsm12B: + MOVQ (CX)(AX*1), BP + MOVL AX, BX + SUBL 20(SP), BX + SHRL $0x04, BX + LEAQ 4(AX)(BX*1), BX + MOVL 16(SP), SI + CMPL BX, SI + JGT emit_remainder_encodeBlockAsm12B + MOVL BX, 28(SP) + MOVQ 
$0x0000cf1bbcdcbf9b, BX + MOVQ BP, DI + MOVQ BP, R8 + SHRQ $0x08, R8 + SHLQ $0x10, DI + IMULQ BX, DI + SHRQ $0x34, DI + SHLQ $0x10, R8 + IMULQ BX, R8 + SHRQ $0x34, R8 + MOVL 32(SP)(DI*1), BX + MOVL 32(SP)(R8*1), SI + MOVL AX, 32(SP)(DI*1) + LEAL 1(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, DI + SUBL 24(SP), DI + MOVL 1(CX)(DI*1), R9 + MOVQ BP, R8 + SHLQ $0x08, R8 + CMPL R8, R9 + JNE no_repeat_found_encodeBlockAsm12B + LEAQ 1(AX), BP + MOVL 20(SP), BX + TESTL DI, DI + JZ repeat_extend_back_end_encodeBlockAsm12B + +repeat_extend_back_loop_encodeBlockAsm12B: + CMPL BP, BX + JG repeat_extend_back_end_encodeBlockAsm12B + MOVB -1(CX)(DI*1), DL + MOVB -1(CX)(BP*1), SI + CMPB DL, SI + JNE repeat_extend_back_end_encodeBlockAsm12B + LEAQ -1(BP), BP + DECL DI + JZ repeat_extend_back_end_encodeBlockAsm12B + JMP repeat_extend_back_loop_encodeBlockAsm12B + +repeat_extend_back_end_encodeBlockAsm12B: + MOVL 20(SP), BX + CMPL BX, BP + JEQ emit_literal_skip_repeat_emit_encodeBlockAsm12B + MOVL BP, SI + MOVL BP, 20(SP) + LEAQ (CX)(BX*1), DI + SUBL BX, SI + MOVQ dst_base+0(FP), BX + MOVQ SI, R8 + SUBL $0x01, R8 + JC emit_literal_done_repeat_emit_encodeBlockAsm12B + CMPL R8, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm12B + CMPL R8, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm12B + CMPL R8, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm12B + CMPL R8, $0x01000000 + JLT four_bytes_repeat_emit_encodeBlockAsm12B + MOVB $0xfc, (BX) + MOVL R8, 1(BX) + ADDQ $0x05, BX + JMP memmove_repeat_emit_encodeBlockAsm12B + +four_bytes_repeat_emit_encodeBlockAsm12B: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (BX) + MOVW R8, 1(BX) + MOVB R9, 3(BX) + ADDQ $0x04, BX + JMP memmove_repeat_emit_encodeBlockAsm12B + +three_bytes_repeat_emit_encodeBlockAsm12B: + MOVB $0xf4, (BX) + MOVW R8, 1(BX) + ADDQ $0x03, BX + JMP memmove_repeat_emit_encodeBlockAsm12B + +two_bytes_repeat_emit_encodeBlockAsm12B: + MOVB $0xf0, (BX) + MOVB R8, 1(BX) + ADDQ $0x02, BX + JMP 
memmove_repeat_emit_encodeBlockAsm12B + +one_byte_repeat_emit_encodeBlockAsm12B: + SHLB $0x02, R8 + MOVB R8, (BX) + ADDQ $0x01, BX + +memmove_repeat_emit_encodeBlockAsm12B: + LEAQ (BX)(SI*1), R8 + NOP + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_tail: + TESTQ SI, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B + CMPQ SI, $0x02 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2 + CMPQ SI, $0x04 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4 + CMPQ SI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_5through7 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_9through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 + CMPQ SI, $0x40 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 + CMPQ SI, $0x80 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_65through128 + CMPQ SI, $0x00000100 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_129through256 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048 + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2: + MOVB (DI), R8 + MOVB -1(DI)(SI*1), DI + MOVB R8, (BX) + MOVB DI, -1(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4: + MOVL (DI), R8 + MOVL R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3: + MOVW (DI), R8 + MOVB 2(DI), DI + MOVW R8, (BX) + MOVB DI, 2(BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_5through7: + MOVL (DI), R8 + MOVL -4(DI)(SI*1), DI + MOVL R8, (BX) + MOVL DI, -4(BX)(SI*1) + JMP 
emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (DI), R8 + MOVQ R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_9through16: + MOVQ (DI), R8 + MOVQ -8(DI)(SI*1), DI + MOVQ R8, (BX) + MOVQ DI, -8(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(SI*1), X1 + MOVOU X0, (BX) + MOVOU X1, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(SI*1), X2 + MOVOU -16(DI)(SI*1), X3 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, -32(BX)(SI*1) + MOVOU X3, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_65through128: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_129through256: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU -128(DI)(SI*1), X8 + MOVOU -112(DI)(SI*1), X9 + MOVOU -96(DI)(SI*1), X10 + MOVOU -80(DI)(SI*1), X11 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 
96(BX) + MOVOU X7, 112(BX) + MOVOU X8, -128(BX)(SI*1) + MOVOU X9, -112(BX)(SI*1) + MOVOU X10, -96(BX)(SI*1) + MOVOU X11, -80(BX)(SI*1) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048: + LEAQ -256(SI), SI + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU 128(DI), X8 + MOVOU 144(DI), X9 + MOVOU 160(DI), X10 + MOVOU 176(DI), X11 + MOVOU 192(DI), X12 + MOVOU 208(DI), X13 + MOVOU 224(DI), X14 + MOVOU 240(DI), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, 128(BX) + MOVOU X9, 144(BX) + MOVOU X10, 160(BX) + MOVOU X11, 176(BX) + MOVOU X12, 192(BX) + MOVOU X13, 208(BX) + MOVOU X14, 224(BX) + MOVOU X15, 240(BX) + CMPQ SI, $0x00000100 + LEAQ 256(DI), DI + LEAQ 256(BX), BX + JGE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_tail + MOVQ R8, BX + +emit_literal_done_repeat_emit_encodeBlockAsm12B: + MOVQ BX, dst_base+0(FP) + +emit_literal_skip_repeat_emit_encodeBlockAsm12B: + ADDL $0x05, AX + MOVL AX, BX + SUBL 24(SP), BX + MOVL 16(SP), BX + SUBL AX, BX + XORQ DI, DI + CMPQ BX, $0x08 + JL matchlen_single_repeat_extend + +matchlen_loopback_repeat_extend: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_repeat_extend + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_loop_repeat_extend: + LEAQ -8(BX), BX + LEAQ 8(DI), DI + CMPQ BX, $0x08 + JGE matchlen_loopback_repeat_extend + +matchlen_single_repeat_extend: + TESTQ BX, BX + JZ repeat_extend_forward_end_encodeBlockAsm12B + 
+matchlen_single_loopback_repeat_extend: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE repeat_extend_forward_end_encodeBlockAsm12B + LEAQ 1(DI), DI + DECQ BX + JNZ matchlen_single_loopback_repeat_extend + +repeat_extend_forward_end_encodeBlockAsm12B: + ADDL DI, AX + MOVL AX, BX + SUBL BP, BX + MOVL 24(SP), BP + MOVQ dst_base+0(FP), SI + MOVL 20(SP), DI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm12B + +emit_repeat_again_match_repeat_: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_match_repeat_ + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_ + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_repeat_ + +cant_repeat_two_offset_match_repeat_: + CMPL BX, $0x00000104 + JLT repeat_three_match_repeat_ + CMPL BX, $0x00010100 + JLT repeat_four_match_repeat_ + CMPL BX, $0x0100ffff + JLT repeat_five_match_repeat_ + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_repeat_ + +repeat_five_match_repeat_: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_four_match_repeat_: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_match_repeat_: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_match_repeat_: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_match_repeat_: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_as_copy_encodeBlockAsm12B: + CMPL BP, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm12B + CMPL BX, $0x40 + JLE 
four_bytes_remain_repeat_as_copy_encodeBlockAsm12B + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(BX), BX + ADDQ $0x05, SI + CMPL BX, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm12B + +emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP 
repeat_end_emit_encodeBlockAsm12B + +four_bytes_remain_repeat_as_copy_encodeBlockAsm12B: + TESTL BX, BX + JZ repeat_end_emit_encodeBlockAsm12B + MOVB $0x03, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm12B + +two_byte_offset_repeat_as_copy_encodeBlockAsm12B: + CMPL BX, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(BX), BX + ADDQ $0x03, SI + +emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm12B + 
+repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: + CMPL BX, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + CMPL BP, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + MOVB $0x01, DL + LEAQ -16(DX)(BX*4), BX + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeBlockAsm12B: + MOVB $0x02, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +repeat_end_emit_encodeBlockAsm12B: + MOVQ SI, dst_base+0(FP) + MOVL 16(SP), BX + CMPL AX, BX + JGT emit_remainder_encodeBlockAsm12B + JMP search_loop_encodeBlockAsm12B + +no_repeat_found_encodeBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ BP, DI + SHRQ $0x10, DI + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x34, DI + CMPL (CX)(BX*1), BP + SHRQ $0x08, BP + JEQ candidate_match_encodeBlockAsm12B + MOVL 32(SP)(DI*1), BX + CMPL (CX)(SI*1), BP + JEQ candidate2_match_encodeBlockAsm12B + LEAQ 2(AX), SI + MOVL SI, 32(SP)(DI*1) + SHRQ $0x08, BP + CMPL (CX)(BX*1), BP + JEQ candidate3_match_encodeBlockAsm12B + MOVL 28(SP), AX + JMP search_loop_encodeBlockAsm12B + +candidate3_match_encodeBlockAsm12B: + ADDL $0x02, AX + JMP candidate_match_encodeBlockAsm12B + +candidate2_match_encodeBlockAsm12B: + LEAQ -2(AX), BX + MOVL BX, 32(SP)(DI*1) + INCL AX + MOVL SI, BX + +candidate_match_encodeBlockAsm12B: + MOVL 20(SP), BP + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm12B + +match_extend_back_loop_encodeBlockAsm12B: + CMPL AX, BP + JG 
match_extend_back_end_encodeBlockAsm12B + MOVB -1(CX)(BX*1), DL + MOVB -1(CX)(AX*1), SI + CMPB DL, SI + JNE match_extend_back_end_encodeBlockAsm12B + LEAL -1(AX), AX + DECL BX + JZ match_extend_back_end_encodeBlockAsm12B + JMP match_extend_back_loop_encodeBlockAsm12B + +match_extend_back_end_encodeBlockAsm12B: + MOVL AX, BP + SUBL 20(SP), BP + LEAQ dst_base+0(FP)(BP*1), BP + CMPQ BP, (SP) + JL match_dst_size_check_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm12B: + MOVL BX, BP + MOVL 20(SP), SI + CMPL SI, BP + JEQ emit_literal_skip_match_emit_encodeBlockAsm12B + MOVL BP, DI + MOVL BP, 20(SP) + LEAQ (CX)(SI*1), BP + SUBL SI, DI + MOVQ dst_base+0(FP), SI + MOVQ DI, R8 + SUBL $0x01, R8 + JC emit_literal_done_match_emit_encodeBlockAsm12B + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm12B + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm12B + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm12B + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeBlockAsm12B + MOVB $0xfc, (SI) + MOVL R8, 1(SI) + ADDQ $0x05, SI + JMP memmove_match_emit_encodeBlockAsm12B + +four_bytes_match_emit_encodeBlockAsm12B: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (SI) + MOVW R8, 1(SI) + MOVB R9, 3(SI) + ADDQ $0x04, SI + JMP memmove_match_emit_encodeBlockAsm12B + +three_bytes_match_emit_encodeBlockAsm12B: + MOVB $0xf4, (SI) + MOVW R8, 1(SI) + ADDQ $0x03, SI + JMP memmove_match_emit_encodeBlockAsm12B + +two_bytes_match_emit_encodeBlockAsm12B: + MOVB $0xf0, (SI) + MOVB R8, 1(SI) + ADDQ $0x02, SI + JMP memmove_match_emit_encodeBlockAsm12B + +one_byte_match_emit_encodeBlockAsm12B: + SHLB $0x02, R8 + MOVB R8, (SI) + ADDQ $0x01, SI + +memmove_match_emit_encodeBlockAsm12B: + LEAQ (SI)(DI*1), R8 + NOP + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_tail: + TESTQ DI, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm12B + CMPQ DI, $0x02 + JBE 
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2 + CMPQ DI, $0x04 + JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_5through7 + JE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_9through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 + CMPQ DI, $0x40 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 + CMPQ DI, $0x80 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_65through128 + CMPQ DI, $0x00000100 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_129through256 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048 + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2: + MOVB (BP), R8 + MOVB -1(BP)(DI*1), BP + MOVB R8, (SI) + MOVB BP, -1(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4: + MOVL (BP), R8 + MOVL R8, (SI) + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3: + MOVW (BP), R8 + MOVB 2(BP), BP + MOVW R8, (SI) + MOVB BP, 2(SI) + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_5through7: + MOVL (BP), R8 + MOVL -4(BP)(DI*1), BP + MOVL R8, (SI) + MOVL BP, -4(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (BP), R8 + MOVQ R8, (SI) + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_9through16: + MOVQ (BP), R8 + MOVQ -8(BP)(DI*1), BP + MOVQ R8, (SI) + MOVQ BP, -8(SI)(DI*1) + JMP 
emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (BP), X0 + MOVOU -16(BP)(DI*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU -32(BP)(DI*1), X2 + MOVOU -16(BP)(DI*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DI*1) + MOVOU X3, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_65through128: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_129through256: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU -128(BP)(DI*1), X8 + MOVOU -112(BP)(DI*1), X9 + MOVOU -96(BP)(DI*1), X10 + MOVOU -80(BP)(DI*1), X11 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, -128(SI)(DI*1) + MOVOU X9, -112(SI)(DI*1) + MOVOU X10, -96(SI)(DI*1) + MOVOU X11, -80(SI)(DI*1) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048: 
+ LEAQ -256(DI), DI + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU 128(BP), X8 + MOVOU 144(BP), X9 + MOVOU 160(BP), X10 + MOVOU 176(BP), X11 + MOVOU 192(BP), X12 + MOVOU 208(BP), X13 + MOVOU 224(BP), X14 + MOVOU 240(BP), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, 128(SI) + MOVOU X9, 144(SI) + MOVOU X10, 160(SI) + MOVOU X11, 176(SI) + MOVOU X12, 192(SI) + MOVOU X13, 208(SI) + MOVOU X14, 224(SI) + MOVOU X15, 240(SI) + CMPQ DI, $0x00000100 + LEAQ 256(BP), BP + LEAQ 256(SI), SI + JGE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_tail + MOVQ R8, SI + +emit_literal_done_match_emit_encodeBlockAsm12B: + MOVQ SI, dst_base+0(FP) + +emit_literal_skip_match_emit_encodeBlockAsm12B: + NOP + +match_nolit_loop_encodeBlockAsm12B: + MOVL AX, BP + MOVL AX, BP + SUBL BX, BP + MOVL BP, 24(SP) + ADDL $0x04, AX + ADDL $0x04, BX + MOVL 16(SP), BP + SUBL AX, BP + XORQ DI, DI + CMPQ BP, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm12B + +matchlen_loopback_match_nolit_encodeBlockAsm12B: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_match_nolit_encodeBlockAsm12B + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP match_nolit_end_encodeBlockAsm12B + +matchlen_loop_match_nolit_encodeBlockAsm12B: + LEAQ -8(BP), BP + LEAQ 8(DI), DI + CMPQ BP, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm12B + +matchlen_single_match_nolit_encodeBlockAsm12B: + TESTQ BP, BP + JZ match_nolit_end_encodeBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeBlockAsm12B: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE match_nolit_end_encodeBlockAsm12B + LEAQ 1(DI), DI + DECQ BP + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B + 
+match_nolit_end_encodeBlockAsm12B: + MOVL 24(SP), BP + ADDQ $0x04, DI + MOVQ dst_base+0(FP), SI + ADDL DI, AX + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm12B + CMPL DI, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsm12B + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(DI), DI + ADDQ $0x05, SI + CMPL DI, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm12B + +emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm12B_emit_copy + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm12B_emit_copy + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy + +repeat_five_match_nolit_encodeBlockAsm12B_emit_copy: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_four_match_nolit_encodeBlockAsm12B_emit_copy: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + 
+repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +four_bytes_remain_match_nolit_encodeBlockAsm12B: + TESTL DI, DI + JZ match_nolit_emitcopy_end_encodeBlockAsm12B + MOVB $0x03, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +two_byte_offset_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(DI), DI + ADDQ $0x03, SI + +emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy_short: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm12B_emit_copy_short + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsm12B_emit_copy_short: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_four_match_nolit_encodeBlockAsm12B_emit_copy_short: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + 
+repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm12B + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm12B + MOVB $0x01, DL + LEAQ -16(DX)(DI*4), DI + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +emit_copy_three_match_nolit_encodeBlockAsm12B: + MOVB $0x02, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +match_nolit_emitcopy_end_encodeBlockAsm12B: + MOVQ SI, dst_base+0(FP) + MOVL AX, 20(SP) + CMPL AX, 16(SP) + JGE emit_remainder_encodeBlockAsm12B + CMPQ SI, (SP) + JL match_nolit_dst_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm12B: + MOVQ -2(CX)(AX*1), BP + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ BP, DI + SHRQ $0x10, BP + MOVQ BP, R8 + SHLQ $0x10, DI + IMULQ SI, DI + SHRQ $0x34, DI + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x34, R8 + MOVL 32(SP)(DI*1), SI + MOVL 32(SP)(R8*1), SI + LEAQ -2(AX), SI + MOVL SI, 32(SP)(DI*1) + MOVL AX, 32(SP)(R8*1) + CMPL (CX)(R8*1), BP + JEQ match_nolit_loop_encodeBlockAsm12B + INCL AX + JMP search_loop_encodeBlockAsm12B + +emit_remainder_encodeBlockAsm12B: + MOVQ src_len+32(FP), AX + SUBL 20(SP), AX + MOVQ dst_base+0(FP), DX + LEAQ (DX)(AX*1), DX + CMPQ DX, (SP) + JL 
emit_remainder_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm12B: + MOVQ src_len+32(FP), AX + MOVL 20(SP), DX + CMPL DX, AX + JEQ emit_literal_skip_emit_remainder_encodeBlockAsm12B + MOVL AX, BX + MOVL AX, 20(SP) + LEAQ (CX)(DX*1), AX + SUBL DX, BX + MOVQ dst_base+0(FP), CX + MOVQ BX, DX + SUBL $0x01, DX + JC emit_literal_done_emit_remainder_encodeBlockAsm12B + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm12B + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm12B + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBlockAsm12B + MOVB $0xfc, (CX) + MOVL DX, 1(CX) + ADDQ $0x05, CX + JMP memmove_emit_remainder_encodeBlockAsm12B + +four_bytes_emit_remainder_encodeBlockAsm12B: + MOVQ DX, BP + SHRL $0x10, BP + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BP, 3(CX) + ADDQ $0x04, CX + JMP memmove_emit_remainder_encodeBlockAsm12B + +three_bytes_emit_remainder_encodeBlockAsm12B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_emit_remainder_encodeBlockAsm12B + +two_bytes_emit_remainder_encodeBlockAsm12B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + JMP memmove_emit_remainder_encodeBlockAsm12B + +one_byte_emit_remainder_encodeBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsm12B: + LEAQ (CX)(BX*1), DX + NOP + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_tail: + TESTQ BX, BX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B + CMPQ BX, $0x02 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 + CMPQ BX, $0x04 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_5through7 + JE 
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_9through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 + CMPQ BX, $0x40 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 + CMPQ BX, $0x80 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_65through128 + CMPQ BX, $0x00000100 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_129through256 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_256through2048 + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: + MOVB (AX), DL + MOVB -1(AX)(BX*1), AL + MOVB DL, (CX) + MOVB AL, -1(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4: + MOVL (AX), DX + MOVL DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: + MOVW (AX), DX + MOVB 2(AX), AL + MOVW DX, (CX) + MOVB AL, 2(CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_5through7: + MOVL (AX), DX + MOVL -4(AX)(BX*1), AX + MOVL DX, (CX) + MOVL AX, -4(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: + MOVQ (AX), DX + MOVQ DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_9through16: + MOVQ (AX), DX + MOVQ -8(AX)(BX*1), AX + MOVQ DX, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP 
emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_65through128: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_129through256: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU -128(AX)(BX*1), X8 + MOVOU -112(AX)(BX*1), X9 + MOVOU -96(AX)(BX*1), X10 + MOVOU -80(AX)(BX*1), X11 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, -128(CX)(BX*1) + MOVOU X9, -112(CX)(BX*1) + MOVOU X10, -96(CX)(BX*1) + MOVOU X11, -80(CX)(BX*1) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_256through2048: + LEAQ -256(BX), BX + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU 
128(AX), X8 + MOVOU 144(AX), X9 + MOVOU 160(AX), X10 + MOVOU 176(AX), X11 + MOVOU 192(AX), X12 + MOVOU 208(AX), X13 + MOVOU 224(AX), X14 + MOVOU 240(AX), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, 128(CX) + MOVOU X9, 144(CX) + MOVOU X10, 160(CX) + MOVOU X11, 176(CX) + MOVOU X12, 192(CX) + MOVOU X13, 208(CX) + MOVOU X14, 224(CX) + MOVOU X15, 240(CX) + CMPQ BX, $0x00000100 + LEAQ 256(AX), AX + LEAQ 256(CX), CX + JGE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_256through2048 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_tail + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsm12B: + MOVQ CX, dst_base+0(FP) + +emit_literal_skip_emit_remainder_encodeBlockAsm12B: + MOVQ 8(SP), AX + SUBQ dst_base+0(FP), AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsmAvx(dst []byte, src []byte) int +// Requires: AVX, SSE2 +TEXT ·encodeBlockAsmAvx(SB), $65568-56 + MOVQ $0x00000200, AX + LEAQ 32(SP), CX + PXOR X0, X0 + +zero_loop_encodeBlockAsmAvx: + MOVOU X0, (CX) + MOVOU X0, 16(CX) + MOVOU X0, 32(CX) + MOVOU X0, 48(CX) + MOVOU X0, 64(CX) + MOVOU X0, 80(CX) + MOVOU X0, 96(CX) + MOVOU X0, 112(CX) + ADDQ $0x80, CX + DECQ AX + JNZ zero_loop_encodeBlockAsmAvx + MOVL AX, 20(SP) + MOVQ src_len+32(FP), AX + LEAQ -5(AX), CX + LEAQ -8(AX), BX + SHRQ $0x05, AX + SUBL AX, CX + MOVL BX, 16(SP) + MOVQ dst_base+0(FP), AX + MOVQ AX, 8(SP) + LEAQ (AX)(CX*1), CX + MOVQ CX, (SP) + MOVL $0x00000001, AX + MOVL AX, 24(SP) + MOVQ src_base+24(FP), CX + +search_loop_encodeBlockAsmAvx: + MOVQ (CX)(AX*1), BP + MOVL AX, BX + SUBL 20(SP), BX + SHRL $0x06, BX + LEAQ 4(AX)(BX*1), BX + MOVL 16(SP), SI + CMPL BX, SI + JGT emit_remainder_encodeBlockAsmAvx + MOVL BX, 28(SP) + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ BP, DI + MOVQ BP, R8 + SHRQ $0x08, R8 + SHLQ $0x10, DI + IMULQ BX, DI + SHRQ $0x30, DI + SHLQ $0x10, R8 + IMULQ BX, R8 + SHRQ $0x30, R8 
+ MOVL 32(SP)(DI*1), BX + MOVL 32(SP)(R8*1), SI + MOVL AX, 32(SP)(DI*1) + LEAL 1(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, DI + SUBL 24(SP), DI + MOVL 1(CX)(DI*1), R9 + MOVQ BP, R8 + SHLQ $0x08, R8 + CMPL R8, R9 + JNE no_repeat_found_encodeBlockAsmAvx + LEAQ 1(AX), BP + MOVL 20(SP), BX + TESTL DI, DI + JZ repeat_extend_back_end_encodeBlockAsmAvx + +repeat_extend_back_loop_encodeBlockAsmAvx: + CMPL BP, BX + JG repeat_extend_back_end_encodeBlockAsmAvx + MOVB -1(CX)(DI*1), DL + MOVB -1(CX)(BP*1), SI + CMPB DL, SI + JNE repeat_extend_back_end_encodeBlockAsmAvx + LEAQ -1(BP), BP + DECL DI + JZ repeat_extend_back_end_encodeBlockAsmAvx + JMP repeat_extend_back_loop_encodeBlockAsmAvx + +repeat_extend_back_end_encodeBlockAsmAvx: + MOVL 20(SP), BX + CMPL BX, BP + JEQ emit_literal_skip_repeat_emit_encodeBlockAsmAvx + MOVL BP, SI + MOVL BP, 20(SP) + LEAQ (CX)(BX*1), DI + SUBL BX, SI + MOVQ dst_base+0(FP), BX + MOVQ SI, R8 + SUBL $0x01, R8 + JC emit_literal_done_repeat_emit_encodeBlockAsmAvx + CMPL R8, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsmAvx + CMPL R8, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsmAvx + CMPL R8, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsmAvx + CMPL R8, $0x01000000 + JLT four_bytes_repeat_emit_encodeBlockAsmAvx + MOVB $0xfc, (BX) + MOVL R8, 1(BX) + ADDQ $0x05, BX + JMP memmove_repeat_emit_encodeBlockAsmAvx + +four_bytes_repeat_emit_encodeBlockAsmAvx: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (BX) + MOVW R8, 1(BX) + MOVB R9, 3(BX) + ADDQ $0x04, BX + JMP memmove_repeat_emit_encodeBlockAsmAvx + +three_bytes_repeat_emit_encodeBlockAsmAvx: + MOVB $0xf4, (BX) + MOVW R8, 1(BX) + ADDQ $0x03, BX + JMP memmove_repeat_emit_encodeBlockAsmAvx + +two_bytes_repeat_emit_encodeBlockAsmAvx: + MOVB $0xf0, (BX) + MOVB R8, 1(BX) + ADDQ $0x02, BX + JMP memmove_repeat_emit_encodeBlockAsmAvx + +one_byte_repeat_emit_encodeBlockAsmAvx: + SHLB $0x02, R8 + MOVB R8, (BX) + ADDQ $0x01, BX + +memmove_repeat_emit_encodeBlockAsmAvx: + LEAQ (BX)(SI*1), R8 + 
NOP + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_tail: + TESTQ SI, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsmAvx + CMPQ SI, $0x02 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_1or2 + CMPQ SI, $0x04 + JB emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_3 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_4 + CMPQ SI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_5through7 + JE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_9through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_17through32 + CMPQ SI, $0x40 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_33through64 + CMPQ SI, $0x80 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_65through128 + CMPQ SI, $0x00000100 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_129through256 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_avxUnaligned + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_1or2: + MOVB (DI), R8 + MOVB -1(DI)(SI*1), R9 + MOVB R8, (BX) + MOVB R9, -1(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_4: + MOVL (DI), R8 + MOVL R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_3: + MOVW (DI), R8 + MOVB 2(DI), R9 + MOVW R8, (BX) + MOVB R9, 2(BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_5through7: + MOVL (DI), R8 + MOVL -4(DI)(SI*1), R9 + MOVL R8, (BX) + MOVL R9, -4(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_8: + MOVQ (DI), R8 + MOVQ R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + 
+emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_9through16: + MOVQ (DI), R8 + MOVQ -8(DI)(SI*1), R9 + MOVQ R8, (BX) + MOVQ R9, -8(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(SI*1), X1 + MOVOU X0, (BX) + MOVOU X1, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(SI*1), X2 + MOVOU -16(DI)(SI*1), X3 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, -32(BX)(SI*1) + MOVOU X3, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_65through128: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_129through256: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU -128(DI)(SI*1), X8 + MOVOU -112(DI)(SI*1), X9 + MOVOU -96(DI)(SI*1), X10 + MOVOU -80(DI)(SI*1), X11 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, -128(BX)(SI*1) + MOVOU X9, -112(BX)(SI*1) + MOVOU X10, -96(BX)(SI*1) + MOVOU X11, -80(BX)(SI*1) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, 
-32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_256through2048: + LEAQ -256(SI), SI + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU 128(DI), X8 + MOVOU 144(DI), X9 + MOVOU 160(DI), X10 + MOVOU 176(DI), X11 + MOVOU 192(DI), X12 + MOVOU 208(DI), X13 + MOVOU 224(DI), X14 + MOVOU 240(DI), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, 128(BX) + MOVOU X9, 144(BX) + MOVOU X10, 160(BX) + MOVOU X11, 176(BX) + MOVOU X12, 192(BX) + MOVOU X13, 208(BX) + MOVOU X14, 224(BX) + MOVOU X15, 240(BX) + CMPQ SI, $0x00000100 + LEAQ 256(DI), DI + LEAQ 256(BX), BX + JGE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_256through2048 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_tail + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_avxUnaligned: + LEAQ (DI)(SI*1), R9 + MOVQ BX, R11 + MOVOU -128(R9), X5 + MOVOU -112(R9), X6 + MOVQ $0x00000080, R8 + ANDQ $0xffffffe0, BX + ADDQ $0x20, BX + MOVOU -96(R9), X7 + MOVOU -80(R9), X8 + MOVQ BX, R10 + SUBQ R11, R10 + MOVOU -64(R9), X9 + MOVOU -48(R9), X10 + SUBQ R10, SI + MOVOU -32(R9), X11 + MOVOU -16(R9), X12 + VMOVDQU (DI), Y4 + ADDQ R10, DI + SUBQ R8, SI + +emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_gobble_128_loop: + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU 64(DI), Y2 + VMOVDQU 96(DI), Y3 + ADDQ R8, DI + VMOVDQA Y0, (BX) + VMOVDQA Y1, 32(BX) + VMOVDQA Y2, 64(BX) + VMOVDQA Y3, 96(BX) + ADDQ R8, BX + SUBQ R8, SI + JA emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_gobble_128_loop + ADDQ R8, SI + ADDQ BX, SI + VMOVDQU Y4, (R11) + VZEROUPPER + MOVOU X5, -128(SI) + MOVOU X6, -112(SI) + MOVOU X7, -96(SI) + MOVOU X8, -80(SI) + MOVOU X9, -64(SI) + MOVOU X10, 
-48(SI) + MOVOU X11, -32(SI) + MOVOU X12, -16(SI) + JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx + MOVQ R8, BX + +emit_literal_done_repeat_emit_encodeBlockAsmAvx: + MOVQ BX, dst_base+0(FP) + +emit_literal_skip_repeat_emit_encodeBlockAsmAvx: + ADDL $0x05, AX + MOVL AX, BX + SUBL 24(SP), BX + MOVL 16(SP), BX + SUBL AX, BX + XORQ DI, DI + CMPQ BX, $0x08 + JL matchlen_single_repeat_extend + +matchlen_loopback_repeat_extend: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_repeat_extend + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP repeat_extend_forward_end_encodeBlockAsmAvx + +matchlen_loop_repeat_extend: + LEAQ -8(BX), BX + LEAQ 8(DI), DI + CMPQ BX, $0x08 + JGE matchlen_loopback_repeat_extend + +matchlen_single_repeat_extend: + TESTQ BX, BX + JZ repeat_extend_forward_end_encodeBlockAsmAvx + +matchlen_single_loopback_repeat_extend: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE repeat_extend_forward_end_encodeBlockAsmAvx + LEAQ 1(DI), DI + DECQ BX + JNZ matchlen_single_loopback_repeat_extend + +repeat_extend_forward_end_encodeBlockAsmAvx: + ADDL DI, AX + MOVL AX, BX + SUBL BP, BX + MOVL 24(SP), BP + MOVQ dst_base+0(FP), SI + MOVL 20(SP), DI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsmAvx + +emit_repeat_again_match_repeat_: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_match_repeat_ + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_ + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_repeat_ + +cant_repeat_two_offset_match_repeat_: + CMPL BX, $0x00000104 + JLT repeat_three_match_repeat_ + CMPL BX, $0x00010100 + JLT repeat_four_match_repeat_ + CMPL BX, $0x0100ffff + JLT repeat_five_match_repeat_ + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_repeat_ + +repeat_five_match_repeat_: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, 
SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_four_match_repeat_: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_three_match_repeat_: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_two_match_repeat_: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_two_offset_match_repeat_: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_as_copy_encodeBlockAsmAvx: + CMPL BP, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsmAvx + CMPL BX, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(BX), BX + ADDQ $0x05, SI + CMPL BX, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx + +emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ 
$0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx: + TESTL BX, BX + JZ repeat_end_emit_encodeBlockAsmAvx + MOVB $0x03, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +two_byte_offset_repeat_as_copy_encodeBlockAsmAvx: + CMPL BX, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsmAvx + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(BX), BX + ADDQ $0x03, SI + +emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short + CMPL BX, $0x0100ffff + JLT 
repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +two_byte_offset_short_repeat_as_copy_encodeBlockAsmAvx: + CMPL BX, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsmAvx + CMPL BP, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsmAvx + MOVB $0x01, DL + LEAQ -16(DX)(BX*4), BX + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsmAvx + +emit_copy_three_repeat_as_copy_encodeBlockAsmAvx: + MOVB $0x02, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +repeat_end_emit_encodeBlockAsmAvx: + MOVQ SI, dst_base+0(FP) + MOVL 16(SP), BX + CMPL AX, BX + JGT emit_remainder_encodeBlockAsmAvx + JMP search_loop_encodeBlockAsmAvx + +no_repeat_found_encodeBlockAsmAvx: + MOVQ 
$0x0000cf1bbcdcbf9b, R8 + MOVQ BP, DI + SHRQ $0x10, DI + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x30, DI + CMPL (CX)(BX*1), BP + SHRQ $0x08, BP + JEQ candidate_match_encodeBlockAsmAvx + MOVL 32(SP)(DI*1), BX + CMPL (CX)(SI*1), BP + JEQ candidate2_match_encodeBlockAsmAvx + LEAQ 2(AX), SI + MOVL SI, 32(SP)(DI*1) + SHRQ $0x08, BP + CMPL (CX)(BX*1), BP + JEQ candidate3_match_encodeBlockAsmAvx + MOVL 28(SP), AX + JMP search_loop_encodeBlockAsmAvx + +candidate3_match_encodeBlockAsmAvx: + ADDL $0x02, AX + JMP candidate_match_encodeBlockAsmAvx + +candidate2_match_encodeBlockAsmAvx: + LEAQ -2(AX), BX + MOVL BX, 32(SP)(DI*1) + INCL AX + MOVL SI, BX + +candidate_match_encodeBlockAsmAvx: + MOVL 20(SP), BP + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsmAvx + +match_extend_back_loop_encodeBlockAsmAvx: + CMPL AX, BP + JG match_extend_back_end_encodeBlockAsmAvx + MOVB -1(CX)(BX*1), DL + MOVB -1(CX)(AX*1), SI + CMPB DL, SI + JNE match_extend_back_end_encodeBlockAsmAvx + LEAL -1(AX), AX + DECL BX + JZ match_extend_back_end_encodeBlockAsmAvx + JMP match_extend_back_loop_encodeBlockAsmAvx + +match_extend_back_end_encodeBlockAsmAvx: + MOVL AX, BP + SUBL 20(SP), BP + LEAQ dst_base+0(FP)(BP*1), BP + CMPQ BP, (SP) + JL match_dst_size_check_encodeBlockAsmAvx + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsmAvx: + MOVL BX, BP + MOVL 20(SP), SI + CMPL SI, BP + JEQ emit_literal_skip_match_emit_encodeBlockAsmAvx + MOVL BP, DI + MOVL BP, 20(SP) + LEAQ (CX)(SI*1), BP + SUBL SI, DI + MOVQ dst_base+0(FP), SI + MOVQ DI, R8 + SUBL $0x01, R8 + JC emit_literal_done_match_emit_encodeBlockAsmAvx + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsmAvx + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsmAvx + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsmAvx + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeBlockAsmAvx + MOVB $0xfc, (SI) + MOVL R8, 1(SI) + ADDQ $0x05, SI + JMP memmove_match_emit_encodeBlockAsmAvx + 
+four_bytes_match_emit_encodeBlockAsmAvx: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (SI) + MOVW R8, 1(SI) + MOVB R9, 3(SI) + ADDQ $0x04, SI + JMP memmove_match_emit_encodeBlockAsmAvx + +three_bytes_match_emit_encodeBlockAsmAvx: + MOVB $0xf4, (SI) + MOVW R8, 1(SI) + ADDQ $0x03, SI + JMP memmove_match_emit_encodeBlockAsmAvx + +two_bytes_match_emit_encodeBlockAsmAvx: + MOVB $0xf0, (SI) + MOVB R8, 1(SI) + ADDQ $0x02, SI + JMP memmove_match_emit_encodeBlockAsmAvx + +one_byte_match_emit_encodeBlockAsmAvx: + SHLB $0x02, R8 + MOVB R8, (SI) + ADDQ $0x01, SI + +memmove_match_emit_encodeBlockAsmAvx: + LEAQ (SI)(DI*1), R8 + NOP + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_tail: + TESTQ DI, DI + JEQ emit_literal_done_match_emit_encodeBlockAsmAvx + CMPQ DI, $0x02 + JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_1or2 + CMPQ DI, $0x04 + JB emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_3 + JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_4 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_5through7 + JE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_9through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_17through32 + CMPQ DI, $0x40 + JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_33through64 + CMPQ DI, $0x80 + JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_65through128 + CMPQ DI, $0x00000100 + JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_129through256 + JMP emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_avxUnaligned + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_1or2: + MOVB (BP), R8 + MOVB -1(BP)(DI*1), R9 + MOVB R8, (SI) + MOVB R9, -1(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_4: + MOVL (BP), R8 + MOVL R8, 
(SI) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_3: + MOVW (BP), R8 + MOVB 2(BP), R9 + MOVW R8, (SI) + MOVB R9, 2(SI) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_5through7: + MOVL (BP), R8 + MOVL -4(BP)(DI*1), R9 + MOVL R8, (SI) + MOVL R9, -4(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_8: + MOVQ (BP), R8 + MOVQ R8, (SI) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_9through16: + MOVQ (BP), R8 + MOVQ -8(BP)(DI*1), R9 + MOVQ R8, (SI) + MOVQ R9, -8(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_17through32: + MOVOU (BP), X0 + MOVOU -16(BP)(DI*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_33through64: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU -32(BP)(DI*1), X2 + MOVOU -16(BP)(DI*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DI*1) + MOVOU X3, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_65through128: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_129through256: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 
80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU -128(BP)(DI*1), X8 + MOVOU -112(BP)(DI*1), X9 + MOVOU -96(BP)(DI*1), X10 + MOVOU -80(BP)(DI*1), X11 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, -128(SI)(DI*1) + MOVOU X9, -112(SI)(DI*1) + MOVOU X10, -96(SI)(DI*1) + MOVOU X11, -80(SI)(DI*1) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_256through2048: + LEAQ -256(DI), DI + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU 128(BP), X8 + MOVOU 144(BP), X9 + MOVOU 160(BP), X10 + MOVOU 176(BP), X11 + MOVOU 192(BP), X12 + MOVOU 208(BP), X13 + MOVOU 224(BP), X14 + MOVOU 240(BP), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, 128(SI) + MOVOU X9, 144(SI) + MOVOU X10, 160(SI) + MOVOU X11, 176(SI) + MOVOU X12, 192(SI) + MOVOU X13, 208(SI) + MOVOU X14, 224(SI) + MOVOU X15, 240(SI) + CMPQ DI, $0x00000100 + LEAQ 256(BP), BP + LEAQ 256(SI), SI + JGE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_256through2048 + JMP emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_tail + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_avxUnaligned: + LEAQ (BP)(DI*1), R9 + MOVQ SI, R11 + MOVOU -128(R9), X5 + MOVOU -112(R9), X6 + MOVQ $0x00000080, R8 + ANDQ $0xffffffe0, SI + ADDQ $0x20, SI + MOVOU -96(R9), X7 + MOVOU -80(R9), X8 + MOVQ SI, R10 + SUBQ R11, R10 + MOVOU -64(R9), X9 + MOVOU -48(R9), X10 + SUBQ R10, DI + MOVOU -32(R9), X11 + MOVOU 
-16(R9), X12 + VMOVDQU (BP), Y4 + ADDQ R10, BP + SUBQ R8, DI + +emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_gobble_128_loop: + VMOVDQU (BP), Y0 + VMOVDQU 32(BP), Y1 + VMOVDQU 64(BP), Y2 + VMOVDQU 96(BP), Y3 + ADDQ R8, BP + VMOVDQA Y0, (SI) + VMOVDQA Y1, 32(SI) + VMOVDQA Y2, 64(SI) + VMOVDQA Y3, 96(SI) + ADDQ R8, SI + SUBQ R8, DI + JA emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_gobble_128_loop + ADDQ R8, DI + ADDQ SI, DI + VMOVDQU Y4, (R11) + VZEROUPPER + MOVOU X5, -128(DI) + MOVOU X6, -112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) + JMP emit_literal_done_match_emit_encodeBlockAsmAvx + MOVQ R8, SI + +emit_literal_done_match_emit_encodeBlockAsmAvx: + MOVQ SI, dst_base+0(FP) + +emit_literal_skip_match_emit_encodeBlockAsmAvx: + NOP + +match_nolit_loop_encodeBlockAsmAvx: + MOVL AX, BP + MOVL AX, BP + SUBL BX, BP + MOVL BP, 24(SP) + ADDL $0x04, AX + ADDL $0x04, BX + MOVL 16(SP), BP + SUBL AX, BP + XORQ DI, DI + CMPQ BP, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsmAvx + +matchlen_loopback_match_nolit_encodeBlockAsmAvx: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_match_nolit_encodeBlockAsmAvx + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP match_nolit_end_encodeBlockAsmAvx + +matchlen_loop_match_nolit_encodeBlockAsmAvx: + LEAQ -8(BP), BP + LEAQ 8(DI), DI + CMPQ BP, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsmAvx + +matchlen_single_match_nolit_encodeBlockAsmAvx: + TESTQ BP, BP + JZ match_nolit_end_encodeBlockAsmAvx + +matchlen_single_loopback_match_nolit_encodeBlockAsmAvx: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE match_nolit_end_encodeBlockAsmAvx + LEAQ 1(DI), DI + DECQ BP + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsmAvx + +match_nolit_end_encodeBlockAsmAvx: + MOVL 24(SP), BP + ADDQ $0x04, DI + MOVQ dst_base+0(FP), SI + ADDL DI, AX + CMPL BP, $0x00010000 + JL 
two_byte_offset_match_nolit_encodeBlockAsmAvx + CMPL DI, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsmAvx + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(DI), DI + ADDQ $0x05, SI + CMPL DI, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsmAvx + +emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy + +repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, 
DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +four_bytes_remain_match_nolit_encodeBlockAsmAvx: + TESTL DI, DI + JZ match_nolit_emitcopy_end_encodeBlockAsmAvx + MOVB $0x03, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +two_byte_offset_match_nolit_encodeBlockAsmAvx: + CMPL DI, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsmAvx + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(DI), DI + ADDQ $0x03, SI + +emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy_short: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy_short + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy_short + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy_short + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy_short: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy_short: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy_short: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP 
match_nolit_emitcopy_end_encodeBlockAsmAvx + +repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy_short: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +two_byte_offset_short_match_nolit_encodeBlockAsmAvx: + CMPL DI, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsmAvx + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsmAvx + MOVB $0x01, DL + LEAQ -16(DX)(DI*4), DI + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsmAvx + +emit_copy_three_match_nolit_encodeBlockAsmAvx: + MOVB $0x02, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +match_nolit_emitcopy_end_encodeBlockAsmAvx: + MOVQ SI, dst_base+0(FP) + MOVL AX, 20(SP) + CMPL AX, 16(SP) + JGE emit_remainder_encodeBlockAsmAvx + CMPQ SI, (SP) + JL match_nolit_dst_ok_encodeBlockAsmAvx + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsmAvx: + MOVQ -2(CX)(AX*1), BP + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ BP, DI + SHRQ $0x10, BP + MOVQ BP, R8 + SHLQ $0x10, DI + IMULQ SI, DI + SHRQ $0x30, DI + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x30, R8 + MOVL 32(SP)(DI*1), SI + MOVL 32(SP)(R8*1), SI + LEAQ -2(AX), SI + MOVL SI, 32(SP)(DI*1) + MOVL AX, 32(SP)(R8*1) + CMPL (CX)(R8*1), BP + JEQ match_nolit_loop_encodeBlockAsmAvx + INCL AX + JMP search_loop_encodeBlockAsmAvx + +emit_remainder_encodeBlockAsmAvx: + MOVQ src_len+32(FP), AX + SUBL 20(SP), AX + MOVQ dst_base+0(FP), DX + LEAQ (DX)(AX*1), DX + CMPQ DX, (SP) + JL emit_remainder_ok_encodeBlockAsmAvx + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsmAvx: + MOVQ src_len+32(FP), AX + MOVL 
20(SP), DX + CMPL DX, AX + JEQ emit_literal_skip_emit_remainder_encodeBlockAsmAvx + MOVL AX, BX + MOVL AX, 20(SP) + LEAQ (CX)(DX*1), AX + SUBL DX, BX + MOVQ dst_base+0(FP), CX + MOVQ BX, DX + SUBL $0x01, DX + JC emit_literal_done_emit_remainder_encodeBlockAsmAvx + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsmAvx + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsmAvx + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsmAvx + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBlockAsmAvx + MOVB $0xfc, (CX) + MOVL DX, 1(CX) + ADDQ $0x05, CX + JMP memmove_emit_remainder_encodeBlockAsmAvx + +four_bytes_emit_remainder_encodeBlockAsmAvx: + MOVQ DX, BP + SHRL $0x10, BP + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BP, 3(CX) + ADDQ $0x04, CX + JMP memmove_emit_remainder_encodeBlockAsmAvx + +three_bytes_emit_remainder_encodeBlockAsmAvx: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_emit_remainder_encodeBlockAsmAvx + +two_bytes_emit_remainder_encodeBlockAsmAvx: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + JMP memmove_emit_remainder_encodeBlockAsmAvx + +one_byte_emit_remainder_encodeBlockAsmAvx: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsmAvx: + LEAQ (CX)(BX*1), DX + NOP + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_tail: + TESTQ BX, BX + JEQ emit_literal_done_emit_remainder_encodeBlockAsmAvx + CMPQ BX, $0x02 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_1or2 + CMPQ BX, $0x04 + JB emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_3 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_5through7 + JE emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_9through16 + CMPQ BX, $0x20 + JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_17through32 + CMPQ BX, $0x40 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_33through64 + CMPQ BX, $0x80 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_65through128 + CMPQ BX, $0x00000100 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_129through256 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_avxUnaligned + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_1or2: + MOVB (AX), DL + MOVB -1(AX)(BX*1), BP + MOVB DL, (CX) + MOVB BP, -1(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_4: + MOVL (AX), DX + MOVL DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_3: + MOVW (AX), DX + MOVB 2(AX), BP + MOVW DX, (CX) + MOVB BP, 2(CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_5through7: + MOVL (AX), DX + MOVL -4(AX)(BX*1), BP + MOVL DX, (CX) + MOVL BP, -4(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_8: + MOVQ (AX), DX + MOVQ DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_9through16: + MOVQ (AX), DX + MOVQ -8(AX)(BX*1), BP + MOVQ DX, (CX) + MOVQ BP, -8(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU 
X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_65through128: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_129through256: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU -128(AX)(BX*1), X8 + MOVOU -112(AX)(BX*1), X9 + MOVOU -96(AX)(BX*1), X10 + MOVOU -80(AX)(BX*1), X11 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, -128(CX)(BX*1) + MOVOU X9, -112(CX)(BX*1) + MOVOU X10, -96(CX)(BX*1) + MOVOU X11, -80(CX)(BX*1) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_256through2048: + LEAQ -256(BX), BX + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU 128(AX), X8 + MOVOU 144(AX), X9 + MOVOU 160(AX), X10 + MOVOU 176(AX), X11 + MOVOU 192(AX), X12 + MOVOU 208(AX), X13 + MOVOU 224(AX), X14 + MOVOU 240(AX), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 
48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, 128(CX) + MOVOU X9, 144(CX) + MOVOU X10, 160(CX) + MOVOU X11, 176(CX) + MOVOU X12, 192(CX) + MOVOU X13, 208(CX) + MOVOU X14, 224(CX) + MOVOU X15, 240(CX) + CMPQ BX, $0x00000100 + LEAQ 256(AX), AX + LEAQ 256(CX), CX + JGE emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_256through2048 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_tail + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_avxUnaligned: + LEAQ (AX)(BX*1), BP + MOVQ CX, DI + MOVOU -128(BP), X5 + MOVOU -112(BP), X6 + MOVQ $0x00000080, DX + ANDQ $0xffffffe0, CX + ADDQ $0x20, CX + MOVOU -96(BP), X7 + MOVOU -80(BP), X8 + MOVQ CX, SI + SUBQ DI, SI + MOVOU -64(BP), X9 + MOVOU -48(BP), X10 + SUBQ SI, BX + MOVOU -32(BP), X11 + MOVOU -16(BP), X12 + VMOVDQU (AX), Y4 + ADDQ SI, AX + SUBQ DX, BX + +emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_gobble_128_loop: + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + ADDQ DX, AX + VMOVDQA Y0, (CX) + VMOVDQA Y1, 32(CX) + VMOVDQA Y2, 64(CX) + VMOVDQA Y3, 96(CX) + ADDQ DX, CX + SUBQ DX, BX + JA emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_gobble_128_loop + ADDQ DX, BX + ADDQ CX, BX + VMOVDQU Y4, (DI) + VZEROUPPER + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) + JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsmAvx: + MOVQ CX, dst_base+0(FP) + +emit_literal_skip_emit_remainder_encodeBlockAsmAvx: + MOVQ 8(SP), AX + SUBQ dst_base+0(FP), AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm14BAvx(dst []byte, src []byte) int +// Requires: AVX, SSE2 +TEXT ·encodeBlockAsm14BAvx(SB), $16416-56 + MOVQ $0x00000080, AX + LEAQ 32(SP), CX + PXOR X0, X0 + +zero_loop_encodeBlockAsm14BAvx: + MOVOU X0, (CX) + MOVOU 
X0, 16(CX) + MOVOU X0, 32(CX) + MOVOU X0, 48(CX) + MOVOU X0, 64(CX) + MOVOU X0, 80(CX) + MOVOU X0, 96(CX) + MOVOU X0, 112(CX) + ADDQ $0x80, CX + DECQ AX + JNZ zero_loop_encodeBlockAsm14BAvx + MOVL AX, 20(SP) + MOVQ src_len+32(FP), AX + LEAQ -5(AX), CX + LEAQ -8(AX), BX + SHRQ $0x05, AX + SUBL AX, CX + MOVL BX, 16(SP) + MOVQ dst_base+0(FP), AX + MOVQ AX, 8(SP) + LEAQ (AX)(CX*1), CX + MOVQ CX, (SP) + MOVL $0x00000001, AX + MOVL AX, 24(SP) + MOVQ src_base+24(FP), CX + +search_loop_encodeBlockAsm14BAvx: + MOVQ (CX)(AX*1), BP + MOVL AX, BX + SUBL 20(SP), BX + SHRL $0x05, BX + LEAQ 4(AX)(BX*1), BX + MOVL 16(SP), SI + CMPL BX, SI + JGT emit_remainder_encodeBlockAsm14BAvx + MOVL BX, 28(SP) + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ BP, DI + MOVQ BP, R8 + SHRQ $0x08, R8 + SHLQ $0x10, DI + IMULQ BX, DI + SHRQ $0x32, DI + SHLQ $0x10, R8 + IMULQ BX, R8 + SHRQ $0x32, R8 + MOVL 32(SP)(DI*1), BX + MOVL 32(SP)(R8*1), SI + MOVL AX, 32(SP)(DI*1) + LEAL 1(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, DI + SUBL 24(SP), DI + MOVL 1(CX)(DI*1), R9 + MOVQ BP, R8 + SHLQ $0x08, R8 + CMPL R8, R9 + JNE no_repeat_found_encodeBlockAsm14BAvx + LEAQ 1(AX), BP + MOVL 20(SP), BX + TESTL DI, DI + JZ repeat_extend_back_end_encodeBlockAsm14BAvx + +repeat_extend_back_loop_encodeBlockAsm14BAvx: + CMPL BP, BX + JG repeat_extend_back_end_encodeBlockAsm14BAvx + MOVB -1(CX)(DI*1), DL + MOVB -1(CX)(BP*1), SI + CMPB DL, SI + JNE repeat_extend_back_end_encodeBlockAsm14BAvx + LEAQ -1(BP), BP + DECL DI + JZ repeat_extend_back_end_encodeBlockAsm14BAvx + JMP repeat_extend_back_loop_encodeBlockAsm14BAvx + +repeat_extend_back_end_encodeBlockAsm14BAvx: + MOVL 20(SP), BX + CMPL BX, BP + JEQ emit_literal_skip_repeat_emit_encodeBlockAsm14BAvx + MOVL BP, SI + MOVL BP, 20(SP) + LEAQ (CX)(BX*1), DI + SUBL BX, SI + MOVQ dst_base+0(FP), BX + MOVQ SI, R8 + SUBL $0x01, R8 + JC emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + CMPL R8, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm14BAvx + CMPL R8, $0x00000100 + JLT 
two_bytes_repeat_emit_encodeBlockAsm14BAvx + CMPL R8, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm14BAvx + CMPL R8, $0x01000000 + JLT four_bytes_repeat_emit_encodeBlockAsm14BAvx + MOVB $0xfc, (BX) + MOVL R8, 1(BX) + ADDQ $0x05, BX + JMP memmove_repeat_emit_encodeBlockAsm14BAvx + +four_bytes_repeat_emit_encodeBlockAsm14BAvx: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (BX) + MOVW R8, 1(BX) + MOVB R9, 3(BX) + ADDQ $0x04, BX + JMP memmove_repeat_emit_encodeBlockAsm14BAvx + +three_bytes_repeat_emit_encodeBlockAsm14BAvx: + MOVB $0xf4, (BX) + MOVW R8, 1(BX) + ADDQ $0x03, BX + JMP memmove_repeat_emit_encodeBlockAsm14BAvx + +two_bytes_repeat_emit_encodeBlockAsm14BAvx: + MOVB $0xf0, (BX) + MOVB R8, 1(BX) + ADDQ $0x02, BX + JMP memmove_repeat_emit_encodeBlockAsm14BAvx + +one_byte_repeat_emit_encodeBlockAsm14BAvx: + SHLB $0x02, R8 + MOVB R8, (BX) + ADDQ $0x01, BX + +memmove_repeat_emit_encodeBlockAsm14BAvx: + LEAQ (BX)(SI*1), R8 + NOP + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_tail: + TESTQ SI, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + CMPQ SI, $0x02 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_1or2 + CMPQ SI, $0x04 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_3 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_4 + CMPQ SI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_5through7 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_9through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_17through32 + CMPQ SI, $0x40 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_33through64 + CMPQ SI, $0x80 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_65through128 + CMPQ SI, $0x00000100 + JBE 
emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_129through256 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_avxUnaligned + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_1or2: + MOVB (DI), R8 + MOVB -1(DI)(SI*1), R9 + MOVB R8, (BX) + MOVB R9, -1(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_4: + MOVL (DI), R8 + MOVL R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_3: + MOVW (DI), R8 + MOVB 2(DI), R9 + MOVW R8, (BX) + MOVB R9, 2(BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_5through7: + MOVL (DI), R8 + MOVL -4(DI)(SI*1), R9 + MOVL R8, (BX) + MOVL R9, -4(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_8: + MOVQ (DI), R8 + MOVQ R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_9through16: + MOVQ (DI), R8 + MOVQ -8(DI)(SI*1), R9 + MOVQ R8, (BX) + MOVQ R9, -8(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(SI*1), X1 + MOVOU X0, (BX) + MOVOU X1, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(SI*1), X2 + MOVOU -16(DI)(SI*1), X3 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, -32(BX)(SI*1) + MOVOU X3, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_65through128: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 
-64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_129through256: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU -128(DI)(SI*1), X8 + MOVOU -112(DI)(SI*1), X9 + MOVOU -96(DI)(SI*1), X10 + MOVOU -80(DI)(SI*1), X11 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, -128(BX)(SI*1) + MOVOU X9, -112(BX)(SI*1) + MOVOU X10, -96(BX)(SI*1) + MOVOU X11, -80(BX)(SI*1) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_256through2048: + LEAQ -256(SI), SI + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU 128(DI), X8 + MOVOU 144(DI), X9 + MOVOU 160(DI), X10 + MOVOU 176(DI), X11 + MOVOU 192(DI), X12 + MOVOU 208(DI), X13 + MOVOU 224(DI), X14 + MOVOU 240(DI), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, 128(BX) + MOVOU X9, 144(BX) + MOVOU X10, 160(BX) + MOVOU X11, 176(BX) + MOVOU X12, 192(BX) + MOVOU X13, 208(BX) + MOVOU X14, 224(BX) + MOVOU X15, 240(BX) + CMPQ SI, $0x00000100 + LEAQ 256(DI), DI + LEAQ 
256(BX), BX + JGE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_256through2048 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_tail + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_avxUnaligned: + LEAQ (DI)(SI*1), R9 + MOVQ BX, R11 + MOVOU -128(R9), X5 + MOVOU -112(R9), X6 + MOVQ $0x00000080, R8 + ANDQ $0xffffffe0, BX + ADDQ $0x20, BX + MOVOU -96(R9), X7 + MOVOU -80(R9), X8 + MOVQ BX, R10 + SUBQ R11, R10 + MOVOU -64(R9), X9 + MOVOU -48(R9), X10 + SUBQ R10, SI + MOVOU -32(R9), X11 + MOVOU -16(R9), X12 + VMOVDQU (DI), Y4 + ADDQ R10, DI + SUBQ R8, SI + +emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_gobble_128_loop: + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU 64(DI), Y2 + VMOVDQU 96(DI), Y3 + ADDQ R8, DI + VMOVDQA Y0, (BX) + VMOVDQA Y1, 32(BX) + VMOVDQA Y2, 64(BX) + VMOVDQA Y3, 96(BX) + ADDQ R8, BX + SUBQ R8, SI + JA emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_gobble_128_loop + ADDQ R8, SI + ADDQ BX, SI + VMOVDQU Y4, (R11) + VZEROUPPER + MOVOU X5, -128(SI) + MOVOU X6, -112(SI) + MOVOU X7, -96(SI) + MOVOU X8, -80(SI) + MOVOU X9, -64(SI) + MOVOU X10, -48(SI) + MOVOU X11, -32(SI) + MOVOU X12, -16(SI) + JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx + MOVQ R8, BX + +emit_literal_done_repeat_emit_encodeBlockAsm14BAvx: + MOVQ BX, dst_base+0(FP) + +emit_literal_skip_repeat_emit_encodeBlockAsm14BAvx: + ADDL $0x05, AX + MOVL AX, BX + SUBL 24(SP), BX + MOVL 16(SP), BX + SUBL AX, BX + XORQ DI, DI + CMPQ BX, $0x08 + JL matchlen_single_repeat_extend + +matchlen_loopback_repeat_extend: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_repeat_extend + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP repeat_extend_forward_end_encodeBlockAsm14BAvx + +matchlen_loop_repeat_extend: + LEAQ -8(BX), BX + LEAQ 8(DI), DI + CMPQ BX, $0x08 + JGE matchlen_loopback_repeat_extend + +matchlen_single_repeat_extend: + TESTQ BX, BX + JZ repeat_extend_forward_end_encodeBlockAsm14BAvx 
+ +matchlen_single_loopback_repeat_extend: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE repeat_extend_forward_end_encodeBlockAsm14BAvx + LEAQ 1(DI), DI + DECQ BX + JNZ matchlen_single_loopback_repeat_extend + +repeat_extend_forward_end_encodeBlockAsm14BAvx: + ADDL DI, AX + MOVL AX, BX + SUBL BP, BX + MOVL 24(SP), BP + MOVQ dst_base+0(FP), SI + MOVL 20(SP), DI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm14BAvx + +emit_repeat_again_match_repeat_: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_match_repeat_ + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_ + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_repeat_ + +cant_repeat_two_offset_match_repeat_: + CMPL BX, $0x00000104 + JLT repeat_three_match_repeat_ + CMPL BX, $0x00010100 + JLT repeat_four_match_repeat_ + CMPL BX, $0x0100ffff + JLT repeat_five_match_repeat_ + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_repeat_ + +repeat_five_match_repeat_: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_four_match_repeat_: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_three_match_repeat_: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_two_match_repeat_: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_two_offset_match_repeat_: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_as_copy_encodeBlockAsm14BAvx: + CMPL BP, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm14BAvx + CMPL BX, $0x40 + JLE 
four_bytes_remain_repeat_as_copy_encodeBlockAsm14BAvx + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(BX), BX + ADDQ $0x05, SI + CMPL BX, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm14BAvx + +emit_repeat_again_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_four_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_three_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_two_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL 
BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +four_bytes_remain_repeat_as_copy_encodeBlockAsm14BAvx: + TESTL BX, BX + JZ repeat_end_emit_encodeBlockAsm14BAvx + MOVB $0x03, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +two_byte_offset_repeat_as_copy_encodeBlockAsm14BAvx: + CMPL BX, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm14BAvx + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(BX), BX + ADDQ $0x03, SI + +emit_repeat_again_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_four_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_three_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) 
+ MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_two_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm14BAvx: + CMPL BX, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm14BAvx + CMPL BP, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm14BAvx + MOVB $0x01, DL + LEAQ -16(DX)(BX*4), BX + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm14BAvx + +emit_copy_three_repeat_as_copy_encodeBlockAsm14BAvx: + MOVB $0x02, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +repeat_end_emit_encodeBlockAsm14BAvx: + MOVQ SI, dst_base+0(FP) + MOVL 16(SP), BX + CMPL AX, BX + JGT emit_remainder_encodeBlockAsm14BAvx + JMP search_loop_encodeBlockAsm14BAvx + +no_repeat_found_encodeBlockAsm14BAvx: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ BP, DI + SHRQ $0x10, DI + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x32, DI + CMPL (CX)(BX*1), BP + SHRQ $0x08, BP + JEQ candidate_match_encodeBlockAsm14BAvx + MOVL 32(SP)(DI*1), BX + CMPL (CX)(SI*1), BP + JEQ candidate2_match_encodeBlockAsm14BAvx + LEAQ 2(AX), SI + MOVL SI, 32(SP)(DI*1) + SHRQ $0x08, BP + CMPL (CX)(BX*1), BP + JEQ candidate3_match_encodeBlockAsm14BAvx + MOVL 28(SP), AX + JMP search_loop_encodeBlockAsm14BAvx + +candidate3_match_encodeBlockAsm14BAvx: + ADDL $0x02, AX + JMP candidate_match_encodeBlockAsm14BAvx + +candidate2_match_encodeBlockAsm14BAvx: + LEAQ -2(AX), BX + MOVL BX, 32(SP)(DI*1) + INCL AX + MOVL SI, BX + +candidate_match_encodeBlockAsm14BAvx: + MOVL 
20(SP), BP + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm14BAvx + +match_extend_back_loop_encodeBlockAsm14BAvx: + CMPL AX, BP + JG match_extend_back_end_encodeBlockAsm14BAvx + MOVB -1(CX)(BX*1), DL + MOVB -1(CX)(AX*1), SI + CMPB DL, SI + JNE match_extend_back_end_encodeBlockAsm14BAvx + LEAL -1(AX), AX + DECL BX + JZ match_extend_back_end_encodeBlockAsm14BAvx + JMP match_extend_back_loop_encodeBlockAsm14BAvx + +match_extend_back_end_encodeBlockAsm14BAvx: + MOVL AX, BP + SUBL 20(SP), BP + LEAQ dst_base+0(FP)(BP*1), BP + CMPQ BP, (SP) + JL match_dst_size_check_encodeBlockAsm14BAvx + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm14BAvx: + MOVL BX, BP + MOVL 20(SP), SI + CMPL SI, BP + JEQ emit_literal_skip_match_emit_encodeBlockAsm14BAvx + MOVL BP, DI + MOVL BP, 20(SP) + LEAQ (CX)(SI*1), BP + SUBL SI, DI + MOVQ dst_base+0(FP), SI + MOVQ DI, R8 + SUBL $0x01, R8 + JC emit_literal_done_match_emit_encodeBlockAsm14BAvx + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm14BAvx + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm14BAvx + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm14BAvx + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeBlockAsm14BAvx + MOVB $0xfc, (SI) + MOVL R8, 1(SI) + ADDQ $0x05, SI + JMP memmove_match_emit_encodeBlockAsm14BAvx + +four_bytes_match_emit_encodeBlockAsm14BAvx: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (SI) + MOVW R8, 1(SI) + MOVB R9, 3(SI) + ADDQ $0x04, SI + JMP memmove_match_emit_encodeBlockAsm14BAvx + +three_bytes_match_emit_encodeBlockAsm14BAvx: + MOVB $0xf4, (SI) + MOVW R8, 1(SI) + ADDQ $0x03, SI + JMP memmove_match_emit_encodeBlockAsm14BAvx + +two_bytes_match_emit_encodeBlockAsm14BAvx: + MOVB $0xf0, (SI) + MOVB R8, 1(SI) + ADDQ $0x02, SI + JMP memmove_match_emit_encodeBlockAsm14BAvx + +one_byte_match_emit_encodeBlockAsm14BAvx: + SHLB $0x02, R8 + MOVB R8, (SI) + ADDQ $0x01, SI + +memmove_match_emit_encodeBlockAsm14BAvx: + LEAQ (SI)(DI*1), R8 + NOP + 
+emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_tail: + TESTQ DI, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm14BAvx + CMPQ DI, $0x02 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_1or2 + CMPQ DI, $0x04 + JB emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_3 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_4 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_5through7 + JE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_9through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_17through32 + CMPQ DI, $0x40 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_33through64 + CMPQ DI, $0x80 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_65through128 + CMPQ DI, $0x00000100 + JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_129through256 + JMP emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_avxUnaligned + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_1or2: + MOVB (BP), R8 + MOVB -1(BP)(DI*1), R9 + MOVB R8, (SI) + MOVB R9, -1(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_4: + MOVL (BP), R8 + MOVL R8, (SI) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_3: + MOVW (BP), R8 + MOVB 2(BP), R9 + MOVW R8, (SI) + MOVB R9, 2(SI) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_5through7: + MOVL (BP), R8 + MOVL -4(BP)(DI*1), R9 + MOVL R8, (SI) + MOVL R9, -4(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_8: + MOVQ (BP), R8 + MOVQ R8, (SI) + JMP 
emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_9through16: + MOVQ (BP), R8 + MOVQ -8(BP)(DI*1), R9 + MOVQ R8, (SI) + MOVQ R9, -8(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_17through32: + MOVOU (BP), X0 + MOVOU -16(BP)(DI*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_33through64: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU -32(BP)(DI*1), X2 + MOVOU -16(BP)(DI*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DI*1) + MOVOU X3, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_65through128: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_129through256: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU -128(BP)(DI*1), X8 + MOVOU -112(BP)(DI*1), X9 + MOVOU -96(BP)(DI*1), X10 + MOVOU -80(BP)(DI*1), X11 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, -128(SI)(DI*1) + MOVOU X9, -112(SI)(DI*1) + MOVOU X10, -96(SI)(DI*1) + MOVOU X11, -80(SI)(DI*1) + 
MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_256through2048: + LEAQ -256(DI), DI + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU 128(BP), X8 + MOVOU 144(BP), X9 + MOVOU 160(BP), X10 + MOVOU 176(BP), X11 + MOVOU 192(BP), X12 + MOVOU 208(BP), X13 + MOVOU 224(BP), X14 + MOVOU 240(BP), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, 128(SI) + MOVOU X9, 144(SI) + MOVOU X10, 160(SI) + MOVOU X11, 176(SI) + MOVOU X12, 192(SI) + MOVOU X13, 208(SI) + MOVOU X14, 224(SI) + MOVOU X15, 240(SI) + CMPQ DI, $0x00000100 + LEAQ 256(BP), BP + LEAQ 256(SI), SI + JGE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_256through2048 + JMP emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_tail + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_avxUnaligned: + LEAQ (BP)(DI*1), R9 + MOVQ SI, R11 + MOVOU -128(R9), X5 + MOVOU -112(R9), X6 + MOVQ $0x00000080, R8 + ANDQ $0xffffffe0, SI + ADDQ $0x20, SI + MOVOU -96(R9), X7 + MOVOU -80(R9), X8 + MOVQ SI, R10 + SUBQ R11, R10 + MOVOU -64(R9), X9 + MOVOU -48(R9), X10 + SUBQ R10, DI + MOVOU -32(R9), X11 + MOVOU -16(R9), X12 + VMOVDQU (BP), Y4 + ADDQ R10, BP + SUBQ R8, DI + +emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_gobble_128_loop: + VMOVDQU (BP), Y0 + VMOVDQU 32(BP), Y1 + VMOVDQU 64(BP), Y2 + VMOVDQU 96(BP), Y3 + ADDQ R8, BP + VMOVDQA Y0, (SI) + VMOVDQA Y1, 32(SI) + VMOVDQA Y2, 64(SI) + VMOVDQA Y3, 96(SI) + ADDQ R8, SI + SUBQ R8, DI + JA emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_gobble_128_loop + ADDQ R8, DI + ADDQ SI, DI + VMOVDQU Y4, (R11) + VZEROUPPER + MOVOU X5, -128(DI) + MOVOU X6, 
-112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) + JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx + MOVQ R8, SI + +emit_literal_done_match_emit_encodeBlockAsm14BAvx: + MOVQ SI, dst_base+0(FP) + +emit_literal_skip_match_emit_encodeBlockAsm14BAvx: + NOP + +match_nolit_loop_encodeBlockAsm14BAvx: + MOVL AX, BP + MOVL AX, BP + SUBL BX, BP + MOVL BP, 24(SP) + ADDL $0x04, AX + ADDL $0x04, BX + MOVL 16(SP), BP + SUBL AX, BP + XORQ DI, DI + CMPQ BP, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm14BAvx + +matchlen_loopback_match_nolit_encodeBlockAsm14BAvx: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_match_nolit_encodeBlockAsm14BAvx + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP match_nolit_end_encodeBlockAsm14BAvx + +matchlen_loop_match_nolit_encodeBlockAsm14BAvx: + LEAQ -8(BP), BP + LEAQ 8(DI), DI + CMPQ BP, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm14BAvx + +matchlen_single_match_nolit_encodeBlockAsm14BAvx: + TESTQ BP, BP + JZ match_nolit_end_encodeBlockAsm14BAvx + +matchlen_single_loopback_match_nolit_encodeBlockAsm14BAvx: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE match_nolit_end_encodeBlockAsm14BAvx + LEAQ 1(DI), DI + DECQ BP + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm14BAvx + +match_nolit_end_encodeBlockAsm14BAvx: + MOVL 24(SP), BP + ADDQ $0x04, DI + MOVQ dst_base+0(FP), SI + ADDL DI, AX + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm14BAvx + CMPL DI, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsm14BAvx + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(DI), DI + ADDQ $0x05, SI + CMPL DI, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm14BAvx + +emit_repeat_again_match_nolit_encodeBlockAsm14BAvx_emit_copy: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm14BAvx_emit_copy + CMPL R8, $0x0c + JGE 
cant_repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm14BAvx_emit_copy + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm14BAvx_emit_copy + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm14BAvx_emit_copy + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm14BAvx_emit_copy + +repeat_five_match_nolit_encodeBlockAsm14BAvx_emit_copy: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +repeat_four_match_nolit_encodeBlockAsm14BAvx_emit_copy: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +repeat_three_match_nolit_encodeBlockAsm14BAvx_emit_copy: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +repeat_two_match_nolit_encodeBlockAsm14BAvx_emit_copy: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +four_bytes_remain_match_nolit_encodeBlockAsm14BAvx: + TESTL DI, DI + JZ match_nolit_emitcopy_end_encodeBlockAsm14BAvx + MOVB $0x03, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + 
+two_byte_offset_match_nolit_encodeBlockAsm14BAvx: + CMPL DI, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm14BAvx + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(DI), DI + ADDQ $0x03, SI + +emit_repeat_again_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm14BAvx_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm14BAvx_emit_copy_short + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm14BAvx_emit_copy_short + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm14BAvx_emit_copy_short + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm14BAvx_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +repeat_four_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +repeat_three_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +repeat_two_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: + XORQ R8, R8 + 
LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +two_byte_offset_short_match_nolit_encodeBlockAsm14BAvx: + CMPL DI, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm14BAvx + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm14BAvx + MOVB $0x01, DL + LEAQ -16(DX)(DI*4), DI + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx + +emit_copy_three_match_nolit_encodeBlockAsm14BAvx: + MOVB $0x02, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +match_nolit_emitcopy_end_encodeBlockAsm14BAvx: + MOVQ SI, dst_base+0(FP) + MOVL AX, 20(SP) + CMPL AX, 16(SP) + JGE emit_remainder_encodeBlockAsm14BAvx + CMPQ SI, (SP) + JL match_nolit_dst_ok_encodeBlockAsm14BAvx + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm14BAvx: + MOVQ -2(CX)(AX*1), BP + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ BP, DI + SHRQ $0x10, BP + MOVQ BP, R8 + SHLQ $0x10, DI + IMULQ SI, DI + SHRQ $0x32, DI + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x32, R8 + MOVL 32(SP)(DI*1), SI + MOVL 32(SP)(R8*1), SI + LEAQ -2(AX), SI + MOVL SI, 32(SP)(DI*1) + MOVL AX, 32(SP)(R8*1) + CMPL (CX)(R8*1), BP + JEQ match_nolit_loop_encodeBlockAsm14BAvx + INCL AX + JMP search_loop_encodeBlockAsm14BAvx + +emit_remainder_encodeBlockAsm14BAvx: + MOVQ src_len+32(FP), AX + SUBL 20(SP), AX + MOVQ dst_base+0(FP), DX + LEAQ (DX)(AX*1), DX + CMPQ DX, (SP) + JL emit_remainder_ok_encodeBlockAsm14BAvx + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm14BAvx: + MOVQ src_len+32(FP), AX + MOVL 20(SP), DX + CMPL DX, AX + JEQ emit_literal_skip_emit_remainder_encodeBlockAsm14BAvx + MOVL AX, BX + MOVL AX, 20(SP) + LEAQ (CX)(DX*1), AX + SUBL DX, BX + MOVQ dst_base+0(FP), CX + MOVQ BX, DX + SUBL $0x01, DX + JC 
emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm14BAvx + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm14BAvx + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm14BAvx + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBlockAsm14BAvx + MOVB $0xfc, (CX) + MOVL DX, 1(CX) + ADDQ $0x05, CX + JMP memmove_emit_remainder_encodeBlockAsm14BAvx + +four_bytes_emit_remainder_encodeBlockAsm14BAvx: + MOVQ DX, BP + SHRL $0x10, BP + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BP, 3(CX) + ADDQ $0x04, CX + JMP memmove_emit_remainder_encodeBlockAsm14BAvx + +three_bytes_emit_remainder_encodeBlockAsm14BAvx: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_emit_remainder_encodeBlockAsm14BAvx + +two_bytes_emit_remainder_encodeBlockAsm14BAvx: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + JMP memmove_emit_remainder_encodeBlockAsm14BAvx + +one_byte_emit_remainder_encodeBlockAsm14BAvx: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsm14BAvx: + LEAQ (CX)(BX*1), DX + NOP + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_tail: + TESTQ BX, BX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + CMPQ BX, $0x02 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_1or2 + CMPQ BX, $0x04 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_3 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_5through7 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_9through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_17through32 + CMPQ BX, $0x40 + JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_33through64 + CMPQ BX, $0x80 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_65through128 + CMPQ BX, $0x00000100 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_129through256 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_avxUnaligned + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_1or2: + MOVB (AX), DL + MOVB -1(AX)(BX*1), BP + MOVB DL, (CX) + MOVB BP, -1(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_4: + MOVL (AX), DX + MOVL DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_3: + MOVW (AX), DX + MOVB 2(AX), BP + MOVW DX, (CX) + MOVB BP, 2(CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_5through7: + MOVL (AX), DX + MOVL -4(AX)(BX*1), BP + MOVL DX, (CX) + MOVL BP, -4(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_8: + MOVQ (AX), DX + MOVQ DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_9through16: + MOVQ (AX), DX + MOVQ -8(AX)(BX*1), BP + MOVQ DX, (CX) + MOVQ BP, -8(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 
-32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_65through128: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_129through256: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU -128(AX)(BX*1), X8 + MOVOU -112(AX)(BX*1), X9 + MOVOU -96(AX)(BX*1), X10 + MOVOU -80(AX)(BX*1), X11 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, -128(CX)(BX*1) + MOVOU X9, -112(CX)(BX*1) + MOVOU X10, -96(CX)(BX*1) + MOVOU X11, -80(CX)(BX*1) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_256through2048: + LEAQ -256(BX), BX + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU 128(AX), X8 + MOVOU 144(AX), X9 + MOVOU 160(AX), X10 + MOVOU 176(AX), X11 + MOVOU 192(AX), X12 + MOVOU 208(AX), X13 + MOVOU 224(AX), X14 + MOVOU 240(AX), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) 
+ MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, 128(CX) + MOVOU X9, 144(CX) + MOVOU X10, 160(CX) + MOVOU X11, 176(CX) + MOVOU X12, 192(CX) + MOVOU X13, 208(CX) + MOVOU X14, 224(CX) + MOVOU X15, 240(CX) + CMPQ BX, $0x00000100 + LEAQ 256(AX), AX + LEAQ 256(CX), CX + JGE emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_256through2048 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_tail + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_avxUnaligned: + LEAQ (AX)(BX*1), BP + MOVQ CX, DI + MOVOU -128(BP), X5 + MOVOU -112(BP), X6 + MOVQ $0x00000080, DX + ANDQ $0xffffffe0, CX + ADDQ $0x20, CX + MOVOU -96(BP), X7 + MOVOU -80(BP), X8 + MOVQ CX, SI + SUBQ DI, SI + MOVOU -64(BP), X9 + MOVOU -48(BP), X10 + SUBQ SI, BX + MOVOU -32(BP), X11 + MOVOU -16(BP), X12 + VMOVDQU (AX), Y4 + ADDQ SI, AX + SUBQ DX, BX + +emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_gobble_128_loop: + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + ADDQ DX, AX + VMOVDQA Y0, (CX) + VMOVDQA Y1, 32(CX) + VMOVDQA Y2, 64(CX) + VMOVDQA Y3, 96(CX) + ADDQ DX, CX + SUBQ DX, BX + JA emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_gobble_128_loop + ADDQ DX, BX + ADDQ CX, BX + VMOVDQU Y4, (DI) + VZEROUPPER + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsm14BAvx: + MOVQ CX, dst_base+0(FP) + +emit_literal_skip_emit_remainder_encodeBlockAsm14BAvx: + MOVQ 8(SP), AX + SUBQ dst_base+0(FP), AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm12BAvx(dst []byte, src []byte) int +// Requires: AVX, SSE2 +TEXT ·encodeBlockAsm12BAvx(SB), $4128-56 + MOVQ $0x00000020, AX + LEAQ 32(SP), CX + PXOR X0, X0 + +zero_loop_encodeBlockAsm12BAvx: + MOVOU X0, (CX) + MOVOU 
X0, 16(CX) + MOVOU X0, 32(CX) + MOVOU X0, 48(CX) + MOVOU X0, 64(CX) + MOVOU X0, 80(CX) + MOVOU X0, 96(CX) + MOVOU X0, 112(CX) + ADDQ $0x80, CX + DECQ AX + JNZ zero_loop_encodeBlockAsm12BAvx + MOVL AX, 20(SP) + MOVQ src_len+32(FP), AX + LEAQ -5(AX), CX + LEAQ -8(AX), BX + SHRQ $0x05, AX + SUBL AX, CX + MOVL BX, 16(SP) + MOVQ dst_base+0(FP), AX + MOVQ AX, 8(SP) + LEAQ (AX)(CX*1), CX + MOVQ CX, (SP) + MOVL $0x00000001, AX + MOVL AX, 24(SP) + MOVQ src_base+24(FP), CX + +search_loop_encodeBlockAsm12BAvx: + MOVQ (CX)(AX*1), BP + MOVL AX, BX + SUBL 20(SP), BX + SHRL $0x04, BX + LEAQ 4(AX)(BX*1), BX + MOVL 16(SP), SI + CMPL BX, SI + JGT emit_remainder_encodeBlockAsm12BAvx + MOVL BX, 28(SP) + MOVQ $0x0000cf1bbcdcbf9b, BX + MOVQ BP, DI + MOVQ BP, R8 + SHRQ $0x08, R8 + SHLQ $0x10, DI + IMULQ BX, DI + SHRQ $0x34, DI + SHLQ $0x10, R8 + IMULQ BX, R8 + SHRQ $0x34, R8 + MOVL 32(SP)(DI*1), BX + MOVL 32(SP)(R8*1), SI + MOVL AX, 32(SP)(DI*1) + LEAL 1(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, DI + SUBL 24(SP), DI + MOVL 1(CX)(DI*1), R9 + MOVQ BP, R8 + SHLQ $0x08, R8 + CMPL R8, R9 + JNE no_repeat_found_encodeBlockAsm12BAvx + LEAQ 1(AX), BP + MOVL 20(SP), BX + TESTL DI, DI + JZ repeat_extend_back_end_encodeBlockAsm12BAvx + +repeat_extend_back_loop_encodeBlockAsm12BAvx: + CMPL BP, BX + JG repeat_extend_back_end_encodeBlockAsm12BAvx + MOVB -1(CX)(DI*1), DL + MOVB -1(CX)(BP*1), SI + CMPB DL, SI + JNE repeat_extend_back_end_encodeBlockAsm12BAvx + LEAQ -1(BP), BP + DECL DI + JZ repeat_extend_back_end_encodeBlockAsm12BAvx + JMP repeat_extend_back_loop_encodeBlockAsm12BAvx + +repeat_extend_back_end_encodeBlockAsm12BAvx: + MOVL 20(SP), BX + CMPL BX, BP + JEQ emit_literal_skip_repeat_emit_encodeBlockAsm12BAvx + MOVL BP, SI + MOVL BP, 20(SP) + LEAQ (CX)(BX*1), DI + SUBL BX, SI + MOVQ dst_base+0(FP), BX + MOVQ SI, R8 + SUBL $0x01, R8 + JC emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + CMPL R8, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm12BAvx + CMPL R8, $0x00000100 + JLT 
two_bytes_repeat_emit_encodeBlockAsm12BAvx + CMPL R8, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm12BAvx + CMPL R8, $0x01000000 + JLT four_bytes_repeat_emit_encodeBlockAsm12BAvx + MOVB $0xfc, (BX) + MOVL R8, 1(BX) + ADDQ $0x05, BX + JMP memmove_repeat_emit_encodeBlockAsm12BAvx + +four_bytes_repeat_emit_encodeBlockAsm12BAvx: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (BX) + MOVW R8, 1(BX) + MOVB R9, 3(BX) + ADDQ $0x04, BX + JMP memmove_repeat_emit_encodeBlockAsm12BAvx + +three_bytes_repeat_emit_encodeBlockAsm12BAvx: + MOVB $0xf4, (BX) + MOVW R8, 1(BX) + ADDQ $0x03, BX + JMP memmove_repeat_emit_encodeBlockAsm12BAvx + +two_bytes_repeat_emit_encodeBlockAsm12BAvx: + MOVB $0xf0, (BX) + MOVB R8, 1(BX) + ADDQ $0x02, BX + JMP memmove_repeat_emit_encodeBlockAsm12BAvx + +one_byte_repeat_emit_encodeBlockAsm12BAvx: + SHLB $0x02, R8 + MOVB R8, (BX) + ADDQ $0x01, BX + +memmove_repeat_emit_encodeBlockAsm12BAvx: + LEAQ (BX)(SI*1), R8 + NOP + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_tail: + TESTQ SI, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + CMPQ SI, $0x02 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_1or2 + CMPQ SI, $0x04 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_3 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_4 + CMPQ SI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_5through7 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_9through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_17through32 + CMPQ SI, $0x40 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_33through64 + CMPQ SI, $0x80 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_65through128 + CMPQ SI, $0x00000100 + JBE 
emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_129through256 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_1or2: + MOVB (DI), R8 + MOVB -1(DI)(SI*1), R9 + MOVB R8, (BX) + MOVB R9, -1(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_4: + MOVL (DI), R8 + MOVL R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_3: + MOVW (DI), R8 + MOVB 2(DI), R9 + MOVW R8, (BX) + MOVB R9, 2(BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_5through7: + MOVL (DI), R8 + MOVL -4(DI)(SI*1), R9 + MOVL R8, (BX) + MOVL R9, -4(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_8: + MOVQ (DI), R8 + MOVQ R8, (BX) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_9through16: + MOVQ (DI), R8 + MOVQ -8(DI)(SI*1), R9 + MOVQ R8, (BX) + MOVQ R9, -8(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(SI*1), X1 + MOVOU X0, (BX) + MOVOU X1, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(SI*1), X2 + MOVOU -16(DI)(SI*1), X3 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, -32(BX)(SI*1) + MOVOU X3, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_65through128: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 
-64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_129through256: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU -128(DI)(SI*1), X8 + MOVOU -112(DI)(SI*1), X9 + MOVOU -96(DI)(SI*1), X10 + MOVOU -80(DI)(SI*1), X11 + MOVOU -64(DI)(SI*1), X12 + MOVOU -48(DI)(SI*1), X13 + MOVOU -32(DI)(SI*1), X14 + MOVOU -16(DI)(SI*1), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, -128(BX)(SI*1) + MOVOU X9, -112(BX)(SI*1) + MOVOU X10, -96(BX)(SI*1) + MOVOU X11, -80(BX)(SI*1) + MOVOU X12, -64(BX)(SI*1) + MOVOU X13, -48(BX)(SI*1) + MOVOU X14, -32(BX)(SI*1) + MOVOU X15, -16(BX)(SI*1) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_256through2048: + LEAQ -256(SI), SI + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU 32(DI), X2 + MOVOU 48(DI), X3 + MOVOU 64(DI), X4 + MOVOU 80(DI), X5 + MOVOU 96(DI), X6 + MOVOU 112(DI), X7 + MOVOU 128(DI), X8 + MOVOU 144(DI), X9 + MOVOU 160(DI), X10 + MOVOU 176(DI), X11 + MOVOU 192(DI), X12 + MOVOU 208(DI), X13 + MOVOU 224(DI), X14 + MOVOU 240(DI), X15 + MOVOU X0, (BX) + MOVOU X1, 16(BX) + MOVOU X2, 32(BX) + MOVOU X3, 48(BX) + MOVOU X4, 64(BX) + MOVOU X5, 80(BX) + MOVOU X6, 96(BX) + MOVOU X7, 112(BX) + MOVOU X8, 128(BX) + MOVOU X9, 144(BX) + MOVOU X10, 160(BX) + MOVOU X11, 176(BX) + MOVOU X12, 192(BX) + MOVOU X13, 208(BX) + MOVOU X14, 224(BX) + MOVOU X15, 240(BX) + CMPQ SI, $0x00000100 + LEAQ 256(DI), DI + LEAQ 
256(BX), BX + JGE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_256through2048 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_tail + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned: + LEAQ (DI)(SI*1), R9 + MOVQ BX, R11 + MOVOU -128(R9), X5 + MOVOU -112(R9), X6 + MOVQ $0x00000080, R8 + ANDQ $0xffffffe0, BX + ADDQ $0x20, BX + MOVOU -96(R9), X7 + MOVOU -80(R9), X8 + MOVQ BX, R10 + SUBQ R11, R10 + MOVOU -64(R9), X9 + MOVOU -48(R9), X10 + SUBQ R10, SI + MOVOU -32(R9), X11 + MOVOU -16(R9), X12 + VMOVDQU (DI), Y4 + ADDQ R10, DI + SUBQ R8, SI + +emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop: + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU 64(DI), Y2 + VMOVDQU 96(DI), Y3 + ADDQ R8, DI + VMOVDQA Y0, (BX) + VMOVDQA Y1, 32(BX) + VMOVDQA Y2, 64(BX) + VMOVDQA Y3, 96(BX) + ADDQ R8, BX + SUBQ R8, SI + JA emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop + ADDQ R8, SI + ADDQ BX, SI + VMOVDQU Y4, (R11) + VZEROUPPER + MOVOU X5, -128(SI) + MOVOU X6, -112(SI) + MOVOU X7, -96(SI) + MOVOU X8, -80(SI) + MOVOU X9, -64(SI) + MOVOU X10, -48(SI) + MOVOU X11, -32(SI) + MOVOU X12, -16(SI) + JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx + MOVQ R8, BX + +emit_literal_done_repeat_emit_encodeBlockAsm12BAvx: + MOVQ BX, dst_base+0(FP) + +emit_literal_skip_repeat_emit_encodeBlockAsm12BAvx: + ADDL $0x05, AX + MOVL AX, BX + SUBL 24(SP), BX + MOVL 16(SP), BX + SUBL AX, BX + XORQ DI, DI + CMPQ BX, $0x08 + JL matchlen_single_repeat_extend + +matchlen_loopback_repeat_extend: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_repeat_extend + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP repeat_extend_forward_end_encodeBlockAsm12BAvx + +matchlen_loop_repeat_extend: + LEAQ -8(BX), BX + LEAQ 8(DI), DI + CMPQ BX, $0x08 + JGE matchlen_loopback_repeat_extend + +matchlen_single_repeat_extend: + TESTQ BX, BX + JZ repeat_extend_forward_end_encodeBlockAsm12BAvx 
+ +matchlen_single_loopback_repeat_extend: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE repeat_extend_forward_end_encodeBlockAsm12BAvx + LEAQ 1(DI), DI + DECQ BX + JNZ matchlen_single_loopback_repeat_extend + +repeat_extend_forward_end_encodeBlockAsm12BAvx: + ADDL DI, AX + MOVL AX, BX + SUBL BP, BX + MOVL 24(SP), BP + MOVQ dst_base+0(FP), SI + MOVL 20(SP), DI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm12BAvx + +emit_repeat_again_match_repeat_: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_match_repeat_ + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_ + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_repeat_ + +cant_repeat_two_offset_match_repeat_: + CMPL BX, $0x00000104 + JLT repeat_three_match_repeat_ + CMPL BX, $0x00010100 + JLT repeat_four_match_repeat_ + CMPL BX, $0x0100ffff + JLT repeat_five_match_repeat_ + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_repeat_ + +repeat_five_match_repeat_: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_four_match_repeat_: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_three_match_repeat_: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_two_match_repeat_: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_two_offset_match_repeat_: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_as_copy_encodeBlockAsm12BAvx: + CMPL BP, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm12BAvx + CMPL BX, $0x40 + JLE 
four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(BX), BX + ADDQ $0x05, SI + CMPL BX, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx + +emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) + MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL 
BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx: + TESTL BX, BX + JZ repeat_end_emit_encodeBlockAsm12BAvx + MOVB $0x03, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +two_byte_offset_repeat_as_copy_encodeBlockAsm12BAvx: + CMPL BX, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12BAvx + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(BX), BX + ADDQ $0x03, SI + +emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: + MOVQ BX, DI + LEAQ -4(BX), BX + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: + CMPL BX, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short + CMPL BX, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short + CMPL BX, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short + LEAQ -16842747(BX), BX + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: + LEAQ -65536(BX), BX + MOVQ BX, BP + MOVW $0x001d, (SI) + MOVW BX, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: + LEAQ -256(BX), BX + MOVW $0x0019, (SI) + MOVW BX, 2(SI) + ADDQ $0x04, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: + LEAQ -4(BX), BX + MOVW $0x0015, (SI) 
+ MOVB BL, 2(SI) + ADDQ $0x03, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: + XORQ DI, DI + LEAQ 1(DI)(BX*4), BX + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm12BAvx: + CMPL BX, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12BAvx + CMPL BP, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12BAvx + MOVB $0x01, DL + LEAQ -16(DX)(BX*4), BX + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, BX + MOVB BL, (SI) + ADDQ $0x02, SI + JMP repeat_end_emit_encodeBlockAsm12BAvx + +emit_copy_three_repeat_as_copy_encodeBlockAsm12BAvx: + MOVB $0x02, DL + LEAQ -4(DX)(BX*4), BX + MOVB BL, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +repeat_end_emit_encodeBlockAsm12BAvx: + MOVQ SI, dst_base+0(FP) + MOVL 16(SP), BX + CMPL AX, BX + JGT emit_remainder_encodeBlockAsm12BAvx + JMP search_loop_encodeBlockAsm12BAvx + +no_repeat_found_encodeBlockAsm12BAvx: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ BP, DI + SHRQ $0x10, DI + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x34, DI + CMPL (CX)(BX*1), BP + SHRQ $0x08, BP + JEQ candidate_match_encodeBlockAsm12BAvx + MOVL 32(SP)(DI*1), BX + CMPL (CX)(SI*1), BP + JEQ candidate2_match_encodeBlockAsm12BAvx + LEAQ 2(AX), SI + MOVL SI, 32(SP)(DI*1) + SHRQ $0x08, BP + CMPL (CX)(BX*1), BP + JEQ candidate3_match_encodeBlockAsm12BAvx + MOVL 28(SP), AX + JMP search_loop_encodeBlockAsm12BAvx + +candidate3_match_encodeBlockAsm12BAvx: + ADDL $0x02, AX + JMP candidate_match_encodeBlockAsm12BAvx + +candidate2_match_encodeBlockAsm12BAvx: + LEAQ -2(AX), BX + MOVL BX, 32(SP)(DI*1) + INCL AX + MOVL SI, BX + +candidate_match_encodeBlockAsm12BAvx: + MOVL 
20(SP), BP + TESTL BX, BX + JZ match_extend_back_end_encodeBlockAsm12BAvx + +match_extend_back_loop_encodeBlockAsm12BAvx: + CMPL AX, BP + JG match_extend_back_end_encodeBlockAsm12BAvx + MOVB -1(CX)(BX*1), DL + MOVB -1(CX)(AX*1), SI + CMPB DL, SI + JNE match_extend_back_end_encodeBlockAsm12BAvx + LEAL -1(AX), AX + DECL BX + JZ match_extend_back_end_encodeBlockAsm12BAvx + JMP match_extend_back_loop_encodeBlockAsm12BAvx + +match_extend_back_end_encodeBlockAsm12BAvx: + MOVL AX, BP + SUBL 20(SP), BP + LEAQ dst_base+0(FP)(BP*1), BP + CMPQ BP, (SP) + JL match_dst_size_check_encodeBlockAsm12BAvx + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm12BAvx: + MOVL BX, BP + MOVL 20(SP), SI + CMPL SI, BP + JEQ emit_literal_skip_match_emit_encodeBlockAsm12BAvx + MOVL BP, DI + MOVL BP, 20(SP) + LEAQ (CX)(SI*1), BP + SUBL SI, DI + MOVQ dst_base+0(FP), SI + MOVQ DI, R8 + SUBL $0x01, R8 + JC emit_literal_done_match_emit_encodeBlockAsm12BAvx + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm12BAvx + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm12BAvx + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm12BAvx + CMPL R8, $0x01000000 + JLT four_bytes_match_emit_encodeBlockAsm12BAvx + MOVB $0xfc, (SI) + MOVL R8, 1(SI) + ADDQ $0x05, SI + JMP memmove_match_emit_encodeBlockAsm12BAvx + +four_bytes_match_emit_encodeBlockAsm12BAvx: + MOVQ R8, R9 + SHRL $0x10, R9 + MOVB $0xf8, (SI) + MOVW R8, 1(SI) + MOVB R9, 3(SI) + ADDQ $0x04, SI + JMP memmove_match_emit_encodeBlockAsm12BAvx + +three_bytes_match_emit_encodeBlockAsm12BAvx: + MOVB $0xf4, (SI) + MOVW R8, 1(SI) + ADDQ $0x03, SI + JMP memmove_match_emit_encodeBlockAsm12BAvx + +two_bytes_match_emit_encodeBlockAsm12BAvx: + MOVB $0xf0, (SI) + MOVB R8, 1(SI) + ADDQ $0x02, SI + JMP memmove_match_emit_encodeBlockAsm12BAvx + +one_byte_match_emit_encodeBlockAsm12BAvx: + SHLB $0x02, R8 + MOVB R8, (SI) + ADDQ $0x01, SI + +memmove_match_emit_encodeBlockAsm12BAvx: + LEAQ (SI)(DI*1), R8 + NOP + 
+emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_tail: + TESTQ DI, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm12BAvx + CMPQ DI, $0x02 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_1or2 + CMPQ DI, $0x04 + JB emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_3 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_4 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_5through7 + JE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_9through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_17through32 + CMPQ DI, $0x40 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_33through64 + CMPQ DI, $0x80 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_65through128 + CMPQ DI, $0x00000100 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_129through256 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_1or2: + MOVB (BP), R8 + MOVB -1(BP)(DI*1), R9 + MOVB R8, (SI) + MOVB R9, -1(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_4: + MOVL (BP), R8 + MOVL R8, (SI) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_3: + MOVW (BP), R8 + MOVB 2(BP), R9 + MOVW R8, (SI) + MOVB R9, 2(SI) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_5through7: + MOVL (BP), R8 + MOVL -4(BP)(DI*1), R9 + MOVL R8, (SI) + MOVL R9, -4(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_8: + MOVQ (BP), R8 + MOVQ R8, (SI) + JMP 
emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_9through16: + MOVQ (BP), R8 + MOVQ -8(BP)(DI*1), R9 + MOVQ R8, (SI) + MOVQ R9, -8(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_17through32: + MOVOU (BP), X0 + MOVOU -16(BP)(DI*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_33through64: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU -32(BP)(DI*1), X2 + MOVOU -16(BP)(DI*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DI*1) + MOVOU X3, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_65through128: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_129through256: + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU -128(BP)(DI*1), X8 + MOVOU -112(BP)(DI*1), X9 + MOVOU -96(BP)(DI*1), X10 + MOVOU -80(BP)(DI*1), X11 + MOVOU -64(BP)(DI*1), X12 + MOVOU -48(BP)(DI*1), X13 + MOVOU -32(BP)(DI*1), X14 + MOVOU -16(BP)(DI*1), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, -128(SI)(DI*1) + MOVOU X9, -112(SI)(DI*1) + MOVOU X10, -96(SI)(DI*1) + MOVOU X11, -80(SI)(DI*1) + 
MOVOU X12, -64(SI)(DI*1) + MOVOU X13, -48(SI)(DI*1) + MOVOU X14, -32(SI)(DI*1) + MOVOU X15, -16(SI)(DI*1) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_256through2048: + LEAQ -256(DI), DI + MOVOU (BP), X0 + MOVOU 16(BP), X1 + MOVOU 32(BP), X2 + MOVOU 48(BP), X3 + MOVOU 64(BP), X4 + MOVOU 80(BP), X5 + MOVOU 96(BP), X6 + MOVOU 112(BP), X7 + MOVOU 128(BP), X8 + MOVOU 144(BP), X9 + MOVOU 160(BP), X10 + MOVOU 176(BP), X11 + MOVOU 192(BP), X12 + MOVOU 208(BP), X13 + MOVOU 224(BP), X14 + MOVOU 240(BP), X15 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, 32(SI) + MOVOU X3, 48(SI) + MOVOU X4, 64(SI) + MOVOU X5, 80(SI) + MOVOU X6, 96(SI) + MOVOU X7, 112(SI) + MOVOU X8, 128(SI) + MOVOU X9, 144(SI) + MOVOU X10, 160(SI) + MOVOU X11, 176(SI) + MOVOU X12, 192(SI) + MOVOU X13, 208(SI) + MOVOU X14, 224(SI) + MOVOU X15, 240(SI) + CMPQ DI, $0x00000100 + LEAQ 256(BP), BP + LEAQ 256(SI), SI + JGE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_256through2048 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_tail + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned: + LEAQ (BP)(DI*1), R9 + MOVQ SI, R11 + MOVOU -128(R9), X5 + MOVOU -112(R9), X6 + MOVQ $0x00000080, R8 + ANDQ $0xffffffe0, SI + ADDQ $0x20, SI + MOVOU -96(R9), X7 + MOVOU -80(R9), X8 + MOVQ SI, R10 + SUBQ R11, R10 + MOVOU -64(R9), X9 + MOVOU -48(R9), X10 + SUBQ R10, DI + MOVOU -32(R9), X11 + MOVOU -16(R9), X12 + VMOVDQU (BP), Y4 + ADDQ R10, BP + SUBQ R8, DI + +emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop: + VMOVDQU (BP), Y0 + VMOVDQU 32(BP), Y1 + VMOVDQU 64(BP), Y2 + VMOVDQU 96(BP), Y3 + ADDQ R8, BP + VMOVDQA Y0, (SI) + VMOVDQA Y1, 32(SI) + VMOVDQA Y2, 64(SI) + VMOVDQA Y3, 96(SI) + ADDQ R8, SI + SUBQ R8, DI + JA emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop + ADDQ R8, DI + ADDQ SI, DI + VMOVDQU Y4, (R11) + VZEROUPPER + MOVOU X5, -128(DI) + MOVOU X6, 
-112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) + JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx + MOVQ R8, SI + +emit_literal_done_match_emit_encodeBlockAsm12BAvx: + MOVQ SI, dst_base+0(FP) + +emit_literal_skip_match_emit_encodeBlockAsm12BAvx: + NOP + +match_nolit_loop_encodeBlockAsm12BAvx: + MOVL AX, BP + MOVL AX, BP + SUBL BX, BP + MOVL BP, 24(SP) + ADDL $0x04, AX + ADDL $0x04, BX + MOVL 16(SP), BP + SUBL AX, BP + XORQ DI, DI + CMPQ BP, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm12BAvx + +matchlen_loopback_match_nolit_encodeBlockAsm12BAvx: + MOVQ (CX)(DI*1), SI + XORQ (CX)(DI*1), SI + TESTQ SI, SI + JZ matchlen_loop_match_nolit_encodeBlockAsm12BAvx + BSFQ SI, SI + SARQ $0x03, SI + LEAQ (DI)(SI*1), DI + JMP match_nolit_end_encodeBlockAsm12BAvx + +matchlen_loop_match_nolit_encodeBlockAsm12BAvx: + LEAQ -8(BP), BP + LEAQ 8(DI), DI + CMPQ BP, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm12BAvx + +matchlen_single_match_nolit_encodeBlockAsm12BAvx: + TESTQ BP, BP + JZ match_nolit_end_encodeBlockAsm12BAvx + +matchlen_single_loopback_match_nolit_encodeBlockAsm12BAvx: + MOVB (CX)(DI*1), SI + CMPB (CX)(DI*1), SI + JNE match_nolit_end_encodeBlockAsm12BAvx + LEAQ 1(DI), DI + DECQ BP + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12BAvx + +match_nolit_end_encodeBlockAsm12BAvx: + MOVL 24(SP), BP + ADDQ $0x04, DI + MOVQ dst_base+0(FP), SI + ADDL DI, AX + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm12BAvx + CMPL DI, $0x40 + JLE four_bytes_remain_match_nolit_encodeBlockAsm12BAvx + MOVB $0xff, (SI) + MOVD BP, 1(SI) + LEAQ -64(DI), DI + ADDQ $0x05, SI + CMPL DI, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm12BAvx + +emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy + CMPL R8, $0x0c + JGE 
cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy + +repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy: + XORQ R8, R8 + LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +four_bytes_remain_match_nolit_encodeBlockAsm12BAvx: + TESTL DI, DI + JZ match_nolit_emitcopy_end_encodeBlockAsm12BAvx + MOVB $0x03, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVD BP, 1(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + 
+two_byte_offset_match_nolit_encodeBlockAsm12BAvx: + CMPL DI, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm12BAvx + MOVB $0xee, (SI) + MOVW BP, 1(SI) + LEAQ -60(DI), DI + ADDQ $0x03, SI + +emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: + MOVQ DI, R8 + LEAQ -4(DI), DI + CMPL R8, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: + CMPL DI, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy_short + CMPL DI, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy_short + CMPL DI, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy_short + LEAQ -16842747(DI), DI + MOVW $0x001d, (SI) + MOVW $0xfffb, 2(SI) + MOVB $0xff, 4(SI) + ADDQ $0x05, SI + JMP emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: + LEAQ -65536(DI), DI + MOVQ DI, BP + MOVW $0x001d, (SI) + MOVW DI, 2(SI) + SARQ $0x10, BP + MOVB BP, 4(SI) + ADDQ $0x05, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: + LEAQ -256(DI), DI + MOVW $0x0019, (SI) + MOVW DI, 2(SI) + ADDQ $0x04, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: + LEAQ -4(DI), DI + MOVW $0x0015, (SI) + MOVB DI, 2(SI) + ADDQ $0x03, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: + SHLL $0x02, DI + ORL $0x01, DI + MOVW DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: + XORQ R8, R8 + 
LEAQ 1(R8)(DI*4), DI + MOVB BP, 1(SI) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +two_byte_offset_short_match_nolit_encodeBlockAsm12BAvx: + CMPL DI, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm12BAvx + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm12BAvx + MOVB $0x01, DL + LEAQ -16(DX)(DI*4), DI + MOVB BP, 1(SI) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, DI + MOVB DI, (SI) + ADDQ $0x02, SI + JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx + +emit_copy_three_match_nolit_encodeBlockAsm12BAvx: + MOVB $0x02, DL + LEAQ -4(DX)(DI*4), DI + MOVB DI, (SI) + MOVW BP, 1(SI) + ADDQ $0x03, SI + +match_nolit_emitcopy_end_encodeBlockAsm12BAvx: + MOVQ SI, dst_base+0(FP) + MOVL AX, 20(SP) + CMPL AX, 16(SP) + JGE emit_remainder_encodeBlockAsm12BAvx + CMPQ SI, (SP) + JL match_nolit_dst_ok_encodeBlockAsm12BAvx + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm12BAvx: + MOVQ -2(CX)(AX*1), BP + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ BP, DI + SHRQ $0x10, BP + MOVQ BP, R8 + SHLQ $0x10, DI + IMULQ SI, DI + SHRQ $0x34, DI + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x34, R8 + MOVL 32(SP)(DI*1), SI + MOVL 32(SP)(R8*1), SI + LEAQ -2(AX), SI + MOVL SI, 32(SP)(DI*1) + MOVL AX, 32(SP)(R8*1) + CMPL (CX)(R8*1), BP + JEQ match_nolit_loop_encodeBlockAsm12BAvx + INCL AX + JMP search_loop_encodeBlockAsm12BAvx + +emit_remainder_encodeBlockAsm12BAvx: + MOVQ src_len+32(FP), AX + SUBL 20(SP), AX + MOVQ dst_base+0(FP), DX + LEAQ (DX)(AX*1), DX + CMPQ DX, (SP) + JL emit_remainder_ok_encodeBlockAsm12BAvx + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm12BAvx: + MOVQ src_len+32(FP), AX + MOVL 20(SP), DX + CMPL DX, AX + JEQ emit_literal_skip_emit_remainder_encodeBlockAsm12BAvx + MOVL AX, BX + MOVL AX, 20(SP) + LEAQ (CX)(DX*1), AX + SUBL DX, BX + MOVQ dst_base+0(FP), CX + MOVQ BX, DX + SUBL $0x01, DX + JC 
emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm12BAvx + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm12BAvx + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm12BAvx + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBlockAsm12BAvx + MOVB $0xfc, (CX) + MOVL DX, 1(CX) + ADDQ $0x05, CX + JMP memmove_emit_remainder_encodeBlockAsm12BAvx + +four_bytes_emit_remainder_encodeBlockAsm12BAvx: + MOVQ DX, BP + SHRL $0x10, BP + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BP, 3(CX) + ADDQ $0x04, CX + JMP memmove_emit_remainder_encodeBlockAsm12BAvx + +three_bytes_emit_remainder_encodeBlockAsm12BAvx: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_emit_remainder_encodeBlockAsm12BAvx + +two_bytes_emit_remainder_encodeBlockAsm12BAvx: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + JMP memmove_emit_remainder_encodeBlockAsm12BAvx + +one_byte_emit_remainder_encodeBlockAsm12BAvx: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsm12BAvx: + LEAQ (CX)(BX*1), DX + NOP + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_tail: + TESTQ BX, BX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + CMPQ BX, $0x02 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_1or2 + CMPQ BX, $0x04 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_3 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_4 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_5through7 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_9through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_17through32 + CMPQ BX, $0x40 + JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_33through64 + CMPQ BX, $0x80 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_65through128 + CMPQ BX, $0x00000100 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_129through256 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_avxUnaligned + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_1or2: + MOVB (AX), DL + MOVB -1(AX)(BX*1), BP + MOVB DL, (CX) + MOVB BP, -1(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_4: + MOVL (AX), DX + MOVL DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_3: + MOVW (AX), DX + MOVB 2(AX), BP + MOVW DX, (CX) + MOVB BP, 2(CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_5through7: + MOVL (AX), DX + MOVL -4(AX)(BX*1), BP + MOVL DX, (CX) + MOVL BP, -4(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_8: + MOVQ (AX), DX + MOVQ DX, (CX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_9through16: + MOVQ (AX), DX + MOVQ -8(AX)(BX*1), BP + MOVQ DX, (CX) + MOVQ BP, -8(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 
-32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_65through128: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_129through256: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU -128(AX)(BX*1), X8 + MOVOU -112(AX)(BX*1), X9 + MOVOU -96(AX)(BX*1), X10 + MOVOU -80(AX)(BX*1), X11 + MOVOU -64(AX)(BX*1), X12 + MOVOU -48(AX)(BX*1), X13 + MOVOU -32(AX)(BX*1), X14 + MOVOU -16(AX)(BX*1), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) + MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, -128(CX)(BX*1) + MOVOU X9, -112(CX)(BX*1) + MOVOU X10, -96(CX)(BX*1) + MOVOU X11, -80(CX)(BX*1) + MOVOU X12, -64(CX)(BX*1) + MOVOU X13, -48(CX)(BX*1) + MOVOU X14, -32(CX)(BX*1) + MOVOU X15, -16(CX)(BX*1) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_256through2048: + LEAQ -256(BX), BX + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVOU 128(AX), X8 + MOVOU 144(AX), X9 + MOVOU 160(AX), X10 + MOVOU 176(AX), X11 + MOVOU 192(AX), X12 + MOVOU 208(AX), X13 + MOVOU 224(AX), X14 + MOVOU 240(AX), X15 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 32(CX) + MOVOU X3, 48(CX) + MOVOU X4, 64(CX) 
+ MOVOU X5, 80(CX) + MOVOU X6, 96(CX) + MOVOU X7, 112(CX) + MOVOU X8, 128(CX) + MOVOU X9, 144(CX) + MOVOU X10, 160(CX) + MOVOU X11, 176(CX) + MOVOU X12, 192(CX) + MOVOU X13, 208(CX) + MOVOU X14, 224(CX) + MOVOU X15, 240(CX) + CMPQ BX, $0x00000100 + LEAQ 256(AX), AX + LEAQ 256(CX), CX + JGE emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_256through2048 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_tail + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_avxUnaligned: + LEAQ (AX)(BX*1), BP + MOVQ CX, DI + MOVOU -128(BP), X5 + MOVOU -112(BP), X6 + MOVQ $0x00000080, DX + ANDQ $0xffffffe0, CX + ADDQ $0x20, CX + MOVOU -96(BP), X7 + MOVOU -80(BP), X8 + MOVQ CX, SI + SUBQ DI, SI + MOVOU -64(BP), X9 + MOVOU -48(BP), X10 + SUBQ SI, BX + MOVOU -32(BP), X11 + MOVOU -16(BP), X12 + VMOVDQU (AX), Y4 + ADDQ SI, AX + SUBQ DX, BX + +emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_gobble_128_loop: + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + ADDQ DX, AX + VMOVDQA Y0, (CX) + VMOVDQA Y1, 32(CX) + VMOVDQA Y2, 64(CX) + VMOVDQA Y3, 96(CX) + ADDQ DX, CX + SUBQ DX, BX + JA emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_gobble_128_loop + ADDQ DX, BX + ADDQ CX, BX + VMOVDQU Y4, (DI) + VZEROUPPER + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) + JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsm12BAvx: + MOVQ CX, dst_base+0(FP) + +emit_literal_skip_emit_remainder_encodeBlockAsm12BAvx: + MOVQ 8(SP), AX + SUBQ dst_base+0(FP), AX + MOVQ AX, ret+48(FP) + RET + +// func emitLiteral(dst []byte, lit []byte) int +// Requires: SSE2 +TEXT ·emitLiteral(SB), NOSPLIT, $0-56 + MOVQ dst_base+0(FP), AX + MOVQ lit_base+24(FP), CX + MOVQ lit_len+32(FP), DX + MOVQ DX, BX + MOVQ DX, BP + SUBL $0x01, BP + JC 
emit_literal_end_standalone + CMPL BP, $0x3c + JLT one_byte_standalone + CMPL BP, $0x00000100 + JLT two_bytes_standalone + CMPL BP, $0x00010000 + JLT three_bytes_standalone + CMPL BP, $0x01000000 + JLT four_bytes_standalone + MOVB $0xfc, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP memmove_standalone + +four_bytes_standalone: + MOVQ BP, SI + SHRL $0x10, SI + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB SI, 3(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP memmove_standalone + +three_bytes_standalone: + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP memmove_standalone + +two_bytes_standalone: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP memmove_standalone + +one_byte_standalone: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, BX + ADDQ $0x01, AX + +memmove_standalone: + NOP + +emit_lit_memmove_standalone_memmove_tail: + TESTQ DX, DX + JEQ emit_literal_end_standalone + CMPQ DX, $0x02 + JBE emit_lit_memmove_standalone_memmove_move_1or2 + CMPQ DX, $0x04 + JB emit_lit_memmove_standalone_memmove_move_3 + JBE emit_lit_memmove_standalone_memmove_move_4 + CMPQ DX, $0x08 + JB emit_lit_memmove_standalone_memmove_move_5through7 + JE emit_lit_memmove_standalone_memmove_move_8 + CMPQ DX, $0x10 + JBE emit_lit_memmove_standalone_memmove_move_9through16 + CMPQ DX, $0x20 + JBE emit_lit_memmove_standalone_memmove_move_17through32 + CMPQ DX, $0x40 + JBE emit_lit_memmove_standalone_memmove_move_33through64 + CMPQ DX, $0x80 + JBE emit_lit_memmove_standalone_memmove_move_65through128 + CMPQ DX, $0x00000100 + JBE emit_lit_memmove_standalone_memmove_move_129through256 + JMP emit_lit_memmove_standalone_memmove_move_256through2048 + +emit_lit_memmove_standalone_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(DX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_4: + MOVL (CX), BP + MOVL BP, (AX) + JMP emit_literal_end_standalone + 
+emit_lit_memmove_standalone_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_5through7: + MOVL (CX), BP + MOVL -4(CX)(DX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_8: + MOVQ (CX), BP + MOVQ BP, (AX) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_9through16: + MOVQ (CX), BP + MOVQ -8(CX)(DX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_65through128: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU 32(CX), X2 + MOVOU 48(CX), X3 + MOVOU -64(CX)(DX*1), X12 + MOVOU -48(CX)(DX*1), X13 + MOVOU -32(CX)(DX*1), X14 + MOVOU -16(CX)(DX*1), X15 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X12, -64(AX)(DX*1) + MOVOU X13, -48(AX)(DX*1) + MOVOU X14, -32(AX)(DX*1) + MOVOU X15, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_129through256: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU 32(CX), X2 + MOVOU 48(CX), X3 + MOVOU 64(CX), X4 + MOVOU 80(CX), X5 + MOVOU 96(CX), X6 + MOVOU 112(CX), X7 + MOVOU -128(CX)(DX*1), X8 + MOVOU -112(CX)(DX*1), X9 + MOVOU -96(CX)(DX*1), X10 + MOVOU -80(CX)(DX*1), X11 + MOVOU -64(CX)(DX*1), X12 + MOVOU -48(CX)(DX*1), X13 + MOVOU -32(CX)(DX*1), X14 + MOVOU -16(CX)(DX*1), X15 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + 
MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + MOVOU X6, 96(AX) + MOVOU X7, 112(AX) + MOVOU X8, -128(AX)(DX*1) + MOVOU X9, -112(AX)(DX*1) + MOVOU X10, -96(AX)(DX*1) + MOVOU X11, -80(AX)(DX*1) + MOVOU X12, -64(AX)(DX*1) + MOVOU X13, -48(AX)(DX*1) + MOVOU X14, -32(AX)(DX*1) + MOVOU X15, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_256through2048: + LEAQ -256(DX), DX + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU 32(CX), X2 + MOVOU 48(CX), X3 + MOVOU 64(CX), X4 + MOVOU 80(CX), X5 + MOVOU 96(CX), X6 + MOVOU 112(CX), X7 + MOVOU 128(CX), X8 + MOVOU 144(CX), X9 + MOVOU 160(CX), X10 + MOVOU 176(CX), X11 + MOVOU 192(CX), X12 + MOVOU 208(CX), X13 + MOVOU 224(CX), X14 + MOVOU 240(CX), X15 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + MOVOU X6, 96(AX) + MOVOU X7, 112(AX) + MOVOU X8, 128(AX) + MOVOU X9, 144(AX) + MOVOU X10, 160(AX) + MOVOU X11, 176(AX) + MOVOU X12, 192(AX) + MOVOU X13, 208(AX) + MOVOU X14, 224(AX) + MOVOU X15, 240(AX) + CMPQ DX, $0x00000100 + LEAQ 256(CX), CX + LEAQ 256(AX), AX + JGE emit_lit_memmove_standalone_memmove_move_256through2048 + JMP emit_lit_memmove_standalone_memmove_tail + +emit_literal_end_standalone: + MOVQ BX, ret+48(FP) + RET + +// func emitLiteralAvx(dst []byte, lit []byte) int +// Requires: AVX, SSE2 +TEXT ·emitLiteralAvx(SB), NOSPLIT, $0-56 + MOVQ dst_base+0(FP), AX + MOVQ lit_base+24(FP), CX + MOVQ lit_len+32(FP), DX + MOVQ DX, BX + MOVQ DX, BP + SUBL $0x01, BP + JC emit_literal_end_avx_standalone + CMPL BP, $0x3c + JLT one_byte_standalone + CMPL BP, $0x00000100 + JLT two_bytes_standalone + CMPL BP, $0x00010000 + JLT three_bytes_standalone + CMPL BP, $0x01000000 + JLT four_bytes_standalone + MOVB $0xfc, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP memmove_standalone + +four_bytes_standalone: + MOVQ BP, SI + SHRL $0x10, SI + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB SI, 3(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP 
memmove_standalone + +three_bytes_standalone: + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP memmove_standalone + +two_bytes_standalone: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP memmove_standalone + +one_byte_standalone: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, BX + ADDQ $0x01, AX + +memmove_standalone: + NOP + +emit_lit_memmove_standalone_memmove_tail: + TESTQ DX, DX + JEQ emit_literal_end_avx_standalone + CMPQ DX, $0x02 + JBE emit_lit_memmove_standalone_memmove_move_1or2 + CMPQ DX, $0x04 + JB emit_lit_memmove_standalone_memmove_move_3 + JBE emit_lit_memmove_standalone_memmove_move_4 + CMPQ DX, $0x08 + JB emit_lit_memmove_standalone_memmove_move_5through7 + JE emit_lit_memmove_standalone_memmove_move_8 + CMPQ DX, $0x10 + JBE emit_lit_memmove_standalone_memmove_move_9through16 + CMPQ DX, $0x20 + JBE emit_lit_memmove_standalone_memmove_move_17through32 + CMPQ DX, $0x40 + JBE emit_lit_memmove_standalone_memmove_move_33through64 + CMPQ DX, $0x80 + JBE emit_lit_memmove_standalone_memmove_move_65through128 + CMPQ DX, $0x00000100 + JBE emit_lit_memmove_standalone_memmove_move_129through256 + JMP emit_lit_memmove_standalone_memmove_avxUnaligned + +emit_lit_memmove_standalone_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(DX*1), SI + MOVB BP, (AX) + MOVB SI, -1(AX)(DX*1) + JMP emit_literal_end_avx_standalone + +emit_lit_memmove_standalone_memmove_move_4: + MOVL (CX), BP + MOVL BP, (AX) + JMP emit_literal_end_avx_standalone + +emit_lit_memmove_standalone_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), SI + MOVW BP, (AX) + MOVB SI, 2(AX) + JMP emit_literal_end_avx_standalone + +emit_lit_memmove_standalone_memmove_move_5through7: + MOVL (CX), BP + MOVL -4(CX)(DX*1), SI + MOVL BP, (AX) + MOVL SI, -4(AX)(DX*1) + JMP emit_literal_end_avx_standalone + +emit_lit_memmove_standalone_memmove_move_8: + MOVQ (CX), BP + MOVQ BP, (AX) + JMP emit_literal_end_avx_standalone + 
+emit_lit_memmove_standalone_memmove_move_9through16: + MOVQ (CX), BP + MOVQ -8(CX)(DX*1), SI + MOVQ BP, (AX) + MOVQ SI, -8(AX)(DX*1) + JMP emit_literal_end_avx_standalone + +emit_lit_memmove_standalone_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP emit_literal_end_avx_standalone + +emit_lit_memmove_standalone_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_avx_standalone + +emit_lit_memmove_standalone_memmove_move_65through128: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU 32(CX), X2 + MOVOU 48(CX), X3 + MOVOU -64(CX)(DX*1), X12 + MOVOU -48(CX)(DX*1), X13 + MOVOU -32(CX)(DX*1), X14 + MOVOU -16(CX)(DX*1), X15 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X12, -64(AX)(DX*1) + MOVOU X13, -48(AX)(DX*1) + MOVOU X14, -32(AX)(DX*1) + MOVOU X15, -16(AX)(DX*1) + JMP emit_literal_end_avx_standalone + +emit_lit_memmove_standalone_memmove_move_129through256: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU 32(CX), X2 + MOVOU 48(CX), X3 + MOVOU 64(CX), X4 + MOVOU 80(CX), X5 + MOVOU 96(CX), X6 + MOVOU 112(CX), X7 + MOVOU -128(CX)(DX*1), X8 + MOVOU -112(CX)(DX*1), X9 + MOVOU -96(CX)(DX*1), X10 + MOVOU -80(CX)(DX*1), X11 + MOVOU -64(CX)(DX*1), X12 + MOVOU -48(CX)(DX*1), X13 + MOVOU -32(CX)(DX*1), X14 + MOVOU -16(CX)(DX*1), X15 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + MOVOU X6, 96(AX) + MOVOU X7, 112(AX) + MOVOU X8, -128(AX)(DX*1) + MOVOU X9, -112(AX)(DX*1) + MOVOU X10, -96(AX)(DX*1) + MOVOU X11, -80(AX)(DX*1) + MOVOU X12, -64(AX)(DX*1) + MOVOU X13, -48(AX)(DX*1) + MOVOU X14, -32(AX)(DX*1) + MOVOU X15, -16(AX)(DX*1) + JMP emit_literal_end_avx_standalone + +emit_lit_memmove_standalone_memmove_move_256through2048: + LEAQ -256(DX), DX + 
MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU 32(CX), X2 + MOVOU 48(CX), X3 + MOVOU 64(CX), X4 + MOVOU 80(CX), X5 + MOVOU 96(CX), X6 + MOVOU 112(CX), X7 + MOVOU 128(CX), X8 + MOVOU 144(CX), X9 + MOVOU 160(CX), X10 + MOVOU 176(CX), X11 + MOVOU 192(CX), X12 + MOVOU 208(CX), X13 + MOVOU 224(CX), X14 + MOVOU 240(CX), X15 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + MOVOU X6, 96(AX) + MOVOU X7, 112(AX) + MOVOU X8, 128(AX) + MOVOU X9, 144(AX) + MOVOU X10, 160(AX) + MOVOU X11, 176(AX) + MOVOU X12, 192(AX) + MOVOU X13, 208(AX) + MOVOU X14, 224(AX) + MOVOU X15, 240(AX) + CMPQ DX, $0x00000100 + LEAQ 256(CX), CX + LEAQ 256(AX), AX + JGE emit_lit_memmove_standalone_memmove_move_256through2048 + JMP emit_lit_memmove_standalone_memmove_tail + +emit_lit_memmove_standalone_memmove_avxUnaligned: + LEAQ (CX)(DX*1), SI + MOVQ AX, R8 + MOVOU -128(SI), X5 + MOVOU -112(SI), X6 + MOVQ $0x00000080, BP + ANDQ $0xffffffe0, AX + ADDQ $0x20, AX + MOVOU -96(SI), X7 + MOVOU -80(SI), X8 + MOVQ AX, DI + SUBQ R8, DI + MOVOU -64(SI), X9 + MOVOU -48(SI), X10 + SUBQ DI, DX + MOVOU -32(SI), X11 + MOVOU -16(SI), X12 + VMOVDQU (CX), Y4 + ADDQ DI, CX + SUBQ BP, DX + +emit_lit_memmove_standalone_memmove_gobble_128_loop: + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + ADDQ BP, CX + VMOVDQA Y0, (AX) + VMOVDQA Y1, 32(AX) + VMOVDQA Y2, 64(AX) + VMOVDQA Y3, 96(AX) + ADDQ BP, AX + SUBQ BP, DX + JA emit_lit_memmove_standalone_memmove_gobble_128_loop + ADDQ BP, DX + ADDQ AX, DX + VMOVDQU Y4, (R8) + VZEROUPPER + MOVOU X5, -128(DX) + MOVOU X6, -112(DX) + MOVOU X7, -96(DX) + MOVOU X8, -80(DX) + MOVOU X9, -64(DX) + MOVOU X10, -48(DX) + MOVOU X11, -32(DX) + MOVOU X12, -16(DX) + +emit_literal_end_avx_standalone: + MOVQ BX, ret+48(FP) + RET + +// func emitRepeat(dst []byte, offset int, length int) int +TEXT ·emitRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ 
length+32(FP), DX + +emit_repeat_again_standalone: + MOVQ DX, BP + LEAQ -4(DX), DX + CMPL BP, $0x08 + JLE repeat_two_standalone + CMPL BP, $0x0c + JGE cant_repeat_two_offset_standalone + CMPL CX, $0x00000800 + JLT repeat_two_offset_standalone + +cant_repeat_two_offset_standalone: + CMPL DX, $0x00000104 + JLT repeat_three_standalone + CMPL DX, $0x00010100 + JLT repeat_four_standalone + CMPL DX, $0x0100ffff + JLT repeat_five_standalone + LEAQ -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone + +repeat_five_standalone: + LEAQ -65536(DX), DX + MOVQ DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARQ $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_repeat_end + +repeat_four_standalone: + LEAQ -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_repeat_end + +repeat_three_standalone: + LEAQ -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_repeat_end + +repeat_two_standalone: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_repeat_end + +repeat_two_offset_standalone: + XORQ BP, BP + LEAQ 1(BP)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + +gen_emit_repeat_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopy(dst []byte, offset int, length int) int +TEXT ·emitCopy(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + CMPL CX, $0x00010000 + JL two_byte_offset_standalone + CMPL DX, $0x40 + JLE four_bytes_remain_standalone + MOVB $0xff, (AX) + MOVD CX, 1(AX) + LEAQ -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JL four_bytes_remain_standalone + +emit_repeat_again_standalone_emit_copy: + MOVQ DX, BP + LEAQ -4(DX), DX + CMPL BP, $0x08 + JLE 
repeat_two_standalone_emit_copy + CMPL BP, $0x0c + JGE cant_repeat_two_offset_standalone_emit_copy + CMPL CX, $0x00000800 + JLT repeat_two_offset_standalone_emit_copy + +cant_repeat_two_offset_standalone_emit_copy: + CMPL DX, $0x00000104 + JLT repeat_three_standalone_emit_copy + CMPL DX, $0x00010100 + JLT repeat_four_standalone_emit_copy + CMPL DX, $0x0100ffff + JLT repeat_five_standalone_emit_copy + LEAQ -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy + +repeat_five_standalone_emit_copy: + LEAQ -65536(DX), DX + MOVQ DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARQ $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy: + LEAQ -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy: + LEAQ -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy: + XORQ BP, BP + LEAQ 1(BP)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +four_bytes_remain_standalone: + TESTL DX, DX + JZ gen_emit_copy_end + MOVB $0x03, BP + LEAQ -4(BP)(DX*4), DX + MOVB DL, (AX) + MOVD CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +two_byte_offset_standalone: + CMPL DX, $0x40 + JLE two_byte_offset_short_standalone + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAQ -60(DX), DX + ADDQ $0x03, AX + ADDQ $0x03, BX + +emit_repeat_again_standalone_emit_copy_short: + MOVQ DX, BP + LEAQ -4(DX), DX + CMPL BP, $0x08 + JLE repeat_two_standalone_emit_copy_short + CMPL BP, $0x0c + JGE 
cant_repeat_two_offset_standalone_emit_copy_short + CMPL CX, $0x00000800 + JLT repeat_two_offset_standalone_emit_copy_short + +cant_repeat_two_offset_standalone_emit_copy_short: + CMPL DX, $0x00000104 + JLT repeat_three_standalone_emit_copy_short + CMPL DX, $0x00010100 + JLT repeat_four_standalone_emit_copy_short + CMPL DX, $0x0100ffff + JLT repeat_five_standalone_emit_copy_short + LEAQ -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy_short + +repeat_five_standalone_emit_copy_short: + LEAQ -65536(DX), DX + MOVQ DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARQ $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy_short: + LEAQ -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy_short: + LEAQ -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy_short: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short: + XORQ BP, BP + LEAQ 1(BP)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +two_byte_offset_short_standalone: + CMPL DX, $0x0c + JGE emit_copy_three_standalone + CMPL CX, $0x00000800 + JGE emit_copy_three_standalone + MOVB $0x01, BP + LEAQ -16(BP)(DX*4), DX + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +emit_copy_three_standalone: + MOVB $0x02, BP + LEAQ -4(BP)(DX*4), DX + MOVB DL, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end: + MOVQ BX, ret+40(FP) + RET + +// func matchLen(a []byte, b 
[]byte) int +TEXT ·matchLen(SB), NOSPLIT, $0-56 + MOVQ a_base+0(FP), AX + MOVQ b_base+24(FP), CX + MOVQ a_len+8(FP), DX + XORQ BP, BP + CMPQ DX, $0x08 + JL matchlen_single_standalone + +matchlen_loopback_standalone: + MOVQ (AX)(BP*1), BX + XORQ (CX)(BP*1), BX + TESTQ BX, BX + JZ matchlen_loop_standalone + BSFQ BX, BX + SARQ $0x03, BX + LEAQ (BP)(BX*1), BP + JMP gen_match_len_end + +matchlen_loop_standalone: + LEAQ -8(DX), DX + LEAQ 8(BP), BP + CMPQ DX, $0x08 + JGE matchlen_loopback_standalone + +matchlen_single_standalone: + TESTQ DX, DX + JZ gen_match_len_end + +matchlen_single_loopback_standalone: + MOVB (AX)(BP*1), BL + CMPB (CX)(BP*1), BL + JNE gen_match_len_end + LEAQ 1(BP), BP + DECQ DX + JNZ matchlen_single_loopback_standalone + +gen_match_len_end: + MOVQ BP, ret+48(FP) + RET diff --git a/tests/fixedbugs/issue100/allocfail/asm.go b/tests/fixedbugs/issue100/allocfail/asm.go new file mode 100644 index 0000000..192a06c --- /dev/null +++ b/tests/fixedbugs/issue100/allocfail/asm.go @@ -0,0 +1,1586 @@ +// +build ignore + +package main + +import ( + "fmt" + "log" + + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/buildtags" + "github.com/mmcloughlin/avo/operand" + . 
"github.com/mmcloughlin/avo/operand" + "github.com/mmcloughlin/avo/reg" +) + +func main() { + Constraint(buildtags.Not("appengine").ToConstraint()) + Constraint(buildtags.Not("noasm").ToConstraint()) + Constraint(buildtags.Term("gc").ToConstraint()) + + genEncodeBlockAsm("encodeBlockAsm", 16, 6, false) + genEncodeBlockAsm("encodeBlockAsm14B", 14, 5, false) + genEncodeBlockAsm("encodeBlockAsm12B", 12, 4, false) + genEncodeBlockAsm("encodeBlockAsmAvx", 16, 6, true) + genEncodeBlockAsm("encodeBlockAsm14BAvx", 14, 5, true) + genEncodeBlockAsm("encodeBlockAsm12BAvx", 12, 4, true) + genEmitLiteral() + genEmitRepeat() + genEmitCopy() + genMatchLen() + Generate() +} + +func debugval(v operand.Op) { + value := reg.R15 + MOVQ(v, value) + INT(Imm(3)) +} + +func genEncodeBlockAsm(name string, tableBits, skipLog int, avx bool) { + TEXT(name, 0, "func(dst, src []byte) int") + Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.", + "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "") + Pragma("noescape") + + // "var table [maxTableSize]uint32" takes up 4 * (1 << tableBits) bytes of stack space. + // Extra bytes are added to keep less used values. + var ( + tableSize = 1 << uint(tableBits) + // Keep base stack multiple of 16. + baseStack = 0 + // try to keep extraStack + baseStack multiple of 16 + // for best chance of table alignment. + extraStack = 32 + allocStack = baseStack + extraStack + tableSize + ) + + // Memzero needs at least 128 bytes. + if tableSize < 128 { + panic("tableSize must be at least 128 bytes") + } + + lenSrcBasic, err := Param("src").Len().Resolve() + if err != nil { + panic(err) + } + lenSrcQ := lenSrcBasic.Addr + + stack := AllocLocal(allocStack) + table := stack.Offset(allocStack - tableSize) + + tmpStack := baseStack + // Bail if we can't compress to at least this. 
+ dstLimitPtrQ := stack.Offset(tmpStack) + tmpStack += 8 + // dstStartPtrQ contains the original dst pointer for returning the length + dstStartPtrQ := stack.Offset(tmpStack) + tmpStack += 8 + // sLimitL is when to stop looking for offset/length copies. + sLimitL := stack.Offset(tmpStack) + tmpStack += 4 + // nextEmitL keeps track of the point we have emitted to. + nextEmitL := stack.Offset(tmpStack) + tmpStack += 4 + // Repeat stores the last match offset. + repeatL := stack.Offset(tmpStack) + tmpStack += 4 + // nextSTempL keeps nextS while other functions are being called. + nextSTempL := stack.Offset(tmpStack) + tmpStack += 4 + // Ensure we have the correct extra stack. + // Could be automatic, but whatever. + if tmpStack-baseStack != extraStack { + log.Fatal("adjust extraStack to ", tmpStack-baseStack) + } + + dstBaseBasic, err := Param("dst").Base().Resolve() + if err != nil { + panic(err) + } + dstBase := dstBaseBasic.Addr + + if tmpStack > extraStack+baseStack { + panic(fmt.Sprintf("tmp stack exceeded: %v", tmpStack)) + } + + // Zero table + { + iReg := GP64() + MOVQ(U32(tableSize/8/16), iReg) + tablePtr := GP64() + LEAQ(table, tablePtr) + zeroXmm := XMM() + PXOR(zeroXmm, zeroXmm) + + Label("zero_loop_" + name) + for i := 0; i < 8; i++ { + MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16}) + } + ADDQ(U8(16*8), tablePtr) + DECQ(iReg) + JNZ(LabelRef("zero_loop_" + name)) + + // nextEmit is offset n src where the next emitLiteral should start from. 
+ MOVL(iReg.As32(), nextEmitL) + } + + { + const inputMargin = 8 + tmp, tmp2, tmp3 := GP64(), GP64(), GP64() + MOVQ(lenSrcQ, tmp) + LEAQ(Mem{Base: tmp, Disp: -5}, tmp2) + // sLimitL := len(src) - inputMargin + LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3) + // dstLimit := len(src) - len(src)>>5 - 5 + SHRQ(U8(5), tmp) + SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp + MOVL(tmp3.As32(), sLimitL) + dstAddr := GP64() + MOVQ(dstBase, dstAddr) + // Store dst start address + MOVQ(dstAddr, dstStartPtrQ) + LEAQ(Mem{Base: dstAddr, Index: tmp2, Scale: 1}, tmp2) + MOVQ(tmp2, dstLimitPtrQ) + } + + // s = 1 + s := GP64().As32() + MOVL(U32(1), s) + // repeatL = 1 + MOVL(s, repeatL) + + src := GP64() + Load(Param("src").Base(), src) + + // Load cv + Label("search_loop_" + name) + candidate := GP64().As32() + { + cv := GP64() + MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) + nextS := GP64() + // nextS := s + (s-nextEmit)>>6 + 4 + { + tmp := GP64() + MOVL(s, tmp.As32()) // tmp = s + SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit + SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog + LEAQ(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS) + } + // if nextS > sLimit {goto emitRemainder} + { + tmp := GP64() + MOVL(sLimitL, tmp.As32()) + CMPL(nextS.As32(), tmp.As32()) + JGT(LabelRef("emit_remainder_" + name)) + } + // move nextS to stack. 
+ MOVL(nextS.As32(), nextSTempL) + + candidate2 := GP64().As32() + hasher := hash6(tableBits) + { + hash0, hash1 := GP64(), GP64() + MOVQ(cv, hash0) + MOVQ(cv, hash1) + SHRQ(U8(8), hash1) + hasher.hash(hash0) + hasher.hash(hash1) + MOVL(table.Idx(hash0, 1), candidate) + MOVL(table.Idx(hash1, 1), candidate2) + MOVL(s, table.Idx(hash0, 1)) + tmp := GP64().As32() + LEAL(Mem{Base: s, Disp: 1}, tmp) + MOVL(tmp, table.Idx(hash1, 1)) + } + // Check repeat at offset checkRep + const checkRep = 1 + + if true { + // rep = s - repeat + rep := GP64().As32() + if true { + // if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + left, right := GP64(), GP64() + MOVL(s, rep) + SUBL(repeatL, rep) // rep = s - repeat + MOVL(Mem{Base: src, Index: rep, Scale: 1, Disp: checkRep}, right.As32()) + MOVQ(cv, left) + SHLQ(U8(checkRep*8), left) + CMPL(left.As32(), right.As32()) + + // FIXME: Unable to allocate if enabled. + JNE(LabelRef("no_repeat_found_" + name)) + } + // base = s + 1 + base := GP64() + LEAQ(Mem{Base: s, Disp: 1}, base) + // Extend back + if true { + ne := GP64().As32() + MOVL(nextEmitL, ne) + TESTL(rep, rep) + JZ(LabelRef("repeat_extend_back_end_" + name)) + + // I is tested when decremented, so we loop back here. + Label("repeat_extend_back_loop_" + name) + CMPL(base.As32(), ne) + JG(LabelRef("repeat_extend_back_end_" + name)) + // if src[i-1] == src[base-1] + tmp, tmp2 := GP64(), GP64() + MOVB(Mem{Base: src, Index: rep, Scale: 1, Disp: -1}, tmp.As8()) + MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8()) + CMPB(tmp.As8(), tmp2.As8()) + JNE(LabelRef("repeat_extend_back_end_" + name)) + LEAQ(Mem{Base: base, Disp: -1}, base) + DECL(rep) + JZ(LabelRef("repeat_extend_back_end_" + name)) + JMP(LabelRef("repeat_extend_back_loop_" + name)) + } + Label("repeat_extend_back_end_" + name) + // Base is now at start. 
+ // d += emitLiteral(dst[d:], src[nextEmitL:base]) + if true { + emitLiterals(nextEmitL, base, src, dstBase, "repeat_emit_"+name, avx) + } + + // Extend forward + if true { + // s += 4 + checkRep + ADDL(U8(4+checkRep), s) + + // candidate := s - repeat + 4 + checkRep + MOVL(s, candidate) + SUBL(repeatL, candidate) // candidate = s - repeatL + { + // srcLeft = sLimitL - s + srcLeft := GP64() + MOVL(sLimitL, srcLeft.As32()) + SUBL(s, srcLeft.As32()) + + // Forward address + forwardStart := Mem{Base: src, Index: s, Scale: 1} + // End address + backStart := Mem{Base: src, Index: candidate, Scale: 1} + length := matchLen("repeat_extend", forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name)) + Label("repeat_extend_forward_end_" + name) + // s+= length + ADDL(length.As32(), s) + } + } + // Emit + if true { + // length = s-base + length := GP64() + MOVL(s, length.As32()) + SUBL(base.As32(), length.As32()) + + offsetVal := GP64() + MOVL(repeatL, offsetVal.As32()) + dst := GP64() + MOVQ(dstBase, dst) + + // if nextEmit > 0 + tmp := GP64() + MOVL(nextEmitL, tmp.As32()) + TESTL(tmp.As32(), tmp.As32()) + + // FIXME: fails to allocate regs if enabled: + JZ(LabelRef("repeat_as_copy_" + name)) + + emitRepeat("match_repeat_", length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) + + // JUMPS TO HERE: + Label("repeat_as_copy_" + name) + emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) + + Label("repeat_end_emit_" + name) + // Store new dst and nextEmit + MOVQ(dst, dstBase) + } + // if s >= sLimit + // can be omitted. + if true { + tmp := GP64() + MOVL(sLimitL, tmp.As32()) + CMPL(s, tmp.As32()) + JGT(LabelRef("emit_remainder_" + name)) + } + JMP(LabelRef("search_loop_" + name)) + } + Label("no_repeat_found_" + name) + { + // Can be moved up if registers are available. 
+ hash2 := GP64() + { + // hash2 := hash6(cv>>16, tableBits) + hasher = hash6(tableBits) + MOVQ(cv, hash2) + SHRQ(U8(16), hash2) + hasher.hash(hash2) + } + + CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) + // cv >>= 8 + SHRQ(U8(8), cv) + JEQ(LabelRef("candidate_match_" + name)) + + // candidate = int(table[hash2]) + MOVL(table.Idx(hash2, 1), candidate) + + // if uint32(cv>>8) == load32(src, candidate2) + CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32()) + JEQ(LabelRef("candidate2_match_" + name)) + + // table[hash2] = uint32(s + 2) + tmp := GP64() + LEAQ(Mem{Base: s, Disp: 2}, tmp) + MOVL(tmp.As32(), table.Idx(hash2, 1)) + + // if uint32(cv>>16) == load32(src, candidate) + SHRQ(U8(8), cv) + CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) + JEQ(LabelRef("candidate3_match_" + name)) + // s = nextS + MOVL(nextSTempL, s) + JMP(LabelRef("search_loop_" + name)) + + // Matches candidate3 + Label("candidate3_match_" + name) + ADDL(U8(2), s) + JMP(LabelRef("candidate_match_" + name)) + + Label("candidate2_match_" + name) + // table[hash2] = uint32(s + 2) + tmp = GP64() + LEAQ(Mem{Base: s, Disp: -2}, tmp) + MOVL(tmp.As32(), table.Idx(hash2, 1)) + // s++ + INCL(s) + MOVL(candidate2, candidate) + } + } + + Label("candidate_match_" + name) + // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes. + // Extend backwards + { + ne := GP64() + MOVL(nextEmitL, ne.As32()) + TESTL(candidate, candidate) + JZ(LabelRef("match_extend_back_end_" + name)) + + // candidate is tested when decremented, so we loop back here. 
+ Label("match_extend_back_loop_" + name) + CMPL(s, ne.As32()) + JG(LabelRef("match_extend_back_end_" + name)) + // if src[candidate-1] == src[s-1] + tmp, tmp2 := GP64(), GP64() + MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8()) + MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8()) + CMPB(tmp.As8(), tmp2.As8()) + JNE(LabelRef("match_extend_back_end_" + name)) + LEAL(Mem{Base: s, Disp: -1}, s) + DECL(candidate) + JZ(LabelRef("match_extend_back_end_" + name)) + JMP(LabelRef("match_extend_back_loop_" + name)) + } + Label("match_extend_back_end_" + name) + + // Bail if we exceed the maximum size. + if true { + // tmp = s-nextEmitL + tmp := GP64() + MOVL(s, tmp.As32()) + SUBL(nextEmitL, tmp.As32()) + LEAQ(dstBase.Idx(tmp, 1), tmp) + CMPQ(tmp, dstLimitPtrQ) + JL(LabelRef("match_dst_size_check_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVQ(U32(0), ri.Addr) + RET() + } + Label("match_dst_size_check_" + name) + { + base := GP64() + MOVL(candidate, base.As32()) + emitLiterals(nextEmitL, base, src, dstBase, "match_emit_"+name, avx) + NOP() + } + + Label("match_nolit_loop_" + name) + { + base := GP64().As32() + MOVL(s, base) + // Update repeat + { + // repeat = base - candidate + repeatVal := GP64().As32() + MOVL(s, repeatVal) + SUBL(candidate, repeatVal) + MOVL(repeatVal, repeatL) + } + // s+=4, candidate+=4 + ADDL(U8(4), s) + ADDL(U8(4), candidate) + // Extend the 4-byte match as long as possible and emit copy. 
+ { + // srcLeft = sLimitL - s + srcLeft := GP64() + MOVL(sLimitL, srcLeft.As32()) + SUBL(s, srcLeft.As32()) + length := matchLen("match_nolit_"+name, + Mem{Base: src, Index: s, Scale: 1}, + Mem{Base: src, Index: candidate, Scale: 1}, + srcLeft, + LabelRef("match_nolit_end_"+name), + ) + Label("match_nolit_end_" + name) + offset := GP64() + MOVL(repeatL, offset.As32()) + ADDQ(U8(4), length) + dst := GP64() + MOVQ(dstBase, dst) + // s += length (lenght is destroyed, use it now) + ADDL(length.As32(), s) + emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) + Label("match_nolit_emitcopy_end_" + name) + MOVQ(dst, dstBase) + MOVL(s, nextEmitL) + CMPL(s, sLimitL) + JGE(LabelRef("emit_remainder_" + name)) + + // Bail if we exceed the maximum size. + { + CMPQ(dst, dstLimitPtrQ) + JL(LabelRef("match_nolit_dst_ok_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVQ(U32(0), ri.Addr) + RET() + Label("match_nolit_dst_ok_" + name) + } + } + { + // Check for an immediate match, otherwise start search at s+1 + x := GP64() + // Index s-2 + MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, x) + hasher := hash6(tableBits) + hash0, hash1 := GP64(), GP64() + MOVQ(x, hash0) // s-2 + SHRQ(U8(16), x) + MOVQ(x, hash1) // s + hasher.hash(hash0) + hasher.hash(hash1) + c0, c1 := GP64(), GP64() + MOVL(table.Idx(hash0, 1), c0.As32()) + MOVL(table.Idx(hash1, 1), c1.As32()) + sm2 := GP64() + LEAQ(Mem{Base: s, Disp: -2}, sm2) + MOVL(sm2.As32(), table.Idx(hash0, 1)) + MOVL(s, table.Idx(hash1, 1)) + CMPL(Mem{Base: src, Index: hash1, Scale: 1}, x.As32()) + JEQ(LabelRef("match_nolit_loop_" + name)) + INCL(s) + } + JMP(LabelRef("search_loop_" + name)) + } + + Label("emit_remainder_" + name) + // Bail if we exceed the maximum size. 
+ // if d+len(src)-nextEmitL > dstLimitPtrQ { return 0 + { + // remain = lenSrc - nextEmitL + remain := GP64() + MOVQ(lenSrcQ, remain) + SUBL(nextEmitL, remain.As32()) + dst := GP64() + MOVQ(dstBase, dst) + // dst := dst + (len(src)-nextEmitL) + LEAQ(Mem{Base: dst, Index: remain, Scale: 1}, dst) + CMPQ(dst, dstLimitPtrQ) + JL(LabelRef("emit_remainder_ok_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVQ(U32(0), ri.Addr) + RET() + Label("emit_remainder_ok_" + name) + } + // emitLiteral(dst[d:], src[nextEmitL:]) + emitEnd := GP64() + MOVQ(lenSrcQ, emitEnd) + + // Emit final literals. + emitLiterals(nextEmitL, emitEnd, src, dstBase, "emit_remainder_"+name, avx) + + // length := start - base (ptr arithmetic) + length := GP64() + MOVQ(dstStartPtrQ, length) + SUBQ(dstBase, length) + + Store(length, ReturnIndex(0)) + RET() +} + +// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase. +// Checks if base == nextemit. +// src & base are untouched. +func emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string, avx bool) { + nextEmit, litLen, dstBaseTmp, litBase := GP64().As32(), GP64(), GP64(), GP64() + MOVL(nextEmitL, nextEmit) + CMPL(nextEmit, base.As32()) + JEQ(LabelRef("emit_literal_skip_" + name)) + MOVL(base.As32(), litLen.As32()) + + // Base is now next emit. 
+ MOVL(base.As32(), nextEmitL) + + // litBase = src[nextEmitL:] + LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase) + SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit + + // Load (and store when we return) + MOVQ(dstBase, dstBaseTmp) + emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), avx, true) + Label("emit_literal_done_" + name) + // Store updated dstBase + MOVQ(dstBaseTmp, dstBase) + Label("emit_literal_skip_" + name) +} + +type hashGen struct { + bytes int + tablebits int + mulreg reg.GPVirtual +} + +// hash uses multiply to get a 'output' hash on the hash of the lowest 'bytes' bytes in value. +func hash6(tablebits int) hashGen { + h := hashGen{ + bytes: 6, + tablebits: tablebits, + mulreg: GP64(), + } + MOVQ(Imm(227718039650203), h.mulreg) + return h +} + +// hash uses multiply to get hash of the value. +func (h hashGen) hash(val reg.GPVirtual) { + // Move value to top of register. + SHLQ(U8(64-8*h.bytes), val) + IMULQ(h.mulreg, val) + // Move value to bottom + SHRQ(U8(64-h.tablebits), val) +} + +func genEmitLiteral() { + TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int") + Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "", + "It assumes that:", + " dst is long enough to hold the encoded bytes", + " 0 <= len(lit) && len(lit) <= math.MaxUint32", "") + Pragma("noescape") + + dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64() + Load(Param("dst").Base(), dstBase) + Load(Param("lit").Base(), litBase) + Load(Param("lit").Len(), litLen) + emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false, false) + Label("emit_literal_end_standalone") + Store(retval, ReturnIndex(0)) + RET() + + TEXT("emitLiteralAvx", NOSPLIT, "func(dst, lit []byte) int") + Doc("emitLiteralAvx writes a literal chunk and returns the number of bytes written.", "", + "It assumes that:", + " dst is long enough to hold the encoded bytes", + " 0 <= 
len(lit) && len(lit) <= math.MaxUint32", "") + Pragma("noescape") + + dstBase, litBase, litLen, retval = GP64(), GP64(), GP64(), GP64() + Load(Param("dst").Base(), dstBase) + Load(Param("lit").Base(), litBase) + Load(Param("lit").Len(), litLen) + emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_avx_standalone", true, false) + Label("emit_literal_end_avx_standalone") + Store(retval, ReturnIndex(0)) + RET() +} + +// emitLiteral can be used for inlining an emitLiteral call. +// stack must have at least 32 bytes. +// retval will contain emitted bytes, but can be nil if this is not interesting. +// dstBase and litBase are updated. +// Uses 2 GP registers. With AVX 4 registers. +// If updateDst is true dstBase will have the updated end pointer and an additional register will be used. +func emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, avx, updateDst bool) { + n := GP64() + n16 := GP64() + + // We always add litLen bytes + if retval != nil { + MOVQ(litLen, retval) + } + MOVQ(litLen, n) + + SUBL(U8(1), n.As32()) + // Return if AX was 0 + JC(end) + + // Find number of bytes to emit for tag. 
+ CMPL(n.As32(), U8(60)) + JLT(LabelRef("one_byte_" + name)) + CMPL(n.As32(), U32(1<<8)) + JLT(LabelRef("two_bytes_" + name)) + CMPL(n.As32(), U32(1<<16)) + JLT(LabelRef("three_bytes_" + name)) + CMPL(n.As32(), U32(1<<24)) + JLT(LabelRef("four_bytes_" + name)) + + Label("five_bytes_" + name) + MOVB(U8(252), Mem{Base: dstBase}) + MOVL(n.As32(), Mem{Base: dstBase, Disp: 1}) + if retval != nil { + ADDQ(U8(5), retval) + } + ADDQ(U8(5), dstBase) + JMP(LabelRef("memmove_" + name)) + + Label("four_bytes_" + name) + MOVQ(n, n16) + SHRL(U8(16), n16.As32()) + MOVB(U8(248), Mem{Base: dstBase}) + MOVW(n.As16(), Mem{Base: dstBase, Disp: 1}) + MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3}) + if retval != nil { + ADDQ(U8(4), retval) + } + ADDQ(U8(4), dstBase) + JMP(LabelRef("memmove_" + name)) + + Label("three_bytes_" + name) + MOVB(U8(0xf4), Mem{Base: dstBase}) + MOVW(n.As16(), Mem{Base: dstBase, Disp: 1}) + if retval != nil { + ADDQ(U8(3), retval) + } + ADDQ(U8(3), dstBase) + JMP(LabelRef("memmove_" + name)) + + Label("two_bytes_" + name) + MOVB(U8(0xf0), Mem{Base: dstBase}) + MOVB(n.As8(), Mem{Base: dstBase, Disp: 1}) + if retval != nil { + ADDQ(U8(2), retval) + } + ADDQ(U8(2), dstBase) + JMP(LabelRef("memmove_" + name)) + + Label("one_byte_" + name) + SHLB(U8(2), n.As8()) + MOVB(n.As8(), Mem{Base: dstBase}) + if retval != nil { + ADDQ(U8(1), retval) + } + ADDQ(U8(1), dstBase) + // Fallthrough + + Label("memmove_" + name) + + // copy(dst[i:], lit) + if true { + dstEnd := GP64() + if updateDst { + LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd) + } + genMemMove2("emit_lit_memmove_"+name, dstBase, litBase, litLen, end, avx) + if updateDst { + MOVQ(dstEnd, dstBase) + } + } else { + genMemMove("emit_lit_memmove_"+name, dstBase, litBase, litLen, end) + } + return +} + +// genEmitRepeat generates a standlone emitRepeat. 
+func genEmitRepeat() { + TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int") + Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.", + "Length must be at least 4 and < 1<<32", "") + Pragma("noescape") + + dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() + + // retval = 0 + XORQ(retval, retval) + + Load(Param("dst").Base(), dstBase) + Load(Param("offset"), offset) + Load(Param("length"), length) + emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end")) + Label("gen_emit_repeat_end") + Store(retval, ReturnIndex(0)) + RET() +} + +// emitRepeat can be used for inlining an emitRepeat call. +// length >= 4 and < 1<<32 +// length is modified. dstBase is updated. retval is added to input. +// retval can be nil. +// Will jump to end label when finished. +// Uses 1 GP register. +func emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { + Label("emit_repeat_again_" + name) + tmp := GP64() + MOVQ(length, tmp) // Copy length + // length -= 4 + LEAQ(Mem{Base: length, Disp: -4}, length) + + // if length <= 4 (use copied value) + CMPL(tmp.As32(), U8(8)) + JLE(LabelRef("repeat_two_" + name)) + + // length < 8 && offset < 2048 + CMPL(tmp.As32(), U8(12)) + JGE(LabelRef("cant_repeat_two_offset_" + name)) + CMPL(offset.As32(), U32(2048)) + JLT(LabelRef("repeat_two_offset_" + name)) + + const maxRepeat = ((1 << 24) - 1) + 65536 + Label("cant_repeat_two_offset_" + name) + CMPL(length.As32(), U32((1<<8)+4)) + JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4 + CMPL(length.As32(), U32((1<<16)+(1<<8))) + JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8) + CMPL(length.As32(), U32(maxRepeat)) + JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent. + + // We have have more than 24 bits + // Emit so we have at least 4 bytes left. 
+ LEAQ(Mem{Base: length, Disp: -(maxRepeat - 4)}, length) // length -= (maxRepeat - 4) + MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 + MOVW(U16(65531), Mem{Base: dstBase, Disp: 2}) // 0xfffb + MOVB(U8(255), Mem{Base: dstBase, Disp: 4}) + ADDQ(U8(5), dstBase) + if retval != nil { + ADDQ(U8(5), retval) + } + JMP(LabelRef("emit_repeat_again_" + name)) + + // Must be able to be within 5 bytes. + Label("repeat_five_" + name) + LEAQ(Mem{Base: length, Disp: -65536}, length) // length -= 65536 + MOVQ(length, offset) + MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 + MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8) + SARQ(U8(16), offset) // offset = length >> 16 + MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4}) // dst[4] = length >> 16 + if retval != nil { + ADDQ(U8(5), retval) // i += 5 + } + ADDQ(U8(5), dstBase) // dst += 5 + JMP(end) + + Label("repeat_four_" + name) + LEAQ(Mem{Base: length, Disp: -256}, length) // length -= 256 + MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 6<<2 | tagCopy1, dst[1] = 0 + MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8) + if retval != nil { + ADDQ(U8(4), retval) // i += 4 + } + ADDQ(U8(4), dstBase) // dst += 4 + JMP(end) + + Label("repeat_three_" + name) + LEAQ(Mem{Base: length, Disp: -4}, length) // length -= 4 + MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 5<<2 | tagCopy1, dst[1] = 0 + MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length) + if retval != nil { + ADDQ(U8(3), retval) // i += 3 + } + ADDQ(U8(3), dstBase) // dst += 3 + JMP(end) + + Label("repeat_two_" + name) + // dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0 + SHLL(U8(2), length.As32()) + ORL(U8(tagCopy1), length.As32()) + MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 + if retval != nil { + ADDQ(U8(2), retval) 
// i += 2 + } + ADDQ(U8(2), dstBase) // dst += 2 + JMP(end) + + Label("repeat_two_offset_" + name) + // Emit the remaining copy, encoded as 2 bytes. + // dst[1] = uint8(offset) + // dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + tmp = GP64() + XORQ(tmp, tmp) + // Use scale and displacement to shift and subtract values from length. + LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length) + MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte + SARL(U8(8), offset.As32()) // Remove lower + SHLL(U8(5), offset.As32()) // Shift back up + ORL(offset.As32(), length.As32()) // OR result + MOVB(length.As8(), Mem{Base: dstBase, Disp: 0}) + if retval != nil { + ADDQ(U8(2), retval) // i += 2 + } + ADDQ(U8(2), dstBase) // dst += 2 + + JMP(end) +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 + +// genEmitCopy generates a standlone emitCopy +func genEmitCopy() { + TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int") + Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "", + "It assumes that:", + " dst is long enough to hold the encoded bytes", + " 1 <= offset && offset <= math.MaxUint32", + " 4 <= length && length <= 1 << 24", "") + Pragma("noescape") + + dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() + + // i := 0 + XORQ(retval, retval) + + Load(Param("dst").Base(), dstBase) + Load(Param("offset"), offset) + Load(Param("length"), length) + emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end")) + Label("gen_emit_copy_end") + Store(retval, ReturnIndex(0)) + RET() +} + +const ( + tagLiteral = 0x00 + tagCopy1 = 0x01 + tagCopy2 = 0x02 + tagCopy4 = 0x03 +) + +// emitCopy can be used for inlining an emitCopy call. +// length is modified (and junk). 
dstBase is updated. retval is added to input. +// retval can be nil. +// Will jump to end label when finished. +// Uses 2 GP registers. +func emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { + // if offset >= 65536 { + CMPL(offset.As32(), U32(65536)) + JL(LabelRef("two_byte_offset_" + name)) + + // offset is >= 65536 + // if length <= 64 goto four_bytes_remain_ + CMPL(length.As32(), U8(64)) + JLE(LabelRef("four_bytes_remain_" + name)) + + // Emit a length 64 copy, encoded as 5 bytes. + // dst[0] = 63<<2 | tagCopy4 + MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase}) + // dst[4] = uint8(offset >> 24) + // dst[3] = uint8(offset >> 16) + // dst[2] = uint8(offset >> 8) + // dst[1] = uint8(offset) + MOVD(offset, Mem{Base: dstBase, Disp: 1}) + // length -= 64 + LEAQ(Mem{Base: length, Disp: -64}, length) + if retval != nil { + ADDQ(U8(5), retval) // i+=5 + } + ADDQ(U8(5), dstBase) // dst+=5 + + // if length >= 4 { + CMPL(length.As32(), U8(4)) + JL(LabelRef("four_bytes_remain_" + name)) + + // Emit remaining as repeats + // return 5 + emitRepeat(dst[5:], offset, length) + // Inline call to emitRepeat. Will jump to end + emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end) + + Label("four_bytes_remain_" + name) + // if length == 0 { + // return i + // } + TESTL(length.As32(), length.As32()) + JZ(end) + + // Emit a copy, offset encoded as 4 bytes. + // dst[i+0] = uint8(length-1)<<2 | tagCopy4 + // dst[i+1] = uint8(offset) + // dst[i+2] = uint8(offset >> 8) + // dst[i+3] = uint8(offset >> 16) + // dst[i+4] = uint8(offset >> 24) + tmp := GP64() + MOVB(U8(tagCopy4), tmp.As8()) + // Use displacement to subtract 1 from upshifted length. 
+ LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length) + MOVB(length.As8(), Mem{Base: dstBase}) + MOVD(offset, Mem{Base: dstBase, Disp: 1}) + // return i + 5 + if retval != nil { + ADDQ(U8(5), retval) + } + ADDQ(U8(5), dstBase) + JMP(end) + + Label("two_byte_offset_" + name) + // Offset no more than 2 bytes. + + // if length > 64 { + CMPL(length.As32(), U8(64)) + JLE(LabelRef("two_byte_offset_short_" + name)) + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + // dst[2] = uint8(offset >> 8) + // dst[1] = uint8(offset) + // dst[0] = 59<<2 | tagCopy2 + MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase}) + MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1}) + // length -= 60 + LEAQ(Mem{Base: length, Disp: -60}, length) + + // Emit remaining as repeats, at least 4 bytes remain. + // return 3 + emitRepeat(dst[3:], offset, length) + //} + ADDQ(U8(3), dstBase) + if retval != nil { + ADDQ(U8(3), retval) + } + // Inline call to emitRepeat. Will jump to end + emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end) + + Label("two_byte_offset_short_" + name) + // if length >= 12 || offset >= 2048 { + CMPL(length.As32(), U8(12)) + JGE(LabelRef("emit_copy_three_" + name)) + CMPL(offset.As32(), U32(2048)) + JGE(LabelRef("emit_copy_three_" + name)) + + // Emit the remaining copy, encoded as 2 bytes. + // dst[1] = uint8(offset) + // dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + tmp = GP64() + MOVB(U8(tagCopy1), tmp.As8()) + // Use scale and displacement to shift and subtract values from length. 
+ LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length) + MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte + SHRL(U8(8), offset.As32()) // Remove lower + SHLL(U8(5), offset.As32()) // Shift back up + ORL(offset.As32(), length.As32()) // OR result + MOVB(length.As8(), Mem{Base: dstBase, Disp: 0}) + if retval != nil { + ADDQ(U8(2), retval) // i += 2 + } + ADDQ(U8(2), dstBase) // dst += 2 + // return 2 + JMP(end) + + Label("emit_copy_three_" + name) + // // Emit the remaining copy, encoded as 3 bytes. + // dst[2] = uint8(offset >> 8) + // dst[1] = uint8(offset) + // dst[0] = uint8(length-1)<<2 | tagCopy2 + tmp = GP64() + MOVB(U8(tagCopy2), tmp.As8()) + LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length) + MOVB(length.As8(), Mem{Base: dstBase}) + MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1}) + // return 3 + if retval != nil { + ADDQ(U8(3), retval) // i += 3 + } + ADDQ(U8(3), dstBase) // dst += 3 + JMP(end) +} + +// func memmove(to, from unsafe.Pointer, n uintptr) +// to and from will be at the end, n will be 0. +// to and from may not overlap. +// Fairly simplistic for now, can ofc. be extended. +// Uses one GP register and 8 SSE registers. +func genMemMove(name string, to, from, n reg.GPVirtual, end LabelRef) { + tmp := GP64() + MOVQ(n, tmp) + // tmp = n/128 + SHRQ(U8(7), tmp) + + TESTQ(tmp, tmp) + JZ(LabelRef("done_128_" + name)) + Label("loop_128_" + name) + var xmmregs [8]reg.VecVirtual + + // Prefetch destination for next loop. + // Prefetching source doesn't provide speedup. + // This seems to give a small boost. 
+ const preOff = 128 + PREFETCHT0(Mem{Base: to, Disp: preOff}) + PREFETCHT0(Mem{Base: to, Disp: preOff + 64}) + + for i := 0; i < 8; i++ { + xmmregs[i] = XMM() + MOVOU(Mem{Base: from}.Offset(i*16), xmmregs[i]) + } + for i := 0; i < 8; i++ { + MOVOU(xmmregs[i], Mem{Base: to}.Offset(i*16)) + } + LEAQ(Mem{Base: n, Disp: -128}, n) + ADDQ(U8(8*16), from) + ADDQ(U8(8*16), to) + DECQ(tmp) + JNZ(LabelRef("loop_128_" + name)) + + Label("done_128_" + name) + MOVQ(n, tmp) + // tmp = n/16 + SHRQ(U8(4), tmp) + TESTQ(tmp, tmp) + JZ(LabelRef("done_16_" + name)) + + Label("loop_16_" + name) + xmm := XMM() + MOVOU(Mem{Base: from}, xmm) + MOVOU(xmm, Mem{Base: to}) + LEAQ(Mem{Base: n, Disp: -16}, n) + ADDQ(U8(16), from) + ADDQ(U8(16), to) + DECQ(tmp) + JNZ(LabelRef("loop_16_" + name)) + Label("done_16_" + name) + + // TODO: Use REP; MOVSB somehow. + TESTQ(n, n) + JZ(end) + Label("loop_1_" + name) + MOVB(Mem{Base: from}, tmp.As8()) + MOVB(tmp.As8(), Mem{Base: to}) + INCQ(from) + INCQ(to) + DECQ(n) + JNZ(LabelRef("loop_1_" + name)) +} + +// func memmove(to, from unsafe.Pointer, n uintptr) +// src and dst may not overlap. +// Non AVX uses 2 GP register, 16 SSE2 registers. +// AVX uses 4 GP registers 16 AVX/SSE registers. +// All passed registers may be updated. +func genMemMove2(name string, dst, src, length reg.GPVirtual, end LabelRef, avx bool) { + AX, CX := GP64(), GP64() + NOP() + name += "_memmove_" + Label(name + "tail") + // move_129through256 or smaller work whether or not the source and the + // destination memory regions overlap because they load all data into + // registers before writing it back. move_256through2048 on the other + // hand can be used only when the memory regions don't overlap or the copy + // direction is forward. + // + // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing. 
+ TESTQ(length, length) + JEQ(end) + CMPQ(length, U8(2)) + JBE(LabelRef(name + "move_1or2")) + CMPQ(length, U8(4)) + JB(LabelRef(name + "move_3")) + JBE(LabelRef(name + "move_4")) + CMPQ(length, U8(8)) + JB(LabelRef(name + "move_5through7")) + JE(LabelRef(name + "move_8")) + CMPQ(length, U8(16)) + JBE(LabelRef(name + "move_9through16")) + CMPQ(length, U8(32)) + JBE(LabelRef(name + "move_17through32")) + CMPQ(length, U8(64)) + JBE(LabelRef(name + "move_33through64")) + CMPQ(length, U8(128)) + JBE(LabelRef(name + "move_65through128")) + CMPQ(length, U32(256)) + JBE(LabelRef(name + "move_129through256")) + + if avx { + JMP(LabelRef(name + "avxUnaligned")) + } else { + if false { + // Don't check length for now. + Label(name + "forward") + CMPQ(length, U32(2048)) + JLS(LabelRef(name + "move_256through2048")) + + genMemMove(name+"fallback", dst, src, length, end) + } else { + JMP(LabelRef(name + "move_256through2048")) + } + } + /* + // If REP MOVSB isn't fast, don't use it + // FIXME: internal∕cpu·X86+const_offsetX86HasERMS(SB) + // CMPB(U8(1), U8(1)) // enhanced REP MOVSB/STOSB + JMP(LabelRef(name + "fwdBy8")) + + // Check alignment + MOVL(src.As32(), AX.As32()) + ORL(dst.As32(), AX.As32()) + TESTL(U32(7), AX.As32()) + JEQ(LabelRef(name + "fwdBy8")) + + // Do 1 byte at a time + // MOVQ(length, CX) + // FIXME: + // REP; MOVSB + JMP(end) + + Label(name + "fwdBy8") + // Do 8 bytes at a time + MOVQ(length, CX) + SHRQ(U8(3), CX) + ANDQ(U8(7), length) + // FIXME: + //REP; MOVSQ + JMP(LabelRef(name + "tail")) + + Label(name + "back") + + //check overlap + MOVQ(src, CX) + ADDQ(length, CX) + CMPQ(CX, dst) + JLS(LabelRef(name + "forward")) + + //whole thing backwards has + //adjusted addresses + + ADDQ(length, dst) + ADDQ(length, src) + STD() + + // + // copy + // + MOVQ(length, CX) + SHRQ(U8(3), CX) + ANDQ(U8(7), length) + + SUBQ(U8(8), dst) + SUBQ(U8(8), src) + // FIXME: + //REP; MOVSQ + + // FIXME: + //CLD() + + ADDQ(U8(8), dst) + ADDQ(U8(8), src) + SUBQ(length, dst) + 
SUBQ(length, src) + JMP(LabelRef(name + "tail")) + */ + + Label(name + "move_1or2") + MOVB(Mem{Base: src}, AX.As8()) + MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8()) + MOVB(AX.As8(), Mem{Base: dst}) + MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1}) + JMP(end) + + Label(name + "move_4") + MOVL(Mem{Base: src}, AX.As32()) + MOVL(AX.As32(), Mem{Base: dst}) + JMP(end) + + Label(name + "move_3") + MOVW(Mem{Base: src}, AX.As16()) + MOVB(Mem{Base: src, Disp: 2}, CX.As8()) + MOVW(AX.As16(), Mem{Base: dst}) + MOVB(CX.As8(), Mem{Base: dst, Disp: 2}) + JMP(end) + + Label(name + "move_5through7") + MOVL(Mem{Base: src}, AX.As32()) + MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32()) + MOVL(AX.As32(), Mem{Base: dst}) + MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1}) + JMP(end) + + Label(name + "move_8") + // We need a separate case for 8 to make sure we write pointers atomically. + MOVQ(Mem{Base: src}, AX) + MOVQ(AX, Mem{Base: dst}) + JMP(end) + + Label(name + "move_9through16") + MOVQ(Mem{Base: src}, AX) + MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX) + MOVQ(AX, Mem{Base: dst}) + MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1}) + JMP(end) + + Label(name + "move_17through32") + X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + + MOVOU(Mem{Base: src}, X0) + MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1) + MOVOU(X0, Mem{Base: dst}) + MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1}) + JMP(end) + + Label(name + "move_33through64") + MOVOU(Mem{Base: src}, X0) + MOVOU(Mem{Base: src, Disp: 16}, X1) + MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2) + MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3) + MOVOU(X0, Mem{Base: dst}) + MOVOU(X1, Mem{Base: dst, Disp: 16}) + MOVOU(X2, Mem{Base: dst, Disp: -32, 
Index: length, Scale: 1}) + MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1}) + JMP(end) + + Label(name + "move_65through128") + MOVOU(Mem{Base: src}, X0) + MOVOU(Mem{Base: src, Disp: 16}, X1) + MOVOU(Mem{Base: src, Disp: 32}, X2) + MOVOU(Mem{Base: src, Disp: 48}, X3) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15) + MOVOU(X0, Mem{Base: dst}) + MOVOU(X1, Mem{Base: dst, Disp: 16}) + MOVOU(X2, Mem{Base: dst, Disp: 32}) + MOVOU(X3, Mem{Base: dst, Disp: 48}) + MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64}) + MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48}) + MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32}) + MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16}) + JMP(end) + + Label(name + "move_129through256") + MOVOU(Mem{Base: src}, X0) + MOVOU(Mem{Base: src, Disp: 16}, X1) + MOVOU(Mem{Base: src, Disp: 32}, X2) + MOVOU(Mem{Base: src, Disp: 48}, X3) + MOVOU(Mem{Base: src, Disp: 64}, X4) + MOVOU(Mem{Base: src, Disp: 80}, X5) + MOVOU(Mem{Base: src, Disp: 96}, X6) + MOVOU(Mem{Base: src, Disp: 112}, X7) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15) + MOVOU(X0, Mem{Base: dst}) + MOVOU(X1, Mem{Base: dst, Disp: 16}) + MOVOU(X2, Mem{Base: dst, Disp: 32}) + MOVOU(X3, Mem{Base: dst, Disp: 48}) + MOVOU(X4, Mem{Base: dst, Disp: 64}) + MOVOU(X5, 
Mem{Base: dst, Disp: 80}) + MOVOU(X6, Mem{Base: dst, Disp: 96}) + MOVOU(X7, Mem{Base: dst, Disp: 112}) + MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128}) + MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112}) + MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96}) + MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80}) + MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64}) + MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48}) + MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32}) + MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16}) + JMP(end) + + Label(name + "move_256through2048") + LEAQ(Mem{Base: length, Disp: -256}, length) + MOVOU(Mem{Base: src}, X0) + MOVOU(Mem{Base: src, Disp: 16}, X1) + MOVOU(Mem{Base: src, Disp: 32}, X2) + MOVOU(Mem{Base: src, Disp: 48}, X3) + MOVOU(Mem{Base: src, Disp: 64}, X4) + MOVOU(Mem{Base: src, Disp: 80}, X5) + MOVOU(Mem{Base: src, Disp: 96}, X6) + MOVOU(Mem{Base: src, Disp: 112}, X7) + MOVOU(Mem{Base: src, Disp: 128}, X8) + MOVOU(Mem{Base: src, Disp: 144}, X9) + MOVOU(Mem{Base: src, Disp: 160}, X10) + MOVOU(Mem{Base: src, Disp: 176}, X11) + MOVOU(Mem{Base: src, Disp: 192}, X12) + MOVOU(Mem{Base: src, Disp: 208}, X13) + MOVOU(Mem{Base: src, Disp: 224}, X14) + MOVOU(Mem{Base: src, Disp: 240}, X15) + MOVOU(X0, Mem{Base: dst}) + MOVOU(X1, Mem{Base: dst, Disp: 16}) + MOVOU(X2, Mem{Base: dst, Disp: 32}) + MOVOU(X3, Mem{Base: dst, Disp: 48}) + MOVOU(X4, Mem{Base: dst, Disp: 64}) + MOVOU(X5, Mem{Base: dst, Disp: 80}) + MOVOU(X6, Mem{Base: dst, Disp: 96}) + MOVOU(X7, Mem{Base: dst, Disp: 112}) + MOVOU(X8, Mem{Base: dst, Disp: 128}) + MOVOU(X9, Mem{Base: dst, Disp: 144}) + MOVOU(X10, Mem{Base: dst, Disp: 160}) + MOVOU(X11, Mem{Base: dst, Disp: 176}) + MOVOU(X12, Mem{Base: dst, Disp: 192}) + MOVOU(X13, Mem{Base: dst, Disp: 208}) + MOVOU(X14, Mem{Base: dst, Disp: 224}) + MOVOU(X15, Mem{Base: dst, Disp: 240}) + CMPQ(length, U32(256)) + LEAQ(Mem{Base: src, Disp: 
256}, src) + LEAQ(Mem{Base: dst, Disp: 256}, dst) + JGE(LabelRef(name + "move_256through2048")) + JMP(LabelRef(name + "tail")) + + if avx { + Label(name + "avxUnaligned") + R8, R10 := GP64(), GP64() + // There are two implementations of move algorithm. + // The first one for non-overlapped memory regions. It uses forward copying. + // We do not support overlapping input + + // Non-temporal copy would be better for big sizes. + // Disabled since big copies are unlikely. + // If enabling, test functionality. + const enableBigData = false + if enableBigData { + CMPQ(length, U32(0x100000)) + JAE(LabelRef(name + "gobble_big_data_fwd")) + } + + // Memory layout on the source side + // src CX + // |<---------length before correction--------->| + // | |<--length corrected-->| | + // | | |<--- AX --->| + // |<-R11->| |<-128 bytes->| + // +----------------------------------------+ + // | Head | Body | Tail | + // +-------+------------------+-------------+ + // ^ ^ ^ + // | | | + // Save head into Y4 Save tail into X5..X12 + // | + // src+R11, where R11 = ((dst & -32) + 32) - dst + // Algorithm: + // 1. Unaligned save of the tail's 128 bytes + // 2. Unaligned save of the head's 32 bytes + // 3. Destination-aligned copying of body (128 bytes per iteration) + // 4. Put head on the new place + // 5. Put the tail on the new place + // It can be important to satisfy processor's pipeline requirements for + // small sizes as the cost of unaligned memory region copying is + // comparable with the cost of main loop. So code is slightly messed there. + // There is more clean implementation of that algorithm for bigger sizes + // where the cost of unaligned part copying is negligible. + // You can see it after gobble_big_data_fwd label. + Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM() + + LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX) + MOVQ(dst, R10) + // CX points to the end of buffer so we need go back slightly. We will use negative offsets there. 
+ MOVOU(Mem{Base: CX, Disp: -0x80}, X5) + MOVOU(Mem{Base: CX, Disp: -0x70}, X6) + MOVQ(U32(0x80), AX) + + // Align destination address + ANDQ(U32(0xffffffe0), dst) + ADDQ(U8(32), dst) + // Continue tail saving. + MOVOU(Mem{Base: CX, Disp: -0x60}, X7) + MOVOU(Mem{Base: CX, Disp: -0x50}, X8) + // Make R8 delta between aligned and unaligned destination addresses. + MOVQ(dst, R8) + SUBQ(R10, R8) + // Continue tail saving. + MOVOU(Mem{Base: CX, Disp: -0x40}, X9) + MOVOU(Mem{Base: CX, Disp: -0x30}, X10) + // Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying. + SUBQ(R8, length) + // Continue tail saving. + MOVOU(Mem{Base: CX, Disp: -0x20}, X11) + MOVOU(Mem{Base: CX, Disp: -0x10}, X12) + // The tail will be put on its place after main body copying. + // It's time for the unaligned heading part. + VMOVDQU(Mem{Base: src}, Y4) + // Adjust source address to point past head. + ADDQ(R8, src) + SUBQ(AX, length) + + // Aligned memory copying there + Label(name + "gobble_128_loop") + VMOVDQU(Mem{Base: src}, Y0) + VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1) + VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2) + VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3) + ADDQ(AX, src) + VMOVDQA(Y0, Mem{Base: dst}) + VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20}) + VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40}) + VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60}) + ADDQ(AX, dst) + SUBQ(AX, length) + JA(LabelRef(name + "gobble_128_loop")) + // Now we can store unaligned parts. 
+ ADDQ(AX, length) + ADDQ(dst, length) + VMOVDQU(Y4, Mem{Base: R10}) + VZEROUPPER() + MOVOU(X5, Mem{Base: length, Disp: -0x80}) + MOVOU(X6, Mem{Base: length, Disp: -0x70}) + MOVOU(X7, Mem{Base: length, Disp: -0x60}) + MOVOU(X8, Mem{Base: length, Disp: -0x50}) + MOVOU(X9, Mem{Base: length, Disp: -0x40}) + MOVOU(X10, Mem{Base: length, Disp: -0x30}) + MOVOU(X11, Mem{Base: length, Disp: -0x20}) + MOVOU(X12, Mem{Base: length, Disp: -0x10}) + JMP(end) + + if enableBigData { + Label(name + "gobble_big_data_fwd") + // There is forward copying for big regions. + // It uses non-temporal mov instructions. + // Details of this algorithm are commented previously for small sizes. + LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX) + MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -0x80}, X5) + MOVOU(Mem{Base: CX, Disp: -0x70}, X6) + MOVOU(Mem{Base: CX, Disp: -0x60}, X7) + MOVOU(Mem{Base: CX, Disp: -0x50}, X8) + MOVOU(Mem{Base: CX, Disp: -0x40}, X9) + MOVOU(Mem{Base: CX, Disp: -0x30}, X10) + MOVOU(Mem{Base: CX, Disp: -0x20}, X11) + MOVOU(Mem{Base: CX, Disp: -0x10}, X12) + VMOVDQU(Mem{Base: src}, Y4) + MOVQ(dst, R8) + + ANDQ(U32(0xffffffe0), dst) + ADDQ(U8(32), dst) + + MOVQ(dst, R10) + SUBQ(R8, R10) + SUBQ(R10, length) + ADDQ(R10, src) + LEAQ(Mem{Base: dst, Index: length, Scale: 1}, CX) + SUBQ(U8(0x80), length) + + Label(name + "gobble_mem_fwd_loop") + PREFETCHNTA(Mem{Base: src, Disp: 0x1c0}) + PREFETCHNTA(Mem{Base: src, Disp: 0x280}) + // Prefetch values were chosen empirically. 
+ // Approach for prefetch usage as in 7.6.6 of [1] + // [1] 64-ia-32-architectures-optimization-manual.pdf + // https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf + VMOVDQU(Mem{Base: src}, Y0) + VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1) + VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2) + VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3) + + ADDQ(U8(0x80), src) + VMOVNTDQ(Y0, Mem{Base: dst}) + VMOVNTDQ(Y1, Mem{Base: dst, Disp: 0x20}) + VMOVNTDQ(Y2, Mem{Base: dst, Disp: 0x40}) + VMOVNTDQ(Y3, Mem{Base: dst, Disp: 0x60}) + ADDQ(U8(0x80), dst) + SUBQ(U8(0x80), length) + JA(LabelRef(name + "gobble_mem_fwd_loop")) + // NT instructions don't follow the normal cache-coherency rules. + // We need SFENCE there to make copied data available timely. + SFENCE() + VMOVDQU(Y4, Mem{Base: R8}) + VZEROUPPER() + MOVOU(X5, Mem{Base: CX, Disp: -0x80}) + MOVOU(X6, Mem{Base: CX, Disp: -0x70}) + MOVOU(X7, Mem{Base: CX, Disp: -0x60}) + MOVOU(X8, Mem{Base: CX, Disp: -0x50}) + MOVOU(X9, Mem{Base: CX, Disp: -0x40}) + MOVOU(X10, Mem{Base: CX, Disp: -0x30}) + MOVOU(X11, Mem{Base: CX, Disp: -0x20}) + MOVOU(X12, Mem{Base: CX, Disp: -0x10}) + JMP(end) + } + } +} + +// genMatchLen generates standalone matchLen. +func genMatchLen() { + TEXT("matchLen", NOSPLIT, "func(a, b []byte) int") + Doc("matchLen returns how many bytes match in a and b", "", + "It assumes that:", + " len(a) <= len(b)", "") + Pragma("noescape") + + aBase, bBase, length := GP64(), GP64(), GP64() + + Load(Param("a").Base(), aBase) + Load(Param("b").Base(), bBase) + Load(Param("a").Len(), length) + l := matchLen("standalone", Mem{Base: aBase}, Mem{Base: bBase}, length, LabelRef("gen_match_len_end")) + Label("gen_match_len_end") + Store(l, ReturnIndex(0)) + RET() +} + +// matchLen returns the number of matching bytes of a and b. +// len is the maximum number of bytes to match. +// Will jump to end when done and returns the length. +// Uses 2 GP registers. 
+func matchLen(name string, a, b Mem, len reg.GPVirtual, end LabelRef) reg.GPVirtual { + tmp, matched := GP64(), GP64() + XORQ(matched, matched) + + CMPQ(len, U8(8)) + JL(LabelRef("matchlen_single_" + name)) + + Label("matchlen_loopback_" + name) + MOVQ(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp) + XORQ(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp) + TESTQ(tmp, tmp) + JZ(LabelRef("matchlen_loop_" + name)) + // Not all match. + BSFQ(tmp, tmp) + SARQ(U8(3), tmp) + LEAQ(Mem{Base: matched, Index: tmp, Scale: 1}, matched) + JMP(end) + + // All 8 byte matched, update and loop. + Label("matchlen_loop_" + name) + LEAQ(Mem{Base: len, Disp: -8}, len) + LEAQ(Mem{Base: matched, Disp: 8}, matched) + CMPQ(len, U8(8)) + JGE(LabelRef("matchlen_loopback_" + name)) + + // Less than 8 bytes left. + Label("matchlen_single_" + name) + TESTQ(len, len) + JZ(end) + Label("matchlen_single_loopback_" + name) + MOVB(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp.As8()) + CMPB(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp.As8()) + JNE(end) + LEAQ(Mem{Base: matched, Disp: 1}, matched) + DECQ(len) + JNZ(LabelRef("matchlen_single_loopback_" + name)) + JMP(end) + return matched +} diff --git a/tests/fixedbugs/issue100/allocfail/doc.go b/tests/fixedbugs/issue100/allocfail/doc.go new file mode 100644 index 0000000..c566091 --- /dev/null +++ b/tests/fixedbugs/issue100/allocfail/doc.go @@ -0,0 +1,9 @@ +// Package allocfail is a regression test for issue 100 based on the original reported allocation failure. +// +// Based on the pull request https://github.com/klauspost/compress/pull/186 at +// c1f3cf132cd8e214b38cc16e418bf2e501ccda93 with the lines after "FIXME" +// comments re-activated and other minimal edits to make it work in this +// environment. 
+package allocfail + +//go:generate go run asm.go -out allocfail.s -stubs stubs.go diff --git a/tests/fixedbugs/issue100/allocfail/stubs.go b/tests/fixedbugs/issue100/allocfail/stubs.go new file mode 100644 index 0000000..34992a3 --- /dev/null +++ b/tests/fixedbugs/issue100/allocfail/stubs.go @@ -0,0 +1,85 @@ +// Code generated by command: go run asm.go -out allocfail.s -stubs stubs.go. DO NOT EDIT. + +// +build !appengine +// +build !noasm +// +build gc + +package allocfail + +// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm(dst []byte, src []byte) int + +// encodeBlockAsm14B encodes a non-empty src to a guaranteed-large-enough dst. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm14B(dst []byte, src []byte) int + +// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm12B(dst []byte, src []byte) int + +// encodeBlockAsmAvx encodes a non-empty src to a guaranteed-large-enough dst. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsmAvx(dst []byte, src []byte) int + +// encodeBlockAsm14BAvx encodes a non-empty src to a guaranteed-large-enough dst. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm14BAvx(dst []byte, src []byte) int + +// encodeBlockAsm12BAvx encodes a non-empty src to a guaranteed-large-enough dst. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. 
+// +//go:noescape +func encodeBlockAsm12BAvx(dst []byte, src []byte) int + +// emitLiteral writes a literal chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +// +//go:noescape +func emitLiteral(dst []byte, lit []byte) int + +// emitLiteralAvx writes a literal chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +// +//go:noescape +func emitLiteralAvx(dst []byte, lit []byte) int + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +// Length must be at least 4 and < 1<<32 +// +//go:noescape +func emitRepeat(dst []byte, offset int, length int) int + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +// +//go:noescape +func emitCopy(dst []byte, offset int, length int) int + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// len(a) <= len(b) +// +//go:noescape +func matchLen(a []byte, b []byte) int diff --git a/tests/fixedbugs/issue100/doc.go b/tests/fixedbugs/issue100/doc.go deleted file mode 100644 index e6495d8..0000000 --- a/tests/fixedbugs/issue100/doc.go +++ /dev/null @@ -1,2 +0,0 @@ -// Package issue100 contains a reproducer for a bug in aliased register allocation. 
-package issue100 diff --git a/tests/fixedbugs/issue100/asm.go b/tests/fixedbugs/issue100/minrepro/asm.go similarity index 100% rename from tests/fixedbugs/issue100/asm.go rename to tests/fixedbugs/issue100/minrepro/asm.go diff --git a/tests/fixedbugs/issue100/minrepro/doc.go b/tests/fixedbugs/issue100/minrepro/doc.go new file mode 100644 index 0000000..a275e16 --- /dev/null +++ b/tests/fixedbugs/issue100/minrepro/doc.go @@ -0,0 +1,2 @@ +// Package minrepro contains a minimal reproducer for the aliased register allocation bug in issue 100. +package minrepro diff --git a/tests/fixedbugs/issue100/issue100.s b/tests/fixedbugs/issue100/minrepro/minrepro.s similarity index 98% rename from tests/fixedbugs/issue100/issue100.s rename to tests/fixedbugs/issue100/minrepro/minrepro.s index ef14ab0..90bb6cf 100644 --- a/tests/fixedbugs/issue100/issue100.s +++ b/tests/fixedbugs/issue100/minrepro/minrepro.s @@ -1,4 +1,4 @@ -// Code generated by command: go run asm.go -out issue100.s -stubs stub.go. DO NOT EDIT. +// Code generated by command: go run asm.go -out minrepro.s -stubs stub.go. DO NOT EDIT. 
#include "textflag.h" diff --git a/tests/fixedbugs/issue100/issue100_test.go b/tests/fixedbugs/issue100/minrepro/minrepro_test.go similarity index 72% rename from tests/fixedbugs/issue100/issue100_test.go rename to tests/fixedbugs/issue100/minrepro/minrepro_test.go index 0d0e692..ced7555 100644 --- a/tests/fixedbugs/issue100/issue100_test.go +++ b/tests/fixedbugs/issue100/minrepro/minrepro_test.go @@ -1,10 +1,10 @@ -package issue100 +package minrepro import ( "testing" ) -//go:generate go run asm.go -out issue100.s -stubs stub.go +//go:generate go run asm.go -out minrepro.s -stubs stub.go func TestIssue100(t *testing.T) { n := uint64(100) diff --git a/tests/fixedbugs/issue100/minrepro/stub.go b/tests/fixedbugs/issue100/minrepro/stub.go new file mode 100644 index 0000000..b2e15a5 --- /dev/null +++ b/tests/fixedbugs/issue100/minrepro/stub.go @@ -0,0 +1,5 @@ +// Code generated by command: go run asm.go -out minrepro.s -stubs stub.go. DO NOT EDIT. + +package minrepro + +func Issue100() uint64 diff --git a/tests/fixedbugs/issue100/stub.go b/tests/fixedbugs/issue100/stub.go deleted file mode 100644 index 8a2086a..0000000 --- a/tests/fixedbugs/issue100/stub.go +++ /dev/null @@ -1,5 +0,0 @@ -// Code generated by command: go run asm.go -out issue100.s -stubs stub.go. DO NOT EDIT. - -package issue100 - -func Issue100() uint64 diff --git a/tests/fixedbugs/issue65/castphysical.go b/tests/fixedbugs/issue65/asm.go similarity index 50% rename from tests/fixedbugs/issue65/castphysical.go rename to tests/fixedbugs/issue65/asm.go index 1cae787..59f014a 100644 --- a/tests/fixedbugs/issue65/castphysical.go +++ b/tests/fixedbugs/issue65/asm.go @@ -1,11 +1,4 @@ -// +build generate - -//go:generate go run $GOFILE - -// Regression test for a bug where casting a physical register would give the -// error "non physical register found". 
-// -// See: https://github.com/mmcloughlin/avo/issues/65#issuecomment-576850145 +// +build ignore package main diff --git a/tests/fixedbugs/issue65/doc.go b/tests/fixedbugs/issue65/doc.go new file mode 100644 index 0000000..e3474a0 --- /dev/null +++ b/tests/fixedbugs/issue65/doc.go @@ -0,0 +1,9 @@ +// Package issue65 is a regression test for a bug involving casting physical registers. +// +// Regression test for a bug where casting a physical register would give the +// error "non physical register found". +// +// See: https://github.com/mmcloughlin/avo/issues/65#issuecomment-576850145 +package issue65 + +//go:generate go run asm.go -out issue65.s -stubs stub.go diff --git a/tests/fixedbugs/issue65/issue65.s b/tests/fixedbugs/issue65/issue65.s new file mode 100644 index 0000000..968d878 --- /dev/null +++ b/tests/fixedbugs/issue65/issue65.s @@ -0,0 +1,9 @@ +// Code generated by command: go run asm.go -out issue65.s -stubs stub.go. DO NOT EDIT. + +#include "textflag.h" + +// func Issue65() +// Requires: AVX2 +TEXT ·Issue65(SB), NOSPLIT, $0 + VINSERTI128 $0x01, X0, Y1, Y2 + RET diff --git a/tests/fixedbugs/issue65/stub.go b/tests/fixedbugs/issue65/stub.go new file mode 100644 index 0000000..4d8d601 --- /dev/null +++ b/tests/fixedbugs/issue65/stub.go @@ -0,0 +1,5 @@ +// Code generated by command: go run asm.go -out issue65.s -stubs stub.go. DO NOT EDIT. + +package issue65 + +func Issue65() diff --git a/x86/zctors.go b/x86/zctors.go index 7394e03..447c0a1 100644 --- a/x86/zctors.go +++ b/x86/zctors.go @@ -15578,10 +15578,11 @@ func RDTSCP() (*intrep.Instruction, error) { // RET func RET() (*intrep.Instruction, error) { return &intrep.Instruction{ - Opcode: "RET", - Operands: nil, - Inputs: []operand.Op{}, - Outputs: []operand.Op{}, + Opcode: "RET", + Operands: nil, + Inputs: []operand.Op{}, + Outputs: []operand.Op{}, + IsTerminal: true, }, nil }