// +build ignore

package main

import (
	"fmt"
	"log"

	. "github.com/mmcloughlin/avo/build"
	"github.com/mmcloughlin/avo/buildtags"
	"github.com/mmcloughlin/avo/operand"
	. "github.com/mmcloughlin/avo/operand"
	"github.com/mmcloughlin/avo/reg"
)

func main() {
	Constraint(buildtags.Not("appengine").ToConstraint())
	Constraint(buildtags.Not("noasm").ToConstraint())
	Constraint(buildtags.Term("gc").ToConstraint())

	genEncodeBlockAsm("encodeBlockAsm", 16, 6, false)
	genEncodeBlockAsm("encodeBlockAsm14B", 14, 5, false)
	genEncodeBlockAsm("encodeBlockAsm12B", 12, 4, false)
	genEncodeBlockAsm("encodeBlockAsmAvx", 16, 6, true)
	genEncodeBlockAsm("encodeBlockAsm14BAvx", 14, 5, true)
	genEncodeBlockAsm("encodeBlockAsm12BAvx", 12, 4, true)
	genEmitLiteral()
	genEmitRepeat()
	genEmitCopy()
	genMatchLen()
	Generate()
}

func debugval(v operand.Op) {
	value := reg.R15
	MOVQ(v, value)
	INT(Imm(3))
}

func genEncodeBlockAsm(name string, tableBits, skipLog int, avx bool) {
	TEXT(name, 0, "func(dst, src []byte) int")
	Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
		"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
	Pragma("noescape")

	// "var table [maxTableSize]uint32" takes up 4 * (1 << tableBits) bytes of stack space.
	// Extra bytes are added to keep less used values.
	var (
		tableSize = 1 << uint(tableBits)
		// Keep base stack multiple of 16.
		baseStack = 0
		// try to keep extraStack + baseStack multiple of 16
		// for best chance of table alignment.
		extraStack = 32
		allocStack = baseStack + extraStack + tableSize
	)

	// Memzero needs at least 128 bytes.
	if tableSize < 128 {
		panic("tableSize must be at least 128 bytes")
	}

	lenSrcBasic, err := Param("src").Len().Resolve()
	if err != nil {
		panic(err)
	}
	lenSrcQ := lenSrcBasic.Addr

	stack := AllocLocal(allocStack)
	table := stack.Offset(allocStack - tableSize)

	tmpStack := baseStack
	// Bail if we can't compress to at least this.
	dstLimitPtrQ := stack.Offset(tmpStack)
	tmpStack += 8
	// dstStartPtrQ contains the original dst pointer for returning the length
	dstStartPtrQ := stack.Offset(tmpStack)
	tmpStack += 8
	// sLimitL is when to stop looking for offset/length copies.
	sLimitL := stack.Offset(tmpStack)
	tmpStack += 4
	// nextEmitL keeps track of the point we have emitted to.
	nextEmitL := stack.Offset(tmpStack)
	tmpStack += 4
	// Repeat stores the last match offset.
	repeatL := stack.Offset(tmpStack)
	tmpStack += 4
	// nextSTempL keeps nextS while other functions are being called.
	nextSTempL := stack.Offset(tmpStack)
	tmpStack += 4
	// Ensure we have the correct extra stack.
	// Could be automatic, but whatever.
	if tmpStack-baseStack != extraStack {
		log.Fatal("adjust extraStack to ", tmpStack-baseStack)
	}

	dstBaseBasic, err := Param("dst").Base().Resolve()
	if err != nil {
		panic(err)
	}
	dstBase := dstBaseBasic.Addr

	if tmpStack > extraStack+baseStack {
		panic(fmt.Sprintf("tmp stack exceeded: %v", tmpStack))
	}

	// Zero table
	{
		iReg := GP64()
		MOVQ(U32(tableSize/8/16), iReg)
		tablePtr := GP64()
		LEAQ(table, tablePtr)
		zeroXmm := XMM()
		PXOR(zeroXmm, zeroXmm)

		Label("zero_loop_" + name)
		for i := 0; i < 8; i++ {
			MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
		}
		ADDQ(U8(16*8), tablePtr)
		DECQ(iReg)
		JNZ(LabelRef("zero_loop_" + name))

		// nextEmit is offset n src where the next emitLiteral should start from.
		MOVL(iReg.As32(), nextEmitL)
	}

	{
		const inputMargin = 8
		tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
		MOVQ(lenSrcQ, tmp)
		LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
		// sLimitL := len(src) - inputMargin
		LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)
		// dstLimit := len(src) - len(src)>>5 - 5
		SHRQ(U8(5), tmp)
		SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp
		MOVL(tmp3.As32(), sLimitL)
		dstAddr := GP64()
		MOVQ(dstBase, dstAddr)
		// Store dst start address
		MOVQ(dstAddr, dstStartPtrQ)
		LEAQ(Mem{Base: dstAddr, Index: tmp2, Scale: 1}, tmp2)
		MOVQ(tmp2, dstLimitPtrQ)
	}

	// s = 1
	s := GP64().As32()
	MOVL(U32(1), s)
	// repeatL = 1
	MOVL(s, repeatL)

	src := GP64()
	Load(Param("src").Base(), src)

	// Load cv
	Label("search_loop_" + name)
	candidate := GP64().As32()
	{
		cv := GP64()
		MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
		nextS := GP64()
		// nextS := s + (s-nextEmit)>>6 + 4
		{
			tmp := GP64()
			MOVL(s, tmp.As32())           // tmp = s
			SUBL(nextEmitL, tmp.As32())   // tmp = s - nextEmit
			SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
			LEAQ(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS)
		}
		// if nextS > sLimit {goto emitRemainder}
		{
			tmp := GP64()
			MOVL(sLimitL, tmp.As32())
			CMPL(nextS.As32(), tmp.As32())
			JGT(LabelRef("emit_remainder_" + name))
		}
		// move nextS to stack.
		MOVL(nextS.As32(), nextSTempL)

		candidate2 := GP64().As32()
		hasher := hash6(tableBits)
		{
			hash0, hash1 := GP64(), GP64()
			MOVQ(cv, hash0)
			MOVQ(cv, hash1)
			SHRQ(U8(8), hash1)
			hasher.hash(hash0)
			hasher.hash(hash1)
			MOVL(table.Idx(hash0, 1), candidate)
			MOVL(table.Idx(hash1, 1), candidate2)
			MOVL(s, table.Idx(hash0, 1))
			tmp := GP64().As32()
			LEAL(Mem{Base: s, Disp: 1}, tmp)
			MOVL(tmp, table.Idx(hash1, 1))
		}
		// Check repeat at offset checkRep
		const checkRep = 1

		if true {
			// rep = s - repeat
			rep := GP64().As32()
			if true {
				// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
				left, right := GP64(), GP64()
				MOVL(s, rep)
				SUBL(repeatL, rep) // rep = s - repeat
				MOVL(Mem{Base: src, Index: rep, Scale: 1, Disp: checkRep}, right.As32())
				MOVQ(cv, left)
				SHLQ(U8(checkRep*8), left)
				CMPL(left.As32(), right.As32())

				// FIXME: Unable to allocate if enabled.
				JNE(LabelRef("no_repeat_found_" + name))
			}
			// base = s + 1
			base := GP64()
			LEAQ(Mem{Base: s, Disp: 1}, base)
			// Extend back
			if true {
				ne := GP64().As32()
				MOVL(nextEmitL, ne)
				TESTL(rep, rep)
				JZ(LabelRef("repeat_extend_back_end_" + name))

				// I is tested when decremented, so we loop back here.
				Label("repeat_extend_back_loop_" + name)
				CMPL(base.As32(), ne)
				JG(LabelRef("repeat_extend_back_end_" + name))
				// if src[i-1] == src[base-1]
				tmp, tmp2 := GP64(), GP64()
				MOVB(Mem{Base: src, Index: rep, Scale: 1, Disp: -1}, tmp.As8())
				MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
				CMPB(tmp.As8(), tmp2.As8())
				JNE(LabelRef("repeat_extend_back_end_" + name))
				LEAQ(Mem{Base: base, Disp: -1}, base)
				DECL(rep)
				JZ(LabelRef("repeat_extend_back_end_" + name))
				JMP(LabelRef("repeat_extend_back_loop_" + name))
			}
			Label("repeat_extend_back_end_" + name)
			// Base is now at start.
			// d += emitLiteral(dst[d:], src[nextEmitL:base])
			if true {
				emitLiterals(nextEmitL, base, src, dstBase, "repeat_emit_"+name, avx)
			}

			// Extend forward
			if true {
				// s += 4 + checkRep
				ADDL(U8(4+checkRep), s)

				// candidate := s - repeat + 4 + checkRep
				MOVL(s, candidate)
				SUBL(repeatL, candidate) // candidate = s - repeatL
				{
					// srcLeft = sLimitL - s
					srcLeft := GP64()
					MOVL(sLimitL, srcLeft.As32())
					SUBL(s, srcLeft.As32())

					// Forward address
					forwardStart := Mem{Base: src, Index: s, Scale: 1}
					// End address
					backStart := Mem{Base: src, Index: candidate, Scale: 1}
					length := matchLen("repeat_extend", forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
					Label("repeat_extend_forward_end_" + name)
					// s+= length
					ADDL(length.As32(), s)
				}
			}
			// Emit
			if true {
				// length = s-base
				length := GP64()
				MOVL(s, length.As32())
				SUBL(base.As32(), length.As32())

				offsetVal := GP64()
				MOVL(repeatL, offsetVal.As32())
				dst := GP64()
				MOVQ(dstBase, dst)

				// if nextEmit > 0
				tmp := GP64()
				MOVL(nextEmitL, tmp.As32())
				TESTL(tmp.As32(), tmp.As32())

				// FIXME: fails to allocate regs if enabled:
				JZ(LabelRef("repeat_as_copy_" + name))

				emitRepeat("match_repeat_", length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))

				// JUMPS TO HERE:
				Label("repeat_as_copy_" + name)
				emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))

				Label("repeat_end_emit_" + name)
				// Store new dst and nextEmit
				MOVQ(dst, dstBase)
			}
			// if s >= sLimit
			// can be omitted.
			if true {
				tmp := GP64()
				MOVL(sLimitL, tmp.As32())
				CMPL(s, tmp.As32())
				JGT(LabelRef("emit_remainder_" + name))
			}
			JMP(LabelRef("search_loop_" + name))
		}
		Label("no_repeat_found_" + name)
		{
			// Can be moved up if registers are available.
			hash2 := GP64()
			{
				// hash2 := hash6(cv>>16, tableBits)
				hasher = hash6(tableBits)
				MOVQ(cv, hash2)
				SHRQ(U8(16), hash2)
				hasher.hash(hash2)
			}

			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
			// cv >>= 8
			SHRQ(U8(8), cv)
			JEQ(LabelRef("candidate_match_" + name))

			// candidate = int(table[hash2])
			MOVL(table.Idx(hash2, 1), candidate)

			// if uint32(cv>>8) == load32(src, candidate2)
			CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32())
			JEQ(LabelRef("candidate2_match_" + name))

			// table[hash2] = uint32(s + 2)
			tmp := GP64()
			LEAQ(Mem{Base: s, Disp: 2}, tmp)
			MOVL(tmp.As32(), table.Idx(hash2, 1))

			// if uint32(cv>>16) == load32(src, candidate)
			SHRQ(U8(8), cv)
			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
			JEQ(LabelRef("candidate3_match_" + name))
			// s = nextS
			MOVL(nextSTempL, s)
			JMP(LabelRef("search_loop_" + name))

			// Matches candidate3
			Label("candidate3_match_" + name)
			ADDL(U8(2), s)
			JMP(LabelRef("candidate_match_" + name))

			Label("candidate2_match_" + name)
			// table[hash2] = uint32(s + 2)
			tmp = GP64()
			LEAQ(Mem{Base: s, Disp: -2}, tmp)
			MOVL(tmp.As32(), table.Idx(hash2, 1))
			// s++
			INCL(s)
			MOVL(candidate2, candidate)
		}
	}

	Label("candidate_match_" + name)
	// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
	// Extend backwards
	{
		ne := GP64()
		MOVL(nextEmitL, ne.As32())
		TESTL(candidate, candidate)
		JZ(LabelRef("match_extend_back_end_" + name))

		// candidate is tested when decremented, so we loop back here.
		Label("match_extend_back_loop_" + name)
		CMPL(s, ne.As32())
		JG(LabelRef("match_extend_back_end_" + name))
		// if src[candidate-1] == src[s-1]
		tmp, tmp2 := GP64(), GP64()
		MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
		MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
		CMPB(tmp.As8(), tmp2.As8())
		JNE(LabelRef("match_extend_back_end_" + name))
		LEAL(Mem{Base: s, Disp: -1}, s)
		DECL(candidate)
		JZ(LabelRef("match_extend_back_end_" + name))
		JMP(LabelRef("match_extend_back_loop_" + name))
	}
	Label("match_extend_back_end_" + name)

	// Bail if we exceed the maximum size.
	if true {
		// tmp = s-nextEmitL
		tmp := GP64()
		MOVL(s, tmp.As32())
		SUBL(nextEmitL, tmp.As32())
		LEAQ(dstBase.Idx(tmp, 1), tmp)
		CMPQ(tmp, dstLimitPtrQ)
		JL(LabelRef("match_dst_size_check_" + name))
		ri, err := ReturnIndex(0).Resolve()
		if err != nil {
			panic(err)
		}
		MOVQ(U32(0), ri.Addr)
		RET()
	}
	Label("match_dst_size_check_" + name)
	{
		base := GP64()
		MOVL(candidate, base.As32())
		emitLiterals(nextEmitL, base, src, dstBase, "match_emit_"+name, avx)
		NOP()
	}

	Label("match_nolit_loop_" + name)
	{
		base := GP64().As32()
		MOVL(s, base)
		// Update repeat
		{
			// repeat = base - candidate
			repeatVal := GP64().As32()
			MOVL(s, repeatVal)
			SUBL(candidate, repeatVal)
			MOVL(repeatVal, repeatL)
		}
		// s+=4, candidate+=4
		ADDL(U8(4), s)
		ADDL(U8(4), candidate)
		// Extend the 4-byte match as long as possible and emit copy.
		{
			// srcLeft = sLimitL - s
			srcLeft := GP64()
			MOVL(sLimitL, srcLeft.As32())
			SUBL(s, srcLeft.As32())
			length := matchLen("match_nolit_"+name,
				Mem{Base: src, Index: s, Scale: 1},
				Mem{Base: src, Index: candidate, Scale: 1},
				srcLeft,
				LabelRef("match_nolit_end_"+name),
			)
			Label("match_nolit_end_" + name)
			offset := GP64()
			MOVL(repeatL, offset.As32())
			ADDQ(U8(4), length)
			dst := GP64()
			MOVQ(dstBase, dst)
			// s += length (lenght is destroyed, use it now)
			ADDL(length.As32(), s)
			emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
			Label("match_nolit_emitcopy_end_" + name)
			MOVQ(dst, dstBase)
			MOVL(s, nextEmitL)
			CMPL(s, sLimitL)
			JGE(LabelRef("emit_remainder_" + name))

			// Bail if we exceed the maximum size.
			{
				CMPQ(dst, dstLimitPtrQ)
				JL(LabelRef("match_nolit_dst_ok_" + name))
				ri, err := ReturnIndex(0).Resolve()
				if err != nil {
					panic(err)
				}
				MOVQ(U32(0), ri.Addr)
				RET()
				Label("match_nolit_dst_ok_" + name)
			}
		}
		{
			// Check for an immediate match, otherwise start search at s+1
			x := GP64()
			// Index s-2
			MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, x)
			hasher := hash6(tableBits)
			hash0, hash1 := GP64(), GP64()
			MOVQ(x, hash0) // s-2
			SHRQ(U8(16), x)
			MOVQ(x, hash1) // s
			hasher.hash(hash0)
			hasher.hash(hash1)
			c0, c1 := GP64(), GP64()
			MOVL(table.Idx(hash0, 1), c0.As32())
			MOVL(table.Idx(hash1, 1), c1.As32())
			sm2 := GP64()
			LEAQ(Mem{Base: s, Disp: -2}, sm2)
			MOVL(sm2.As32(), table.Idx(hash0, 1))
			MOVL(s, table.Idx(hash1, 1))
			CMPL(Mem{Base: src, Index: hash1, Scale: 1}, x.As32())
			JEQ(LabelRef("match_nolit_loop_" + name))
			INCL(s)
		}
		JMP(LabelRef("search_loop_" + name))
	}

	Label("emit_remainder_" + name)
	// Bail if we exceed the maximum size.
	// if d+len(src)-nextEmitL > dstLimitPtrQ {	return 0
	{
		// remain = lenSrc - nextEmitL
		remain := GP64()
		MOVQ(lenSrcQ, remain)
		SUBL(nextEmitL, remain.As32())
		dst := GP64()
		MOVQ(dstBase, dst)
		// dst := dst + (len(src)-nextEmitL)
		LEAQ(Mem{Base: dst, Index: remain, Scale: 1}, dst)
		CMPQ(dst, dstLimitPtrQ)
		JL(LabelRef("emit_remainder_ok_" + name))
		ri, err := ReturnIndex(0).Resolve()
		if err != nil {
			panic(err)
		}
		MOVQ(U32(0), ri.Addr)
		RET()
		Label("emit_remainder_ok_" + name)
	}
	// emitLiteral(dst[d:], src[nextEmitL:])
	emitEnd := GP64()
	MOVQ(lenSrcQ, emitEnd)

	// Emit final literals.
	emitLiterals(nextEmitL, emitEnd, src, dstBase, "emit_remainder_"+name, avx)

	// length := start - base (ptr arithmetic)
	length := GP64()
	MOVQ(dstStartPtrQ, length)
	SUBQ(dstBase, length)

	Store(length, ReturnIndex(0))
	RET()
}

// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase.
// Checks if base == nextemit.
// src & base are untouched.
func emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string, avx bool) {
	nextEmit, litLen, dstBaseTmp, litBase := GP64().As32(), GP64(), GP64(), GP64()
	MOVL(nextEmitL, nextEmit)
	CMPL(nextEmit, base.As32())
	JEQ(LabelRef("emit_literal_skip_" + name))
	MOVL(base.As32(), litLen.As32())

	// Base is now next emit.
	MOVL(base.As32(), nextEmitL)

	// litBase = src[nextEmitL:]
	LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
	SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit

	// Load (and store when we return)
	MOVQ(dstBase, dstBaseTmp)
	emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), avx, true)
	Label("emit_literal_done_" + name)
	// Store updated dstBase
	MOVQ(dstBaseTmp, dstBase)
	Label("emit_literal_skip_" + name)
}

type hashGen struct {
	bytes     int
	tablebits int
	mulreg    reg.GPVirtual
}

// hash uses multiply to get a 'output' hash on the hash of the lowest 'bytes' bytes in value.
func hash6(tablebits int) hashGen {
	h := hashGen{
		bytes:     6,
		tablebits: tablebits,
		mulreg:    GP64(),
	}
	MOVQ(Imm(227718039650203), h.mulreg)
	return h
}

// hash uses multiply to get hash of the value.
func (h hashGen) hash(val reg.GPVirtual) {
	// Move value to top of register.
	SHLQ(U8(64-8*h.bytes), val)
	IMULQ(h.mulreg, val)
	// Move value to bottom
	SHRQ(U8(64-h.tablebits), val)
}

func genEmitLiteral() {
	TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int")
	Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "",
		"It assumes that:",
		"  dst is long enough to hold the encoded bytes",
		"  0 <= len(lit) && len(lit) <= math.MaxUint32", "")
	Pragma("noescape")

	dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64()
	Load(Param("dst").Base(), dstBase)
	Load(Param("lit").Base(), litBase)
	Load(Param("lit").Len(), litLen)
	emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false, false)
	Label("emit_literal_end_standalone")
	Store(retval, ReturnIndex(0))
	RET()

	TEXT("emitLiteralAvx", NOSPLIT, "func(dst, lit []byte) int")
	Doc("emitLiteralAvx writes a literal chunk and returns the number of bytes written.", "",
		"It assumes that:",
		"  dst is long enough to hold the encoded bytes",
		"  0 <= len(lit) && len(lit) <= math.MaxUint32", "")
	Pragma("noescape")

	dstBase, litBase, litLen, retval = GP64(), GP64(), GP64(), GP64()
	Load(Param("dst").Base(), dstBase)
	Load(Param("lit").Base(), litBase)
	Load(Param("lit").Len(), litLen)
	emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_avx_standalone", true, false)
	Label("emit_literal_end_avx_standalone")
	Store(retval, ReturnIndex(0))
	RET()
}

// emitLiteral can be used for inlining an emitLiteral call.
// stack must have at least 32 bytes.
// retval will contain emitted bytes, but can be nil if this is not interesting.
// dstBase and litBase are updated.
// Uses 2 GP registers. With AVX 4 registers.
// If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
func emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, avx, updateDst bool) {
	n := GP64()
	n16 := GP64()

	// We always add litLen bytes
	if retval != nil {
		MOVQ(litLen, retval)
	}
	MOVQ(litLen, n)

	SUBL(U8(1), n.As32())
	// Return if AX was 0
	JC(end)

	// Find number of bytes to emit for tag.
	CMPL(n.As32(), U8(60))
	JLT(LabelRef("one_byte_" + name))
	CMPL(n.As32(), U32(1<<8))
	JLT(LabelRef("two_bytes_" + name))
	CMPL(n.As32(), U32(1<<16))
	JLT(LabelRef("three_bytes_" + name))
	CMPL(n.As32(), U32(1<<24))
	JLT(LabelRef("four_bytes_" + name))

	Label("five_bytes_" + name)
	MOVB(U8(252), Mem{Base: dstBase})
	MOVL(n.As32(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(5), retval)
	}
	ADDQ(U8(5), dstBase)
	JMP(LabelRef("memmove_" + name))

	Label("four_bytes_" + name)
	MOVQ(n, n16)
	SHRL(U8(16), n16.As32())
	MOVB(U8(248), Mem{Base: dstBase})
	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
	MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3})
	if retval != nil {
		ADDQ(U8(4), retval)
	}
	ADDQ(U8(4), dstBase)
	JMP(LabelRef("memmove_" + name))

	Label("three_bytes_" + name)
	MOVB(U8(0xf4), Mem{Base: dstBase})
	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(3), retval)
	}
	ADDQ(U8(3), dstBase)
	JMP(LabelRef("memmove_" + name))

	Label("two_bytes_" + name)
	MOVB(U8(0xf0), Mem{Base: dstBase})
	MOVB(n.As8(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(2), retval)
	}
	ADDQ(U8(2), dstBase)
	JMP(LabelRef("memmove_" + name))

	Label("one_byte_" + name)
	SHLB(U8(2), n.As8())
	MOVB(n.As8(), Mem{Base: dstBase})
	if retval != nil {
		ADDQ(U8(1), retval)
	}
	ADDQ(U8(1), dstBase)
	// Fallthrough

	Label("memmove_" + name)

	// copy(dst[i:], lit)
	if true {
		dstEnd := GP64()
		if updateDst {
			LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
		}
		genMemMove2("emit_lit_memmove_"+name, dstBase, litBase, litLen, end, avx)
		if updateDst {
			MOVQ(dstEnd, dstBase)
		}
	} else {
		genMemMove("emit_lit_memmove_"+name, dstBase, litBase, litLen, end)
	}
	return
}

// genEmitRepeat generates a standlone emitRepeat.
func genEmitRepeat() {
	TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
	Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
		"Length must be at least 4 and < 1<<32", "")
	Pragma("noescape")

	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()

	// retval = 0
	XORQ(retval, retval)

	Load(Param("dst").Base(), dstBase)
	Load(Param("offset"), offset)
	Load(Param("length"), length)
	emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end"))
	Label("gen_emit_repeat_end")
	Store(retval, ReturnIndex(0))
	RET()
}

// emitRepeat can be used for inlining an emitRepeat call.
// length >= 4 and < 1<<32
// length is modified. dstBase is updated. retval is added to input.
// retval can be nil.
// Will jump to end label when finished.
// Uses 1 GP register.
func emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
	Label("emit_repeat_again_" + name)
	tmp := GP64()
	MOVQ(length, tmp) // Copy length
	// length -= 4
	LEAQ(Mem{Base: length, Disp: -4}, length)

	// if length <= 4 (use copied value)
	CMPL(tmp.As32(), U8(8))
	JLE(LabelRef("repeat_two_" + name))

	// length < 8 && offset < 2048
	CMPL(tmp.As32(), U8(12))
	JGE(LabelRef("cant_repeat_two_offset_" + name))
	CMPL(offset.As32(), U32(2048))
	JLT(LabelRef("repeat_two_offset_" + name))

	const maxRepeat = ((1 << 24) - 1) + 65536
	Label("cant_repeat_two_offset_" + name)
	CMPL(length.As32(), U32((1<<8)+4))
	JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4
	CMPL(length.As32(), U32((1<<16)+(1<<8)))
	JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
	CMPL(length.As32(), U32(maxRepeat))
	JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.

	// We have have more than 24 bits
	// Emit so we have at least 4 bytes left.
	LEAQ(Mem{Base: length, Disp: -(maxRepeat - 4)}, length) // length -= (maxRepeat - 4)
	MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})            // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
	MOVW(U16(65531), Mem{Base: dstBase, Disp: 2})           // 0xfffb
	MOVB(U8(255), Mem{Base: dstBase, Disp: 4})
	ADDQ(U8(5), dstBase)
	if retval != nil {
		ADDQ(U8(5), retval)
	}
	JMP(LabelRef("emit_repeat_again_" + name))

	// Must be able to be within 5 bytes.
	Label("repeat_five_" + name)
	LEAQ(Mem{Base: length, Disp: -65536}, length) // length -= 65536
	MOVQ(length, offset)
	MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
	MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
	SARQ(U8(16), offset)                             // offset = length >> 16
	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4})  // dst[4] = length >> 16
	if retval != nil {
		ADDQ(U8(5), retval) // i += 5
	}
	ADDQ(U8(5), dstBase) // dst += 5
	JMP(end)

	Label("repeat_four_" + name)
	LEAQ(Mem{Base: length, Disp: -256}, length)      // length -= 256
	MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 6<<2 | tagCopy1, dst[1] = 0
	MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
	if retval != nil {
		ADDQ(U8(4), retval) // i += 4
	}
	ADDQ(U8(4), dstBase) // dst += 4
	JMP(end)

	Label("repeat_three_" + name)
	LEAQ(Mem{Base: length, Disp: -4}, length)       // length -= 4
	MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase})    // dst[0] = 5<<2 | tagCopy1, dst[1] = 0
	MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length)
	if retval != nil {
		ADDQ(U8(3), retval) // i += 3
	}
	ADDQ(U8(3), dstBase) // dst += 3
	JMP(end)

	Label("repeat_two_" + name)
	// dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
	SHLL(U8(2), length.As32())
	ORL(U8(tagCopy1), length.As32())
	MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
	if retval != nil {
		ADDQ(U8(2), retval) // i += 2
	}
	ADDQ(U8(2), dstBase) // dst += 2
	JMP(end)

	Label("repeat_two_offset_" + name)
	// Emit the remaining copy, encoded as 2 bytes.
	// dst[1] = uint8(offset)
	// dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
	tmp = GP64()
	XORQ(tmp, tmp)
	// Use scale and displacement to shift and subtract values from length.
	LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length)
	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
	SARL(U8(8), offset.As32())                      // Remove lower
	SHLL(U8(5), offset.As32())                      // Shift back up
	ORL(offset.As32(), length.As32())               // OR result
	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
	if retval != nil {
		ADDQ(U8(2), retval) // i += 2
	}
	ADDQ(U8(2), dstBase) // dst += 2

	JMP(end)
}

// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
//	dst is long enough to hold the encoded bytes
//	1 <= offset && offset <= math.MaxUint32
//	4 <= length && length <= 1 << 24

// genEmitCopy generates a standlone emitCopy
func genEmitCopy() {
	TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int")
	Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "",
		"It assumes that:",
		"  dst is long enough to hold the encoded bytes",
		"  1 <= offset && offset <= math.MaxUint32",
		"  4 <= length && length <= 1 << 24", "")
	Pragma("noescape")

	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()

	//	i := 0
	XORQ(retval, retval)

	Load(Param("dst").Base(), dstBase)
	Load(Param("offset"), offset)
	Load(Param("length"), length)
	emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end"))
	Label("gen_emit_copy_end")
	Store(retval, ReturnIndex(0))
	RET()
}

const (
	tagLiteral = 0x00
	tagCopy1   = 0x01
	tagCopy2   = 0x02
	tagCopy4   = 0x03
)

// emitCopy can be used for inlining an emitCopy call.
// length is modified (and junk). dstBase is updated. retval is added to input.
// retval can be nil.
// Will jump to end label when finished.
// Uses 2 GP registers.
func emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
	// if offset >= 65536 {
	CMPL(offset.As32(), U32(65536))
	JL(LabelRef("two_byte_offset_" + name))

	// offset is >= 65536
	//	if length <= 64 goto four_bytes_remain_
	CMPL(length.As32(), U8(64))
	JLE(LabelRef("four_bytes_remain_" + name))

	// Emit a length 64 copy, encoded as 5 bytes.
	//		dst[0] = 63<<2 | tagCopy4
	MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase})
	//		dst[4] = uint8(offset >> 24)
	//		dst[3] = uint8(offset >> 16)
	//		dst[2] = uint8(offset >> 8)
	//		dst[1] = uint8(offset)
	MOVD(offset, Mem{Base: dstBase, Disp: 1})
	//		length -= 64
	LEAQ(Mem{Base: length, Disp: -64}, length)
	if retval != nil {
		ADDQ(U8(5), retval) // i+=5
	}
	ADDQ(U8(5), dstBase) // dst+=5

	//	if length >= 4 {
	CMPL(length.As32(), U8(4))
	JL(LabelRef("four_bytes_remain_" + name))

	// Emit remaining as repeats
	//	return 5 + emitRepeat(dst[5:], offset, length)
	// Inline call to emitRepeat. Will jump to end
	emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end)

	Label("four_bytes_remain_" + name)
	//	if length == 0 {
	//		return i
	//	}
	TESTL(length.As32(), length.As32())
	JZ(end)

	// Emit a copy, offset encoded as 4 bytes.
	//	dst[i+0] = uint8(length-1)<<2 | tagCopy4
	//	dst[i+1] = uint8(offset)
	//	dst[i+2] = uint8(offset >> 8)
	//	dst[i+3] = uint8(offset >> 16)
	//	dst[i+4] = uint8(offset >> 24)
	tmp := GP64()
	MOVB(U8(tagCopy4), tmp.As8())
	// Use displacement to subtract 1 from upshifted length.
	LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length)
	MOVB(length.As8(), Mem{Base: dstBase})
	MOVD(offset, Mem{Base: dstBase, Disp: 1})
	//	return i + 5
	if retval != nil {
		ADDQ(U8(5), retval)
	}
	ADDQ(U8(5), dstBase)
	JMP(end)

	Label("two_byte_offset_" + name)
	// Offset no more than 2 bytes.

	// if length > 64 {
	CMPL(length.As32(), U8(64))
	JLE(LabelRef("two_byte_offset_short_" + name))
	// Emit a length 60 copy, encoded as 3 bytes.
	// Emit remaining as repeat value (minimum 4 bytes).
	//	dst[2] = uint8(offset >> 8)
	//	dst[1] = uint8(offset)
	//	dst[0] = 59<<2 | tagCopy2
	MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase})
	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
	//	length -= 60
	LEAQ(Mem{Base: length, Disp: -60}, length)

	// Emit remaining as repeats, at least 4 bytes remain.
	//	return 3 + emitRepeat(dst[3:], offset, length)
	//}
	ADDQ(U8(3), dstBase)
	if retval != nil {
		ADDQ(U8(3), retval)
	}
	// Inline call to emitRepeat. Will jump to end
	emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end)

	Label("two_byte_offset_short_" + name)
	// if length >= 12 || offset >= 2048 {
	CMPL(length.As32(), U8(12))
	JGE(LabelRef("emit_copy_three_" + name))
	CMPL(offset.As32(), U32(2048))
	JGE(LabelRef("emit_copy_three_" + name))

	// Emit the remaining copy, encoded as 2 bytes.
	// dst[1] = uint8(offset)
	// dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
	tmp = GP64()
	MOVB(U8(tagCopy1), tmp.As8())
	// Use scale and displacement to shift and subtract values from length.
	LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length)
	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
	SHRL(U8(8), offset.As32())                      // Remove lower
	SHLL(U8(5), offset.As32())                      // Shift back up
	ORL(offset.As32(), length.As32())               // OR result
	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
	if retval != nil {
		ADDQ(U8(2), retval) // i += 2
	}
	ADDQ(U8(2), dstBase) // dst += 2
	// return 2
	JMP(end)

	Label("emit_copy_three_" + name)
	//	// Emit the remaining copy, encoded as 3 bytes.
	//	dst[2] = uint8(offset >> 8)
	//	dst[1] = uint8(offset)
	//	dst[0] = uint8(length-1)<<2 | tagCopy2
	tmp = GP64()
	MOVB(U8(tagCopy2), tmp.As8())
	LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length)
	MOVB(length.As8(), Mem{Base: dstBase})
	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
	//	return 3
	if retval != nil {
		ADDQ(U8(3), retval) // i += 3
	}
	ADDQ(U8(3), dstBase) // dst += 3
	JMP(end)
}

// func memmove(to, from unsafe.Pointer, n uintptr)
// to and from will be at the end, n will be 0.
// to and from may not overlap.
// Fairly simplistic for now, can ofc. be extended.
// Uses one GP register and 8 SSE registers.
func genMemMove(name string, to, from, n reg.GPVirtual, end LabelRef) {
	tmp := GP64()
	MOVQ(n, tmp)
	// tmp = n/128
	SHRQ(U8(7), tmp)

	TESTQ(tmp, tmp)
	JZ(LabelRef("done_128_" + name))
	Label("loop_128_" + name)
	var xmmregs [8]reg.VecVirtual

	// Prefetch destination for next loop.
	// Prefetching source doesn't provide speedup.
	// This seems to give a small boost.
	const preOff = 128
	PREFETCHT0(Mem{Base: to, Disp: preOff})
	PREFETCHT0(Mem{Base: to, Disp: preOff + 64})

	for i := 0; i < 8; i++ {
		xmmregs[i] = XMM()
		MOVOU(Mem{Base: from}.Offset(i*16), xmmregs[i])
	}
	for i := 0; i < 8; i++ {
		MOVOU(xmmregs[i], Mem{Base: to}.Offset(i*16))
	}
	LEAQ(Mem{Base: n, Disp: -128}, n)
	ADDQ(U8(8*16), from)
	ADDQ(U8(8*16), to)
	DECQ(tmp)
	JNZ(LabelRef("loop_128_" + name))

	Label("done_128_" + name)
	MOVQ(n, tmp)
	// tmp = n/16
	SHRQ(U8(4), tmp)
	TESTQ(tmp, tmp)
	JZ(LabelRef("done_16_" + name))

	Label("loop_16_" + name)
	xmm := XMM()
	MOVOU(Mem{Base: from}, xmm)
	MOVOU(xmm, Mem{Base: to})
	LEAQ(Mem{Base: n, Disp: -16}, n)
	ADDQ(U8(16), from)
	ADDQ(U8(16), to)
	DECQ(tmp)
	JNZ(LabelRef("loop_16_" + name))
	Label("done_16_" + name)

	// TODO: Use REP; MOVSB somehow.
	TESTQ(n, n)
	JZ(end)
	Label("loop_1_" + name)
	MOVB(Mem{Base: from}, tmp.As8())
	MOVB(tmp.As8(), Mem{Base: to})
	INCQ(from)
	INCQ(to)
	DECQ(n)
	JNZ(LabelRef("loop_1_" + name))
}

// func memmove(to, from unsafe.Pointer, n uintptr)
// src and dst may not overlap.
// Non AVX uses 2 GP register, 16 SSE2 registers.
// AVX uses 4 GP registers 16 AVX/SSE registers.
// All passed registers may be updated.
func genMemMove2(name string, dst, src, length reg.GPVirtual, end LabelRef, avx bool) {
	AX, CX := GP64(), GP64()
	NOP()
	name += "_memmove_"
	Label(name + "tail")
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back.  move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ(length, length)
	JEQ(end)
	CMPQ(length, U8(2))
	JBE(LabelRef(name + "move_1or2"))
	CMPQ(length, U8(4))
	JB(LabelRef(name + "move_3"))
	JBE(LabelRef(name + "move_4"))
	CMPQ(length, U8(8))
	JB(LabelRef(name + "move_5through7"))
	JE(LabelRef(name + "move_8"))
	CMPQ(length, U8(16))
	JBE(LabelRef(name + "move_9through16"))
	CMPQ(length, U8(32))
	JBE(LabelRef(name + "move_17through32"))
	CMPQ(length, U8(64))
	JBE(LabelRef(name + "move_33through64"))
	CMPQ(length, U8(128))
	JBE(LabelRef(name + "move_65through128"))
	CMPQ(length, U32(256))
	JBE(LabelRef(name + "move_129through256"))

	if avx {
		JMP(LabelRef(name + "avxUnaligned"))
	} else {
		if false {
			// Don't check length for now.
			Label(name + "forward")
			CMPQ(length, U32(2048))
			JLS(LabelRef(name + "move_256through2048"))

			genMemMove(name+"fallback", dst, src, length, end)
		} else {
			JMP(LabelRef(name + "move_256through2048"))
		}
	}
	/*
			// If REP MOVSB isn't fast, don't use it
			// FIXME: internal∕cpu·X86+const_offsetX86HasERMS(SB)
			// CMPB(U8(1), U8(1)) // enhanced REP MOVSB/STOSB
			JMP(LabelRef(name + "fwdBy8"))

			// Check alignment
			MOVL(src.As32(), AX.As32())
			ORL(dst.As32(), AX.As32())
			TESTL(U32(7), AX.As32())
			JEQ(LabelRef(name + "fwdBy8"))

			// Do 1 byte at a time
			// MOVQ(length, CX)
			// FIXME:
			// REP;	MOVSB
			JMP(end)

		Label(name + "fwdBy8")
		// Do 8 bytes at a time
		MOVQ(length, CX)
		SHRQ(U8(3), CX)
		ANDQ(U8(7), length)
		// FIXME:
		//REP;	MOVSQ
		JMP(LabelRef(name + "tail"))

		Label(name + "back")

		//check overlap
		MOVQ(src, CX)
		ADDQ(length, CX)
		CMPQ(CX, dst)
		JLS(LabelRef(name + "forward"))

		//whole thing backwards has
		//adjusted addresses

		ADDQ(length, dst)
		ADDQ(length, src)
		STD()

		//
		//  copy
		 //
		MOVQ(length, CX)
		SHRQ(U8(3), CX)
		ANDQ(U8(7), length)

		SUBQ(U8(8), dst)
		SUBQ(U8(8), src)
		// FIXME:
		//REP;	MOVSQ

		// FIXME:
		//CLD()

		ADDQ(U8(8), dst)
		ADDQ(U8(8), src)
		SUBQ(length, dst)
		SUBQ(length, src)
		JMP(LabelRef(name + "tail"))
	*/

	Label(name + "move_1or2")
	MOVB(Mem{Base: src}, AX.As8())
	MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
	MOVB(AX.As8(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_4")
	MOVL(Mem{Base: src}, AX.As32())
	MOVL(AX.As32(), Mem{Base: dst})
	JMP(end)

	Label(name + "move_3")
	MOVW(Mem{Base: src}, AX.As16())
	MOVB(Mem{Base: src, Disp: 2}, CX.As8())
	MOVW(AX.As16(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
	JMP(end)

	Label(name + "move_5through7")
	MOVL(Mem{Base: src}, AX.As32())
	MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
	MOVL(AX.As32(), Mem{Base: dst})
	MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_8")
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ(Mem{Base: src}, AX)
	MOVQ(AX, Mem{Base: dst})
	JMP(end)

	Label(name + "move_9through16")
	MOVQ(Mem{Base: src}, AX)
	MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
	MOVQ(AX, Mem{Base: dst})
	MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_17through32")
	X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
	X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()

	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_33through64")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_65through128")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
	JMP(end)

	Label(name + "move_129through256")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Disp: 64}, X4)
	MOVOU(Mem{Base: src, Disp: 80}, X5)
	MOVOU(Mem{Base: src, Disp: 96}, X6)
	MOVOU(Mem{Base: src, Disp: 112}, X7)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X4, Mem{Base: dst, Disp: 64})
	MOVOU(X5, Mem{Base: dst, Disp: 80})
	MOVOU(X6, Mem{Base: dst, Disp: 96})
	MOVOU(X7, Mem{Base: dst, Disp: 112})
	MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
	MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
	MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
	MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
	JMP(end)

	Label(name + "move_256through2048")
	LEAQ(Mem{Base: length, Disp: -256}, length)
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Disp: 64}, X4)
	MOVOU(Mem{Base: src, Disp: 80}, X5)
	MOVOU(Mem{Base: src, Disp: 96}, X6)
	MOVOU(Mem{Base: src, Disp: 112}, X7)
	MOVOU(Mem{Base: src, Disp: 128}, X8)
	MOVOU(Mem{Base: src, Disp: 144}, X9)
	MOVOU(Mem{Base: src, Disp: 160}, X10)
	MOVOU(Mem{Base: src, Disp: 176}, X11)
	MOVOU(Mem{Base: src, Disp: 192}, X12)
	MOVOU(Mem{Base: src, Disp: 208}, X13)
	MOVOU(Mem{Base: src, Disp: 224}, X14)
	MOVOU(Mem{Base: src, Disp: 240}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X4, Mem{Base: dst, Disp: 64})
	MOVOU(X5, Mem{Base: dst, Disp: 80})
	MOVOU(X6, Mem{Base: dst, Disp: 96})
	MOVOU(X7, Mem{Base: dst, Disp: 112})
	MOVOU(X8, Mem{Base: dst, Disp: 128})
	MOVOU(X9, Mem{Base: dst, Disp: 144})
	MOVOU(X10, Mem{Base: dst, Disp: 160})
	MOVOU(X11, Mem{Base: dst, Disp: 176})
	MOVOU(X12, Mem{Base: dst, Disp: 192})
	MOVOU(X13, Mem{Base: dst, Disp: 208})
	MOVOU(X14, Mem{Base: dst, Disp: 224})
	MOVOU(X15, Mem{Base: dst, Disp: 240})
	CMPQ(length, U32(256))
	LEAQ(Mem{Base: src, Disp: 256}, src)
	LEAQ(Mem{Base: dst, Disp: 256}, dst)
	JGE(LabelRef(name + "move_256through2048"))
	JMP(LabelRef(name + "tail"))

	if avx {
		Label(name + "avxUnaligned")
		R8, R10 := GP64(), GP64()
		// There are two implementations of move algorithm.
		// The first one for non-overlapped memory regions. It uses forward copying.
		// We do not support overlapping input

		// Non-temporal copy would be better for big sizes.
		// Disabled since big copies are unlikely.
		// If enabling, test functionality.
		const enableBigData = false
		if enableBigData {
			CMPQ(length, U32(0x100000))
			JAE(LabelRef(name + "gobble_big_data_fwd"))
		}

		// Memory layout on the source side
		// src                                       CX
		// |<---------length before correction--------->|
		// |       |<--length corrected-->|             |
		// |       |                  |<--- AX  --->|
		// |<-R11->|                  |<-128 bytes->|
		// +----------------------------------------+
		// | Head  | Body             | Tail        |
		// +-------+------------------+-------------+
		// ^       ^                  ^
		// |       |                  |
		// Save head into Y4          Save tail into X5..X12
		//         |
		//         src+R11, where R11 = ((dst & -32) + 32) - dst
		// Algorithm:
		// 1. Unaligned save of the tail's 128 bytes
		// 2. Unaligned save of the head's 32  bytes
		// 3. Destination-aligned copying of body (128 bytes per iteration)
		// 4. Put head on the new place
		// 5. Put the tail on the new place
		// It can be important to satisfy processor's pipeline requirements for
		// small sizes as the cost of unaligned memory region copying is
		// comparable with the cost of main loop. So code is slightly messed there.
		// There is more clean implementation of that algorithm for bigger sizes
		// where the cost of unaligned part copying is negligible.
		// You can see it after gobble_big_data_fwd label.
		Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()

		LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
		MOVQ(dst, R10)
		// CX points to the end of buffer so we need go back slightly. We will use negative offsets there.
		MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
		MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
		MOVQ(U32(0x80), AX)

		// Align destination address
		ANDQ(U32(0xffffffe0), dst)
		ADDQ(U8(32), dst)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
		MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
		// Make R8 delta between aligned and unaligned destination addresses.
		MOVQ(dst, R8)
		SUBQ(R10, R8)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
		MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
		// Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying.
		SUBQ(R8, length)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
		MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
		// The tail will be put on its place after main body copying.
		// It's time for the unaligned heading part.
		VMOVDQU(Mem{Base: src}, Y4)
		// Adjust source address to point past head.
		ADDQ(R8, src)
		SUBQ(AX, length)

		// Aligned memory copying there
		Label(name + "gobble_128_loop")
		VMOVDQU(Mem{Base: src}, Y0)
		VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
		VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
		VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
		ADDQ(AX, src)
		VMOVDQA(Y0, Mem{Base: dst})
		VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
		VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
		VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
		ADDQ(AX, dst)
		SUBQ(AX, length)
		JA(LabelRef(name + "gobble_128_loop"))
		// Now we can store unaligned parts.
		ADDQ(AX, length)
		ADDQ(dst, length)
		VMOVDQU(Y4, Mem{Base: R10})
		VZEROUPPER()
		MOVOU(X5, Mem{Base: length, Disp: -0x80})
		MOVOU(X6, Mem{Base: length, Disp: -0x70})
		MOVOU(X7, Mem{Base: length, Disp: -0x60})
		MOVOU(X8, Mem{Base: length, Disp: -0x50})
		MOVOU(X9, Mem{Base: length, Disp: -0x40})
		MOVOU(X10, Mem{Base: length, Disp: -0x30})
		MOVOU(X11, Mem{Base: length, Disp: -0x20})
		MOVOU(X12, Mem{Base: length, Disp: -0x10})
		JMP(end)

		if enableBigData {
			Label(name + "gobble_big_data_fwd")
			// There is forward copying for big regions.
			// It uses non-temporal mov instructions.
			// Details of this algorithm are commented previously for small sizes.
			LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
			MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -0x80}, X5)
			MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
			MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
			MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
			MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
			MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
			MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
			MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
			VMOVDQU(Mem{Base: src}, Y4)
			MOVQ(dst, R8)

			ANDQ(U32(0xffffffe0), dst)
			ADDQ(U8(32), dst)

			MOVQ(dst, R10)
			SUBQ(R8, R10)
			SUBQ(R10, length)
			ADDQ(R10, src)
			LEAQ(Mem{Base: dst, Index: length, Scale: 1}, CX)
			SUBQ(U8(0x80), length)

			Label(name + "gobble_mem_fwd_loop")
			PREFETCHNTA(Mem{Base: src, Disp: 0x1c0})
			PREFETCHNTA(Mem{Base: src, Disp: 0x280})
			// Prefetch values were chosen empirically.
			// Approach for prefetch usage as in 7.6.6 of [1]
			// [1] 64-ia-32-architectures-optimization-manual.pdf
			// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
			VMOVDQU(Mem{Base: src}, Y0)
			VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
			VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
			VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)

			ADDQ(U8(0x80), src)
			VMOVNTDQ(Y0, Mem{Base: dst})
			VMOVNTDQ(Y1, Mem{Base: dst, Disp: 0x20})
			VMOVNTDQ(Y2, Mem{Base: dst, Disp: 0x20})
			VMOVNTDQ(Y3, Mem{Base: dst, Disp: 0x60})
			ADDQ(U8(0x80), dst)
			SUBQ(U8(0x80), length)
			JA(LabelRef(name + "gobble_mem_fwd_loop"))
			// NT instructions don't follow the normal cache-coherency rules.
			// We need SFENCE there to make copied data available timely.
			SFENCE()
			VMOVDQU(Y4, Mem{Base: R8})
			VZEROUPPER()
			MOVOU(X5, Mem{Base: CX, Disp: -0x80})
			MOVOU(X6, Mem{Base: CX, Disp: -0x70})
			MOVOU(X7, Mem{Base: CX, Disp: -0x60})
			MOVOU(X8, Mem{Base: CX, Disp: -0x50})
			MOVOU(X9, Mem{Base: CX, Disp: -0x40})
			MOVOU(X10, Mem{Base: CX, Disp: -0x30})
			MOVOU(X11, Mem{Base: CX, Disp: -0x20})
			MOVOU(X12, Mem{Base: CX, Disp: -0x10})
			JMP(end)
		}
	}
}

// genMatchLen generates standalone matchLen.
func genMatchLen() {
	TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
	Doc("matchLen returns how many bytes match in a and b", "",
		"It assumes that:",
		"  len(a) <= len(b)", "")
	Pragma("noescape")

	aBase, bBase, length := GP64(), GP64(), GP64()

	Load(Param("a").Base(), aBase)
	Load(Param("b").Base(), bBase)
	Load(Param("a").Len(), length)
	l := matchLen("standalone", Mem{Base: aBase}, Mem{Base: bBase}, length, LabelRef("gen_match_len_end"))
	Label("gen_match_len_end")
	Store(l, ReturnIndex(0))
	RET()
}

// matchLen returns the number of matching bytes of a and b.
// len is the maximum number of bytes to match.
// Will jump to end when done and returns the length.
// Uses 2 GP registers.
func matchLen(name string, a, b Mem, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
	tmp, matched := GP64(), GP64()
	XORQ(matched, matched)

	CMPQ(len, U8(8))
	JL(LabelRef("matchlen_single_" + name))

	Label("matchlen_loopback_" + name)
	MOVQ(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp)
	XORQ(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp)
	TESTQ(tmp, tmp)
	JZ(LabelRef("matchlen_loop_" + name))
	// Not all match.
	BSFQ(tmp, tmp)
	SARQ(U8(3), tmp)
	LEAQ(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
	JMP(end)

	// All 8 byte matched, update and loop.
	Label("matchlen_loop_" + name)
	LEAQ(Mem{Base: len, Disp: -8}, len)
	LEAQ(Mem{Base: matched, Disp: 8}, matched)
	CMPQ(len, U8(8))
	JGE(LabelRef("matchlen_loopback_" + name))

	// Less than 8 bytes left.
	Label("matchlen_single_" + name)
	TESTQ(len, len)
	JZ(end)
	Label("matchlen_single_loopback_" + name)
	MOVB(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp.As8())
	CMPB(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp.As8())
	JNE(end)
	LEAQ(Mem{Base: matched, Disp: 1}, matched)
	DECQ(len)
	JNZ(LabelRef("matchlen_single_loopback_" + name))
	JMP(end)
	return matched
}