// Source: avo/tests/fixedbugs/issue100/allocfail/asm.go
// (1588 lines, 47 KiB, Go — web-viewer chrome removed from this header)
//go:build ignore
// +build ignore
package main
import (
"fmt"
"log"
. "github.com/mmcloughlin/avo/build"
"github.com/mmcloughlin/avo/buildtags"
"github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/operand"
"github.com/mmcloughlin/avo/reg"
)
// main emits the build constraints, all encodeBlockAsm variants, and the
// standalone helper functions, then writes the generated assembly.
func main() {
	// Generated assembly is excluded on appengine/noasm and requires gc.
	for _, tag := range []string{"appengine", "noasm"} {
		Constraint(buildtags.Not(tag).ToConstraint())
	}
	Constraint(buildtags.Term("gc").ToConstraint())

	// Block encoders: SSE variants first, then the AVX ones.
	for _, cfg := range []struct {
		name      string
		tableBits int
		skipLog   int
		avx       bool
	}{
		{"encodeBlockAsm", 16, 6, false},
		{"encodeBlockAsm14B", 14, 5, false},
		{"encodeBlockAsm12B", 12, 4, false},
		{"encodeBlockAsmAvx", 16, 6, true},
		{"encodeBlockAsm14BAvx", 14, 5, true},
		{"encodeBlockAsm12BAvx", 12, 4, true},
	} {
		genEncodeBlockAsm(cfg.name, cfg.tableBits, cfg.skipLog, cfg.avx)
	}

	genEmitLiteral()
	genEmitRepeat()
	genEmitCopy()
	genMatchLen()
	Generate()
}
// debugval moves v into R15 and emits an INT 3 breakpoint trap so the
// value can be inspected in a debugger.
func debugval(v operand.Op) {
	scratch := reg.R15
	MOVQ(v, scratch)
	INT(Imm(3))
}
// genEncodeBlockAsm emits one variant of
//
//	func(dst, src []byte) int
//
// tableBits sizes the on-stack hash table, skipLog controls how fast the
// search skips ahead on misses, and avx selects the AVX memmove path in
// the inlined emitLiteral.
func genEncodeBlockAsm(name string, tableBits, skipLog int, avx bool) {
TEXT(name, 0, "func(dst, src []byte) int")
Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
Pragma("noescape")
// "var table [maxTableSize]uint32" takes up 4 * (1 << tableBits) bytes of stack space.
// Extra bytes are added to keep less used values.
// NOTE(review): allocStack only reserves 1<<tableBits BYTES for the table,
// while the comment above speaks of 4*(1<<tableBits); a 4-byte store at the
// maximum hash offset would run 3 bytes past the frame — confirm sizing.
var (
tableSize = 1 << uint(tableBits)
// Keep base stack multiple of 16.
baseStack = 0
// try to keep extraStack + baseStack multiple of 16
// for best chance of table alignment.
extraStack = 32
allocStack = baseStack + extraStack + tableSize
)
// Memzero needs at least 128 bytes.
if tableSize < 128 {
panic("tableSize must be at least 128 bytes")
}
lenSrcBasic, err := Param("src").Len().Resolve()
if err != nil {
panic(err)
}
lenSrcQ := lenSrcBasic.Addr
stack := AllocLocal(allocStack)
table := stack.Offset(allocStack - tableSize)
tmpStack := baseStack
// Bail if we can't compress to at least this.
dstLimitPtrQ := stack.Offset(tmpStack)
tmpStack += 8
// dstStartPtrQ contains the original dst pointer for returning the length
dstStartPtrQ := stack.Offset(tmpStack)
tmpStack += 8
// sLimitL is when to stop looking for offset/length copies.
sLimitL := stack.Offset(tmpStack)
tmpStack += 4
// nextEmitL keeps track of the point we have emitted to.
nextEmitL := stack.Offset(tmpStack)
tmpStack += 4
// Repeat stores the last match offset.
repeatL := stack.Offset(tmpStack)
tmpStack += 4
// nextSTempL keeps nextS while other functions are being called.
nextSTempL := stack.Offset(tmpStack)
tmpStack += 4
// Ensure we have the correct extra stack.
// Could be automatic, but whatever.
if tmpStack-baseStack != extraStack {
log.Fatal("adjust extraStack to ", tmpStack-baseStack)
}
dstBaseBasic, err := Param("dst").Base().Resolve()
if err != nil {
panic(err)
}
dstBase := dstBaseBasic.Addr
// NOTE(review): this check can never fire — the log.Fatal above already
// exits whenever tmpStack-baseStack != extraStack.
if tmpStack > extraStack+baseStack {
panic(fmt.Sprintf("tmp stack exceeded: %v", tmpStack))
}
// Zero table, 128 bytes (8 XMM stores) per iteration.
{
iReg := GP64()
MOVQ(U32(tableSize/8/16), iReg)
tablePtr := GP64()
LEAQ(table, tablePtr)
zeroXmm := XMM()
PXOR(zeroXmm, zeroXmm)
Label("zero_loop_" + name)
for i := 0; i < 8; i++ {
MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
}
ADDQ(U8(16*8), tablePtr)
DECQ(iReg)
JNZ(LabelRef("zero_loop_" + name))
// nextEmit is offset in src where the next emitLiteral should start from.
// iReg is zero when the loop above exits, so this stores 0.
MOVL(iReg.As32(), nextEmitL)
}
{
const inputMargin = 8
tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
MOVQ(lenSrcQ, tmp)
LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
// sLimitL := len(src) - inputMargin
LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)
// dstLimit := len(src) - len(src)>>5 - 5
SHRQ(U8(5), tmp)
SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp
MOVL(tmp3.As32(), sLimitL)
dstAddr := GP64()
MOVQ(dstBase, dstAddr)
// Store dst start address
MOVQ(dstAddr, dstStartPtrQ)
LEAQ(Mem{Base: dstAddr, Index: tmp2, Scale: 1}, tmp2)
MOVQ(tmp2, dstLimitPtrQ)
}
// s = 1
s := GP64().As32()
MOVL(U32(1), s)
// repeatL = 1
MOVL(s, repeatL)
src := GP64()
Load(Param("src").Base(), src)
// Load cv
Label("search_loop_" + name)
candidate := GP64().As32()
{
cv := GP64()
MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
nextS := GP64()
// nextS := s + (s-nextEmit)>>6 + 4
{
tmp := GP64()
MOVL(s, tmp.As32()) // tmp = s
SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit
SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
LEAQ(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS)
}
// if nextS > sLimit {goto emitRemainder}
{
tmp := GP64()
MOVL(sLimitL, tmp.As32())
CMPL(nextS.As32(), tmp.As32())
JGT(LabelRef("emit_remainder_" + name))
}
// move nextS to stack.
MOVL(nextS.As32(), nextSTempL)
candidate2 := GP64().As32()
hasher := hash6(tableBits)
{
hash0, hash1 := GP64(), GP64()
MOVQ(cv, hash0)
MOVQ(cv, hash1)
SHRQ(U8(8), hash1)
hasher.hash(hash0)
hasher.hash(hash1)
MOVL(table.Idx(hash0, 1), candidate)
MOVL(table.Idx(hash1, 1), candidate2)
MOVL(s, table.Idx(hash0, 1))
tmp := GP64().As32()
LEAL(Mem{Base: s, Disp: 1}, tmp)
MOVL(tmp, table.Idx(hash1, 1))
}
// Check repeat at offset checkRep
const checkRep = 1
if true {
// rep = s - repeat
rep := GP64().As32()
if true {
// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
left, right := GP64(), GP64()
MOVL(s, rep)
SUBL(repeatL, rep) // rep = s - repeat
MOVL(Mem{Base: src, Index: rep, Scale: 1, Disp: checkRep}, right.As32())
MOVQ(cv, left)
// NOTE(review): the comment above wants cv>>(checkRep*8), but this
// shifts LEFT — confirm SHRQ wasn't intended here.
SHLQ(U8(checkRep*8), left)
CMPL(left.As32(), right.As32())
// FIXME: Unable to allocate if enabled.
JNE(LabelRef("no_repeat_found_" + name))
}
// base = s + 1
base := GP64()
LEAQ(Mem{Base: s, Disp: 1}, base)
// Extend back
if true {
ne := GP64().As32()
MOVL(nextEmitL, ne)
TESTL(rep, rep)
JZ(LabelRef("repeat_extend_back_end_" + name))
// I is tested when decremented, so we loop back here.
Label("repeat_extend_back_loop_" + name)
CMPL(base.As32(), ne)
JG(LabelRef("repeat_extend_back_end_" + name))
// if src[i-1] == src[base-1]
tmp, tmp2 := GP64(), GP64()
MOVB(Mem{Base: src, Index: rep, Scale: 1, Disp: -1}, tmp.As8())
MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
CMPB(tmp.As8(), tmp2.As8())
JNE(LabelRef("repeat_extend_back_end_" + name))
LEAQ(Mem{Base: base, Disp: -1}, base)
DECL(rep)
JZ(LabelRef("repeat_extend_back_end_" + name))
JMP(LabelRef("repeat_extend_back_loop_" + name))
}
Label("repeat_extend_back_end_" + name)
// Base is now at start.
// d += emitLiteral(dst[d:], src[nextEmitL:base])
if true {
emitLiterals(nextEmitL, base, src, dstBase, "repeat_emit_"+name, avx)
}
// Extend forward
if true {
// s += 4 + checkRep
ADDL(U8(4+checkRep), s)
// candidate := s - repeat + 4 + checkRep
MOVL(s, candidate)
SUBL(repeatL, candidate) // candidate = s - repeatL
{
// srcLeft = sLimitL - s
srcLeft := GP64()
MOVL(sLimitL, srcLeft.As32())
SUBL(s, srcLeft.As32())
// Forward address
forwardStart := Mem{Base: src, Index: s, Scale: 1}
// End address
backStart := Mem{Base: src, Index: candidate, Scale: 1}
length := matchLen("repeat_extend", forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
Label("repeat_extend_forward_end_" + name)
// s+= length
ADDL(length.As32(), s)
}
}
// Emit
if true {
// length = s-base
length := GP64()
MOVL(s, length.As32())
SUBL(base.As32(), length.As32())
offsetVal := GP64()
MOVL(repeatL, offsetVal.As32())
dst := GP64()
MOVQ(dstBase, dst)
// if nextEmit > 0
tmp := GP64()
MOVL(nextEmitL, tmp.As32())
TESTL(tmp.As32(), tmp.As32())
// FIXME: fails to allocate regs if enabled:
JZ(LabelRef("repeat_as_copy_" + name))
emitRepeat("match_repeat_", length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
// JUMPS TO HERE:
Label("repeat_as_copy_" + name)
emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
Label("repeat_end_emit_" + name)
// Store new dst and nextEmit
MOVQ(dst, dstBase)
}
// if s >= sLimit
// can be omitted.
if true {
tmp := GP64()
MOVL(sLimitL, tmp.As32())
CMPL(s, tmp.As32())
JGT(LabelRef("emit_remainder_" + name))
}
JMP(LabelRef("search_loop_" + name))
}
Label("no_repeat_found_" + name)
{
// Can be moved up if registers are available.
hash2 := GP64()
{
// hash2 := hash6(cv>>16, tableBits)
hasher = hash6(tableBits)
MOVQ(cv, hash2)
SHRQ(U8(16), hash2)
hasher.hash(hash2)
}
CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
// cv >>= 8
SHRQ(U8(8), cv)
JEQ(LabelRef("candidate_match_" + name))
// candidate = int(table[hash2])
MOVL(table.Idx(hash2, 1), candidate)
// if uint32(cv>>8) == load32(src, candidate2)
CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32())
JEQ(LabelRef("candidate2_match_" + name))
// table[hash2] = uint32(s + 2)
tmp := GP64()
LEAQ(Mem{Base: s, Disp: 2}, tmp)
MOVL(tmp.As32(), table.Idx(hash2, 1))
// if uint32(cv>>16) == load32(src, candidate)
SHRQ(U8(8), cv)
CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
JEQ(LabelRef("candidate3_match_" + name))
// s = nextS
MOVL(nextSTempL, s)
JMP(LabelRef("search_loop_" + name))
// Matches candidate3
Label("candidate3_match_" + name)
ADDL(U8(2), s)
JMP(LabelRef("candidate_match_" + name))
Label("candidate2_match_" + name)
// table[hash2] = uint32(s + 2)
// NOTE(review): Disp is -2 here while the comment (and the identical
// store above) uses s+2 — confirm the sign.
tmp = GP64()
LEAQ(Mem{Base: s, Disp: -2}, tmp)
MOVL(tmp.As32(), table.Idx(hash2, 1))
// s++
INCL(s)
MOVL(candidate2, candidate)
}
}
Label("candidate_match_" + name)
// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
// Extend backwards
{
ne := GP64()
MOVL(nextEmitL, ne.As32())
TESTL(candidate, candidate)
JZ(LabelRef("match_extend_back_end_" + name))
// candidate is tested when decremented, so we loop back here.
Label("match_extend_back_loop_" + name)
CMPL(s, ne.As32())
JG(LabelRef("match_extend_back_end_" + name))
// if src[candidate-1] == src[s-1]
tmp, tmp2 := GP64(), GP64()
MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
CMPB(tmp.As8(), tmp2.As8())
JNE(LabelRef("match_extend_back_end_" + name))
LEAL(Mem{Base: s, Disp: -1}, s)
DECL(candidate)
JZ(LabelRef("match_extend_back_end_" + name))
JMP(LabelRef("match_extend_back_loop_" + name))
}
Label("match_extend_back_end_" + name)
// Bail if we exceed the maximum size.
if true {
// tmp = s-nextEmitL
tmp := GP64()
MOVL(s, tmp.As32())
SUBL(nextEmitL, tmp.As32())
LEAQ(dstBase.Idx(tmp, 1), tmp)
CMPQ(tmp, dstLimitPtrQ)
JL(LabelRef("match_dst_size_check_" + name))
// Over the limit: return 0 to signal "incompressible".
ri, err := ReturnIndex(0).Resolve()
if err != nil {
panic(err)
}
MOVQ(U32(0), ri.Addr)
RET()
}
Label("match_dst_size_check_" + name)
{
// NOTE(review): the literal run upper bound passed here is candidate,
// not s — confirm this is intended.
base := GP64()
MOVL(candidate, base.As32())
emitLiterals(nextEmitL, base, src, dstBase, "match_emit_"+name, avx)
NOP()
}
Label("match_nolit_loop_" + name)
{
base := GP64().As32()
MOVL(s, base)
// Update repeat
{
// repeat = base - candidate
repeatVal := GP64().As32()
MOVL(s, repeatVal)
SUBL(candidate, repeatVal)
MOVL(repeatVal, repeatL)
}
// s+=4, candidate+=4
ADDL(U8(4), s)
ADDL(U8(4), candidate)
// Extend the 4-byte match as long as possible and emit copy.
{
// srcLeft = sLimitL - s
srcLeft := GP64()
MOVL(sLimitL, srcLeft.As32())
SUBL(s, srcLeft.As32())
length := matchLen("match_nolit_"+name,
Mem{Base: src, Index: s, Scale: 1},
Mem{Base: src, Index: candidate, Scale: 1},
srcLeft,
LabelRef("match_nolit_end_"+name),
)
Label("match_nolit_end_" + name)
offset := GP64()
MOVL(repeatL, offset.As32())
ADDQ(U8(4), length)
dst := GP64()
MOVQ(dstBase, dst)
// s += length (length is destroyed, use it now)
ADDL(length.As32(), s)
emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
Label("match_nolit_emitcopy_end_" + name)
MOVQ(dst, dstBase)
MOVL(s, nextEmitL)
CMPL(s, sLimitL)
JGE(LabelRef("emit_remainder_" + name))
// Bail if we exceed the maximum size.
{
CMPQ(dst, dstLimitPtrQ)
JL(LabelRef("match_nolit_dst_ok_" + name))
ri, err := ReturnIndex(0).Resolve()
if err != nil {
panic(err)
}
MOVQ(U32(0), ri.Addr)
RET()
Label("match_nolit_dst_ok_" + name)
}
}
{
// Check for an immediate match, otherwise start search at s+1
x := GP64()
// Index s-2
MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, x)
hasher := hash6(tableBits)
hash0, hash1 := GP64(), GP64()
MOVQ(x, hash0) // s-2
SHRQ(U8(16), x)
MOVQ(x, hash1) // s
hasher.hash(hash0)
hasher.hash(hash1)
c0, c1 := GP64(), GP64()
MOVL(table.Idx(hash0, 1), c0.As32())
MOVL(table.Idx(hash1, 1), c1.As32())
sm2 := GP64()
LEAQ(Mem{Base: s, Disp: -2}, sm2)
MOVL(sm2.As32(), table.Idx(hash0, 1))
MOVL(s, table.Idx(hash1, 1))
// NOTE(review): this addresses src by the table *hash* value (hash1)
// rather than by a candidate offset (c1) — confirm intent.
CMPL(Mem{Base: src, Index: hash1, Scale: 1}, x.As32())
JEQ(LabelRef("match_nolit_loop_" + name))
INCL(s)
}
JMP(LabelRef("search_loop_" + name))
}
Label("emit_remainder_" + name)
// Bail if we exceed the maximum size.
// if d+len(src)-nextEmitL > dstLimitPtrQ { return 0
{
// remain = lenSrc - nextEmitL
remain := GP64()
MOVQ(lenSrcQ, remain)
SUBL(nextEmitL, remain.As32())
dst := GP64()
MOVQ(dstBase, dst)
// dst := dst + (len(src)-nextEmitL)
LEAQ(Mem{Base: dst, Index: remain, Scale: 1}, dst)
CMPQ(dst, dstLimitPtrQ)
JL(LabelRef("emit_remainder_ok_" + name))
ri, err := ReturnIndex(0).Resolve()
if err != nil {
panic(err)
}
MOVQ(U32(0), ri.Addr)
RET()
Label("emit_remainder_ok_" + name)
}
// emitLiteral(dst[d:], src[nextEmitL:])
emitEnd := GP64()
MOVQ(lenSrcQ, emitEnd)
// Emit final literals.
emitLiterals(nextEmitL, emitEnd, src, dstBase, "emit_remainder_"+name, avx)
// length := start - base (ptr arithmetic)
// NOTE(review): operand order computes dstStart - currentDst, i.e.
// start-minus-end (negative) — confirm the subtraction direction.
length := GP64()
MOVQ(dstStartPtrQ, length)
SUBQ(dstBase, length)
Store(length, ReturnIndex(0))
RET()
}
// emitLiterals copies the literal run src[nextEmitL:base] to the output,
// advances nextEmitL to base, and writes the updated dst pointer back to
// dstBase. Nothing is emitted when the run is empty (base == nextEmitL).
// src and base are left untouched.
func emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string, avx bool) {
	start, runLen, dstPtr, srcPtr := GP64().As32(), GP64(), GP64(), GP64()
	MOVL(nextEmitL, start)
	CMPL(start, base.As32())
	JEQ(LabelRef("emit_literal_skip_" + name))
	MOVL(base.As32(), runLen.As32())
	// base becomes the new nextEmit position.
	MOVL(base.As32(), nextEmitL)
	// srcPtr = &src[nextEmit]
	LEAQ(Mem{Base: src, Index: start, Scale: 1}, srcPtr)
	SUBL(start, runLen.As32()) // runLen = base - nextEmit
	// Work on a copy of the dst pointer; written back on completion.
	MOVQ(dstBase, dstPtr)
	emitLiteral(name, runLen, nil, dstPtr, srcPtr, LabelRef("emit_literal_done_"+name), avx, true)
	Label("emit_literal_done_" + name)
	// Store updated dst pointer.
	MOVQ(dstPtr, dstBase)
	Label("emit_literal_skip_" + name)
}
// hashGen carries the parameters for an emitted multiply-shift hash:
// the low 'bytes' bytes of a value are hashed into a 'tablebits'-bit
// result, with the multiplier kept in mulreg.
type hashGen struct {
bytes int
tablebits int
mulreg reg.GPVirtual
}
// hash6 returns a hashGen that hashes the lowest 6 bytes of a value
// into a 'tablebits'-bit output, loading the hash multiplier into a
// freshly allocated register.
func hash6(tablebits int) hashGen {
h := hashGen{
bytes: 6,
tablebits: tablebits,
mulreg: GP64(),
}
// Multiplier for the 6-byte multiply-shift hash.
MOVQ(Imm(227718039650203), h.mulreg)
return h
}
// hash emits code that hashes val in place: the low h.bytes bytes are
// shifted to the top of the register, multiplied, and the top
// h.tablebits bits become the result.
func (h hashGen) hash(val reg.GPVirtual) {
// Move value to top of register.
SHLQ(U8(64-8*h.bytes), val)
IMULQ(h.mulreg, val)
// Move value to bottom
SHRQ(U8(64-h.tablebits), val)
}
// genEmitLiteral generates the standalone emitLiteral and emitLiteralAvx
// functions. Both share the same body; only the name, end label, and the
// AVX flag differ.
func genEmitLiteral() {
	gen := func(fn, endLabel string, avx bool) {
		TEXT(fn, NOSPLIT, "func(dst, lit []byte) int")
		Doc(fn+" writes a literal chunk and returns the number of bytes written.", "",
			"It assumes that:",
			" dst is long enough to hold the encoded bytes",
			" 0 <= len(lit) && len(lit) <= math.MaxUint32", "")
		Pragma("noescape")
		dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64()
		Load(Param("dst").Base(), dstBase)
		Load(Param("lit").Base(), litBase)
		Load(Param("lit").Len(), litLen)
		emitLiteral("standalone", litLen, retval, dstBase, litBase, LabelRef(endLabel), avx, false)
		Label(endLabel)
		Store(retval, ReturnIndex(0))
		RET()
	}
	gen("emitLiteral", "emit_literal_end_standalone", false)
	gen("emitLiteralAvx", "emit_literal_end_avx_standalone", true)
}
// emitLiteral can be used for inlining an emitLiteral call.
// stack must have at least 32 bytes.
// retval will contain emitted bytes, but can be nil if this is not interesting.
// dstBase and litBase are updated.
// Uses 2 GP registers. With AVX 4 registers.
// If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
func emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, avx, updateDst bool) {
n := GP64()
n16 := GP64()
// We always add litLen bytes
if retval != nil {
MOVQ(litLen, retval)
}
// n = litLen - 1; the tag encodes length-1.
MOVQ(litLen, n)
SUBL(U8(1), n.As32())
// Return if AX was 0
JC(end)
// Find number of bytes to emit for tag.
CMPL(n.As32(), U8(60))
JLT(LabelRef("one_byte_" + name))
CMPL(n.As32(), U32(1<<8))
JLT(LabelRef("two_bytes_" + name))
CMPL(n.As32(), U32(1<<16))
JLT(LabelRef("three_bytes_" + name))
CMPL(n.As32(), U32(1<<24))
JLT(LabelRef("four_bytes_" + name))
// 5-byte tag: 252 == 63<<2, a 4-byte length follows.
Label("five_bytes_" + name)
MOVB(U8(252), Mem{Base: dstBase})
MOVL(n.As32(), Mem{Base: dstBase, Disp: 1})
if retval != nil {
ADDQ(U8(5), retval)
}
ADDQ(U8(5), dstBase)
JMP(LabelRef("memmove_" + name))
// 4-byte tag: 248 == 62<<2, a 3-byte length follows.
Label("four_bytes_" + name)
MOVQ(n, n16)
SHRL(U8(16), n16.As32())
MOVB(U8(248), Mem{Base: dstBase})
MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3})
if retval != nil {
ADDQ(U8(4), retval)
}
ADDQ(U8(4), dstBase)
JMP(LabelRef("memmove_" + name))
// 3-byte tag: 0xf4 == 61<<2, a 2-byte length follows.
Label("three_bytes_" + name)
MOVB(U8(0xf4), Mem{Base: dstBase})
MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
if retval != nil {
ADDQ(U8(3), retval)
}
ADDQ(U8(3), dstBase)
JMP(LabelRef("memmove_" + name))
// 2-byte tag: 0xf0 == 60<<2, a 1-byte length follows.
Label("two_bytes_" + name)
MOVB(U8(0xf0), Mem{Base: dstBase})
MOVB(n.As8(), Mem{Base: dstBase, Disp: 1})
if retval != nil {
ADDQ(U8(2), retval)
}
ADDQ(U8(2), dstBase)
JMP(LabelRef("memmove_" + name))
// 1-byte tag: length-1 < 60 stored directly in the upper 6 bits.
Label("one_byte_" + name)
SHLB(U8(2), n.As8())
MOVB(n.As8(), Mem{Base: dstBase})
if retval != nil {
ADDQ(U8(1), retval)
}
ADDQ(U8(1), dstBase)
// Fallthrough
Label("memmove_" + name)
// copy(dst[i:], lit)
if true {
dstEnd := GP64()
if updateDst {
LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
}
genMemMove2("emit_lit_memmove_"+name, dstBase, litBase, litLen, end, avx)
if updateDst {
MOVQ(dstEnd, dstBase)
}
} else {
// Dead branch kept for reference: simple SSE memmove fallback.
genMemMove("emit_lit_memmove_"+name, dstBase, litBase, litLen, end)
}
return
}
// genEmitRepeat generates a standalone emitRepeat function.
func genEmitRepeat() {
	TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
	Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
		"Length must be at least 4 and < 1<<32", "")
	Pragma("noescape")
	dst, off, repLen, written := GP64(), GP64(), GP64(), GP64()
	// written starts at zero.
	XORQ(written, written)
	Load(Param("dst").Base(), dst)
	Load(Param("offset"), off)
	Load(Param("length"), repLen)
	emitRepeat("standalone", repLen, off, written, dst, LabelRef("gen_emit_repeat_end"))
	Label("gen_emit_repeat_end")
	Store(written, ReturnIndex(0))
	RET()
}
// emitRepeat can be used for inlining an emitRepeat call.
// length >= 4 and < 1<<32
// length is modified. dstBase is updated. retval is added to input.
// retval can be nil.
// Will jump to end label when finished.
// Uses 1 GP register.
func emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
Label("emit_repeat_again_" + name)
tmp := GP64()
MOVQ(length, tmp) // Copy length
// length -= 4
LEAQ(Mem{Base: length, Disp: -4}, length)
// if length-4 <= 4 (tmp still holds the undecremented length)
CMPL(tmp.As32(), U8(8))
JLE(LabelRef("repeat_two_" + name))
// if length-4 < 8 && offset < 2048
CMPL(tmp.As32(), U8(12))
JGE(LabelRef("cant_repeat_two_offset_" + name))
CMPL(offset.As32(), U32(2048))
JLT(LabelRef("repeat_two_offset_" + name))
const maxRepeat = ((1 << 24) - 1) + 65536
Label("cant_repeat_two_offset_" + name)
CMPL(length.As32(), U32((1<<8)+4))
JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4
CMPL(length.As32(), U32((1<<16)+(1<<8)))
JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
CMPL(length.As32(), U32(maxRepeat))
JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.
// We have more than 24 bits.
// Emit so we have at least 4 bytes left.
LEAQ(Mem{Base: length, Disp: -(maxRepeat - 4)}, length) // length -= (maxRepeat - 4)
MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
MOVW(U16(65531), Mem{Base: dstBase, Disp: 2}) // 0xfffb
MOVB(U8(255), Mem{Base: dstBase, Disp: 4})
ADDQ(U8(5), dstBase)
if retval != nil {
ADDQ(U8(5), retval)
}
// Loop back to emit the remaining length.
JMP(LabelRef("emit_repeat_again_" + name))
// Must be able to be within 5 bytes.
Label("repeat_five_" + name)
LEAQ(Mem{Base: length, Disp: -65536}, length) // length -= 65536
MOVQ(length, offset)
MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
SARQ(U8(16), offset) // offset = length >> 16
MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4}) // dst[4] = length >> 16
if retval != nil {
ADDQ(U8(5), retval) // i += 5
}
ADDQ(U8(5), dstBase) // dst += 5
JMP(end)
Label("repeat_four_" + name)
LEAQ(Mem{Base: length, Disp: -256}, length) // length -= 256
MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 6<<2 | tagCopy1, dst[1] = 0
MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
if retval != nil {
ADDQ(U8(4), retval) // i += 4
}
ADDQ(U8(4), dstBase) // dst += 4
JMP(end)
Label("repeat_three_" + name)
LEAQ(Mem{Base: length, Disp: -4}, length) // length -= 4
MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 5<<2 | tagCopy1, dst[1] = 0
MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length)
if retval != nil {
ADDQ(U8(3), retval) // i += 3
}
ADDQ(U8(3), dstBase) // dst += 3
JMP(end)
Label("repeat_two_" + name)
// dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
SHLL(U8(2), length.As32())
ORL(U8(tagCopy1), length.As32())
MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = length<<2 | tagCopy1, dst[1] = 0
if retval != nil {
ADDQ(U8(2), retval) // i += 2
}
ADDQ(U8(2), dstBase) // dst += 2
JMP(end)
Label("repeat_two_offset_" + name)
// Emit the remaining copy, encoded as 2 bytes.
// dst[1] = uint8(offset)
// dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
tmp = GP64()
XORQ(tmp, tmp)
// Use scale and displacement to shift and subtract values from length.
LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length)
MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
SARL(U8(8), offset.As32()) // Remove lower
SHLL(U8(5), offset.As32()) // Shift back up
ORL(offset.As32(), length.As32()) // OR result
MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
if retval != nil {
ADDQ(U8(2), retval) // i += 2
}
ADDQ(U8(2), dstBase) // dst += 2
JMP(end)
}
// genEmitCopy generates a standalone emitCopy function.
//
// The generated emitCopy writes a copy chunk and returns the number of
// bytes written. It assumes that:
//
//	dst is long enough to hold the encoded bytes
//	1 <= offset && offset <= math.MaxUint32
//	4 <= length && length <= 1 << 24
func genEmitCopy() {
	TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int")
	Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "",
		"It assumes that:",
		" dst is long enough to hold the encoded bytes",
		" 1 <= offset && offset <= math.MaxUint32",
		" 4 <= length && length <= 1 << 24", "")
	Pragma("noescape")
	dst, off, copyLen, written := GP64(), GP64(), GP64(), GP64()
	// written = 0
	XORQ(written, written)
	Load(Param("dst").Base(), dst)
	Load(Param("offset"), off)
	Load(Param("length"), copyLen)
	emitCopy("standalone", copyLen, off, written, dst, LabelRef("gen_emit_copy_end"))
	Label("gen_emit_copy_end")
	Store(written, ReturnIndex(0))
	RET()
}
// Chunk tag values, stored in the low two bits of a chunk's first byte.
// The copy variants select the offset width used by emitCopy:
// 1 byte (tagCopy1), 2 bytes (tagCopy2) or 4 bytes (tagCopy4).
const (
tagLiteral = 0x00
tagCopy1 = 0x01
tagCopy2 = 0x02
tagCopy4 = 0x03
)
// emitCopy can be used for inlining an emitCopy call.
// length is modified (and junk). dstBase is updated. retval is added to input.
// retval can be nil.
// Will jump to end label when finished.
// Uses 2 GP registers.
//
// Long copies (offset >= 65536, or 2-byte offsets with length > 64) emit a
// maximal copy chunk first and encode the remaining length via an inlined
// emitRepeat, which jumps to end itself.
func emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
// if offset >= 65536 {
CMPL(offset.As32(), U32(65536))
JL(LabelRef("two_byte_offset_" + name))
// offset is >= 65536
// if length <= 64 goto four_bytes_remain_
CMPL(length.As32(), U8(64))
JLE(LabelRef("four_bytes_remain_" + name))
// Emit a length 64 copy, encoded as 5 bytes.
// dst[0] = 63<<2 | tagCopy4
MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase})
// dst[4] = uint8(offset >> 24)
// dst[3] = uint8(offset >> 16)
// dst[2] = uint8(offset >> 8)
// dst[1] = uint8(offset)
MOVD(offset, Mem{Base: dstBase, Disp: 1})
// length -= 64
LEAQ(Mem{Base: length, Disp: -64}, length)
if retval != nil {
ADDQ(U8(5), retval) // i+=5
}
ADDQ(U8(5), dstBase) // dst+=5
// if length >= 4 {
CMPL(length.As32(), U8(4))
JL(LabelRef("four_bytes_remain_" + name))
// Emit remaining as repeats
// return 5 + emitRepeat(dst[5:], offset, length)
// Inline call to emitRepeat. Will jump to end
emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end)
Label("four_bytes_remain_" + name)
// if length == 0 {
// return i
// }
TESTL(length.As32(), length.As32())
JZ(end)
// Emit a copy, offset encoded as 4 bytes.
// dst[i+0] = uint8(length-1)<<2 | tagCopy4
// dst[i+1] = uint8(offset)
// dst[i+2] = uint8(offset >> 8)
// dst[i+3] = uint8(offset >> 16)
// dst[i+4] = uint8(offset >> 24)
tmp := GP64()
MOVB(U8(tagCopy4), tmp.As8())
// Use displacement to subtract 1 from upshifted length.
LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length)
MOVB(length.As8(), Mem{Base: dstBase})
MOVD(offset, Mem{Base: dstBase, Disp: 1})
// return i + 5
if retval != nil {
ADDQ(U8(5), retval)
}
ADDQ(U8(5), dstBase)
JMP(end)
Label("two_byte_offset_" + name)
// Offset no more than 2 bytes.
// if length > 64 {
CMPL(length.As32(), U8(64))
JLE(LabelRef("two_byte_offset_short_" + name))
// Emit a length 60 copy, encoded as 3 bytes.
// Emit remaining as repeat value (minimum 4 bytes).
// dst[2] = uint8(offset >> 8)
// dst[1] = uint8(offset)
// dst[0] = 59<<2 | tagCopy2
MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase})
MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
// length -= 60
LEAQ(Mem{Base: length, Disp: -60}, length)
// Emit remaining as repeats, at least 4 bytes remain.
// return 3 + emitRepeat(dst[3:], offset, length)
//}
ADDQ(U8(3), dstBase)
if retval != nil {
ADDQ(U8(3), retval)
}
// Inline call to emitRepeat. Will jump to end
emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end)
Label("two_byte_offset_short_" + name)
// if length >= 12 || offset >= 2048 {
CMPL(length.As32(), U8(12))
JGE(LabelRef("emit_copy_three_" + name))
CMPL(offset.As32(), U32(2048))
JGE(LabelRef("emit_copy_three_" + name))
// Emit the remaining copy, encoded as 2 bytes.
// dst[1] = uint8(offset)
// dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
tmp = GP64()
MOVB(U8(tagCopy1), tmp.As8())
// Use scale and displacement to shift and subtract values from length.
LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length)
MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
SHRL(U8(8), offset.As32()) // Remove lower
SHLL(U8(5), offset.As32()) // Shift back up
ORL(offset.As32(), length.As32()) // OR result
MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
if retval != nil {
ADDQ(U8(2), retval) // i += 2
}
ADDQ(U8(2), dstBase) // dst += 2
// return 2
JMP(end)
Label("emit_copy_three_" + name)
// Emit the remaining copy, encoded as 3 bytes.
// dst[2] = uint8(offset >> 8)
// dst[1] = uint8(offset)
// dst[0] = uint8(length-1)<<2 | tagCopy2
tmp = GP64()
MOVB(U8(tagCopy2), tmp.As8())
LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length)
MOVB(length.As8(), Mem{Base: dstBase})
MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
// return 3
if retval != nil {
ADDQ(U8(3), retval) // i += 3
}
ADDQ(U8(3), dstBase) // dst += 3
JMP(end)
}
// func memmove(to, from unsafe.Pointer, n uintptr)
// to and from will be at the end, n will be 0.
// to and from may not overlap.
// Fairly simplistic for now, can ofc. be extended.
// Uses one GP register and 8 SSE registers.
//
// Copies in three passes: 128-byte blocks, then 16-byte blocks, then
// single bytes.
func genMemMove(name string, to, from, n reg.GPVirtual, end LabelRef) {
tmp := GP64()
MOVQ(n, tmp)
// tmp = n/128
SHRQ(U8(7), tmp)
TESTQ(tmp, tmp)
JZ(LabelRef("done_128_" + name))
Label("loop_128_" + name)
var xmmregs [8]reg.VecVirtual
// Prefetch destination for next loop.
// Prefetching source doesn't provide speedup.
// This seems to give a small boost.
const preOff = 128
PREFETCHT0(Mem{Base: to, Disp: preOff})
PREFETCHT0(Mem{Base: to, Disp: preOff + 64})
for i := 0; i < 8; i++ {
xmmregs[i] = XMM()
MOVOU(Mem{Base: from}.Offset(i*16), xmmregs[i])
}
for i := 0; i < 8; i++ {
MOVOU(xmmregs[i], Mem{Base: to}.Offset(i*16))
}
LEAQ(Mem{Base: n, Disp: -128}, n)
ADDQ(U8(8*16), from)
ADDQ(U8(8*16), to)
DECQ(tmp)
JNZ(LabelRef("loop_128_" + name))
Label("done_128_" + name)
MOVQ(n, tmp)
// tmp = n/16
SHRQ(U8(4), tmp)
TESTQ(tmp, tmp)
JZ(LabelRef("done_16_" + name))
Label("loop_16_" + name)
xmm := XMM()
MOVOU(Mem{Base: from}, xmm)
MOVOU(xmm, Mem{Base: to})
LEAQ(Mem{Base: n, Disp: -16}, n)
ADDQ(U8(16), from)
ADDQ(U8(16), to)
DECQ(tmp)
JNZ(LabelRef("loop_16_" + name))
Label("done_16_" + name)
// TODO: Use REP; MOVSB somehow.
TESTQ(n, n)
JZ(end)
Label("loop_1_" + name)
MOVB(Mem{Base: from}, tmp.As8())
MOVB(tmp.As8(), Mem{Base: to})
INCQ(from)
INCQ(to)
DECQ(n)
// NOTE(review): when the byte loop finishes, control falls through
// without a JMP(end) — presumably the end label immediately follows at
// every call site; confirm.
JNZ(LabelRef("loop_1_" + name))
}
// func memmove(to, from unsafe.Pointer, n uintptr)
// src and dst may not overlap.
// Non AVX uses 2 GP register, 16 SSE2 registers.
// AVX uses 4 GP registers 16 AVX/SSE registers.
// All passed registers may be updated.
// genMemMove2 emits an inlined memmove that copies `length` bytes from src to
// dst and jumps to `end` when finished. It dispatches on length to
// size-specialized copy stubs (1..2, 3, 4, 5..7, 8, 9..16, ..., 129..256), and
// for larger copies either uses a 256-byte SSE loop or, when avx is set, an
// AVX path that aligns the destination to 32 bytes.
//
// All labels are namespaced by appending "_memmove_" to name, so the routine
// can be instantiated several times in one function. dst, src and length are
// clobbered. The small stubs (<=256 bytes) load everything into registers
// before storing, so they are overlap-safe; move_256through2048 and the AVX
// path assume forward copy / non-overlapping regions (see comments below).
func genMemMove2(name string, dst, src, length reg.GPVirtual, end LabelRef, avx bool) {
	// AX/CX are scratch virtual GP registers (named after the physical
	// registers the original runtime memmove uses, for readability only).
	AX, CX := GP64(), GP64()
	NOP()
	name += "_memmove_"
	Label(name + "tail")
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ(length, length)
	JEQ(end)
	CMPQ(length, U8(2))
	JBE(LabelRef(name + "move_1or2"))
	CMPQ(length, U8(4))
	JB(LabelRef(name + "move_3"))
	JBE(LabelRef(name + "move_4"))
	CMPQ(length, U8(8))
	JB(LabelRef(name + "move_5through7"))
	JE(LabelRef(name + "move_8"))
	CMPQ(length, U8(16))
	JBE(LabelRef(name + "move_9through16"))
	CMPQ(length, U8(32))
	JBE(LabelRef(name + "move_17through32"))
	CMPQ(length, U8(64))
	JBE(LabelRef(name + "move_33through64"))
	CMPQ(length, U8(128))
	JBE(LabelRef(name + "move_65through128"))
	CMPQ(length, U32(256))
	JBE(LabelRef(name + "move_129through256"))
	if avx {
		JMP(LabelRef(name + "avxUnaligned"))
	} else {
		if false {
			// Don't check length for now.
			Label(name + "forward")
			CMPQ(length, U32(2048))
			JLS(LabelRef(name + "move_256through2048"))
			genMemMove(name+"fallback", dst, src, length, end)
		} else {
			JMP(LabelRef(name + "move_256through2048"))
		}
	}
	/*
		// If REP MOVSB isn't fast, don't use it
		// FIXME: internalcpu·X86+const_offsetX86HasERMS(SB)
		// CMPB(U8(1), U8(1)) // enhanced REP MOVSB/STOSB
		JMP(LabelRef(name + "fwdBy8"))
		// Check alignment
		MOVL(src.As32(), AX.As32())
		ORL(dst.As32(), AX.As32())
		TESTL(U32(7), AX.As32())
		JEQ(LabelRef(name + "fwdBy8"))
		// Do 1 byte at a time
		// MOVQ(length, CX)
		// FIXME:
		// REP; MOVSB
		JMP(end)
		Label(name + "fwdBy8")
		// Do 8 bytes at a time
		MOVQ(length, CX)
		SHRQ(U8(3), CX)
		ANDQ(U8(7), length)
		// FIXME:
		//REP; MOVSQ
		JMP(LabelRef(name + "tail"))
		Label(name + "back")
		//check overlap
		MOVQ(src, CX)
		ADDQ(length, CX)
		CMPQ(CX, dst)
		JLS(LabelRef(name + "forward"))
		//whole thing backwards has
		//adjusted addresses
		ADDQ(length, dst)
		ADDQ(length, src)
		STD()
		//
		// copy
		//
		MOVQ(length, CX)
		SHRQ(U8(3), CX)
		ANDQ(U8(7), length)
		SUBQ(U8(8), dst)
		SUBQ(U8(8), src)
		// FIXME:
		//REP; MOVSQ
		// FIXME:
		//CLD()
		ADDQ(U8(8), dst)
		ADDQ(U8(8), src)
		SUBQ(length, dst)
		SUBQ(length, src)
		JMP(LabelRef(name + "tail"))
	*/
	// The small stubs below use the "first bytes + last bytes" trick: load the
	// head and the tail (addressed as src+length-k) into registers, then store
	// both. This covers any length in the bucket and tolerates overlap.
	Label(name + "move_1or2")
	MOVB(Mem{Base: src}, AX.As8())
	MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
	MOVB(AX.As8(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
	JMP(end)
	Label(name + "move_4")
	MOVL(Mem{Base: src}, AX.As32())
	MOVL(AX.As32(), Mem{Base: dst})
	JMP(end)
	Label(name + "move_3")
	MOVW(Mem{Base: src}, AX.As16())
	MOVB(Mem{Base: src, Disp: 2}, CX.As8())
	MOVW(AX.As16(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
	JMP(end)
	Label(name + "move_5through7")
	MOVL(Mem{Base: src}, AX.As32())
	MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
	MOVL(AX.As32(), Mem{Base: dst})
	MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
	JMP(end)
	Label(name + "move_8")
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ(Mem{Base: src}, AX)
	MOVQ(AX, Mem{Base: dst})
	JMP(end)
	Label(name + "move_9through16")
	MOVQ(Mem{Base: src}, AX)
	MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
	MOVQ(AX, Mem{Base: dst})
	MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
	JMP(end)
	Label(name + "move_17through32")
	// XMM scratch registers shared by all the SSE stubs below.
	X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
	X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)
	Label(name + "move_33through64")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)
	Label(name + "move_65through128")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
	JMP(end)
	Label(name + "move_129through256")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Disp: 64}, X4)
	MOVOU(Mem{Base: src, Disp: 80}, X5)
	MOVOU(Mem{Base: src, Disp: 96}, X6)
	MOVOU(Mem{Base: src, Disp: 112}, X7)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X4, Mem{Base: dst, Disp: 64})
	MOVOU(X5, Mem{Base: dst, Disp: 80})
	MOVOU(X6, Mem{Base: dst, Disp: 96})
	MOVOU(X7, Mem{Base: dst, Disp: 112})
	MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
	MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
	MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
	MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
	JMP(end)
	// Forward copy in 256-byte chunks, then fall back to the tail dispatcher
	// for the remainder. Forward-only: assumes no (backward) overlap.
	Label(name + "move_256through2048")
	LEAQ(Mem{Base: length, Disp: -256}, length)
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Disp: 64}, X4)
	MOVOU(Mem{Base: src, Disp: 80}, X5)
	MOVOU(Mem{Base: src, Disp: 96}, X6)
	MOVOU(Mem{Base: src, Disp: 112}, X7)
	MOVOU(Mem{Base: src, Disp: 128}, X8)
	MOVOU(Mem{Base: src, Disp: 144}, X9)
	MOVOU(Mem{Base: src, Disp: 160}, X10)
	MOVOU(Mem{Base: src, Disp: 176}, X11)
	MOVOU(Mem{Base: src, Disp: 192}, X12)
	MOVOU(Mem{Base: src, Disp: 208}, X13)
	MOVOU(Mem{Base: src, Disp: 224}, X14)
	MOVOU(Mem{Base: src, Disp: 240}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X4, Mem{Base: dst, Disp: 64})
	MOVOU(X5, Mem{Base: dst, Disp: 80})
	MOVOU(X6, Mem{Base: dst, Disp: 96})
	MOVOU(X7, Mem{Base: dst, Disp: 112})
	MOVOU(X8, Mem{Base: dst, Disp: 128})
	MOVOU(X9, Mem{Base: dst, Disp: 144})
	MOVOU(X10, Mem{Base: dst, Disp: 160})
	MOVOU(X11, Mem{Base: dst, Disp: 176})
	MOVOU(X12, Mem{Base: dst, Disp: 192})
	MOVOU(X13, Mem{Base: dst, Disp: 208})
	MOVOU(X14, Mem{Base: dst, Disp: 224})
	MOVOU(X15, Mem{Base: dst, Disp: 240})
	CMPQ(length, U32(256))
	LEAQ(Mem{Base: src, Disp: 256}, src)
	LEAQ(Mem{Base: dst, Disp: 256}, dst)
	JGE(LabelRef(name + "move_256through2048"))
	JMP(LabelRef(name + "tail"))
	if avx {
		Label(name + "avxUnaligned")
		R8, R10 := GP64(), GP64()
		// There are two implementations of move algorithm.
		// The first one for non-overlapped memory regions. It uses forward copying.
		// We do not support overlapping input
		// Non-temporal copy would be better for big sizes.
		// Disabled since big copies are unlikely.
		// If enabling, test functionality.
		const enableBigData = false
		if enableBigData {
			CMPQ(length, U32(0x100000))
			JAE(LabelRef(name + "gobble_big_data_fwd"))
		}
		// Memory layout on the source side
		// src CX
		// |<---------length before correction--------->|
		// | |<--length corrected-->| |
		// | | |<--- AX --->|
		// |<-R11->| |<-128 bytes->|
		// +----------------------------------------+
		// | Head | Body | Tail |
		// +-------+------------------+-------------+
		// ^ ^ ^
		// | | |
		// Save head into Y4 Save tail into X5..X12
		// |
		// src+R11, where R11 = ((dst & -32) + 32) - dst
		// Algorithm:
		// 1. Unaligned save of the tail's 128 bytes
		// 2. Unaligned save of the head's 32 bytes
		// 3. Destination-aligned copying of body (128 bytes per iteration)
		// 4. Put head on the new place
		// 5. Put the tail on the new place
		// It can be important to satisfy processor's pipeline requirements for
		// small sizes as the cost of unaligned memory region copying is
		// comparable with the cost of main loop. So code is slightly messed there.
		// There is more clean implementation of that algorithm for bigger sizes
		// where the cost of unaligned part copying is negligible.
		// You can see it after gobble_big_data_fwd label.
		Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()
		LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
		MOVQ(dst, R10)
		// CX points to the end of buffer so we need go back slightly. We will use negative offsets there.
		MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
		MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
		MOVQ(U32(0x80), AX)
		// Align destination address
		ANDQ(U32(0xffffffe0), dst)
		ADDQ(U8(32), dst)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
		MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
		// Make R8 delta between aligned and unaligned destination addresses.
		MOVQ(dst, R8)
		SUBQ(R10, R8)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
		MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
		// Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying.
		SUBQ(R8, length)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
		MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
		// The tail will be put on its place after main body copying.
		// It's time for the unaligned heading part.
		VMOVDQU(Mem{Base: src}, Y4)
		// Adjust source address to point past head.
		ADDQ(R8, src)
		SUBQ(AX, length)
		// Aligned memory copying there
		Label(name + "gobble_128_loop")
		VMOVDQU(Mem{Base: src}, Y0)
		VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
		VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
		VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
		ADDQ(AX, src)
		VMOVDQA(Y0, Mem{Base: dst})
		VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
		VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
		VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
		ADDQ(AX, dst)
		SUBQ(AX, length)
		JA(LabelRef(name + "gobble_128_loop"))
		// Now we can store unaligned parts.
		ADDQ(AX, length)
		ADDQ(dst, length)
		VMOVDQU(Y4, Mem{Base: R10})
		VZEROUPPER()
		MOVOU(X5, Mem{Base: length, Disp: -0x80})
		MOVOU(X6, Mem{Base: length, Disp: -0x70})
		MOVOU(X7, Mem{Base: length, Disp: -0x60})
		MOVOU(X8, Mem{Base: length, Disp: -0x50})
		MOVOU(X9, Mem{Base: length, Disp: -0x40})
		MOVOU(X10, Mem{Base: length, Disp: -0x30})
		MOVOU(X11, Mem{Base: length, Disp: -0x20})
		MOVOU(X12, Mem{Base: length, Disp: -0x10})
		JMP(end)
		if enableBigData {
			Label(name + "gobble_big_data_fwd")
			// There is forward copying for big regions.
			// It uses non-temporal mov instructions.
			// Details of this algorithm are commented previously for small sizes.
			LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
			MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -0x80}, X5)
			MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
			MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
			MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
			MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
			MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
			MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
			MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
			VMOVDQU(Mem{Base: src}, Y4)
			MOVQ(dst, R8)
			ANDQ(U32(0xffffffe0), dst)
			ADDQ(U8(32), dst)
			MOVQ(dst, R10)
			SUBQ(R8, R10)
			SUBQ(R10, length)
			ADDQ(R10, src)
			LEAQ(Mem{Base: dst, Index: length, Scale: 1}, CX)
			SUBQ(U8(0x80), length)
			Label(name + "gobble_mem_fwd_loop")
			PREFETCHNTA(Mem{Base: src, Disp: 0x1c0})
			PREFETCHNTA(Mem{Base: src, Disp: 0x280})
			// Prefetch values were chosen empirically.
			// Approach for prefetch usage as in 7.6.6 of [1]
			// [1] 64-ia-32-architectures-optimization-manual.pdf
			// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
			VMOVDQU(Mem{Base: src}, Y0)
			VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
			VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
			VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
			ADDQ(U8(0x80), src)
			VMOVNTDQ(Y0, Mem{Base: dst})
			VMOVNTDQ(Y1, Mem{Base: dst, Disp: 0x20})
			// BUGFIX: Y2 was previously stored at Disp 0x20, clobbering the
			// Y1 store and leaving dst+0x40..0x5f unwritten. It must mirror
			// the load above and go to Disp 0x40.
			VMOVNTDQ(Y2, Mem{Base: dst, Disp: 0x40})
			VMOVNTDQ(Y3, Mem{Base: dst, Disp: 0x60})
			ADDQ(U8(0x80), dst)
			SUBQ(U8(0x80), length)
			JA(LabelRef(name + "gobble_mem_fwd_loop"))
			// NT instructions don't follow the normal cache-coherency rules.
			// We need SFENCE there to make copied data available timely.
			SFENCE()
			VMOVDQU(Y4, Mem{Base: R8})
			VZEROUPPER()
			MOVOU(X5, Mem{Base: CX, Disp: -0x80})
			MOVOU(X6, Mem{Base: CX, Disp: -0x70})
			MOVOU(X7, Mem{Base: CX, Disp: -0x60})
			MOVOU(X8, Mem{Base: CX, Disp: -0x50})
			MOVOU(X9, Mem{Base: CX, Disp: -0x40})
			MOVOU(X10, Mem{Base: CX, Disp: -0x30})
			MOVOU(X11, Mem{Base: CX, Disp: -0x20})
			MOVOU(X12, Mem{Base: CX, Disp: -0x10})
			JMP(end)
		}
	}
}
// genMatchLen emits the standalone assembly function matchLen(a, b []byte) int,
// wiring the shared matchLen helper to the Go ABI parameters and return value.
func genMatchLen() {
	TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
	Doc("matchLen returns how many bytes match in a and b", "",
		"It assumes that:",
		" len(a) <= len(b)", "")
	Pragma("noescape")
	// Pull the two base pointers and the (shorter) length out of the params.
	ptrA, ptrB, maxLen := GP64(), GP64(), GP64()
	Load(Param("a").Base(), ptrA)
	Load(Param("b").Base(), ptrB)
	Load(Param("a").Len(), maxLen)
	// The helper jumps here when it has computed the match length.
	done := LabelRef("gen_match_len_end")
	matched := matchLen("standalone", Mem{Base: ptrA}, Mem{Base: ptrB}, maxLen, done)
	Label("gen_match_len_end")
	Store(matched, ReturnIndex(0))
	RET()
}
// matchLen returns the number of matching bytes of a and b.
// len is the maximum number of bytes to match.
// Will jump to end when done and returns the length.
// Uses 2 GP registers.
func matchLen(name string, a, b Mem, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
	// Labels are suffixed with name so the helper can be emitted repeatedly.
	var (
		loopback       = LabelRef("matchlen_loopback_" + name)
		loopNext       = LabelRef("matchlen_loop_" + name)
		single         = LabelRef("matchlen_single_" + name)
		singleLoopback = LabelRef("matchlen_single_loopback_" + name)
	)
	diff, count := GP64(), GP64()
	XORQ(count, count)
	// Fewer than 8 bytes to compare? Go straight to the byte loop.
	CMPQ(len, U8(8))
	JL(single)
	// Compare 8 bytes per iteration by XOR-ing the two words: a zero result
	// means all 8 bytes matched.
	Label("matchlen_loopback_" + name)
	MOVQ(Mem{Base: a.Base, Index: count, Scale: 1}, diff)
	XORQ(Mem{Base: b.Base, Index: count, Scale: 1}, diff)
	TESTQ(diff, diff)
	JZ(loopNext)
	// Mismatch inside this word: the lowest set bit marks the first differing
	// byte, so BSF / 8 gives the number of extra matching bytes.
	BSFQ(diff, diff)
	SARQ(U8(3), diff)
	LEAQ(Mem{Base: count, Index: diff, Scale: 1}, count)
	JMP(end)
	// All 8 byte matched, update and loop.
	Label("matchlen_loop_" + name)
	LEAQ(Mem{Base: len, Disp: -8}, len)
	LEAQ(Mem{Base: count, Disp: 8}, count)
	CMPQ(len, U8(8))
	JGE(loopback)
	// Less than 8 bytes left.
	Label("matchlen_single_" + name)
	TESTQ(len, len)
	JZ(end)
	// Byte-at-a-time tail compare.
	Label("matchlen_single_loopback_" + name)
	MOVB(Mem{Base: a.Base, Index: count, Scale: 1}, diff.As8())
	CMPB(Mem{Base: b.Base, Index: count, Scale: 1}, diff.As8())
	JNE(end)
	LEAQ(Mem{Base: count, Disp: 1}, count)
	DECQ(len)
	JNZ(singleLoopback)
	JMP(end)
	return count
}