1587 lines
47 KiB
Go
1587 lines
47 KiB
Go
//go:build ignore
|
||
|
||
package main
|
||
|
||
import (
|
||
"fmt"
|
||
"log"
|
||
|
||
. "sources.truenas.cloud/code/avo/build"
|
||
"sources.truenas.cloud/code/avo/buildtags"
|
||
"sources.truenas.cloud/code/avo/operand"
|
||
. "sources.truenas.cloud/code/avo/operand"
|
||
"sources.truenas.cloud/code/avo/reg"
|
||
)
|
||
|
||
func main() {
|
||
Constraint(buildtags.Not("appengine").ToConstraint())
|
||
Constraint(buildtags.Not("noasm").ToConstraint())
|
||
Constraint(buildtags.Term("gc").ToConstraint())
|
||
|
||
genEncodeBlockAsm("encodeBlockAsm", 16, 6, false)
|
||
genEncodeBlockAsm("encodeBlockAsm14B", 14, 5, false)
|
||
genEncodeBlockAsm("encodeBlockAsm12B", 12, 4, false)
|
||
genEncodeBlockAsm("encodeBlockAsmAvx", 16, 6, true)
|
||
genEncodeBlockAsm("encodeBlockAsm14BAvx", 14, 5, true)
|
||
genEncodeBlockAsm("encodeBlockAsm12BAvx", 12, 4, true)
|
||
genEmitLiteral()
|
||
genEmitRepeat()
|
||
genEmitCopy()
|
||
genMatchLen()
|
||
Generate()
|
||
}
|
||
|
||
func debugval(v operand.Op) {
|
||
value := reg.R15
|
||
MOVQ(v, value)
|
||
INT(Imm(3))
|
||
}
|
||
|
||
func genEncodeBlockAsm(name string, tableBits, skipLog int, avx bool) {
|
||
TEXT(name, 0, "func(dst, src []byte) int")
|
||
Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
|
||
"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
|
||
Pragma("noescape")
|
||
|
||
// "var table [maxTableSize]uint32" takes up 4 * (1 << tableBits) bytes of stack space.
|
||
// Extra bytes are added to keep less used values.
|
||
var (
|
||
tableSize = 1 << uint(tableBits)
|
||
// Keep base stack multiple of 16.
|
||
baseStack = 0
|
||
// try to keep extraStack + baseStack multiple of 16
|
||
// for best chance of table alignment.
|
||
extraStack = 32
|
||
allocStack = baseStack + extraStack + tableSize
|
||
)
|
||
|
||
// Memzero needs at least 128 bytes.
|
||
if tableSize < 128 {
|
||
panic("tableSize must be at least 128 bytes")
|
||
}
|
||
|
||
lenSrcBasic, err := Param("src").Len().Resolve()
|
||
if err != nil {
|
||
panic(err)
|
||
}
|
||
lenSrcQ := lenSrcBasic.Addr
|
||
|
||
stack := AllocLocal(allocStack)
|
||
table := stack.Offset(allocStack - tableSize)
|
||
|
||
tmpStack := baseStack
|
||
// Bail if we can't compress to at least this.
|
||
dstLimitPtrQ := stack.Offset(tmpStack)
|
||
tmpStack += 8
|
||
// dstStartPtrQ contains the original dst pointer for returning the length
|
||
dstStartPtrQ := stack.Offset(tmpStack)
|
||
tmpStack += 8
|
||
// sLimitL is when to stop looking for offset/length copies.
|
||
sLimitL := stack.Offset(tmpStack)
|
||
tmpStack += 4
|
||
// nextEmitL keeps track of the point we have emitted to.
|
||
nextEmitL := stack.Offset(tmpStack)
|
||
tmpStack += 4
|
||
// Repeat stores the last match offset.
|
||
repeatL := stack.Offset(tmpStack)
|
||
tmpStack += 4
|
||
// nextSTempL keeps nextS while other functions are being called.
|
||
nextSTempL := stack.Offset(tmpStack)
|
||
tmpStack += 4
|
||
// Ensure we have the correct extra stack.
|
||
// Could be automatic, but whatever.
|
||
if tmpStack-baseStack != extraStack {
|
||
log.Fatal("adjust extraStack to ", tmpStack-baseStack)
|
||
}
|
||
|
||
dstBaseBasic, err := Param("dst").Base().Resolve()
|
||
if err != nil {
|
||
panic(err)
|
||
}
|
||
dstBase := dstBaseBasic.Addr
|
||
|
||
if tmpStack > extraStack+baseStack {
|
||
panic(fmt.Sprintf("tmp stack exceeded: %v", tmpStack))
|
||
}
|
||
|
||
// Zero table
|
||
{
|
||
iReg := GP64()
|
||
MOVQ(U32(tableSize/8/16), iReg)
|
||
tablePtr := GP64()
|
||
LEAQ(table, tablePtr)
|
||
zeroXmm := XMM()
|
||
PXOR(zeroXmm, zeroXmm)
|
||
|
||
Label("zero_loop_" + name)
|
||
for i := 0; i < 8; i++ {
|
||
MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
|
||
}
|
||
ADDQ(U8(16*8), tablePtr)
|
||
DECQ(iReg)
|
||
JNZ(LabelRef("zero_loop_" + name))
|
||
|
||
// nextEmit is offset n src where the next emitLiteral should start from.
|
||
MOVL(iReg.As32(), nextEmitL)
|
||
}
|
||
|
||
{
|
||
const inputMargin = 8
|
||
tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
|
||
MOVQ(lenSrcQ, tmp)
|
||
LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
|
||
// sLimitL := len(src) - inputMargin
|
||
LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)
|
||
// dstLimit := len(src) - len(src)>>5 - 5
|
||
SHRQ(U8(5), tmp)
|
||
SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp
|
||
MOVL(tmp3.As32(), sLimitL)
|
||
dstAddr := GP64()
|
||
MOVQ(dstBase, dstAddr)
|
||
// Store dst start address
|
||
MOVQ(dstAddr, dstStartPtrQ)
|
||
LEAQ(Mem{Base: dstAddr, Index: tmp2, Scale: 1}, tmp2)
|
||
MOVQ(tmp2, dstLimitPtrQ)
|
||
}
|
||
|
||
// s = 1
|
||
s := GP64().As32()
|
||
MOVL(U32(1), s)
|
||
// repeatL = 1
|
||
MOVL(s, repeatL)
|
||
|
||
src := GP64()
|
||
Load(Param("src").Base(), src)
|
||
|
||
// Load cv
|
||
Label("search_loop_" + name)
|
||
candidate := GP64().As32()
|
||
{
|
||
cv := GP64()
|
||
MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
|
||
nextS := GP64()
|
||
// nextS := s + (s-nextEmit)>>6 + 4
|
||
{
|
||
tmp := GP64()
|
||
MOVL(s, tmp.As32()) // tmp = s
|
||
SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit
|
||
SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
|
||
LEAQ(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS)
|
||
}
|
||
// if nextS > sLimit {goto emitRemainder}
|
||
{
|
||
tmp := GP64()
|
||
MOVL(sLimitL, tmp.As32())
|
||
CMPL(nextS.As32(), tmp.As32())
|
||
JGT(LabelRef("emit_remainder_" + name))
|
||
}
|
||
// move nextS to stack.
|
||
MOVL(nextS.As32(), nextSTempL)
|
||
|
||
candidate2 := GP64().As32()
|
||
hasher := hash6(tableBits)
|
||
{
|
||
hash0, hash1 := GP64(), GP64()
|
||
MOVQ(cv, hash0)
|
||
MOVQ(cv, hash1)
|
||
SHRQ(U8(8), hash1)
|
||
hasher.hash(hash0)
|
||
hasher.hash(hash1)
|
||
MOVL(table.Idx(hash0, 1), candidate)
|
||
MOVL(table.Idx(hash1, 1), candidate2)
|
||
MOVL(s, table.Idx(hash0, 1))
|
||
tmp := GP64().As32()
|
||
LEAL(Mem{Base: s, Disp: 1}, tmp)
|
||
MOVL(tmp, table.Idx(hash1, 1))
|
||
}
|
||
// Check repeat at offset checkRep
|
||
const checkRep = 1
|
||
|
||
if true {
|
||
// rep = s - repeat
|
||
rep := GP64().As32()
|
||
if true {
|
||
// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
|
||
left, right := GP64(), GP64()
|
||
MOVL(s, rep)
|
||
SUBL(repeatL, rep) // rep = s - repeat
|
||
MOVL(Mem{Base: src, Index: rep, Scale: 1, Disp: checkRep}, right.As32())
|
||
MOVQ(cv, left)
|
||
SHLQ(U8(checkRep*8), left)
|
||
CMPL(left.As32(), right.As32())
|
||
|
||
// FIXME: Unable to allocate if enabled.
|
||
JNE(LabelRef("no_repeat_found_" + name))
|
||
}
|
||
// base = s + 1
|
||
base := GP64()
|
||
LEAQ(Mem{Base: s, Disp: 1}, base)
|
||
// Extend back
|
||
if true {
|
||
ne := GP64().As32()
|
||
MOVL(nextEmitL, ne)
|
||
TESTL(rep, rep)
|
||
JZ(LabelRef("repeat_extend_back_end_" + name))
|
||
|
||
// I is tested when decremented, so we loop back here.
|
||
Label("repeat_extend_back_loop_" + name)
|
||
CMPL(base.As32(), ne)
|
||
JG(LabelRef("repeat_extend_back_end_" + name))
|
||
// if src[i-1] == src[base-1]
|
||
tmp, tmp2 := GP64(), GP64()
|
||
MOVB(Mem{Base: src, Index: rep, Scale: 1, Disp: -1}, tmp.As8())
|
||
MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
|
||
CMPB(tmp.As8(), tmp2.As8())
|
||
JNE(LabelRef("repeat_extend_back_end_" + name))
|
||
LEAQ(Mem{Base: base, Disp: -1}, base)
|
||
DECL(rep)
|
||
JZ(LabelRef("repeat_extend_back_end_" + name))
|
||
JMP(LabelRef("repeat_extend_back_loop_" + name))
|
||
}
|
||
Label("repeat_extend_back_end_" + name)
|
||
// Base is now at start.
|
||
// d += emitLiteral(dst[d:], src[nextEmitL:base])
|
||
if true {
|
||
emitLiterals(nextEmitL, base, src, dstBase, "repeat_emit_"+name, avx)
|
||
}
|
||
|
||
// Extend forward
|
||
if true {
|
||
// s += 4 + checkRep
|
||
ADDL(U8(4+checkRep), s)
|
||
|
||
// candidate := s - repeat + 4 + checkRep
|
||
MOVL(s, candidate)
|
||
SUBL(repeatL, candidate) // candidate = s - repeatL
|
||
{
|
||
// srcLeft = sLimitL - s
|
||
srcLeft := GP64()
|
||
MOVL(sLimitL, srcLeft.As32())
|
||
SUBL(s, srcLeft.As32())
|
||
|
||
// Forward address
|
||
forwardStart := Mem{Base: src, Index: s, Scale: 1}
|
||
// End address
|
||
backStart := Mem{Base: src, Index: candidate, Scale: 1}
|
||
length := matchLen("repeat_extend", forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
|
||
Label("repeat_extend_forward_end_" + name)
|
||
// s+= length
|
||
ADDL(length.As32(), s)
|
||
}
|
||
}
|
||
// Emit
|
||
if true {
|
||
// length = s-base
|
||
length := GP64()
|
||
MOVL(s, length.As32())
|
||
SUBL(base.As32(), length.As32())
|
||
|
||
offsetVal := GP64()
|
||
MOVL(repeatL, offsetVal.As32())
|
||
dst := GP64()
|
||
MOVQ(dstBase, dst)
|
||
|
||
// if nextEmit > 0
|
||
tmp := GP64()
|
||
MOVL(nextEmitL, tmp.As32())
|
||
TESTL(tmp.As32(), tmp.As32())
|
||
|
||
// FIXME: fails to allocate regs if enabled:
|
||
JZ(LabelRef("repeat_as_copy_" + name))
|
||
|
||
emitRepeat("match_repeat_", length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
|
||
|
||
// JUMPS TO HERE:
|
||
Label("repeat_as_copy_" + name)
|
||
emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
|
||
|
||
Label("repeat_end_emit_" + name)
|
||
// Store new dst and nextEmit
|
||
MOVQ(dst, dstBase)
|
||
}
|
||
// if s >= sLimit
|
||
// can be omitted.
|
||
if true {
|
||
tmp := GP64()
|
||
MOVL(sLimitL, tmp.As32())
|
||
CMPL(s, tmp.As32())
|
||
JGT(LabelRef("emit_remainder_" + name))
|
||
}
|
||
JMP(LabelRef("search_loop_" + name))
|
||
}
|
||
Label("no_repeat_found_" + name)
|
||
{
|
||
// Can be moved up if registers are available.
|
||
hash2 := GP64()
|
||
{
|
||
// hash2 := hash6(cv>>16, tableBits)
|
||
hasher = hash6(tableBits)
|
||
MOVQ(cv, hash2)
|
||
SHRQ(U8(16), hash2)
|
||
hasher.hash(hash2)
|
||
}
|
||
|
||
CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
|
||
// cv >>= 8
|
||
SHRQ(U8(8), cv)
|
||
JEQ(LabelRef("candidate_match_" + name))
|
||
|
||
// candidate = int(table[hash2])
|
||
MOVL(table.Idx(hash2, 1), candidate)
|
||
|
||
// if uint32(cv>>8) == load32(src, candidate2)
|
||
CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32())
|
||
JEQ(LabelRef("candidate2_match_" + name))
|
||
|
||
// table[hash2] = uint32(s + 2)
|
||
tmp := GP64()
|
||
LEAQ(Mem{Base: s, Disp: 2}, tmp)
|
||
MOVL(tmp.As32(), table.Idx(hash2, 1))
|
||
|
||
// if uint32(cv>>16) == load32(src, candidate)
|
||
SHRQ(U8(8), cv)
|
||
CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
|
||
JEQ(LabelRef("candidate3_match_" + name))
|
||
// s = nextS
|
||
MOVL(nextSTempL, s)
|
||
JMP(LabelRef("search_loop_" + name))
|
||
|
||
// Matches candidate3
|
||
Label("candidate3_match_" + name)
|
||
ADDL(U8(2), s)
|
||
JMP(LabelRef("candidate_match_" + name))
|
||
|
||
Label("candidate2_match_" + name)
|
||
// table[hash2] = uint32(s + 2)
|
||
tmp = GP64()
|
||
LEAQ(Mem{Base: s, Disp: -2}, tmp)
|
||
MOVL(tmp.As32(), table.Idx(hash2, 1))
|
||
// s++
|
||
INCL(s)
|
||
MOVL(candidate2, candidate)
|
||
}
|
||
}
|
||
|
||
Label("candidate_match_" + name)
|
||
// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
|
||
// Extend backwards
|
||
{
|
||
ne := GP64()
|
||
MOVL(nextEmitL, ne.As32())
|
||
TESTL(candidate, candidate)
|
||
JZ(LabelRef("match_extend_back_end_" + name))
|
||
|
||
// candidate is tested when decremented, so we loop back here.
|
||
Label("match_extend_back_loop_" + name)
|
||
CMPL(s, ne.As32())
|
||
JG(LabelRef("match_extend_back_end_" + name))
|
||
// if src[candidate-1] == src[s-1]
|
||
tmp, tmp2 := GP64(), GP64()
|
||
MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
|
||
MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
|
||
CMPB(tmp.As8(), tmp2.As8())
|
||
JNE(LabelRef("match_extend_back_end_" + name))
|
||
LEAL(Mem{Base: s, Disp: -1}, s)
|
||
DECL(candidate)
|
||
JZ(LabelRef("match_extend_back_end_" + name))
|
||
JMP(LabelRef("match_extend_back_loop_" + name))
|
||
}
|
||
Label("match_extend_back_end_" + name)
|
||
|
||
// Bail if we exceed the maximum size.
|
||
if true {
|
||
// tmp = s-nextEmitL
|
||
tmp := GP64()
|
||
MOVL(s, tmp.As32())
|
||
SUBL(nextEmitL, tmp.As32())
|
||
LEAQ(dstBase.Idx(tmp, 1), tmp)
|
||
CMPQ(tmp, dstLimitPtrQ)
|
||
JL(LabelRef("match_dst_size_check_" + name))
|
||
ri, err := ReturnIndex(0).Resolve()
|
||
if err != nil {
|
||
panic(err)
|
||
}
|
||
MOVQ(U32(0), ri.Addr)
|
||
RET()
|
||
}
|
||
Label("match_dst_size_check_" + name)
|
||
{
|
||
base := GP64()
|
||
MOVL(candidate, base.As32())
|
||
emitLiterals(nextEmitL, base, src, dstBase, "match_emit_"+name, avx)
|
||
NOP()
|
||
}
|
||
|
||
Label("match_nolit_loop_" + name)
|
||
{
|
||
base := GP64().As32()
|
||
MOVL(s, base)
|
||
// Update repeat
|
||
{
|
||
// repeat = base - candidate
|
||
repeatVal := GP64().As32()
|
||
MOVL(s, repeatVal)
|
||
SUBL(candidate, repeatVal)
|
||
MOVL(repeatVal, repeatL)
|
||
}
|
||
// s+=4, candidate+=4
|
||
ADDL(U8(4), s)
|
||
ADDL(U8(4), candidate)
|
||
// Extend the 4-byte match as long as possible and emit copy.
|
||
{
|
||
// srcLeft = sLimitL - s
|
||
srcLeft := GP64()
|
||
MOVL(sLimitL, srcLeft.As32())
|
||
SUBL(s, srcLeft.As32())
|
||
length := matchLen("match_nolit_"+name,
|
||
Mem{Base: src, Index: s, Scale: 1},
|
||
Mem{Base: src, Index: candidate, Scale: 1},
|
||
srcLeft,
|
||
LabelRef("match_nolit_end_"+name),
|
||
)
|
||
Label("match_nolit_end_" + name)
|
||
offset := GP64()
|
||
MOVL(repeatL, offset.As32())
|
||
ADDQ(U8(4), length)
|
||
dst := GP64()
|
||
MOVQ(dstBase, dst)
|
||
// s += length (lenght is destroyed, use it now)
|
||
ADDL(length.As32(), s)
|
||
emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
|
||
Label("match_nolit_emitcopy_end_" + name)
|
||
MOVQ(dst, dstBase)
|
||
MOVL(s, nextEmitL)
|
||
CMPL(s, sLimitL)
|
||
JGE(LabelRef("emit_remainder_" + name))
|
||
|
||
// Bail if we exceed the maximum size.
|
||
{
|
||
CMPQ(dst, dstLimitPtrQ)
|
||
JL(LabelRef("match_nolit_dst_ok_" + name))
|
||
ri, err := ReturnIndex(0).Resolve()
|
||
if err != nil {
|
||
panic(err)
|
||
}
|
||
MOVQ(U32(0), ri.Addr)
|
||
RET()
|
||
Label("match_nolit_dst_ok_" + name)
|
||
}
|
||
}
|
||
{
|
||
// Check for an immediate match, otherwise start search at s+1
|
||
x := GP64()
|
||
// Index s-2
|
||
MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, x)
|
||
hasher := hash6(tableBits)
|
||
hash0, hash1 := GP64(), GP64()
|
||
MOVQ(x, hash0) // s-2
|
||
SHRQ(U8(16), x)
|
||
MOVQ(x, hash1) // s
|
||
hasher.hash(hash0)
|
||
hasher.hash(hash1)
|
||
c0, c1 := GP64(), GP64()
|
||
MOVL(table.Idx(hash0, 1), c0.As32())
|
||
MOVL(table.Idx(hash1, 1), c1.As32())
|
||
sm2 := GP64()
|
||
LEAQ(Mem{Base: s, Disp: -2}, sm2)
|
||
MOVL(sm2.As32(), table.Idx(hash0, 1))
|
||
MOVL(s, table.Idx(hash1, 1))
|
||
CMPL(Mem{Base: src, Index: hash1, Scale: 1}, x.As32())
|
||
JEQ(LabelRef("match_nolit_loop_" + name))
|
||
INCL(s)
|
||
}
|
||
JMP(LabelRef("search_loop_" + name))
|
||
}
|
||
|
||
Label("emit_remainder_" + name)
|
||
// Bail if we exceed the maximum size.
|
||
// if d+len(src)-nextEmitL > dstLimitPtrQ { return 0
|
||
{
|
||
// remain = lenSrc - nextEmitL
|
||
remain := GP64()
|
||
MOVQ(lenSrcQ, remain)
|
||
SUBL(nextEmitL, remain.As32())
|
||
dst := GP64()
|
||
MOVQ(dstBase, dst)
|
||
// dst := dst + (len(src)-nextEmitL)
|
||
LEAQ(Mem{Base: dst, Index: remain, Scale: 1}, dst)
|
||
CMPQ(dst, dstLimitPtrQ)
|
||
JL(LabelRef("emit_remainder_ok_" + name))
|
||
ri, err := ReturnIndex(0).Resolve()
|
||
if err != nil {
|
||
panic(err)
|
||
}
|
||
MOVQ(U32(0), ri.Addr)
|
||
RET()
|
||
Label("emit_remainder_ok_" + name)
|
||
}
|
||
// emitLiteral(dst[d:], src[nextEmitL:])
|
||
emitEnd := GP64()
|
||
MOVQ(lenSrcQ, emitEnd)
|
||
|
||
// Emit final literals.
|
||
emitLiterals(nextEmitL, emitEnd, src, dstBase, "emit_remainder_"+name, avx)
|
||
|
||
// length := start - base (ptr arithmetic)
|
||
length := GP64()
|
||
MOVQ(dstStartPtrQ, length)
|
||
SUBQ(dstBase, length)
|
||
|
||
Store(length, ReturnIndex(0))
|
||
RET()
|
||
}
|
||
|
||
// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase.
// Checks if base == nextemit and skips the emission entirely in that case.
// src & base are untouched; dstBase (a stack/param slot) is advanced past the
// emitted bytes, and nextEmitL is set to base.
// 'name' namespaces the labels so the helper can be inlined multiple times.
func emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string, avx bool) {
	nextEmit, litLen, dstBaseTmp, litBase := GP64().As32(), GP64(), GP64(), GP64()
	// Nothing to do when base == nextEmit.
	MOVL(nextEmitL, nextEmit)
	CMPL(nextEmit, base.As32())
	JEQ(LabelRef("emit_literal_skip_" + name))
	MOVL(base.As32(), litLen.As32())

	// Base is now next emit.
	MOVL(base.As32(), nextEmitL)

	// litBase = src[nextEmitL:]
	LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
	SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit

	// Load (and store when we return)
	MOVQ(dstBase, dstBaseTmp)
	// updateDst=true so dstBaseTmp points past the copied literals afterwards.
	emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), avx, true)
	Label("emit_literal_done_" + name)
	// Store updated dstBase
	MOVQ(dstBaseTmp, dstBase)
	Label("emit_literal_skip_" + name)
}
|
||
|
||
// hashGen carries the parameters for emitting multiplicative hash
// computations: how many low bytes of the value are hashed, how many
// bits the result is reduced to, and the register holding the multiplier.
type hashGen struct {
	bytes     int // number of low bytes of the input value that participate in the hash
	tablebits int // log2 of the table size; hash results are reduced to this many bits
	mulreg    reg.GPVirtual // register pre-loaded with the multiplication constant
}
|
||
|
||
// hash uses multiply to get a 'output' hash on the hash of the lowest 'bytes' bytes in value.
|
||
func hash6(tablebits int) hashGen {
|
||
h := hashGen{
|
||
bytes: 6,
|
||
tablebits: tablebits,
|
||
mulreg: GP64(),
|
||
}
|
||
MOVQ(Imm(227718039650203), h.mulreg)
|
||
return h
|
||
}
|
||
|
||
// hash uses multiply to get hash of the value.
// The hash is computed in place: val is shifted so only its low h.bytes
// bytes remain significant, multiplied by the constant in h.mulreg, and the
// top h.tablebits bits are shifted down to form the table index.
// val is destroyed.
func (h hashGen) hash(val reg.GPVirtual) {
	// Move value to top of register.
	SHLQ(U8(64-8*h.bytes), val)
	IMULQ(h.mulreg, val)
	// Move value to bottom
	SHRQ(U8(64-h.tablebits), val)
}
|
||
|
||
func genEmitLiteral() {
|
||
TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int")
|
||
Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "",
|
||
"It assumes that:",
|
||
" dst is long enough to hold the encoded bytes",
|
||
" 0 <= len(lit) && len(lit) <= math.MaxUint32", "")
|
||
Pragma("noescape")
|
||
|
||
dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64()
|
||
Load(Param("dst").Base(), dstBase)
|
||
Load(Param("lit").Base(), litBase)
|
||
Load(Param("lit").Len(), litLen)
|
||
emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false, false)
|
||
Label("emit_literal_end_standalone")
|
||
Store(retval, ReturnIndex(0))
|
||
RET()
|
||
|
||
TEXT("emitLiteralAvx", NOSPLIT, "func(dst, lit []byte) int")
|
||
Doc("emitLiteralAvx writes a literal chunk and returns the number of bytes written.", "",
|
||
"It assumes that:",
|
||
" dst is long enough to hold the encoded bytes",
|
||
" 0 <= len(lit) && len(lit) <= math.MaxUint32", "")
|
||
Pragma("noescape")
|
||
|
||
dstBase, litBase, litLen, retval = GP64(), GP64(), GP64(), GP64()
|
||
Load(Param("dst").Base(), dstBase)
|
||
Load(Param("lit").Base(), litBase)
|
||
Load(Param("lit").Len(), litLen)
|
||
emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_avx_standalone", true, false)
|
||
Label("emit_literal_end_avx_standalone")
|
||
Store(retval, ReturnIndex(0))
|
||
RET()
|
||
}
|
||
|
||
// emitLiteral can be used for inlining an emitLiteral call.
// stack must have at least 32 bytes.
// retval will contain emitted bytes, but can be nil if this is not interesting.
// dstBase and litBase are updated.
// Uses 2 GP registers. With AVX 4 registers.
// If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
//
// The emitted code writes the literal tag (1-5 bytes depending on the
// length) followed by a memmove of the literal bytes, then jumps to 'end'.
// Jumps directly to 'end' when litLen is zero.
func emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, avx, updateDst bool) {
	n := GP64()
	n16 := GP64()

	// We always add litLen bytes
	if retval != nil {
		MOVQ(litLen, retval)
	}
	MOVQ(litLen, n)

	// n = litLen - 1; the tag encodes length-1.
	SUBL(U8(1), n.As32())
	// Return if AX was 0
	JC(end)

	// Find number of bytes to emit for tag.
	CMPL(n.As32(), U8(60))
	JLT(LabelRef("one_byte_" + name))
	CMPL(n.As32(), U32(1<<8))
	JLT(LabelRef("two_bytes_" + name))
	CMPL(n.As32(), U32(1<<16))
	JLT(LabelRef("three_bytes_" + name))
	CMPL(n.As32(), U32(1<<24))
	JLT(LabelRef("four_bytes_" + name))

	// Fallthrough: length needs a 4-byte (32-bit) length field.
	Label("five_bytes_" + name)
	MOVB(U8(252), Mem{Base: dstBase})
	MOVL(n.As32(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(5), retval)
	}
	ADDQ(U8(5), dstBase)
	JMP(LabelRef("memmove_" + name))

	Label("four_bytes_" + name)
	// Tag byte 248 followed by the 24-bit length (low word + high byte).
	MOVQ(n, n16)
	SHRL(U8(16), n16.As32())
	MOVB(U8(248), Mem{Base: dstBase})
	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
	MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3})
	if retval != nil {
		ADDQ(U8(4), retval)
	}
	ADDQ(U8(4), dstBase)
	JMP(LabelRef("memmove_" + name))

	Label("three_bytes_" + name)
	// Tag byte 0xf4 followed by the 16-bit length.
	MOVB(U8(0xf4), Mem{Base: dstBase})
	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(3), retval)
	}
	ADDQ(U8(3), dstBase)
	JMP(LabelRef("memmove_" + name))

	Label("two_bytes_" + name)
	// Tag byte 0xf0 followed by the 8-bit length.
	MOVB(U8(0xf0), Mem{Base: dstBase})
	MOVB(n.As8(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(2), retval)
	}
	ADDQ(U8(2), dstBase)
	JMP(LabelRef("memmove_" + name))

	Label("one_byte_" + name)
	// Length fits the tag byte itself: (length-1)<<2.
	SHLB(U8(2), n.As8())
	MOVB(n.As8(), Mem{Base: dstBase})
	if retval != nil {
		ADDQ(U8(1), retval)
	}
	ADDQ(U8(1), dstBase)
	// Fallthrough

	Label("memmove_" + name)

	// copy(dst[i:], lit)
	if true {
		dstEnd := GP64()
		if updateDst {
			// Compute the post-copy dst pointer before the move clobbers inputs.
			LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
		}
		genMemMove2("emit_lit_memmove_"+name, dstBase, litBase, litLen, end, avx)
		if updateDst {
			MOVQ(dstEnd, dstBase)
		}
	} else {
		genMemMove("emit_lit_memmove_"+name, dstBase, litBase, litLen, end)
	}
	return
}
|
||
|
||
// genEmitRepeat generates a standalone emitRepeat.
||
func genEmitRepeat() {
|
||
TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
|
||
Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
|
||
"Length must be at least 4 and < 1<<32", "")
|
||
Pragma("noescape")
|
||
|
||
dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
|
||
|
||
// retval = 0
|
||
XORQ(retval, retval)
|
||
|
||
Load(Param("dst").Base(), dstBase)
|
||
Load(Param("offset"), offset)
|
||
Load(Param("length"), length)
|
||
emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end"))
|
||
Label("gen_emit_repeat_end")
|
||
Store(retval, ReturnIndex(0))
|
||
RET()
|
||
}
|
||
|
||
// emitRepeat can be used for inlining an emitRepeat call.
// length >= 4 and < 1<<32
// length is modified. dstBase is updated. retval is added to input.
// retval can be nil.
// Will jump to end label when finished.
// Uses 1 GP register.
//
// Encodings (shortest first): 2-byte repeat, 2-byte repeat-with-offset,
// 3/4/5-byte repeats; lengths above maxRepeat loop, emitting maximal
// chunks until the remainder fits.
func emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
	Label("emit_repeat_again_" + name)
	tmp := GP64()
	MOVQ(length, tmp) // Copy length
	// length -= 4
	LEAQ(Mem{Base: length, Disp: -4}, length)

	// if length <= 4 (use copied value)
	CMPL(tmp.As32(), U8(8))
	JLE(LabelRef("repeat_two_" + name))

	// length < 8 && offset < 2048
	CMPL(tmp.As32(), U8(12))
	JGE(LabelRef("cant_repeat_two_offset_" + name))
	CMPL(offset.As32(), U32(2048))
	JLT(LabelRef("repeat_two_offset_" + name))

	const maxRepeat = ((1 << 24) - 1) + 65536
	Label("cant_repeat_two_offset_" + name)
	CMPL(length.As32(), U32((1<<8)+4))
	JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4
	CMPL(length.As32(), U32((1<<16)+(1<<8)))
	JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
	CMPL(length.As32(), U32(maxRepeat))
	JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.

	// We have have more than 24 bits
	// Emit so we have at least 4 bytes left.
	LEAQ(Mem{Base: length, Disp: -(maxRepeat - 4)}, length) // length -= (maxRepeat - 4)
	MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})            // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
	MOVW(U16(65531), Mem{Base: dstBase, Disp: 2})           // 0xfffb
	MOVB(U8(255), Mem{Base: dstBase, Disp: 4})
	ADDQ(U8(5), dstBase)
	if retval != nil {
		ADDQ(U8(5), retval)
	}
	// Loop to emit the (now <= maxRepeat) remainder.
	JMP(LabelRef("emit_repeat_again_" + name))

	// Must be able to be within 5 bytes.
	Label("repeat_five_" + name)
	LEAQ(Mem{Base: length, Disp: -65536}, length) // length -= 65536
	MOVQ(length, offset)
	MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
	MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
	SARQ(U8(16), offset)                             // offset = length >> 16
	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4})  // dst[4] = length >> 16
	if retval != nil {
		ADDQ(U8(5), retval) // i += 5
	}
	ADDQ(U8(5), dstBase) // dst += 5
	JMP(end)

	Label("repeat_four_" + name)
	LEAQ(Mem{Base: length, Disp: -256}, length)      // length -= 256
	MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 6<<2 | tagCopy1, dst[1] = 0
	MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
	if retval != nil {
		ADDQ(U8(4), retval) // i += 4
	}
	ADDQ(U8(4), dstBase) // dst += 4
	JMP(end)

	Label("repeat_three_" + name)
	LEAQ(Mem{Base: length, Disp: -4}, length)       // length -= 4
	MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase})    // dst[0] = 5<<2 | tagCopy1, dst[1] = 0
	MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length)
	if retval != nil {
		ADDQ(U8(3), retval) // i += 3
	}
	ADDQ(U8(3), dstBase) // dst += 3
	JMP(end)

	Label("repeat_two_" + name)
	// dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
	SHLL(U8(2), length.As32())
	ORL(U8(tagCopy1), length.As32())
	MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
	if retval != nil {
		ADDQ(U8(2), retval) // i += 2
	}
	ADDQ(U8(2), dstBase) // dst += 2
	JMP(end)

	Label("repeat_two_offset_" + name)
	// Emit the remaining copy, encoded as 2 bytes.
	// dst[1] = uint8(offset)
	// dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
	tmp = GP64()
	XORQ(tmp, tmp)
	// Use scale and displacement to shift and subtract values from length.
	LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length)
	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
	SARL(U8(8), offset.As32())                      // Remove lower
	SHLL(U8(5), offset.As32())                      // Shift back up
	ORL(offset.As32(), length.As32())               // OR result
	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
	if retval != nil {
		ADDQ(U8(2), retval) // i += 2
	}
	ADDQ(U8(2), dstBase) // dst += 2

	JMP(end)
}
|
||
|
||
// emitCopy writes a copy chunk and returns the number of bytes written.
|
||
//
|
||
// It assumes that:
|
||
// dst is long enough to hold the encoded bytes
|
||
// 1 <= offset && offset <= math.MaxUint32
|
||
// 4 <= length && length <= 1 << 24
|
||
|
||
// genEmitCopy generates a standalone emitCopy
||
func genEmitCopy() {
|
||
TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int")
|
||
Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "",
|
||
"It assumes that:",
|
||
" dst is long enough to hold the encoded bytes",
|
||
" 1 <= offset && offset <= math.MaxUint32",
|
||
" 4 <= length && length <= 1 << 24", "")
|
||
Pragma("noescape")
|
||
|
||
dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
|
||
|
||
// i := 0
|
||
XORQ(retval, retval)
|
||
|
||
Load(Param("dst").Base(), dstBase)
|
||
Load(Param("offset"), offset)
|
||
Load(Param("length"), length)
|
||
emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end"))
|
||
Label("gen_emit_copy_end")
|
||
Store(retval, ReturnIndex(0))
|
||
RET()
|
||
}
|
||
|
||
// Chunk tag values stored in the low two bits of the first byte of each
// emitted element (literal, 1/2/4-byte-offset copy).
const (
	tagLiteral = 0x00
	tagCopy1   = 0x01
	tagCopy2   = 0x02
	tagCopy4   = 0x03
)
|
||
|
||
// emitCopy can be used for inlining an emitCopy call.
// length is modified (and junk). dstBase is updated. retval is added to input.
// retval can be nil.
// Will jump to end label when finished.
// Uses 2 GP registers.
//
// Encoding selection: offsets >= 65536 use tagCopy4 (with long lengths
// split into a 64-byte copy plus repeats); smaller offsets use tagCopy1/2,
// again falling back to inline emitRepeat for the overflow.
func emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
	// if offset >= 65536 {
	CMPL(offset.As32(), U32(65536))
	JL(LabelRef("two_byte_offset_" + name))

	// offset is >= 65536
	// if length <= 64 goto four_bytes_remain_
	CMPL(length.As32(), U8(64))
	JLE(LabelRef("four_bytes_remain_" + name))

	// Emit a length 64 copy, encoded as 5 bytes.
	// dst[0] = 63<<2 | tagCopy4
	MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase})
	// dst[4] = uint8(offset >> 24)
	// dst[3] = uint8(offset >> 16)
	// dst[2] = uint8(offset >> 8)
	// dst[1] = uint8(offset)
	MOVD(offset, Mem{Base: dstBase, Disp: 1})
	// length -= 64
	LEAQ(Mem{Base: length, Disp: -64}, length)
	if retval != nil {
		ADDQ(U8(5), retval) // i+=5
	}
	ADDQ(U8(5), dstBase) // dst+=5

	// if length >= 4 {
	CMPL(length.As32(), U8(4))
	JL(LabelRef("four_bytes_remain_" + name))

	// Emit remaining as repeats
	// return 5 + emitRepeat(dst[5:], offset, length)
	// Inline call to emitRepeat. Will jump to end
	emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end)

	Label("four_bytes_remain_" + name)
	// if length == 0 {
	//	return i
	// }
	TESTL(length.As32(), length.As32())
	JZ(end)

	// Emit a copy, offset encoded as 4 bytes.
	// dst[i+0] = uint8(length-1)<<2 | tagCopy4
	// dst[i+1] = uint8(offset)
	// dst[i+2] = uint8(offset >> 8)
	// dst[i+3] = uint8(offset >> 16)
	// dst[i+4] = uint8(offset >> 24)
	tmp := GP64()
	MOVB(U8(tagCopy4), tmp.As8())
	// Use displacement to subtract 1 from upshifted length.
	LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length)
	MOVB(length.As8(), Mem{Base: dstBase})
	MOVD(offset, Mem{Base: dstBase, Disp: 1})
	// return i + 5
	if retval != nil {
		ADDQ(U8(5), retval)
	}
	ADDQ(U8(5), dstBase)
	JMP(end)

	Label("two_byte_offset_" + name)
	// Offset no more than 2 bytes.

	// if length > 64 {
	CMPL(length.As32(), U8(64))
	JLE(LabelRef("two_byte_offset_short_" + name))
	// Emit a length 60 copy, encoded as 3 bytes.
	// Emit remaining as repeat value (minimum 4 bytes).
	// dst[2] = uint8(offset >> 8)
	// dst[1] = uint8(offset)
	// dst[0] = 59<<2 | tagCopy2
	MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase})
	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
	// length -= 60
	LEAQ(Mem{Base: length, Disp: -60}, length)

	// Emit remaining as repeats, at least 4 bytes remain.
	// return 3 + emitRepeat(dst[3:], offset, length)
	//}
	ADDQ(U8(3), dstBase)
	if retval != nil {
		ADDQ(U8(3), retval)
	}
	// Inline call to emitRepeat. Will jump to end
	emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end)

	Label("two_byte_offset_short_" + name)
	// if length >= 12 || offset >= 2048 {
	CMPL(length.As32(), U8(12))
	JGE(LabelRef("emit_copy_three_" + name))
	CMPL(offset.As32(), U32(2048))
	JGE(LabelRef("emit_copy_three_" + name))

	// Emit the remaining copy, encoded as 2 bytes.
	// dst[1] = uint8(offset)
	// dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
	tmp = GP64()
	MOVB(U8(tagCopy1), tmp.As8())
	// Use scale and displacement to shift and subtract values from length.
	LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length)
	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
	SHRL(U8(8), offset.As32())                      // Remove lower
	SHLL(U8(5), offset.As32())                      // Shift back up
	ORL(offset.As32(), length.As32())               // OR result
	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
	if retval != nil {
		ADDQ(U8(2), retval) // i += 2
	}
	ADDQ(U8(2), dstBase) // dst += 2
	// return 2
	JMP(end)

	Label("emit_copy_three_" + name)
	// // Emit the remaining copy, encoded as 3 bytes.
	// dst[2] = uint8(offset >> 8)
	// dst[1] = uint8(offset)
	// dst[0] = uint8(length-1)<<2 | tagCopy2
	tmp = GP64()
	MOVB(U8(tagCopy2), tmp.As8())
	LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length)
	MOVB(length.As8(), Mem{Base: dstBase})
	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
	// return 3
	if retval != nil {
		ADDQ(U8(3), retval) // i += 3
	}
	ADDQ(U8(3), dstBase) // dst += 3
	JMP(end)
}
|
||
|
||
// func memmove(to, from unsafe.Pointer, n uintptr)
|
||
// to and from will be at the end, n will be 0.
|
||
// to and from may not overlap.
|
||
// Fairly simplistic for now, can ofc. be extended.
|
||
// Uses one GP register and 8 SSE registers.
|
||
func genMemMove(name string, to, from, n reg.GPVirtual, end LabelRef) {
|
||
tmp := GP64()
|
||
MOVQ(n, tmp)
|
||
// tmp = n/128
|
||
SHRQ(U8(7), tmp)
|
||
|
||
TESTQ(tmp, tmp)
|
||
JZ(LabelRef("done_128_" + name))
|
||
Label("loop_128_" + name)
|
||
var xmmregs [8]reg.VecVirtual
|
||
|
||
// Prefetch destination for next loop.
|
||
// Prefetching source doesn't provide speedup.
|
||
// This seems to give a small boost.
|
||
const preOff = 128
|
||
PREFETCHT0(Mem{Base: to, Disp: preOff})
|
||
PREFETCHT0(Mem{Base: to, Disp: preOff + 64})
|
||
|
||
for i := 0; i < 8; i++ {
|
||
xmmregs[i] = XMM()
|
||
MOVOU(Mem{Base: from}.Offset(i*16), xmmregs[i])
|
||
}
|
||
for i := 0; i < 8; i++ {
|
||
MOVOU(xmmregs[i], Mem{Base: to}.Offset(i*16))
|
||
}
|
||
LEAQ(Mem{Base: n, Disp: -128}, n)
|
||
ADDQ(U8(8*16), from)
|
||
ADDQ(U8(8*16), to)
|
||
DECQ(tmp)
|
||
JNZ(LabelRef("loop_128_" + name))
|
||
|
||
Label("done_128_" + name)
|
||
MOVQ(n, tmp)
|
||
// tmp = n/16
|
||
SHRQ(U8(4), tmp)
|
||
TESTQ(tmp, tmp)
|
||
JZ(LabelRef("done_16_" + name))
|
||
|
||
Label("loop_16_" + name)
|
||
xmm := XMM()
|
||
MOVOU(Mem{Base: from}, xmm)
|
||
MOVOU(xmm, Mem{Base: to})
|
||
LEAQ(Mem{Base: n, Disp: -16}, n)
|
||
ADDQ(U8(16), from)
|
||
ADDQ(U8(16), to)
|
||
DECQ(tmp)
|
||
JNZ(LabelRef("loop_16_" + name))
|
||
Label("done_16_" + name)
|
||
|
||
// TODO: Use REP; MOVSB somehow.
|
||
TESTQ(n, n)
|
||
JZ(end)
|
||
Label("loop_1_" + name)
|
||
MOVB(Mem{Base: from}, tmp.As8())
|
||
MOVB(tmp.As8(), Mem{Base: to})
|
||
INCQ(from)
|
||
INCQ(to)
|
||
DECQ(n)
|
||
JNZ(LabelRef("loop_1_" + name))
|
||
}
|
||
|
||
// func memmove(to, from unsafe.Pointer, n uintptr)
|
||
// src and dst may not overlap.
|
||
// Non AVX uses 2 GP register, 16 SSE2 registers.
|
||
// AVX uses 4 GP registers 16 AVX/SSE registers.
|
||
// All passed registers may be updated.
|
||
func genMemMove2(name string, dst, src, length reg.GPVirtual, end LabelRef, avx bool) {
|
||
AX, CX := GP64(), GP64()
|
||
NOP()
|
||
name += "_memmove_"
|
||
Label(name + "tail")
|
||
// move_129through256 or smaller work whether or not the source and the
|
||
// destination memory regions overlap because they load all data into
|
||
// registers before writing it back. move_256through2048 on the other
|
||
// hand can be used only when the memory regions don't overlap or the copy
|
||
// direction is forward.
|
||
//
|
||
// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
|
||
TESTQ(length, length)
|
||
JEQ(end)
|
||
CMPQ(length, U8(2))
|
||
JBE(LabelRef(name + "move_1or2"))
|
||
CMPQ(length, U8(4))
|
||
JB(LabelRef(name + "move_3"))
|
||
JBE(LabelRef(name + "move_4"))
|
||
CMPQ(length, U8(8))
|
||
JB(LabelRef(name + "move_5through7"))
|
||
JE(LabelRef(name + "move_8"))
|
||
CMPQ(length, U8(16))
|
||
JBE(LabelRef(name + "move_9through16"))
|
||
CMPQ(length, U8(32))
|
||
JBE(LabelRef(name + "move_17through32"))
|
||
CMPQ(length, U8(64))
|
||
JBE(LabelRef(name + "move_33through64"))
|
||
CMPQ(length, U8(128))
|
||
JBE(LabelRef(name + "move_65through128"))
|
||
CMPQ(length, U32(256))
|
||
JBE(LabelRef(name + "move_129through256"))
|
||
|
||
if avx {
|
||
JMP(LabelRef(name + "avxUnaligned"))
|
||
} else {
|
||
if false {
|
||
// Don't check length for now.
|
||
Label(name + "forward")
|
||
CMPQ(length, U32(2048))
|
||
JLS(LabelRef(name + "move_256through2048"))
|
||
|
||
genMemMove(name+"fallback", dst, src, length, end)
|
||
} else {
|
||
JMP(LabelRef(name + "move_256through2048"))
|
||
}
|
||
}
|
||
/*
|
||
// If REP MOVSB isn't fast, don't use it
|
||
// FIXME: internal∕cpu·X86+const_offsetX86HasERMS(SB)
|
||
// CMPB(U8(1), U8(1)) // enhanced REP MOVSB/STOSB
|
||
JMP(LabelRef(name + "fwdBy8"))
|
||
|
||
// Check alignment
|
||
MOVL(src.As32(), AX.As32())
|
||
ORL(dst.As32(), AX.As32())
|
||
TESTL(U32(7), AX.As32())
|
||
JEQ(LabelRef(name + "fwdBy8"))
|
||
|
||
// Do 1 byte at a time
|
||
// MOVQ(length, CX)
|
||
// FIXME:
|
||
// REP; MOVSB
|
||
JMP(end)
|
||
|
||
Label(name + "fwdBy8")
|
||
// Do 8 bytes at a time
|
||
MOVQ(length, CX)
|
||
SHRQ(U8(3), CX)
|
||
ANDQ(U8(7), length)
|
||
// FIXME:
|
||
//REP; MOVSQ
|
||
JMP(LabelRef(name + "tail"))
|
||
|
||
Label(name + "back")
|
||
|
||
//check overlap
|
||
MOVQ(src, CX)
|
||
ADDQ(length, CX)
|
||
CMPQ(CX, dst)
|
||
JLS(LabelRef(name + "forward"))
|
||
|
||
//whole thing backwards has
|
||
//adjusted addresses
|
||
|
||
ADDQ(length, dst)
|
||
ADDQ(length, src)
|
||
STD()
|
||
|
||
//
|
||
// copy
|
||
//
|
||
MOVQ(length, CX)
|
||
SHRQ(U8(3), CX)
|
||
ANDQ(U8(7), length)
|
||
|
||
SUBQ(U8(8), dst)
|
||
SUBQ(U8(8), src)
|
||
// FIXME:
|
||
//REP; MOVSQ
|
||
|
||
// FIXME:
|
||
//CLD()
|
||
|
||
ADDQ(U8(8), dst)
|
||
ADDQ(U8(8), src)
|
||
SUBQ(length, dst)
|
||
SUBQ(length, src)
|
||
JMP(LabelRef(name + "tail"))
|
||
*/
|
||
|
||
Label(name + "move_1or2")
|
||
MOVB(Mem{Base: src}, AX.As8())
|
||
MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
|
||
MOVB(AX.As8(), Mem{Base: dst})
|
||
MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
|
||
JMP(end)
|
||
|
||
Label(name + "move_4")
|
||
MOVL(Mem{Base: src}, AX.As32())
|
||
MOVL(AX.As32(), Mem{Base: dst})
|
||
JMP(end)
|
||
|
||
Label(name + "move_3")
|
||
MOVW(Mem{Base: src}, AX.As16())
|
||
MOVB(Mem{Base: src, Disp: 2}, CX.As8())
|
||
MOVW(AX.As16(), Mem{Base: dst})
|
||
MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
|
||
JMP(end)
|
||
|
||
Label(name + "move_5through7")
|
||
MOVL(Mem{Base: src}, AX.As32())
|
||
MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
|
||
MOVL(AX.As32(), Mem{Base: dst})
|
||
MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
|
||
JMP(end)
|
||
|
||
Label(name + "move_8")
|
||
// We need a separate case for 8 to make sure we write pointers atomically.
|
||
MOVQ(Mem{Base: src}, AX)
|
||
MOVQ(AX, Mem{Base: dst})
|
||
JMP(end)
|
||
|
||
Label(name + "move_9through16")
|
||
MOVQ(Mem{Base: src}, AX)
|
||
MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
|
||
MOVQ(AX, Mem{Base: dst})
|
||
MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
|
||
JMP(end)
|
||
|
||
Label(name + "move_17through32")
|
||
X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
|
||
X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
|
||
|
||
MOVOU(Mem{Base: src}, X0)
|
||
MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
|
||
MOVOU(X0, Mem{Base: dst})
|
||
MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
|
||
JMP(end)
|
||
|
||
Label(name + "move_33through64")
|
||
MOVOU(Mem{Base: src}, X0)
|
||
MOVOU(Mem{Base: src, Disp: 16}, X1)
|
||
MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
|
||
MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
|
||
MOVOU(X0, Mem{Base: dst})
|
||
MOVOU(X1, Mem{Base: dst, Disp: 16})
|
||
MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
|
||
MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
|
||
JMP(end)
|
||
|
||
Label(name + "move_65through128")
|
||
MOVOU(Mem{Base: src}, X0)
|
||
MOVOU(Mem{Base: src, Disp: 16}, X1)
|
||
MOVOU(Mem{Base: src, Disp: 32}, X2)
|
||
MOVOU(Mem{Base: src, Disp: 48}, X3)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
|
||
MOVOU(X0, Mem{Base: dst})
|
||
MOVOU(X1, Mem{Base: dst, Disp: 16})
|
||
MOVOU(X2, Mem{Base: dst, Disp: 32})
|
||
MOVOU(X3, Mem{Base: dst, Disp: 48})
|
||
MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
|
||
MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
|
||
MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
|
||
MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
|
||
JMP(end)
|
||
|
||
Label(name + "move_129through256")
|
||
MOVOU(Mem{Base: src}, X0)
|
||
MOVOU(Mem{Base: src, Disp: 16}, X1)
|
||
MOVOU(Mem{Base: src, Disp: 32}, X2)
|
||
MOVOU(Mem{Base: src, Disp: 48}, X3)
|
||
MOVOU(Mem{Base: src, Disp: 64}, X4)
|
||
MOVOU(Mem{Base: src, Disp: 80}, X5)
|
||
MOVOU(Mem{Base: src, Disp: 96}, X6)
|
||
MOVOU(Mem{Base: src, Disp: 112}, X7)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
|
||
MOVOU(X0, Mem{Base: dst})
|
||
MOVOU(X1, Mem{Base: dst, Disp: 16})
|
||
MOVOU(X2, Mem{Base: dst, Disp: 32})
|
||
MOVOU(X3, Mem{Base: dst, Disp: 48})
|
||
MOVOU(X4, Mem{Base: dst, Disp: 64})
|
||
MOVOU(X5, Mem{Base: dst, Disp: 80})
|
||
MOVOU(X6, Mem{Base: dst, Disp: 96})
|
||
MOVOU(X7, Mem{Base: dst, Disp: 112})
|
||
MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
|
||
MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
|
||
MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
|
||
MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
|
||
MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
|
||
MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
|
||
MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
|
||
MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
|
||
JMP(end)
|
||
|
||
Label(name + "move_256through2048")
|
||
LEAQ(Mem{Base: length, Disp: -256}, length)
|
||
MOVOU(Mem{Base: src}, X0)
|
||
MOVOU(Mem{Base: src, Disp: 16}, X1)
|
||
MOVOU(Mem{Base: src, Disp: 32}, X2)
|
||
MOVOU(Mem{Base: src, Disp: 48}, X3)
|
||
MOVOU(Mem{Base: src, Disp: 64}, X4)
|
||
MOVOU(Mem{Base: src, Disp: 80}, X5)
|
||
MOVOU(Mem{Base: src, Disp: 96}, X6)
|
||
MOVOU(Mem{Base: src, Disp: 112}, X7)
|
||
MOVOU(Mem{Base: src, Disp: 128}, X8)
|
||
MOVOU(Mem{Base: src, Disp: 144}, X9)
|
||
MOVOU(Mem{Base: src, Disp: 160}, X10)
|
||
MOVOU(Mem{Base: src, Disp: 176}, X11)
|
||
MOVOU(Mem{Base: src, Disp: 192}, X12)
|
||
MOVOU(Mem{Base: src, Disp: 208}, X13)
|
||
MOVOU(Mem{Base: src, Disp: 224}, X14)
|
||
MOVOU(Mem{Base: src, Disp: 240}, X15)
|
||
MOVOU(X0, Mem{Base: dst})
|
||
MOVOU(X1, Mem{Base: dst, Disp: 16})
|
||
MOVOU(X2, Mem{Base: dst, Disp: 32})
|
||
MOVOU(X3, Mem{Base: dst, Disp: 48})
|
||
MOVOU(X4, Mem{Base: dst, Disp: 64})
|
||
MOVOU(X5, Mem{Base: dst, Disp: 80})
|
||
MOVOU(X6, Mem{Base: dst, Disp: 96})
|
||
MOVOU(X7, Mem{Base: dst, Disp: 112})
|
||
MOVOU(X8, Mem{Base: dst, Disp: 128})
|
||
MOVOU(X9, Mem{Base: dst, Disp: 144})
|
||
MOVOU(X10, Mem{Base: dst, Disp: 160})
|
||
MOVOU(X11, Mem{Base: dst, Disp: 176})
|
||
MOVOU(X12, Mem{Base: dst, Disp: 192})
|
||
MOVOU(X13, Mem{Base: dst, Disp: 208})
|
||
MOVOU(X14, Mem{Base: dst, Disp: 224})
|
||
MOVOU(X15, Mem{Base: dst, Disp: 240})
|
||
CMPQ(length, U32(256))
|
||
LEAQ(Mem{Base: src, Disp: 256}, src)
|
||
LEAQ(Mem{Base: dst, Disp: 256}, dst)
|
||
JGE(LabelRef(name + "move_256through2048"))
|
||
JMP(LabelRef(name + "tail"))
|
||
|
||
if avx {
|
||
Label(name + "avxUnaligned")
|
||
R8, R10 := GP64(), GP64()
|
||
// There are two implementations of move algorithm.
|
||
// The first one for non-overlapped memory regions. It uses forward copying.
|
||
// We do not support overlapping input
|
||
|
||
// Non-temporal copy would be better for big sizes.
|
||
// Disabled since big copies are unlikely.
|
||
// If enabling, test functionality.
|
||
const enableBigData = false
|
||
if enableBigData {
|
||
CMPQ(length, U32(0x100000))
|
||
JAE(LabelRef(name + "gobble_big_data_fwd"))
|
||
}
|
||
|
||
// Memory layout on the source side
|
||
// src CX
|
||
// |<---------length before correction--------->|
|
||
// | |<--length corrected-->| |
|
||
// | | |<--- AX --->|
|
||
// |<-R11->| |<-128 bytes->|
|
||
// +----------------------------------------+
|
||
// | Head | Body | Tail |
|
||
// +-------+------------------+-------------+
|
||
// ^ ^ ^
|
||
// | | |
|
||
// Save head into Y4 Save tail into X5..X12
|
||
// |
|
||
// src+R11, where R11 = ((dst & -32) + 32) - dst
|
||
// Algorithm:
|
||
// 1. Unaligned save of the tail's 128 bytes
|
||
// 2. Unaligned save of the head's 32 bytes
|
||
// 3. Destination-aligned copying of body (128 bytes per iteration)
|
||
// 4. Put head on the new place
|
||
// 5. Put the tail on the new place
|
||
// It can be important to satisfy processor's pipeline requirements for
|
||
// small sizes as the cost of unaligned memory region copying is
|
||
// comparable with the cost of main loop. So code is slightly messed there.
|
||
// There is more clean implementation of that algorithm for bigger sizes
|
||
// where the cost of unaligned part copying is negligible.
|
||
// You can see it after gobble_big_data_fwd label.
|
||
Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()
|
||
|
||
LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
|
||
MOVQ(dst, R10)
|
||
// CX points to the end of buffer so we need go back slightly. We will use negative offsets there.
|
||
MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
|
||
MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
|
||
MOVQ(U32(0x80), AX)
|
||
|
||
// Align destination address
|
||
ANDQ(U32(0xffffffe0), dst)
|
||
ADDQ(U8(32), dst)
|
||
// Continue tail saving.
|
||
MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
|
||
MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
|
||
// Make R8 delta between aligned and unaligned destination addresses.
|
||
MOVQ(dst, R8)
|
||
SUBQ(R10, R8)
|
||
// Continue tail saving.
|
||
MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
|
||
MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
|
||
// Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying.
|
||
SUBQ(R8, length)
|
||
// Continue tail saving.
|
||
MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
|
||
MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
|
||
// The tail will be put on its place after main body copying.
|
||
// It's time for the unaligned heading part.
|
||
VMOVDQU(Mem{Base: src}, Y4)
|
||
// Adjust source address to point past head.
|
||
ADDQ(R8, src)
|
||
SUBQ(AX, length)
|
||
|
||
// Aligned memory copying there
|
||
Label(name + "gobble_128_loop")
|
||
VMOVDQU(Mem{Base: src}, Y0)
|
||
VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
|
||
VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
|
||
VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
|
||
ADDQ(AX, src)
|
||
VMOVDQA(Y0, Mem{Base: dst})
|
||
VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
|
||
VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
|
||
VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
|
||
ADDQ(AX, dst)
|
||
SUBQ(AX, length)
|
||
JA(LabelRef(name + "gobble_128_loop"))
|
||
// Now we can store unaligned parts.
|
||
ADDQ(AX, length)
|
||
ADDQ(dst, length)
|
||
VMOVDQU(Y4, Mem{Base: R10})
|
||
VZEROUPPER()
|
||
MOVOU(X5, Mem{Base: length, Disp: -0x80})
|
||
MOVOU(X6, Mem{Base: length, Disp: -0x70})
|
||
MOVOU(X7, Mem{Base: length, Disp: -0x60})
|
||
MOVOU(X8, Mem{Base: length, Disp: -0x50})
|
||
MOVOU(X9, Mem{Base: length, Disp: -0x40})
|
||
MOVOU(X10, Mem{Base: length, Disp: -0x30})
|
||
MOVOU(X11, Mem{Base: length, Disp: -0x20})
|
||
MOVOU(X12, Mem{Base: length, Disp: -0x10})
|
||
JMP(end)
|
||
|
||
if enableBigData {
|
||
Label(name + "gobble_big_data_fwd")
|
||
// There is forward copying for big regions.
|
||
// It uses non-temporal mov instructions.
|
||
// Details of this algorithm are commented previously for small sizes.
|
||
LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
|
||
MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -0x80}, X5)
|
||
MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
|
||
MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
|
||
MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
|
||
MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
|
||
MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
|
||
MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
|
||
MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
|
||
VMOVDQU(Mem{Base: src}, Y4)
|
||
MOVQ(dst, R8)
|
||
|
||
ANDQ(U32(0xffffffe0), dst)
|
||
ADDQ(U8(32), dst)
|
||
|
||
MOVQ(dst, R10)
|
||
SUBQ(R8, R10)
|
||
SUBQ(R10, length)
|
||
ADDQ(R10, src)
|
||
LEAQ(Mem{Base: dst, Index: length, Scale: 1}, CX)
|
||
SUBQ(U8(0x80), length)
|
||
|
||
Label(name + "gobble_mem_fwd_loop")
|
||
PREFETCHNTA(Mem{Base: src, Disp: 0x1c0})
|
||
PREFETCHNTA(Mem{Base: src, Disp: 0x280})
|
||
// Prefetch values were chosen empirically.
|
||
// Approach for prefetch usage as in 7.6.6 of [1]
|
||
// [1] 64-ia-32-architectures-optimization-manual.pdf
|
||
// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
|
||
VMOVDQU(Mem{Base: src}, Y0)
|
||
VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
|
||
VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
|
||
VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
|
||
|
||
ADDQ(U8(0x80), src)
|
||
VMOVNTDQ(Y0, Mem{Base: dst})
|
||
VMOVNTDQ(Y1, Mem{Base: dst, Disp: 0x20})
|
||
VMOVNTDQ(Y2, Mem{Base: dst, Disp: 0x20})
|
||
VMOVNTDQ(Y3, Mem{Base: dst, Disp: 0x60})
|
||
ADDQ(U8(0x80), dst)
|
||
SUBQ(U8(0x80), length)
|
||
JA(LabelRef(name + "gobble_mem_fwd_loop"))
|
||
// NT instructions don't follow the normal cache-coherency rules.
|
||
// We need SFENCE there to make copied data available timely.
|
||
SFENCE()
|
||
VMOVDQU(Y4, Mem{Base: R8})
|
||
VZEROUPPER()
|
||
MOVOU(X5, Mem{Base: CX, Disp: -0x80})
|
||
MOVOU(X6, Mem{Base: CX, Disp: -0x70})
|
||
MOVOU(X7, Mem{Base: CX, Disp: -0x60})
|
||
MOVOU(X8, Mem{Base: CX, Disp: -0x50})
|
||
MOVOU(X9, Mem{Base: CX, Disp: -0x40})
|
||
MOVOU(X10, Mem{Base: CX, Disp: -0x30})
|
||
MOVOU(X11, Mem{Base: CX, Disp: -0x20})
|
||
MOVOU(X12, Mem{Base: CX, Disp: -0x10})
|
||
JMP(end)
|
||
}
|
||
}
|
||
}
|
||
|
||
// genMatchLen generates standalone matchLen.
|
||
func genMatchLen() {
|
||
TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
|
||
Doc("matchLen returns how many bytes match in a and b", "",
|
||
"It assumes that:",
|
||
" len(a) <= len(b)", "")
|
||
Pragma("noescape")
|
||
|
||
aBase, bBase, length := GP64(), GP64(), GP64()
|
||
|
||
Load(Param("a").Base(), aBase)
|
||
Load(Param("b").Base(), bBase)
|
||
Load(Param("a").Len(), length)
|
||
l := matchLen("standalone", Mem{Base: aBase}, Mem{Base: bBase}, length, LabelRef("gen_match_len_end"))
|
||
Label("gen_match_len_end")
|
||
Store(l, ReturnIndex(0))
|
||
RET()
|
||
}
|
||
|
||
// matchLen returns the number of matching bytes of a and b.
// len is the maximum number of bytes to match.
// Will jump to end when done and returns the length.
// Uses 2 GP registers.
//
// Strategy: compare 8 bytes at a time with MOVQ+XORQ; a non-zero XOR
// result marks the first mismatch, located with BSFQ. Remaining
// (<8) bytes are compared one at a time.
func matchLen(name string, a, b Mem, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
	tmp, matched := GP64(), GP64()
	// matched = 0
	XORQ(matched, matched)

	// Fewer than 8 bytes to compare: skip straight to the byte loop.
	CMPQ(len, U8(8))
	JL(LabelRef("matchlen_single_" + name))

	Label("matchlen_loopback_" + name)
	// tmp = a[matched:matched+8] XOR b[matched:matched+8];
	// zero iff all 8 bytes match.
	MOVQ(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp)
	XORQ(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp)
	TESTQ(tmp, tmp)
	JZ(LabelRef("matchlen_loop_" + name))
	// Not all match.
	// BSFQ finds the lowest set bit; dividing by 8 (shift right 3)
	// converts the bit index to the number of matching leading bytes.
	BSFQ(tmp, tmp)
	SARQ(U8(3), tmp)
	LEAQ(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
	JMP(end)

	// All 8 byte matched, update and loop.
	Label("matchlen_loop_" + name)
	LEAQ(Mem{Base: len, Disp: -8}, len)
	LEAQ(Mem{Base: matched, Disp: 8}, matched)
	CMPQ(len, U8(8))
	JGE(LabelRef("matchlen_loopback_" + name))

	// Less than 8 bytes left.
	Label("matchlen_single_" + name)
	TESTQ(len, len)
	JZ(end)
	Label("matchlen_single_loopback_" + name)
	// Compare one byte; any mismatch ends the match run immediately.
	MOVB(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp.As8())
	CMPB(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp.As8())
	JNE(end)
	LEAQ(Mem{Base: matched, Disp: 1}, matched)
	DECQ(len)
	JNZ(LabelRef("matchlen_single_loopback_" + name))
	JMP(end)
	// Returned register holds the match count at every jump to end.
	return matched
}
|